From 49a92dc311514ea5b17685f2211890d047a6a842 Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Fri, 20 Mar 2026 15:28:48 +0800
Subject: [PATCH 1/2] issue/1090: qy flash-attention

---
 include/infinicore/adaptor/aten_adaptor.hpp |   7 +-
 src/infinicore/adaptor/aten_adaptor.cc      |   2 -
 src/infiniop/ops/avg_pool1d/cuda/kernel.cuh |   2 +-
 src/infiniop/ops/fmod/operator.cc           |   3 -
 xmake.lua                                   |   4 +
 xmake/qy.lua                                | 114 +++++++++++++++++++-
 6 files changed, 120 insertions(+), 12 deletions(-)
diff --git a/include/infinicore/adaptor/aten_adaptor.hpp b/include/infinicore/adaptor/aten_adaptor.hpp
index 70cb98e18..14711ca1c 100644
--- a/include/infinicore/adaptor/aten_adaptor.hpp
+++ b/include/infinicore/adaptor/aten_adaptor.hpp
@@ -5,10 +5,8 @@
 
 #include <ATen/ATen.h>
 
-#ifdef ENABLE_NVIDIA_API
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
-#endif
 
 namespace infinicore::adaptor {
 inline at::ScalarType to_at_dtype(DataType dtype) {
@@ -33,6 +31,8 @@ inline at::Device to_at_device(const Device &device) {
         return at::Device(at::kCUDA, device.getIndex());
     } else if (device.getType() == Device::Type::CPU) {
         return at::Device(at::kCPU);
+    } else if (device.getType() == Device::Type::QY) {
+        return at::Device(at::kCUDA, device.getIndex());
     } else {
         throw std::runtime_error("Unsupported device type for ATen");
     }
@@ -40,9 +40,8 @@ inline at::Device to_at_device(const Device &device) {
 
 at::Tensor to_aten_tensor(const infinicore::Tensor &t);
 
-#ifdef ENABLE_NVIDIA_API
 c10::cuda::CUDAStream get_cuda_stream();
-#endif
+
 } // namespace infinicore::adaptor
 
 #endif // ENABLE_ATEN
diff --git a/src/infinicore/adaptor/aten_adaptor.cc b/src/infinicore/adaptor/aten_adaptor.cc
index 2edbe3f8f..9488884d3 100644
--- a/src/infinicore/adaptor/aten_adaptor.cc
+++ b/src/infinicore/adaptor/aten_adaptor.cc
@@ -32,12 +32,10 @@ at::Tensor to_aten_tensor(const infinicore::Tensor &t) {
         options);
 }
 
-#ifdef ENABLE_NVIDIA_API
 c10::cuda::CUDAStream get_cuda_stream() {
     return c10::cuda::getStreamFromExternal(
         cudaStream_t(infinicore::context::getStream()), infinicore::context::getDevice().getIndex());
 }
-#endif
 
 } // namespace infinicore::adaptor
 
diff --git a/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh b/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh
index 3e06b9067..d052ef97f 100644
--- a/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh
+++ b/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh
@@ -46,7 +46,7 @@ __device__ void avgPool1dKernel(
             }
         }
 
-#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
+#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_QY_API)
         // Iluvatar __half doesn't accept size_t directly.
         y[y_offset] = sum / static_cast<T>(static_cast<double>(kernel_size));
 #else
diff --git a/src/infiniop/ops/fmod/operator.cc b/src/infiniop/ops/fmod/operator.cc
index 482926390..f46dcfae4 100644
--- a/src/infiniop/ops/fmod/operator.cc
+++ b/src/infiniop/ops/fmod/operator.cc
@@ -136,9 +136,6 @@ __INFINI_C infiniStatus_t infiniopDestroyFmodDescriptor(infiniopFmodDescriptor_t
 #ifdef ENABLE_ILUVATAR_API
         GET(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia);
-#endif
 #ifdef ENABLE_METAX_API
         GET(INFINI_DEVICE_METAX, metax);
 #endif
diff --git a/xmake.lua b/xmake.lua
index 4200da75b..822d84531 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -467,6 +467,10 @@ target("infinicore_cpp_api")
         if has_config("nv-gpu") then
             add_deps("flash-attn-nvidia")
         end
+        if has_config("qy-gpu") then
+            add_deps("flash-attn-qy")
+            add_files("build/.objs/flash-attn-qy/rules/qy.cuda/__/__/flash-attention-dl-v2.7.4.post1-19/csrc/flash_attn/src/*.cu.o", {public = true})
+        end
     end
 
     before_build(function (target)
diff --git a/xmake/qy.lua b/xmake/qy.lua
index 810f88c2f..77469765f 100644
--- a/xmake/qy.lua
+++ b/xmake/qy.lua
@@ -3,6 +3,16 @@ if CUDNN_ROOT ~= nil then
     add_includedirs(CUDNN_ROOT .. "/include")
 end
 
+local CUTLASS_ROOT = os.getenv("CUTLASS_ROOT") or os.getenv("CUTLASS_HOME") or os.getenv("CUTLASS_PATH")
+
+if CUTLASS_ROOT ~= nil then
+    add_includedirs(CUTLASS_ROOT)
+end
+
+local FLASH_ATTN_ROOT = get_config("flash-attn")
+
+local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
+
 add_includedirs("/usr/local/denglin/sdk/include", "../include")
 add_linkdirs("/usr/local/denglin/sdk/lib")
 add_links("curt", "cublas", "cudnn")
@@ -44,8 +54,20 @@ rule("qy.cuda")
         local sdk_path = "/usr/local/denglin/sdk"
         local arch = "dlgput64"
 
-        local relpath = path.relative(sourcefile, project.directory())
-        local objfile = path.join(config.buildir(), ".objs", target:name(), "rules", "qy.cuda", relpath .. ".o")
+        
+        local relpath = path.relative(sourcefile, os.projectdir())
+
+        -- 去掉 ..，转成安全路径
+        relpath = relpath:gsub("%.%.", "__")
+
+        local objfile = path.join(
+            config.buildir(),
+            ".objs",
+            target:name(),
+            "rules",
+            "qy.cuda",
+            relpath .. ".o"
+        )
 
         -- 🟢 强制注册 .o 文件给 target
         target:add("objectfiles", objfile)
@@ -153,3 +175,91 @@ target("infiniccl-qy")
     set_languages("cxx17")
 
 target_end()
+
+target("flash-attn-qy")
+    set_kind("shared")
+    set_default(false)
+
+    set_languages("cxx17")
+    add_cxxflags("-std=c++17")
+    add_cuflags("--std=c++17", {force = true})
+
+    -- 🔥 DLCC 规则
+    add_rules("qy.cuda", {override = true})
+
+    if FLASH_ATTN_ROOT and FLASH_ATTN_ROOT ~= false and FLASH_ATTN_ROOT ~= "" then
+
+        -- ⭐⭐⭐ 关键：用 on_load（不是 before_build）
+        on_load(function (target)
+
+            local TORCH_DIR = os.iorunv("python", {"-c", "import torch, os; print(os.path.dirname(torch.__file__))"}):trim()
+            local PYTHON_INCLUDE = os.iorunv("python", {"-c", "import sysconfig; print(sysconfig.get_paths()['include'])"}):trim()
+            local PYTHON_LIB_DIR = os.iorunv("python", {"-c", "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))"}):trim()
+            local LIB_PYTHON = os.iorunv("python", {"-c", "import glob,sysconfig,os;print(glob.glob(os.path.join(sysconfig.get_config_var('LIBDIR'),'libpython*.so'))[0])"}):trim()
+
+            -- ✅ CUDA（最关键）
+            target:add("includedirs", "/usr/local/denglin/sdk/include", {public = true})
+
+            -- ✅ flash-attn
+            target:add("includedirs", FLASH_ATTN_ROOT .. "/csrc")
+            target:add("includedirs", FLASH_ATTN_ROOT .. "/csrc/flash_attn")
+            target:add("includedirs", FLASH_ATTN_ROOT .. "/csrc/flash_attn/src")
+            target:add("includedirs", FLASH_ATTN_ROOT .. "/csrc/common")
+
+            -- ✅ torch
+            target:add("includedirs", TORCH_DIR .. "/include")
+            target:add("includedirs", TORCH_DIR .. "/include/torch/csrc/api/include")
+
+            -- ⚠️ 很关键：ATen 有些头在这里
+            target:add("includedirs", TORCH_DIR .. "/include/TH")
+            target:add("includedirs", TORCH_DIR .. "/include/THC")
+
+            -- ✅ python
+            target:add("includedirs", PYTHON_INCLUDE)
+
+            -- ✅ cutlass
+            if CUTLASS_ROOT then
+                target:add("includedirs", CUTLASS_ROOT .. "/include")
+            end
+
+            -- link dirs
+            target:add("linkdirs", TORCH_DIR .. "/lib")
+            target:add("linkdirs", PYTHON_LIB_DIR)
+            target:add("linkdirs", "/usr/local/denglin/sdk/lib")
+
+            -- links
+            target:add("links",
+                "curt",
+                "cublas",
+                "cudnn",
+                "torch",
+                "torch_cpu",
+                "torch_cuda",
+                "c10",
+                "c10_cuda",
+                "torch_python",
+                LIB_PYTHON
+            )
+        end)
+
+        -- ✅ C++ host
+        add_files(FLASH_ATTN_ROOT .. "/csrc/flash_attn/flash_api.cpp")
+
+        -- ✅ CUDA kernel
+        add_files(FLASH_ATTN_ROOT .. "/csrc/flash_attn/src/*.cu")
+
+        -- flags
+        add_cxflags("-fPIC", {force = true})
+        add_cuflags("-O2", "-fPIC", "--expt-relaxed-constexpr", "--use_fast_math", {force = true})
+
+        add_ldflags("-Wl,--no-undefined", {force = true})
+
+    else
+        on_load(function ()
+            print("Flash Attention not available, skipping flash-attn-qy build")
+        end)
+    end
+
+    on_install(function (target) end)
+
+target_end()

From 7e5b8017c62ad9054c63b65bf1b6e76bdcec74a6 Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Mon, 23 Mar 2026 11:22:45 +0800
Subject: [PATCH 2/2] issue/1090: success link flash-attention.so

---
 include/infinicore/adaptor/aten_adaptor.hpp |   2 +
 xmake.lua                                   |  15 ++-
 xmake/qy.lua                                | 116 +++++++-------------
 3 files changed, 55 insertions(+), 78 deletions(-)

diff --git a/include/infinicore/adaptor/aten_adaptor.hpp b/include/infinicore/adaptor/aten_adaptor.hpp
index 14711ca1c..8259ec6fc 100644
--- a/include/infinicore/adaptor/aten_adaptor.hpp
+++ b/include/infinicore/adaptor/aten_adaptor.hpp
@@ -5,8 +5,10 @@
 
 #include <ATen/ATen.h>
 
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
+#endif
 
 namespace infinicore::adaptor {
 inline at::ScalarType to_at_dtype(DataType dtype) {
diff --git a/xmake.lua b/xmake.lua
index 822d84531..c803024d4 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -247,7 +247,6 @@ if has_config("aten") then
     end
 end
 
-
 -- cuda graph
 option("graph")
     set_default(false)
@@ -259,7 +258,6 @@ if has_config("graph") then
     add_defines("USE_INFINIRT_GRAPH")
 end
 
-
 -- InfiniCCL
 option("ccl")
     set_default(false)
@@ -473,6 +471,19 @@ target("infinicore_cpp_api")
         end
     end
 
+    if get_config("flash-attn") and get_config("flash-attn") ~= "" and has_config("qy-gpu") then
+        local flash_so_qy = _qy_flash_attn_cuda_so_path()
+        local flash_dir_qy = path.directory(flash_so_qy)
+        local flash_name_qy = path.filename(flash_so_qy)
+        before_link(function (target)
+            target:add(
+                "shflags",
+                "-Wl,--no-as-needed -L" .. flash_dir_qy .. " -l:" .. flash_name_qy .. " -Wl,-rpath," .. flash_dir_qy,
+                {force = true}
+            )
+        end)
+    end
+
     before_build(function (target)
         if has_config("aten") then
             local outdata = os.iorunv("python", {"-c", "import torch, os; print(os.path.dirname(torch.__file__))"}):trim()
diff --git a/xmake/qy.lua b/xmake/qy.lua
index 77469765f..8421eb002 100644
--- a/xmake/qy.lua
+++ b/xmake/qy.lua
@@ -13,6 +13,35 @@ local FLASH_ATTN_ROOT = get_config("flash-attn")
 
 local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
 
+local FLASH_ATTN_QY_CUDA_SO_CONTAINER_DEFAULT =
+    "/home/shangyouren/miniconda3/envs/xiaobase/lib/python3.12/site-packages/flash_attn_2_cuda.cpython-312-x86_64-linux-gnu.so"
+
+function _qy_flash_attn_cuda_so_path()
+    -- Highest priority: override the exact `.so` file to link.
+    local env_path = os.getenv("FLASH_ATTN_2_CUDA_SO")
+    if env_path and env_path ~= "" then
+        env_path = env_path:trim()
+        if not os.isfile(env_path) then
+            raise("qy+flash-attn: FLASH_ATTN_2_CUDA_SO is not a file: %s", env_path)
+        end
+        return env_path
+    end
+
+    -- Second priority: allow overriding the "expected" container path via env.
+    local container_path = os.getenv("FLASH_ATTN_QY_CUDA_SO_CONTAINER")
+    if not container_path or container_path == "" then
+        container_path = FLASH_ATTN_QY_CUDA_SO_CONTAINER_DEFAULT
+    end
+
+    if not os.isfile(container_path) then
+        raise(
+            "qy+flash-attn: expected %s\n  Install flash-attn in the conda env, or export FLASH_ATTN_2_CUDA_SO to your .so path.",
+            container_path
+        )
+    end
+    return container_path
+end
+
 add_includedirs("/usr/local/denglin/sdk/include", "../include")
 add_linkdirs("/usr/local/denglin/sdk/lib")
 add_links("curt", "cublas", "cudnn")
@@ -177,89 +206,24 @@ target("infiniccl-qy")
 target_end()
 
 target("flash-attn-qy")
-    set_kind("shared")
+    set_kind("phony")
     set_default(false)
+    
 
-    set_languages("cxx17")
-    add_cxxflags("-std=c++17")
-    add_cuflags("--std=c++17", {force = true})
-
-    -- 🔥 DLCC 规则
-    add_rules("qy.cuda", {override = true})
-
-    if FLASH_ATTN_ROOT and FLASH_ATTN_ROOT ~= false and FLASH_ATTN_ROOT ~= "" then
-
-        -- ⭐⭐⭐ 关键：用 on_load（不是 before_build）
-        on_load(function (target)
-
+    if FLASH_ATTN_ROOT and FLASH_ATTN_ROOT ~= "" then
+        before_build(function (target)
+            target:add("includedirs", "/usr/local/denglin/sdk/include", {public = true})
             local TORCH_DIR = os.iorunv("python", {"-c", "import torch, os; print(os.path.dirname(torch.__file__))"}):trim()
             local PYTHON_INCLUDE = os.iorunv("python", {"-c", "import sysconfig; print(sysconfig.get_paths()['include'])"}):trim()
             local PYTHON_LIB_DIR = os.iorunv("python", {"-c", "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))"}):trim()
-            local LIB_PYTHON = os.iorunv("python", {"-c", "import glob,sysconfig,os;print(glob.glob(os.path.join(sysconfig.get_config_var('LIBDIR'),'libpython*.so'))[0])"}):trim()
-
-            -- ✅ CUDA（最关键）
-            target:add("includedirs", "/usr/local/denglin/sdk/include", {public = true})
-
-            -- ✅ flash-attn
-            target:add("includedirs", FLASH_ATTN_ROOT .. "/csrc")
-            target:add("includedirs", FLASH_ATTN_ROOT .. "/csrc/flash_attn")
-            target:add("includedirs", FLASH_ATTN_ROOT .. "/csrc/flash_attn/src")
-            target:add("includedirs", FLASH_ATTN_ROOT .. "/csrc/common")
-
-            -- ✅ torch
-            target:add("includedirs", TORCH_DIR .. "/include")
-            target:add("includedirs", TORCH_DIR .. "/include/torch/csrc/api/include")
-
-            -- ⚠️ 很关键：ATen 有些头在这里
-            target:add("includedirs", TORCH_DIR .. "/include/TH")
-            target:add("includedirs", TORCH_DIR .. "/include/THC")
-
-            -- ✅ python
-            target:add("includedirs", PYTHON_INCLUDE)
-
-            -- ✅ cutlass
-            if CUTLASS_ROOT then
-                target:add("includedirs", CUTLASS_ROOT .. "/include")
-            end
-
-            -- link dirs
-            target:add("linkdirs", TORCH_DIR .. "/lib")
-            target:add("linkdirs", PYTHON_LIB_DIR)
-            target:add("linkdirs", "/usr/local/denglin/sdk/lib")
-
-            -- links
-            target:add("links",
-                "curt",
-                "cublas",
-                "cudnn",
-                "torch",
-                "torch_cpu",
-                "torch_cuda",
-                "c10",
-                "c10_cuda",
-                "torch_python",
-                LIB_PYTHON
-            )
+            
+            -- Validate build/runtime env in container and keep these paths available for downstream linking.
+            target:add("includedirs", TORCH_DIR .. "/include", TORCH_DIR .. "/include/torch/csrc/api/include", PYTHON_INCLUDE, {public = false})
+            target:add("linkdirs", TORCH_DIR .. "/lib", PYTHON_LIB_DIR, {public = false})
         end)
-
-        -- ✅ C++ host
-        add_files(FLASH_ATTN_ROOT .. "/csrc/flash_attn/flash_api.cpp")
-
-        -- ✅ CUDA kernel
-        add_files(FLASH_ATTN_ROOT .. "/csrc/flash_attn/src/*.cu")
-
-        -- flags
-        add_cxflags("-fPIC", {force = true})
-        add_cuflags("-O2", "-fPIC", "--expt-relaxed-constexpr", "--use_fast_math", {force = true})
-
-        add_ldflags("-Wl,--no-undefined", {force = true})
-
     else
-        on_load(function ()
-            print("Flash Attention not available, skipping flash-attn-qy build")
+        before_build(function (target)
+            print("Flash Attention not available, skipping flash-attn-qy integration")
         end)
     end
-
-    on_install(function (target) end)
-
 target_end()