diff --git a/include/infinicore/adaptor/aten_adaptor.hpp b/include/infinicore/adaptor/aten_adaptor.hpp
index 70cb98e18..8259ec6fc 100644
--- a/include/infinicore/adaptor/aten_adaptor.hpp
+++ b/include/infinicore/adaptor/aten_adaptor.hpp
@@ -5,7 +5,7 @@
 
 #include
 
-#ifdef ENABLE_NVIDIA_API
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
 #include
 #include
 #endif
@@ -33,6 +33,8 @@ inline at::Device to_at_device(const Device &device) {
         return at::Device(at::kCUDA, device.getIndex());
     } else if (device.getType() == Device::Type::CPU) {
         return at::Device(at::kCPU);
+    } else if (device.getType() == Device::Type::QY) {
+        return at::Device(at::kCUDA, device.getIndex());
     } else {
         throw std::runtime_error("Unsupported device type for ATen");
     }
@@ -40,9 +42,8 @@ inline at::Device to_at_device(const Device &device) {
 
 at::Tensor to_aten_tensor(const infinicore::Tensor &t);
 
-#ifdef ENABLE_NVIDIA_API
 c10::cuda::CUDAStream get_cuda_stream();
-#endif
+
 } // namespace infinicore::adaptor
 
 #endif // ENABLE_ATEN
diff --git a/src/infinicore/adaptor/aten_adaptor.cc b/src/infinicore/adaptor/aten_adaptor.cc
index 2edbe3f8f..9488884d3 100644
--- a/src/infinicore/adaptor/aten_adaptor.cc
+++ b/src/infinicore/adaptor/aten_adaptor.cc
@@ -32,12 +32,10 @@ at::Tensor to_aten_tensor(const infinicore::Tensor &t) {
         options);
 }
 
-#ifdef ENABLE_NVIDIA_API
 c10::cuda::CUDAStream get_cuda_stream() {
     return c10::cuda::getStreamFromExternal(
         cudaStream_t(infinicore::context::getStream()),
         infinicore::context::getDevice().getIndex());
 }
-#endif
 
 } // namespace infinicore::adaptor
diff --git a/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh b/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh
index 3e06b9067..d052ef97f 100644
--- a/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh
+++ b/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh
@@ -46,7 +46,7 @@ __device__ void avgPool1dKernel(
         }
     }
 
-#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
+#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_QY_API)
     // Iluvatar __half doesn't accept size_t directly.
     y[y_offset] = sum / static_cast(static_cast(kernel_size));
 #else
diff --git a/src/infiniop/ops/fmod/operator.cc b/src/infiniop/ops/fmod/operator.cc
index 482926390..f46dcfae4 100644
--- a/src/infiniop/ops/fmod/operator.cc
+++ b/src/infiniop/ops/fmod/operator.cc
@@ -136,9 +136,6 @@ __INFINI_C infiniStatus_t infiniopDestroyFmodDescriptor(infiniopFmodDescriptor_t
 #ifdef ENABLE_ILUVATAR_API
     GET(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
-#ifdef ENABLE_QY_API
-    GET(INFINI_DEVICE_QY, nvidia);
-#endif
 #ifdef ENABLE_METAX_API
     GET(INFINI_DEVICE_METAX, metax);
 #endif
diff --git a/xmake.lua b/xmake.lua
index 4200da75b..c803024d4 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -247,7 +247,6 @@
     end
 end
 
-
 -- cuda graph
 option("graph")
     set_default(false)
@@ -259,7 +258,6 @@ if has_config("graph") then
     add_defines("USE_INFINIRT_GRAPH")
 end
 
-
 -- InfiniCCL
 option("ccl")
     set_default(false)
@@ -467,6 +465,23 @@ target("infinicore_cpp_api")
         if has_config("nv-gpu") then
             add_deps("flash-attn-nvidia")
         end
+        if has_config("qy-gpu") then
+            add_deps("flash-attn-qy")
+            add_files("build/.objs/flash-attn-qy/rules/qy.cuda/__/__/flash-attention-dl-v2.7.4.post1-19/csrc/flash_attn/src/*.cu.o", {public = true})
+        end
     end
+
+    if get_config("flash-attn") and get_config("flash-attn") ~= "" and has_config("qy-gpu") then
+        local flash_so_qy = _qy_flash_attn_cuda_so_path()
+        local flash_dir_qy = path.directory(flash_so_qy)
+        local flash_name_qy = path.filename(flash_so_qy)
+        before_link(function (target)
+            target:add(
+                "shflags",
+                "-Wl,--no-as-needed -L" .. flash_dir_qy .. " -l:" .. flash_name_qy .. " -Wl,-rpath," .. flash_dir_qy,
+                {force = true}
+            )
+        end)
+    end
 
     before_build(function (target)
diff --git a/xmake/qy.lua b/xmake/qy.lua
index 810f88c2f..8421eb002 100644
--- a/xmake/qy.lua
+++ b/xmake/qy.lua
@@ -3,6 +3,45 @@ if CUDNN_ROOT ~= nil then
     add_includedirs(CUDNN_ROOT .. "/include")
 end
 
+local CUTLASS_ROOT = os.getenv("CUTLASS_ROOT") or os.getenv("CUTLASS_HOME") or os.getenv("CUTLASS_PATH")
+
+if CUTLASS_ROOT ~= nil then
+    add_includedirs(CUTLASS_ROOT)
+end
+
+local FLASH_ATTN_ROOT = get_config("flash-attn")
+
+local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
+
+local FLASH_ATTN_QY_CUDA_SO_CONTAINER_DEFAULT =
+    "/home/shangyouren/miniconda3/envs/xiaobase/lib/python3.12/site-packages/flash_attn_2_cuda.cpython-312-x86_64-linux-gnu.so"
+
+function _qy_flash_attn_cuda_so_path()
+    -- Highest priority: override the exact `.so` file to link.
+    local env_path = os.getenv("FLASH_ATTN_2_CUDA_SO")
+    if env_path and env_path ~= "" then
+        env_path = env_path:trim()
+        if not os.isfile(env_path) then
+            raise("qy+flash-attn: FLASH_ATTN_2_CUDA_SO is not a file: %s", env_path)
+        end
+        return env_path
+    end
+
+    -- Second priority: allow overriding the "expected" container path via env.
+    local container_path = os.getenv("FLASH_ATTN_QY_CUDA_SO_CONTAINER")
+    if not container_path or container_path == "" then
+        container_path = FLASH_ATTN_QY_CUDA_SO_CONTAINER_DEFAULT
+    end
+
+    if not os.isfile(container_path) then
+        raise(
+            "qy+flash-attn: expected %s\n  Install flash-attn in the conda env, or export FLASH_ATTN_2_CUDA_SO to your .so path.",
+            container_path
+        )
+    end
+    return container_path
+end
+
 add_includedirs("/usr/local/denglin/sdk/include", "../include")
 add_linkdirs("/usr/local/denglin/sdk/lib")
 add_links("curt", "cublas", "cudnn")
@@ -44,8 +83,20 @@ rule("qy.cuda")
         local sdk_path = "/usr/local/denglin/sdk"
         local arch = "dlgput64"
 
-        local relpath = path.relative(sourcefile, project.directory())
-        local objfile = path.join(config.buildir(), ".objs", target:name(), "rules", "qy.cuda", relpath .. ".o")
+
+        local relpath = path.relative(sourcefile, os.projectdir())
+
+        -- 去掉 ..,转成安全路径
+        relpath = relpath:gsub("%.%.", "__")
+
+        local objfile = path.join(
+            config.buildir(),
+            ".objs",
+            target:name(),
+            "rules",
+            "qy.cuda",
+            relpath .. ".o"
+        )
 
         -- 🟢 强制注册 .o 文件给 target
         target:add("objectfiles", objfile)
@@ -153,3 +204,26 @@ target("infiniccl-qy")
     set_languages("cxx17")
 
 target_end()
+
+target("flash-attn-qy")
+    set_kind("phony")
+    set_default(false)
+
+
+    if FLASH_ATTN_ROOT and FLASH_ATTN_ROOT ~= "" then
+        before_build(function (target)
+            target:add("includedirs", "/usr/local/denglin/sdk/include", {public = true})
+            local TORCH_DIR = os.iorunv("python", {"-c", "import torch, os; print(os.path.dirname(torch.__file__))"}):trim()
+            local PYTHON_INCLUDE = os.iorunv("python", {"-c", "import sysconfig; print(sysconfig.get_paths()['include'])"}):trim()
+            local PYTHON_LIB_DIR = os.iorunv("python", {"-c", "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))"}):trim()
+
+            -- Validate build/runtime env in container and keep these paths available for downstream linking.
+            target:add("includedirs", TORCH_DIR .. "/include", TORCH_DIR .. "/include/torch/csrc/api/include", PYTHON_INCLUDE, {public = false})
+            target:add("linkdirs", TORCH_DIR .. "/lib", PYTHON_LIB_DIR, {public = false})
+        end)
+    else
+        before_build(function (target)
+            print("Flash Attention not available, skipping flash-attn-qy integration")
+        end)
+    end
+target_end()