From 49a92dc311514ea5b17685f2211890d047a6a842 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Fri, 20 Mar 2026 15:28:48 +0800 Subject: [PATCH 1/2] issue/1090: qy flash-attention --- include/infinicore/adaptor/aten_adaptor.hpp | 7 +- src/infinicore/adaptor/aten_adaptor.cc | 2 - src/infiniop/ops/avg_pool1d/cuda/kernel.cuh | 2 +- src/infiniop/ops/fmod/operator.cc | 3 - xmake.lua | 4 + xmake/qy.lua | 114 +++++++++++++++++++- 6 files changed, 120 insertions(+), 12 deletions(-) diff --git a/include/infinicore/adaptor/aten_adaptor.hpp b/include/infinicore/adaptor/aten_adaptor.hpp index 70cb98e18..14711ca1c 100644 --- a/include/infinicore/adaptor/aten_adaptor.hpp +++ b/include/infinicore/adaptor/aten_adaptor.hpp @@ -5,10 +5,8 @@ #include -#ifdef ENABLE_NVIDIA_API #include #include -#endif namespace infinicore::adaptor { inline at::ScalarType to_at_dtype(DataType dtype) { @@ -33,6 +31,8 @@ inline at::Device to_at_device(const Device &device) { return at::Device(at::kCUDA, device.getIndex()); } else if (device.getType() == Device::Type::CPU) { return at::Device(at::kCPU); + } else if (device.getType() == Device::Type::QY) { + return at::Device(at::kCUDA, device.getIndex()); } else { throw std::runtime_error("Unsupported device type for ATen"); } @@ -40,9 +40,8 @@ inline at::Device to_at_device(const Device &device) { at::Tensor to_aten_tensor(const infinicore::Tensor &t); -#ifdef ENABLE_NVIDIA_API c10::cuda::CUDAStream get_cuda_stream(); -#endif + } // namespace infinicore::adaptor #endif // ENABLE_ATEN diff --git a/src/infinicore/adaptor/aten_adaptor.cc b/src/infinicore/adaptor/aten_adaptor.cc index 2edbe3f8f..9488884d3 100644 --- a/src/infinicore/adaptor/aten_adaptor.cc +++ b/src/infinicore/adaptor/aten_adaptor.cc @@ -32,12 +32,10 @@ at::Tensor to_aten_tensor(const infinicore::Tensor &t) { options); } -#ifdef ENABLE_NVIDIA_API c10::cuda::CUDAStream get_cuda_stream() { return c10::cuda::getStreamFromExternal( cudaStream_t(infinicore::context::getStream()), infinicore::context::getDevice().getIndex()); } -#endif } // namespace infinicore::adaptor diff --git a/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh b/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh index 3e06b9067..d052ef97f 100644 --- a/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh +++ b/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh @@ -46,7 +46,7 @@ __device__ void avgPool1dKernel( } } -#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_QY_API) // Iluvatar __half doesn't accept size_t directly. y[y_offset] = sum / static_cast(static_cast(kernel_size)); #else diff --git a/src/infiniop/ops/fmod/operator.cc b/src/infiniop/ops/fmod/operator.cc index 482926390..f46dcfae4 100644 --- a/src/infiniop/ops/fmod/operator.cc +++ b/src/infiniop/ops/fmod/operator.cc @@ -136,9 +136,6 @@ __INFINI_C infiniStatus_t infiniopDestroyFmodDescriptor(infiniopFmodDescriptor_t #ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, nvidia); #endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax); #endif diff --git a/xmake.lua b/xmake.lua index 4200da75b..822d84531 100644 --- a/xmake.lua +++ b/xmake.lua @@ -467,6 +467,10 @@ target("infinicore_cpp_api") if has_config("nv-gpu") then add_deps("flash-attn-nvidia") end + if has_config("qy-gpu") then + add_deps("flash-attn-qy") + add_files("build/.objs/flash-attn-qy/rules/qy.cuda/__/__/flash-attention-dl-v2.7.4.post1-19/csrc/flash_attn/src/*.cu.o", {public = true}) + end end before_build(function (target) diff --git a/xmake/qy.lua b/xmake/qy.lua index 810f88c2f..77469765f 100644 --- a/xmake/qy.lua +++ b/xmake/qy.lua @@ -3,6 +3,16 @@ if CUDNN_ROOT ~= nil then add_includedirs(CUDNN_ROOT .. "/include") end +local CUTLASS_ROOT = os.getenv("CUTLASS_ROOT") or os.getenv("CUTLASS_HOME") or os.getenv("CUTLASS_PATH") + +if CUTLASS_ROOT ~= nil then + add_includedirs(CUTLASS_ROOT) +end + +local FLASH_ATTN_ROOT = get_config("flash-attn") + +local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini") + add_includedirs("/usr/local/denglin/sdk/include", "../include") add_linkdirs("/usr/local/denglin/sdk/lib") add_links("curt", "cublas", "cudnn") @@ -44,8 +54,20 @@ rule("qy.cuda") local sdk_path = "/usr/local/denglin/sdk" local arch = "dlgput64" - local relpath = path.relative(sourcefile, project.directory()) - local objfile = path.join(config.buildir(), ".objs", target:name(), "rules", "qy.cuda", relpath .. ".o") + + local relpath = path.relative(sourcefile, os.projectdir()) + + -- 去掉 ..,转成安全路径 + relpath = relpath:gsub("%.%.", "__") + + local objfile = path.join( + config.buildir(), + ".objs", + target:name(), + "rules", + "qy.cuda", + relpath .. ".o" + ) -- 🟢 强制注册 .o 文件给 target target:add("objectfiles", objfile) @@ -153,3 +175,91 @@ target("infiniccl-qy") set_languages("cxx17") target_end() + +target("flash-attn-qy") + set_kind("shared") + set_default(false) + + set_languages("cxx17") + add_cxxflags("-std=c++17") + add_cuflags("--std=c++17", {force = true}) + + -- 🔥 DLCC 规则 + add_rules("qy.cuda", {override = true}) + + if FLASH_ATTN_ROOT and FLASH_ATTN_ROOT ~= false and FLASH_ATTN_ROOT ~= "" then + + -- ⭐⭐⭐ 关键:用 on_load(不是 before_build) + on_load(function (target) + + local TORCH_DIR = os.iorunv("python", {"-c", "import torch, os; print(os.path.dirname(torch.__file__))"}):trim() + local PYTHON_INCLUDE = os.iorunv("python", {"-c", "import sysconfig; print(sysconfig.get_paths()['include'])"}):trim() + local PYTHON_LIB_DIR = os.iorunv("python", {"-c", "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))"}):trim() + local LIB_PYTHON = os.iorunv("python", {"-c", "import glob,sysconfig,os;print(glob.glob(os.path.join(sysconfig.get_config_var('LIBDIR'),'libpython*.so'))[0])"}):trim() + + -- ✅ CUDA(最关键) + target:add("includedirs", "/usr/local/denglin/sdk/include", {public = true}) + + -- ✅ flash-attn + target:add("includedirs", FLASH_ATTN_ROOT .. "/csrc") + target:add("includedirs", FLASH_ATTN_ROOT .. "/csrc/flash_attn") + target:add("includedirs", FLASH_ATTN_ROOT .. "/csrc/flash_attn/src") + target:add("includedirs", FLASH_ATTN_ROOT .. "/csrc/common") + + -- ✅ torch + target:add("includedirs", TORCH_DIR .. "/include") + target:add("includedirs", TORCH_DIR .. "/include/torch/csrc/api/include") + + -- ⚠️ 很关键:ATen 有些头在这里 + target:add("includedirs", TORCH_DIR .. "/include/TH") + target:add("includedirs", TORCH_DIR .. "/include/THC") + + -- ✅ python + target:add("includedirs", PYTHON_INCLUDE) + + -- ✅ cutlass + if CUTLASS_ROOT then + target:add("includedirs", CUTLASS_ROOT .. "/include") + end + + -- link dirs + target:add("linkdirs", TORCH_DIR .. "/lib") + target:add("linkdirs", PYTHON_LIB_DIR) + target:add("linkdirs", "/usr/local/denglin/sdk/lib") + + -- links + target:add("links", + "curt", + "cublas", + "cudnn", + "torch", + "torch_cpu", + "torch_cuda", + "c10", + "c10_cuda", + "torch_python", + LIB_PYTHON + ) + end) + + -- ✅ C++ host + add_files(FLASH_ATTN_ROOT .. "/csrc/flash_attn/flash_api.cpp") + + -- ✅ CUDA kernel + add_files(FLASH_ATTN_ROOT .. "/csrc/flash_attn/src/*.cu") + + -- flags + add_cxflags("-fPIC", {force = true}) + add_cuflags("-O2", "-fPIC", "--expt-relaxed-constexpr", "--use_fast_math", {force = true}) + + add_ldflags("-Wl,--no-undefined", {force = true}) + + else + on_load(function () + print("Flash Attention not available, skipping flash-attn-qy build") + end) + end + + on_install(function (target) end) + +target_end() From 7e5b8017c62ad9054c63b65bf1b6e76bdcec74a6 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Mon, 23 Mar 2026 11:22:45 +0800 Subject: [PATCH 2/2] issue/1090: success link flash-attention.so --- include/infinicore/adaptor/aten_adaptor.hpp | 2 + xmake.lua | 15 ++- xmake/qy.lua | 116 +++++++------------- 3 files changed, 55 insertions(+), 78 deletions(-) diff --git a/include/infinicore/adaptor/aten_adaptor.hpp b/include/infinicore/adaptor/aten_adaptor.hpp index 14711ca1c..8259ec6fc 100644 --- a/include/infinicore/adaptor/aten_adaptor.hpp +++ b/include/infinicore/adaptor/aten_adaptor.hpp @@ -5,8 +5,10 @@ #include +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) #include #include +#endif namespace infinicore::adaptor { inline at::ScalarType to_at_dtype(DataType dtype) { diff --git a/xmake.lua b/xmake.lua index 822d84531..c803024d4 100644 --- a/xmake.lua +++ b/xmake.lua @@ -247,7 +247,6 @@ if has_config("aten") then end end - -- cuda graph option("graph") set_default(false) @@ -259,7 +258,6 @@ if has_config("graph") then add_defines("USE_INFINIRT_GRAPH") end - -- InfiniCCL option("ccl") set_default(false) @@ -473,6 +471,19 @@ target("infinicore_cpp_api") end end + if get_config("flash-attn") and get_config("flash-attn") ~= "" and has_config("qy-gpu") then + local flash_so_qy = _qy_flash_attn_cuda_so_path() + local flash_dir_qy = path.directory(flash_so_qy) + local flash_name_qy = path.filename(flash_so_qy) + before_link(function (target) + target:add( + "shflags", + "-Wl,--no-as-needed -L" .. flash_dir_qy .. " -l:" .. flash_name_qy .. " -Wl,-rpath," .. flash_dir_qy, + {force = true} + ) + end) + end + before_build(function (target) if has_config("aten") then local outdata = os.iorunv("python", {"-c", "import torch, os; print(os.path.dirname(torch.__file__))"}):trim() diff --git a/xmake/qy.lua b/xmake/qy.lua index 77469765f..8421eb002 100644 --- a/xmake/qy.lua +++ b/xmake/qy.lua @@ -13,6 +13,35 @@ local FLASH_ATTN_ROOT = get_config("flash-attn") local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini") +local FLASH_ATTN_QY_CUDA_SO_CONTAINER_DEFAULT = + "/home/shangyouren/miniconda3/envs/xiaobase/lib/python3.12/site-packages/flash_attn_2_cuda.cpython-312-x86_64-linux-gnu.so" + +function _qy_flash_attn_cuda_so_path() + -- Highest priority: override the exact `.so` file to link. + local env_path = os.getenv("FLASH_ATTN_2_CUDA_SO") + if env_path and env_path ~= "" then + env_path = env_path:trim() + if not os.isfile(env_path) then + raise("qy+flash-attn: FLASH_ATTN_2_CUDA_SO is not a file: %s", env_path) + end + return env_path + end + + -- Second priority: allow overriding the "expected" container path via env. + local container_path = os.getenv("FLASH_ATTN_QY_CUDA_SO_CONTAINER") + if not container_path or container_path == "" then + container_path = FLASH_ATTN_QY_CUDA_SO_CONTAINER_DEFAULT + end + + if not os.isfile(container_path) then + raise( + "qy+flash-attn: expected %s\n Install flash-attn in the conda env, or export FLASH_ATTN_2_CUDA_SO to your .so path.", + container_path + ) + end + return container_path +end + add_includedirs("/usr/local/denglin/sdk/include", "../include") add_linkdirs("/usr/local/denglin/sdk/lib") add_links("curt", "cublas", "cudnn") @@ -177,89 +206,24 @@ target("infiniccl-qy") target_end() target("flash-attn-qy") - set_kind("shared") + set_kind("phony") set_default(false) + - set_languages("cxx17") - add_cxxflags("-std=c++17") - add_cuflags("--std=c++17", {force = true}) - - -- 🔥 DLCC 规则 - add_rules("qy.cuda", {override = true}) - - if FLASH_ATTN_ROOT and FLASH_ATTN_ROOT ~= false and FLASH_ATTN_ROOT ~= "" then - - -- ⭐⭐⭐ 关键:用 on_load(不是 before_build) - on_load(function (target) - + if FLASH_ATTN_ROOT and FLASH_ATTN_ROOT ~= "" then + before_build(function (target) + target:add("includedirs", "/usr/local/denglin/sdk/include", {public = true}) local TORCH_DIR = os.iorunv("python", {"-c", "import torch, os; print(os.path.dirname(torch.__file__))"}):trim() local PYTHON_INCLUDE = os.iorunv("python", {"-c", "import sysconfig; print(sysconfig.get_paths()['include'])"}):trim() local PYTHON_LIB_DIR = os.iorunv("python", {"-c", "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))"}):trim() - local LIB_PYTHON = os.iorunv("python", {"-c", "import glob,sysconfig,os;print(glob.glob(os.path.join(sysconfig.get_config_var('LIBDIR'),'libpython*.so'))[0])"}):trim() - - -- ✅ CUDA(最关键) - target:add("includedirs", "/usr/local/denglin/sdk/include", {public = true}) - - -- ✅ flash-attn - target:add("includedirs", FLASH_ATTN_ROOT .. "/csrc") - target:add("includedirs", FLASH_ATTN_ROOT .. "/csrc/flash_attn") - target:add("includedirs", FLASH_ATTN_ROOT .. "/csrc/flash_attn/src") - target:add("includedirs", FLASH_ATTN_ROOT .. "/csrc/common") - - -- ✅ torch - target:add("includedirs", TORCH_DIR .. "/include") - target:add("includedirs", TORCH_DIR .. "/include/torch/csrc/api/include") - - -- ⚠️ 很关键:ATen 有些头在这里 - target:add("includedirs", TORCH_DIR .. "/include/TH") - target:add("includedirs", TORCH_DIR .. "/include/THC") - - -- ✅ python - target:add("includedirs", PYTHON_INCLUDE) - - -- ✅ cutlass - if CUTLASS_ROOT then - target:add("includedirs", CUTLASS_ROOT .. "/include") - end - - -- link dirs - target:add("linkdirs", TORCH_DIR .. "/lib") - target:add("linkdirs", PYTHON_LIB_DIR) - target:add("linkdirs", "/usr/local/denglin/sdk/lib") - - -- links - target:add("links", - "curt", - "cublas", - "cudnn", - "torch", - "torch_cpu", - "torch_cuda", - "c10", - "c10_cuda", - "torch_python", - LIB_PYTHON - ) + + -- Validate build/runtime env in container and keep these paths available for downstream linking. + target:add("includedirs", TORCH_DIR .. "/include", TORCH_DIR .. "/include/torch/csrc/api/include", PYTHON_INCLUDE, {public = false}) + target:add("linkdirs", TORCH_DIR .. "/lib", PYTHON_LIB_DIR, {public = false}) end) - - -- ✅ C++ host - add_files(FLASH_ATTN_ROOT .. "/csrc/flash_attn/flash_api.cpp") - - -- ✅ CUDA kernel - add_files(FLASH_ATTN_ROOT .. "/csrc/flash_attn/src/*.cu") - - -- flags - add_cxflags("-fPIC", {force = true}) - add_cuflags("-O2", "-fPIC", "--expt-relaxed-constexpr", "--use_fast_math", {force = true}) - - add_ldflags("-Wl,--no-undefined", {force = true}) - else - on_load(function () - print("Flash Attention not available, skipping flash-attn-qy build") + before_build(function (target) + print("Flash Attention not available, skipping flash-attn-qy integration") end) end - - on_install(function (target) end) - target_end()