From 24927dedc66d5aa1c77d067961cd32d62eeeaf72 Mon Sep 17 00:00:00 2001 From: lengmuzhaxi <2690497440@qq.com> Date: Mon, 23 Mar 2026 16:53:46 +0800 Subject: [PATCH 1/2] issue/1031 merge T1-1-38 --- include/infinicore/ops/flipud.hpp | 19 ++ include/infinicore/ops/float_power.hpp | 68 ++++ include/infinicore/ops/floor_divide.hpp | 16 + include/infinicore/ops/multi_margin_loss.hpp | 19 ++ include/infinicore/ops/scatter.hpp | 21 ++ include/infiniop.h | 5 + include/infiniop/ops/flipud.h | 27 ++ include/infiniop/ops/float_power.h | 27 ++ include/infiniop/ops/floor_divide.h | 26 ++ include/infiniop/ops/multi_margin_loss.h | 30 ++ include/infiniop/ops/scatter.h | 30 ++ python/infinicore/__init__.py | 8 + python/infinicore/nn/functional/__init__.py | 3 +- .../nn/functional/multi_margin_loss.py | 62 ++++ python/infinicore/ops/flipud.py | 28 ++ python/infinicore/ops/float_power.py | 48 +++ python/infinicore/ops/floor_divide.py | 11 + python/infinicore/ops/scatter.py | 56 ++++ src/infinicore/ops/flipud/flipud.cc | 27 ++ src/infinicore/ops/flipud/flipud_infiniop.cc | 62 ++++ src/infinicore/ops/float_power/float_power.cc | 73 +++++ .../ops/float_power/float_power_infiniop.cc | 141 ++++++++ .../ops/floor_divide/floor_divide.cc | 27 ++ .../ops/floor_divide/floor_divide_infiniop.cc | 52 +++ .../multi_margin_loss/multi_margin_loss.cc | 34 ++ .../multi_margin_loss_infiniop.cc | 84 +++++ src/infinicore/ops/scatter/scatter.cc | 26 ++ .../ops/scatter/scatter_infiniop.cc | 73 +++++ src/infinicore/pybind11/ops.hpp | 10 + src/infinicore/pybind11/ops/flipud.hpp | 31 ++ src/infinicore/pybind11/ops/float_power.hpp | 55 ++++ src/infinicore/pybind11/ops/floor_divide.hpp | 26 ++ .../pybind11/ops/multi_margin_loss.hpp | 55 ++++ src/infinicore/pybind11/ops/scatter.hpp | 54 +++ src/infiniop/ops/flipud/cpu/flipud_cpu.cc | 171 ++++++++++ src/infiniop/ops/flipud/cpu/flipud_cpu.h | 8 + src/infiniop/ops/flipud/cuda/kernel.cuh | 95 ++++++ src/infiniop/ops/flipud/flipud.h | 48 +++ 
src/infiniop/ops/flipud/info.h | 60 ++++ src/infiniop/ops/flipud/metax/flipud_metax.h | 8 + .../ops/flipud/metax/flipud_metax.maca | 247 ++++++++++++++ src/infiniop/ops/flipud/moore/flipud_moore.h | 8 + src/infiniop/ops/flipud/moore/flipud_moore.mu | 156 +++++++++ .../ops/flipud/moore/flipud_moore_kernel.h | 97 ++++++ .../ops/flipud/nvidia/flipud_nvidia.cu | 155 +++++++++ .../ops/flipud/nvidia/flipud_nvidia.cuh | 7 + src/infiniop/ops/flipud/operator.cc | 176 ++++++++++ .../ops/float_power/cpu/float_power_cpu.cc | 148 +++++++++ .../ops/float_power/cpu/float_power_cpu.h | 7 + src/infiniop/ops/float_power/cuda/kernel.cuh | 107 ++++++ src/infiniop/ops/float_power/float_power.h | 52 +++ src/infiniop/ops/float_power/info.h | 83 +++++ .../ops/float_power/metax/float_power_metax.h | 8 + .../float_power/metax/float_power_metax.maca | 309 ++++++++++++++++++ .../ops/float_power/moore/float_power_moore.h | 8 + .../float_power/moore/float_power_moore.mu | 204 ++++++++++++ .../moore/float_power_moore_kernel.h | 147 +++++++++ .../float_power/nvidia/float_power_nvidia.cu | 202 ++++++++++++ .../float_power/nvidia/float_power_nvidia.cuh | 7 + src/infiniop/ops/float_power/operator.cc | 180 ++++++++++ .../ops/floor_divide/cpu/floor_divide_cpu.cc | 58 ++++ .../ops/floor_divide/cpu/floor_divide_cpu.h | 30 ++ src/infiniop/ops/floor_divide/cuda/kernel.cuh | 36 ++ .../floor_divide/metax/floor_divide_metax.h | 8 + .../metax/floor_divide_metax.maca | 124 +++++++ .../floor_divide/moore/floor_divide_moore.h | 8 + .../floor_divide/moore/floor_divide_moore.mu | 69 ++++ .../moore/floor_divide_moore_kernel.h | 39 +++ .../nvidia/floor_divide_nvidia.cu | 65 ++++ .../nvidia/floor_divide_nvidia.cuh | 8 + src/infiniop/ops/floor_divide/operator.cc | 202 ++++++++++++ .../cpu/multi_margin_loss_cpu.cc | 175 ++++++++++ .../cpu/multi_margin_loss_cpu.h | 8 + .../ops/multi_margin_loss/cuda/kernel.cuh | 166 ++++++++++ src/infiniop/ops/multi_margin_loss/info.h | 104 ++++++ .../metax/multi_margin_loss_metax.h | 8 
+ .../metax/multi_margin_loss_metax.maca | 304 +++++++++++++++++ .../moore/multi_margin_loss_moore.h | 8 + .../moore/multi_margin_loss_moore.mu | 158 +++++++++ .../moore/multi_margin_loss_moore_kernel.h | 195 +++++++++++ .../ops/multi_margin_loss/multi_margin_loss.h | 54 +++ .../nvidia/multi_margin_loss_nvidia.cu | 144 ++++++++ .../nvidia/multi_margin_loss_nvidia.cuh | 7 + .../ops/multi_margin_loss/operator.cc | 184 +++++++++++ src/infiniop/ops/scatter/cpu/scatter_cpu.cc | 195 +++++++++++ src/infiniop/ops/scatter/cpu/scatter_cpu.h | 8 + src/infiniop/ops/scatter/cuda/kernel.cuh | 95 ++++++ src/infiniop/ops/scatter/info.h | 97 ++++++ .../ops/scatter/metax/scatter_metax.h | 8 + .../ops/scatter/metax/scatter_metax.maca | 279 ++++++++++++++++ .../ops/scatter/moore/scatter_moore.h | 8 + .../ops/scatter/moore/scatter_moore.mu | 186 +++++++++++ .../ops/scatter/moore/scatter_moore_kernel.h | 103 ++++++ .../ops/scatter/nvidia/scatter_nvidia.cu | 185 +++++++++++ .../ops/scatter/nvidia/scatter_nvidia.cuh | 8 + src/infiniop/ops/scatter/operator.cc | 186 +++++++++++ src/infiniop/ops/scatter/scatter.h | 53 +++ test/infinicore/ops/flipud.py | 5 +- test/infinicore/ops/float_power.py | 5 +- test/infinicore/ops/floor_divide.py | 5 +- test/infinicore/ops/multi_margin_loss.py | 2 +- test/infinicore/ops/scatter.py | 5 +- test/infinicore/ops/triplet_margin_loss.py | 5 +- 103 files changed, 7615 insertions(+), 17 deletions(-) create mode 100644 include/infinicore/ops/flipud.hpp create mode 100644 include/infinicore/ops/float_power.hpp create mode 100644 include/infinicore/ops/floor_divide.hpp create mode 100644 include/infinicore/ops/multi_margin_loss.hpp create mode 100644 include/infinicore/ops/scatter.hpp create mode 100644 include/infiniop/ops/flipud.h create mode 100644 include/infiniop/ops/float_power.h create mode 100644 include/infiniop/ops/floor_divide.h create mode 100644 include/infiniop/ops/multi_margin_loss.h create mode 100644 include/infiniop/ops/scatter.h create mode 
100644 python/infinicore/nn/functional/multi_margin_loss.py create mode 100644 python/infinicore/ops/flipud.py create mode 100644 python/infinicore/ops/float_power.py create mode 100644 python/infinicore/ops/floor_divide.py create mode 100644 python/infinicore/ops/scatter.py create mode 100644 src/infinicore/ops/flipud/flipud.cc create mode 100644 src/infinicore/ops/flipud/flipud_infiniop.cc create mode 100644 src/infinicore/ops/float_power/float_power.cc create mode 100644 src/infinicore/ops/float_power/float_power_infiniop.cc create mode 100644 src/infinicore/ops/floor_divide/floor_divide.cc create mode 100644 src/infinicore/ops/floor_divide/floor_divide_infiniop.cc create mode 100644 src/infinicore/ops/multi_margin_loss/multi_margin_loss.cc create mode 100644 src/infinicore/ops/multi_margin_loss/multi_margin_loss_infiniop.cc create mode 100644 src/infinicore/ops/scatter/scatter.cc create mode 100644 src/infinicore/ops/scatter/scatter_infiniop.cc create mode 100644 src/infinicore/pybind11/ops/flipud.hpp create mode 100644 src/infinicore/pybind11/ops/float_power.hpp create mode 100644 src/infinicore/pybind11/ops/floor_divide.hpp create mode 100644 src/infinicore/pybind11/ops/multi_margin_loss.hpp create mode 100644 src/infinicore/pybind11/ops/scatter.hpp create mode 100644 src/infiniop/ops/flipud/cpu/flipud_cpu.cc create mode 100644 src/infiniop/ops/flipud/cpu/flipud_cpu.h create mode 100644 src/infiniop/ops/flipud/cuda/kernel.cuh create mode 100644 src/infiniop/ops/flipud/flipud.h create mode 100644 src/infiniop/ops/flipud/info.h create mode 100644 src/infiniop/ops/flipud/metax/flipud_metax.h create mode 100644 src/infiniop/ops/flipud/metax/flipud_metax.maca create mode 100644 src/infiniop/ops/flipud/moore/flipud_moore.h create mode 100644 src/infiniop/ops/flipud/moore/flipud_moore.mu create mode 100644 src/infiniop/ops/flipud/moore/flipud_moore_kernel.h create mode 100644 src/infiniop/ops/flipud/nvidia/flipud_nvidia.cu create mode 100644 
src/infiniop/ops/flipud/nvidia/flipud_nvidia.cuh create mode 100644 src/infiniop/ops/flipud/operator.cc create mode 100644 src/infiniop/ops/float_power/cpu/float_power_cpu.cc create mode 100644 src/infiniop/ops/float_power/cpu/float_power_cpu.h create mode 100644 src/infiniop/ops/float_power/cuda/kernel.cuh create mode 100644 src/infiniop/ops/float_power/float_power.h create mode 100644 src/infiniop/ops/float_power/info.h create mode 100644 src/infiniop/ops/float_power/metax/float_power_metax.h create mode 100644 src/infiniop/ops/float_power/metax/float_power_metax.maca create mode 100644 src/infiniop/ops/float_power/moore/float_power_moore.h create mode 100644 src/infiniop/ops/float_power/moore/float_power_moore.mu create mode 100644 src/infiniop/ops/float_power/moore/float_power_moore_kernel.h create mode 100644 src/infiniop/ops/float_power/nvidia/float_power_nvidia.cu create mode 100644 src/infiniop/ops/float_power/nvidia/float_power_nvidia.cuh create mode 100644 src/infiniop/ops/float_power/operator.cc create mode 100644 src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.cc create mode 100644 src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.h create mode 100644 src/infiniop/ops/floor_divide/cuda/kernel.cuh create mode 100644 src/infiniop/ops/floor_divide/metax/floor_divide_metax.h create mode 100644 src/infiniop/ops/floor_divide/metax/floor_divide_metax.maca create mode 100644 src/infiniop/ops/floor_divide/moore/floor_divide_moore.h create mode 100644 src/infiniop/ops/floor_divide/moore/floor_divide_moore.mu create mode 100644 src/infiniop/ops/floor_divide/moore/floor_divide_moore_kernel.h create mode 100644 src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cu create mode 100644 src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cuh create mode 100644 src/infiniop/ops/floor_divide/operator.cc create mode 100644 src/infiniop/ops/multi_margin_loss/cpu/multi_margin_loss_cpu.cc create mode 100644 
src/infiniop/ops/multi_margin_loss/cpu/multi_margin_loss_cpu.h create mode 100644 src/infiniop/ops/multi_margin_loss/cuda/kernel.cuh create mode 100644 src/infiniop/ops/multi_margin_loss/info.h create mode 100644 src/infiniop/ops/multi_margin_loss/metax/multi_margin_loss_metax.h create mode 100644 src/infiniop/ops/multi_margin_loss/metax/multi_margin_loss_metax.maca create mode 100644 src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore.h create mode 100644 src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore.mu create mode 100644 src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore_kernel.h create mode 100644 src/infiniop/ops/multi_margin_loss/multi_margin_loss.h create mode 100644 src/infiniop/ops/multi_margin_loss/nvidia/multi_margin_loss_nvidia.cu create mode 100644 src/infiniop/ops/multi_margin_loss/nvidia/multi_margin_loss_nvidia.cuh create mode 100644 src/infiniop/ops/multi_margin_loss/operator.cc create mode 100644 src/infiniop/ops/scatter/cpu/scatter_cpu.cc create mode 100644 src/infiniop/ops/scatter/cpu/scatter_cpu.h create mode 100644 src/infiniop/ops/scatter/cuda/kernel.cuh create mode 100644 src/infiniop/ops/scatter/info.h create mode 100644 src/infiniop/ops/scatter/metax/scatter_metax.h create mode 100644 src/infiniop/ops/scatter/metax/scatter_metax.maca create mode 100644 src/infiniop/ops/scatter/moore/scatter_moore.h create mode 100644 src/infiniop/ops/scatter/moore/scatter_moore.mu create mode 100644 src/infiniop/ops/scatter/moore/scatter_moore_kernel.h create mode 100644 src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu create mode 100644 src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh create mode 100644 src/infiniop/ops/scatter/operator.cc create mode 100644 src/infiniop/ops/scatter/scatter.h diff --git a/include/infinicore/ops/flipud.hpp b/include/infinicore/ops/flipud.hpp new file mode 100644 index 000000000..9f00cf71c --- /dev/null +++ b/include/infinicore/ops/flipud.hpp @@ -0,0 +1,19 @@ +#pragma once + 
+#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Flipud { +public: + // Schema signature: (Output, Input) + using schema = void (*)(Tensor, Tensor); + + static void execute(Tensor output, Tensor input); + static common::OpDispatcher &dispatcher(); +}; +Tensor flipud(Tensor input); +void flipud_(Tensor output, Tensor input); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/float_power.hpp b/include/infinicore/ops/float_power.hpp new file mode 100644 index 000000000..69e0586a1 --- /dev/null +++ b/include/infinicore/ops/float_power.hpp @@ -0,0 +1,68 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class FloatPower { +public: + // ========================================================== + // Dispatcher Schemas + // ========================================================== + + // Output = Input ^ Scalar (scalar must be double!) + using schema_scalar = void (*)(Tensor output, + Tensor input, + double exponent); + + // Output = Input ^ Tensor + using schema_tensor = void (*)(Tensor output, + Tensor input, + Tensor exponent); + + // ========================================================== + // Execute Entry Points (called by functional interface) + // ========================================================== + + static void execute(Tensor output, + Tensor input, + double exponent); + + static void execute(Tensor output, + Tensor input, + Tensor exponent); + + // ========================================================== + // Dispatchers + // ========================================================== + + static common::OpDispatcher& dispatcher_scalar(); + static common::OpDispatcher& dispatcher_tensor(); +}; + +// ======================================================================= +// Functional Interface (Python-visible semantics) +// ======================================================================= + +// 
------------------------------- +// 1. Scalar Exponent +// ------------------------------- + +// out-of-place: ALWAYS float64 +Tensor float_power(Tensor input, double exponent); + +// in-place +void float_power_(Tensor output, Tensor input, double exponent); + +// ------------------------------- +// 2. Tensor Exponent +// ------------------------------- + +// out-of-place: ALWAYS float64 +Tensor float_power(Tensor input, Tensor exponent); + +// in-place +void float_power_(Tensor output, Tensor input, Tensor exponent); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/floor_divide.hpp b/include/infinicore/ops/floor_divide.hpp new file mode 100644 index 000000000..836652d76 --- /dev/null +++ b/include/infinicore/ops/floor_divide.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class FloorDivide { +public: + using schema = void (*)(Tensor, Tensor, Tensor); + static void execute(Tensor c, Tensor a, Tensor b); + static common::OpDispatcher &dispatcher(); +}; + +Tensor floor_divide(Tensor a, Tensor b); +void floor_divide_(Tensor c, Tensor a, Tensor b); +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/multi_margin_loss.hpp b/include/infinicore/ops/multi_margin_loss.hpp new file mode 100644 index 000000000..a1b297114 --- /dev/null +++ b/include/infinicore/ops/multi_margin_loss.hpp @@ -0,0 +1,19 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class MultiMarginLoss { +public: + using schema = void (*)(Tensor, Tensor, Tensor, Tensor, int64_t, float, int64_t); + + static void execute(Tensor output, Tensor input, Tensor target, Tensor weight, int64_t p, float margin, int64_t reduction); + static common::OpDispatcher &dispatcher(); +}; + +Tensor multi_margin_loss(Tensor input, Tensor target, Tensor weight = {}, int64_t p = 1, float margin = 1.0f, int64_t reduction = 1); +void 
multi_margin_loss_(Tensor output, Tensor input, Tensor target, Tensor weight, int64_t p, float margin, int64_t reduction); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/scatter.hpp b/include/infinicore/ops/scatter.hpp new file mode 100644 index 000000000..a9efe6ca2 --- /dev/null +++ b/include/infinicore/ops/scatter.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Scatter { +public: + using schema = void (*)(Tensor, Tensor, int64_t, Tensor, Tensor, int64_t); + + static void execute(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor src, int64_t reduction); + static common::OpDispatcher &dispatcher(); +}; + +Tensor scatter(Tensor input, int64_t dim, Tensor index, Tensor src, int64_t reduction = 0); + +// In-place / 指定 Output 接口 +void scatter_(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor src, int64_t reduction); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infiniop.h b/include/infiniop.h index 6aaeee6a4..7716487e1 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -60,6 +60,11 @@ #include "infiniop/ops/swiglu.h" #include "infiniop/ops/tanh.h" #include "infiniop/ops/topk.h" +#include "infiniop/ops/floor_divide.h" +#include "infiniop/ops/float_power.h" +#include "infiniop/ops/flipud.h" +#include "infiniop/ops/scatter.h" +#include "infiniop/ops/multi_margin_loss.h" #include "infiniop/ops/topkrouter.h" #include "infiniop/ops/topksoftmax.h" #include "infiniop/ops/var.h" diff --git a/include/infiniop/ops/flipud.h b/include/infiniop/ops/flipud.h new file mode 100644 index 000000000..6ff33c17c --- /dev/null +++ b/include/infiniop/ops/flipud.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_FLIPUD_API_H__ +#define __INFINIOP_FLIPUD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopFlipudDescriptor_t; + +__C __export infiniStatus_t 
infiniopCreateFlipudDescriptor(infiniopHandle_t handle, + infiniopFlipudDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +// 获取工作空间大小 +__C __export infiniStatus_t infiniopGetFlipudWorkspaceSize(infiniopFlipudDescriptor_t desc, size_t *size); + +// 执行 Flipud 算子 +__C __export infiniStatus_t infiniopFlipud(infiniopFlipudDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +// 销毁描述符 +__C __export infiniStatus_t infiniopDestroyFlipudDescriptor(infiniopFlipudDescriptor_t desc); + +#endif // __INFINIOP_FLIPUD_API_H__ \ No newline at end of file diff --git a/include/infiniop/ops/float_power.h b/include/infiniop/ops/float_power.h new file mode 100644 index 000000000..5d8fb9bf5 --- /dev/null +++ b/include/infiniop/ops/float_power.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_FLOAT_POWER_API_H__ +#define __INFINIOP_FLOAT_POWER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopFloatPowerDescriptor_t; + +__C __export infiniStatus_t infiniopCreateFloatPowerDescriptor(infiniopHandle_t handle, + infiniopFloatPowerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t exponent, + float scalar_exponent); + +__C __export infiniStatus_t infiniopGetFloatPowerWorkspaceSize(infiniopFloatPowerDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopFloatPower(infiniopFloatPowerDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *exponent, + void *stream); + +__C __export infiniStatus_t infiniopDestroyFloatPowerDescriptor(infiniopFloatPowerDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/floor_divide.h b/include/infiniop/ops/floor_divide.h new file mode 100644 index 000000000..4b59a52e5 --- /dev/null +++ b/include/infiniop/ops/floor_divide.h @@ -0,0 +1,26 @@ +#ifndef 
__INFINIOP_FLOOR_DIVIDE_API_H__ +#define __INFINIOP_FLOOR_DIVIDE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopFloorDivideDescriptor_t; + +__C __export infiniStatus_t infiniopCreateFloorDivideDescriptor(infiniopHandle_t handle, + infiniopFloorDivideDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetFloorDivideWorkspaceSize(infiniopFloorDivideDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopFloorDivide(infiniopFloorDivideDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyFloorDivideDescriptor(infiniopFloorDivideDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/multi_margin_loss.h b/include/infiniop/ops/multi_margin_loss.h new file mode 100644 index 000000000..cc4f9f0eb --- /dev/null +++ b/include/infiniop/ops/multi_margin_loss.h @@ -0,0 +1,30 @@ +#ifndef __INFINIOP_MULTI_MARGIN_LOSS_API_H__ +#define __INFINIOP_MULTI_MARGIN_LOSS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMultiMarginLossDescriptor_t; +__C __export infiniStatus_t infiniopCreateMultiMarginLossDescriptor(infiniopHandle_t handle, + infiniopMultiMarginLossDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t target, + infiniopTensorDescriptor_t weight, + int p, + float margin, + int reduction); + +__C __export infiniStatus_t infiniopGetMultiMarginLossWorkspaceSize(infiniopMultiMarginLossDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMultiMarginLoss(infiniopMultiMarginLossDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *target, + const void *weight, + void 
*stream); + +__C __export infiniStatus_t infiniopDestroyMultiMarginLossDescriptor(infiniopMultiMarginLossDescriptor_t desc); + +#endif // __INFINIOP_MULTI_MARGIN_LOSS_API_H__ \ No newline at end of file diff --git a/include/infiniop/ops/scatter.h b/include/infiniop/ops/scatter.h new file mode 100644 index 000000000..d2b6b992b --- /dev/null +++ b/include/infiniop/ops/scatter.h @@ -0,0 +1,30 @@ +#ifndef __INFINIOP_SCATTER_API_H__ +#define __INFINIOP_SCATTER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopScatterDescriptor_t; + +__C __export infiniStatus_t infiniopCreateScatterDescriptor(infiniopHandle_t handle, + infiniopScatterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t indices, + infiniopTensorDescriptor_t updates, + int axis, + int reduction); + +__C __export infiniStatus_t infiniopGetScatterWorkspaceSize(infiniopScatterDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopScatter(infiniopScatterDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *indices, + const void *updates, + void *stream); + +__C __export infiniStatus_t infiniopDestroyScatterDescriptor(infiniopScatterDescriptor_t desc); + +#endif // __INFINIOP_SCATTER_API_H__ \ No newline at end of file diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index 0cc9c2dda..b16f847a5 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -78,6 +78,10 @@ from infinicore.ops.paged_caching import paged_caching from infinicore.ops.rearrange import rearrange from infinicore.ops.reciprocal import reciprocal +from infinicore.ops.floor_divide import floor_divide +from infinicore.ops.float_power import float_power +from infinicore.ops.flipud import flipud +from infinicore.ops.scatter import scatter from infinicore.ops.squeeze import squeeze from infinicore.ops.sum 
import sum from infinicore.ops.topk import topk @@ -175,6 +179,10 @@ "mha_kvcache", "mha_varlen", "fmin", + "floor_divide", + "float_power", + "flipud", + "scatter", "paged_caching", "paged_attention", "paged_attention_prefill", diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index f59d909b2..1fbacbb60 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -15,7 +15,7 @@ from .silu import silu from .silu_and_mul import silu_and_mul from .swiglu import swiglu - +from .multi_margin_loss import multi_margin_loss __all__ = [ "adaptive_max_pool1d", "causal_softmax", @@ -35,4 +35,5 @@ "linear_w8a8i8", "silu_and_mul", "adaptive_avg_pool3d", + "multi_margin_loss", ] diff --git a/python/infinicore/nn/functional/multi_margin_loss.py b/python/infinicore/nn/functional/multi_margin_loss.py new file mode 100644 index 000000000..f06bb1be7 --- /dev/null +++ b/python/infinicore/nn/functional/multi_margin_loss.py @@ -0,0 +1,62 @@ +from typing import Optional +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +_REDUCTION_MODES = { + "none": 0, + "mean": 1, + "sum": 2, +} + +def multi_margin_loss( + input: Tensor, + target: Tensor, + p: int = 1, + margin: float = 1.0, + weight: Optional[Tensor] = None, + reduction: str = "mean", + *, + out: Optional[Tensor] = None +) -> Tensor: + r"""Creates a criterion that optimizes a multi-class classification hinge + loss (margin-based loss) between input x and output y. 
+ """ + + if not input.is_contiguous(): + input = input.contiguous() + if not target.is_contiguous(): + target = target.contiguous() + + + weight_underlying = None + if weight is not None: + if not weight.is_contiguous(): + weight = weight.contiguous() + weight_underlying = weight._underlying + + # 解析 reduction 参数 + if reduction not in _REDUCTION_MODES: + raise ValueError(f"{reduction} is not a valid value for reduction") + reduction_val = _REDUCTION_MODES[reduction] + if out is not None: + _infinicore.multi_margin_loss_( + out._underlying, + input._underlying, + target._underlying, + weight_underlying, + p, + margin, + reduction_val + ) + return out + + return Tensor( + _infinicore.multi_margin_loss( + input._underlying, + target._underlying, + weight_underlying, + p, + margin, + reduction_val + ) + ) \ No newline at end of file diff --git a/python/infinicore/ops/flipud.py b/python/infinicore/ops/flipud.py new file mode 100644 index 000000000..bdb01ea69 --- /dev/null +++ b/python/infinicore/ops/flipud.py @@ -0,0 +1,28 @@ +from typing import Optional +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def flipud( + input: Tensor, + *, + out: Optional[Tensor] = None +) -> Tensor: + r"""Flip array in the up/down direction. + + Flips the entries in axis 0 (preserving the shape). + + Args: + input (Tensor): the input tensor. + out (Tensor, optional): the output tensor. + + Returns: + Tensor: The flipped tensor. 
+ """ + if not input.is_contiguous(): + input = input.contiguous() + if out is not None: + _infinicore.flipud_(out._underlying, input._underlying) + return out + return Tensor( + _infinicore.flipud(input._underlying) + ) \ No newline at end of file diff --git a/python/infinicore/ops/float_power.py b/python/infinicore/ops/float_power.py new file mode 100644 index 000000000..f67b7ac58 --- /dev/null +++ b/python/infinicore/ops/float_power.py @@ -0,0 +1,48 @@ +from typing import Optional +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def float_power( + input: Tensor, + exponent: float, + *, + out: Optional[Tensor] = None +) -> Tensor: + r"""Computes the power of each element in input with the given exponent. + + .. math:: + \text{out}_i = \text{input}_i^{\text{exponent}} + + Args: + input (Tensor): the input tensor. + exponent (float): the exponent value. + out (Tensor, optional): the output tensor. + + Returns: + Tensor: The result tensor. + """ + + # 1. 确保输入内存连续 (Contiguous check) + if not input.is_contiguous(): + input = input.contiguous() + + # 2. 
分发计算 + # 如果用户提供了 output tensor,调用底层的 in-place/explicit 接口 + if out is not None: + if not out.is_contiguous(): + raise RuntimeError("Output tensor must be contiguous") + + _infinicore.float_power_( + out._underlying, + input._underlying, + exponent + ) + return out + + # 否则调用底层的 functional 接口,返回新 Tensor + return Tensor( + _infinicore.float_power( + input._underlying, + exponent + ) + ) \ No newline at end of file diff --git a/python/infinicore/ops/floor_divide.py b/python/infinicore/ops/floor_divide.py new file mode 100644 index 000000000..1d76e0c05 --- /dev/null +++ b/python/infinicore/ops/floor_divide.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def floor_divide(input, other, *, out=None): + if out is None: + return Tensor(_infinicore.floor_divide(input._underlying, other._underlying)) + + _infinicore.floor_divide_(out._underlying, input._underlying, other._underlying) + + return out \ No newline at end of file diff --git a/python/infinicore/ops/scatter.py b/python/infinicore/ops/scatter.py new file mode 100644 index 000000000..bc4a6c969 --- /dev/null +++ b/python/infinicore/ops/scatter.py @@ -0,0 +1,56 @@ +from typing import Optional +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +# Scatter 算子常用的 reduction 模式 +_SCATTER_REDUCTION_MODES = { + "none": 0, # 直接赋值/覆盖 + "add": 1, # 累加 + "multiply": 2, # 累乘 +} +def scatter( + input: Tensor, + index: Tensor, + src: Tensor, + dim: int, + reduction: str = "none", + *, + out: Optional[Tensor] = None +) -> Tensor: + r"""Writes all values from the tensor src into input at the indices specified in the index tensor. 
+ """ + + if not input.is_contiguous(): + input = input.contiguous() + if not index.is_contiguous(): + index = index.contiguous() + if not src.is_contiguous(): + src = src.contiguous() + + # 解析 reduction 参数 + if reduction not in _SCATTER_REDUCTION_MODES: + raise ValueError(f"{reduction} is not a valid value for reduction") + reduction_val = _SCATTER_REDUCTION_MODES[reduction] + + # In-place 分支 (scatter_) + if out is not None: + _infinicore.scatter_( + out._underlying, + input._underlying, + index._underlying, # index (第3个) + src._underlying, # src (第4个) + dim, # dim (第5个) + reduction_val + ) + return out + + # Out-of-place 分支 (scatter) + return Tensor( + _infinicore.scatter( + input._underlying, + index._underlying, # index (第2个) + src._underlying, # src (第3个) + dim, # dim (第4个) + reduction_val + ) + ) \ No newline at end of file diff --git a/src/infinicore/ops/flipud/flipud.cc b/src/infinicore/ops/flipud/flipud.cc new file mode 100644 index 000000000..3d1ea08fb --- /dev/null +++ b/src/infinicore/ops/flipud/flipud.cc @@ -0,0 +1,27 @@ +#include "infinicore/ops/flipud.hpp" + +namespace infinicore::op { + +// 1. 定义 Dispatcher 单例 +common::OpDispatcher &Flipud::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +// 2. 
静态执行函数 +void Flipud::execute(Tensor output, Tensor input) { + dispatcher().lookup(context::getDevice().getType())(output, input); +} +Tensor flipud(Tensor input) { + // Flipud 操作不改变张量的形状和数据类型 + // Output shape == Input shape + auto output = Tensor::empty(input->shape(), input->dtype(), input->device()); + + flipud_(output, input); + return output; +} +void flipud_(Tensor output, Tensor input) { + Flipud::execute(output, input); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/flipud/flipud_infiniop.cc b/src/infinicore/ops/flipud/flipud_infiniop.cc new file mode 100644 index 000000000..eaf5651ce --- /dev/null +++ b/src/infinicore/ops/flipud/flipud_infiniop.cc @@ -0,0 +1,62 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/flipud.hpp" +#include + +namespace infinicore::op::flipud_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopFlipudDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyFlipudDescriptor(desc)); + desc = nullptr; + } + }); + +// 执行函数 +void calculate(Tensor output, Tensor input) { + // 1. 计算缓存 Key + size_t seed = hash_combine(output, input); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopFlipudDescriptor_t desc = nullptr; + + // 2. 
获取或创建描述符 + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateFlipudDescriptor( + context::getInfiniopHandle(output->device()), + &desc, + output->desc(), + input->desc() + )); + + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetFlipudWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + INFINICORE_CHECK_ERROR(infiniopFlipud( + desc, + workspace->data(), + workspace_size, + output->data(), + input->data(), + context::getStream() + )); +} + +static bool registered = []() { + Flipud::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::flipud_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/float_power/float_power.cc b/src/infinicore/ops/float_power/float_power.cc new file mode 100644 index 000000000..9ef406e16 --- /dev/null +++ b/src/infinicore/ops/float_power/float_power.cc @@ -0,0 +1,73 @@ +#include "infinicore/ops/float_power.hpp" +#include "infinicore/tensor.hpp" + +namespace infinicore::op { + +// ======================================================================= +// 1. Dispatcher 单例 +// ======================================================================= + +common::OpDispatcher& FloatPower::dispatcher_scalar() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +common::OpDispatcher& FloatPower::dispatcher_tensor() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +// ======================================================================= +// 2. 
Execute (执行入口) +// ======================================================================= + +void FloatPower::execute(Tensor output, Tensor input, double exponent) { + dispatcher_scalar() + .lookup(context::getDevice().getType())(output, input, exponent); +} + +void FloatPower::execute(Tensor output, Tensor input, Tensor exponent) { + dispatcher_tensor() + .lookup(context::getDevice().getType())(output, input, exponent); +} + +// ======================================================================= +// 3. Functional interface (out-of-place) -> 强制提升为 F64 +// ======================================================================= + +Tensor float_power(Tensor input, double exponent) { + auto output = Tensor::empty( + input->shape(), + infinicore::DataType::F64, + input->device() + ); + + float_power_(output, input, exponent); + return output; +} + +Tensor float_power(Tensor input, Tensor exponent) { + Shape output_shape = input->shape(); + auto output = Tensor::empty( + output_shape, + infinicore::DataType::F64, + input->device() + ); + + float_power_(output, input, exponent); + return output; +} + +// ======================================================================= +// 4. 
Explicit / in-place +// ======================================================================= + +void float_power_(Tensor output, Tensor input, double exponent) { + FloatPower::execute(output, input, exponent); +} + +void float_power_(Tensor output, Tensor input, Tensor exponent) { + FloatPower::execute(output, input, exponent); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/float_power/float_power_infiniop.cc b/src/infinicore/ops/float_power/float_power_infiniop.cc new file mode 100644 index 000000000..4bb47a655 --- /dev/null +++ b/src/infinicore/ops/float_power/float_power_infiniop.cc @@ -0,0 +1,141 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/float_power.hpp" +#include + +namespace infinicore::op::float_power_impl::infiniop { + +// ======================================================================= +// Descriptor Cache +// ======================================================================= + +thread_local common::OpCache caches( + 100, + [](infiniopFloatPowerDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR( + infiniopDestroyFloatPowerDescriptor(desc)); + desc = nullptr; + } + } +); + +// ======================================================================= +// 1. 
Scalar Exponent +// ======================================================================= + +void calculate_scalar(Tensor output, + Tensor input, + double exponent) +{ + // Hash: output / input meta + double exponent + size_t seed = hash_combine(output, input, exponent); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopFloatPowerDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR( + infiniopCreateFloatPowerDescriptor( + context::getInfiniopHandle(output->device()), + &desc, + output->desc(), + input->desc(), + nullptr, // exponent tensor descriptor = null + static_cast(exponent) + ) + ); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR( + infiniopGetFloatPowerWorkspaceSize(desc, &workspace_size)); + + std::shared_ptr workspace = + context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR( + infiniopFloatPower( + desc, + workspace->data(), + workspace_size, + output->data(), + input->data(), + nullptr, // exponent data pointer = null + context::getStream() + ) + ); +} + +// ======================================================================= +// 2. 
Tensor Exponent +// ======================================================================= + +void calculate_tensor(Tensor output, + Tensor input, + Tensor exponent) +{ + size_t seed = hash_combine(output, input, exponent); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopFloatPowerDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR( + infiniopCreateFloatPowerDescriptor( + context::getInfiniopHandle(output->device()), + &desc, + output->desc(), + input->desc(), + exponent->desc(), // tensor exponent + 0.0f // scalar ignored + ) + ); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR( + infiniopGetFloatPowerWorkspaceSize(desc, &workspace_size)); + + std::shared_ptr workspace = + context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR( + infiniopFloatPower( + desc, + workspace->data(), + workspace_size, + output->data(), + input->data(), + exponent->data(), + context::getStream() + ) + ); +} + +// ======================================================================= +// 3. 
Dispatcher Registration +// ======================================================================= + +static bool registered = []() { + FloatPower::dispatcher_scalar().registerAll(&calculate_scalar, false); + FloatPower::dispatcher_tensor().registerAll(&calculate_tensor, false); + return true; +}(); + +} // namespace infinicore::op::float_power_impl::infiniop diff --git a/src/infinicore/ops/floor_divide/floor_divide.cc b/src/infinicore/ops/floor_divide/floor_divide.cc new file mode 100644 index 000000000..9a1ed9d33 --- /dev/null +++ b/src/infinicore/ops/floor_divide/floor_divide.cc @@ -0,0 +1,27 @@ +#include "infinicore/ops/floor_divide.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &FloorDivide::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void FloorDivide::execute(Tensor c, Tensor a, Tensor b) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b); + infinicore::context::setDevice(c->device()); + dispatcher().lookup(c->device().getType())(c, a, b); +} + +Tensor floor_divide(Tensor a, Tensor b) { + auto c = Tensor::empty(a->shape(), a->dtype(), a->device()); + floor_divide_(c, a, b); + return c; +} + +void floor_divide_(Tensor c, Tensor a, Tensor b) { + FloorDivide::execute(c, a, b); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/floor_divide/floor_divide_infiniop.cc b/src/infinicore/ops/floor_divide/floor_divide_infiniop.cc new file mode 100644 index 000000000..f4caeeb79 --- /dev/null +++ b/src/infinicore/ops/floor_divide/floor_divide_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/floor_divide.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::floor_divide_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopFloorDivideDescriptor_t &desc) { + if (desc != nullptr) { + 
INFINICORE_CHECK_ERROR(infiniopDestroyFloorDivideDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor c, Tensor a, Tensor b) { + size_t seed = hash_combine(c, b, a); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopFloorDivideDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateFloorDivideDescriptor( + context::getInfiniopHandle(c->device()), &desc, + c->desc(), a->desc(), b->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetFloorDivideWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopFloorDivide( + desc, workspace->data(), workspace_size, + c->data(), a->data(), b->data(), context::getStream())); +} + +static bool registered = []() { + FloorDivide::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::floor_divide_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/multi_margin_loss/multi_margin_loss.cc b/src/infinicore/ops/multi_margin_loss/multi_margin_loss.cc new file mode 100644 index 000000000..31bd9f3f6 --- /dev/null +++ b/src/infinicore/ops/multi_margin_loss/multi_margin_loss.cc @@ -0,0 +1,34 @@ +#include "infinicore/ops/multi_margin_loss.hpp" + +namespace infinicore::op { + +// 1. 定义 Dispatcher 单例 +common::OpDispatcher &MultiMarginLoss::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void MultiMarginLoss::execute(Tensor output, Tensor input, Tensor target, Tensor weight, int64_t p, float margin, int64_t reduction) { + dispatcher().lookup(context::getDevice().getType())(output, input, target, weight, p, margin, reduction); +} + +// 3. 
函数式接口 +Tensor multi_margin_loss(Tensor input, Tensor target, Tensor weight, int64_t p, float margin, int64_t reduction) { + Shape output_shape; + if (reduction == 0) { // None + output_shape = {input->shape()[0]}; + } else { + output_shape = {}; // Scalar + } + + auto output = Tensor::empty(output_shape, input->dtype(), input->device()); + + multi_margin_loss_(output, input, target, weight, p, margin, reduction); + return output; +} + +void multi_margin_loss_(Tensor output, Tensor input, Tensor target, Tensor weight, int64_t p, float margin, int64_t reduction) { + MultiMarginLoss::execute(output, input, target, weight, p, margin, reduction); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/multi_margin_loss/multi_margin_loss_infiniop.cc b/src/infinicore/ops/multi_margin_loss/multi_margin_loss_infiniop.cc new file mode 100644 index 000000000..0ae3a1590 --- /dev/null +++ b/src/infinicore/ops/multi_margin_loss/multi_margin_loss_infiniop.cc @@ -0,0 +1,84 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/multi_margin_loss.hpp" +#include + +namespace infinicore::op::multi_margin_loss_impl::infiniop { + +// 定义描述符缓存 +thread_local common::OpCache caches( + 100, // capacity + [](infiniopMultiMarginLossDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyMultiMarginLossDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input, Tensor target, Tensor weight, int64_t p, float margin, int64_t reduction) { + bool has_weight = static_cast(weight); + size_t seed; + if (has_weight) { + seed = hash_combine(output, input, target, weight, p, margin, reduction); + } else { + seed = hash_combine(output, input, target, size_t(0), p, margin, reduction); + } + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = 
caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopMultiMarginLossDescriptor_t desc = nullptr; + infiniopTensorDescriptor_t weight_desc = nullptr; + const void* weight_data = nullptr; + + if (has_weight) { + weight_desc = weight->desc(); + weight_data = weight->data(); + } + + if (!desc_opt) { + // 创建描述符 + INFINICORE_CHECK_ERROR(infiniopCreateMultiMarginLossDescriptor( + context::getInfiniopHandle(output->device()), + &desc, + output->desc(), + input->desc(), + target->desc(), + weight_desc, + static_cast(p), + margin, + static_cast(reduction) + )); + + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + // 获取 Workspace 并执行 + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetMultiMarginLossWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopMultiMarginLoss( + desc, + workspace->data(), + workspace_size, + output->data(), + input->data(), + target->data(), + weight_data, + context::getStream() + )); +} + +static bool registered = []() { + MultiMarginLoss::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::multi_margin_loss_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/scatter/scatter.cc b/src/infinicore/ops/scatter/scatter.cc new file mode 100644 index 000000000..3abd8542a --- /dev/null +++ b/src/infinicore/ops/scatter/scatter.cc @@ -0,0 +1,26 @@ +#include "infinicore/ops/scatter.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Scatter::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Scatter::execute(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor src, int64_t reduction) { + dispatcher().lookup(context::getDevice().getType())(output, input, dim, index, src, reduction); +} + +Tensor scatter(Tensor input, int64_t dim, Tensor index, Tensor src, int64_t reduction) { + // 
创建与 input 形状、数据类型、设备一致的 Output Tensor + auto output = Tensor::empty(input->shape(), input->dtype(), input->device()); + scatter_(output, input, dim, index, src, reduction); + + return output; +} + +void scatter_(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor src, int64_t reduction) { + Scatter::execute(output, input, dim, index, src, reduction); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/scatter/scatter_infiniop.cc b/src/infinicore/ops/scatter/scatter_infiniop.cc new file mode 100644 index 000000000..8125907b6 --- /dev/null +++ b/src/infinicore/ops/scatter/scatter_infiniop.cc @@ -0,0 +1,73 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/scatter.hpp" +#include + +namespace infinicore::op::scatter_impl::infiniop { + +// 定义描述符缓存 +thread_local common::OpCache caches( + 100, // capacity + [](infiniopScatterDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyScatterDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor src, int64_t reduction) { + // Scatter 算子输入 input, index, src 均为必须存在的 Tensor,直接参与 hash + size_t seed = hash_combine(output, input, dim, index, src, reduction); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopScatterDescriptor_t desc = nullptr; + + if (!desc_opt) { + // 3. 
创建描述符 + // C++ Op 参数: output, input, dim, index, src, reduction + // C API 参数: output, input, indices, updates, axis, reduction + INFINICORE_CHECK_ERROR(infiniopCreateScatterDescriptor( + context::getInfiniopHandle(output->device()), + &desc, + output->desc(), + input->desc(), + index->desc(), // 对应 C API indices + src->desc(), // 对应 C API updates + static_cast(dim), + static_cast(reduction) + )); + + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + // 4. 获取 Workspace 并执行 + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetScatterWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopScatter( + desc, + workspace->data(), + workspace_size, + output->data(), + input->data(), + index->data(), + src->data(), + context::getStream() + )); +} + +static bool registered = []() { + Scatter::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::scatter_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index c950f90e5..2dbf6299b 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -50,6 +50,11 @@ #include "ops/topk.hpp" #include "ops/var.hpp" #include "ops/var_mean.hpp" +#include "ops/floor_divide.hpp" +#include "ops/float_power.hpp" +#include "ops/flipud.hpp" +#include "ops/multi_margin_loss.hpp" +#include "ops/scatter.hpp" namespace py = pybind11; @@ -91,6 +96,11 @@ inline void bind(py::module &m) { bind_silu(m); bind_swiglu(m); bind_rope(m); + bind_floor_divide(m); + bind_float_power(m); + bind_flipud(m); + bind_multi_margin_loss(m); + bind_scatter(m); bind_embedding(m); bind_linear_w8a8i8(m); bind_silu_and_mul(m); diff --git a/src/infinicore/pybind11/ops/flipud.hpp b/src/infinicore/pybind11/ops/flipud.hpp new file mode 100644 index 000000000..585bc636a --- /dev/null +++ b/src/infinicore/pybind11/ops/flipud.hpp 
@@ -0,0 +1,31 @@ +#pragma once + +#include +#include "infinicore/ops/flipud.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_flipud(py::module &m) { + // 1. 绑定 out-of-place 接口: output = flipud(input) + m.def("flipud", + &op::flipud, + py::arg("input"), + R"doc(Flip array in the up/down direction. + + Flips the entries in axis 0 (preserving the shape). + + Args: + input (Tensor): The input tensor. + )doc"); + + // 2. 绑定 explicit output 接口: flipud_(output, input) + m.def("flipud_", + &op::flipud_, + py::arg("output"), + py::arg("input"), + R"doc(Explicit output FlipUD operation. Writes the result into the output tensor.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/float_power.hpp b/src/infinicore/pybind11/ops/float_power.hpp new file mode 100644 index 000000000..242626e9d --- /dev/null +++ b/src/infinicore/pybind11/ops/float_power.hpp @@ -0,0 +1,55 @@ +#include "../tensor.hpp" +#include +#include "infinicore/ops/float_power.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +using infinicore::Tensor; +using infinicore::op::float_power; +using infinicore::op::float_power_; + +inline Tensor unwrap(py::handle obj) { + try { + return obj.cast(); + } catch (...) 
{} + + if (py::hasattr(obj, "_underlying")) { + return obj.attr("_underlying").cast(); + } + + throw py::type_error("Expected infinicore.Tensor, but got " + py::repr(obj.get_type()).cast()); +} + +void bind_float_power(py::module &m) { + + // --- Out-of-place: float_power(input, exponent) --- + m.def("float_power", [](py::object input_obj, py::object exp_obj) -> Tensor { + Tensor input = unwrap(input_obj); + + // 处理标量指数的情况 (float 或 int) + if (py::isinstance(exp_obj) || py::isinstance(exp_obj)) { + return float_power(input, exp_obj.cast()); + } + + // 处理张量指数的情况 + Tensor exponent = unwrap(exp_obj); + return float_power(input, exponent); + }, py::arg("input"), py::arg("exponent")); + + // --- In-place: float_power_(out, input, exponent) --- + m.def("float_power_", [](py::object out_obj, py::object input_obj, py::object exp_obj) { + Tensor out = unwrap(out_obj); + Tensor input = unwrap(input_obj); + + if (py::isinstance(exp_obj) || py::isinstance(exp_obj)) { + float_power_(out, input, exp_obj.cast()); + } else { + Tensor exponent = unwrap(exp_obj); + float_power_(out, input, exponent); + } + }, py::arg("out"), py::arg("input"), py::arg("exponent")); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/floor_divide.hpp b/src/infinicore/pybind11/ops/floor_divide.hpp new file mode 100644 index 000000000..3bcec31ee --- /dev/null +++ b/src/infinicore/pybind11/ops/floor_divide.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include + +#include "infinicore/ops/floor_divide.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_floor_divide(py::module &m) { + m.def("floor_divide", + &op::floor_divide, + py::arg("a"), + py::arg("b"), + R"doc(Floor division of two tensors.)doc"); + + m.def("floor_divide_", + &op::floor_divide_, + py::arg("c"), + py::arg("a"), + py::arg("b"), + R"doc(In-place tensor floor division.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git 
a/src/infinicore/pybind11/ops/multi_margin_loss.hpp b/src/infinicore/pybind11/ops/multi_margin_loss.hpp new file mode 100644 index 000000000..d55f0d723 --- /dev/null +++ b/src/infinicore/pybind11/ops/multi_margin_loss.hpp @@ -0,0 +1,55 @@ +#pragma once + +#include +#include "infinicore/ops/multi_margin_loss.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_multi_margin_loss(py::module &m) { + m.def("multi_margin_loss", + [](const Tensor& input, const Tensor& target, py::object weight, int p, float margin, int reduction) { + Tensor weight_tensor; + if (!weight.is_none()) { + weight_tensor = weight.cast(); + } + return op::multi_margin_loss(input, target, weight_tensor, p, margin, reduction); + }, + py::arg("input"), + py::arg("target"), + py::arg("weight") = py::none(), // Python 端看到默认值是 None + py::arg("p") = 1, + py::arg("margin") = 1.0f, + py::arg("reduction") = 1, + R"doc(Computes the Multi Margin Loss between input and target. + + Args: + input (Tensor): Input tensor of shape (N, C). + target (Tensor): Ground truth labels of shape (N,). + weight (Tensor, optional): Manual rescaling weight given to each class. If given, has to be a Tensor of size C. + p (int, optional): The norm degree for pairwise distance. p=1 or p=2. Default: 1. + margin (float, optional): Margin value. Default: 1.0. + reduction (int, optional): Specifies the reduction to apply to the output: 0=None, 1=Mean, 2=Sum. Default: 1. 
+ )doc"); + + m.def("multi_margin_loss_", + [](Tensor& output, const Tensor& input, const Tensor& target, py::object weight, int p, float margin, int reduction) { + Tensor weight_tensor; + if (!weight.is_none()) { + weight_tensor = weight.cast(); + } + // 调用底层 + op::multi_margin_loss_(output, input, target, weight_tensor, p, margin, reduction); + }, + py::arg("output"), + py::arg("input"), + py::arg("target"), + py::arg("weight") = py::none(), + py::arg("p") = 1, + py::arg("margin") = 1.0f, + py::arg("reduction") = 1, + R"doc(Explicit output Multi Margin Loss operation. Writes the result into the output tensor.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/scatter.hpp b/src/infinicore/pybind11/ops/scatter.hpp new file mode 100644 index 000000000..149e4ba81 --- /dev/null +++ b/src/infinicore/pybind11/ops/scatter.hpp @@ -0,0 +1,54 @@ +#pragma once + +#include +#include "infinicore/ops/scatter.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_scatter(py::module &m) { + // ========================================================================= + // 1. 绑定 out-of-place 接口: scatter + // ========================================================================= + // 为了匹配测试脚本的行为(将所有 Tensor 作为位置参数传入,属性作为 kwargs 传入), + // 我们将参数顺序调整为: input, index, src, dim, reduction + // ========================================================================= + m.def("scatter", + [](const Tensor& input, const Tensor& index, const Tensor& src, int64_t dim, int64_t reduction) { + // 调用底层 C++ 实现时,必须恢复正确的参数顺序: (input, dim, index, src, reduction) + return op::scatter(input, dim, index, src, reduction); + }, + py::arg("input"), + py::arg("index"), + py::arg("src"), + py::arg("dim"), // 关键修改:将 dim 移到 Tensor 参数之后 + py::arg("reduction") = 0, + R"doc( + Scatter operator. + Note: Parameter order in this binding is adapted for the test runner: (input, index, src, dim, reduction). 
+ )doc"); + + // ========================================================================= + // 2. 绑定 in-place 接口: scatter_ + // ========================================================================= + // 参数顺序调整为: output, input, index, src, dim, reduction + // ========================================================================= + m.def("scatter_", + [](Tensor& output, const Tensor& input, const Tensor& index, const Tensor& src, int64_t dim, int64_t reduction) { + // 调用底层 C++ 实现 + op::scatter_(output, input, dim, index, src, reduction); + }, + py::arg("output"), + py::arg("input"), + py::arg("index"), + py::arg("src"), + py::arg("dim"), // 关键修改:将 dim 移到 Tensor 参数之后 + py::arg("reduction") = 0, + R"doc( + In-place Scatter operator. + Writes result into output. + )doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infiniop/ops/flipud/cpu/flipud_cpu.cc b/src/infiniop/ops/flipud/cpu/flipud_cpu.cc new file mode 100644 index 000000000..911fbb22c --- /dev/null +++ b/src/infiniop/ops/flipud/cpu/flipud_cpu.cc @@ -0,0 +1,171 @@ +#include "flipud_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include +#include +#include +#include + +// 引用框架定义的 float16/bfloat16 类型支持 +#include "../../../../utils/custom_types.h" + +namespace op::flipud::cpu { + +// ================================================================== +// 0. 定义 Opaque 结构体 +// ================================================================== +struct Descriptor::Opaque { + std::vector shape; + std::vector in_strides; + std::vector out_strides; + int ndim; +}; + +// ================================================================== +// 1. 析构函数 +// ================================================================== +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + _opaque = nullptr; + } +} + +// ================================================================== +// 2. 
// Copies `input` into `output` with axis 0 reversed (flipud), honouring
// arbitrary per-dimension strides on both sides.
//
// T        element type; callers name it explicitly, e.g.
//          calculate_cpu_impl<float>(...).
// Dims     indexable container of extents (deduced from the argument).
// Strides  indexable container of element strides (deduced from the argument).
//
// ndim         number of dimensions; must be >= 1 for any work to happen.
// shape        extent of every dimension.
// in_strides   element strides of `input`.
// out_strides  element strides of `output`.
// numel        number of logical elements (product of the extents).
// output       destination buffer, interpreted as T*.
// input        source buffer, interpreted as const T*.
//
// Takes the pieces of the descriptor's private state as plain arguments so
// that this free function does not need access to Descriptor::Opaque.
template <typename T, typename Dims, typename Strides>
void calculate_cpu_impl(
    int ndim,
    const Dims &shape,
    const Strides &in_strides,
    const Strides &out_strides,
    size_t numel,
    void *output,
    const void *input) {

    // Nothing to copy; also keeps shape[0] from being read on a rank-0 call.
    if (ndim < 1 || numel == 0) {
        return;
    }

    auto *dst = reinterpret_cast<T *>(output);
    const auto *src = reinterpret_cast<const T *>(input);

    // Extent of the axis being flipped.
    const size_t dim0_size = static_cast<size_t>(shape[0]);

    #pragma omp parallel for schedule(static)
    for (size_t linear = 0; linear < numel; ++linear) {
        // Unravel the row-major linear index one dimension at a time and fold
        // each coordinate straight into both offsets. This avoids a temporary
        // coordinate buffer, and therefore a heap allocation, per element.
        size_t remaining = linear;
        int64_t out_offset = 0;
        int64_t in_offset = 0;

        for (int d = ndim - 1; d >= 0; --d) {
            const size_t extent = static_cast<size_t>(shape[d]);
            const size_t coord = remaining % extent;
            remaining /= extent;

            // Only axis 0 is mirrored on the source side.
            const size_t src_coord = (d == 0) ? (dim0_size - 1 - coord) : coord;

            out_offset += static_cast<int64_t>(coord) * static_cast<int64_t>(out_strides[d]);
            in_offset += static_cast<int64_t>(src_coord) * static_cast<int64_t>(in_strides[d]);
        }

        dst[out_offset] = src[in_offset];
    }
}
// Scalar flipud kernel: each thread with a global index below `numel` copies
// exactly one element.
//
// output  destination buffer.
// input   source buffer.
// numel   number of logical elements.
// layout  rank, extents and element strides of both tensors (passed by value).
template <typename T>
__global__ void flipud_kernel(
    T * __restrict__ output,
    const T * __restrict__ input,
    size_t numel,
    TensorLayout layout) {

    // blockIdx.x and blockDim.x are 32-bit unsigned; widen before multiplying
    // so the global index cannot wrap on very large launches.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx >= numel) {
        return;
    }

    size_t coords[MAX_DIMS];
    index_to_coords(idx, layout, coords);

    // The destination offset uses the unflipped coordinates.
    const size_t out_offset = coords_to_offset(coords, layout.out_strides, layout.ndim);

    // Mirror axis 0 to find the source element.
    coords[0] = layout.shape[0] - 1 - coords[0];
    const size_t in_offset = coords_to_offset(coords, layout.in_strides, layout.ndim);

    output[out_offset] = input[in_offset];
}
coords_to_offset(coords, layout.out_strides, layout.ndim); + + coords[0] = layout.shape[0] - 1 - coords[0]; + + size_t in_offset = coords_to_offset(coords, layout.in_strides, layout.ndim); + + *reinterpret_cast(output + out_offset) = + *reinterpret_cast(input + in_offset); + } +} + +} // namespace op::flipud::cuda + +#endif // __FLIPUD_CUDA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/flipud/flipud.h b/src/infiniop/ops/flipud/flipud.h new file mode 100644 index 000000000..87b83f5d4 --- /dev/null +++ b/src/infiniop/ops/flipud/flipud.h @@ -0,0 +1,48 @@ +#ifndef __FLIPUD_H__ +#define __FLIPUD_H__ + +#include "../../operator.h" +#include "info.h" + +// 宏定义:用于生成不同命名空间下的 Descriptor 类 +// 适配 Flipud 的单输入单输出模式 +#define DESCRIPTOR(NAMESPACE) \ + namespace op::flipud::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + FlipudInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + FlipudInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t input_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __FLIPUD_H__ \ No newline at end of file diff --git a/src/infiniop/ops/flipud/info.h b/src/infiniop/ops/flipud/info.h new file mode 100644 index 000000000..655bd91d4 --- /dev/null +++ b/src/infiniop/ops/flipud/info.h @@ -0,0 +1,60 @@ +#ifndef __FLIPUD_INFO_H__ +#define __FLIPUD_INFO_H__ + +#include "../../../utils.h" +#include 
"../../tensor.h" +#include + +namespace op::flipud { + +class FlipudInfo { + FlipudInfo() = default; + +public: + int _dtype; + int _ndim; + size_t _numel; + + int dtype() const { return _dtype; } + int ndim() const { return _ndim; } + size_t numel() const { return _numel; } + + FlipudInfo(int dtype, int ndim, size_t numel) + : _dtype(dtype), _ndim(ndim), _numel(numel) {} + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc) { + + if (out_desc->dtype() != input_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + if (out_desc->ndim() != input_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->ndim() < 1) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + const auto &in_shape = input_desc->shape(); + const auto &out_shape = out_desc->shape(); + + for (size_t i = 0; i < input_desc->ndim(); ++i) { + if (in_shape[i] != out_shape[i]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + + return utils::Result(FlipudInfo{ + input_desc->dtype(), + static_cast(input_desc->ndim()), + input_desc->numel() + }); + } +}; + +} // namespace op::flipud + +#endif // __FLIPUD_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/flipud/metax/flipud_metax.h b/src/infiniop/ops/flipud/metax/flipud_metax.h new file mode 100644 index 000000000..5b8e66cab --- /dev/null +++ b/src/infiniop/ops/flipud/metax/flipud_metax.h @@ -0,0 +1,8 @@ +#ifndef __FLIPUD_METAX_API_H__ +#define __FLIPUD_METAX_API_H__ + +#include "../flipud.h" + +DESCRIPTOR(metax) + +#endif // __FLIPUD_METAX_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/flipud/metax/flipud_metax.maca b/src/infiniop/ops/flipud/metax/flipud_metax.maca new file mode 100644 index 000000000..0fb8e504d --- /dev/null +++ b/src/infiniop/ops/flipud/metax/flipud_metax.maca @@ -0,0 +1,247 @@ +#include "flipud_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" 
/**
 * Maps a linear element index to the source offset of a tensor flipped along
 * axis 0.
 *
 * The index is decomposed from the innermost axis outwards using `shape`; the
 * coordinate on axis 0 is mirrored (dim - 1 - pos) before being multiplied by
 * its stride.
 *
 * The index is taken as `size_t` and all arithmetic is done in 64 bits: the
 * kernels call this helper with a `size_t` thread index, which the previous
 * `int` parameter truncated, and `pos * strides[i]` was multiplied in `int`
 * before being widened. The sibling helper `get_offset` still has the `int`
 * signature and the same limitation.
 *
 * @param idx     linear element index
 * @param strides per-axis strides in elements (`ndim` entries, may be negative)
 * @param ndim    number of axes
 * @param shape   per-axis sizes (`ndim` entries)
 * @return offset in elements; a negative total wraps modulo 2^64, as before
 */
__device__ inline size_t get_flipud_src_offset(size_t idx, const int* strides, int ndim, const int* shape) {
    size_t offset = 0;
    size_t rem = idx;
    #pragma unroll
    for (int i = ndim - 1; i >= 0; --i) {
        const size_t dim_sz = static_cast<size_t>(shape[i]);
        size_t pos = rem % dim_sz;
        rem /= dim_sz;

        if (i == 0) {
            // Flip along the first axis only.
            pos = dim_sz - 1 - pos;
        }
        // Signed 64-bit product keeps negative strides meaningful without int overflow.
        offset += static_cast<size_t>(static_cast<long long>(pos) * strides[i]);
    }
    return offset;
}
/**
 * Reports whether the address held by `ptr` is a multiple of `alignment`.
 *
 * @param ptr       pointer whose numeric address is inspected (never dereferenced)
 * @param alignment required alignment in bytes; must be non-zero
 */
static inline bool is_pointer_aligned(const void *ptr, size_t alignment) {
    const auto address = reinterpret_cast<uintptr_t>(ptr);
    const auto remainder = address % alignment;
    return remainder == 0;
}
/ PackSize; + size_t block_size = 256; + size_t grid_size = (num_packs + block_size - 1) / block_size; + + flipud_kernel_vectorized + <<>>(out_ptr, in_ptr, num_packs, layout); + } else { + size_t block_size = 256; + size_t grid_size = (numel + block_size - 1) / block_size; + + flipud_kernel + <<>>(out_ptr, in_ptr, numel, layout); + } +} + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc) { + + auto handle = reinterpret_cast(handle_); + auto info_result = FlipudInfo::create(out_desc, input_desc); + if (!info_result) return info_result.status(); + + auto opaque = new Opaque(); + opaque->layout.ndim = static_cast(input_desc->ndim()); + + if (opaque->layout.ndim > MAX_DIMS) { + delete opaque; + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + const auto& shape = input_desc->shape(); + const auto& in_strides = input_desc->strides(); + const auto& out_strides = out_desc->strides(); + + for (int i = 0; i < opaque->layout.ndim; ++i) { + opaque->layout.shape[i] = shape[i]; + opaque->layout.in_strides[i] = in_strides[i]; + opaque->layout.out_strides[i] = out_strides[i]; + } + + *desc_ptr = new Descriptor(opaque, info_result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, void *output, + const void *input, void *stream) const { + + auto dtype = _info.dtype(); + auto numel = _info.numel(); + + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel<__half>(output, input, _opaque->layout, numel, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel<__maca_bfloat16>(output, input, _opaque->layout, numel, stream); + break; + case INFINI_DTYPE_F32: + launch_kernel(output, input, _opaque->layout, numel, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, 
/**
 * Launches the Moore flipud kernel for element type `T` on `stream`.
 *
 * The 128-bit vectorized kernel is chosen only when every condition holds:
 *  - a pack holds more than one element;
 *  - both pointers are 16-byte aligned;
 *  - `numel` is a multiple of the pack size;
 *  - the tensor has more than one axis, and its last axis has stride 1 in both
 *    tensors and a size that is a multiple of the pack size;
 *  - every other stride is a multiple of the pack size.
 * Otherwise the scalar kernel is launched.
 *
 * An empty tensor returns immediately: there is nothing to copy, and the grid
 * size would otherwise be 0, which is not a valid launch configuration.
 *
 * NOTE(review): the stream type, the kernels' template arguments and the
 * launch configuration were reconstructed (lost in this view) -- confirm.
 */
template <typename T>
void launch_kernel(
    void *output, const void *input,
    op::flipud::moore::TensorLayout layout,
    size_t numel,
    void *stream) {

    if (numel == 0) {
        return;
    }

    auto in_ptr = reinterpret_cast<const T *>(input);
    auto out_ptr = reinterpret_cast<T *>(output);
    auto musa_stream = reinterpret_cast<musaStream_t>(stream);

    constexpr int TotalBytes = 16; // 128-bit
    constexpr int PackSize = TotalBytes / sizeof(T);
    constexpr size_t block_size = 256;

    const int last = layout.ndim - 1;

    // `layout.ndim > 1` is tested before `last` is used as an index.
    bool can_vectorize = (PackSize > 1)
                      && is_pointer_aligned(output, TotalBytes)
                      && is_pointer_aligned(input, TotalBytes)
                      && (numel % PackSize == 0)
                      && (layout.ndim > 1)
                      && (layout.shape[last] % PackSize == 0)
                      && (layout.in_strides[last] == 1)
                      && (layout.out_strides[last] == 1);

    // All outer strides must keep each pack's start aligned to a pack boundary.
    for (int i = 0; can_vectorize && i < last; ++i) {
        can_vectorize = (layout.in_strides[i] % PackSize == 0)
                     && (layout.out_strides[i] % PackSize == 0);
    }

    if (can_vectorize) {
        const size_t num_packs = numel / PackSize;
        const size_t grid_size = (num_packs + block_size - 1) / block_size;

        op::flipud::moore::flipud_kernel_vectorized<T, PackSize>
            <<<grid_size, block_size, 0, musa_stream>>>(out_ptr, in_ptr, num_packs, layout);
    } else {
        const size_t grid_size = (numel + block_size - 1) / block_size;

        op::flipud::moore::flipud_kernel<T>
            <<<grid_size, block_size, 0, musa_stream>>>(out_ptr, in_ptr, numel, layout);
    }
}
infiniopHandle_t handle_, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc) { + + auto handle = reinterpret_cast(handle_); + + auto info_result = FlipudInfo::create(out_desc, input_desc); + if (!info_result) return info_result.status(); + + auto opaque = new Opaque(); + opaque->layout.ndim = static_cast(input_desc->ndim()); + + if (opaque->layout.ndim > op::flipud::moore::MAX_DIMS) { + delete opaque; + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + const auto& shape = input_desc->shape(); + const auto& in_strides = input_desc->strides(); + const auto& out_strides = out_desc->strides(); + + for (int i = 0; i < opaque->layout.ndim; ++i) { + opaque->layout.shape[i] = shape[i]; + opaque->layout.in_strides[i] = in_strides[i]; + opaque->layout.out_strides[i] = out_strides[i]; + } + + *desc_ptr = new Descriptor(opaque, info_result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, void *output, + const void *input, void *stream) const { + + auto dtype = _info.dtype(); + auto numel = _info.numel(); + + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel(output, input, _opaque->layout, numel, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel<__mt_bfloat16>(output, input, _opaque->layout, numel, stream); + break; + case INFINI_DTYPE_F32: + launch_kernel(output, input, _opaque->layout, numel, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, _opaque->layout, numel, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::flipud::moore \ No newline at end of file diff --git a/src/infiniop/ops/flipud/moore/flipud_moore_kernel.h b/src/infiniop/ops/flipud/moore/flipud_moore_kernel.h new file mode 100644 index 000000000..8a5c65ef4 --- /dev/null +++ b/src/infiniop/ops/flipud/moore/flipud_moore_kernel.h @@ 
/**
 * Vectorized flipud kernel: one thread copies one pack of `PackSize`
 * consecutive elements.
 *
 * The pack's first element index (idx * PackSize) is turned into coordinates,
 * the output offset is taken from `layout.out_strides`, coordinate 0 is
 * mirrored, and the whole pack is moved with a single `Pack` load/store.
 * The kernel itself performs no alignment or contiguity checks.
 *
 * @param output    destination buffer
 * @param input     source buffer
 * @param num_packs number of packs to copy; threads with idx >= num_packs do nothing
 * @param layout    rank, shape and strides of both tensors (passed by value)
 *
 * NOTE(review): the template parameter list and the `Pack` arguments were
 * reconstructed (lost in this view) -- confirm the parameter order.
 */
template <typename T, int PackSize>
__global__ void flipud_kernel_vectorized(
    T *__restrict__ output,
    const T *__restrict__ input,
    size_t num_packs,
    TensorLayout layout) {

    using PackType = Pack<T, PackSize>;

    // Widen before multiplying: blockIdx.x * blockDim.x is otherwise evaluated
    // in 32-bit unsigned arithmetic and wraps once the product exceeds 2^32 - 1.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx >= num_packs) {
        return;
    }

    const size_t scalar_idx = idx * PackSize;
    size_t coords[MAX_DIMS];
    index_to_coords(scalar_idx, layout, coords);

    const size_t out_offset = coords_to_offset(coords, layout.out_strides, layout.ndim);

    // Flip dimension 0
    coords[0] = layout.shape[0] - 1 - coords[0];
    const size_t in_offset = coords_to_offset(coords, layout.in_strides, layout.ndim);

    *reinterpret_cast<PackType *>(output + out_offset) =
        *reinterpret_cast<const PackType *>(input + in_offset);
}
+ bool is_ptr_aligned = is_pointer_aligned(output, TotalBytes) && is_pointer_aligned(input, TotalBytes); + + + bool is_numel_divisible = (numel % PackSize == 0); + + bool is_last_dim_aligned = (layout.ndim > 0) && (layout.shape[layout.ndim-1] % PackSize == 0); + + // 4. 连续性条件:维度 > 1 且 最内层连续 + bool is_inner_contiguous = (layout.ndim > 1) && + (layout.in_strides[layout.ndim-1] == 1) && + (layout.out_strides[layout.ndim-1] == 1); + + // 5. 步长对齐条件 + bool is_stride_aligned = true; + for (int i = 0; i < layout.ndim - 1; ++i) { + if (layout.in_strides[i] % PackSize != 0 || layout.out_strides[i] % PackSize != 0) { + is_stride_aligned = false; + break; + } + } + + bool can_vectorize = (PackSize > 1) && + is_ptr_aligned && + is_numel_divisible && + is_last_dim_aligned && + is_inner_contiguous && + is_stride_aligned; + + if (can_vectorize) { + size_t num_packs = numel / PackSize; + size_t block_size = 256; + size_t grid_size = (num_packs + block_size - 1) / block_size; + + op::flipud::cuda::flipud_kernel_vectorized + <<>>(out_ptr, in_ptr, num_packs, layout); + } else { + size_t block_size = 256; + size_t grid_size = (numel + block_size - 1) / block_size; + + op::flipud::cuda::flipud_kernel + <<>>(out_ptr, in_ptr, numel, layout); + } +} + +// ================================================================== +// Descriptor 实现 +// ================================================================== +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc) { + + auto info_result = FlipudInfo::create(out_desc, input_desc); + if (!info_result) return info_result.status(); + + auto opaque = new Opaque(); + opaque->layout.ndim = static_cast(input_desc->ndim()); + + if (opaque->layout.ndim > op::flipud::cuda::MAX_DIMS) { + delete opaque; + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + const auto& shape = 
input_desc->shape(); + const auto& in_strides = input_desc->strides(); + const auto& out_strides = out_desc->strides(); + + for (int i = 0; i < opaque->layout.ndim; ++i) { + opaque->layout.shape[i] = shape[i]; + opaque->layout.in_strides[i] = in_strides[i]; + opaque->layout.out_strides[i] = out_strides[i]; + } + + *desc_ptr = new Descriptor(opaque, info_result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, void *output, + const void *input, void *stream) const { + + auto dtype = _info.dtype(); + auto numel = _info.numel(); + + // 显式 Switch-Case 分发 + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel(output, input, _opaque->layout, numel, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel(output, input, _opaque->layout, numel, stream); + break; + case INFINI_DTYPE_F32: + launch_kernel(output, input, _opaque->layout, numel, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, _opaque->layout, numel, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::flipud::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/flipud/nvidia/flipud_nvidia.cuh b/src/infiniop/ops/flipud/nvidia/flipud_nvidia.cuh new file mode 100644 index 000000000..2b5396112 --- /dev/null +++ b/src/infiniop/ops/flipud/nvidia/flipud_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __FLIPUD_NVIDIA_CUH__ +#define __FLIPUD_NVIDIA_CUH__ + +#include "../flipud.h" +DESCRIPTOR(nvidia) + +#endif // __FLIPUD_NVIDIA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/flipud/operator.cc b/src/infiniop/ops/flipud/operator.cc new file mode 100644 index 000000000..0d6359b7e --- /dev/null +++ b/src/infiniop/ops/flipud/operator.cc @@ -0,0 +1,176 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/flipud.h" + +// --- 后端实现头文件 --- +#ifdef 
ENABLE_CPU_API +#include "cpu/flipud_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/flipud_nvidia.cuh" +#endif + +#ifdef ENABLE_METAX_API +#include "metax/flipud_metax.h" +#endif + +#ifdef ENABLE_MOORE_API +#include "moore/flipud_moore.h" +#endif + +extern "C" { + +// ======================================================================= +// 1. 创建算子描述符 +// ======================================================================= +__C infiniStatus_t infiniopCreateFlipudDescriptor( + infiniopHandle_t handle, + infiniopFlipudDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input) { + + #define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::flipud::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output, \ + input) + + switch (handle->device) { + #ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CREATE +} + +// ======================================================================= +// 2. 
获取 Workspace 大小 +// ======================================================================= +__C infiniStatus_t infiniopGetFlipudWorkspaceSize(infiniopFlipudDescriptor_t desc, size_t *size) { + + #define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef GET +} + +// ======================================================================= +// 3. 执行计算 (Calculate) +// ======================================================================= +__C infiniStatus_t infiniopFlipud( + infiniopFlipudDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + + #define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, stream) + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CALCULATE +} + +// 
======================================================================= +// 4. 销毁描述符 +// ======================================================================= +__C infiniStatus_t infiniopDestroyFlipudDescriptor(infiniopFlipudDescriptor_t desc) { + + #define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef DELETE +} + +} // extern "C" \ No newline at end of file diff --git a/src/infiniop/ops/float_power/cpu/float_power_cpu.cc b/src/infiniop/ops/float_power/cpu/float_power_cpu.cc new file mode 100644 index 000000000..32cfcb6ec --- /dev/null +++ b/src/infiniop/ops/float_power/cpu/float_power_cpu.cc @@ -0,0 +1,148 @@ +#include "float_power_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include +#include +#include +#include "../../../../utils/custom_types.h" + +namespace op::float_power::cpu { + +Descriptor::~Descriptor() = default; + +// ================================================================== +// 创建描述符 +// ================================================================== +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t exponent, + float scalar_exponent) { + + auto handle = reinterpret_cast(handle_); + + // 创建 Info 对象进行校验 (Info 类已更新,支持混合精度和 Tensor 指数) + auto result = FloatPowerInfo::create(y, x, exponent, scalar_exponent); + 
CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + nullptr, + result.take(), + 0, // CPU 不需要 workspace + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +// ================================================================== +// 核心计算逻辑 +// 模板参数: T_OUT (输出类型), T_IN (输入类型) +// ================================================================== +template +void calculate_cpu_impl( + const FloatPowerInfo &info, + void *output, + const void *input, + const void *exponent_ptr) { + + size_t numel = info.num_elements(); + + // 获取指数模式 + bool is_scalar = info.is_scalar_exponent(); + float scalar_exp = info.scalar_exponent(); + + auto out_ptr = reinterpret_cast(output); + auto in_ptr = reinterpret_cast(input); + auto exp_ptr = reinterpret_cast(exponent_ptr); + + // 针对标量模式的简单优化标记 + bool is_square = is_scalar && (scalar_exp == 2.0f); + bool is_sqrt = is_scalar && (scalar_exp == 0.5f); + bool is_identity = is_scalar && (scalar_exp == 1.0f); + + #pragma omp parallel for schedule(static) + for (size_t i = 0; i < numel; ++i) { + // 1. 读取输入并转为 float + float in_val = utils::cast(in_ptr[i]); + float exp_val; + + // 2. 获取指数值 + if (is_scalar) { + exp_val = scalar_exp; + } else { + // Tensor 模式:读取对应位置的指数并转为 float + exp_val = utils::cast(exp_ptr[i]); + } + + // 3. 计算结果 + float result_val; + if (is_scalar && is_identity) { + result_val = in_val; + } else if (is_scalar && is_square) { + result_val = in_val * in_val; + } else if (is_scalar && is_sqrt) { + result_val = std::sqrt(in_val); + } else { + // 通用幂运算 + result_val = std::pow(in_val, exp_val); + } + + // 4. 
转回输出类型 T_OUT 并存储 + out_ptr[i] = utils::cast(result_val); + } +} + +// ================================================================== +// 分发逻辑 +// ================================================================== +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *exponent, + void *stream) const { + + auto in_dtype = _info.input_dtype(); + auto out_dtype = _info.output_dtype(); + + // 定义内层宏:根据 Output 类型分发 + #define DISPATCH_OUT(IN_T) \ + switch (out_dtype) { \ + case INFINI_DTYPE_F32: \ + cpu::calculate_cpu_impl(_info, output, input, exponent); \ + return INFINI_STATUS_SUCCESS; \ + case INFINI_DTYPE_F64: \ + cpu::calculate_cpu_impl(_info, output, input, exponent); \ + return INFINI_STATUS_SUCCESS; \ + case INFINI_DTYPE_F16: \ + cpu::calculate_cpu_impl(_info, output, input, exponent); \ + return INFINI_STATUS_SUCCESS; \ + case INFINI_DTYPE_BF16: \ + cpu::calculate_cpu_impl(_info, output, input, exponent); \ + return INFINI_STATUS_SUCCESS; \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + // 外层 Switch:根据 Input 类型分发 + switch (in_dtype) { + case INFINI_DTYPE_F32: + DISPATCH_OUT(float); + case INFINI_DTYPE_F64: + DISPATCH_OUT(double); + case INFINI_DTYPE_F16: + DISPATCH_OUT(fp16_t); + case INFINI_DTYPE_BF16: + DISPATCH_OUT(bf16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + #undef DISPATCH_OUT +} + +} // namespace op::float_power::cpu \ No newline at end of file diff --git a/src/infiniop/ops/float_power/cpu/float_power_cpu.h b/src/infiniop/ops/float_power/cpu/float_power_cpu.h new file mode 100644 index 000000000..3f97c2726 --- /dev/null +++ b/src/infiniop/ops/float_power/cpu/float_power_cpu.h @@ -0,0 +1,7 @@ +#ifndef __FLOAT_POWER_CPU_H__ +#define __FLOAT_POWER_CPU_H__ + +#include "../float_power.h" +DESCRIPTOR(cpu) + +#endif // __FLOAT_POWER_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/float_power/cuda/kernel.cuh 
b/src/infiniop/ops/float_power/cuda/kernel.cuh new file mode 100644 index 000000000..af07406ed --- /dev/null +++ b/src/infiniop/ops/float_power/cuda/kernel.cuh @@ -0,0 +1,107 @@ +#ifndef __FLOAT_POWER_CUDA_CUH__ +#define __FLOAT_POWER_CUDA_CUH__ +#include +#include +#include +#include + +namespace op::float_power::cuda { + +template +struct alignas(sizeof(T) * N) Pack { + T val[N]; +}; + +// ================================================================== +// Functor: 仅负责核心数学计算逻辑 +// ================================================================== +struct FloatPowerFunctor { + template + __device__ __forceinline__ float compute(const T_IN &input, float exponent_val) const { + // 将输入转为 float 参与计算,以保证计算精度和统一性 + float in_f = static_cast(input); + return powf(in_f, exponent_val); + } +}; +template +__global__ void float_power_kernel( + T_OUT * __restrict__ output, + const T_IN * __restrict__ input, + const T_EXP * __restrict__ exponent, + float scalar_exponent, + bool is_scalar, + size_t numel, + FloatPowerFunctor functor) { + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < numel; + idx += blockDim.x * gridDim.x) { + + float exp_val_f = is_scalar ? scalar_exponent : static_cast(exponent[idx]); + output[idx] = static_cast(functor.compute(input[idx], exp_val_f)); + } +} + +// ================================================================== +// 2. 
标量模式向量化 Kernel +// ================================================================== +template +__global__ void float_power_kernel_vectorized_scalar( + T_OUT * __restrict__ output, + const T_IN * __restrict__ input, + float scalar_exponent, + size_t num_packs, + FloatPowerFunctor functor) { + + using PackTypeIn = Pack; + using PackTypeOut = Pack; + + auto in_vec = reinterpret_cast(input); + auto out_vec = reinterpret_cast(output); + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < num_packs) { + PackTypeIn in_pack = in_vec[idx]; + PackTypeOut out_pack; + + #pragma unroll + for (int i = 0; i < PackSize; ++i) { + out_pack.val[i] = static_cast(functor.compute(in_pack.val[i], scalar_exponent)); + } + out_vec[idx] = out_pack; + } +} +template +__global__ void float_power_kernel_vectorized_tensor( + T_OUT * __restrict__ output, + const T_IN * __restrict__ input, + const T_IN * __restrict__ exponent, + size_t num_packs, + FloatPowerFunctor functor) { + + using PackTypeIn = Pack; + using PackTypeOut = Pack; + + auto in_vec = reinterpret_cast(input); + auto exp_vec = reinterpret_cast(exponent); + auto out_vec = reinterpret_cast(output); + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < num_packs) { + PackTypeIn in_pack = in_vec[idx]; + PackTypeIn exp_pack = exp_vec[idx]; + PackTypeOut out_pack; + + #pragma unroll + for (int i = 0; i < PackSize; ++i) { + float e = static_cast(exp_pack.val[i]); + out_pack.val[i] = static_cast(functor.compute(in_pack.val[i], e)); + } + out_vec[idx] = out_pack; + } +} + +} // namespace op::float_power::cuda + +#endif // __FLOAT_POWER_CUDA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/float_power/float_power.h b/src/infiniop/ops/float_power/float_power.h new file mode 100644 index 000000000..bf61ac36d --- /dev/null +++ b/src/infiniop/ops/float_power/float_power.h @@ -0,0 +1,52 @@ +#ifndef __FLOAT_POWER_H__ +#define __FLOAT_POWER_H__ + +#include "../../operator.h" +#include "info.h" + +// 
宏定义:用于生成不同命名空间下的 Descriptor 类 +#define DESCRIPTOR(NAMESPACE) \ + namespace op::float_power::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + FloatPowerInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + FloatPowerInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + /* [修改] 增加 exponent 张量描述符 和 scalar_exponent */ \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y, \ + infiniopTensorDescriptor_t x, \ + infiniopTensorDescriptor_t exponent, \ + float scalar_exponent); \ + \ + /* [修改] 增加 exponent 数据指针 */ \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *y, \ + const void *x, \ + const void *exponent, \ + void *stream) const; \ + }; \ + } + +#endif // __FLOAT_POWER_H__ \ No newline at end of file diff --git a/src/infiniop/ops/float_power/info.h b/src/infiniop/ops/float_power/info.h new file mode 100644 index 000000000..46252b5dd --- /dev/null +++ b/src/infiniop/ops/float_power/info.h @@ -0,0 +1,83 @@ +#ifndef __FLOAT_POWER_INFO_H__ +#define __FLOAT_POWER_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::float_power { + +class FloatPowerInfo { + FloatPowerInfo() = default; + +public: + int _input_dtype; // 输入数据类型 + int _output_dtype; // 输出数据类型 + + bool _is_scalar_exponent;// 是否为标量指数 + float _scalar_exponent; // 标量指数的值 (仅当 _is_scalar_exponent 为 true 时有效) + + size_t _num_elements; // 元素总数 + + // Getters + int input_dtype() const { return _input_dtype; } + int output_dtype() const { return _output_dtype; } + bool is_scalar_exponent() const { return 
_is_scalar_exponent; } + float scalar_exponent() const { return _scalar_exponent; } + size_t num_elements() const { return _num_elements; } + + // 构造函数 + FloatPowerInfo(int in_dtype, int out_dtype, bool is_scalar, float scalar_exp, size_t numel) + : _input_dtype(in_dtype), _output_dtype(out_dtype), + _is_scalar_exponent(is_scalar), _scalar_exponent(scalar_exp), + _num_elements(numel) {} + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t exponent_desc, + float scalar_exponent) { + if (out_desc->ndim() != input_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 使用引用接收 vector,避免之前的编译错误 + const auto& in_shape = input_desc->shape(); + const auto& out_shape = out_desc->shape(); + size_t count = 1; + + for (size_t i = 0; i < input_desc->ndim(); ++i) { + if (in_shape[i] != out_shape[i]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + count *= in_shape[i]; + } + + // 3. 判断是标量模式还是张量模式 + bool is_scalar = (exponent_desc == nullptr); + + if (!is_scalar) { + if (exponent_desc->ndim() != input_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + const auto& exp_shape = exponent_desc->shape(); + for (size_t i = 0; i < input_desc->ndim(); ++i) { + if (exp_shape[i] != in_shape[i]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + } + + // 构造 Info 对象 + return utils::Result(FloatPowerInfo{ + input_desc->dtype(), // Input Dtype + out_desc->dtype(), // Output Dtype (分开存储) + is_scalar, // Mode flag + scalar_exponent, // Scalar Value + count // Total elements + }); + } +}; + +} // namespace op::float_power + +#endif // __FLOAT_POWER_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/float_power/metax/float_power_metax.h b/src/infiniop/ops/float_power/metax/float_power_metax.h new file mode 100644 index 000000000..dd8d08f54 --- /dev/null +++ b/src/infiniop/ops/float_power/metax/float_power_metax.h @@ -0,0 +1,8 @@ +#ifndef __FLOAT_POWER_METAX_API_H__ 
+#define __FLOAT_POWER_METAX_API_H__ + +#include "../float_power.h" + +DESCRIPTOR(metax) + +#endif // __FLOAT_POWER_METAX_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/float_power/metax/float_power_metax.maca b/src/infiniop/ops/float_power/metax/float_power_metax.maca new file mode 100644 index 000000000..14c7e65dc --- /dev/null +++ b/src/infiniop/ops/float_power/metax/float_power_metax.maca @@ -0,0 +1,309 @@ +#include "float_power_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include +#include +#include +#include +#include +#include +using nv_bfloat16 = __maca_bfloat16; +using nv_bfloat162 = __maca_bfloat162; + + +namespace op::float_power::metax { + +// 基础定义: 向量化数据打包结构 +template +struct alignas(sizeof(T) * N) Pack { + T val[N]; +}; + +// Functor: 仅负责核心数学计算逻辑 +struct FloatPowerFunctor { + template + __device__ __forceinline__ float compute(const T_IN &input, float exponent_val) const { + // 将输入转为 float 参与计算 + float in_f = static_cast(input); + return powf(in_f, exponent_val); + } +}; + +// Kernel 1: 通用处理 (Grid-Stride Loop) +template +__global__ void float_power_kernel( + T_OUT * __restrict__ output, + const T_IN * __restrict__ input, + const T_EXP * __restrict__ exponent, + float scalar_exponent, + bool is_scalar, + size_t numel, + FloatPowerFunctor functor) { + + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < numel; + idx += blockDim.x * gridDim.x) { + + float exp_val_f = is_scalar ? 
scalar_exponent : static_cast(exponent[idx]); + output[idx] = static_cast(functor.compute(input[idx], exp_val_f)); + } +} + +// Kernel 2: 标量模式向量化 Kernel +template +__global__ void float_power_kernel_vectorized_scalar( + T_OUT * __restrict__ output, + const T_IN * __restrict__ input, + float scalar_exponent, + size_t num_packs, + FloatPowerFunctor functor) { + + using PackTypeIn = Pack; + using PackTypeOut = Pack; + + auto in_vec = reinterpret_cast(input); + auto out_vec = reinterpret_cast(output); + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < num_packs) { + PackTypeIn in_pack = in_vec[idx]; + PackTypeOut out_pack; + + #pragma unroll + for (int i = 0; i < PackSize; ++i) { + out_pack.val[i] = static_cast(functor.compute(in_pack.val[i], scalar_exponent)); + } + out_vec[idx] = out_pack; + } +} + +// Kernel 3: 张量模式向量化 Kernel +template +__global__ void float_power_kernel_vectorized_tensor( + T_OUT * __restrict__ output, + const T_IN * __restrict__ input, + const T_IN * __restrict__ exponent, + size_t num_packs, + FloatPowerFunctor functor) { + + using PackTypeIn = Pack; + using PackTypeOut = Pack; + + auto in_vec = reinterpret_cast(input); + auto exp_vec = reinterpret_cast(exponent); + auto out_vec = reinterpret_cast(output); + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < num_packs) { + PackTypeIn in_pack = in_vec[idx]; + PackTypeIn exp_pack = exp_vec[idx]; + PackTypeOut out_pack; + + #pragma unroll + for (int i = 0; i < PackSize; ++i) { + float e = static_cast(exp_pack.val[i]); + out_pack.val[i] = static_cast(functor.compute(in_pack.val[i], e)); + } + out_vec[idx] = out_pack; + } +} + +// ================================================================== +// 3. 
辅助函数与 Launcher +// ================================================================== + +// 辅助函数: 检查内存地址对齐情况 +template +static inline bool is_aligned(const void *ptr, size_t alignment) { + return reinterpret_cast(ptr) % alignment == 0; +} + +// Launcher Implementation +template +void launch_kernel( + void *output, + const void *input, + const void *exponent, + const FloatPowerInfo &info, + void *stream) { + + size_t numel = info.num_elements(); + bool is_scalar = info.is_scalar_exponent(); + float scalar_exp = info.scalar_exponent(); + + auto out_ptr = reinterpret_cast(output); + auto in_ptr = reinterpret_cast(input); + // 假设指数 Tensor 的数据类型与输入 Tensor 一致 + auto exp_ptr = reinterpret_cast(exponent); + + auto mc_stream = reinterpret_cast(stream); + FloatPowerFunctor functor; + + // ------------------------------------------------------------------ + // 向量化分发路径 + // ------------------------------------------------------------------ + constexpr int AlignBytes = 16; + constexpr int PackSizeIn = AlignBytes / sizeof(T_IN); + + // 检查输入输出类型大小是否一致 + bool types_same_size = (sizeof(T_IN) == sizeof(T_OUT)); + + bool can_vectorize_base = types_same_size && + (PackSizeIn > 1) && + (numel % PackSizeIn == 0) && + is_aligned(input, AlignBytes) && + is_aligned(output, AlignBytes); + + if (can_vectorize_base) { + size_t num_packs = numel / PackSizeIn; + size_t block_size = 256; + size_t grid_size = (num_packs + block_size - 1) / block_size; + + if (is_scalar) { + // 路径 A1: 标量指数向量化 + float_power_kernel_vectorized_scalar + <<>>( + out_ptr, in_ptr, scalar_exp, num_packs, functor + ); + return; + } else if (is_aligned(exponent, AlignBytes)) { + // 路径 A2: 张量指数向量化 + float_power_kernel_vectorized_tensor + <<>>( + out_ptr, in_ptr, exp_ptr, num_packs, functor + ); + return; + } + } + + // ------------------------------------------------------------------ + // 通用回退路径 + // ------------------------------------------------------------------ + size_t block_size = 256; + size_t grid_size = (numel + 
block_size - 1) / block_size; + + float_power_kernel + <<>>( + out_ptr, in_ptr, exp_ptr, scalar_exp, is_scalar, numel, functor + ); +} + +// ================================================================== +// 4. Descriptor 接口实现 +// ================================================================== +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { if (_opaque) delete _opaque; } + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, Descriptor **desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t exponent, + float scalar_exponent) { + + auto handle = reinterpret_cast(handle_); + auto info_result = FloatPowerInfo::create(y, x, exponent, scalar_exponent); + if (!info_result) return info_result.status(); + + size_t workspace_size = 0; + *desc_ptr = new Descriptor(new Opaque(), info_result.take(), workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, void *output, + const void *input, const void *exponent, + void *stream) const { + + auto in_dtype = _info.input_dtype(); + auto out_dtype = _info.output_dtype(); + + // ================================================================== + // 显式双重分发 (注意: half 和 nv_bfloat16 已在上方适配) + // ================================================================== + + switch (in_dtype) { + + case INFINI_DTYPE_F32: + switch (out_dtype) { + case INFINI_DTYPE_F32: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F16: + launch_kernel<__half, float>(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel(output, input, exponent, _info, stream); + break; + default: return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + break; + + case INFINI_DTYPE_F64: + switch (out_dtype) { + case INFINI_DTYPE_F32: + 
launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F16: + launch_kernel<__half, double>(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel(output, input, exponent, _info, stream); + break; + default: return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + break; + + case INFINI_DTYPE_F16: + switch (out_dtype) { + case INFINI_DTYPE_F32: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F16: + launch_kernel<__half, __half>(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel(output, input, exponent, _info, stream); + break; + default: return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + break; + + case INFINI_DTYPE_BF16: + switch (out_dtype) { + case INFINI_DTYPE_F32: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F16: + launch_kernel<__half, nv_bfloat16>(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel(output, input, exponent, _info, stream); + break; + default: return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + break; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::float_power::metax \ No newline at end of file diff --git a/src/infiniop/ops/float_power/moore/float_power_moore.h b/src/infiniop/ops/float_power/moore/float_power_moore.h new file mode 100644 index 000000000..4f959fdf0 --- /dev/null +++ b/src/infiniop/ops/float_power/moore/float_power_moore.h @@ -0,0 +1,8 @@ +#ifndef __FLOAT_POWER_MOORE_H__ +#define __FLOAT_POWER_MOORE_H__ + +#include "../float_power.h" + +DESCRIPTOR(moore) + +#endif // __FLOAT_POWER_MOORE_H__ \ No 
newline at end of file diff --git a/src/infiniop/ops/float_power/moore/float_power_moore.mu b/src/infiniop/ops/float_power/moore/float_power_moore.mu new file mode 100644 index 000000000..721820018 --- /dev/null +++ b/src/infiniop/ops/float_power/moore/float_power_moore.mu @@ -0,0 +1,204 @@ +#include "float_power_moore.h" +#include "float_power_moore_kernel.h" +#include "../../../devices/moore/moore_handle.h" +#include +#include + +namespace op::float_power::moore { + +// ================================================================== +// 辅助函数: 检查内存地址对齐情况 +// ================================================================== +template +bool is_aligned(const void *ptr, size_t alignment) { + return reinterpret_cast(ptr) % alignment == 0; +} + +// ================================================================== +// Kernel Launch Logic +// ================================================================== +template +void launch_kernel( + void *output, + const void *input, + const void *exponent, + const FloatPowerInfo &info, + void *stream) { + + size_t numel = info.num_elements(); + bool is_scalar = info.is_scalar_exponent(); + float scalar_exp = info.scalar_exponent(); + + auto out_ptr = reinterpret_cast(output); + auto in_ptr = reinterpret_cast(input); + auto exp_ptr = reinterpret_cast(exponent); + + auto musa_stream = reinterpret_cast(stream); + op::float_power::moore::FloatPowerFunctor functor; + + // ------------------------------------------------------------------ + // 1. 
向量化分发路径 (Vectorized Path) + // ------------------------------------------------------------------ + constexpr int AlignBytes = 16; + constexpr int PackSizeIn = AlignBytes / sizeof(T_IN); + + // 只有当输入和输出类型大小相同时,当前的 1:1 Pack 向量化逻辑才生效 + bool types_same_size = (sizeof(T_IN) == sizeof(T_OUT)); + + bool can_vectorize_base = types_same_size && + (PackSizeIn > 1) && + (numel % PackSizeIn == 0) && + is_aligned(input, AlignBytes) && + is_aligned(output, AlignBytes); + + if (can_vectorize_base) { + size_t num_packs = numel / PackSizeIn; + size_t block_size = 256; + size_t grid_size = (num_packs + block_size - 1) / block_size; + + if (is_scalar) { + // 路径 A1: 标量指数向量化 + op::float_power::moore::float_power_kernel_vectorized_scalar + <<>>( + out_ptr, in_ptr, scalar_exp, num_packs, functor + ); + return; + } else if (is_aligned(exponent, AlignBytes)) { + // 路径 A2: 张量指数向量化 + op::float_power::moore::float_power_kernel_vectorized_tensor + <<>>( + out_ptr, in_ptr, exp_ptr, num_packs, functor + ); + return; + } + } + + // ------------------------------------------------------------------ + // 2. 
通用回退路径 (Fallback Path) + // ------------------------------------------------------------------ + size_t block_size = 256; + size_t grid_size = (numel + block_size - 1) / block_size; + + op::float_power::moore::float_power_kernel + <<>>( + out_ptr, in_ptr, exp_ptr, scalar_exp, is_scalar, numel, functor + ); +} + +// ================================================================== +// Descriptor 接口实现 +// ================================================================== +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { if (_opaque) delete _opaque; } + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, Descriptor **desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t exponent, + float scalar_exponent) { + + auto handle = reinterpret_cast(handle_); + + auto info_result = FloatPowerInfo::create(y, x, exponent, scalar_exponent); + if (!info_result) return info_result.status(); + + size_t workspace_size = 0; + *desc_ptr = new Descriptor(new Opaque(), info_result.take(), workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, void *output, + const void *input, const void *exponent, + void *stream) const { + + auto in_dtype = _info.input_dtype(); + auto out_dtype = _info.output_dtype(); + + switch (in_dtype) { + + case INFINI_DTYPE_F32: + switch (out_dtype) { + case INFINI_DTYPE_F32: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F16: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel<__mt_bfloat16, float>(output, input, exponent, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + break; + + case INFINI_DTYPE_F64: + switch (out_dtype) { + case INFINI_DTYPE_F32: + 
launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F16: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel<__mt_bfloat16, double>(output, input, exponent, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + break; + + case INFINI_DTYPE_F16: + switch (out_dtype) { + case INFINI_DTYPE_F32: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F16: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel<__mt_bfloat16, half>(output, input, exponent, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + break; + + case INFINI_DTYPE_BF16: + switch (out_dtype) { + case INFINI_DTYPE_F32: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F16: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel<__mt_bfloat16, __mt_bfloat16>(output, input, exponent, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + break; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::float_power::moore \ No newline at end of file diff --git a/src/infiniop/ops/float_power/moore/float_power_moore_kernel.h b/src/infiniop/ops/float_power/moore/float_power_moore_kernel.h new file mode 100644 index 000000000..d2ea6f33f --- /dev/null +++ b/src/infiniop/ops/float_power/moore/float_power_moore_kernel.h @@ -0,0 +1,147 @@ +#ifndef __FLOAT_POWER_MOORE_KERNEL_H__ +#define __FLOAT_POWER_MOORE_KERNEL_H__ + +#include +#include +#include 
+#include +#include + +namespace op::float_power::moore { + +// ================================================================== +// 类型转换辅助函数 (适配 MUSA) +// ================================================================== +template +__device__ __forceinline__ float to_float(T val) { + if constexpr (std::is_same_v) { + return __half2float(val); + } else if constexpr (std::is_same_v) { + return __bfloat162float(val); + } else { + return static_cast(val); + } +} + +template +__device__ __forceinline__ T from_float(float val) { + if constexpr (std::is_same_v) { + return __float2half(val); + } else if constexpr (std::is_same_v) { + return __float2bfloat16(val); + } else { + return static_cast(val); + } +} + +// ================================================================== +// 基础定义: 向量化数据打包结构 +// ================================================================== +template +struct alignas(sizeof(T) * N) Pack { + T val[N]; +}; + +// ================================================================== +// Functor: 仅负责核心数学计算逻辑 +// ================================================================== +struct FloatPowerFunctor { + template + __device__ __forceinline__ float compute(const T_IN &input, float exponent_val) const { + // 使用 to_float 辅助函数处理 FP16/BF16 + float in_f = to_float(input); + return powf(in_f, exponent_val); + } +}; + +// ================================================================== +// 1. 通用处理 Kernel (Grid-Stride Loop) +// ================================================================== +template +__global__ void float_power_kernel( + T_OUT * __restrict__ output, + const T_IN * __restrict__ input, + const T_EXP * __restrict__ exponent, + float scalar_exponent, + bool is_scalar, + size_t numel, + FloatPowerFunctor functor) { + + // Grid-Stride Loop + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < numel; + idx += blockDim.x * gridDim.x) { + + float exp_val_f = is_scalar ? 
scalar_exponent : to_float(exponent[idx]); + output[idx] = from_float(functor.compute(input[idx], exp_val_f)); + } +} + +// ================================================================== +// 2. 标量模式向量化 Kernel +// ================================================================== +template +__global__ void float_power_kernel_vectorized_scalar( + T_OUT * __restrict__ output, + const T_IN * __restrict__ input, + float scalar_exponent, + size_t num_packs, + FloatPowerFunctor functor) { + + using PackTypeIn = Pack; + using PackTypeOut = Pack; + + auto in_vec = reinterpret_cast(input); + auto out_vec = reinterpret_cast(output); + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < num_packs) { + PackTypeIn in_pack = in_vec[idx]; + PackTypeOut out_pack; + + #pragma unroll + for (int i = 0; i < PackSize; ++i) { + out_pack.val[i] = from_float(functor.compute(in_pack.val[i], scalar_exponent)); + } + out_vec[idx] = out_pack; + } +} + +// ================================================================== +// 3. 
张量模式向量化 Kernel +// ================================================================== +template +__global__ void float_power_kernel_vectorized_tensor( + T_OUT * __restrict__ output, + const T_IN * __restrict__ input, + const T_IN * __restrict__ exponent, + size_t num_packs, + FloatPowerFunctor functor) { + + using PackTypeIn = Pack; + using PackTypeOut = Pack; + + auto in_vec = reinterpret_cast(input); + auto exp_vec = reinterpret_cast(exponent); + auto out_vec = reinterpret_cast(output); + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < num_packs) { + PackTypeIn in_pack = in_vec[idx]; + PackTypeIn exp_pack = exp_vec[idx]; + PackTypeOut out_pack; + + #pragma unroll + for (int i = 0; i < PackSize; ++i) { + float e = to_float(exp_pack.val[i]); + out_pack.val[i] = from_float(functor.compute(in_pack.val[i], e)); + } + out_vec[idx] = out_pack; + } +} + +} // namespace op::float_power::moore + +#endif // __FLOAT_POWER_MOORE_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/float_power/nvidia/float_power_nvidia.cu b/src/infiniop/ops/float_power/nvidia/float_power_nvidia.cu new file mode 100644 index 000000000..24e57508a --- /dev/null +++ b/src/infiniop/ops/float_power/nvidia/float_power_nvidia.cu @@ -0,0 +1,202 @@ +#include "float_power_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../../../handle.h" +#include +#include + +namespace op::float_power::nvidia { + +// ================================================================== +// 辅助函数: 检查内存地址对齐情况 +// ================================================================== +template +bool is_aligned(const void *ptr, size_t alignment) { + return reinterpret_cast(ptr) % alignment == 0; +} +template +void launch_kernel( + void *output, + const void *input, + const void *exponent, + const FloatPowerInfo &info, + void *stream) { + + size_t numel = info.num_elements(); + bool is_scalar = info.is_scalar_exponent(); + float scalar_exp = info.scalar_exponent(); + + auto out_ptr = 
reinterpret_cast(output); + auto in_ptr = reinterpret_cast(input); + // 假设指数 Tensor 的数据类型与输入 Tensor 一致 + auto exp_ptr = reinterpret_cast(exponent); + + auto cuda_stream = reinterpret_cast(stream); + op::float_power::cuda::FloatPowerFunctor functor; + + // ------------------------------------------------------------------ + // 1. 向量化分发路径 (Vectorized Path) + // ------------------------------------------------------------------ + constexpr int AlignBytes = 16; // 16字节对齐是 CUDA 访存优化的标准 + constexpr int PackSizeIn = AlignBytes / sizeof(T_IN); + bool types_same_size = (sizeof(T_IN) == sizeof(T_OUT)); + + bool can_vectorize_base = types_same_size && + (PackSizeIn > 1) && + (numel % PackSizeIn == 0) && + is_aligned(input, AlignBytes) && + is_aligned(output, AlignBytes); + + if (can_vectorize_base) { + size_t num_packs = numel / PackSizeIn; + size_t block_size = 256; + size_t grid_size = (num_packs + block_size - 1) / block_size; + + if (is_scalar) { + // 路径 A1: 标量指数向量化(极快) + op::float_power::cuda::float_power_kernel_vectorized_scalar + <<>>( + out_ptr, in_ptr, scalar_exp, num_packs, functor + ); + return; + } else if (is_aligned(exponent, AlignBytes)) { + // 路径 A2: 张量指数向量化(解决 0.2x 倍速问题的核心) + op::float_power::cuda::float_power_kernel_vectorized_tensor + <<>>( + out_ptr, in_ptr, exp_ptr, num_packs, functor + ); + return; + } + } + + // ------------------------------------------------------------------ + // 2. 
通用回退路径 (Fallback Path) + // 处理不对齐、非对称类型转换或小规模数据的场景 + // ------------------------------------------------------------------ + size_t block_size = 256; + size_t grid_size = (numel + block_size - 1) / block_size; + + op::float_power::cuda::float_power_kernel + <<>>( + out_ptr, in_ptr, exp_ptr, scalar_exp, is_scalar, numel, functor + ); +} + +// ================================================================== +// Descriptor 接口实现 +// ================================================================== +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { if (_opaque) delete _opaque; } + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t exponent, + float scalar_exponent) { + + auto info_result = FloatPowerInfo::create(y, x, exponent, scalar_exponent); + if (!info_result) return info_result.status(); + + size_t workspace_size = 0; + *desc_ptr = new Descriptor(new Opaque(), info_result.take(), workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, void *output, + const void *input, const void *exponent, + void *stream) const { + + auto in_dtype = _info.input_dtype(); + auto out_dtype = _info.output_dtype(); + + // ================================================================== + // 完全显式双重分发 (Fully Explicit Double Dispatch) + // ================================================================== + + switch (in_dtype) { + + case INFINI_DTYPE_F32: + switch (out_dtype) { + case INFINI_DTYPE_F32: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F16: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel(output, input, exponent, _info, stream); + 
break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + break; + + case INFINI_DTYPE_F64: + switch (out_dtype) { + case INFINI_DTYPE_F32: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F16: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel(output, input, exponent, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + break; + + case INFINI_DTYPE_F16: + switch (out_dtype) { + case INFINI_DTYPE_F32: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F16: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel(output, input, exponent, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + break; + + case INFINI_DTYPE_BF16: + switch (out_dtype) { + case INFINI_DTYPE_F32: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_F16: + launch_kernel(output, input, exponent, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel(output, input, exponent, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + break; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::float_power::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/float_power/nvidia/float_power_nvidia.cuh b/src/infiniop/ops/float_power/nvidia/float_power_nvidia.cuh new file mode 100644 index 000000000..cb170b339 --- /dev/null +++ b/src/infiniop/ops/float_power/nvidia/float_power_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __FLOAT_POWER_NVIDIA_CUH__ +#define 
__FLOAT_POWER_NVIDIA_CUH__ + +#include "../float_power.h" +DESCRIPTOR(nvidia) + +#endif // __FLOAT_POWER_NVIDIA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/float_power/operator.cc b/src/infiniop/ops/float_power/operator.cc new file mode 100644 index 000000000..428ecf0e5 --- /dev/null +++ b/src/infiniop/ops/float_power/operator.cc @@ -0,0 +1,180 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/float_power.h" + +#ifdef ENABLE_CPU_API +#include "cpu/float_power_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/float_power_nvidia.cuh" +#endif + +#ifdef ENABLE_METAX_API +#include "metax/float_power_metax.h" +#endif + +#ifdef ENABLE_MOORE_API +#include "moore/float_power_moore.h" +#endif + +extern "C" { + +// ======================================================================= +// 1. Create the operator descriptor: dispatch on handle->device to the matching backend's Descriptor::create (ILUVATAR and QY reuse the nvidia backend) +// ======================================================================= +__C infiniStatus_t infiniopCreateFloatPowerDescriptor( + infiniopHandle_t handle, + infiniopFloatPowerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t exponent, + float scalar_exponent) { + + #define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::float_power::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y, \ + x, \ + exponent, \ + scalar_exponent) + + switch (handle->device) { + #ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + 
#undef CREATE +} + +// ======================================================================= +// 2. Query the workspace size: store the backend descriptor's workspaceSize() in *size, dispatching on desc->device_type +// ======================================================================= +__C infiniStatus_t infiniopGetFloatPowerWorkspaceSize(infiniopFloatPowerDescriptor_t desc, size_t *size) { + + #define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef GET +} + +// ======================================================================= +// 3. 
Run the computation (Calculate): forward all arguments to the backend descriptor's calculate(), dispatching on desc->device_type +// ======================================================================= +__C infiniStatus_t infiniopFloatPower( + infiniopFloatPowerDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *exponent, // [newly added parameter] exponent data, passed through to calculate() + void *stream) { + + #define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, exponent, stream) + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CALCULATE +} + +// ======================================================================= +// 4. 
Destroy the descriptor: delete it through the backend type selected by desc->device_type +// ======================================================================= +__C infiniStatus_t infiniopDestroyFloatPowerDescriptor(infiniopFloatPowerDescriptor_t desc) { + + #define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef DELETE +} + +} // extern "C" \ No newline at end of file diff --git a/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.cc b/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.cc new file mode 100644 index 000000000..147221a77 --- /dev/null +++ b/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.cc @@ -0,0 +1,58 @@ +#include "floor_divide_cpu.h" + +namespace op::floor_divide::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create the CPU elementwise descriptor (macro is defined outside this file) + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, 
out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; // not reached: every switch path above returns +} +} // namespace op::floor_divide::cpu \ No newline at end of file diff --git a/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.h b/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.h new file mode 100644 index 000000000..ec5fcfac1 --- /dev/null +++ b/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.h @@ -0,0 +1,30 @@ +#ifndef __FLOOR_DIVIDE_CPU_H__ +#define __FLOOR_DIVIDE_CPU_H__ + +#include +#include +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(floor_divide, cpu) + +namespace op::floor_divide::cpu { +typedef struct FloorDivideOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + if constexpr (std::is_floating_point_v) { + return std::floor(a / b); + } else { + T res = a / b; // truncating quotient; NOTE(review): b == 0 is undefined behaviour on this non-floating path - confirm callers exclude it + T rem = a % b; + if (rem != 0 && ((a < 0) ^ (b < 0))) { // inexact and operands differ in sign: step down to the floor + res -= 1; + } + return res; + } + } +} FloorDivideOp; +} // namespace op::floor_divide::cpu + +#endif // __FLOOR_DIVIDE_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/floor_divide/cuda/kernel.cuh b/src/infiniop/ops/floor_divide/cuda/kernel.cuh new file mode 100644 
index 000000000..9f77280f1 --- /dev/null +++ b/src/infiniop/ops/floor_divide/cuda/kernel.cuh @@ -0,0 +1,36 @@ +#ifndef __FLOOR_DIVIDE_CUDA_H__ +#define __FLOOR_DIVIDE_CUDA_H__ + +#include +#include + +namespace op::floor_divide::cuda { +typedef struct FloorDivideOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + return h2floor(__h2div(a, b)); + } else if constexpr (std::is_same_v) { + return hfloor(__hdiv(a, b)); + } else if constexpr (std::is_same_v) { + float val = __bfloat162float(a) / __bfloat162float(b); + return __float2bfloat16(floorf(val)); + } else if constexpr (std::is_same_v) { + return floorf(a / b); + } else if constexpr (std::is_same_v) { + return floor(a / b); + } else { + T res = a / b; // truncating quotient; NOTE(review): b == 0 is not guarded on this fallback path - confirm callers exclude it + T rem = a % b; + if (rem != 0 && ((a < 0) ^ (b < 0))) { // inexact and operands differ in sign: step down to the floor + res -= 1; + } + return res; + } + } +} FloorDivideOp; +} // namespace op::floor_divide::cuda + +#endif // __FLOOR_DIVIDE_CUDA_H__ \ No newline at end of file diff --git a/src/infiniop/ops/floor_divide/metax/floor_divide_metax.h b/src/infiniop/ops/floor_divide/metax/floor_divide_metax.h new file mode 100644 index 000000000..d77b7af90 --- /dev/null +++ b/src/infiniop/ops/floor_divide/metax/floor_divide_metax.h @@ -0,0 +1,8 @@ +#ifndef __FLOOR_DIVIDE_METAX_API_H__ +#define __FLOOR_DIVIDE_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(floor_divide, metax) + +#endif // __FLOOR_DIVIDE_METAX_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/floor_divide/metax/floor_divide_metax.maca b/src/infiniop/ops/floor_divide/metax/floor_divide_metax.maca new file mode 100644 index 000000000..078b63690 --- /dev/null +++ b/src/infiniop/ops/floor_divide/metax/floor_divide_metax.maca @@ -0,0 +1,124 @@ +#include "floor_divide_metax.h" +#include "../../../elementwise/metax/elementwise_metax.h" +#include +#include +#include +using 
nv_bfloat16 = __maca_bfloat16; +using nv_bfloat162 = __maca_bfloat162; + +namespace op::floor_divide::metax { + +struct FloorDivideOp { + static constexpr size_t num_inputs = 2; + + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + // ------------------------------------------------ + // 1. Half2 (vectorised) + // ------------------------------------------------ + if constexpr (std::is_same_v) { + // MACA: convert to float2 and floor each lane's quotient in float + float2 fa = __half22float2(a); + float2 fb = __half22float2(b); + float2 res; + res.x = floorf(fa.x / fb.x); + res.y = floorf(fa.y / fb.y); + return __float22half2_rn(res); + } + // ------------------------------------------------ + // 2. Half (scalar) + // ------------------------------------------------ + else if constexpr (std::is_same_v) { + return __float2half(floorf(__half2float(a) / __half2float(b))); + } + // ------------------------------------------------ + // 3. BFloat16 + // ------------------------------------------------ + else if constexpr (std::is_same_v) { + float val = __bfloat162float(a) / __bfloat162float(b); + return __float2bfloat16(floorf(val)); + } + // ------------------------------------------------ + // 4. Float / Double + // ------------------------------------------------ + else if constexpr (std::is_same_v) { + return floorf(a / b); + } else if constexpr (std::is_same_v) { + return floor(a / b); + } + // ------------------------------------------------ + // 5. Integer types (Int32 / Int64) + // ------------------------------------------------ + else { + // Python semantics: round toward negative infinity. NOTE(review): b == 0 is not guarded here - confirm callers exclude it + T res = a / b; + T rem = a % b; + if (rem != 0 && ((a < 0) ^ (b < 0))) { + res -= 1; + } + return res; + } + } +}; + +// ================================================================== +// 3. 
Descriptor implementation +// ================================================================== +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create Metax elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, FloorDivideOp, __half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, FloorDivideOp, nv_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, FloorDivideOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, FloorDivideOp, double>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, FloorDivideOp, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, FloorDivideOp, int64_t>(_info, workspace, output, inputs, stream); + default: + return 
INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::floor_divide::metax \ No newline at end of file diff --git a/src/infiniop/ops/floor_divide/moore/floor_divide_moore.h b/src/infiniop/ops/floor_divide/moore/floor_divide_moore.h new file mode 100644 index 000000000..e14c09e2e --- /dev/null +++ b/src/infiniop/ops/floor_divide/moore/floor_divide_moore.h @@ -0,0 +1,8 @@ +#ifndef __FLOOR_DIVIDE_MOORE_API_H__ +#define __FLOOR_DIVIDE_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(floor_divide, moore) + +#endif // __FLOOR_DIVIDE_MOORE_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/floor_divide/moore/floor_divide_moore.mu b/src/infiniop/ops/floor_divide/moore/floor_divide_moore.mu new file mode 100644 index 000000000..f5fce2b6f --- /dev/null +++ b/src/infiniop/ops/floor_divide/moore/floor_divide_moore.mu @@ -0,0 +1,69 @@ +#include "../../../elementwise/moore/elementwise_moore.h" +#include "floor_divide_moore.h" +#include "floor_divide_moore_kernel.h" +#include "../../../devices/moore/moore_handle.h" + +namespace op::floor_divide::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + // Check that the output dtype is one of the supported types + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64); + + // Check that the shapes match (the Moore elementwise framework usually also requires strict shapes, or handles broadcasting itself) + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // Create the Moore elementwise descriptor + // Assumes a matching macro, 
CREATE_ELEMENTWISE_MOORE_DESCRIPTOR, exists to initialise _device_info + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, FloorDivideOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + // Moore targets typically use __mt_bfloat16 + return _device_info->calculate<256, FloorDivideOp, __mt_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, FloorDivideOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, FloorDivideOp, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, FloorDivideOp, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, FloorDivideOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::floor_divide::moore \ No newline at end of file diff --git a/src/infiniop/ops/floor_divide/moore/floor_divide_moore_kernel.h b/src/infiniop/ops/floor_divide/moore/floor_divide_moore_kernel.h new file mode 100644 index 000000000..c911cbdfb --- /dev/null +++ b/src/infiniop/ops/floor_divide/moore/floor_divide_moore_kernel.h @@ -0,0 +1,39 @@ +#ifndef __FLOOR_DIVIDE_MOORE_H__ +#define __FLOOR_DIVIDE_MOORE_H__ + +#include +#include +#include +#include + +namespace op::floor_divide::moore { +typedef struct FloorDivideOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, 
const T &b) const { + if constexpr (std::is_same_v) { + return h2floor(__h2div(a, b)); + } else if constexpr (std::is_same_v) { + return hfloor(__hdiv(a, b)); + } else if constexpr (std::is_same_v) { + float val = __bfloat162float(a) / __bfloat162float(b); + return __float2bfloat16(floorf(val)); + } else if constexpr (std::is_same_v) { + return floorf(a / b); + } else if constexpr (std::is_same_v) { + return floor(a / b); + } else { + // Integer types: truncating quotient, stepped down by one when inexact and the operands differ in sign + T res = a / b; + T rem = a % b; + if (rem != 0 && ((a < 0) ^ (b < 0))) { + res -= 1; + } + return res; + } + } +} FloorDivideOp; +} // namespace op::floor_divide::moore + +#endif // __FLOOR_DIVIDE_MOORE_H__ \ No newline at end of file diff --git a/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cu b/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cu new file mode 100644 index 000000000..830fe3b05 --- /dev/null +++ b/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cu @@ -0,0 +1,65 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "floor_divide_nvidia.cuh" + +namespace op::floor_divide::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t 
Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::FloorDivideOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::FloorDivideOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::FloorDivideOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::FloorDivideOp, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::FloorDivideOp, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::FloorDivideOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::floor_divide::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cuh b/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cuh new file mode 100644 index 000000000..684c6d189 --- /dev/null +++ b/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __FLOOR_DIVIDE_CUDA_API_H__ +#define __FLOOR_DIVIDE_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(floor_divide, nvidia) + +#endif // __FLOOR_DIVIDE_CUDA_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/floor_divide/operator.cc b/src/infiniop/ops/floor_divide/operator.cc new file mode 100644 index 000000000..320af088f --- /dev/null +++ b/src/infiniop/ops/floor_divide/operator.cc @@ -0,0 +1,202 @@ 
+#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/floor_divide.h" + +#ifdef ENABLE_CPU_API +#include "cpu/floor_divide_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/floor_divide_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/floor_divide_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API // NOTE(review): this patch's file list adds no kunlun/ directory for floor_divide - confirm the header exists before enabling this backend +#include "kunlun/floor_divide_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API // NOTE(review): this patch's file list adds no bang/ directory for floor_divide - confirm the header exists before enabling this backend +#include "bang/floor_divide_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/floor_divide_moore.h" +#endif + +__C infiniStatus_t infiniopCreateFloorDivideDescriptor( + infiniopHandle_t handle, + infiniopFloorDivideDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::floor_divide::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetFloorDivideWorkspaceSize(infiniopFloorDivideDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch 
(desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; // not reached: every switch path above returns +} + +__C infiniStatus_t infiniopFloorDivide( + infiniopFloorDivideDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyFloorDivideDescriptor(infiniopFloorDivideDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + 
switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/multi_margin_loss/cpu/multi_margin_loss_cpu.cc b/src/infiniop/ops/multi_margin_loss/cpu/multi_margin_loss_cpu.cc new file mode 100644 index 000000000..4e3f6d4d6 --- /dev/null +++ b/src/infiniop/ops/multi_margin_loss/cpu/multi_margin_loss_cpu.cc @@ -0,0 +1,175 @@ +#include "multi_margin_loss_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include +#include +#include +#include + +#include "../../../../utils/custom_types.h" + +namespace op::multi_margin_loss::cpu { + +struct Descriptor::Opaque {}; // empty: the CPU backend stores nothing in its opaque state + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + _opaque = nullptr; + } +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t target_desc, + infiniopTensorDescriptor_t weight_desc, + int p, + float margin, + int reduction) { + + auto handle = reinterpret_cast(handle_); + + auto result = MultiMarginLossInfo::create(out_desc, input_desc, target_desc, weight_desc, p, margin, reduction); + CHECK_RESULT(result); // presumably returns early when create() reported an error - macro is defined elsewhere + + *desc_ptr = new Descriptor( + new Opaque(), + result.take(), + 0, // presumably the workspace size - TODO confirm against the Descriptor constructor + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + 
+template +void calculate_cpu_impl( + const MultiMarginLossInfo &info, + void *output, + const void *input, + const void *target, + const void *weight) { + + size_t N = info.batch_size(); + size_t C = info.num_classes(); + int p = info.p(); + float margin = info.margin(); + int reduction = info.reduction(); + bool has_weight = info.has_weight(); + + auto out_ptr = reinterpret_cast(output); + auto in_ptr = reinterpret_cast(input); + auto tar_ptr = reinterpret_cast(target); + auto weight_ptr = reinterpret_cast(weight); + + if (reduction == 0) { // 0: no reduction, one loss value is written per sample + #pragma omp parallel for schedule(static) + for (size_t n = 0; n < N; ++n) { + int64_t target_idx = tar_ptr[n]; + + if (target_idx < 0 || target_idx >= static_cast(C)) { // out-of-range target: this sample's loss is written as 0 + out_ptr[n] = utils::cast(0.0f); + continue; + } + + const T* row_ptr = in_ptr + n * C; // NOTE(review): indexes input as dense row-major [N, C]; strides are not consulted - confirm callers pass contiguous input + float target_score = utils::cast(row_ptr[target_idx]); + float sum_loss = 0.0f; + + for (size_t c = 0; c < C; ++c) { + if (c == static_cast(target_idx)) continue; // skip the target class itself + + float other_score = utils::cast(row_ptr[c]); + float diff = margin - target_score + other_score; + + if (diff > 0.0f) { + sum_loss += (p == 1) ? diff : (diff * diff); + } + } + + sum_loss /= static_cast(C); // divide by the class count C + + if (has_weight) { + float w = utils::cast(weight_ptr[target_idx]); // weight of the target class + sum_loss *= w; + } + + out_ptr[n] = utils::cast(sum_loss); + } + } else { // any other mode: accumulate over samples in double and write a single value to out_ptr[0] + double total_loss = 0.0; + + #pragma omp parallel for reduction(+:total_loss) schedule(static) + for (size_t n = 0; n < N; ++n) { + int64_t target_idx = tar_ptr[n]; + + if (target_idx < 0 || target_idx >= static_cast(C)) continue; // out-of-range target contributes nothing to the total + + const T* row_ptr = in_ptr + n * C; + float target_score = utils::cast(row_ptr[target_idx]); + float sum_sample_loss = 0.0f; + + for (size_t c = 0; c < C; ++c) { + if (c == static_cast(target_idx)) continue; + + float other_score = utils::cast(row_ptr[c]); + float diff = margin - target_score + other_score; + + if (diff > 0.0f) { + sum_sample_loss += (p == 1) ? 
diff : (diff * diff); + } + } + + sum_sample_loss /= static_cast(C); + + if (has_weight) { + float w = utils::cast(weight_ptr[target_idx]); + sum_sample_loss *= w; + } + + total_loss += static_cast(sum_sample_loss); + } + + if (reduction == 1) { // 1: mean over all N samples (skipped samples still count in N); otherwise the sum is kept + total_loss /= static_cast(N); + } + + out_ptr[0] = utils::cast(static_cast(total_loss)); + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *target, + const void *weight, + void *stream) const { + + auto dtype = _info.dtype(); // workspace, workspace_size and stream are not used on this CPU path + + switch (dtype) { + case INFINI_DTYPE_F32: + cpu::calculate_cpu_impl(_info, output, input, target, weight); + break; + case INFINI_DTYPE_F64: + cpu::calculate_cpu_impl(_info, output, input, target, weight); + break; + case INFINI_DTYPE_F16: + cpu::calculate_cpu_impl(_info, output, input, target, weight); + break; + case INFINI_DTYPE_BF16: + cpu::calculate_cpu_impl(_info, output, input, target, weight); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::multi_margin_loss::cpu \ No newline at end of file diff --git a/src/infiniop/ops/multi_margin_loss/cpu/multi_margin_loss_cpu.h b/src/infiniop/ops/multi_margin_loss/cpu/multi_margin_loss_cpu.h new file mode 100644 index 000000000..39098ff7d --- /dev/null +++ b/src/infiniop/ops/multi_margin_loss/cpu/multi_margin_loss_cpu.h @@ -0,0 +1,8 @@ +#ifndef __MULTI_MARGIN_LOSS_CPU_H__ +#define __MULTI_MARGIN_LOSS_CPU_H__ + +#include "../multi_margin_loss.h" + +DESCRIPTOR(cpu) + +#endif // __MULTI_MARGIN_LOSS_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/multi_margin_loss/cuda/kernel.cuh b/src/infiniop/ops/multi_margin_loss/cuda/kernel.cuh new file mode 100644 index 000000000..7261a99a4 --- /dev/null +++ b/src/infiniop/ops/multi_margin_loss/cuda/kernel.cuh @@ -0,0 +1,166 @@ +#ifndef __MULTI_MARGIN_LOSS_CUDA_CUH__ +#define __MULTI_MARGIN_LOSS_CUDA_CUH__ + +#include +#include 
+#include + +#include +#include + +namespace op::multi_margin_loss::cuda { +template +struct alignas(sizeof(T) * N) Pack { + T val[N]; +}; + +// ================================================================== +// Reduction helpers (Warp & Block Reduction) +// ================================================================== +__device__ __forceinline__ float warpReduceSum(float val) { + unsigned int mask = 0xffffffff; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += __shfl_down_sync(mask, val, offset); + return val; +} + +__device__ __forceinline__ float blockReduceSum(float val) { + static __shared__ float shared[32]; // Max 1024 threads / 32 warps + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + + val = warpReduceSum(val); + if (lane == 0) shared[wid] = val; + __syncthreads(); + + // Assumes blockDim.x is a multiple of warpSize: the integer division below ignores a trailing partial warp + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0f; + if (wid == 0) val = warpReduceSum(val); + return val; +} + +// ================================================================== +// Functor: core math +// ================================================================== +struct MultiMarginLossFunctor { + int p; + float margin; + + __host__ __device__ MultiMarginLossFunctor(int p_val, float margin_val) + : p(p_val), margin(margin_val) {} + + // Loss contribution of a single class c: diff when p == 1, diff * diff otherwise, 0 when diff <= 0 + // diff = margin - target_score + other_score + __device__ __forceinline__ float compute(float diff) const { + if (diff > 0.0f) { + return (p == 1) ? 
diff : diff * diff; + } + return 0.0f; + } +}; +template +__global__ void multi_margin_loss_kernel( + T * __restrict__ output, // [N] + const T * __restrict__ input, // [N, C] + const int64_t * __restrict__ target, // [N] + const T * __restrict__ weight, // [C] (Optional) + size_t N, + size_t C, + MultiMarginLossFunctor functor) { + + size_t n = blockIdx.x * blockDim.x + threadIdx.x; + + if (n < N) { + int64_t target_idx = target[n]; + + // Bounds check: an out-of-range target writes a loss of 0 for this sample + if (target_idx < 0 || target_idx >= static_cast(C)) { + output[n] = static_cast(0.0f); + return; + } + + // Start of row n in input + const T* row_ptr = input + n * C; + float target_score = static_cast(row_ptr[target_idx]); + float sum_loss = 0.0f; + + // Iterate over all classes, skipping the target class + for (size_t c = 0; c < C; ++c) { + if (c == static_cast(target_idx)) continue; + + float other_score = static_cast(row_ptr[c]); + float diff = functor.margin - target_score + other_score; + sum_loss += functor.compute(diff); + } + + // Formula: sum / C + sum_loss /= static_cast(C); + + // Apply the target class's weight when a weight pointer was supplied + if (weight != nullptr) { + float w = static_cast(weight[target_idx]); + sum_loss *= w; + } + + output[n] = static_cast(sum_loss); + } +} +template +__global__ void multi_margin_loss_reduce_kernel( + float * output, // [1] Accumulator (Float) + const T * __restrict__ input, // [N, C] + const int64_t * __restrict__ target, // [N] + const T * __restrict__ weight, // [C] + size_t N, + size_t C, + MultiMarginLossFunctor functor, + float scale // pass 1/N for Mean mode, 1.0 for Sum mode +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + float local_sum = 0.0f; + + // Grid-Stride Loop over Batch Dimension N + for (size_t n = idx; n < N; n += stride) { + int64_t target_idx = target[n]; + + if (target_idx >= 0 && target_idx < static_cast(C)) { + const T* row_ptr = input + n * C; + float target_score = static_cast(row_ptr[target_idx]); + float sample_loss = 0.0f; + + for (size_t c = 0; c < C; ++c) { + if (c == static_cast(target_idx)) continue; + + float 
other_score = static_cast(row_ptr[c]); + float diff = functor.margin - target_score + other_score; + sample_loss += functor.compute(diff); + } + + sample_loss /= static_cast(C); + + if (weight != nullptr) { + float w = static_cast(weight[target_idx]); + sample_loss *= w; + } + + local_sum += sample_loss; + } + } + + // Block Reduction + float block_sum = blockReduceSum(local_sum); + + // Global Atomic Add (Reduce to scalar); NOTE(review): *output must hold its initial value before launch - presumably zeroed by the launcher, confirm + if (threadIdx.x == 0) { + atomicAdd(output, block_sum * scale); + } +} +template +__global__ void cast_float_to_t(T* output, const float* src) { + *output = static_cast(*src); +} + +} // namespace op::multi_margin_loss::cuda + +#endif // __MULTI_MARGIN_LOSS_CUDA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/multi_margin_loss/info.h b/src/infiniop/ops/multi_margin_loss/info.h new file mode 100644 index 000000000..d460639c1 --- /dev/null +++ b/src/infiniop/ops/multi_margin_loss/info.h @@ -0,0 +1,104 @@ +#ifndef __MULTI_MARGIN_LOSS_INFO_H__ +#define __MULTI_MARGIN_LOSS_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::multi_margin_loss { + +class MultiMarginLossInfo { + MultiMarginLossInfo() = default; + +public: + int _dtype; // data type of input / weight / output + int _p; // norm degree (1 or 2) + float _margin; // margin value + int _reduction; // reduction mode (0:None, 1:Mean, 2:Sum) + bool _has_weight; // whether a weight tensor is present + size_t _batch_size; // N + size_t _num_classes; // C + + int dtype() const { return _dtype; } + int p() const { return _p; } + float margin() const { return _margin; } + int reduction() const { return _reduction; } + bool has_weight() const { return _has_weight; } + size_t batch_size() const { return _batch_size; } + size_t num_classes() const { return _num_classes; } + + // Constructor + MultiMarginLossInfo(int dtype, int p, float margin, int reduction, bool has_weight, size_t batch, size_t classes) + : _dtype(dtype), _p(p), _margin(margin), _reduction(reduction), + _has_weight(has_weight), _batch_size(batch), 
_num_classes(classes) {} + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t target_desc, + infiniopTensorDescriptor_t weight_desc, // may be nullptr + int p, + float margin, + int reduction) { + + // 1. Check input shapes (Input vs Target) + // Input: (N, C), Target: (N) + if (input_desc->ndim() != 2) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (target_desc->ndim() != 1) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t N = input_desc->shape()[0]; + size_t C = input_desc->shape()[1]; + + if (target_desc->shape()[0] != N) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (target_desc->dtype() != INFINI_DTYPE_I64) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + // Output and Input dtypes must match + if (out_desc->dtype() != input_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + bool has_weight = (weight_desc != nullptr); + if (has_weight) { + // Weight: (C) + if (weight_desc->ndim() != 1 || weight_desc->shape()[0] != C) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + // Weight dtype must match Input + if (weight_desc->dtype() != input_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } + if (reduction == 0) { + if (out_desc->ndim() != 1 || out_desc->shape()[0] != N) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } else { + // Reduction::Mean/Sum -> output must be a scalar (numel == 1) + if (out_desc->numel() != 1) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + if ((p != 1 && p != 2) || reduction < 0 || reduction > 2) { // reject p outside {1, 2} and reduction modes outside the documented 0..2 instead of silently treating them as Sum + return INFINI_STATUS_BAD_PARAM; + } + return utils::Result(MultiMarginLossInfo{ + input_desc->dtype(), // _dtype + p, // _p + margin, // _margin + reduction, // _reduction + has_weight, // _has_weight + N, // _batch_size + C // _num_classes + }); + } +}; + +} // namespace op::multi_margin_loss + +#endif // __MULTI_MARGIN_LOSS_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/multi_margin_loss/metax/multi_margin_loss_metax.h 
b/src/infiniop/ops/multi_margin_loss/metax/multi_margin_loss_metax.h new file mode 100644 index 000000000..c7b3043cd --- /dev/null +++ b/src/infiniop/ops/multi_margin_loss/metax/multi_margin_loss_metax.h @@ -0,0 +1,8 @@ +#ifndef __MULTI_MARGIN_LOSS_METAX_API_H__ +#define __MULTI_MARGIN_LOSS_METAX_API_H__ + +#include "../multi_margin_loss.h" + +DESCRIPTOR(metax) + +#endif // __MULTI_MARGIN_LOSS_METAX_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/multi_margin_loss/metax/multi_margin_loss_metax.maca b/src/infiniop/ops/multi_margin_loss/metax/multi_margin_loss_metax.maca new file mode 100644 index 000000000..a5e133e7f --- /dev/null +++ b/src/infiniop/ops/multi_margin_loss/metax/multi_margin_loss_metax.maca @@ -0,0 +1,304 @@ +#include "multi_margin_loss_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include +#include +#include +#include +#include + +// ================================================================== +// 1. MACA 类型兼容 +// ================================================================== +#if defined(__MACA__) || defined(__MACACC__) + #include + #include + using nv_bfloat16 = __maca_bfloat16; + using nv_bfloat162 = __maca_bfloat162; +#endif + +namespace op::multi_margin_loss::metax { + +// ================================================================== +// 2. Kernel 定义 +// ================================================================== + +// Functor: 核心数学逻辑 +struct MultiMarginLossFunctor { + int p; + float margin; + + __host__ __device__ MultiMarginLossFunctor(int p_val, float margin_val) + : p(p_val), margin(margin_val) {} + + // 计算单个 class c 的 loss 分量 + __device__ __forceinline__ float compute(float diff) const { + if (diff > 0.0f) { + return (p == 1) ? 
diff : diff * diff; + } + return 0.0f; + } +}; + +// ------------------------------------------------------------------ +// Kernel 1: Elementwise 模式 (Reduction = None) +// ------------------------------------------------------------------ +template +__global__ void multi_margin_loss_kernel( + T * __restrict__ output, // [N] + const T * __restrict__ input, // [N, C] + const int64_t * __restrict__ target, // [N] + const T * __restrict__ weight, // [C] (Optional) + size_t N, + size_t C, + MultiMarginLossFunctor functor) { + + size_t n = blockIdx.x * blockDim.x + threadIdx.x; + + if (n < N) { + int64_t target_idx = target[n]; + + // 越界检查 + if (target_idx < 0 || target_idx >= static_cast(C)) { + output[n] = static_cast(0.0f); + return; + } + + const T* row_ptr = input + n * C; + float target_score = static_cast(row_ptr[target_idx]); + float sum_loss = 0.0f; + + for (size_t c = 0; c < C; ++c) { + if (c == static_cast(target_idx)) continue; + + float other_score = static_cast(row_ptr[c]); + float diff = functor.margin - target_score + other_score; + sum_loss += functor.compute(diff); + } + + sum_loss /= static_cast(C); + + if (weight != nullptr) { + float w = static_cast(weight[target_idx]); + sum_loss *= w; + } + + output[n] = static_cast(sum_loss); + } +} + +// ------------------------------------------------------------------ +// Kernel 2: Reduction 模式 (Mean / Sum) +// ------------------------------------------------------------------ +template +__global__ void multi_margin_loss_reduce_kernel( + float * output, // [1] Accumulator (Float) + const T * __restrict__ input, // [N, C] + const int64_t * __restrict__ target, // [N] + const T * __restrict__ weight, // [C] + size_t N, + size_t C, + MultiMarginLossFunctor functor, + float scale // Mean: 1/N, Sum: 1.0 +) { + // 声明 volatile 共享内存,防止编译器过度优化导致读取旧值 + // 大小固定为 256,对应 Launch Logic 中的 Block Size + __shared__ volatile float shared_mem[256]; + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = 
blockDim.x * gridDim.x; + float local_sum = 0.0f; + + // 1. Grid-Stride Loop: 计算当前线程负责的所有样本的 Loss 总和 + for (size_t n = idx; n < N; n += stride) { + int64_t target_idx = target[n]; + + if (target_idx >= 0 && target_idx < static_cast(C)) { + const T* row_ptr = input + n * C; + float target_score = static_cast(row_ptr[target_idx]); + float sample_loss = 0.0f; + + for (size_t c = 0; c < C; ++c) { + if (c == static_cast(target_idx)) continue; + + float other_score = static_cast(row_ptr[c]); + float diff = functor.margin - target_score + other_score; + sample_loss += functor.compute(diff); + } + + sample_loss /= static_cast(C); + + if (weight != nullptr) { + float w = static_cast(weight[target_idx]); + sample_loss *= w; + } + + local_sum += sample_loss; + } + } + + // 2. 将线程局部结果存入 Shared Memory + unsigned int tid = threadIdx.x; + // 初始化整个 shared memory,即使线程数少于 256 也要保证安全 + if (tid < 256) { + shared_mem[tid] = local_sum; + } + __syncthreads(); + + // 3. Block 内树形归约 (Unrolled Tree Reduction) + if (tid < 128) { shared_mem[tid] += shared_mem[tid + 128]; } __syncthreads(); + if (tid < 64) { shared_mem[tid] += shared_mem[tid + 64]; } __syncthreads(); + if (tid < 32) { shared_mem[tid] += shared_mem[tid + 32]; } __syncthreads(); + if (tid < 16) { shared_mem[tid] += shared_mem[tid + 16]; } __syncthreads(); + if (tid < 8) { shared_mem[tid] += shared_mem[tid + 8]; } __syncthreads(); + if (tid < 4) { shared_mem[tid] += shared_mem[tid + 4]; } __syncthreads(); + if (tid < 2) { shared_mem[tid] += shared_mem[tid + 2]; } __syncthreads(); + if (tid < 1) { shared_mem[tid] += shared_mem[tid + 1]; } __syncthreads(); + + // 4. 将 Block 的结果原子累加到全局内存 + if (tid == 0) { + float block_sum = shared_mem[0]; + atomicAdd(output, block_sum * scale); + } +} + +// Kernel 3: 类型转换 (Float -> T) +template +__global__ void cast_float_to_t(T* output, const float* src) { + *output = static_cast(*src); +} + +// ================================================================== +// 3. 
Kernel Launch Logic +// ================================================================== +template +void launch_kernel( + void *output, + const void *input, + const void *target, + const void *weight, + void* workspace, + const MultiMarginLossInfo& info, + void *stream) { + + auto in_ptr = reinterpret_cast(input); + auto out_ptr = reinterpret_cast(output); + auto tar_ptr = reinterpret_cast(target); + auto w_ptr = (weight != nullptr) ? reinterpret_cast(weight) : nullptr; + + auto mc_stream = reinterpret_cast(stream); + + size_t N = info.batch_size(); + size_t C = info.num_classes(); + int reduction = info.reduction(); + + MultiMarginLossFunctor functor(info.p(), info.margin()); + + // ------------------------------------------ + // Mode 1: Elementwise (Reduction = None) + // ------------------------------------------ + if (reduction == 0) { + size_t block_size = 256; + size_t grid_size = (N + block_size - 1) / block_size; + + multi_margin_loss_kernel + <<>>( + out_ptr, in_ptr, tar_ptr, w_ptr, N, C, functor + ); + } + // ------------------------------------------ + // Mode 2: Reduction (Mean / Sum) + // ------------------------------------------ + else { + // 使用 workspace 作为临时的 float 累加器 + float* acc_ptr = reinterpret_cast(workspace); + mcMemsetAsync(acc_ptr, 0, sizeof(float), mc_stream); + + float scale = (reduction == 1) ? (1.0f / static_cast(N)) : 1.0f; // 1=Mean, 2=Sum + + // 强制 Block Size 为 256 以匹配 Kernel 内的手写归约逻辑 + size_t block_size = 256; + size_t grid_size = std::min((N + block_size - 1) / block_size, static_cast(1024)); + if (grid_size == 0) grid_size = 1; + + multi_margin_loss_reduce_kernel + <<>>( + acc_ptr, in_ptr, tar_ptr, w_ptr, N, C, functor, scale + ); + + // 将 float 结果转回目标类型 T + cast_float_to_t + <<<1, 1, 0, mc_stream>>>(out_ptr, acc_ptr); + } +} + +// ================================================================== +// 4. 
Descriptor Implementation +// ================================================================== +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t target_desc, + infiniopTensorDescriptor_t weight_desc, + int p, + float margin, + int reduction) { + + auto handle = reinterpret_cast(handle_); + auto info_result = MultiMarginLossInfo::create(out_desc, input_desc, target_desc, weight_desc, p, margin, reduction); + if (!info_result) return info_result.status(); + + // 如果需要归约,申请 4 字节 workspace 用于 atomicAdd + size_t workspace_size = 0; + if (reduction != 0) { + workspace_size = sizeof(float); + } + + *desc_ptr = new Descriptor(new Opaque(), info_result.take(), workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *target, + const void *weight, + void *stream) const { + + auto dtype = _info.dtype(); + int reduction = _info.reduction(); + + if (reduction != 0 && workspace_size < sizeof(float)) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel<__half>(output, input, target, weight, workspace, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel(output, input, target, weight, workspace, _info, stream); + break; + case INFINI_DTYPE_F32: + launch_kernel(output, input, target, weight, workspace, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, target, weight, workspace, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::multi_margin_loss::metax \ No newline at end of file diff 
--git a/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore.h b/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore.h new file mode 100644 index 000000000..0f926a971 --- /dev/null +++ b/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore.h @@ -0,0 +1,8 @@ +#ifndef __MULTI_MARGIN_LOSS_MOORE_H__ +#define __MULTI_MARGIN_LOSS_MOORE_H__ + +#include "../multi_margin_loss.h" + +DESCRIPTOR(moore) + +#endif // __MULTI_MARGIN_LOSS_MOORE_H__ \ No newline at end of file diff --git a/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore.mu b/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore.mu new file mode 100644 index 000000000..0bb529dc4 --- /dev/null +++ b/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore.mu @@ -0,0 +1,158 @@ +#include "multi_margin_loss_moore.h" +#include "multi_margin_loss_moore_kernel.h" +#include "../../../devices/moore/moore_handle.h" +#include +#include + +namespace op::multi_margin_loss::moore { + +template +static inline bool is_aligned(const void *ptr, size_t alignment) { + return reinterpret_cast(ptr) % alignment == 0; +} + +// ================================================================== +// Kernel Launch Logic +// ================================================================== +template +void launch_kernel( + void *output, + const void *input, + const void *target, + const void *weight, + void* workspace, + const MultiMarginLossInfo& info, + void *stream) { + + // 1. 准备指针 + auto in_ptr = reinterpret_cast(input); + auto out_ptr = reinterpret_cast(output); + // Target 在 Info 校验阶段已确保为 Int64 + auto tar_ptr = reinterpret_cast(target); + // Weight 是可选的 + auto w_ptr = (weight != nullptr) ? reinterpret_cast(weight) : nullptr; + + auto musa_stream = reinterpret_cast(stream); + + // 2. 
准备参数 + size_t N = info.batch_size(); + size_t C = info.num_classes(); + int reduction = info.reduction(); + + op::multi_margin_loss::moore::MultiMarginLossFunctor functor(info.p(), info.margin()); + + // ------------------------------------------ + // 模式 1: Elementwise (Reduction = None) + // ------------------------------------------ + if (reduction == 0) { + // 每个线程处理一个样本 N + size_t block_size = 256; + size_t grid_size = (N + block_size - 1) / block_size; + + op::multi_margin_loss::moore::multi_margin_loss_kernel + <<>>( + out_ptr, in_ptr, tar_ptr, w_ptr, N, C, functor + ); + } + // ------------------------------------------ + // 模式 2: Reduction (Mean / Sum) + // ------------------------------------------ + else { + // 使用 workspace 作为临时的 float 累加器 (精度更高,且方便 atomicAdd) + float* acc_ptr = reinterpret_cast(workspace); + musaMemsetAsync(acc_ptr, 0, sizeof(float), musa_stream); + float scale = (reduction == 1) ? (1.0f / static_cast(N)) : 1.0f; // 1=Mean, 2=Sum + + size_t block_size = 256; + size_t grid_size = std::min((N + block_size - 1) / block_size, static_cast(1024)); + + op::multi_margin_loss::moore::multi_margin_loss_reduce_kernel + <<>>( + acc_ptr, in_ptr, tar_ptr, w_ptr, N, C, functor, scale + ); + + // 将 float 累加结果转回 T 写入 output + op::multi_margin_loss::moore::cast_float_to_t + <<<1, 1, 0, musa_stream>>>(out_ptr, acc_ptr); + } +} + +// ================================================================== +// Descriptor 实现 +// ================================================================== +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t target_desc, + infiniopTensorDescriptor_t weight_desc, + int p, + float margin, + int reduction) { + + auto handle = reinterpret_cast(handle_); + + auto info_result = 
MultiMarginLossInfo::create(out_desc, input_desc, target_desc, weight_desc, p, margin, reduction); + if (!info_result) return info_result.status(); + + size_t workspace_size = 0; + if (reduction != 0) { + workspace_size = sizeof(float); + } + + *desc_ptr = new Descriptor( + new Opaque(), + info_result.take(), + workspace_size, + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *target, + const void *weight, + void *stream) const { + + auto dtype = _info.dtype(); + int reduction = _info.reduction(); + + // 检查 workspace 是否够用 + if (reduction != 0 && workspace_size < sizeof(float)) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel(output, input, target, weight, workspace, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel<__mt_bfloat16>(output, input, target, weight, workspace, _info, stream); + break; + case INFINI_DTYPE_F32: + launch_kernel(output, input, target, weight, workspace, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, target, weight, workspace, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::multi_margin_loss::moore \ No newline at end of file diff --git a/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore_kernel.h b/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore_kernel.h new file mode 100644 index 000000000..889eb1bc9 --- /dev/null +++ b/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore_kernel.h @@ -0,0 +1,195 @@ +#ifndef __MULTI_MARGIN_LOSS_MOORE_KERNEL_H__ +#define __MULTI_MARGIN_LOSS_MOORE_KERNEL_H__ + +#include +#include +#include +#include +#include +#include + +namespace op::multi_margin_loss::moore { + +template +struct alignas(sizeof(T) * 
N) Pack { + T val[N]; +}; + +// ================================================================== +// 类型转换辅助函数 (适配 MUSA) +// ================================================================== +template +__device__ __forceinline__ float to_float(T val) { + if constexpr (std::is_same_v) { + return __half2float(val); + } else if constexpr (std::is_same_v) { + return __bfloat162float(val); + } else { + return static_cast(val); + } +} + +template +__device__ __forceinline__ T from_float(float val) { + if constexpr (std::is_same_v) { + return __float2half(val); + } else if constexpr (std::is_same_v) { + return __float2bfloat16(val); + } else { + return static_cast(val); + } +} + +// ================================================================== +// 归约辅助函数 (Warp & Block Reduction) +// ================================================================== +__device__ __forceinline__ float warpReduceSum(float val) { + unsigned int mask = 0xffffffff; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += __shfl_down_sync(mask, val, offset); + return val; +} + +__device__ __forceinline__ float blockReduceSum(float val) { + static __shared__ float shared[32]; // Max 1024 threads / 32 warps + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + + val = warpReduceSum(val); + if (lane == 0) shared[wid] = val; + __syncthreads(); + + // 假设 BlockDim 也是 32 的倍数 + val = (threadIdx.x < blockDim.x / warpSize) ? 
shared[lane] : 0.0f; + if (wid == 0) val = warpReduceSum(val); + return val; +} + +// ================================================================== +// Functor: 核心数学逻辑 +// ================================================================== +struct MultiMarginLossFunctor { + int p; + float margin; + + __host__ __device__ MultiMarginLossFunctor(int p_val, float margin_val) + : p(p_val), margin(margin_val) {} + + // 计算单个 class c 的 loss 分量 + // diff = margin - target_score + other_score + __device__ __forceinline__ float compute(float diff) const { + if (diff > 0.0f) { + return (p == 1) ? diff : diff * diff; + } + return 0.0f; + } +}; + +template +__global__ void multi_margin_loss_kernel( + T * __restrict__ output, // [N] + const T * __restrict__ input, // [N, C] + const int64_t * __restrict__ target, // [N] + const T * __restrict__ weight, // [C] (Optional) + size_t N, + size_t C, + MultiMarginLossFunctor functor) { + + size_t n = blockIdx.x * blockDim.x + threadIdx.x; + + if (n < N) { + int64_t target_idx = target[n]; + + // 越界检查 + if (target_idx < 0 || target_idx >= static_cast(C)) { + output[n] = from_float(0.0f); + return; + } + + // 定位当前行的起始位置 + const T* row_ptr = input + n * C; + float target_score = to_float(row_ptr[target_idx]); + float sum_loss = 0.0f; + + // 遍历所有类别 + for (size_t c = 0; c < C; ++c) { + if (c == static_cast(target_idx)) continue; + + float other_score = to_float(row_ptr[c]); + float diff = functor.margin - target_score + other_score; + sum_loss += functor.compute(diff); + } + + // 公式: sum / C + sum_loss /= static_cast(C); + + // 应用权重 + if (weight != nullptr) { + float w = to_float(weight[target_idx]); + sum_loss *= w; + } + + output[n] = from_float(sum_loss); + } +} + +template +__global__ void multi_margin_loss_reduce_kernel( + float * output, // [1] Accumulator (Float) + const T * __restrict__ input, // [N, C] + const int64_t * __restrict__ target, // [N] + const T * __restrict__ weight, // [C] + size_t N, + size_t C, + 
MultiMarginLossFunctor functor, + float scale // Mean模式传 1/N, Sum模式传 1.0 +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + float local_sum = 0.0f; + + // Grid-Stride Loop over Batch Dimension N + for (size_t n = idx; n < N; n += stride) { + int64_t target_idx = target[n]; + + if (target_idx >= 0 && target_idx < static_cast(C)) { + const T* row_ptr = input + n * C; + float target_score = to_float(row_ptr[target_idx]); + float sample_loss = 0.0f; + + for (size_t c = 0; c < C; ++c) { + if (c == static_cast(target_idx)) continue; + + float other_score = to_float(row_ptr[c]); + float diff = functor.margin - target_score + other_score; + sample_loss += functor.compute(diff); + } + + sample_loss /= static_cast(C); + + if (weight != nullptr) { + float w = to_float(weight[target_idx]); + sample_loss *= w; + } + + local_sum += sample_loss; + } + } + + // Block Reduction + float block_sum = blockReduceSum(local_sum); + + // Global Atomic Add (Reduce to scalar) + if (threadIdx.x == 0) { + atomicAdd(output, block_sum * scale); + } +} + +template +__global__ void cast_float_to_t(T* output, const float* src) { + *output = from_float(*src); +} + +} // namespace op::multi_margin_loss::moore + +#endif // __MULTI_MARGIN_LOSS_MOORE_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/multi_margin_loss/multi_margin_loss.h b/src/infiniop/ops/multi_margin_loss/multi_margin_loss.h new file mode 100644 index 000000000..d19552855 --- /dev/null +++ b/src/infiniop/ops/multi_margin_loss/multi_margin_loss.h @@ -0,0 +1,54 @@ +#ifndef __MULTI_MARGIN_LOSS_H__ +#define __MULTI_MARGIN_LOSS_H__ + +#include "../../operator.h" +#include "info.h" // 引用对应的 MultiMarginLossInfo 定义 + +// 宏定义:用于生成不同命名空间下的 Descriptor 类 +#define DESCRIPTOR(NAMESPACE) \ + namespace op::multi_margin_loss::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + MultiMarginLossInfo _info; \ + size_t 
_workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + MultiMarginLossInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t target_desc, \ + infiniopTensorDescriptor_t weight_desc, \ + int p, \ + float margin, \ + int reduction); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + const void *target, \ + const void *weight, \ + void *stream) const; \ + }; \ + } + +#endif // __MULTI_MARGIN_LOSS_H__ \ No newline at end of file diff --git a/src/infiniop/ops/multi_margin_loss/nvidia/multi_margin_loss_nvidia.cu b/src/infiniop/ops/multi_margin_loss/nvidia/multi_margin_loss_nvidia.cu new file mode 100644 index 000000000..9cfeeebb1 --- /dev/null +++ b/src/infiniop/ops/multi_margin_loss/nvidia/multi_margin_loss_nvidia.cu @@ -0,0 +1,144 @@ +#include "multi_margin_loss_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../../../handle.h" +#include +#include + +namespace op::multi_margin_loss::nvidia { +template +static inline bool is_aligned(const void *ptr, size_t alignment) { + return reinterpret_cast(ptr) % alignment == 0; +} + +// ================================================================== +// Kernel Launch Logic +// ================================================================== +template +void launch_kernel( + void *output, + const void *input, + const void *target, + const void *weight, + void* workspace, + const MultiMarginLossInfo& info, + void *stream) { + + // 1. 
准备指针 + auto in_ptr = reinterpret_cast(input); + auto out_ptr = reinterpret_cast(output); + // Target 在 Info 校验阶段已确保为 Int64 + auto tar_ptr = reinterpret_cast(target); + // Weight 是可选的 + auto w_ptr = (weight != nullptr) ? reinterpret_cast(weight) : nullptr; + + auto cuda_stream = reinterpret_cast(stream); + + // 2. 准备参数 + size_t N = info.batch_size(); + size_t C = info.num_classes(); + int reduction = info.reduction(); + + op::multi_margin_loss::cuda::MultiMarginLossFunctor functor(info.p(), info.margin()); + + // ------------------------------------------ + // 模式 1: Elementwise (Reduction = None) + // ------------------------------------------ + if (reduction == 0) { + // 每个线程处理一个样本 N + size_t block_size = 256; + size_t grid_size = (N + block_size - 1) / block_size; + + op::multi_margin_loss::cuda::multi_margin_loss_kernel + <<>>( + out_ptr, in_ptr, tar_ptr, w_ptr, N, C, functor + ); + } + // ------------------------------------------ + // 模式 2: Reduction (Mean / Sum) + // ------------------------------------------ + else { + // 使用 workspace 作为临时的 float 累加器 (精度更高,且方便 atomicAdd) + float* acc_ptr = reinterpret_cast(workspace); + cudaMemsetAsync(acc_ptr, 0, sizeof(float), cuda_stream); + float scale = (reduction == 1) ? 
(1.0f / static_cast(N)) : 1.0f; // 1=Mean, 2=Sum + + size_t block_size = 256; + size_t grid_size = std::min((N + block_size - 1) / block_size, static_cast(1024)); + + op::multi_margin_loss::cuda::multi_margin_loss_reduce_kernel + <<>>( + acc_ptr, in_ptr, tar_ptr, w_ptr, N, C, functor, scale + ); + op::multi_margin_loss::cuda::cast_float_to_t + <<<1, 1, 0, cuda_stream>>>(out_ptr, acc_ptr); + } +} + +// ================================================================== +// Descriptor 实现 +// ================================================================== +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t target_desc, + infiniopTensorDescriptor_t weight_desc, + int p, + float margin, + int reduction) { + + auto info_result = MultiMarginLossInfo::create(out_desc, input_desc, target_desc, weight_desc, p, margin, reduction); + if (!info_result) return info_result.status(); + size_t workspace_size = 0; + if (reduction != 0) { + workspace_size = sizeof(float); + } + + *desc_ptr = new Descriptor(new Opaque(), info_result.take(), workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *target, + const void *weight, + void *stream) const { + + auto dtype = _info.dtype(); + int reduction = _info.reduction(); + + // 检查 workspace 是否够用 + if (reduction != 0 && workspace_size < sizeof(float)) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel(output, input, target, weight, workspace, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel(output, input, target, weight, workspace, _info, stream); + break; + 
case INFINI_DTYPE_F32: + launch_kernel(output, input, target, weight, workspace, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, target, weight, workspace, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::multi_margin_loss::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/multi_margin_loss/nvidia/multi_margin_loss_nvidia.cuh b/src/infiniop/ops/multi_margin_loss/nvidia/multi_margin_loss_nvidia.cuh new file mode 100644 index 000000000..81e20fa53 --- /dev/null +++ b/src/infiniop/ops/multi_margin_loss/nvidia/multi_margin_loss_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __MULTI_MARGIN_LOSS_NVIDIA_CUH__ +#define __MULTI_MARGIN_LOSS_NVIDIA_CUH__ + +#include "../multi_margin_loss.h" +DESCRIPTOR(nvidia) + +#endif // __MULTI_MARGIN_LOSS_NVIDIA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/multi_margin_loss/operator.cc b/src/infiniop/ops/multi_margin_loss/operator.cc new file mode 100644 index 000000000..a277f2415 --- /dev/null +++ b/src/infiniop/ops/multi_margin_loss/operator.cc @@ -0,0 +1,184 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/multi_margin_loss.h" + +// --- 后端实现头文件 --- +#ifdef ENABLE_CPU_API +#include "cpu/multi_margin_loss_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/multi_margin_loss_nvidia.cuh" +#endif + +#ifdef ENABLE_METAX_API +#include "metax/multi_margin_loss_metax.h" +#endif + +#ifdef ENABLE_MOORE_API +#include "moore/multi_margin_loss_moore.h" +#endif + +extern "C" { + +// ======================================================================= +// 1. 
创建算子描述符 +// ======================================================================= +__C infiniStatus_t infiniopCreateMultiMarginLossDescriptor( + infiniopHandle_t handle, + infiniopMultiMarginLossDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t target, + infiniopTensorDescriptor_t weight, + int p, + float margin, + int reduction) { + + #define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::multi_margin_loss::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output, \ + input, \ + target, \ + weight, \ + p, \ + margin, \ + reduction) + + switch (handle->device) { + #ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CREATE +} + +// ======================================================================= +// 2. 
获取 Workspace 大小 +// ======================================================================= +__C infiniStatus_t infiniopGetMultiMarginLossWorkspaceSize(infiniopMultiMarginLossDescriptor_t desc, size_t *size) { + + #define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef GET +} + +// ======================================================================= +// 3. 执行计算 (Calculate) +// ======================================================================= +__C infiniStatus_t infiniopMultiMarginLoss( + infiniopMultiMarginLossDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *target, + const void *weight, + void *stream) { + + #define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, target, weight, stream) + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CALCULATE +} +__C infiniStatus_t infiniopDestroyMultiMarginLossDescriptor(infiniopMultiMarginLossDescriptor_t desc) { + + #define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef DELETE +} + +} // extern "C" \ No newline at end of file diff --git a/src/infiniop/ops/scatter/cpu/scatter_cpu.cc b/src/infiniop/ops/scatter/cpu/scatter_cpu.cc new file mode 100644 index 000000000..e3e893e38 --- /dev/null +++ b/src/infiniop/ops/scatter/cpu/scatter_cpu.cc @@ -0,0 +1,195 @@ +#include "scatter_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include +#include +#include +#include // for memcpy +#include + +#include "../../../../utils.h" +#include "../../../../utils/custom_types.h" + +namespace op::scatter::cpu { +struct ScatterCpuOpaque { + std::vector updates_shape; + std::vector updates_strides; + std::vector output_strides; + std::vector indices_strides; + size_t input_total_bytes; + + ScatterCpuOpaque(const infiniopTensorDescriptor_t upd, + const infiniopTensorDescriptor_t indices, + const infiniopTensorDescriptor_t out) { + // 1. 
几何信息 + const auto& u_shape = upd->shape(); + updates_shape.assign(u_shape.begin(), u_shape.end()); + + const auto& u_strides = upd->strides(); + updates_strides.assign(u_strides.begin(), u_strides.end()); + + const auto& i_strides = indices->strides(); + indices_strides.assign(i_strides.begin(), i_strides.end()); // <--- 记录 indices strides + + const auto& o_strides = out->strides(); + output_strides.assign(o_strides.begin(), o_strides.end()); + + size_t total_elements = 1; + for (auto s : out->shape()) total_elements *= s; + + size_t dtype_size = 0; + if (out->dtype() == INFINI_DTYPE_F32) dtype_size = 4; + else if (out->dtype() == INFINI_DTYPE_F64) dtype_size = 8; + else dtype_size = 2; // f16/bf16 + + input_total_bytes = total_elements * dtype_size; + } +}; + +struct Descriptor::Opaque : public ScatterCpuOpaque { + using ScatterCpuOpaque::ScatterCpuOpaque; +}; + +Descriptor::~Descriptor() { + if (_opaque) { delete _opaque; _opaque = nullptr; } +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t indices_desc, + infiniopTensorDescriptor_t updates_desc, + int axis, + int reduction) { + + auto handle = reinterpret_cast(handle_); + auto result = ScatterInfo::create(out_desc, input_desc, indices_desc, updates_desc, axis, reduction); + CHECK_RESULT(result); + + // 传入 indices_desc + auto opaque = new Opaque(updates_desc, indices_desc, out_desc); + + *desc_ptr = new Descriptor(opaque, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +inline void offset_to_coords(int64_t offset, int ndim, const int64_t* shape, int64_t* coords) { + for (int i = ndim - 1; i >= 0; --i) { + coords[i] = offset % shape[i]; + offset /= shape[i]; + } +} + +inline int64_t coords_to_offset(int ndim, const int64_t* coords, const int64_t* strides) { + int64_t offset = 0; + for (int i = 0; i < ndim; ++i) { + 
offset += coords[i] * strides[i]; + } + return offset; +} + +template +void calculate_cpu_kernel( + const ScatterInfo &info, + const ScatterCpuOpaque *opaque, + void *output, + const void *indices, + const void *updates) { + + int axis = info.axis(); + int reduction = info.reduction(); + size_t ndim = info.ndim(); + + T* out_ptr = reinterpret_cast(output); + const IdxT* idx_ptr = reinterpret_cast(indices); + const T* upd_ptr = reinterpret_cast(updates); + + const int64_t* upd_shape_ptr = opaque->updates_shape.data(); + const int64_t* upd_strides_ptr = opaque->updates_strides.data(); + const int64_t* idx_strides_ptr = opaque->indices_strides.data(); // <--- 使用 indices strides + const int64_t* out_strides_ptr = opaque->output_strides.data(); + + size_t total_elements = 1; + for (auto s : opaque->updates_shape) total_elements *= s; + + // Serial loop + for (size_t i = 0; i < total_elements; ++i) { + std::vector coords(ndim); + offset_to_coords(static_cast(i), ndim, upd_shape_ptr, coords.data()); + + int64_t upd_offset = coords_to_offset(ndim, coords.data(), upd_strides_ptr); + int64_t idx_offset = coords_to_offset(ndim, coords.data(), idx_strides_ptr); + + T upd_val = upd_ptr[upd_offset]; + IdxT idx_val = idx_ptr[idx_offset]; + + coords[axis] = static_cast(idx_val); + + int64_t out_offset = coords_to_offset(ndim, coords.data(), out_strides_ptr); + + if (reduction == 0) { + out_ptr[out_offset] = upd_val; + } else if (reduction == 1) { + float val_out = utils::cast(out_ptr[out_offset]); + float val_upd = utils::cast(upd_val); + out_ptr[out_offset] = utils::cast(val_out + val_upd); + } else if (reduction == 2) { + float val_out = utils::cast(out_ptr[out_offset]); + float val_upd = utils::cast(upd_val); + out_ptr[out_offset] = utils::cast(val_out * val_upd); + } + } +} + +template +void calculate_cpu_impl( + const ScatterInfo &info, + const ScatterCpuOpaque *opaque, + void *output, + const void *input, // 需要 input 指针 + const void *indices, + const void *updates) { + + if 
(input != output) { + std::memcpy(output, input, opaque->input_total_bytes); + } + if (info.idx_dtype() == INFINI_DTYPE_I32) { + calculate_cpu_kernel(info, opaque, output, indices, updates); + } else { + calculate_cpu_kernel(info, opaque, output, indices, updates); + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *indices, + const void *updates, + void *stream) const { + + auto dtype = _info.dtype(); + + switch (dtype) { + case INFINI_DTYPE_F32: + cpu::calculate_cpu_impl(_info, _opaque, output, input, indices, updates); + break; + case INFINI_DTYPE_F64: + cpu::calculate_cpu_impl(_info, _opaque, output, input, indices, updates); + break; + case INFINI_DTYPE_F16: + cpu::calculate_cpu_impl(_info, _opaque, output, input, indices, updates); + break; + case INFINI_DTYPE_BF16: + cpu::calculate_cpu_impl(_info, _opaque, output, input, indices, updates); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::scatter::cpu \ No newline at end of file diff --git a/src/infiniop/ops/scatter/cpu/scatter_cpu.h b/src/infiniop/ops/scatter/cpu/scatter_cpu.h new file mode 100644 index 000000000..6f77c4b8f --- /dev/null +++ b/src/infiniop/ops/scatter/cpu/scatter_cpu.h @@ -0,0 +1,8 @@ +#ifndef __SCATTER_CPU_H__ +#define __SCATTER_CPU_H__ + +#include "../scatter.h" + +DESCRIPTOR(cpu) + +#endif // __SCATTER_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/scatter/cuda/kernel.cuh b/src/infiniop/ops/scatter/cuda/kernel.cuh new file mode 100644 index 000000000..2fe22bc89 --- /dev/null +++ b/src/infiniop/ops/scatter/cuda/kernel.cuh @@ -0,0 +1,95 @@ +#ifndef __SCATTER_CUDA_CUH__ +#define __SCATTER_CUDA_CUH__ + +#include +#include +#include +using nv_bfloat16 = __nv_bfloat16; + + +#include +#include +#include + +namespace op::scatter::cuda { + +constexpr int MAX_DIMS = 8; + +struct TensorGeometry { + int ndim; + int64_t 
updates_shape[MAX_DIMS]; + int64_t updates_strides[MAX_DIMS]; + int64_t output_strides[MAX_DIMS]; + int64_t indices_strides[MAX_DIMS]; +}; +__device__ __forceinline__ float to_float(float val) { return val; } +__device__ __forceinline__ float to_float(double val) { return static_cast(val); } +__device__ __forceinline__ float to_float(half val) { return __half2float(val); } +__device__ __forceinline__ float to_float(nv_bfloat16 val) { return __bfloat162float(val); } + +template __device__ __forceinline__ T from_float(float val) { return static_cast(val); } +template <> __device__ __forceinline__ half from_float(float val) { return __float2half(val); } +template <> __device__ __forceinline__ nv_bfloat16 from_float(float val) { return __float2bfloat16(val); } + +__device__ __forceinline__ void offset_to_coords(int64_t offset, int ndim, const int64_t* shape, int64_t* coords) { + #pragma unroll + for (int i = ndim - 1; i >= 0; --i) { + coords[i] = offset % shape[i]; + offset /= shape[i]; + } +} + +__device__ __forceinline__ int64_t coords_to_offset(int ndim, const int64_t* coords, const int64_t* strides) { + int64_t offset = 0; + #pragma unroll + for (int i = 0; i < ndim; ++i) { + offset += coords[i] * strides[i]; + } + return offset; +} + +template +__global__ void scatter_kernel( + T * __restrict__ output, + const T * __restrict__ updates, + const IdxT * __restrict__ indices, + TensorGeometry geometry, + int axis, + int reduction, + size_t num_updates) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + + int64_t coords[MAX_DIMS]; + + for (size_t i = idx; i < num_updates; i += stride) { + offset_to_coords(static_cast(i), geometry.ndim, geometry.updates_shape, coords); + + int64_t upd_offset = coords_to_offset(geometry.ndim, coords, geometry.updates_strides); + T upd_val = updates[upd_offset]; + + // FIX: 使用 indices_strides 计算 offset + int64_t idx_offset = coords_to_offset(geometry.ndim, coords, 
geometry.indices_strides); + IdxT idx_val = indices[idx_offset]; + + coords[axis] = static_cast(idx_val); + int64_t out_offset = coords_to_offset(geometry.ndim, coords, geometry.output_strides); + + if (reduction == 0) { + output[out_offset] = upd_val; + } else if (reduction == 1) { + float existing = to_float(output[out_offset]); + float update = to_float(upd_val); + output[out_offset] = from_float(existing + update); + } else if (reduction == 2) { + float existing = to_float(output[out_offset]); + float update = to_float(upd_val); + output[out_offset] = from_float(existing * update); + } + } +} + +} // namespace op::scatter::cuda + +#endif // __SCATTER_CUDA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/scatter/info.h b/src/infiniop/ops/scatter/info.h new file mode 100644 index 000000000..d0347107c --- /dev/null +++ b/src/infiniop/ops/scatter/info.h @@ -0,0 +1,97 @@ +#ifndef __SCATTER_INFO_H__ +#define __SCATTER_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::scatter { + +class ScatterInfo { + ScatterInfo() = default; + +public: + int _dtype; + int _idx_dtype; + int _axis; + int _reduction; + size_t _ndim; + + int dtype() const { return _dtype; } + int idx_dtype() const { return _idx_dtype; } + int axis() const { return _axis; } + int reduction() const { return _reduction; } + size_t ndim() const { return _ndim; } + + ScatterInfo(int dtype, int idx_dtype, int axis, int reduction, size_t ndim) + : _dtype(dtype), _idx_dtype(idx_dtype), _axis(axis), _reduction(reduction), _ndim(ndim) {} + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t indices_desc, + infiniopTensorDescriptor_t updates_desc, + int axis, + int reduction) { + + size_t ndim = input_desc->ndim(); + if (out_desc->ndim() != ndim || indices_desc->ndim() != ndim || updates_desc->ndim() != ndim) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + int 
canonical_axis = axis; + if (canonical_axis < 0) { + canonical_axis += static_cast(ndim); + } + if (canonical_axis < 0 || canonical_axis >= static_cast(ndim)) { + return INFINI_STATUS_BAD_PARAM; + } + + const auto& in_shape = input_desc->shape(); + const auto& out_shape = out_desc->shape(); + for (size_t i = 0; i < ndim; ++i) { + if (in_shape[i] != out_shape[i]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + + const auto& idx_shape = indices_desc->shape(); + const auto& upd_shape = updates_desc->shape(); + for (size_t i = 0; i < ndim; ++i) { + if (idx_shape[i] != upd_shape[i]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + + for (size_t i = 0; i < ndim; ++i) { + if (idx_shape[i] > in_shape[i]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + + if (input_desc->dtype() != updates_desc->dtype() || + input_desc->dtype() != out_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + if (indices_desc->dtype() != INFINI_DTYPE_I32 && indices_desc->dtype() != INFINI_DTYPE_I64) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + if (reduction < 0 || reduction > 2) { + return INFINI_STATUS_BAD_PARAM; + } + + return utils::Result(ScatterInfo{ + input_desc->dtype(), + indices_desc->dtype(), + canonical_axis, + reduction, + ndim + }); + } +}; + +} // namespace op::scatter + +#endif // __SCATTER_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/scatter/metax/scatter_metax.h b/src/infiniop/ops/scatter/metax/scatter_metax.h new file mode 100644 index 000000000..9ebfae3b2 --- /dev/null +++ b/src/infiniop/ops/scatter/metax/scatter_metax.h @@ -0,0 +1,8 @@ +#ifndef __SCATTER_METAX_API_H__ +#define __SCATTER_METAX_API_H__ + +#include "../scatter.h" + +DESCRIPTOR(metax) + +#endif // __SCATTER_METAX_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/scatter/metax/scatter_metax.maca b/src/infiniop/ops/scatter/metax/scatter_metax.maca new file mode 100644 index 000000000..04b8e30e8 --- /dev/null +++ 
b/src/infiniop/ops/scatter/metax/scatter_metax.maca @@ -0,0 +1,279 @@ +#include "scatter_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include +#include +#include +#include +#include +#include +#include +#include +using nv_bfloat16 = __maca_bfloat16; + +namespace op::scatter::metax { + +constexpr int MAX_DIMS = 8; + +struct TensorGeometry { + int ndim; + int64_t updates_shape[MAX_DIMS]; + int64_t updates_strides[MAX_DIMS]; + int64_t output_strides[MAX_DIMS]; + int64_t indices_strides[MAX_DIMS]; +}; + +// 类型转换辅助函数 +__device__ __forceinline__ float to_float(float val) { return val; } +__device__ __forceinline__ float to_float(double val) { return static_cast(val); } +__device__ __forceinline__ float to_float(__half val) { return __half2float(val); } +__device__ __forceinline__ float to_float(nv_bfloat16 val) { return __bfloat162float(val); } + +template __device__ __forceinline__ T from_float(float val) { return static_cast(val); } +template <> __device__ __forceinline__ __half from_float<__half>(float val) { return __float2half(val); } +template <> __device__ __forceinline__ nv_bfloat16 from_float(float val) { return __float2bfloat16(val); } + +// 坐标变换辅助函数 +__device__ __forceinline__ void offset_to_coords(int64_t offset, int ndim, const int64_t* shape, int64_t* coords) { + #pragma unroll + for (int i = ndim - 1; i >= 0; --i) { + coords[i] = offset % shape[i]; + offset /= shape[i]; + } +} + +__device__ __forceinline__ int64_t coords_to_offset(int ndim, const int64_t* coords, const int64_t* strides) { + int64_t offset = 0; + #pragma unroll + for (int i = 0; i < ndim; ++i) { + offset += coords[i] * strides[i]; + } + return offset; +} + +// Scatter Kernel +template +__global__ void scatter_kernel( + T * __restrict__ output, + const T * __restrict__ updates, + const IdxT * __restrict__ indices, + TensorGeometry geometry, + int axis, + int reduction, + size_t num_updates) { + + size_t idx = blockIdx.x * 
blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + + int64_t coords[MAX_DIMS]; + + for (size_t i = idx; i < num_updates; i += stride) { + // 1. 计算 updates 的多维坐标 + offset_to_coords(static_cast(i), geometry.ndim, geometry.updates_shape, coords); + + // 2. 读取 update 值 + int64_t upd_offset = coords_to_offset(geometry.ndim, coords, geometry.updates_strides); + T upd_val = updates[upd_offset]; + + // 3. 读取 index 值 (注意:使用 indices_strides) + int64_t idx_offset = coords_to_offset(geometry.ndim, coords, geometry.indices_strides); + IdxT idx_val = indices[idx_offset]; + + // 4. 计算 output 的多维坐标 (替换指定 axis 的索引) + coords[axis] = static_cast(idx_val); + int64_t out_offset = coords_to_offset(geometry.ndim, coords, geometry.output_strides); + + // 5. 执行 Scatter 操作 (None, Add, Mul) + if (reduction == 0) { + output[out_offset] = upd_val; + } else if (reduction == 1) { // Add + float existing = to_float(output[out_offset]); + float update = to_float(upd_val); + output[out_offset] = from_float(existing + update); + } else if (reduction == 2) { // Mul + float existing = to_float(output[out_offset]); + float update = to_float(upd_val); + output[out_offset] = from_float(existing * update); + } + } +} + +// ================================================================== +// 3. 
Opaque 结构体 +// ================================================================== +struct ScatterMetaxOpaque { + TensorGeometry geometry; + size_t input_bytes; + + ScatterMetaxOpaque(const infiniopTensorDescriptor_t updates_desc, + const infiniopTensorDescriptor_t indices_desc, + const infiniopTensorDescriptor_t output_desc) { + + geometry.ndim = static_cast(updates_desc->ndim()); + + // 计算 Input 字节数 (用于拷贝) + size_t total_elements = 1; + for(size_t i=0; indim(); ++i) { + total_elements *= output_desc->shape()[i]; + } + + size_t dt_size = 0; + if (output_desc->dtype() == INFINI_DTYPE_F32) dt_size = 4; + else if (output_desc->dtype() == INFINI_DTYPE_F64) dt_size = 8; + else dt_size = 2; // f16/bf16 + + input_bytes = total_elements * dt_size; + + // 填充 Geometry + int ndim = geometry.ndim; + for(int i=0; ishape()[i]; + geometry.updates_strides[i] = updates_desc->strides()[i]; + geometry.output_strides[i] = output_desc->strides()[i]; + geometry.indices_strides[i] = indices_desc->strides()[i]; + } + } +}; + +struct Descriptor::Opaque : public ScatterMetaxOpaque { + using ScatterMetaxOpaque::ScatterMetaxOpaque; +}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +// ================================================================== +// 4. 
Kernel Launch Logic +// ================================================================== +template +void launch_kernel( + void *output, + const void *updates, + const void *indices, + const ScatterMetaxOpaque* opaque, + const ScatterInfo& info, + void *stream) { + + auto out_ptr = reinterpret_cast(output); + auto upd_ptr = reinterpret_cast(updates); + auto idx_ptr = reinterpret_cast(indices); + auto mc_stream = reinterpret_cast(stream); + + size_t num_updates = 1; + for(int i=0; igeometry.ndim; ++i) { + num_updates *= opaque->geometry.updates_shape[i]; + } + + if (num_updates == 0) return; + + size_t block_size = 256; + size_t grid_size = (num_updates + block_size - 1) / block_size; + // 限制 grid size,防止溢出 + grid_size = std::min(grid_size, static_cast(2147483647)); + + scatter_kernel + <<>>( + out_ptr, + upd_ptr, + idx_ptr, + opaque->geometry, + info.axis(), + info.reduction(), + num_updates + ); +} + +// ================================================================== +// 5. Descriptor Create +// ================================================================== +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t indices_desc, + infiniopTensorDescriptor_t updates_desc, + int axis, + int reduction) { + + auto handle_ptr = reinterpret_cast(handle); + auto info_result = ScatterInfo::create(out_desc, input_desc, indices_desc, updates_desc, axis, reduction); + if (!info_result) return info_result.status(); + + if (out_desc->ndim() > MAX_DIMS) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + auto opaque = new Opaque(updates_desc, indices_desc, out_desc); + size_t workspace_size = 0; + + *desc_ptr = new Descriptor(opaque, info_result.take(), workspace_size, handle_ptr->device, handle_ptr->device_id); + return INFINI_STATUS_SUCCESS; +} + +// ================================================================== +// 6. 
Calculate Dispatch +// ================================================================== +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *indices, + const void *updates, + void *stream) const { + + auto mc_stream = reinterpret_cast(stream); + + // 1. Input -> Output 拷贝 (Scatter 通常是 In-place 语义的变体) + if (input != output) { + mcMemcpyAsync(output, input, _opaque->input_bytes, mcMemcpyDeviceToDevice, mc_stream); + } + + // 2. 启动 Kernel + auto dtype = _info.dtype(); + auto idx_dtype = _info.idx_dtype(); + + switch (dtype) { + case INFINI_DTYPE_F16: + if (idx_dtype == INFINI_DTYPE_I32) { + launch_kernel<__half, int32_t>(output, updates, indices, _opaque, _info, stream); + } else { + launch_kernel<__half, int64_t>(output, updates, indices, _opaque, _info, stream); + } + break; + + case INFINI_DTYPE_BF16: +#if defined(__MACA__) || defined(__MACACC__) + if (idx_dtype == INFINI_DTYPE_I32) { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } else { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } +#endif + break; + + case INFINI_DTYPE_F32: + if (idx_dtype == INFINI_DTYPE_I32) { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } else { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } + break; + + case INFINI_DTYPE_F64: + if (idx_dtype == INFINI_DTYPE_I32) { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } else { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } + break; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::scatter::metax \ No newline at end of file diff --git a/src/infiniop/ops/scatter/moore/scatter_moore.h b/src/infiniop/ops/scatter/moore/scatter_moore.h new file mode 100644 index 000000000..e09580c4f --- /dev/null +++ b/src/infiniop/ops/scatter/moore/scatter_moore.h @@ -0,0 +1,8 @@ 
+#ifndef __SCATTER_MOORE_H__ +#define __SCATTER_MOORE_H__ + +#include "../scatter.h" + +DESCRIPTOR(moore) + +#endif // __SCATTER_MOORE_H__ \ No newline at end of file diff --git a/src/infiniop/ops/scatter/moore/scatter_moore.mu b/src/infiniop/ops/scatter/moore/scatter_moore.mu new file mode 100644 index 000000000..82bb7ee60 --- /dev/null +++ b/src/infiniop/ops/scatter/moore/scatter_moore.mu @@ -0,0 +1,186 @@ +#include "scatter_moore.h" +#include "scatter_moore_kernel.h" +#include "../../../devices/moore/moore_handle.h" +#include +#include +#include + +namespace op::scatter::moore { + +// ================================================================== +// 1. Common Opaque Structure +// ================================================================== +struct ScatterMooreOpaque { + op::scatter::moore::TensorGeometry geometry; + size_t input_bytes; + + ScatterMooreOpaque(const infiniopTensorDescriptor_t updates_desc, + const infiniopTensorDescriptor_t indices_desc, + const infiniopTensorDescriptor_t output_desc) { + + geometry.ndim = static_cast(updates_desc->ndim()); + + // Calculate Input bytes for copy + size_t total_elements = 1; + for(size_t i=0; indim(); ++i) { + total_elements *= output_desc->shape()[i]; + } + + size_t dt_size = 0; + if (output_desc->dtype() == INFINI_DTYPE_F32) dt_size = 4; + else if (output_desc->dtype() == INFINI_DTYPE_F64) dt_size = 8; + else dt_size = 2; // f16/bf16 + + input_bytes = total_elements * dt_size; + + // Fill Geometry + int ndim = geometry.ndim; + for(int i=0; ishape()[i]; + geometry.updates_strides[i] = updates_desc->strides()[i]; + geometry.output_strides[i] = output_desc->strides()[i]; + geometry.indices_strides[i] = indices_desc->strides()[i]; + } + } +}; + +struct Descriptor::Opaque : public ScatterMooreOpaque { + using ScatterMooreOpaque::ScatterMooreOpaque; +}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +// ================================================================== +// Kernel Launch 
Logic +// ================================================================== +template +void launch_kernel( + void *output, + const void *updates, + const void *indices, + const ScatterMooreOpaque* opaque, + const ScatterInfo& info, + void *stream) { + + auto out_ptr = reinterpret_cast(output); + auto upd_ptr = reinterpret_cast(updates); + auto idx_ptr = reinterpret_cast(indices); + auto musa_stream = reinterpret_cast(stream); + + size_t num_updates = 1; + for(int i=0; igeometry.ndim; ++i) { + num_updates *= opaque->geometry.updates_shape[i]; + } + + if (num_updates == 0) return; + + size_t block_size = 256; + size_t grid_size = (num_updates + block_size - 1) / block_size; + // MUSA grid dimension limit check (usually same as CUDA) + grid_size = std::min(grid_size, static_cast(2147483647)); + + op::scatter::moore::scatter_kernel + <<>>( + out_ptr, + upd_ptr, + idx_ptr, + opaque->geometry, + info.axis(), + info.reduction(), + num_updates + ); +} + +// ================================================================== +// Descriptor Create +// ================================================================== +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t indices_desc, + infiniopTensorDescriptor_t updates_desc, + int axis, + int reduction) { + + auto handle = reinterpret_cast(handle_); + auto info_result = ScatterInfo::create(out_desc, input_desc, indices_desc, updates_desc, axis, reduction); + if (!info_result) return info_result.status(); + + if (out_desc->ndim() > op::scatter::moore::MAX_DIMS) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + auto opaque = new Opaque(updates_desc, indices_desc, out_desc); + size_t workspace_size = 0; + + *desc_ptr = new Descriptor(opaque, info_result.take(), workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +// 
================================================================== +// Calculate Dispatch +// ================================================================== +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *indices, + const void *updates, + void *stream) const { + + auto musa_stream = reinterpret_cast(stream); + + // 1. Copy Input -> Output (if different) + if (input != output) { + musaMemcpyAsync(output, input, _opaque->input_bytes, musaMemcpyDeviceToDevice, musa_stream); + } + + // 2. Launch Kernel + auto dtype = _info.dtype(); + auto idx_dtype = _info.idx_dtype(); + + switch (dtype) { + case INFINI_DTYPE_F16: + if (idx_dtype == INFINI_DTYPE_I32) { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } else { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } + break; + + case INFINI_DTYPE_BF16: + if (idx_dtype == INFINI_DTYPE_I32) { + launch_kernel<__mt_bfloat16, int32_t>(output, updates, indices, _opaque, _info, stream); + } else { + launch_kernel<__mt_bfloat16, int64_t>(output, updates, indices, _opaque, _info, stream); + } + break; + + case INFINI_DTYPE_F32: + if (idx_dtype == INFINI_DTYPE_I32) { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } else { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } + break; + + case INFINI_DTYPE_F64: + if (idx_dtype == INFINI_DTYPE_I32) { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } else { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } + break; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::scatter::moore \ No newline at end of file diff --git a/src/infiniop/ops/scatter/moore/scatter_moore_kernel.h b/src/infiniop/ops/scatter/moore/scatter_moore_kernel.h new file mode 100644 index 000000000..e346c5164 --- /dev/null +++ 
b/src/infiniop/ops/scatter/moore/scatter_moore_kernel.h @@ -0,0 +1,103 @@ +#ifndef __SCATTER_MOORE_KERNEL_H__ +#define __SCATTER_MOORE_KERNEL_H__ + +#include +#include +#include + +#include +#include +#include + +namespace op::scatter::moore { + +constexpr int MAX_DIMS = 8; + +struct TensorGeometry { + int ndim; + int64_t updates_shape[MAX_DIMS]; + int64_t updates_strides[MAX_DIMS]; + int64_t output_strides[MAX_DIMS]; + int64_t indices_strides[MAX_DIMS]; +}; +__device__ __forceinline__ float to_float(float val) { return val; } +__device__ __forceinline__ float to_float(double val) { return static_cast(val); } +__device__ __forceinline__ float to_float(half val) { return __half2float(val); } +__device__ __forceinline__ float to_float(__mt_bfloat16 val) { return __bfloat162float(val); } + +template __device__ __forceinline__ T from_float(float val) { return static_cast(val); } +template <> __device__ __forceinline__ half from_float(float val) { return __float2half(val); } +template <> __device__ __forceinline__ __mt_bfloat16 from_float<__mt_bfloat16>(float val) { return __float2bfloat16(val); } + +// ================================================================== +// 坐标/偏移计算逻辑 (保持不变) +// ================================================================== + +__device__ __forceinline__ void offset_to_coords(int64_t offset, int ndim, const int64_t* shape, int64_t* coords) { + #pragma unroll + for (int i = ndim - 1; i >= 0; --i) { + coords[i] = offset % shape[i]; + offset /= shape[i]; + } +} + +__device__ __forceinline__ int64_t coords_to_offset(int ndim, const int64_t* coords, const int64_t* strides) { + int64_t offset = 0; + #pragma unroll + for (int i = 0; i < ndim; ++i) { + offset += coords[i] * strides[i]; + } + return offset; +} + +// ================================================================== +// Scatter Kernel 实现 +// ================================================================== + +template +__global__ void scatter_kernel( + T * __restrict__ output, 
+ const T * __restrict__ updates, + const IdxT * __restrict__ indices, + TensorGeometry geometry, + int axis, + int reduction, + size_t num_updates) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + + int64_t coords[MAX_DIMS]; + + for (size_t i = idx; i < num_updates; i += stride) { + // 1. 根据 updates 的线性索引反推多维坐标 + offset_to_coords(static_cast(i), geometry.ndim, geometry.updates_shape, coords); + + // 2. 获取 updates 中的值 + int64_t upd_offset = coords_to_offset(geometry.ndim, coords, geometry.updates_strides); + T upd_val = updates[upd_offset]; + + // 3. 获取对应的 indices 值 (使用 indices_strides) + int64_t idx_offset = coords_to_offset(geometry.ndim, coords, geometry.indices_strides); + IdxT idx_val = indices[idx_offset]; + + // 4. 将坐标中的 axis 维度替换为 index 的值,计算输出偏移 + coords[axis] = static_cast(idx_val); + int64_t out_offset = coords_to_offset(geometry.ndim, coords, geometry.output_strides); + if (reduction == 0) { // None + output[out_offset] = upd_val; + } else if (reduction == 1) { // Add + float existing = to_float(output[out_offset]); + float update = to_float(upd_val); + output[out_offset] = from_float(existing + update); + } else if (reduction == 2) { // Multiply + float existing = to_float(output[out_offset]); + float update = to_float(upd_val); + output[out_offset] = from_float(existing * update); + } + } +} + +} // namespace op::scatter::moore + +#endif // __SCATTER_MOORE_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu new file mode 100644 index 000000000..6d8836de7 --- /dev/null +++ b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu @@ -0,0 +1,185 @@ +#include "scatter_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../../../handle.h" +#include +#include +#include + +namespace op::scatter::nvidia { + +// ================================================================== +// 1. 
公共 Opaque 结构体 +// ================================================================== +struct ScatterNvidiaOpaque { + op::scatter::cuda::TensorGeometry geometry; + size_t input_bytes; + + ScatterNvidiaOpaque(const infiniopTensorDescriptor_t updates_desc, + const infiniopTensorDescriptor_t indices_desc, + const infiniopTensorDescriptor_t output_desc) { + + geometry.ndim = static_cast(updates_desc->ndim()); + + // 计算 Input 字节数 + size_t total_elements = 1; + for(size_t i=0; indim(); ++i) { + total_elements *= output_desc->shape()[i]; + } + + size_t dt_size = 0; + if (output_desc->dtype() == INFINI_DTYPE_F32) dt_size = 4; + else if (output_desc->dtype() == INFINI_DTYPE_F64) dt_size = 8; + else dt_size = 2; // f16/bf16 + + input_bytes = total_elements * dt_size; + + // 填充 Geometry + int ndim = geometry.ndim; + for(int i=0; ishape()[i]; + geometry.updates_strides[i] = updates_desc->strides()[i]; + geometry.output_strides[i] = output_desc->strides()[i]; + geometry.indices_strides[i] = indices_desc->strides()[i]; + } + } +}; + +struct Descriptor::Opaque : public ScatterNvidiaOpaque { + using ScatterNvidiaOpaque::ScatterNvidiaOpaque; +}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +// ================================================================== +// Kernel Launch Logic +// ================================================================== +template +void launch_kernel( + void *output, + const void *updates, + const void *indices, + const ScatterNvidiaOpaque* opaque, + const ScatterInfo& info, + void *stream) { + + auto out_ptr = reinterpret_cast(output); + auto upd_ptr = reinterpret_cast(updates); + auto idx_ptr = reinterpret_cast(indices); + auto cuda_stream = reinterpret_cast(stream); + + size_t num_updates = 1; + for(int i=0; igeometry.ndim; ++i) { + num_updates *= opaque->geometry.updates_shape[i]; + } + + if (num_updates == 0) return; + + size_t block_size = 256; + size_t grid_size = (num_updates + block_size - 1) / block_size; + grid_size = 
std::min(grid_size, static_cast(2147483647)); + + op::scatter::cuda::scatter_kernel + <<>>( + out_ptr, + upd_ptr, + idx_ptr, + opaque->geometry, + info.axis(), + info.reduction(), + num_updates + ); +} + +// ================================================================== +// Descriptor Create +// ================================================================== +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t indices_desc, + infiniopTensorDescriptor_t updates_desc, + int axis, + int reduction) { + + auto info_result = ScatterInfo::create(out_desc, input_desc, indices_desc, updates_desc, axis, reduction); + if (!info_result) return info_result.status(); + + if (out_desc->ndim() > op::scatter::cuda::MAX_DIMS) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 传入 indices_desc + auto opaque = new Opaque(updates_desc, indices_desc, out_desc); + size_t workspace_size = 0; + + *desc_ptr = new Descriptor(opaque, info_result.take(), workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +// ================================================================== +// Calculate Dispatch +// ================================================================== +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *indices, + const void *updates, + void *stream) const { + + auto cuda_stream = reinterpret_cast(stream); + + // 1. 关键修复:Input -> Output 拷贝 + if (input != output) { + cudaMemcpyAsync(output, input, _opaque->input_bytes, cudaMemcpyDeviceToDevice, cuda_stream); + } + + // 2. 
启动 Kernel + auto dtype = _info.dtype(); + auto idx_dtype = _info.idx_dtype(); + + switch (dtype) { + case INFINI_DTYPE_F16: + if (idx_dtype == INFINI_DTYPE_I32) { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } else { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } + break; + + case INFINI_DTYPE_BF16: + if (idx_dtype == INFINI_DTYPE_I32) { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } else { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } + break; + + case INFINI_DTYPE_F32: + if (idx_dtype == INFINI_DTYPE_I32) { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } else { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } + break; + + case INFINI_DTYPE_F64: + if (idx_dtype == INFINI_DTYPE_I32) { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } else { + launch_kernel(output, updates, indices, _opaque, _info, stream); + } + break; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::scatter::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh new file mode 100644 index 000000000..448321cb2 --- /dev/null +++ b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SCATTER_NVIDIA_CUH__ +#define __SCATTER_NVIDIA_CUH__ + +#include "../scatter.h" + +DESCRIPTOR(nvidia) + +#endif // __SCATTER_NVIDIA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/scatter/operator.cc b/src/infiniop/ops/scatter/operator.cc new file mode 100644 index 000000000..4236100b0 --- /dev/null +++ b/src/infiniop/ops/scatter/operator.cc @@ -0,0 +1,186 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/scatter.h" + +// --- 后端实现头文件 --- +#ifdef ENABLE_CPU_API +#include "cpu/scatter_cpu.h" +#endif +#if 
defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/scatter_nvidia.cuh" +#endif + +#ifdef ENABLE_METAX_API +#include "metax/scatter_metax.h" +#endif + +#ifdef ENABLE_MOORE_API +#include "moore/scatter_moore.h" +#endif + +extern "C" { + +// ======================================================================= +// 1. 创建算子描述符 +// ======================================================================= +__C infiniStatus_t infiniopCreateScatterDescriptor( + infiniopHandle_t handle, + infiniopScatterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t indices, + infiniopTensorDescriptor_t updates, + int axis, + int reduction) { + + #define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::scatter::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output, \ + input, \ + indices, \ + updates, \ + axis, \ + reduction) + + switch (handle->device) { + #ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CREATE +} + +// ======================================================================= +// 2. 
获取 Workspace 大小 +// ======================================================================= +__C infiniStatus_t infiniopGetScatterWorkspaceSize(infiniopScatterDescriptor_t desc, size_t *size) { + + #define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef GET +} + +// ======================================================================= +// 3. 执行计算 (Calculate) +// ======================================================================= +__C infiniStatus_t infiniopScatter( + infiniopScatterDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *indices, + const void *updates, + void *stream) { + + #define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, indices, updates, stream) + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef 
CALCULATE +} + +// ======================================================================= +// 4. 销毁描述符 +// ======================================================================= +__C infiniStatus_t infiniopDestroyScatterDescriptor(infiniopScatterDescriptor_t desc) { + + #define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef DELETE +} + +} // extern "C" \ No newline at end of file diff --git a/src/infiniop/ops/scatter/scatter.h b/src/infiniop/ops/scatter/scatter.h new file mode 100644 index 000000000..8cf6c239d --- /dev/null +++ b/src/infiniop/ops/scatter/scatter.h @@ -0,0 +1,53 @@ +#ifndef __SCATTER_H__ +#define __SCATTER_H__ + +#include "../../operator.h" +#include "info.h" + +// 宏定义:用于生成不同命名空间下的 Descriptor 类 +#define DESCRIPTOR(NAMESPACE) \ + namespace op::scatter::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + ScatterInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + ScatterInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor 
**desc_ptr, \ + infiniopTensorDescriptor_t output, \ + infiniopTensorDescriptor_t input, \ + infiniopTensorDescriptor_t indices, \ + infiniopTensorDescriptor_t updates, \ + int axis, \ + int reduction); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + const void *indices, \ + const void *updates, \ + void *stream) const; \ + }; \ + } + +#endif // __SCATTER_H__ \ No newline at end of file diff --git a/test/infinicore/ops/flipud.py b/test/infinicore/ops/flipud.py index b92762f49..9af264d11 100644 --- a/test/infinicore/ops/flipud.py +++ b/test/infinicore/ops/flipud.py @@ -72,9 +72,8 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.flipud(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.flipud(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + return infinicore.flipud(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/float_power.py b/test/infinicore/ops/float_power.py index 2548c57ac..ca2568aa8 100644 --- a/test/infinicore/ops/float_power.py +++ b/test/infinicore/ops/float_power.py @@ -112,9 +112,8 @@ def torch_operator(self, *args, **kwargs): return torch.float_power(*args, **kwargs) -# def infinicore_operator(self, *args, **kwargs): -# """InfiniCore implementation (operator not yet available).""" -# return infinicore.float_power(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + return infinicore.float_power(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/floor_divide.py b/test/infinicore/ops/floor_divide.py index 9aaebfe62..b7d83ebfe 100644 --- a/test/infinicore/ops/floor_divide.py +++ b/test/infinicore/ops/floor_divide.py @@ -102,9 +102,8 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.floor_divide(*args, **kwargs) - # def infinicore_operator(self, 
*args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.floor_divide(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + return infinicore.floor_divide(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/multi_margin_loss.py b/test/infinicore/ops/multi_margin_loss.py index 0e12bd608..df2d3970d 100644 --- a/test/infinicore/ops/multi_margin_loss.py +++ b/test/infinicore/ops/multi_margin_loss.py @@ -115,7 +115,7 @@ def torch_operator(self, *args, **kwargs): def infinicore_operator(self, *args, **kwargs): """InfiniCore multi_margin_loss implementation""" - return None + return infinicore.nn.functional.multi_margin_loss(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/scatter.py b/test/infinicore/ops/scatter.py index 31e6a4c8d..aa2b234f9 100644 --- a/test/infinicore/ops/scatter.py +++ b/test/infinicore/ops/scatter.py @@ -85,9 +85,8 @@ def torch_operator(self, *args, **kwargs): return torch.scatter(inp, dim, idx, src) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.scatter(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + return infinicore.scatter(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/triplet_margin_loss.py b/test/infinicore/ops/triplet_margin_loss.py index f25a04245..0176c07f8 100644 --- a/test/infinicore/ops/triplet_margin_loss.py +++ b/test/infinicore/ops/triplet_margin_loss.py @@ -73,9 +73,8 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.triplet_margin_loss(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.triplet_margin_loss(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + return infinicore.nn.functional.triplet_margin_loss(*args, **kwargs) def main(): 
From 525d1359a39264d0a0d6a31a9e5130cd2b6cf574 Mon Sep 17 00:00:00 2001 From: PanZezhong Date: Mon, 23 Mar 2026 17:03:24 +0800 Subject: [PATCH 2/2] issue/1031 fix T1-1-38 --- include/infinicore/ops/flipud.hpp | 4 +- include/infinicore/ops/float_power.hpp | 4 +- include/infinicore/ops/floor_divide.hpp | 2 +- include/infinicore/ops/multi_margin_loss.hpp | 4 +- include/infinicore/ops/scatter.hpp | 4 +- include/infiniop.h | 10 +- include/infiniop/ops/flipud.h | 26 +-- include/infiniop/ops/float_power.h | 32 ++-- include/infiniop/ops/floor_divide.h | 30 ++-- include/infiniop/ops/multi_margin_loss.h | 40 ++--- include/infiniop/ops/scatter.h | 38 ++-- python/infinicore/__init__.py | 6 +- python/infinicore/nn/functional/__init__.py | 3 +- .../nn/functional/multi_margin_loss.py | 3 +- python/infinicore/ops/flipud.py | 4 +- python/infinicore/ops/float_power.py | 4 +- python/infinicore/ops/floor_divide.py | 2 +- python/infinicore/ops/scatter.py | 3 +- src/infinicore/ops/flipud/flipud.cc | 4 +- src/infinicore/ops/flipud/flipud_infiniop.cc | 22 ++- src/infinicore/ops/float_power/float_power.cc | 20 +-- .../ops/float_power/float_power_infiniop.cc | 36 ++-- .../ops/floor_divide/floor_divide.cc | 2 +- .../ops/floor_divide/floor_divide_infiniop.cc | 4 +- .../multi_margin_loss/multi_margin_loss.cc | 6 +- .../multi_margin_loss_infiniop.cc | 34 ++-- src/infinicore/ops/scatter/scatter.cc | 4 +- .../ops/scatter/scatter_infiniop.cc | 26 ++- src/infinicore/pybind11/ops.hpp | 10 +- src/infinicore/pybind11/ops/flipud.hpp | 4 +- src/infinicore/pybind11/ops/float_power.hpp | 54 +++--- src/infinicore/pybind11/ops/floor_divide.hpp | 2 +- .../pybind11/ops/multi_margin_loss.hpp | 16 +- src/infinicore/pybind11/ops/scatter.hpp | 18 +- src/infiniop/ops/flipud/cpu/flipud_cpu.cc | 61 ++++--- src/infiniop/ops/flipud/cpu/flipud_cpu.h | 2 +- src/infiniop/ops/flipud/cuda/kernel.cuh | 41 +++-- src/infiniop/ops/flipud/flipud.h | 78 ++++----- src/infiniop/ops/flipud/info.h | 11 +- 
src/infiniop/ops/flipud/metax/flipud_metax.h | 2 +- .../ops/flipud/metax/flipud_metax.maca | 116 +++++++------ src/infiniop/ops/flipud/moore/flipud_moore.h | 2 +- src/infiniop/ops/flipud/moore/flipud_moore.mu | 47 +++-- .../ops/flipud/moore/flipud_moore_kernel.h | 43 +++-- .../ops/flipud/nvidia/flipud_nvidia.cu | 50 +++--- .../ops/flipud/nvidia/flipud_nvidia.cuh | 2 +- src/infiniop/ops/flipud/operator.cc | 152 ++++++++-------- .../ops/float_power/cpu/float_power_cpu.cc | 65 ++++--- .../ops/float_power/cpu/float_power_cpu.h | 2 +- src/infiniop/ops/float_power/cuda/kernel.cuh | 74 ++++---- src/infiniop/ops/float_power/float_power.h | 88 +++++----- src/infiniop/ops/float_power/info.h | 38 ++-- .../ops/float_power/metax/float_power_metax.h | 2 +- .../float_power/metax/float_power_metax.maca | 130 +++++++------- .../ops/float_power/moore/float_power_moore.h | 2 +- .../float_power/moore/float_power_moore.mu | 43 +++-- .../moore/float_power_moore_kernel.h | 76 ++++---- .../float_power/nvidia/float_power_nvidia.cu | 45 +++-- .../float_power/nvidia/float_power_nvidia.cuh | 2 +- src/infiniop/ops/float_power/operator.cc | 162 ++++++++--------- .../ops/floor_divide/cpu/floor_divide_cpu.cc | 2 +- .../ops/floor_divide/cpu/floor_divide_cpu.h | 4 +- src/infiniop/ops/floor_divide/cuda/kernel.cuh | 2 +- .../floor_divide/metax/floor_divide_metax.h | 2 +- .../metax/floor_divide_metax.maca | 16 +- .../floor_divide/moore/floor_divide_moore.h | 2 +- .../floor_divide/moore/floor_divide_moore.mu | 4 +- .../moore/floor_divide_moore_kernel.h | 6 +- .../nvidia/floor_divide_nvidia.cu | 2 +- .../nvidia/floor_divide_nvidia.cuh | 2 +- src/infiniop/ops/floor_divide/operator.cc | 34 ++-- .../cpu/multi_margin_loss_cpu.cc | 37 ++-- .../cpu/multi_margin_loss_cpu.h | 2 +- .../ops/multi_margin_loss/cuda/kernel.cuh | 59 ++++--- src/infiniop/ops/multi_margin_loss/info.h | 8 +- .../metax/multi_margin_loss_metax.h | 2 +- .../metax/multi_margin_loss_metax.maca | 164 +++++++++++------- 
.../moore/multi_margin_loss_moore.h | 2 +- .../moore/multi_margin_loss_moore.mu | 79 ++++----- .../moore/multi_margin_loss_moore_kernel.h | 61 ++++--- .../ops/multi_margin_loss/multi_margin_loss.h | 90 +++++----- .../nvidia/multi_margin_loss_nvidia.cu | 64 +++---- .../nvidia/multi_margin_loss_nvidia.cuh | 2 +- .../ops/multi_margin_loss/operator.cc | 162 ++++++++--------- src/infiniop/ops/scatter/cpu/scatter_cpu.cc | 79 +++++---- src/infiniop/ops/scatter/cpu/scatter_cpu.h | 2 +- src/infiniop/ops/scatter/cuda/kernel.cuh | 38 ++-- src/infiniop/ops/scatter/info.h | 24 ++- .../ops/scatter/metax/scatter_metax.h | 2 +- .../ops/scatter/metax/scatter_metax.maca | 132 +++++++------- .../ops/scatter/moore/scatter_moore.h | 2 +- .../ops/scatter/moore/scatter_moore.mu | 103 ++++++----- .../ops/scatter/moore/scatter_moore_kernel.h | 35 ++-- .../ops/scatter/nvidia/scatter_nvidia.cu | 101 ++++++----- .../ops/scatter/nvidia/scatter_nvidia.cuh | 2 +- src/infiniop/ops/scatter/operator.cc | 160 ++++++++--------- src/infiniop/ops/scatter/scatter.h | 90 +++++----- 97 files changed, 1745 insertions(+), 1650 deletions(-) diff --git a/include/infinicore/ops/flipud.hpp b/include/infinicore/ops/flipud.hpp index 9f00cf71c..7e449f9f5 100644 --- a/include/infinicore/ops/flipud.hpp +++ b/include/infinicore/ops/flipud.hpp @@ -9,11 +9,11 @@ class Flipud { public: // Schema signature: (Output, Input) using schema = void (*)(Tensor, Tensor); - + static void execute(Tensor output, Tensor input); static common::OpDispatcher &dispatcher(); }; Tensor flipud(Tensor input); void flipud_(Tensor output, Tensor input); -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infinicore/ops/float_power.hpp b/include/infinicore/ops/float_power.hpp index 69e0586a1..c461db2eb 100644 --- a/include/infinicore/ops/float_power.hpp +++ b/include/infinicore/ops/float_power.hpp @@ -37,8 +37,8 @@ class FloatPower { // Dispatchers // 
========================================================== - static common::OpDispatcher& dispatcher_scalar(); - static common::OpDispatcher& dispatcher_tensor(); + static common::OpDispatcher &dispatcher_scalar(); + static common::OpDispatcher &dispatcher_tensor(); }; // ======================================================================= diff --git a/include/infinicore/ops/floor_divide.hpp b/include/infinicore/ops/floor_divide.hpp index 836652d76..43267dce6 100644 --- a/include/infinicore/ops/floor_divide.hpp +++ b/include/infinicore/ops/floor_divide.hpp @@ -13,4 +13,4 @@ class FloorDivide { Tensor floor_divide(Tensor a, Tensor b); void floor_divide_(Tensor c, Tensor a, Tensor b); -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infinicore/ops/multi_margin_loss.hpp b/include/infinicore/ops/multi_margin_loss.hpp index a1b297114..e36cf469b 100644 --- a/include/infinicore/ops/multi_margin_loss.hpp +++ b/include/infinicore/ops/multi_margin_loss.hpp @@ -8,7 +8,7 @@ namespace infinicore::op { class MultiMarginLoss { public: using schema = void (*)(Tensor, Tensor, Tensor, Tensor, int64_t, float, int64_t); - + static void execute(Tensor output, Tensor input, Tensor target, Tensor weight, int64_t p, float margin, int64_t reduction); static common::OpDispatcher &dispatcher(); }; @@ -16,4 +16,4 @@ class MultiMarginLoss { Tensor multi_margin_loss(Tensor input, Tensor target, Tensor weight = {}, int64_t p = 1, float margin = 1.0f, int64_t reduction = 1); void multi_margin_loss_(Tensor output, Tensor input, Tensor target, Tensor weight, int64_t p, float margin, int64_t reduction); -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infinicore/ops/scatter.hpp b/include/infinicore/ops/scatter.hpp index a9efe6ca2..306bb1232 100644 --- a/include/infinicore/ops/scatter.hpp +++ b/include/infinicore/ops/scatter.hpp @@ -8,7 +8,7 @@ namespace infinicore::op { 
class Scatter { public: using schema = void (*)(Tensor, Tensor, int64_t, Tensor, Tensor, int64_t); - + static void execute(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor src, int64_t reduction); static common::OpDispatcher &dispatcher(); }; @@ -18,4 +18,4 @@ Tensor scatter(Tensor input, int64_t dim, Tensor index, Tensor src, int64_t redu // In-place / 指定 Output 接口 void scatter_(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor src, int64_t reduction); -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infiniop.h b/include/infiniop.h index 7716487e1..823f269aa 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -26,6 +26,9 @@ #include "infiniop/ops/embedding.h" #include "infiniop/ops/equal.h" #include "infiniop/ops/flash_attention.h" +#include "infiniop/ops/flipud.h" +#include "infiniop/ops/float_power.h" +#include "infiniop/ops/floor_divide.h" #include "infiniop/ops/fmin.h" #include "infiniop/ops/fmod.h" #include "infiniop/ops/gelu.h" @@ -38,6 +41,7 @@ #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" #include "infiniop/ops/mul.h" +#include "infiniop/ops/multi_margin_loss.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/paged_attention.h" #include "infiniop/ops/paged_attention_prefill.h" @@ -50,6 +54,7 @@ #include "infiniop/ops/relu.h" #include "infiniop/ops/rms_norm.h" #include "infiniop/ops/rope.h" +#include "infiniop/ops/scatter.h" #include "infiniop/ops/sigmoid.h" #include "infiniop/ops/silu.h" #include "infiniop/ops/silu_and_mul.h" @@ -60,11 +65,6 @@ #include "infiniop/ops/swiglu.h" #include "infiniop/ops/tanh.h" #include "infiniop/ops/topk.h" -#include "infiniop/ops/floor_divide.h" -#include "infiniop/ops/float_power.h" -#include "infiniop/ops/flipud.h" -#include "infiniop/ops/scatter.h" -#include "infiniop/ops/multi_margin_loss.h" #include "infiniop/ops/topkrouter.h" #include "infiniop/ops/topksoftmax.h" #include "infiniop/ops/var.h" 
diff --git a/include/infiniop/ops/flipud.h b/include/infiniop/ops/flipud.h index 6ff33c17c..6f5a4768d 100644 --- a/include/infiniop/ops/flipud.h +++ b/include/infiniop/ops/flipud.h @@ -5,23 +5,23 @@ typedef struct InfiniopDescriptor *infiniopFlipudDescriptor_t; -__C __export infiniStatus_t infiniopCreateFlipudDescriptor(infiniopHandle_t handle, - infiniopFlipudDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input); +__INFINI_C __export infiniStatus_t infiniopCreateFlipudDescriptor(infiniopHandle_t handle, + infiniopFlipudDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); // 获取工作空间大小 -__C __export infiniStatus_t infiniopGetFlipudWorkspaceSize(infiniopFlipudDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetFlipudWorkspaceSize(infiniopFlipudDescriptor_t desc, size_t *size); // 执行 Flipud 算子 -__C __export infiniStatus_t infiniopFlipud(infiniopFlipudDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); +__INFINI_C __export infiniStatus_t infiniopFlipud(infiniopFlipudDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); // 销毁描述符 -__C __export infiniStatus_t infiniopDestroyFlipudDescriptor(infiniopFlipudDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyFlipudDescriptor(infiniopFlipudDescriptor_t desc); -#endif // __INFINIOP_FLIPUD_API_H__ \ No newline at end of file +#endif // __INFINIOP_FLIPUD_API_H__ diff --git a/include/infiniop/ops/float_power.h b/include/infiniop/ops/float_power.h index 5d8fb9bf5..8a4c9a97e 100644 --- a/include/infiniop/ops/float_power.h +++ b/include/infiniop/ops/float_power.h @@ -5,23 +5,23 @@ typedef struct InfiniopDescriptor *infiniopFloatPowerDescriptor_t; -__C __export infiniStatus_t infiniopCreateFloatPowerDescriptor(infiniopHandle_t handle, - infiniopFloatPowerDescriptor_t 
*desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x, - infiniopTensorDescriptor_t exponent, - float scalar_exponent); +__INFINI_C __export infiniStatus_t infiniopCreateFloatPowerDescriptor(infiniopHandle_t handle, + infiniopFloatPowerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t exponent, + float scalar_exponent); -__C __export infiniStatus_t infiniopGetFloatPowerWorkspaceSize(infiniopFloatPowerDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetFloatPowerWorkspaceSize(infiniopFloatPowerDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopFloatPower(infiniopFloatPowerDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - const void *exponent, - void *stream); +__INFINI_C __export infiniStatus_t infiniopFloatPower(infiniopFloatPowerDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *exponent, + void *stream); -__C __export infiniStatus_t infiniopDestroyFloatPowerDescriptor(infiniopFloatPowerDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyFloatPowerDescriptor(infiniopFloatPowerDescriptor_t desc); -#endif \ No newline at end of file +#endif diff --git a/include/infiniop/ops/floor_divide.h b/include/infiniop/ops/floor_divide.h index 4b59a52e5..300290ede 100644 --- a/include/infiniop/ops/floor_divide.h +++ b/include/infiniop/ops/floor_divide.h @@ -5,22 +5,22 @@ typedef struct InfiniopDescriptor *infiniopFloorDivideDescriptor_t; -__C __export infiniStatus_t infiniopCreateFloorDivideDescriptor(infiniopHandle_t handle, - infiniopFloorDivideDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); +__INFINI_C __export infiniStatus_t infiniopCreateFloorDivideDescriptor(infiniopHandle_t handle, + infiniopFloorDivideDescriptor_t *desc_ptr, + 
infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); -__C __export infiniStatus_t infiniopGetFloorDivideWorkspaceSize(infiniopFloorDivideDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetFloorDivideWorkspaceSize(infiniopFloorDivideDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopFloorDivide(infiniopFloorDivideDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); +__INFINI_C __export infiniStatus_t infiniopFloorDivide(infiniopFloorDivideDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); -__C __export infiniStatus_t infiniopDestroyFloorDivideDescriptor(infiniopFloorDivideDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyFloorDivideDescriptor(infiniopFloorDivideDescriptor_t desc); -#endif \ No newline at end of file +#endif diff --git a/include/infiniop/ops/multi_margin_loss.h b/include/infiniop/ops/multi_margin_loss.h index cc4f9f0eb..9789e2ae4 100644 --- a/include/infiniop/ops/multi_margin_loss.h +++ b/include/infiniop/ops/multi_margin_loss.h @@ -4,27 +4,27 @@ #include "../operator_descriptor.h" typedef struct InfiniopDescriptor *infiniopMultiMarginLossDescriptor_t; -__C __export infiniStatus_t infiniopCreateMultiMarginLossDescriptor(infiniopHandle_t handle, - infiniopMultiMarginLossDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input, - infiniopTensorDescriptor_t target, - infiniopTensorDescriptor_t weight, - int p, - float margin, - int reduction); +__INFINI_C __export infiniStatus_t infiniopCreateMultiMarginLossDescriptor(infiniopHandle_t handle, + infiniopMultiMarginLossDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t target, + infiniopTensorDescriptor_t weight, + int p, + float margin, + 
int reduction); -__C __export infiniStatus_t infiniopGetMultiMarginLossWorkspaceSize(infiniopMultiMarginLossDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetMultiMarginLossWorkspaceSize(infiniopMultiMarginLossDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopMultiMarginLoss(infiniopMultiMarginLossDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - const void *target, - const void *weight, - void *stream); +__INFINI_C __export infiniStatus_t infiniopMultiMarginLoss(infiniopMultiMarginLossDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *target, + const void *weight, + void *stream); -__C __export infiniStatus_t infiniopDestroyMultiMarginLossDescriptor(infiniopMultiMarginLossDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyMultiMarginLossDescriptor(infiniopMultiMarginLossDescriptor_t desc); -#endif // __INFINIOP_MULTI_MARGIN_LOSS_API_H__ \ No newline at end of file +#endif // __INFINIOP_MULTI_MARGIN_LOSS_API_H__ diff --git a/include/infiniop/ops/scatter.h b/include/infiniop/ops/scatter.h index d2b6b992b..2fa22c534 100644 --- a/include/infiniop/ops/scatter.h +++ b/include/infiniop/ops/scatter.h @@ -5,26 +5,26 @@ typedef struct InfiniopDescriptor *infiniopScatterDescriptor_t; -__C __export infiniStatus_t infiniopCreateScatterDescriptor(infiniopHandle_t handle, - infiniopScatterDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input, - infiniopTensorDescriptor_t indices, - infiniopTensorDescriptor_t updates, - int axis, - int reduction); +__INFINI_C __export infiniStatus_t infiniopCreateScatterDescriptor(infiniopHandle_t handle, + infiniopScatterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t indices, + infiniopTensorDescriptor_t updates, + int axis, + int reduction); 
-__C __export infiniStatus_t infiniopGetScatterWorkspaceSize(infiniopScatterDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetScatterWorkspaceSize(infiniopScatterDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopScatter(infiniopScatterDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - const void *indices, - const void *updates, - void *stream); +__INFINI_C __export infiniStatus_t infiniopScatter(infiniopScatterDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *indices, + const void *updates, + void *stream); -__C __export infiniStatus_t infiniopDestroyScatterDescriptor(infiniopScatterDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyScatterDescriptor(infiniopScatterDescriptor_t desc); -#endif // __INFINIOP_SCATTER_API_H__ \ No newline at end of file +#endif // __INFINIOP_SCATTER_API_H__ diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index b16f847a5..7217f3e2d 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -65,6 +65,9 @@ from infinicore.ops.cdist import cdist from infinicore.ops.cross_entropy import cross_entropy from infinicore.ops.equal import equal +from infinicore.ops.flipud import flipud +from infinicore.ops.float_power import float_power +from infinicore.ops.floor_divide import floor_divide from infinicore.ops.fmin import fmin from infinicore.ops.fmod import fmod from infinicore.ops.kv_caching import kv_caching @@ -78,9 +81,6 @@ from infinicore.ops.paged_caching import paged_caching from infinicore.ops.rearrange import rearrange from infinicore.ops.reciprocal import reciprocal -from infinicore.ops.floor_divide import floor_divide -from infinicore.ops.float_power import float_power -from infinicore.ops.flipud import flipud from infinicore.ops.scatter import scatter from infinicore.ops.squeeze import squeeze from 
infinicore.ops.sum import sum diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index 1fbacbb60..d8da961b3 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -9,13 +9,14 @@ from .hardtanh import hardtanh from .linear import linear from .linear_w8a8i8 import linear_w8a8i8 +from .multi_margin_loss import multi_margin_loss from .random_sample import random_sample from .rms_norm import rms_norm from .rope import RopeAlgo, rope from .silu import silu from .silu_and_mul import silu_and_mul from .swiglu import swiglu -from .multi_margin_loss import multi_margin_loss + __all__ = [ "adaptive_max_pool1d", "causal_softmax", diff --git a/python/infinicore/nn/functional/multi_margin_loss.py b/python/infinicore/nn/functional/multi_margin_loss.py index f06bb1be7..f1269543c 100644 --- a/python/infinicore/nn/functional/multi_margin_loss.py +++ b/python/infinicore/nn/functional/multi_margin_loss.py @@ -1,4 +1,5 @@ from typing import Optional + from infinicore.lib import _infinicore from infinicore.tensor import Tensor @@ -59,4 +60,4 @@ def multi_margin_loss( margin, reduction_val ) - ) \ No newline at end of file + ) diff --git a/python/infinicore/ops/flipud.py b/python/infinicore/ops/flipud.py index bdb01ea69..aafe93f89 100644 --- a/python/infinicore/ops/flipud.py +++ b/python/infinicore/ops/flipud.py @@ -1,7 +1,9 @@ from typing import Optional + from infinicore.lib import _infinicore from infinicore.tensor import Tensor + def flipud( input: Tensor, *, @@ -25,4 +27,4 @@ def flipud( return out return Tensor( _infinicore.flipud(input._underlying) - ) \ No newline at end of file + ) diff --git a/python/infinicore/ops/float_power.py b/python/infinicore/ops/float_power.py index f67b7ac58..033c99d57 100644 --- a/python/infinicore/ops/float_power.py +++ b/python/infinicore/ops/float_power.py @@ -1,7 +1,9 @@ from typing import Optional + from infinicore.lib import _infinicore 
from infinicore.tensor import Tensor + def float_power( input: Tensor, exponent: float, @@ -45,4 +47,4 @@ def float_power( input._underlying, exponent ) - ) \ No newline at end of file + ) diff --git a/python/infinicore/ops/floor_divide.py b/python/infinicore/ops/floor_divide.py index 1d76e0c05..1489fde08 100644 --- a/python/infinicore/ops/floor_divide.py +++ b/python/infinicore/ops/floor_divide.py @@ -8,4 +8,4 @@ def floor_divide(input, other, *, out=None): _infinicore.floor_divide_(out._underlying, input._underlying, other._underlying) - return out \ No newline at end of file + return out diff --git a/python/infinicore/ops/scatter.py b/python/infinicore/ops/scatter.py index bc4a6c969..3a3654027 100644 --- a/python/infinicore/ops/scatter.py +++ b/python/infinicore/ops/scatter.py @@ -1,4 +1,5 @@ from typing import Optional + from infinicore.lib import _infinicore from infinicore.tensor import Tensor @@ -53,4 +54,4 @@ def scatter( dim, # dim (第4个) reduction_val ) - ) \ No newline at end of file + ) diff --git a/src/infinicore/ops/flipud/flipud.cc b/src/infinicore/ops/flipud/flipud.cc index 3d1ea08fb..8a4178641 100644 --- a/src/infinicore/ops/flipud/flipud.cc +++ b/src/infinicore/ops/flipud/flipud.cc @@ -16,7 +16,7 @@ Tensor flipud(Tensor input) { // Flipud 操作不改变张量的形状和数据类型 // Output shape == Input shape auto output = Tensor::empty(input->shape(), input->dtype(), input->device()); - + flipud_(output, input); return output; } @@ -24,4 +24,4 @@ void flipud_(Tensor output, Tensor input) { Flipud::execute(output, input); } -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/src/infinicore/ops/flipud/flipud_infiniop.cc b/src/infinicore/ops/flipud/flipud_infiniop.cc index eaf5651ce..9162fef6b 100644 --- a/src/infinicore/ops/flipud/flipud_infiniop.cc +++ b/src/infinicore/ops/flipud/flipud_infiniop.cc @@ -31,12 +31,11 @@ void calculate(Tensor output, Tensor input) { // 2. 
获取或创建描述符 if (!desc_opt) { INFINICORE_CHECK_ERROR(infiniopCreateFlipudDescriptor( - context::getInfiniopHandle(output->device()), + context::getInfiniopHandle(output->device()), &desc, - output->desc(), - input->desc() - )); - + output->desc(), + input->desc())); + cache.put(seed, desc); } else { desc = *desc_opt; @@ -45,13 +44,12 @@ void calculate(Tensor output, Tensor input) { INFINICORE_CHECK_ERROR(infiniopGetFlipudWorkspaceSize(desc, &workspace_size)); std::shared_ptr workspace = context::allocateMemory(workspace_size); INFINICORE_CHECK_ERROR(infiniopFlipud( - desc, - workspace->data(), + desc, + workspace->data(), workspace_size, - output->data(), - input->data(), - context::getStream() - )); + output->data(), + input->data(), + context::getStream())); } static bool registered = []() { @@ -59,4 +57,4 @@ static bool registered = []() { return true; }(); -} // namespace infinicore::op::flipud_impl::infiniop \ No newline at end of file +} // namespace infinicore::op::flipud_impl::infiniop diff --git a/src/infinicore/ops/float_power/float_power.cc b/src/infinicore/ops/float_power/float_power.cc index 9ef406e16..c3bf5003b 100644 --- a/src/infinicore/ops/float_power/float_power.cc +++ b/src/infinicore/ops/float_power/float_power.cc @@ -1,5 +1,5 @@ #include "infinicore/ops/float_power.hpp" -#include "infinicore/tensor.hpp" +#include "infinicore/tensor.hpp" namespace infinicore::op { @@ -7,12 +7,12 @@ namespace infinicore::op { // 1. 
Dispatcher 单例 // ======================================================================= -common::OpDispatcher& FloatPower::dispatcher_scalar() { +common::OpDispatcher &FloatPower::dispatcher_scalar() { static common::OpDispatcher dispatcher_; return dispatcher_; } -common::OpDispatcher& FloatPower::dispatcher_tensor() { +common::OpDispatcher &FloatPower::dispatcher_tensor() { static common::OpDispatcher dispatcher_; return dispatcher_; } @@ -38,21 +38,19 @@ void FloatPower::execute(Tensor output, Tensor input, Tensor exponent) { Tensor float_power(Tensor input, double exponent) { auto output = Tensor::empty( input->shape(), - infinicore::DataType::F64, - input->device() - ); + infinicore::DataType::F64, + input->device()); float_power_(output, input, exponent); return output; } Tensor float_power(Tensor input, Tensor exponent) { - Shape output_shape = input->shape(); + Shape output_shape = input->shape(); auto output = Tensor::empty( output_shape, - infinicore::DataType::F64, - input->device() - ); + infinicore::DataType::F64, + input->device()); float_power_(output, input, exponent); return output; @@ -70,4 +68,4 @@ void float_power_(Tensor output, Tensor input, Tensor exponent) { FloatPower::execute(output, input, exponent); } -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/src/infinicore/ops/float_power/float_power_infiniop.cc b/src/infinicore/ops/float_power/float_power_infiniop.cc index 4bb47a655..e07c1fd02 100644 --- a/src/infinicore/ops/float_power/float_power_infiniop.cc +++ b/src/infinicore/ops/float_power/float_power_infiniop.cc @@ -18,8 +18,7 @@ thread_local common::OpCache caches( infiniopDestroyFloatPowerDescriptor(desc)); desc = nullptr; } - } -); + }); // ======================================================================= // 1. 
Scalar Exponent @@ -27,12 +26,11 @@ thread_local common::OpCache caches( void calculate_scalar(Tensor output, Tensor input, - double exponent) -{ + double exponent) { // Hash: output / input meta + double exponent size_t seed = hash_combine(output, input, exponent); - auto device_type = context::getDevice().getType(); + auto device_type = context::getDevice().getType(); auto device_index = context::getDevice().getIndex(); auto &cache = caches.getCache(device_type, device_index); @@ -46,10 +44,8 @@ void calculate_scalar(Tensor output, &desc, output->desc(), input->desc(), - nullptr, // exponent tensor descriptor = null - static_cast(exponent) - ) - ); + nullptr, // exponent tensor descriptor = null + static_cast(exponent))); cache.put(seed, desc); } else { desc = *desc_opt; @@ -59,8 +55,7 @@ void calculate_scalar(Tensor output, INFINICORE_CHECK_ERROR( infiniopGetFloatPowerWorkspaceSize(desc, &workspace_size)); - std::shared_ptr workspace = - context::allocateMemory(workspace_size); + std::shared_ptr workspace = context::allocateMemory(workspace_size); INFINICORE_CHECK_ERROR( infiniopFloatPower( @@ -70,9 +65,7 @@ void calculate_scalar(Tensor output, output->data(), input->data(), nullptr, // exponent data pointer = null - context::getStream() - ) - ); + context::getStream())); } // ======================================================================= @@ -81,11 +74,10 @@ void calculate_scalar(Tensor output, void calculate_tensor(Tensor output, Tensor input, - Tensor exponent) -{ + Tensor exponent) { size_t seed = hash_combine(output, input, exponent); - auto device_type = context::getDevice().getType(); + auto device_type = context::getDevice().getType(); auto device_index = context::getDevice().getIndex(); auto &cache = caches.getCache(device_type, device_index); @@ -101,8 +93,7 @@ void calculate_tensor(Tensor output, input->desc(), exponent->desc(), // tensor exponent 0.0f // scalar ignored - ) - ); + )); cache.put(seed, desc); } else { desc = *desc_opt; @@ -112,8 
+103,7 @@ void calculate_tensor(Tensor output, INFINICORE_CHECK_ERROR( infiniopGetFloatPowerWorkspaceSize(desc, &workspace_size)); - std::shared_ptr workspace = - context::allocateMemory(workspace_size); + std::shared_ptr workspace = context::allocateMemory(workspace_size); INFINICORE_CHECK_ERROR( infiniopFloatPower( @@ -123,9 +113,7 @@ void calculate_tensor(Tensor output, output->data(), input->data(), exponent->data(), - context::getStream() - ) - ); + context::getStream())); } // ======================================================================= diff --git a/src/infinicore/ops/floor_divide/floor_divide.cc b/src/infinicore/ops/floor_divide/floor_divide.cc index 9a1ed9d33..3dc87f3f2 100644 --- a/src/infinicore/ops/floor_divide/floor_divide.cc +++ b/src/infinicore/ops/floor_divide/floor_divide.cc @@ -24,4 +24,4 @@ void floor_divide_(Tensor c, Tensor a, Tensor b) { FloorDivide::execute(c, a, b); } -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/src/infinicore/ops/floor_divide/floor_divide_infiniop.cc b/src/infinicore/ops/floor_divide/floor_divide_infiniop.cc index f4caeeb79..332552b5f 100644 --- a/src/infinicore/ops/floor_divide/floor_divide_infiniop.cc +++ b/src/infinicore/ops/floor_divide/floor_divide_infiniop.cc @@ -1,7 +1,7 @@ #include "../../utils.hpp" #include "infinicore/common/hash.hpp" -#include "infinicore/ops/floor_divide.hpp" #include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/floor_divide.hpp" #include namespace infinicore::op::floor_divide_impl::infiniop { @@ -49,4 +49,4 @@ static bool registered = []() { return true; }(); -} // namespace infinicore::op::floor_divide_impl::infiniop \ No newline at end of file +} // namespace infinicore::op::floor_divide_impl::infiniop diff --git a/src/infinicore/ops/multi_margin_loss/multi_margin_loss.cc b/src/infinicore/ops/multi_margin_loss/multi_margin_loss.cc index 31bd9f3f6..c7d2c21ee 100644 --- 
a/src/infinicore/ops/multi_margin_loss/multi_margin_loss.cc +++ b/src/infinicore/ops/multi_margin_loss/multi_margin_loss.cc @@ -17,12 +17,12 @@ Tensor multi_margin_loss(Tensor input, Tensor target, Tensor weight, int64_t p, Shape output_shape; if (reduction == 0) { // None output_shape = {input->shape()[0]}; - } else { + } else { output_shape = {}; // Scalar } auto output = Tensor::empty(output_shape, input->dtype(), input->device()); - + multi_margin_loss_(output, input, target, weight, p, margin, reduction); return output; } @@ -31,4 +31,4 @@ void multi_margin_loss_(Tensor output, Tensor input, Tensor target, Tensor weigh MultiMarginLoss::execute(output, input, target, weight, p, margin, reduction); } -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/src/infinicore/ops/multi_margin_loss/multi_margin_loss_infiniop.cc b/src/infinicore/ops/multi_margin_loss/multi_margin_loss_infiniop.cc index 0ae3a1590..4214bfda0 100644 --- a/src/infinicore/ops/multi_margin_loss/multi_margin_loss_infiniop.cc +++ b/src/infinicore/ops/multi_margin_loss/multi_margin_loss_infiniop.cc @@ -33,8 +33,8 @@ void calculate(Tensor output, Tensor input, Tensor target, Tensor weight, int64_ auto desc_opt = cache.get(seed); infiniopMultiMarginLossDescriptor_t desc = nullptr; infiniopTensorDescriptor_t weight_desc = nullptr; - const void* weight_data = nullptr; - + const void *weight_data = nullptr; + if (has_weight) { weight_desc = weight->desc(); weight_data = weight->data(); @@ -43,17 +43,16 @@ void calculate(Tensor output, Tensor input, Tensor target, Tensor weight, int64_ if (!desc_opt) { // 创建描述符 INFINICORE_CHECK_ERROR(infiniopCreateMultiMarginLossDescriptor( - context::getInfiniopHandle(output->device()), + context::getInfiniopHandle(output->device()), &desc, - output->desc(), - input->desc(), - target->desc(), - weight_desc, + output->desc(), + input->desc(), + target->desc(), + weight_desc, static_cast(p), margin, - 
static_cast(reduction) - )); - + static_cast(reduction))); + cache.put(seed, desc); } else { desc = *desc_opt; @@ -65,15 +64,14 @@ void calculate(Tensor output, Tensor input, Tensor target, Tensor weight, int64_ std::shared_ptr workspace = context::allocateMemory(workspace_size); INFINICORE_CHECK_ERROR(infiniopMultiMarginLoss( - desc, - workspace->data(), + desc, + workspace->data(), workspace_size, - output->data(), - input->data(), - target->data(), + output->data(), + input->data(), + target->data(), weight_data, - context::getStream() - )); + context::getStream())); } static bool registered = []() { @@ -81,4 +79,4 @@ static bool registered = []() { return true; }(); -} // namespace infinicore::op::multi_margin_loss_impl::infiniop \ No newline at end of file +} // namespace infinicore::op::multi_margin_loss_impl::infiniop diff --git a/src/infinicore/ops/scatter/scatter.cc b/src/infinicore/ops/scatter/scatter.cc index 3abd8542a..402baf223 100644 --- a/src/infinicore/ops/scatter/scatter.cc +++ b/src/infinicore/ops/scatter/scatter.cc @@ -15,7 +15,7 @@ Tensor scatter(Tensor input, int64_t dim, Tensor index, Tensor src, int64_t redu // 创建与 input 形状、数据类型、设备一致的 Output Tensor auto output = Tensor::empty(input->shape(), input->dtype(), input->device()); scatter_(output, input, dim, index, src, reduction); - + return output; } @@ -23,4 +23,4 @@ void scatter_(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor src Scatter::execute(output, input, dim, index, src, reduction); } -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/src/infinicore/ops/scatter/scatter_infiniop.cc b/src/infinicore/ops/scatter/scatter_infiniop.cc index 8125907b6..3f080b6d7 100644 --- a/src/infinicore/ops/scatter/scatter_infiniop.cc +++ b/src/infinicore/ops/scatter/scatter_infiniop.cc @@ -33,16 +33,15 @@ void calculate(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor sr // C++ Op 参数: output, input, dim, index, src, reduction 
// C API 参数: output, input, indices, updates, axis, reduction INFINICORE_CHECK_ERROR(infiniopCreateScatterDescriptor( - context::getInfiniopHandle(output->device()), + context::getInfiniopHandle(output->device()), &desc, - output->desc(), - input->desc(), + output->desc(), + input->desc(), index->desc(), // 对应 C API indices src->desc(), // 对应 C API updates static_cast(dim), - static_cast(reduction) - )); - + static_cast(reduction))); + cache.put(seed, desc); } else { desc = *desc_opt; @@ -54,15 +53,14 @@ void calculate(Tensor output, Tensor input, int64_t dim, Tensor index, Tensor sr std::shared_ptr workspace = context::allocateMemory(workspace_size); INFINICORE_CHECK_ERROR(infiniopScatter( - desc, - workspace->data(), + desc, + workspace->data(), workspace_size, - output->data(), - input->data(), - index->data(), + output->data(), + input->data(), + index->data(), src->data(), - context::getStream() - )); + context::getStream())); } static bool registered = []() { @@ -70,4 +68,4 @@ static bool registered = []() { return true; }(); -} // namespace infinicore::op::scatter_impl::infiniop \ No newline at end of file +} // namespace infinicore::op::scatter_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 2dbf6299b..747cff4ea 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -24,6 +24,9 @@ #include "ops/embedding.hpp" #include "ops/equal.hpp" #include "ops/flash_attention.hpp" +#include "ops/flipud.hpp" +#include "ops/float_power.hpp" +#include "ops/floor_divide.hpp" #include "ops/fmin.hpp" #include "ops/fmod.hpp" #include "ops/hardswish.hpp" @@ -35,6 +38,7 @@ #include "ops/mha_kvcache.hpp" #include "ops/mha_varlen.hpp" #include "ops/mul.hpp" +#include "ops/multi_margin_loss.hpp" #include "ops/paged_attention.hpp" #include "ops/paged_attention_prefill.hpp" #include "ops/paged_caching.hpp" @@ -43,6 +47,7 @@ #include "ops/reciprocal.hpp" #include "ops/rms_norm.hpp" #include 
"ops/rope.hpp" +#include "ops/scatter.hpp" #include "ops/silu.hpp" #include "ops/silu_and_mul.hpp" #include "ops/sum.hpp" @@ -50,11 +55,6 @@ #include "ops/topk.hpp" #include "ops/var.hpp" #include "ops/var_mean.hpp" -#include "ops/floor_divide.hpp" -#include "ops/float_power.hpp" -#include "ops/flipud.hpp" -#include "ops/multi_margin_loss.hpp" -#include "ops/scatter.hpp" namespace py = pybind11; diff --git a/src/infinicore/pybind11/ops/flipud.hpp b/src/infinicore/pybind11/ops/flipud.hpp index 585bc636a..97c5641d6 100644 --- a/src/infinicore/pybind11/ops/flipud.hpp +++ b/src/infinicore/pybind11/ops/flipud.hpp @@ -1,7 +1,7 @@ #pragma once +#include "infinicore/ops/flipud.hpp" #include -#include "infinicore/ops/flipud.hpp" namespace py = pybind11; @@ -28,4 +28,4 @@ inline void bind_flipud(py::module &m) { R"doc(Explicit output FlipUD operation. Writes the result into the output tensor.)doc"); } -} // namespace infinicore::ops \ No newline at end of file +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/float_power.hpp b/src/infinicore/pybind11/ops/float_power.hpp index 242626e9d..8ee3a5d77 100644 --- a/src/infinicore/pybind11/ops/float_power.hpp +++ b/src/infinicore/pybind11/ops/float_power.hpp @@ -1,6 +1,6 @@ #include "../tensor.hpp" -#include #include "infinicore/ops/float_power.hpp" +#include namespace py = pybind11; @@ -25,31 +25,35 @@ inline Tensor unwrap(py::handle obj) { void bind_float_power(py::module &m) { // --- Out-of-place: float_power(input, exponent) --- - m.def("float_power", [](py::object input_obj, py::object exp_obj) -> Tensor { - Tensor input = unwrap(input_obj); - - // 处理标量指数的情况 (float 或 int) - if (py::isinstance(exp_obj) || py::isinstance(exp_obj)) { - return float_power(input, exp_obj.cast()); - } - - // 处理张量指数的情况 - Tensor exponent = unwrap(exp_obj); - return float_power(input, exponent); - }, py::arg("input"), py::arg("exponent")); + m.def( + "float_power", [](py::object input_obj, py::object exp_obj) -> Tensor { + Tensor 
input = unwrap(input_obj); - // --- In-place: float_power_(out, input, exponent) --- - m.def("float_power_", [](py::object out_obj, py::object input_obj, py::object exp_obj) { - Tensor out = unwrap(out_obj); - Tensor input = unwrap(input_obj); - - if (py::isinstance(exp_obj) || py::isinstance(exp_obj)) { - float_power_(out, input, exp_obj.cast()); - } else { + // 处理标量指数的情况 (float 或 int) + if (py::isinstance(exp_obj) || py::isinstance(exp_obj)) { + return float_power(input, exp_obj.cast()); + } + + // 处理张量指数的情况 Tensor exponent = unwrap(exp_obj); - float_power_(out, input, exponent); - } - }, py::arg("out"), py::arg("input"), py::arg("exponent")); + return float_power(input, exponent); + }, + py::arg("input"), py::arg("exponent")); + + // --- In-place: float_power_(out, input, exponent) --- + m.def( + "float_power_", [](py::object out_obj, py::object input_obj, py::object exp_obj) { + Tensor out = unwrap(out_obj); + Tensor input = unwrap(input_obj); + + if (py::isinstance(exp_obj) || py::isinstance(exp_obj)) { + float_power_(out, input, exp_obj.cast()); + } else { + Tensor exponent = unwrap(exp_obj); + float_power_(out, input, exponent); + } + }, + py::arg("out"), py::arg("input"), py::arg("exponent")); } -} // namespace infinicore::ops \ No newline at end of file +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/floor_divide.hpp b/src/infinicore/pybind11/ops/floor_divide.hpp index 3bcec31ee..626767218 100644 --- a/src/infinicore/pybind11/ops/floor_divide.hpp +++ b/src/infinicore/pybind11/ops/floor_divide.hpp @@ -23,4 +23,4 @@ inline void bind_floor_divide(py::module &m) { R"doc(In-place tensor floor division.)doc"); } -} // namespace infinicore::ops \ No newline at end of file +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/multi_margin_loss.hpp b/src/infinicore/pybind11/ops/multi_margin_loss.hpp index d55f0d723..f59fcec8e 100644 --- a/src/infinicore/pybind11/ops/multi_margin_loss.hpp +++ 
b/src/infinicore/pybind11/ops/multi_margin_loss.hpp @@ -1,16 +1,17 @@ #pragma once +#include "infinicore/ops/multi_margin_loss.hpp" #include -#include "infinicore/ops/multi_margin_loss.hpp" namespace py = pybind11; namespace infinicore::ops { inline void bind_multi_margin_loss(py::module &m) { - m.def("multi_margin_loss", - [](const Tensor& input, const Tensor& target, py::object weight, int p, float margin, int reduction) { - Tensor weight_tensor; + m.def( + "multi_margin_loss", + [](const Tensor &input, const Tensor &target, py::object weight, int p, float margin, int reduction) { + Tensor weight_tensor; if (!weight.is_none()) { weight_tensor = weight.cast(); } @@ -33,8 +34,9 @@ inline void bind_multi_margin_loss(py::module &m) { reduction (int, optional): Specifies the reduction to apply to the output: 0=None, 1=Mean, 2=Sum. Default: 1. )doc"); - m.def("multi_margin_loss_", - [](Tensor& output, const Tensor& input, const Tensor& target, py::object weight, int p, float margin, int reduction) { + m.def( + "multi_margin_loss_", + [](Tensor &output, const Tensor &input, const Tensor &target, py::object weight, int p, float margin, int reduction) { Tensor weight_tensor; if (!weight.is_none()) { weight_tensor = weight.cast(); @@ -52,4 +54,4 @@ inline void bind_multi_margin_loss(py::module &m) { R"doc(Explicit output Multi Margin Loss operation. 
Writes the result into the output tensor.)doc"); } -} // namespace infinicore::ops \ No newline at end of file +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/scatter.hpp b/src/infinicore/pybind11/ops/scatter.hpp index 149e4ba81..950ed49e8 100644 --- a/src/infinicore/pybind11/ops/scatter.hpp +++ b/src/infinicore/pybind11/ops/scatter.hpp @@ -1,7 +1,7 @@ #pragma once -#include #include "infinicore/ops/scatter.hpp" +#include namespace py = pybind11; @@ -14,15 +14,16 @@ inline void bind_scatter(py::module &m) { // 为了匹配测试脚本的行为(将所有 Tensor 作为位置参数传入,属性作为 kwargs 传入), // 我们将参数顺序调整为: input, index, src, dim, reduction // ========================================================================= - m.def("scatter", - [](const Tensor& input, const Tensor& index, const Tensor& src, int64_t dim, int64_t reduction) { + m.def( + "scatter", + [](const Tensor &input, const Tensor &index, const Tensor &src, int64_t dim, int64_t reduction) { // 调用底层 C++ 实现时,必须恢复正确的参数顺序: (input, dim, index, src, reduction) return op::scatter(input, dim, index, src, reduction); }, py::arg("input"), py::arg("index"), py::arg("src"), - py::arg("dim"), // 关键修改:将 dim 移到 Tensor 参数之后 + py::arg("dim"), // 关键修改:将 dim 移到 Tensor 参数之后 py::arg("reduction") = 0, R"doc( Scatter operator. 
@@ -34,8 +35,9 @@ inline void bind_scatter(py::module &m) { // ========================================================================= // 参数顺序调整为: output, input, index, src, dim, reduction // ========================================================================= - m.def("scatter_", - [](Tensor& output, const Tensor& input, const Tensor& index, const Tensor& src, int64_t dim, int64_t reduction) { + m.def( + "scatter_", + [](Tensor &output, const Tensor &input, const Tensor &index, const Tensor &src, int64_t dim, int64_t reduction) { // 调用底层 C++ 实现 op::scatter_(output, input, dim, index, src, reduction); }, @@ -43,7 +45,7 @@ inline void bind_scatter(py::module &m) { py::arg("input"), py::arg("index"), py::arg("src"), - py::arg("dim"), // 关键修改:将 dim 移到 Tensor 参数之后 + py::arg("dim"), // 关键修改:将 dim 移到 Tensor 参数之后 py::arg("reduction") = 0, R"doc( In-place Scatter operator. @@ -51,4 +53,4 @@ inline void bind_scatter(py::module &m) { )doc"); } -} // namespace infinicore::ops \ No newline at end of file +} // namespace infinicore::ops diff --git a/src/infiniop/ops/flipud/cpu/flipud_cpu.cc b/src/infiniop/ops/flipud/cpu/flipud_cpu.cc index 911fbb22c..3b2045674 100644 --- a/src/infiniop/ops/flipud/cpu/flipud_cpu.cc +++ b/src/infiniop/ops/flipud/cpu/flipud_cpu.cc @@ -1,9 +1,9 @@ #include "flipud_cpu.h" #include "../../../devices/cpu/common_cpu.h" #include -#include -#include #include +#include +#include // 引用框架定义的 float16/bfloat16 类型支持 #include "../../../../utils/custom_types.h" @@ -14,9 +14,9 @@ namespace op::flipud::cpu { // 0. 
定义 Opaque 结构体 // ================================================================== struct Descriptor::Opaque { - std::vector shape; - std::vector in_strides; - std::vector out_strides; + std::vector shape; + std::vector in_strides; + std::vector out_strides; int ndim; }; @@ -38,22 +38,22 @@ infiniStatus_t Descriptor::create( Descriptor **desc_ptr, infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc) { - + auto handle = reinterpret_cast(handle_); - + // 1. 创建 Info auto result = FlipudInfo::create(out_desc, input_desc); CHECK_RESULT(result); - + // 2. 创建并填充 Opaque auto opaque = new Descriptor::Opaque(); opaque->ndim = static_cast(input_desc->ndim()); - - const auto& shape = input_desc->shape(); - const auto& in_strides = input_desc->strides(); - const auto& out_strides = out_desc->strides(); - for(int i = 0; i < opaque->ndim; ++i) { + const auto &shape = input_desc->shape(); + const auto &in_strides = input_desc->strides(); + const auto &out_strides = out_desc->strides(); + + for (int i = 0; i < opaque->ndim; ++i) { opaque->shape.push_back(shape[i]); opaque->in_strides.push_back(in_strides[i]); opaque->out_strides.push_back(out_strides[i]); @@ -63,10 +63,9 @@ infiniStatus_t Descriptor::create( *desc_ptr = new Descriptor( opaque, result.take(), - 0, - handle->device, - handle->device_id - ); + 0, + handle->device, + handle->device_id); return INFINI_STATUS_SUCCESS; } @@ -78,24 +77,24 @@ infiniStatus_t Descriptor::create( template void calculate_cpu_impl( int ndim, - const std::vector& shape, - const std::vector& in_strides, - const std::vector& out_strides, + const std::vector &shape, + const std::vector &in_strides, + const std::vector &out_strides, size_t numel, void *output, const void *input) { auto out_ptr = reinterpret_cast(output); auto in_ptr = reinterpret_cast(input); - + // 维度 0 的大小 int64_t dim0_size = shape[0]; - #pragma omp parallel for schedule(static) +#pragma omp parallel for schedule(static) for (size_t i = 0; i < numel; ++i) { 
// --- A. 坐标反解 --- std::vector coords(ndim); - + size_t temp_idx = i; for (int d = ndim - 1; d >= 0; --d) { coords[d] = temp_idx % shape[d]; @@ -140,25 +139,25 @@ infiniStatus_t Descriptor::calculate( switch (dtype) { case INFINI_DTYPE_F32: cpu::calculate_cpu_impl( - _opaque->ndim, _opaque->shape, _opaque->in_strides, _opaque->out_strides, + _opaque->ndim, _opaque->shape, _opaque->in_strides, _opaque->out_strides, numel, output, input); break; - + case INFINI_DTYPE_F64: cpu::calculate_cpu_impl( - _opaque->ndim, _opaque->shape, _opaque->in_strides, _opaque->out_strides, + _opaque->ndim, _opaque->shape, _opaque->in_strides, _opaque->out_strides, numel, output, input); break; - + case INFINI_DTYPE_F16: cpu::calculate_cpu_impl( - _opaque->ndim, _opaque->shape, _opaque->in_strides, _opaque->out_strides, + _opaque->ndim, _opaque->shape, _opaque->in_strides, _opaque->out_strides, numel, output, input); break; - + case INFINI_DTYPE_BF16: cpu::calculate_cpu_impl( - _opaque->ndim, _opaque->shape, _opaque->in_strides, _opaque->out_strides, + _opaque->ndim, _opaque->shape, _opaque->in_strides, _opaque->out_strides, numel, output, input); break; default: @@ -168,4 +167,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::flipud::cpu \ No newline at end of file +} // namespace op::flipud::cpu diff --git a/src/infiniop/ops/flipud/cpu/flipud_cpu.h b/src/infiniop/ops/flipud/cpu/flipud_cpu.h index eff0b8020..a34908c30 100644 --- a/src/infiniop/ops/flipud/cpu/flipud_cpu.h +++ b/src/infiniop/ops/flipud/cpu/flipud_cpu.h @@ -5,4 +5,4 @@ DESCRIPTOR(cpu) -#endif // __FLIPUD_CPU_H__ \ No newline at end of file +#endif // __FLIPUD_CPU_H__ diff --git a/src/infiniop/ops/flipud/cuda/kernel.cuh b/src/infiniop/ops/flipud/cuda/kernel.cuh index e5d436921..2910b6df5 100644 --- a/src/infiniop/ops/flipud/cuda/kernel.cuh +++ b/src/infiniop/ops/flipud/cuda/kernel.cuh @@ -1,8 +1,8 @@ #ifndef __FLIPUD_CUDA_CUH__ #define __FLIPUD_CUDA_CUH__ -#include -#include 
#include +#include +#include #include @@ -22,18 +22,18 @@ struct TensorLayout { size_t out_strides[MAX_DIMS]; }; -__device__ __forceinline__ void index_to_coords(size_t index, const TensorLayout& layout, size_t* coords) { +__device__ __forceinline__ void index_to_coords(size_t index, const TensorLayout &layout, size_t *coords) { size_t temp = index; - #pragma unroll +#pragma unroll for (int i = layout.ndim - 1; i >= 0; --i) { coords[i] = temp % layout.shape[i]; temp /= layout.shape[i]; } } -__device__ __forceinline__ size_t coords_to_offset(const size_t* coords, const size_t* strides, int ndim) { +__device__ __forceinline__ size_t coords_to_offset(const size_t *coords, const size_t *strides, int ndim) { size_t offset = 0; - #pragma unroll +#pragma unroll for (int i = 0; i < ndim; ++i) { offset += coords[i] * strides[i]; } @@ -42,21 +42,21 @@ __device__ __forceinline__ size_t coords_to_offset(const size_t* coords, const s template __global__ void flipud_kernel( - T * __restrict__ output, - const T * __restrict__ input, + T *__restrict__ output, + const T *__restrict__ input, size_t numel, TensorLayout layout) { size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - + if (idx < numel) { size_t coords[MAX_DIMS]; index_to_coords(idx, layout, coords); size_t out_offset = coords_to_offset(coords, layout.out_strides, layout.ndim); - + coords[0] = layout.shape[0] - 1 - coords[0]; - + size_t in_offset = coords_to_offset(coords, layout.in_strides, layout.ndim); output[out_offset] = input[in_offset]; @@ -65,31 +65,30 @@ __global__ void flipud_kernel( template __global__ void flipud_kernel_vectorized( - T * __restrict__ output, - const T * __restrict__ input, + T *__restrict__ output, + const T *__restrict__ input, size_t num_packs, TensorLayout layout) { using PackType = Pack; size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - + if (idx < num_packs) { size_t scalar_idx = idx * PackSize; size_t coords[MAX_DIMS]; - + index_to_coords(scalar_idx, layout, coords); - + size_t 
out_offset = coords_to_offset(coords, layout.out_strides, layout.ndim); - + coords[0] = layout.shape[0] - 1 - coords[0]; - + size_t in_offset = coords_to_offset(coords, layout.in_strides, layout.ndim); - *reinterpret_cast(output + out_offset) = - *reinterpret_cast(input + in_offset); + *reinterpret_cast(output + out_offset) = *reinterpret_cast(input + in_offset); } } } // namespace op::flipud::cuda -#endif // __FLIPUD_CUDA_CUH__ \ No newline at end of file +#endif // __FLIPUD_CUDA_CUH__ diff --git a/src/infiniop/ops/flipud/flipud.h b/src/infiniop/ops/flipud/flipud.h index 87b83f5d4..22dafceed 100644 --- a/src/infiniop/ops/flipud/flipud.h +++ b/src/infiniop/ops/flipud/flipud.h @@ -2,47 +2,47 @@ #define __FLIPUD_H__ #include "../../operator.h" -#include "info.h" +#include "info.h" // 宏定义:用于生成不同命名空间下的 Descriptor 类 // 适配 Flipud 的单输入单输出模式 -#define DESCRIPTOR(NAMESPACE) \ - namespace op::flipud::NAMESPACE { \ - class Descriptor final : public InfiniopDescriptor { \ - struct Opaque; \ - Opaque *_opaque; \ - FlipudInfo _info; \ - size_t _workspace_size; \ - \ - Descriptor( \ - Opaque *opaque, \ - FlipudInfo info, \ - size_t workspace_size, \ - infiniDevice_t device_type, \ - int device_id) \ - : InfiniopDescriptor{device_type, device_id}, \ - _opaque(opaque), \ - _info(info), \ - _workspace_size(workspace_size) {} \ - \ - public: \ - ~Descriptor(); \ - \ - size_t workspaceSize() const { return _workspace_size; } \ - \ - static infiniStatus_t create( \ - infiniopHandle_t handle, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t out_desc, \ - infiniopTensorDescriptor_t input_desc); \ - \ - infiniStatus_t calculate( \ - void *workspace, \ - size_t workspace_size, \ - void *output, \ - const void *input, \ - void *stream) const; \ - }; \ +#define DESCRIPTOR(NAMESPACE) \ + namespace op::flipud::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + FlipudInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( 
\ + Opaque *opaque, \ + FlipudInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t input_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ } -#endif // __FLIPUD_H__ \ No newline at end of file +#endif // __FLIPUD_H__ diff --git a/src/infiniop/ops/flipud/info.h b/src/infiniop/ops/flipud/info.h index 655bd91d4..1dd9fe813 100644 --- a/src/infiniop/ops/flipud/info.h +++ b/src/infiniop/ops/flipud/info.h @@ -19,7 +19,7 @@ class FlipudInfo { int ndim() const { return _ndim; } size_t numel() const { return _numel; } - FlipudInfo(int dtype, int ndim, size_t numel) + FlipudInfo(int dtype, int ndim, size_t numel) : _dtype(dtype), _ndim(ndim), _numel(numel) {} static utils::Result create( @@ -35,12 +35,12 @@ class FlipudInfo { } if (input_desc->ndim() < 1) { - return INFINI_STATUS_BAD_TENSOR_SHAPE; + return INFINI_STATUS_BAD_TENSOR_SHAPE; } const auto &in_shape = input_desc->shape(); const auto &out_shape = out_desc->shape(); - + for (size_t i = 0; i < input_desc->ndim(); ++i) { if (in_shape[i] != out_shape[i]) { return INFINI_STATUS_BAD_TENSOR_SHAPE; @@ -50,11 +50,10 @@ class FlipudInfo { return utils::Result(FlipudInfo{ input_desc->dtype(), static_cast(input_desc->ndim()), - input_desc->numel() - }); + input_desc->numel()}); } }; } // namespace op::flipud -#endif // __FLIPUD_INFO_H__ \ No newline at end of file +#endif // __FLIPUD_INFO_H__ diff --git a/src/infiniop/ops/flipud/metax/flipud_metax.h b/src/infiniop/ops/flipud/metax/flipud_metax.h 
index 5b8e66cab..2c4739c65 100644 --- a/src/infiniop/ops/flipud/metax/flipud_metax.h +++ b/src/infiniop/ops/flipud/metax/flipud_metax.h @@ -5,4 +5,4 @@ DESCRIPTOR(metax) -#endif // __FLIPUD_METAX_API_H__ \ No newline at end of file +#endif // __FLIPUD_METAX_API_H__ diff --git a/src/infiniop/ops/flipud/metax/flipud_metax.maca b/src/infiniop/ops/flipud/metax/flipud_metax.maca index 0fb8e504d..47690cf13 100644 --- a/src/infiniop/ops/flipud/metax/flipud_metax.maca +++ b/src/infiniop/ops/flipud/metax/flipud_metax.maca @@ -1,12 +1,12 @@ -#include "flipud_metax.h" #include "../../../devices/metax/metax_common.h" #include "../../../devices/metax/metax_handle.h" -#include -#include +#include "flipud_metax.h" #include -#include -#include +#include #include +#include +#include +#include namespace op::flipud::metax { @@ -19,17 +19,27 @@ struct TensorLayout { int out_strides[MAX_DIMS]; }; -template struct VectorType; -template <> struct VectorType<16> { using type = int4; }; // 128-bit -template <> struct VectorType<8> { using type = int2; }; // 64-bit -template <> struct VectorType<4> { using type = int; }; // 32-bit +template +struct VectorType; +template <> +struct VectorType<16> { + using type = int4; +}; // 128-bit +template <> +struct VectorType<8> { + using type = int2; +}; // 64-bit +template <> +struct VectorType<4> { + using type = int; +}; // 32-bit // --------------------------------------------------------- -__device__ inline size_t get_offset(int idx, const int* strides, int ndim, const int* shape) { +__device__ inline size_t get_offset(int idx, const int *strides, int ndim, const int *shape) { size_t offset = 0; int rem = idx; - #pragma unroll +#pragma unroll for (int i = ndim - 1; i >= 0; --i) { int dim_sz = shape[i]; int pos = rem % dim_sz; @@ -39,15 +49,15 @@ __device__ inline size_t get_offset(int idx, const int* strides, int ndim, const return offset; } -__device__ inline size_t get_flipud_src_offset(int idx, const int* strides, int ndim, const int* shape) { 
+__device__ inline size_t get_flipud_src_offset(int idx, const int *strides, int ndim, const int *shape) { size_t offset = 0; int rem = idx; - #pragma unroll +#pragma unroll for (int i = ndim - 1; i >= 0; --i) { int dim_sz = shape[i]; int pos = rem % dim_sz; rem /= dim_sz; - + if (i == 0) { pos = dim_sz - 1 - pos; } @@ -59,10 +69,11 @@ __device__ inline size_t get_flipud_src_offset(int idx, const int* strides, int // 标量 Kernel template __global__ void flipud_kernel( - T* dst, const T* src, size_t n, TensorLayout layout) -{ + T *dst, const T *src, size_t n, TensorLayout layout) { size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= n) return; + if (idx >= n) { + return; + } size_t dst_off = get_offset(idx, layout.out_strides, layout.ndim, layout.shape); size_t src_off = get_flipud_src_offset(idx, layout.in_strides, layout.ndim, layout.shape); @@ -73,20 +84,21 @@ __global__ void flipud_kernel( // 向量化 Kernel template __global__ void flipud_kernel_vectorized( - T* dst, const T* src, size_t num_packs, TensorLayout layout) -{ + T *dst, const T *src, size_t num_packs, TensorLayout layout) { // [修正] 使用 int4/int2 替代 aligned_storage using VecT = typename VectorType::type; - + size_t pack_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (pack_idx >= num_packs) return; - + if (pack_idx >= num_packs) { + return; + } + int strides_in[MAX_DIMS], strides_out[MAX_DIMS], shape[MAX_DIMS]; - - #pragma unroll - for(int i=0; i 0) { - shape[layout.ndim-1] /= PackSize; + shape[layout.ndim - 1] /= PackSize; } size_t dst_pack_off = get_offset(pack_idx, strides_out, layout.ndim, shape); size_t src_pack_off = get_flipud_src_offset(pack_idx, strides_in, layout.ndim, shape); // 强转为向量类型进行读写 - const VecT* src_vec = reinterpret_cast(src); - VecT* dst_vec = reinterpret_cast(dst); + const VecT *src_vec = reinterpret_cast(src); + VecT *dst_vec = reinterpret_cast(dst); dst_vec[dst_pack_off] = src_vec[src_pack_off]; } @@ -129,22 +141,22 @@ void launch_kernel( auto in_ptr = 
reinterpret_cast(input); auto out_ptr = reinterpret_cast(output); auto mc_stream = reinterpret_cast(stream); - - constexpr int TotalBytes = 16; + + constexpr int TotalBytes = 16; constexpr int PackSize = TotalBytes / sizeof(T); - + // ---------------- Check Vectorization ---------------- bool is_ptr_aligned = is_pointer_aligned(output, TotalBytes) && is_pointer_aligned(input, TotalBytes); bool is_numel_divisible = (numel % PackSize == 0); - bool is_last_dim_aligned = (layout.ndim > 0) && (layout.shape[layout.ndim-1] % PackSize == 0); + bool is_last_dim_aligned = (layout.ndim > 0) && (layout.shape[layout.ndim - 1] % PackSize == 0); bool is_inner_contiguous = false; if (layout.ndim > 0) { - if (layout.in_strides[layout.ndim-1] == 1 && layout.out_strides[layout.ndim-1] == 1) { + if (layout.in_strides[layout.ndim - 1] == 1 && layout.out_strides[layout.ndim - 1] == 1) { is_inner_contiguous = true; } } - + bool is_stride_aligned = true; for (int i = 0; i < layout.ndim - 1; ++i) { if (layout.in_strides[i] % PackSize != 0 || layout.out_strides[i] % PackSize != 0) { @@ -156,32 +168,28 @@ void launch_kernel( // [逻辑正确] 1D Tensor 禁止向量化 bool is_dim_safe = (layout.ndim > 1); - bool can_vectorize = (PackSize > 1) && - is_ptr_aligned && - is_numel_divisible && - is_last_dim_aligned && - is_inner_contiguous && - is_stride_aligned && - is_dim_safe; + bool can_vectorize = (PackSize > 1) && is_ptr_aligned && is_numel_divisible && is_last_dim_aligned && is_inner_contiguous && is_stride_aligned && is_dim_safe; if (can_vectorize) { size_t num_packs = numel / PackSize; size_t block_size = 256; size_t grid_size = (num_packs + block_size - 1) / block_size; - + flipud_kernel_vectorized <<>>(out_ptr, in_ptr, num_packs, layout); } else { size_t block_size = 256; size_t grid_size = (numel + block_size - 1) / block_size; - + flipud_kernel <<>>(out_ptr, in_ptr, numel, layout); } } -Descriptor::~Descriptor() { - if (_opaque) delete _opaque; +Descriptor::~Descriptor() { + if (_opaque) { + delete 
_opaque; + } } infiniStatus_t Descriptor::create( @@ -190,19 +198,21 @@ infiniStatus_t Descriptor::create( auto handle = reinterpret_cast(handle_); auto info_result = FlipudInfo::create(out_desc, input_desc); - if (!info_result) return info_result.status(); + if (!info_result) { + return info_result.status(); + } auto opaque = new Opaque(); opaque->layout.ndim = static_cast(input_desc->ndim()); - + if (opaque->layout.ndim > MAX_DIMS) { delete opaque; return INFINI_STATUS_BAD_TENSOR_SHAPE; } - const auto& shape = input_desc->shape(); - const auto& in_strides = input_desc->strides(); - const auto& out_strides = out_desc->strides(); + const auto &shape = input_desc->shape(); + const auto &in_strides = input_desc->strides(); + const auto &out_strides = out_desc->strides(); for (int i = 0; i < opaque->layout.ndim; ++i) { opaque->layout.shape[i] = shape[i]; @@ -244,4 +254,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::flipud::metax \ No newline at end of file +} // namespace op::flipud::metax diff --git a/src/infiniop/ops/flipud/moore/flipud_moore.h b/src/infiniop/ops/flipud/moore/flipud_moore.h index ac76e968c..3fc1364c7 100644 --- a/src/infiniop/ops/flipud/moore/flipud_moore.h +++ b/src/infiniop/ops/flipud/moore/flipud_moore.h @@ -5,4 +5,4 @@ DESCRIPTOR(moore) -#endif // __FLIPUD_MOORE_H__ \ No newline at end of file +#endif // __FLIPUD_MOORE_H__ diff --git a/src/infiniop/ops/flipud/moore/flipud_moore.mu b/src/infiniop/ops/flipud/moore/flipud_moore.mu index 44b1814e1..e2adfbe01 100644 --- a/src/infiniop/ops/flipud/moore/flipud_moore.mu +++ b/src/infiniop/ops/flipud/moore/flipud_moore.mu @@ -1,8 +1,8 @@ +#include "../../../devices/moore/moore_handle.h" #include "flipud_moore.h" #include "flipud_moore_kernel.h" -#include "../../../devices/moore/moore_handle.h" -#include #include +#include #include namespace op::flipud::moore { @@ -34,10 +34,10 @@ void launch_kernel( auto in_ptr = reinterpret_cast(input); auto out_ptr = 
reinterpret_cast(output); auto musa_stream = reinterpret_cast(stream); - + constexpr int TotalBytes = 16; // 128-bit constexpr int PackSize = TotalBytes / sizeof(T); - + // ------------------------------------------ // 向量化判定 (Vectorization Check) // ------------------------------------------ @@ -48,13 +48,11 @@ void launch_kernel( bool is_numel_divisible = (numel % PackSize == 0); // 3. 最后一维大小必须是 PackSize 的倍数 (保证 Pack 不会跨行读取) - bool is_last_dim_aligned = (layout.ndim > 0) && (layout.shape[layout.ndim-1] % PackSize == 0); + bool is_last_dim_aligned = (layout.ndim > 0) && (layout.shape[layout.ndim - 1] % PackSize == 0); // 4. 连续性条件:维度 > 1 且 最内层在内存中是连续的 (stride=1) - bool is_inner_contiguous = (layout.ndim > 1) && - (layout.in_strides[layout.ndim-1] == 1) && - (layout.out_strides[layout.ndim-1] == 1); - + bool is_inner_contiguous = (layout.ndim > 1) && (layout.in_strides[layout.ndim - 1] == 1) && (layout.out_strides[layout.ndim - 1] == 1); + // 5. 步长对齐条件: 除非是最内层维度,否则所有 Stride 都必须是 PackSize 的倍数 // 这样保证每个 Pack 读取的起始地址都是对齐的 bool is_stride_aligned = true; @@ -65,24 +63,19 @@ void launch_kernel( } } - bool can_vectorize = (PackSize > 1) && - is_ptr_aligned && - is_numel_divisible && - is_last_dim_aligned && - is_inner_contiguous && - is_stride_aligned; + bool can_vectorize = (PackSize > 1) && is_ptr_aligned && is_numel_divisible && is_last_dim_aligned && is_inner_contiguous && is_stride_aligned; if (can_vectorize) { size_t num_packs = numel / PackSize; size_t block_size = 256; size_t grid_size = (num_packs + block_size - 1) / block_size; - + op::flipud::moore::flipud_kernel_vectorized <<>>(out_ptr, in_ptr, num_packs, layout); } else { size_t block_size = 256; size_t grid_size = (numel + block_size - 1) / block_size; - + op::flipud::moore::flipud_kernel <<>>(out_ptr, in_ptr, numel, layout); } @@ -91,8 +84,10 @@ void launch_kernel( // ================================================================== // Descriptor 实现 // 
================================================================== -Descriptor::~Descriptor() { - if (_opaque) delete _opaque; +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } } infiniStatus_t Descriptor::create( @@ -102,19 +97,21 @@ infiniStatus_t Descriptor::create( auto handle = reinterpret_cast(handle_); auto info_result = FlipudInfo::create(out_desc, input_desc); - if (!info_result) return info_result.status(); + if (!info_result) { + return info_result.status(); + } auto opaque = new Opaque(); opaque->layout.ndim = static_cast(input_desc->ndim()); - + if (opaque->layout.ndim > op::flipud::moore::MAX_DIMS) { delete opaque; return INFINI_STATUS_BAD_TENSOR_SHAPE; } - const auto& shape = input_desc->shape(); - const auto& in_strides = input_desc->strides(); - const auto& out_strides = out_desc->strides(); + const auto &shape = input_desc->shape(); + const auto &in_strides = input_desc->strides(); + const auto &out_strides = out_desc->strides(); for (int i = 0; i < opaque->layout.ndim; ++i) { opaque->layout.shape[i] = shape[i]; @@ -153,4 +150,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::flipud::moore \ No newline at end of file +} // namespace op::flipud::moore diff --git a/src/infiniop/ops/flipud/moore/flipud_moore_kernel.h b/src/infiniop/ops/flipud/moore/flipud_moore_kernel.h index 8a5c65ef4..616d7efc6 100644 --- a/src/infiniop/ops/flipud/moore/flipud_moore_kernel.h +++ b/src/infiniop/ops/flipud/moore/flipud_moore_kernel.h @@ -1,10 +1,10 @@ #ifndef __FLIPUD_MOORE_KERNEL_H__ #define __FLIPUD_MOORE_KERNEL_H__ -#include -#include -#include #include +#include +#include +#include namespace op::flipud::moore { @@ -22,18 +22,18 @@ struct TensorLayout { size_t out_strides[MAX_DIMS]; }; -__device__ __forceinline__ void index_to_coords(size_t index, const TensorLayout& layout, size_t* coords) { +__device__ __forceinline__ void index_to_coords(size_t index, const TensorLayout &layout, size_t *coords) { 
size_t temp = index; - #pragma unroll +#pragma unroll for (int i = layout.ndim - 1; i >= 0; --i) { coords[i] = temp % layout.shape[i]; temp /= layout.shape[i]; } } -__device__ __forceinline__ size_t coords_to_offset(const size_t* coords, const size_t* strides, int ndim) { +__device__ __forceinline__ size_t coords_to_offset(const size_t *coords, const size_t *strides, int ndim) { size_t offset = 0; - #pragma unroll +#pragma unroll for (int i = 0; i < ndim; ++i) { offset += coords[i] * strides[i]; } @@ -42,22 +42,22 @@ __device__ __forceinline__ size_t coords_to_offset(const size_t* coords, const s template __global__ void flipud_kernel( - T * __restrict__ output, - const T * __restrict__ input, + T *__restrict__ output, + const T *__restrict__ input, size_t numel, TensorLayout layout) { size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - + if (idx < numel) { size_t coords[MAX_DIMS]; index_to_coords(idx, layout, coords); size_t out_offset = coords_to_offset(coords, layout.out_strides, layout.ndim); - + // Flip dimension 0 coords[0] = layout.shape[0] - 1 - coords[0]; - + size_t in_offset = coords_to_offset(coords, layout.in_strides, layout.ndim); output[out_offset] = input[in_offset]; @@ -66,32 +66,31 @@ __global__ void flipud_kernel( template __global__ void flipud_kernel_vectorized( - T * __restrict__ output, - const T * __restrict__ input, + T *__restrict__ output, + const T *__restrict__ input, size_t num_packs, TensorLayout layout) { using PackType = Pack; size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - + if (idx < num_packs) { size_t scalar_idx = idx * PackSize; size_t coords[MAX_DIMS]; - + index_to_coords(scalar_idx, layout, coords); - + size_t out_offset = coords_to_offset(coords, layout.out_strides, layout.ndim); - + // Flip dimension 0 coords[0] = layout.shape[0] - 1 - coords[0]; - + size_t in_offset = coords_to_offset(coords, layout.in_strides, layout.ndim); - *reinterpret_cast(output + out_offset) = - *reinterpret_cast(input + in_offset); + 
*reinterpret_cast(output + out_offset) = *reinterpret_cast(input + in_offset); } } } // namespace op::flipud::moore -#endif // __FLIPUD_MOORE_KERNEL_H__ \ No newline at end of file +#endif // __FLIPUD_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/flipud/nvidia/flipud_nvidia.cu b/src/infiniop/ops/flipud/nvidia/flipud_nvidia.cu index 71e8e7d4b..33ca79ab0 100644 --- a/src/infiniop/ops/flipud/nvidia/flipud_nvidia.cu +++ b/src/infiniop/ops/flipud/nvidia/flipud_nvidia.cu @@ -1,8 +1,8 @@ -#include "flipud_nvidia.cuh" -#include "../cuda/kernel.cuh" #include "../../../handle.h" -#include +#include "../cuda/kernel.cuh" +#include "flipud_nvidia.cuh" #include +#include #include namespace op::flipud::nvidia { @@ -37,25 +37,22 @@ void launch_kernel( auto in_ptr = reinterpret_cast(input); auto out_ptr = reinterpret_cast(output); auto cuda_stream = reinterpret_cast(stream); - + constexpr int TotalBytes = 16; // 128-bit constexpr int PackSize = TotalBytes / sizeof(T); - + // ------------------------------------------ // 向量化判定 (Vectorization Check) // ------------------------------------------ bool is_ptr_aligned = is_pointer_aligned(output, TotalBytes) && is_pointer_aligned(input, TotalBytes); - bool is_numel_divisible = (numel % PackSize == 0); - bool is_last_dim_aligned = (layout.ndim > 0) && (layout.shape[layout.ndim-1] % PackSize == 0); + bool is_last_dim_aligned = (layout.ndim > 0) && (layout.shape[layout.ndim - 1] % PackSize == 0); // 4. 连续性条件:维度 > 1 且 最内层连续 - bool is_inner_contiguous = (layout.ndim > 1) && - (layout.in_strides[layout.ndim-1] == 1) && - (layout.out_strides[layout.ndim-1] == 1); - + bool is_inner_contiguous = (layout.ndim > 1) && (layout.in_strides[layout.ndim - 1] == 1) && (layout.out_strides[layout.ndim - 1] == 1); + // 5. 
步长对齐条件 bool is_stride_aligned = true; for (int i = 0; i < layout.ndim - 1; ++i) { @@ -65,24 +62,19 @@ void launch_kernel( } } - bool can_vectorize = (PackSize > 1) && - is_ptr_aligned && - is_numel_divisible && - is_last_dim_aligned && - is_inner_contiguous && - is_stride_aligned; + bool can_vectorize = (PackSize > 1) && is_ptr_aligned && is_numel_divisible && is_last_dim_aligned && is_inner_contiguous && is_stride_aligned; if (can_vectorize) { size_t num_packs = numel / PackSize; size_t block_size = 256; size_t grid_size = (num_packs + block_size - 1) / block_size; - + op::flipud::cuda::flipud_kernel_vectorized <<>>(out_ptr, in_ptr, num_packs, layout); } else { size_t block_size = 256; size_t grid_size = (numel + block_size - 1) / block_size; - + op::flipud::cuda::flipud_kernel <<>>(out_ptr, in_ptr, numel, layout); } @@ -91,8 +83,10 @@ void launch_kernel( // ================================================================== // Descriptor 实现 // ================================================================== -Descriptor::~Descriptor() { - if (_opaque) delete _opaque; +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } } infiniStatus_t Descriptor::create( @@ -100,19 +94,21 @@ infiniStatus_t Descriptor::create( infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc) { auto info_result = FlipudInfo::create(out_desc, input_desc); - if (!info_result) return info_result.status(); + if (!info_result) { + return info_result.status(); + } auto opaque = new Opaque(); opaque->layout.ndim = static_cast(input_desc->ndim()); - + if (opaque->layout.ndim > op::flipud::cuda::MAX_DIMS) { delete opaque; return INFINI_STATUS_BAD_TENSOR_SHAPE; } - const auto& shape = input_desc->shape(); - const auto& in_strides = input_desc->strides(); - const auto& out_strides = out_desc->strides(); + const auto &shape = input_desc->shape(); + const auto &in_strides = input_desc->strides(); + const auto &out_strides = out_desc->strides(); for (int i = 0; i < 
opaque->layout.ndim; ++i) { opaque->layout.shape[i] = shape[i]; @@ -152,4 +148,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::flipud::nvidia \ No newline at end of file +} // namespace op::flipud::nvidia diff --git a/src/infiniop/ops/flipud/nvidia/flipud_nvidia.cuh b/src/infiniop/ops/flipud/nvidia/flipud_nvidia.cuh index 2b5396112..cef257731 100644 --- a/src/infiniop/ops/flipud/nvidia/flipud_nvidia.cuh +++ b/src/infiniop/ops/flipud/nvidia/flipud_nvidia.cuh @@ -4,4 +4,4 @@ #include "../flipud.h" DESCRIPTOR(nvidia) -#endif // __FLIPUD_NVIDIA_CUH__ \ No newline at end of file +#endif // __FLIPUD_NVIDIA_CUH__ diff --git a/src/infiniop/ops/flipud/operator.cc b/src/infiniop/ops/flipud/operator.cc index 0d6359b7e..f9d384980 100644 --- a/src/infiniop/ops/flipud/operator.cc +++ b/src/infiniop/ops/flipud/operator.cc @@ -23,84 +23,84 @@ extern "C" { // ======================================================================= // 1. 创建算子描述符 // ======================================================================= -__C infiniStatus_t infiniopCreateFlipudDescriptor( +__INFINI_C infiniStatus_t infiniopCreateFlipudDescriptor( infiniopHandle_t handle, infiniopFlipudDescriptor_t *desc_ptr, infiniopTensorDescriptor_t output, infiniopTensorDescriptor_t input) { - #define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::flipud::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output, \ - input) +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::flipud::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output, \ + input) switch (handle->device) { - #ifdef ENABLE_CPU_API +#ifdef ENABLE_CPU_API CREATE(INFINI_DEVICE_CPU, cpu); - #endif - #ifdef ENABLE_NVIDIA_API +#endif +#ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, nvidia); - #endif - #ifdef ENABLE_ILUVATAR_API +#endif +#ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, nvidia); - #endif - #ifdef 
ENABLE_QY_API +#endif +#ifdef ENABLE_QY_API CREATE(INFINI_DEVICE_QY, nvidia); - #endif - #ifdef ENABLE_METAX_API +#endif +#ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax); - #endif - #ifdef ENABLE_MOORE_API +#endif +#ifdef ENABLE_MOORE_API CREATE(INFINI_DEVICE_MOORE, moore); - #endif +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef CREATE +#undef CREATE } // ======================================================================= // 2. 获取 Workspace 大小 // ======================================================================= -__C infiniStatus_t infiniopGetFlipudWorkspaceSize(infiniopFlipudDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetFlipudWorkspaceSize(infiniopFlipudDescriptor_t desc, size_t *size) { - #define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS switch (desc->device_type) { - #ifdef ENABLE_CPU_API +#ifdef ENABLE_CPU_API GET(INFINI_DEVICE_CPU, cpu); - #endif - #ifdef ENABLE_NVIDIA_API +#endif +#ifdef ENABLE_NVIDIA_API GET(INFINI_DEVICE_NVIDIA, nvidia); - #endif - #ifdef ENABLE_ILUVATAR_API +#endif +#ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, nvidia); - #endif - #ifdef ENABLE_QY_API +#endif +#ifdef ENABLE_QY_API GET(INFINI_DEVICE_QY, nvidia); - #endif - #ifdef ENABLE_METAX_API +#endif +#ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax); - #endif - #ifdef ENABLE_MOORE_API +#endif +#ifdef ENABLE_MOORE_API GET(INFINI_DEVICE_MOORE, moore); - #endif +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef GET +#undef GET } // ======================================================================= // 3. 
执行计算 (Calculate) // ======================================================================= -__C infiniStatus_t infiniopFlipud( +__INFINI_C infiniStatus_t infiniopFlipud( infiniopFlipudDescriptor_t desc, void *workspace, size_t workspace_size, @@ -108,69 +108,69 @@ __C infiniStatus_t infiniopFlipud( const void *input, void *stream) { - #define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, output, input, stream) +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, stream) switch (desc->device_type) { - #ifdef ENABLE_CPU_API +#ifdef ENABLE_CPU_API CALCULATE(INFINI_DEVICE_CPU, cpu); - #endif - #ifdef ENABLE_NVIDIA_API +#endif +#ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); - #endif - #ifdef ENABLE_ILUVATAR_API +#endif +#ifdef ENABLE_ILUVATAR_API CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); - #endif - #ifdef ENABLE_QY_API +#endif +#ifdef ENABLE_QY_API CALCULATE(INFINI_DEVICE_QY, nvidia); - #endif - #ifdef ENABLE_METAX_API +#endif +#ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax); - #endif - #ifdef ENABLE_MOORE_API +#endif +#ifdef ENABLE_MOORE_API CALCULATE(INFINI_DEVICE_MOORE, moore); - #endif +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef CALCULATE +#undef CALCULATE } // ======================================================================= // 4. 
销毁描述符 // ======================================================================= -__C infiniStatus_t infiniopDestroyFlipudDescriptor(infiniopFlipudDescriptor_t desc) { +__INFINI_C infiniStatus_t infiniopDestroyFlipudDescriptor(infiniopFlipudDescriptor_t desc) { - #define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS switch (desc->device_type) { - #ifdef ENABLE_CPU_API +#ifdef ENABLE_CPU_API DELETE(INFINI_DEVICE_CPU, cpu); - #endif - #ifdef ENABLE_NVIDIA_API +#endif +#ifdef ENABLE_NVIDIA_API DELETE(INFINI_DEVICE_NVIDIA, nvidia); - #endif - #ifdef ENABLE_ILUVATAR_API +#endif +#ifdef ENABLE_ILUVATAR_API DELETE(INFINI_DEVICE_ILUVATAR, nvidia); - #endif - #ifdef ENABLE_QY_API +#endif +#ifdef ENABLE_QY_API DELETE(INFINI_DEVICE_QY, nvidia); - #endif - #ifdef ENABLE_METAX_API +#endif +#ifdef ENABLE_METAX_API DELETE(INFINI_DEVICE_METAX, metax); - #endif - #ifdef ENABLE_MOORE_API +#endif +#ifdef ENABLE_MOORE_API DELETE(INFINI_DEVICE_MOORE, moore); - #endif +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef DELETE +#undef DELETE } -} // extern "C" \ No newline at end of file +} // extern "C" diff --git a/src/infiniop/ops/float_power/cpu/float_power_cpu.cc b/src/infiniop/ops/float_power/cpu/float_power_cpu.cc index 32cfcb6ec..852ed13e3 100644 --- a/src/infiniop/ops/float_power/cpu/float_power_cpu.cc +++ b/src/infiniop/ops/float_power/cpu/float_power_cpu.cc @@ -1,9 +1,9 @@ #include "float_power_cpu.h" +#include "../../../../utils/custom_types.h" #include "../../../devices/cpu/common_cpu.h" #include #include #include -#include "../../../../utils/custom_types.h" namespace op::float_power::cpu { @@ -17,22 +17,21 @@ infiniStatus_t Descriptor::create( Descriptor **desc_ptr, infiniopTensorDescriptor_t y, infiniopTensorDescriptor_t x, - infiniopTensorDescriptor_t exponent, - float 
scalar_exponent) { - + infiniopTensorDescriptor_t exponent, + float scalar_exponent) { + auto handle = reinterpret_cast(handle_); - + // 创建 Info 对象进行校验 (Info 类已更新,支持混合精度和 Tensor 指数) auto result = FloatPowerInfo::create(y, x, exponent, scalar_exponent); CHECK_RESULT(result); - + *desc_ptr = new Descriptor( nullptr, result.take(), 0, // CPU 不需要 workspace - handle->device, - handle->device_id - ); + handle->device, + handle->device_id); return INFINI_STATUS_SUCCESS; } @@ -49,11 +48,11 @@ void calculate_cpu_impl( const void *exponent_ptr) { size_t numel = info.num_elements(); - + // 获取指数模式 bool is_scalar = info.is_scalar_exponent(); float scalar_exp = info.scalar_exponent(); - + auto out_ptr = reinterpret_cast(output); auto in_ptr = reinterpret_cast(input); auto exp_ptr = reinterpret_cast(exponent_ptr); @@ -63,7 +62,7 @@ void calculate_cpu_impl( bool is_sqrt = is_scalar && (scalar_exp == 0.5f); bool is_identity = is_scalar && (scalar_exp == 1.0f); - #pragma omp parallel for schedule(static) +#pragma omp parallel for schedule(static) for (size_t i = 0; i < numel; ++i) { // 1. 
读取输入并转为 float float in_val = utils::cast(in_ptr[i]); @@ -103,30 +102,30 @@ infiniStatus_t Descriptor::calculate( size_t workspace_size, void *output, const void *input, - const void *exponent, + const void *exponent, void *stream) const { auto in_dtype = _info.input_dtype(); auto out_dtype = _info.output_dtype(); - // 定义内层宏:根据 Output 类型分发 - #define DISPATCH_OUT(IN_T) \ - switch (out_dtype) { \ - case INFINI_DTYPE_F32: \ - cpu::calculate_cpu_impl(_info, output, input, exponent); \ - return INFINI_STATUS_SUCCESS; \ - case INFINI_DTYPE_F64: \ - cpu::calculate_cpu_impl(_info, output, input, exponent); \ - return INFINI_STATUS_SUCCESS; \ - case INFINI_DTYPE_F16: \ - cpu::calculate_cpu_impl(_info, output, input, exponent); \ - return INFINI_STATUS_SUCCESS; \ - case INFINI_DTYPE_BF16: \ - cpu::calculate_cpu_impl(_info, output, input, exponent); \ - return INFINI_STATUS_SUCCESS; \ - default: \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } +// 定义内层宏:根据 Output 类型分发 +#define DISPATCH_OUT(IN_T) \ + switch (out_dtype) { \ + case INFINI_DTYPE_F32: \ + cpu::calculate_cpu_impl(_info, output, input, exponent); \ + return INFINI_STATUS_SUCCESS; \ + case INFINI_DTYPE_F64: \ + cpu::calculate_cpu_impl(_info, output, input, exponent); \ + return INFINI_STATUS_SUCCESS; \ + case INFINI_DTYPE_F16: \ + cpu::calculate_cpu_impl(_info, output, input, exponent); \ + return INFINI_STATUS_SUCCESS; \ + case INFINI_DTYPE_BF16: \ + cpu::calculate_cpu_impl(_info, output, input, exponent); \ + return INFINI_STATUS_SUCCESS; \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } // 外层 Switch:根据 Input 类型分发 switch (in_dtype) { @@ -142,7 +141,7 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_BAD_TENSOR_DTYPE; } - #undef DISPATCH_OUT +#undef DISPATCH_OUT } -} // namespace op::float_power::cpu \ No newline at end of file +} // namespace op::float_power::cpu diff --git a/src/infiniop/ops/float_power/cpu/float_power_cpu.h b/src/infiniop/ops/float_power/cpu/float_power_cpu.h index 
3f97c2726..10841fbcd 100644 --- a/src/infiniop/ops/float_power/cpu/float_power_cpu.h +++ b/src/infiniop/ops/float_power/cpu/float_power_cpu.h @@ -4,4 +4,4 @@ #include "../float_power.h" DESCRIPTOR(cpu) -#endif // __FLOAT_POWER_CPU_H__ \ No newline at end of file +#endif // __FLOAT_POWER_CPU_H__ diff --git a/src/infiniop/ops/float_power/cuda/kernel.cuh b/src/infiniop/ops/float_power/cuda/kernel.cuh index af07406ed..55a4a7eb1 100644 --- a/src/infiniop/ops/float_power/cuda/kernel.cuh +++ b/src/infiniop/ops/float_power/cuda/kernel.cuh @@ -1,9 +1,9 @@ #ifndef __FLOAT_POWER_CUDA_CUH__ #define __FLOAT_POWER_CUDA_CUH__ -#include -#include -#include #include +#include +#include +#include namespace op::float_power::cuda { @@ -25,17 +25,17 @@ struct FloatPowerFunctor { }; template __global__ void float_power_kernel( - T_OUT * __restrict__ output, - const T_IN * __restrict__ input, - const T_EXP * __restrict__ exponent, - float scalar_exponent, - bool is_scalar, - size_t numel, + T_OUT *__restrict__ output, + const T_IN *__restrict__ input, + const T_EXP *__restrict__ exponent, + float scalar_exponent, + bool is_scalar, + size_t numel, FloatPowerFunctor functor) { - for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < numel; + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < numel; idx += blockDim.x * gridDim.x) { - + float exp_val_f = is_scalar ? 
scalar_exponent : static_cast(exponent[idx]); output[idx] = static_cast(functor.compute(input[idx], exp_val_f)); } @@ -46,25 +46,25 @@ __global__ void float_power_kernel( // ================================================================== template __global__ void float_power_kernel_vectorized_scalar( - T_OUT * __restrict__ output, - const T_IN * __restrict__ input, - float scalar_exponent, - size_t num_packs, + T_OUT *__restrict__ output, + const T_IN *__restrict__ input, + float scalar_exponent, + size_t num_packs, FloatPowerFunctor functor) { - + using PackTypeIn = Pack; using PackTypeOut = Pack; - auto in_vec = reinterpret_cast(input); - auto out_vec = reinterpret_cast(output); - + auto in_vec = reinterpret_cast(input); + auto out_vec = reinterpret_cast(output); + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - + if (idx < num_packs) { PackTypeIn in_pack = in_vec[idx]; PackTypeOut out_pack; - - #pragma unroll + +#pragma unroll for (int i = 0; i < PackSize; ++i) { out_pack.val[i] = static_cast(functor.compute(in_pack.val[i], scalar_exponent)); } @@ -73,27 +73,27 @@ __global__ void float_power_kernel_vectorized_scalar( } template __global__ void float_power_kernel_vectorized_tensor( - T_OUT * __restrict__ output, - const T_IN * __restrict__ input, - const T_IN * __restrict__ exponent, - size_t num_packs, + T_OUT *__restrict__ output, + const T_IN *__restrict__ input, + const T_IN *__restrict__ exponent, + size_t num_packs, FloatPowerFunctor functor) { - + using PackTypeIn = Pack; using PackTypeOut = Pack; - auto in_vec = reinterpret_cast(input); - auto exp_vec = reinterpret_cast(exponent); - auto out_vec = reinterpret_cast(output); - + auto in_vec = reinterpret_cast(input); + auto exp_vec = reinterpret_cast(exponent); + auto out_vec = reinterpret_cast(output); + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - + if (idx < num_packs) { PackTypeIn in_pack = in_vec[idx]; - PackTypeIn exp_pack = exp_vec[idx]; + PackTypeIn exp_pack = exp_vec[idx]; 
PackTypeOut out_pack; - - #pragma unroll + +#pragma unroll for (int i = 0; i < PackSize; ++i) { float e = static_cast(exp_pack.val[i]); out_pack.val[i] = static_cast(functor.compute(in_pack.val[i], e)); @@ -104,4 +104,4 @@ __global__ void float_power_kernel_vectorized_tensor( } // namespace op::float_power::cuda -#endif // __FLOAT_POWER_CUDA_CUH__ \ No newline at end of file +#endif // __FLOAT_POWER_CUDA_CUH__ diff --git a/src/infiniop/ops/float_power/float_power.h b/src/infiniop/ops/float_power/float_power.h index bf61ac36d..63906fe53 100644 --- a/src/infiniop/ops/float_power/float_power.h +++ b/src/infiniop/ops/float_power/float_power.h @@ -2,51 +2,51 @@ #define __FLOAT_POWER_H__ #include "../../operator.h" -#include "info.h" +#include "info.h" // 宏定义:用于生成不同命名空间下的 Descriptor 类 -#define DESCRIPTOR(NAMESPACE) \ - namespace op::float_power::NAMESPACE { \ - class Descriptor final : public InfiniopDescriptor { \ - struct Opaque; \ - Opaque *_opaque; \ - FloatPowerInfo _info; \ - size_t _workspace_size; \ - \ - Descriptor( \ - Opaque *opaque, \ - FloatPowerInfo info, \ - size_t workspace_size, \ - infiniDevice_t device_type, \ - int device_id) \ - : InfiniopDescriptor{device_type, device_id}, \ - _opaque(opaque), \ - _info(info), \ - _workspace_size(workspace_size) {} \ - \ - public: \ - ~Descriptor(); \ - \ - size_t workspaceSize() const { return _workspace_size; } \ - \ - /* [修改] 增加 exponent 张量描述符 和 scalar_exponent */ \ - static infiniStatus_t create( \ - infiniopHandle_t handle, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t y, \ - infiniopTensorDescriptor_t x, \ - infiniopTensorDescriptor_t exponent, \ - float scalar_exponent); \ - \ - /* [修改] 增加 exponent 数据指针 */ \ - infiniStatus_t calculate( \ - void *workspace, \ - size_t workspace_size, \ - void *y, \ - const void *x, \ - const void *exponent, \ - void *stream) const; \ - }; \ +#define DESCRIPTOR(NAMESPACE) \ + namespace op::float_power::NAMESPACE { \ + class Descriptor final : public 
InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + FloatPowerInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + FloatPowerInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + /* [修改] 增加 exponent 张量描述符 和 scalar_exponent */ \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y, \ + infiniopTensorDescriptor_t x, \ + infiniopTensorDescriptor_t exponent, \ + float scalar_exponent); \ + \ + /* [修改] 增加 exponent 数据指针 */ \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *y, \ + const void *x, \ + const void *exponent, \ + void *stream) const; \ + }; \ } -#endif // __FLOAT_POWER_H__ \ No newline at end of file +#endif // __FLOAT_POWER_H__ diff --git a/src/infiniop/ops/float_power/info.h b/src/infiniop/ops/float_power/info.h index 46252b5dd..baae2cabe 100644 --- a/src/infiniop/ops/float_power/info.h +++ b/src/infiniop/ops/float_power/info.h @@ -11,13 +11,13 @@ class FloatPowerInfo { FloatPowerInfo() = default; public: - int _input_dtype; // 输入数据类型 - int _output_dtype; // 输出数据类型 - - bool _is_scalar_exponent;// 是否为标量指数 - float _scalar_exponent; // 标量指数的值 (仅当 _is_scalar_exponent 为 true 时有效) - - size_t _num_elements; // 元素总数 + int _input_dtype; // 输入数据类型 + int _output_dtype; // 输出数据类型 + + bool _is_scalar_exponent; // 是否为标量指数 + float _scalar_exponent; // 标量指数的值 (仅当 _is_scalar_exponent 为 true 时有效) + + size_t _num_elements; // 元素总数 // Getters int input_dtype() const { return _input_dtype; } @@ -28,21 +28,21 @@ class FloatPowerInfo { // 构造函数 FloatPowerInfo(int in_dtype, int out_dtype, bool is_scalar, float scalar_exp, size_t numel) - : _input_dtype(in_dtype), 
_output_dtype(out_dtype), - _is_scalar_exponent(is_scalar), _scalar_exponent(scalar_exp), + : _input_dtype(in_dtype), _output_dtype(out_dtype), + _is_scalar_exponent(is_scalar), _scalar_exponent(scalar_exp), _num_elements(numel) {} static utils::Result create( infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc, - infiniopTensorDescriptor_t exponent_desc, + infiniopTensorDescriptor_t exponent_desc, float scalar_exponent) { if (out_desc->ndim() != input_desc->ndim()) { return INFINI_STATUS_BAD_TENSOR_SHAPE; } // 使用引用接收 vector,避免之前的编译错误 - const auto& in_shape = input_desc->shape(); - const auto& out_shape = out_desc->shape(); + const auto &in_shape = input_desc->shape(); + const auto &out_shape = out_desc->shape(); size_t count = 1; for (size_t i = 0; i < input_desc->ndim(); ++i) { @@ -59,7 +59,7 @@ class FloatPowerInfo { if (exponent_desc->ndim() != input_desc->ndim()) { return INFINI_STATUS_BAD_TENSOR_SHAPE; } - const auto& exp_shape = exponent_desc->shape(); + const auto &exp_shape = exponent_desc->shape(); for (size_t i = 0; i < input_desc->ndim(); ++i) { if (exp_shape[i] != in_shape[i]) { return INFINI_STATUS_BAD_TENSOR_SHAPE; @@ -69,15 +69,15 @@ class FloatPowerInfo { // 构造 Info 对象 return utils::Result(FloatPowerInfo{ - input_desc->dtype(), // Input Dtype - out_desc->dtype(), // Output Dtype (分开存储) - is_scalar, // Mode flag - scalar_exponent, // Scalar Value - count // Total elements + input_desc->dtype(), // Input Dtype + out_desc->dtype(), // Output Dtype (分开存储) + is_scalar, // Mode flag + scalar_exponent, // Scalar Value + count // Total elements }); } }; } // namespace op::float_power -#endif // __FLOAT_POWER_INFO_H__ \ No newline at end of file +#endif // __FLOAT_POWER_INFO_H__ diff --git a/src/infiniop/ops/float_power/metax/float_power_metax.h b/src/infiniop/ops/float_power/metax/float_power_metax.h index dd8d08f54..5acf9d819 100644 --- a/src/infiniop/ops/float_power/metax/float_power_metax.h +++ 
b/src/infiniop/ops/float_power/metax/float_power_metax.h @@ -5,4 +5,4 @@ DESCRIPTOR(metax) -#endif // __FLOAT_POWER_METAX_API_H__ \ No newline at end of file +#endif // __FLOAT_POWER_METAX_API_H__ diff --git a/src/infiniop/ops/float_power/metax/float_power_metax.maca b/src/infiniop/ops/float_power/metax/float_power_metax.maca index 14c7e65dc..b398f16e7 100644 --- a/src/infiniop/ops/float_power/metax/float_power_metax.maca +++ b/src/infiniop/ops/float_power/metax/float_power_metax.maca @@ -1,16 +1,15 @@ -#include "float_power_metax.h" #include "../../../devices/metax/metax_common.h" #include "../../../devices/metax/metax_handle.h" -#include -#include +#include "float_power_metax.h" #include #include -#include +#include #include +#include +#include using nv_bfloat16 = __maca_bfloat16; using nv_bfloat162 = __maca_bfloat162; - namespace op::float_power::metax { // 基础定义: 向量化数据打包结构 @@ -32,18 +31,18 @@ struct FloatPowerFunctor { // Kernel 1: 通用处理 (Grid-Stride Loop) template __global__ void float_power_kernel( - T_OUT * __restrict__ output, - const T_IN * __restrict__ input, - const T_EXP * __restrict__ exponent, - float scalar_exponent, - bool is_scalar, - size_t numel, + T_OUT *__restrict__ output, + const T_IN *__restrict__ input, + const T_EXP *__restrict__ exponent, + float scalar_exponent, + bool is_scalar, + size_t numel, FloatPowerFunctor functor) { - - for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < numel; + + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < numel; idx += blockDim.x * gridDim.x) { - + float exp_val_f = is_scalar ? 
scalar_exponent : static_cast(exponent[idx]); output[idx] = static_cast(functor.compute(input[idx], exp_val_f)); } @@ -52,25 +51,25 @@ __global__ void float_power_kernel( // Kernel 2: 标量模式向量化 Kernel template __global__ void float_power_kernel_vectorized_scalar( - T_OUT * __restrict__ output, - const T_IN * __restrict__ input, - float scalar_exponent, - size_t num_packs, + T_OUT *__restrict__ output, + const T_IN *__restrict__ input, + float scalar_exponent, + size_t num_packs, FloatPowerFunctor functor) { - + using PackTypeIn = Pack; using PackTypeOut = Pack; - auto in_vec = reinterpret_cast(input); - auto out_vec = reinterpret_cast(output); - + auto in_vec = reinterpret_cast(input); + auto out_vec = reinterpret_cast(output); + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - + if (idx < num_packs) { PackTypeIn in_pack = in_vec[idx]; PackTypeOut out_pack; - - #pragma unroll + +#pragma unroll for (int i = 0; i < PackSize; ++i) { out_pack.val[i] = static_cast(functor.compute(in_pack.val[i], scalar_exponent)); } @@ -81,27 +80,27 @@ __global__ void float_power_kernel_vectorized_scalar( // Kernel 3: 张量模式向量化 Kernel template __global__ void float_power_kernel_vectorized_tensor( - T_OUT * __restrict__ output, - const T_IN * __restrict__ input, - const T_IN * __restrict__ exponent, - size_t num_packs, + T_OUT *__restrict__ output, + const T_IN *__restrict__ input, + const T_IN *__restrict__ exponent, + size_t num_packs, FloatPowerFunctor functor) { - + using PackTypeIn = Pack; using PackTypeOut = Pack; - auto in_vec = reinterpret_cast(input); - auto exp_vec = reinterpret_cast(exponent); - auto out_vec = reinterpret_cast(output); - + auto in_vec = reinterpret_cast(input); + auto exp_vec = reinterpret_cast(exponent); + auto out_vec = reinterpret_cast(output); + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - + if (idx < num_packs) { PackTypeIn in_pack = in_vec[idx]; - PackTypeIn exp_pack = exp_vec[idx]; + PackTypeIn exp_pack = exp_vec[idx]; PackTypeOut out_pack; - 
- #pragma unroll + +#pragma unroll for (int i = 0; i < PackSize; ++i) { float e = static_cast(exp_pack.val[i]); out_pack.val[i] = static_cast(functor.compute(in_pack.val[i], e)); @@ -123,9 +122,9 @@ static inline bool is_aligned(const void *ptr, size_t alignment) { // Launcher Implementation template void launch_kernel( - void *output, - const void *input, - const void *exponent, + void *output, + const void *input, + const void *exponent, const FloatPowerInfo &info, void *stream) { @@ -144,17 +143,13 @@ void launch_kernel( // ------------------------------------------------------------------ // 向量化分发路径 // ------------------------------------------------------------------ - constexpr int AlignBytes = 16; + constexpr int AlignBytes = 16; constexpr int PackSizeIn = AlignBytes / sizeof(T_IN); // 检查输入输出类型大小是否一致 bool types_same_size = (sizeof(T_IN) == sizeof(T_OUT)); - bool can_vectorize_base = types_same_size && - (PackSizeIn > 1) && - (numel % PackSizeIn == 0) && - is_aligned(input, AlignBytes) && - is_aligned(output, AlignBytes); + bool can_vectorize_base = types_same_size && (PackSizeIn > 1) && (numel % PackSizeIn == 0) && is_aligned(input, AlignBytes) && is_aligned(output, AlignBytes); if (can_vectorize_base) { size_t num_packs = numel / PackSizeIn; @@ -165,15 +160,13 @@ void launch_kernel( // 路径 A1: 标量指数向量化 float_power_kernel_vectorized_scalar <<>>( - out_ptr, in_ptr, scalar_exp, num_packs, functor - ); + out_ptr, in_ptr, scalar_exp, num_packs, functor); return; } else if (is_aligned(exponent, AlignBytes)) { // 路径 A2: 张量指数向量化 float_power_kernel_vectorized_tensor <<>>( - out_ptr, in_ptr, exp_ptr, num_packs, functor - ); + out_ptr, in_ptr, exp_ptr, num_packs, functor); return; } } @@ -186,8 +179,7 @@ void launch_kernel( float_power_kernel <<>>( - out_ptr, in_ptr, exp_ptr, scalar_exp, is_scalar, numel, functor - ); + out_ptr, in_ptr, exp_ptr, scalar_exp, is_scalar, numel, functor); } // ================================================================== @@ -195,18 
+187,24 @@ void launch_kernel( // ================================================================== struct Descriptor::Opaque {}; -Descriptor::~Descriptor() { if (_opaque) delete _opaque; } +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} infiniStatus_t Descriptor::create( infiniopHandle_t handle_, Descriptor **desc_ptr, - infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t y, infiniopTensorDescriptor_t x, - infiniopTensorDescriptor_t exponent, + infiniopTensorDescriptor_t exponent, float scalar_exponent) { auto handle = reinterpret_cast(handle_); auto info_result = FloatPowerInfo::create(y, x, exponent, scalar_exponent); - if (!info_result) return info_result.status(); + if (!info_result) { + return info_result.status(); + } size_t workspace_size = 0; *desc_ptr = new Descriptor(new Opaque(), info_result.take(), workspace_size, handle->device, handle->device_id); @@ -215,7 +213,7 @@ infiniStatus_t Descriptor::create( infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, void *output, - const void *input, const void *exponent, + const void *input, const void *exponent, void *stream) const { auto in_dtype = _info.input_dtype(); @@ -241,7 +239,8 @@ infiniStatus_t Descriptor::calculate( case INFINI_DTYPE_BF16: launch_kernel(output, input, exponent, _info, stream); break; - default: return INFINI_STATUS_BAD_TENSOR_DTYPE; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; } break; @@ -259,7 +258,8 @@ infiniStatus_t Descriptor::calculate( case INFINI_DTYPE_BF16: launch_kernel(output, input, exponent, _info, stream); break; - default: return INFINI_STATUS_BAD_TENSOR_DTYPE; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; } break; @@ -277,7 +277,8 @@ infiniStatus_t Descriptor::calculate( case INFINI_DTYPE_BF16: launch_kernel(output, input, exponent, _info, stream); break; - default: return INFINI_STATUS_BAD_TENSOR_DTYPE; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; } break; @@ -295,7 +296,8 @@ infiniStatus_t 
Descriptor::calculate( case INFINI_DTYPE_BF16: launch_kernel(output, input, exponent, _info, stream); break; - default: return INFINI_STATUS_BAD_TENSOR_DTYPE; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; } break; @@ -306,4 +308,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::float_power::metax \ No newline at end of file +} // namespace op::float_power::metax diff --git a/src/infiniop/ops/float_power/moore/float_power_moore.h b/src/infiniop/ops/float_power/moore/float_power_moore.h index 4f959fdf0..3028df383 100644 --- a/src/infiniop/ops/float_power/moore/float_power_moore.h +++ b/src/infiniop/ops/float_power/moore/float_power_moore.h @@ -5,4 +5,4 @@ DESCRIPTOR(moore) -#endif // __FLOAT_POWER_MOORE_H__ \ No newline at end of file +#endif // __FLOAT_POWER_MOORE_H__ diff --git a/src/infiniop/ops/float_power/moore/float_power_moore.mu b/src/infiniop/ops/float_power/moore/float_power_moore.mu index 721820018..f79403713 100644 --- a/src/infiniop/ops/float_power/moore/float_power_moore.mu +++ b/src/infiniop/ops/float_power/moore/float_power_moore.mu @@ -1,8 +1,8 @@ +#include "../../../devices/moore/moore_handle.h" #include "float_power_moore.h" #include "float_power_moore_kernel.h" -#include "../../../devices/moore/moore_handle.h" -#include #include +#include namespace op::float_power::moore { @@ -19,9 +19,9 @@ bool is_aligned(const void *ptr, size_t alignment) { // ================================================================== template void launch_kernel( - void *output, - const void *input, - const void *exponent, + void *output, + const void *input, + const void *exponent, const FloatPowerInfo &info, void *stream) { @@ -45,11 +45,7 @@ void launch_kernel( // 只有当输入和输出类型大小相同时,当前的 1:1 Pack 向量化逻辑才生效 bool types_same_size = (sizeof(T_IN) == sizeof(T_OUT)); - bool can_vectorize_base = types_same_size && - (PackSizeIn > 1) && - (numel % PackSizeIn == 0) && - is_aligned(input, AlignBytes) && - is_aligned(output, 
AlignBytes); + bool can_vectorize_base = types_same_size && (PackSizeIn > 1) && (numel % PackSizeIn == 0) && is_aligned(input, AlignBytes) && is_aligned(output, AlignBytes); if (can_vectorize_base) { size_t num_packs = numel / PackSizeIn; @@ -60,15 +56,13 @@ void launch_kernel( // 路径 A1: 标量指数向量化 op::float_power::moore::float_power_kernel_vectorized_scalar <<>>( - out_ptr, in_ptr, scalar_exp, num_packs, functor - ); + out_ptr, in_ptr, scalar_exp, num_packs, functor); return; } else if (is_aligned(exponent, AlignBytes)) { // 路径 A2: 张量指数向量化 op::float_power::moore::float_power_kernel_vectorized_tensor <<>>( - out_ptr, in_ptr, exp_ptr, num_packs, functor - ); + out_ptr, in_ptr, exp_ptr, num_packs, functor); return; } } @@ -81,8 +75,7 @@ void launch_kernel( op::float_power::moore::float_power_kernel <<>>( - out_ptr, in_ptr, exp_ptr, scalar_exp, is_scalar, numel, functor - ); + out_ptr, in_ptr, exp_ptr, scalar_exp, is_scalar, numel, functor); } // ================================================================== @@ -90,19 +83,25 @@ void launch_kernel( // ================================================================== struct Descriptor::Opaque {}; -Descriptor::~Descriptor() { if (_opaque) delete _opaque; } +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} infiniStatus_t Descriptor::create( infiniopHandle_t handle_, Descriptor **desc_ptr, - infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t y, infiniopTensorDescriptor_t x, - infiniopTensorDescriptor_t exponent, + infiniopTensorDescriptor_t exponent, float scalar_exponent) { auto handle = reinterpret_cast(handle_); auto info_result = FloatPowerInfo::create(y, x, exponent, scalar_exponent); - if (!info_result) return info_result.status(); + if (!info_result) { + return info_result.status(); + } size_t workspace_size = 0; *desc_ptr = new Descriptor(new Opaque(), info_result.take(), workspace_size, handle->device, handle->device_id); @@ -111,7 +110,7 @@ infiniStatus_t Descriptor::create( 
infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, void *output, - const void *input, const void *exponent, + const void *input, const void *exponent, void *stream) const { auto in_dtype = _info.input_dtype(); @@ -201,4 +200,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::float_power::moore \ No newline at end of file +} // namespace op::float_power::moore diff --git a/src/infiniop/ops/float_power/moore/float_power_moore_kernel.h b/src/infiniop/ops/float_power/moore/float_power_moore_kernel.h index d2ea6f33f..1782f82ee 100644 --- a/src/infiniop/ops/float_power/moore/float_power_moore_kernel.h +++ b/src/infiniop/ops/float_power/moore/float_power_moore_kernel.h @@ -1,10 +1,10 @@ #ifndef __FLOAT_POWER_MOORE_KERNEL_H__ #define __FLOAT_POWER_MOORE_KERNEL_H__ -#include -#include -#include #include +#include +#include +#include #include namespace op::float_power::moore { @@ -59,19 +59,19 @@ struct FloatPowerFunctor { // ================================================================== template __global__ void float_power_kernel( - T_OUT * __restrict__ output, - const T_IN * __restrict__ input, - const T_EXP * __restrict__ exponent, - float scalar_exponent, - bool is_scalar, - size_t numel, + T_OUT *__restrict__ output, + const T_IN *__restrict__ input, + const T_EXP *__restrict__ exponent, + float scalar_exponent, + bool is_scalar, + size_t numel, FloatPowerFunctor functor) { - + // Grid-Stride Loop - for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < numel; + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < numel; idx += blockDim.x * gridDim.x) { - + float exp_val_f = is_scalar ? 
scalar_exponent : to_float(exponent[idx]); output[idx] = from_float(functor.compute(input[idx], exp_val_f)); } @@ -82,25 +82,25 @@ __global__ void float_power_kernel( // ================================================================== template __global__ void float_power_kernel_vectorized_scalar( - T_OUT * __restrict__ output, - const T_IN * __restrict__ input, - float scalar_exponent, - size_t num_packs, + T_OUT *__restrict__ output, + const T_IN *__restrict__ input, + float scalar_exponent, + size_t num_packs, FloatPowerFunctor functor) { - + using PackTypeIn = Pack; using PackTypeOut = Pack; - auto in_vec = reinterpret_cast(input); - auto out_vec = reinterpret_cast(output); - + auto in_vec = reinterpret_cast(input); + auto out_vec = reinterpret_cast(output); + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - + if (idx < num_packs) { PackTypeIn in_pack = in_vec[idx]; PackTypeOut out_pack; - - #pragma unroll + +#pragma unroll for (int i = 0; i < PackSize; ++i) { out_pack.val[i] = from_float(functor.compute(in_pack.val[i], scalar_exponent)); } @@ -113,27 +113,27 @@ __global__ void float_power_kernel_vectorized_scalar( // ================================================================== template __global__ void float_power_kernel_vectorized_tensor( - T_OUT * __restrict__ output, - const T_IN * __restrict__ input, - const T_IN * __restrict__ exponent, - size_t num_packs, + T_OUT *__restrict__ output, + const T_IN *__restrict__ input, + const T_IN *__restrict__ exponent, + size_t num_packs, FloatPowerFunctor functor) { - + using PackTypeIn = Pack; using PackTypeOut = Pack; - auto in_vec = reinterpret_cast(input); - auto exp_vec = reinterpret_cast(exponent); - auto out_vec = reinterpret_cast(output); - + auto in_vec = reinterpret_cast(input); + auto exp_vec = reinterpret_cast(exponent); + auto out_vec = reinterpret_cast(output); + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - + if (idx < num_packs) { PackTypeIn in_pack = in_vec[idx]; - PackTypeIn 
exp_pack = exp_vec[idx]; + PackTypeIn exp_pack = exp_vec[idx]; PackTypeOut out_pack; - - #pragma unroll + +#pragma unroll for (int i = 0; i < PackSize; ++i) { float e = to_float(exp_pack.val[i]); out_pack.val[i] = from_float(functor.compute(in_pack.val[i], e)); @@ -144,4 +144,4 @@ __global__ void float_power_kernel_vectorized_tensor( } // namespace op::float_power::moore -#endif // __FLOAT_POWER_MOORE_KERNEL_H__ \ No newline at end of file +#endif // __FLOAT_POWER_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/float_power/nvidia/float_power_nvidia.cu b/src/infiniop/ops/float_power/nvidia/float_power_nvidia.cu index 24e57508a..548e37189 100644 --- a/src/infiniop/ops/float_power/nvidia/float_power_nvidia.cu +++ b/src/infiniop/ops/float_power/nvidia/float_power_nvidia.cu @@ -1,8 +1,8 @@ -#include "float_power_nvidia.cuh" -#include "../cuda/kernel.cuh" #include "../../../handle.h" -#include +#include "../cuda/kernel.cuh" +#include "float_power_nvidia.cuh" #include +#include namespace op::float_power::nvidia { @@ -15,9 +15,9 @@ bool is_aligned(const void *ptr, size_t alignment) { } template void launch_kernel( - void *output, - const void *input, - const void *exponent, + void *output, + const void *input, + const void *exponent, const FloatPowerInfo &info, void *stream) { @@ -40,11 +40,7 @@ void launch_kernel( constexpr int PackSizeIn = AlignBytes / sizeof(T_IN); bool types_same_size = (sizeof(T_IN) == sizeof(T_OUT)); - bool can_vectorize_base = types_same_size && - (PackSizeIn > 1) && - (numel % PackSizeIn == 0) && - is_aligned(input, AlignBytes) && - is_aligned(output, AlignBytes); + bool can_vectorize_base = types_same_size && (PackSizeIn > 1) && (numel % PackSizeIn == 0) && is_aligned(input, AlignBytes) && is_aligned(output, AlignBytes); if (can_vectorize_base) { size_t num_packs = numel / PackSizeIn; @@ -55,15 +51,13 @@ void launch_kernel( // 路径 A1: 标量指数向量化(极快) op::float_power::cuda::float_power_kernel_vectorized_scalar <<>>( - out_ptr, in_ptr, scalar_exp, 
num_packs, functor - ); + out_ptr, in_ptr, scalar_exp, num_packs, functor); return; } else if (is_aligned(exponent, AlignBytes)) { // 路径 A2: 张量指数向量化(解决 0.2x 倍速问题的核心) op::float_power::cuda::float_power_kernel_vectorized_tensor <<>>( - out_ptr, in_ptr, exp_ptr, num_packs, functor - ); + out_ptr, in_ptr, exp_ptr, num_packs, functor); return; } } @@ -77,8 +71,7 @@ void launch_kernel( op::float_power::cuda::float_power_kernel <<>>( - out_ptr, in_ptr, exp_ptr, scalar_exp, is_scalar, numel, functor - ); + out_ptr, in_ptr, exp_ptr, scalar_exp, is_scalar, numel, functor); } // ================================================================== @@ -86,17 +79,23 @@ void launch_kernel( // ================================================================== struct Descriptor::Opaque {}; -Descriptor::~Descriptor() { if (_opaque) delete _opaque; } +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} infiniStatus_t Descriptor::create( infiniopHandle_t handle, Descriptor **desc_ptr, - infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t y, infiniopTensorDescriptor_t x, - infiniopTensorDescriptor_t exponent, + infiniopTensorDescriptor_t exponent, float scalar_exponent) { auto info_result = FloatPowerInfo::create(y, x, exponent, scalar_exponent); - if (!info_result) return info_result.status(); + if (!info_result) { + return info_result.status(); + } size_t workspace_size = 0; *desc_ptr = new Descriptor(new Opaque(), info_result.take(), workspace_size, handle->device, handle->device_id); @@ -105,7 +104,7 @@ infiniStatus_t Descriptor::create( infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, void *output, - const void *input, const void *exponent, + const void *input, const void *exponent, void *stream) const { auto in_dtype = _info.input_dtype(); @@ -199,4 +198,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::float_power::nvidia \ No newline at end of file +} // namespace 
op::float_power::nvidia diff --git a/src/infiniop/ops/float_power/nvidia/float_power_nvidia.cuh b/src/infiniop/ops/float_power/nvidia/float_power_nvidia.cuh index cb170b339..071e2434a 100644 --- a/src/infiniop/ops/float_power/nvidia/float_power_nvidia.cuh +++ b/src/infiniop/ops/float_power/nvidia/float_power_nvidia.cuh @@ -4,4 +4,4 @@ #include "../float_power.h" DESCRIPTOR(nvidia) -#endif // __FLOAT_POWER_NVIDIA_CUH__ \ No newline at end of file +#endif // __FLOAT_POWER_NVIDIA_CUH__ diff --git a/src/infiniop/ops/float_power/operator.cc b/src/infiniop/ops/float_power/operator.cc index 428ecf0e5..87b3e248c 100644 --- a/src/infiniop/ops/float_power/operator.cc +++ b/src/infiniop/ops/float_power/operator.cc @@ -22,88 +22,88 @@ extern "C" { // ======================================================================= // 1. 创建算子描述符 // ======================================================================= -__C infiniStatus_t infiniopCreateFloatPowerDescriptor( +__INFINI_C infiniStatus_t infiniopCreateFloatPowerDescriptor( infiniopHandle_t handle, infiniopFloatPowerDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y, infiniopTensorDescriptor_t x, - infiniopTensorDescriptor_t exponent, - float scalar_exponent) { - - #define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::float_power::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y, \ - x, \ - exponent, \ - scalar_exponent) + infiniopTensorDescriptor_t exponent, + float scalar_exponent) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::float_power::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y, \ + x, \ + exponent, \ + scalar_exponent) switch (handle->device) { - #ifdef ENABLE_CPU_API +#ifdef ENABLE_CPU_API CREATE(INFINI_DEVICE_CPU, cpu); - #endif - #ifdef ENABLE_NVIDIA_API +#endif +#ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, nvidia); - #endif - #ifdef ENABLE_ILUVATAR_API +#endif +#ifdef ENABLE_ILUVATAR_API 
CREATE(INFINI_DEVICE_ILUVATAR, nvidia); - #endif - #ifdef ENABLE_QY_API +#endif +#ifdef ENABLE_QY_API CREATE(INFINI_DEVICE_QY, nvidia); - #endif - #ifdef ENABLE_METAX_API +#endif +#ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax); - #endif - #ifdef ENABLE_MOORE_API +#endif +#ifdef ENABLE_MOORE_API CREATE(INFINI_DEVICE_MOORE, moore); - #endif +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef CREATE +#undef CREATE } // ======================================================================= // 2. 获取 Workspace 大小 // ======================================================================= -__C infiniStatus_t infiniopGetFloatPowerWorkspaceSize(infiniopFloatPowerDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetFloatPowerWorkspaceSize(infiniopFloatPowerDescriptor_t desc, size_t *size) { - #define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS switch (desc->device_type) { - #ifdef ENABLE_CPU_API +#ifdef ENABLE_CPU_API GET(INFINI_DEVICE_CPU, cpu); - #endif - #ifdef ENABLE_NVIDIA_API +#endif +#ifdef ENABLE_NVIDIA_API GET(INFINI_DEVICE_NVIDIA, nvidia); - #endif - #ifdef ENABLE_ILUVATAR_API +#endif +#ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, nvidia); - #endif - #ifdef ENABLE_QY_API +#endif +#ifdef ENABLE_QY_API GET(INFINI_DEVICE_QY, nvidia); - #endif - #ifdef ENABLE_METAX_API +#endif +#ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax); - #endif - #ifdef ENABLE_MOORE_API +#endif +#ifdef ENABLE_MOORE_API GET(INFINI_DEVICE_MOORE, moore); - #endif +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef GET +#undef GET } // ======================================================================= // 3. 
执行计算 (Calculate) // ======================================================================= -__C infiniStatus_t infiniopFloatPower( +__INFINI_C infiniStatus_t infiniopFloatPower( infiniopFloatPowerDescriptor_t desc, void *workspace, size_t workspace_size, @@ -112,69 +112,69 @@ __C infiniStatus_t infiniopFloatPower( const void *exponent, // [新增参数] void *stream) { - #define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, x, exponent, stream) +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, exponent, stream) switch (desc->device_type) { - #ifdef ENABLE_CPU_API +#ifdef ENABLE_CPU_API CALCULATE(INFINI_DEVICE_CPU, cpu); - #endif - #ifdef ENABLE_NVIDIA_API +#endif +#ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); - #endif - #ifdef ENABLE_ILUVATAR_API +#endif +#ifdef ENABLE_ILUVATAR_API CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); - #endif - #ifdef ENABLE_QY_API +#endif +#ifdef ENABLE_QY_API CALCULATE(INFINI_DEVICE_QY, nvidia); - #endif - #ifdef ENABLE_METAX_API +#endif +#ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax); - #endif - #ifdef ENABLE_MOORE_API +#endif +#ifdef ENABLE_MOORE_API CALCULATE(INFINI_DEVICE_MOORE, moore); - #endif +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef CALCULATE +#undef CALCULATE } // ======================================================================= // 4. 
销毁描述符 // ======================================================================= -__C infiniStatus_t infiniopDestroyFloatPowerDescriptor(infiniopFloatPowerDescriptor_t desc) { +__INFINI_C infiniStatus_t infiniopDestroyFloatPowerDescriptor(infiniopFloatPowerDescriptor_t desc) { - #define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS switch (desc->device_type) { - #ifdef ENABLE_CPU_API +#ifdef ENABLE_CPU_API DELETE(INFINI_DEVICE_CPU, cpu); - #endif - #ifdef ENABLE_NVIDIA_API +#endif +#ifdef ENABLE_NVIDIA_API DELETE(INFINI_DEVICE_NVIDIA, nvidia); - #endif - #ifdef ENABLE_ILUVATAR_API +#endif +#ifdef ENABLE_ILUVATAR_API DELETE(INFINI_DEVICE_ILUVATAR, nvidia); - #endif - #ifdef ENABLE_QY_API +#endif +#ifdef ENABLE_QY_API DELETE(INFINI_DEVICE_QY, nvidia); - #endif - #ifdef ENABLE_METAX_API +#endif +#ifdef ENABLE_METAX_API DELETE(INFINI_DEVICE_METAX, metax); - #endif - #ifdef ENABLE_MOORE_API +#endif +#ifdef ENABLE_MOORE_API DELETE(INFINI_DEVICE_MOORE, moore); - #endif +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef DELETE +#undef DELETE } -} // extern "C" \ No newline at end of file +} // extern "C" diff --git a/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.cc b/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.cc index 147221a77..e36261593 100644 --- a/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.cc +++ b/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.cc @@ -55,4 +55,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::floor_divide::cpu \ No newline at end of file +} // namespace op::floor_divide::cpu diff --git a/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.h b/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.h index ec5fcfac1..ea828d297 100644 --- a/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.h 
+++ b/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.h @@ -1,9 +1,9 @@ #ifndef __FLOOR_DIVIDE_CPU_H__ #define __FLOOR_DIVIDE_CPU_H__ +#include "../../../elementwise/cpu/elementwise_cpu.h" #include #include -#include "../../../elementwise/cpu/elementwise_cpu.h" ELEMENTWISE_DESCRIPTOR(floor_divide, cpu) @@ -27,4 +27,4 @@ typedef struct FloorDivideOp { } FloorDivideOp; } // namespace op::floor_divide::cpu -#endif // __FLOOR_DIVIDE_CPU_H__ \ No newline at end of file +#endif // __FLOOR_DIVIDE_CPU_H__ diff --git a/src/infiniop/ops/floor_divide/cuda/kernel.cuh b/src/infiniop/ops/floor_divide/cuda/kernel.cuh index 9f77280f1..4ca291abb 100644 --- a/src/infiniop/ops/floor_divide/cuda/kernel.cuh +++ b/src/infiniop/ops/floor_divide/cuda/kernel.cuh @@ -33,4 +33,4 @@ public: } FloorDivideOp; } // namespace op::floor_divide::cuda -#endif // __FLOOR_DIVIDE_CUDA_H__ \ No newline at end of file +#endif // __FLOOR_DIVIDE_CUDA_H__ diff --git a/src/infiniop/ops/floor_divide/metax/floor_divide_metax.h b/src/infiniop/ops/floor_divide/metax/floor_divide_metax.h index d77b7af90..fa29a8b7c 100644 --- a/src/infiniop/ops/floor_divide/metax/floor_divide_metax.h +++ b/src/infiniop/ops/floor_divide/metax/floor_divide_metax.h @@ -5,4 +5,4 @@ ELEMENTWISE_DESCRIPTOR(floor_divide, metax) -#endif // __FLOOR_DIVIDE_METAX_API_H__ \ No newline at end of file +#endif // __FLOOR_DIVIDE_METAX_API_H__ diff --git a/src/infiniop/ops/floor_divide/metax/floor_divide_metax.maca b/src/infiniop/ops/floor_divide/metax/floor_divide_metax.maca index 078b63690..bae206211 100644 --- a/src/infiniop/ops/floor_divide/metax/floor_divide_metax.maca +++ b/src/infiniop/ops/floor_divide/metax/floor_divide_metax.maca @@ -1,8 +1,8 @@ -#include "floor_divide_metax.h" #include "../../../elementwise/metax/elementwise_metax.h" -#include -#include +#include "floor_divide_metax.h" #include +#include +#include using nv_bfloat16 = __maca_bfloat16; using nv_bfloat162 = __maca_bfloat162; @@ -24,20 +24,20 @@ struct FloorDivideOp { 
res.x = floorf(fa.x / fb.x); res.y = floorf(fa.y / fb.y); return __float22half2_rn(res); - } + } // ------------------------------------------------ // 2. Half 标量 // ------------------------------------------------ else if constexpr (std::is_same_v) { return __float2half(floorf(__half2float(a) / __half2float(b))); - } + } // ------------------------------------------------ // 3. BFloat16 // ------------------------------------------------ else if constexpr (std::is_same_v) { float val = __bfloat162float(a) / __bfloat162float(b); return __float2bfloat16(floorf(val)); - } + } // ------------------------------------------------ // 4. Float / Double // ------------------------------------------------ @@ -45,7 +45,7 @@ struct FloorDivideOp { return floorf(a / b); } else if constexpr (std::is_same_v) { return floor(a / b); - } + } // ------------------------------------------------ // 5. 整数类型 (Int32 / Int64) // ------------------------------------------------ @@ -121,4 +121,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::floor_divide::metax \ No newline at end of file +} // namespace op::floor_divide::metax diff --git a/src/infiniop/ops/floor_divide/moore/floor_divide_moore.h b/src/infiniop/ops/floor_divide/moore/floor_divide_moore.h index e14c09e2e..c87f8dd63 100644 --- a/src/infiniop/ops/floor_divide/moore/floor_divide_moore.h +++ b/src/infiniop/ops/floor_divide/moore/floor_divide_moore.h @@ -5,4 +5,4 @@ ELEMENTWISE_DESCRIPTOR(floor_divide, moore) -#endif // __FLOOR_DIVIDE_MOORE_API_H__ \ No newline at end of file +#endif // __FLOOR_DIVIDE_MOORE_API_H__ diff --git a/src/infiniop/ops/floor_divide/moore/floor_divide_moore.mu b/src/infiniop/ops/floor_divide/moore/floor_divide_moore.mu index f5fce2b6f..f707d0ac0 100644 --- a/src/infiniop/ops/floor_divide/moore/floor_divide_moore.mu +++ b/src/infiniop/ops/floor_divide/moore/floor_divide_moore.mu @@ -1,7 +1,7 @@ +#include "../../../devices/moore/moore_handle.h" #include 
"../../../elementwise/moore/elementwise_moore.h" #include "floor_divide_moore.h" #include "floor_divide_moore_kernel.h" -#include "../../../devices/moore/moore_handle.h" namespace op::floor_divide::moore { @@ -66,4 +66,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::floor_divide::moore \ No newline at end of file +} // namespace op::floor_divide::moore diff --git a/src/infiniop/ops/floor_divide/moore/floor_divide_moore_kernel.h b/src/infiniop/ops/floor_divide/moore/floor_divide_moore_kernel.h index c911cbdfb..5ea4a4ced 100644 --- a/src/infiniop/ops/floor_divide/moore/floor_divide_moore_kernel.h +++ b/src/infiniop/ops/floor_divide/moore/floor_divide_moore_kernel.h @@ -1,10 +1,10 @@ #ifndef __FLOOR_DIVIDE_MOORE_H__ #define __FLOOR_DIVIDE_MOORE_H__ -#include +#include #include +#include #include -#include namespace op::floor_divide::moore { typedef struct FloorDivideOp { @@ -36,4 +36,4 @@ typedef struct FloorDivideOp { } FloorDivideOp; } // namespace op::floor_divide::moore -#endif // __FLOOR_DIVIDE_MOORE_H__ \ No newline at end of file +#endif // __FLOOR_DIVIDE_MOORE_H__ diff --git a/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cu b/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cu index 830fe3b05..15206c9d3 100644 --- a/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cu +++ b/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cu @@ -62,4 +62,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::floor_divide::nvidia \ No newline at end of file +} // namespace op::floor_divide::nvidia diff --git a/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cuh b/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cuh index 684c6d189..1c70343cf 100644 --- a/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cuh +++ b/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cuh @@ -5,4 +5,4 @@ ELEMENTWISE_DESCRIPTOR(floor_divide, nvidia) 
-#endif // __FLOOR_DIVIDE_CUDA_API_H__ \ No newline at end of file +#endif // __FLOOR_DIVIDE_CUDA_API_H__ diff --git a/src/infiniop/ops/floor_divide/operator.cc b/src/infiniop/ops/floor_divide/operator.cc index 320af088f..a5fced100 100644 --- a/src/infiniop/ops/floor_divide/operator.cc +++ b/src/infiniop/ops/floor_divide/operator.cc @@ -21,20 +21,20 @@ #include "moore/floor_divide_moore.h" #endif -__C infiniStatus_t infiniopCreateFloorDivideDescriptor( +__INFINI_C infiniStatus_t infiniopCreateFloorDivideDescriptor( infiniopHandle_t handle, infiniopFloorDivideDescriptor_t *desc_ptr, infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::floor_divide::NAMESPACE::Descriptor::create( \ - handle, \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::floor_divide::NAMESPACE::Descriptor::create( \ + handle, \ reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ + c_desc, \ + {a_desc, \ b_desc}) switch (handle->device) { @@ -71,10 +71,10 @@ __C infiniStatus_t infiniopCreateFloorDivideDescriptor( #undef CREATE } -__C infiniStatus_t infiniopGetFloorDivideWorkspaceSize(infiniopFloorDivideDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetFloorDivideWorkspaceSize(infiniopFloorDivideDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS @@ -111,7 +111,7 @@ __C infiniStatus_t infiniopGetFloorDivideWorkspaceSize(infiniopFloorDivideDescri return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopFloorDivide( +__INFINI_C infiniStatus_t infiniopFloorDivide( infiniopFloorDivideDescriptor_t desc, void *workspace, size_t workspace_size, @@ -120,8 +120,8 @@ __C infiniStatus_t infiniopFloorDivide( const void *b, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - 
case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, c, {a, b}, stream) @@ -159,11 +159,11 @@ __C infiniStatus_t infiniopFloorDivide( #undef CALCULATE } -__C infiniStatus_t +__INFINI_C infiniStatus_t infiniopDestroyFloorDivideDescriptor(infiniopFloorDivideDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS @@ -199,4 +199,4 @@ infiniopDestroyFloorDivideDescriptor(infiniopFloorDivideDescriptor_t desc) { } #undef DELETE -} \ No newline at end of file +} diff --git a/src/infiniop/ops/multi_margin_loss/cpu/multi_margin_loss_cpu.cc b/src/infiniop/ops/multi_margin_loss/cpu/multi_margin_loss_cpu.cc index 4e3f6d4d6..0c921a1d4 100644 --- a/src/infiniop/ops/multi_margin_loss/cpu/multi_margin_loss_cpu.cc +++ b/src/infiniop/ops/multi_margin_loss/cpu/multi_margin_loss_cpu.cc @@ -2,8 +2,8 @@ #include "../../../devices/cpu/common_cpu.h" #include #include -#include #include +#include #include "../../../../utils/custom_types.h" @@ -30,17 +30,16 @@ infiniStatus_t Descriptor::create( int reduction) { auto handle = reinterpret_cast(handle_); - + auto result = MultiMarginLossInfo::create(out_desc, input_desc, target_desc, weight_desc, p, margin, reduction); CHECK_RESULT(result); *desc_ptr = new Descriptor( new Opaque(), result.take(), - 0, - handle->device, - handle->device_id - ); + 0, + handle->device, + handle->device_id); return INFINI_STATUS_SUCCESS; } @@ -66,21 +65,23 @@ void calculate_cpu_impl( auto weight_ptr = reinterpret_cast(weight); if (reduction == 0) { - #pragma omp parallel for schedule(static) +#pragma omp parallel for schedule(static) for (size_t n = 0; n < N; ++n) { int64_t target_idx = tar_ptr[n]; - + if (target_idx < 0 || target_idx >= static_cast(C)) { out_ptr[n] = utils::cast(0.0f); continue; } - const T* row_ptr = in_ptr + n * C; + const T *row_ptr = 
in_ptr + n * C; float target_score = utils::cast(row_ptr[target_idx]); float sum_loss = 0.0f; for (size_t c = 0; c < C; ++c) { - if (c == static_cast(target_idx)) continue; + if (c == static_cast(target_idx)) { + continue; + } float other_score = utils::cast(row_ptr[c]); float diff = margin - target_score + other_score; @@ -102,18 +103,22 @@ void calculate_cpu_impl( } else { double total_loss = 0.0; - #pragma omp parallel for reduction(+:total_loss) schedule(static) +#pragma omp parallel for reduction(+ : total_loss) schedule(static) for (size_t n = 0; n < N; ++n) { int64_t target_idx = tar_ptr[n]; - if (target_idx < 0 || target_idx >= static_cast(C)) continue; + if (target_idx < 0 || target_idx >= static_cast(C)) { + continue; + } - const T* row_ptr = in_ptr + n * C; + const T *row_ptr = in_ptr + n * C; float target_score = utils::cast(row_ptr[target_idx]); float sum_sample_loss = 0.0f; for (size_t c = 0; c < C; ++c) { - if (c == static_cast(target_idx)) continue; + if (c == static_cast(target_idx)) { + continue; + } float other_score = utils::cast(row_ptr[c]); float diff = margin - target_score + other_score; @@ -133,7 +138,7 @@ void calculate_cpu_impl( total_loss += static_cast(sum_sample_loss); } - if (reduction == 1) { + if (reduction == 1) { total_loss /= static_cast(N); } @@ -172,4 +177,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::multi_margin_loss::cpu \ No newline at end of file +} // namespace op::multi_margin_loss::cpu diff --git a/src/infiniop/ops/multi_margin_loss/cpu/multi_margin_loss_cpu.h b/src/infiniop/ops/multi_margin_loss/cpu/multi_margin_loss_cpu.h index 39098ff7d..f009b41b8 100644 --- a/src/infiniop/ops/multi_margin_loss/cpu/multi_margin_loss_cpu.h +++ b/src/infiniop/ops/multi_margin_loss/cpu/multi_margin_loss_cpu.h @@ -5,4 +5,4 @@ DESCRIPTOR(cpu) -#endif // __MULTI_MARGIN_LOSS_CPU_H__ \ No newline at end of file +#endif // __MULTI_MARGIN_LOSS_CPU_H__ diff --git 
a/src/infiniop/ops/multi_margin_loss/cuda/kernel.cuh b/src/infiniop/ops/multi_margin_loss/cuda/kernel.cuh index 7261a99a4..2f372d8d6 100644 --- a/src/infiniop/ops/multi_margin_loss/cuda/kernel.cuh +++ b/src/infiniop/ops/multi_margin_loss/cuda/kernel.cuh @@ -1,12 +1,12 @@ #ifndef __MULTI_MARGIN_LOSS_CUDA_CUH__ #define __MULTI_MARGIN_LOSS_CUDA_CUH__ -#include -#include #include +#include +#include #include -#include +#include namespace op::multi_margin_loss::cuda { template @@ -19,8 +19,9 @@ struct alignas(sizeof(T) * N) Pack { // ================================================================== __device__ __forceinline__ float warpReduceSum(float val) { unsigned int mask = 0xffffffff; - for (int offset = warpSize / 2; offset > 0; offset /= 2) + for (int offset = warpSize / 2; offset > 0; offset /= 2) { val += __shfl_down_sync(mask, val, offset); + } return val; } @@ -30,12 +31,16 @@ __device__ __forceinline__ float blockReduceSum(float val) { int wid = threadIdx.x / warpSize; val = warpReduceSum(val); - if (lane == 0) shared[wid] = val; + if (lane == 0) { + shared[wid] = val; + } __syncthreads(); // 假设 BlockDim 也是 32 的倍数 val = (threadIdx.x < blockDim.x / warpSize) ? 
shared[lane] : 0.0f; - if (wid == 0) val = warpReduceSum(val); + if (wid == 0) { + val = warpReduceSum(val); + } return val; } @@ -46,7 +51,7 @@ struct MultiMarginLossFunctor { int p; float margin; - __host__ __device__ MultiMarginLossFunctor(int p_val, float margin_val) + __host__ __device__ MultiMarginLossFunctor(int p_val, float margin_val) : p(p_val), margin(margin_val) {} // 计算单个 class c 的 loss 分量 @@ -60,19 +65,19 @@ struct MultiMarginLossFunctor { }; template __global__ void multi_margin_loss_kernel( - T * __restrict__ output, // [N] - const T * __restrict__ input, // [N, C] - const int64_t * __restrict__ target, // [N] - const T * __restrict__ weight, // [C] (Optional) + T *__restrict__ output, // [N] + const T *__restrict__ input, // [N, C] + const int64_t *__restrict__ target, // [N] + const T *__restrict__ weight, // [C] (Optional) size_t N, size_t C, MultiMarginLossFunctor functor) { size_t n = blockIdx.x * blockDim.x + threadIdx.x; - + if (n < N) { int64_t target_idx = target[n]; - + // 越界检查 if (target_idx < 0 || target_idx >= static_cast(C)) { output[n] = static_cast(0.0f); @@ -80,13 +85,15 @@ __global__ void multi_margin_loss_kernel( } // 定位当前行的起始位置 - const T* row_ptr = input + n * C; + const T *row_ptr = input + n * C; float target_score = static_cast(row_ptr[target_idx]); float sum_loss = 0.0f; // 遍历所有类别 for (size_t c = 0; c < C; ++c) { - if (c == static_cast(target_idx)) continue; + if (c == static_cast(target_idx)) { + continue; + } float other_score = static_cast(row_ptr[c]); float diff = functor.margin - target_score + other_score; @@ -107,10 +114,10 @@ __global__ void multi_margin_loss_kernel( } template __global__ void multi_margin_loss_reduce_kernel( - float * output, // [1] Accumulator (Float) - const T * __restrict__ input, // [N, C] - const int64_t * __restrict__ target, // [N] - const T * __restrict__ weight, // [C] + float *output, // [1] Accumulator (Float) + const T *__restrict__ input, // [N, C] + const int64_t *__restrict__ target, 
// [N] + const T *__restrict__ weight, // [C] size_t N, size_t C, MultiMarginLossFunctor functor, @@ -123,20 +130,22 @@ __global__ void multi_margin_loss_reduce_kernel( // Grid-Stride Loop over Batch Dimension N for (size_t n = idx; n < N; n += stride) { int64_t target_idx = target[n]; - + if (target_idx >= 0 && target_idx < static_cast(C)) { - const T* row_ptr = input + n * C; + const T *row_ptr = input + n * C; float target_score = static_cast(row_ptr[target_idx]); float sample_loss = 0.0f; for (size_t c = 0; c < C; ++c) { - if (c == static_cast(target_idx)) continue; + if (c == static_cast(target_idx)) { + continue; + } float other_score = static_cast(row_ptr[c]); float diff = functor.margin - target_score + other_score; sample_loss += functor.compute(diff); } - + sample_loss /= static_cast(C); if (weight != nullptr) { @@ -157,10 +166,10 @@ __global__ void multi_margin_loss_reduce_kernel( } } template -__global__ void cast_float_to_t(T* output, const float* src) { +__global__ void cast_float_to_t(T *output, const float *src) { *output = static_cast(*src); } } // namespace op::multi_margin_loss::cuda -#endif // __MULTI_MARGIN_LOSS_CUDA_CUH__ \ No newline at end of file +#endif // __MULTI_MARGIN_LOSS_CUDA_CUH__ diff --git a/src/infiniop/ops/multi_margin_loss/info.h b/src/infiniop/ops/multi_margin_loss/info.h index d460639c1..d8900063c 100644 --- a/src/infiniop/ops/multi_margin_loss/info.h +++ b/src/infiniop/ops/multi_margin_loss/info.h @@ -29,7 +29,7 @@ class MultiMarginLossInfo { // 构造函数 MultiMarginLossInfo(int dtype, int p, float margin, int reduction, bool has_weight, size_t batch, size_t classes) - : _dtype(dtype), _p(p), _margin(margin), _reduction(reduction), + : _dtype(dtype), _p(p), _margin(margin), _reduction(reduction), _has_weight(has_weight), _batch_size(batch), _num_classes(classes) {} static utils::Result create( @@ -44,7 +44,7 @@ class MultiMarginLossInfo { // 1. 
检查输入形状 (Input vs Target) // Input: (N, C), Target: (N) if (input_desc->ndim() != 2) { - return INFINI_STATUS_BAD_TENSOR_SHAPE; + return INFINI_STATUS_BAD_TENSOR_SHAPE; } if (target_desc->ndim() != 1) { return INFINI_STATUS_BAD_TENSOR_SHAPE; @@ -85,7 +85,7 @@ class MultiMarginLossInfo { } } if (p != 1 && p != 2) { - return INFINI_STATUS_BAD_PARAM; + return INFINI_STATUS_BAD_PARAM; } return utils::Result(MultiMarginLossInfo{ input_desc->dtype(), // _dtype @@ -101,4 +101,4 @@ class MultiMarginLossInfo { } // namespace op::multi_margin_loss -#endif // __MULTI_MARGIN_LOSS_INFO_H__ \ No newline at end of file +#endif // __MULTI_MARGIN_LOSS_INFO_H__ diff --git a/src/infiniop/ops/multi_margin_loss/metax/multi_margin_loss_metax.h b/src/infiniop/ops/multi_margin_loss/metax/multi_margin_loss_metax.h index c7b3043cd..40624bebd 100644 --- a/src/infiniop/ops/multi_margin_loss/metax/multi_margin_loss_metax.h +++ b/src/infiniop/ops/multi_margin_loss/metax/multi_margin_loss_metax.h @@ -5,4 +5,4 @@ DESCRIPTOR(metax) -#endif // __MULTI_MARGIN_LOSS_METAX_API_H__ \ No newline at end of file +#endif // __MULTI_MARGIN_LOSS_METAX_API_H__ diff --git a/src/infiniop/ops/multi_margin_loss/metax/multi_margin_loss_metax.maca b/src/infiniop/ops/multi_margin_loss/metax/multi_margin_loss_metax.maca index a5e133e7f..0dd239fd8 100644 --- a/src/infiniop/ops/multi_margin_loss/metax/multi_margin_loss_metax.maca +++ b/src/infiniop/ops/multi_margin_loss/metax/multi_margin_loss_metax.maca @@ -1,20 +1,20 @@ -#include "multi_margin_loss_metax.h" #include "../../../devices/metax/metax_common.h" #include "../../../devices/metax/metax_handle.h" -#include -#include +#include "multi_margin_loss_metax.h" #include -#include #include +#include +#include +#include // ================================================================== // 1. 
MACA 类型兼容 // ================================================================== #if defined(__MACA__) || defined(__MACACC__) - #include - #include - using nv_bfloat16 = __maca_bfloat16; - using nv_bfloat162 = __maca_bfloat162; +#include +#include +using nv_bfloat16 = __maca_bfloat16; +using nv_bfloat162 = __maca_bfloat162; #endif namespace op::multi_margin_loss::metax { @@ -28,7 +28,7 @@ struct MultiMarginLossFunctor { int p; float margin; - __host__ __device__ MultiMarginLossFunctor(int p_val, float margin_val) + __host__ __device__ MultiMarginLossFunctor(int p_val, float margin_val) : p(p_val), margin(margin_val) {} // 计算单个 class c 的 loss 分量 @@ -45,31 +45,33 @@ struct MultiMarginLossFunctor { // ------------------------------------------------------------------ template __global__ void multi_margin_loss_kernel( - T * __restrict__ output, // [N] - const T * __restrict__ input, // [N, C] - const int64_t * __restrict__ target, // [N] - const T * __restrict__ weight, // [C] (Optional) + T *__restrict__ output, // [N] + const T *__restrict__ input, // [N, C] + const int64_t *__restrict__ target, // [N] + const T *__restrict__ weight, // [C] (Optional) size_t N, size_t C, MultiMarginLossFunctor functor) { size_t n = blockIdx.x * blockDim.x + threadIdx.x; - + if (n < N) { int64_t target_idx = target[n]; - + // 越界检查 if (target_idx < 0 || target_idx >= static_cast(C)) { output[n] = static_cast(0.0f); return; } - const T* row_ptr = input + n * C; + const T *row_ptr = input + n * C; float target_score = static_cast(row_ptr[target_idx]); float sum_loss = 0.0f; for (size_t c = 0; c < C; ++c) { - if (c == static_cast(target_idx)) continue; + if (c == static_cast(target_idx)) { + continue; + } float other_score = static_cast(row_ptr[c]); float diff = functor.margin - target_score + other_score; @@ -92,10 +94,10 @@ __global__ void multi_margin_loss_kernel( // ------------------------------------------------------------------ template __global__ void 
multi_margin_loss_reduce_kernel( - float * output, // [1] Accumulator (Float) - const T * __restrict__ input, // [N, C] - const int64_t * __restrict__ target, // [N] - const T * __restrict__ weight, // [C] + float *output, // [1] Accumulator (Float) + const T *__restrict__ input, // [N, C] + const int64_t *__restrict__ target, // [N] + const T *__restrict__ weight, // [C] size_t N, size_t C, MultiMarginLossFunctor functor, @@ -112,20 +114,22 @@ __global__ void multi_margin_loss_reduce_kernel( // 1. Grid-Stride Loop: 计算当前线程负责的所有样本的 Loss 总和 for (size_t n = idx; n < N; n += stride) { int64_t target_idx = target[n]; - + if (target_idx >= 0 && target_idx < static_cast(C)) { - const T* row_ptr = input + n * C; + const T *row_ptr = input + n * C; float target_score = static_cast(row_ptr[target_idx]); float sample_loss = 0.0f; for (size_t c = 0; c < C; ++c) { - if (c == static_cast(target_idx)) continue; + if (c == static_cast(target_idx)) { + continue; + } float other_score = static_cast(row_ptr[c]); float diff = functor.margin - target_score + other_score; sample_loss += functor.compute(diff); } - + sample_loss /= static_cast(C); if (weight != nullptr) { @@ -146,14 +150,38 @@ __global__ void multi_margin_loss_reduce_kernel( __syncthreads(); // 3. 
Block 内树形归约 (Unrolled Tree Reduction) - if (tid < 128) { shared_mem[tid] += shared_mem[tid + 128]; } __syncthreads(); - if (tid < 64) { shared_mem[tid] += shared_mem[tid + 64]; } __syncthreads(); - if (tid < 32) { shared_mem[tid] += shared_mem[tid + 32]; } __syncthreads(); - if (tid < 16) { shared_mem[tid] += shared_mem[tid + 16]; } __syncthreads(); - if (tid < 8) { shared_mem[tid] += shared_mem[tid + 8]; } __syncthreads(); - if (tid < 4) { shared_mem[tid] += shared_mem[tid + 4]; } __syncthreads(); - if (tid < 2) { shared_mem[tid] += shared_mem[tid + 2]; } __syncthreads(); - if (tid < 1) { shared_mem[tid] += shared_mem[tid + 1]; } __syncthreads(); + if (tid < 128) { + shared_mem[tid] += shared_mem[tid + 128]; + } + __syncthreads(); + if (tid < 64) { + shared_mem[tid] += shared_mem[tid + 64]; + } + __syncthreads(); + if (tid < 32) { + shared_mem[tid] += shared_mem[tid + 32]; + } + __syncthreads(); + if (tid < 16) { + shared_mem[tid] += shared_mem[tid + 16]; + } + __syncthreads(); + if (tid < 8) { + shared_mem[tid] += shared_mem[tid + 8]; + } + __syncthreads(); + if (tid < 4) { + shared_mem[tid] += shared_mem[tid + 4]; + } + __syncthreads(); + if (tid < 2) { + shared_mem[tid] += shared_mem[tid + 2]; + } + __syncthreads(); + if (tid < 1) { + shared_mem[tid] += shared_mem[tid + 1]; + } + __syncthreads(); // 4. 
将 Block 的结果原子累加到全局内存 if (tid == 0) { @@ -164,7 +192,7 @@ __global__ void multi_margin_loss_reduce_kernel( // Kernel 3: 类型转换 (Float -> T) template -__global__ void cast_float_to_t(T* output, const float* src) { +__global__ void cast_float_to_t(T *output, const float *src) { *output = static_cast(*src); } @@ -173,25 +201,25 @@ __global__ void cast_float_to_t(T* output, const float* src) { // ================================================================== template void launch_kernel( - void *output, - const void *input, - const void *target, + void *output, + const void *input, + const void *target, const void *weight, - void* workspace, - const MultiMarginLossInfo& info, + void *workspace, + const MultiMarginLossInfo &info, void *stream) { auto in_ptr = reinterpret_cast(input); auto out_ptr = reinterpret_cast(output); auto tar_ptr = reinterpret_cast(target); auto w_ptr = (weight != nullptr) ? reinterpret_cast(weight) : nullptr; - + auto mc_stream = reinterpret_cast(stream); - + size_t N = info.batch_size(); size_t C = info.num_classes(); int reduction = info.reduction(); - + MultiMarginLossFunctor functor(info.p(), info.margin()); // ------------------------------------------ @@ -200,32 +228,32 @@ void launch_kernel( if (reduction == 0) { size_t block_size = 256; size_t grid_size = (N + block_size - 1) / block_size; - + multi_margin_loss_kernel <<>>( - out_ptr, in_ptr, tar_ptr, w_ptr, N, C, functor - ); - } + out_ptr, in_ptr, tar_ptr, w_ptr, N, C, functor); + } // ------------------------------------------ // Mode 2: Reduction (Mean / Sum) // ------------------------------------------ else { // 使用 workspace 作为临时的 float 累加器 - float* acc_ptr = reinterpret_cast(workspace); + float *acc_ptr = reinterpret_cast(workspace); mcMemsetAsync(acc_ptr, 0, sizeof(float), mc_stream); - + float scale = (reduction == 1) ? 
(1.0f / static_cast(N)) : 1.0f; // 1=Mean, 2=Sum - + // 强制 Block Size 为 256 以匹配 Kernel 内的手写归约逻辑 size_t block_size = 256; size_t grid_size = std::min((N + block_size - 1) / block_size, static_cast(1024)); - if (grid_size == 0) grid_size = 1; + if (grid_size == 0) { + grid_size = 1; + } multi_margin_loss_reduce_kernel <<>>( - acc_ptr, in_ptr, tar_ptr, w_ptr, N, C, functor, scale - ); - + acc_ptr, in_ptr, tar_ptr, w_ptr, N, C, functor, scale); + // 将 float 结果转回目标类型 T cast_float_to_t <<<1, 1, 0, mc_stream>>>(out_ptr, acc_ptr); @@ -237,24 +265,28 @@ void launch_kernel( // ================================================================== struct Descriptor::Opaque {}; -Descriptor::~Descriptor() { - if (_opaque) delete _opaque; +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } } infiniStatus_t Descriptor::create( infiniopHandle_t handle_, Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t target_desc, infiniopTensorDescriptor_t weight_desc, - int p, - float margin, + int p, + float margin, int reduction) { auto handle = reinterpret_cast(handle_); auto info_result = MultiMarginLossInfo::create(out_desc, input_desc, target_desc, weight_desc, p, margin, reduction); - if (!info_result) return info_result.status(); - + if (!info_result) { + return info_result.status(); + } + // 如果需要归约,申请 4 字节 workspace 用于 atomicAdd size_t workspace_size = 0; if (reduction != 0) { @@ -266,11 +298,11 @@ infiniStatus_t Descriptor::create( } infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, + void *workspace, + size_t workspace_size, void *output, - const void *input, - const void *target, + const void *input, + const void *target, const void *weight, void *stream) const { @@ -301,4 +333,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace 
op::multi_margin_loss::metax \ No newline at end of file +} // namespace op::multi_margin_loss::metax diff --git a/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore.h b/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore.h index 0f926a971..1d88c5a38 100644 --- a/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore.h +++ b/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore.h @@ -5,4 +5,4 @@ DESCRIPTOR(moore) -#endif // __MULTI_MARGIN_LOSS_MOORE_H__ \ No newline at end of file +#endif // __MULTI_MARGIN_LOSS_MOORE_H__ diff --git a/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore.mu b/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore.mu index 0bb529dc4..0b4b43269 100644 --- a/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore.mu +++ b/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore.mu @@ -1,8 +1,8 @@ +#include "../../../devices/moore/moore_handle.h" #include "multi_margin_loss_moore.h" #include "multi_margin_loss_moore_kernel.h" -#include "../../../devices/moore/moore_handle.h" -#include #include +#include namespace op::multi_margin_loss::moore { @@ -16,12 +16,12 @@ static inline bool is_aligned(const void *ptr, size_t alignment) { // ================================================================== template void launch_kernel( - void *output, - const void *input, - const void *target, + void *output, + const void *input, + const void *target, const void *weight, - void* workspace, - const MultiMarginLossInfo& info, + void *workspace, + const MultiMarginLossInfo &info, void *stream) { // 1. 准备指针 @@ -31,14 +31,14 @@ void launch_kernel( auto tar_ptr = reinterpret_cast(target); // Weight 是可选的 auto w_ptr = (weight != nullptr) ? reinterpret_cast(weight) : nullptr; - + auto musa_stream = reinterpret_cast(stream); - + // 2. 
准备参数 size_t N = info.batch_size(); size_t C = info.num_classes(); int reduction = info.reduction(); - + op::multi_margin_loss::moore::MultiMarginLossFunctor functor(info.p(), info.margin()); // ------------------------------------------ @@ -48,29 +48,27 @@ void launch_kernel( // 每个线程处理一个样本 N size_t block_size = 256; size_t grid_size = (N + block_size - 1) / block_size; - + op::multi_margin_loss::moore::multi_margin_loss_kernel <<>>( - out_ptr, in_ptr, tar_ptr, w_ptr, N, C, functor - ); - } + out_ptr, in_ptr, tar_ptr, w_ptr, N, C, functor); + } // ------------------------------------------ // 模式 2: Reduction (Mean / Sum) // ------------------------------------------ else { // 使用 workspace 作为临时的 float 累加器 (精度更高,且方便 atomicAdd) - float* acc_ptr = reinterpret_cast(workspace); + float *acc_ptr = reinterpret_cast(workspace); musaMemsetAsync(acc_ptr, 0, sizeof(float), musa_stream); float scale = (reduction == 1) ? (1.0f / static_cast(N)) : 1.0f; // 1=Mean, 2=Sum - + size_t block_size = 256; size_t grid_size = std::min((N + block_size - 1) / block_size, static_cast(1024)); op::multi_margin_loss::moore::multi_margin_loss_reduce_kernel <<>>( - acc_ptr, in_ptr, tar_ptr, w_ptr, N, C, functor, scale - ); - + acc_ptr, in_ptr, tar_ptr, w_ptr, N, C, functor, scale); + // 将 float 累加结果转回 T 写入 output op::multi_margin_loss::moore::cast_float_to_t <<<1, 1, 0, musa_stream>>>(out_ptr, acc_ptr); @@ -82,48 +80,51 @@ void launch_kernel( // ================================================================== struct Descriptor::Opaque {}; -Descriptor::~Descriptor() { - if (_opaque) delete _opaque; +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } } infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, + infiniopHandle_t handle_, Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t target_desc, 
infiniopTensorDescriptor_t weight_desc, - int p, - float margin, + int p, + float margin, int reduction) { auto handle = reinterpret_cast(handle_); auto info_result = MultiMarginLossInfo::create(out_desc, input_desc, target_desc, weight_desc, p, margin, reduction); - if (!info_result) return info_result.status(); - + if (!info_result) { + return info_result.status(); + } + size_t workspace_size = 0; if (reduction != 0) { workspace_size = sizeof(float); } *desc_ptr = new Descriptor( - new Opaque(), - info_result.take(), - workspace_size, - handle->device, - handle->device_id - ); + new Opaque(), + info_result.take(), + workspace_size, + handle->device, + handle->device_id); return INFINI_STATUS_SUCCESS; } infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, + void *workspace, + size_t workspace_size, void *output, - const void *input, - const void *target, + const void *input, + const void *target, const void *weight, void *stream) const { @@ -155,4 +156,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::multi_margin_loss::moore \ No newline at end of file +} // namespace op::multi_margin_loss::moore diff --git a/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore_kernel.h b/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore_kernel.h index 889eb1bc9..b818ffe03 100644 --- a/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore_kernel.h +++ b/src/infiniop/ops/multi_margin_loss/moore/multi_margin_loss_moore_kernel.h @@ -1,11 +1,11 @@ #ifndef __MULTI_MARGIN_LOSS_MOORE_KERNEL_H__ #define __MULTI_MARGIN_LOSS_MOORE_KERNEL_H__ -#include -#include -#include #include -#include +#include +#include +#include +#include #include namespace op::multi_margin_loss::moore { @@ -45,8 +45,9 @@ __device__ __forceinline__ T from_float(float val) { // ================================================================== __device__ __forceinline__ float warpReduceSum(float val) { 
unsigned int mask = 0xffffffff; - for (int offset = warpSize / 2; offset > 0; offset /= 2) + for (int offset = warpSize / 2; offset > 0; offset /= 2) { val += __shfl_down_sync(mask, val, offset); + } return val; } @@ -56,12 +57,16 @@ __device__ __forceinline__ float blockReduceSum(float val) { int wid = threadIdx.x / warpSize; val = warpReduceSum(val); - if (lane == 0) shared[wid] = val; + if (lane == 0) { + shared[wid] = val; + } __syncthreads(); // 假设 BlockDim 也是 32 的倍数 val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0f; - if (wid == 0) val = warpReduceSum(val); + if (wid == 0) { + val = warpReduceSum(val); + } return val; } @@ -72,7 +77,7 @@ struct MultiMarginLossFunctor { int p; float margin; - __host__ __device__ MultiMarginLossFunctor(int p_val, float margin_val) + __host__ __device__ MultiMarginLossFunctor(int p_val, float margin_val) : p(p_val), margin(margin_val) {} // 计算单个 class c 的 loss 分量 @@ -87,19 +92,19 @@ struct MultiMarginLossFunctor { template __global__ void multi_margin_loss_kernel( - T * __restrict__ output, // [N] - const T * __restrict__ input, // [N, C] - const int64_t * __restrict__ target, // [N] - const T * __restrict__ weight, // [C] (Optional) + T *__restrict__ output, // [N] + const T *__restrict__ input, // [N, C] + const int64_t *__restrict__ target, // [N] + const T *__restrict__ weight, // [C] (Optional) size_t N, size_t C, MultiMarginLossFunctor functor) { size_t n = blockIdx.x * blockDim.x + threadIdx.x; - + if (n < N) { int64_t target_idx = target[n]; - + // 越界检查 if (target_idx < 0 || target_idx >= static_cast(C)) { output[n] = from_float(0.0f); @@ -107,13 +112,15 @@ __global__ void multi_margin_loss_kernel( } // 定位当前行的起始位置 - const T* row_ptr = input + n * C; + const T *row_ptr = input + n * C; float target_score = to_float(row_ptr[target_idx]); float sum_loss = 0.0f; // 遍历所有类别 for (size_t c = 0; c < C; ++c) { - if (c == static_cast(target_idx)) continue; + if (c == static_cast(target_idx)) { + continue; + } float 
other_score = to_float(row_ptr[c]); float diff = functor.margin - target_score + other_score; @@ -135,10 +142,10 @@ __global__ void multi_margin_loss_kernel( template __global__ void multi_margin_loss_reduce_kernel( - float * output, // [1] Accumulator (Float) - const T * __restrict__ input, // [N, C] - const int64_t * __restrict__ target, // [N] - const T * __restrict__ weight, // [C] + float *output, // [1] Accumulator (Float) + const T *__restrict__ input, // [N, C] + const int64_t *__restrict__ target, // [N] + const T *__restrict__ weight, // [C] size_t N, size_t C, MultiMarginLossFunctor functor, @@ -151,20 +158,22 @@ __global__ void multi_margin_loss_reduce_kernel( // Grid-Stride Loop over Batch Dimension N for (size_t n = idx; n < N; n += stride) { int64_t target_idx = target[n]; - + if (target_idx >= 0 && target_idx < static_cast(C)) { - const T* row_ptr = input + n * C; + const T *row_ptr = input + n * C; float target_score = to_float(row_ptr[target_idx]); float sample_loss = 0.0f; for (size_t c = 0; c < C; ++c) { - if (c == static_cast(target_idx)) continue; + if (c == static_cast(target_idx)) { + continue; + } float other_score = to_float(row_ptr[c]); float diff = functor.margin - target_score + other_score; sample_loss += functor.compute(diff); } - + sample_loss /= static_cast(C); if (weight != nullptr) { @@ -186,10 +195,10 @@ __global__ void multi_margin_loss_reduce_kernel( } template -__global__ void cast_float_to_t(T* output, const float* src) { +__global__ void cast_float_to_t(T *output, const float *src) { *output = from_float(*src); } } // namespace op::multi_margin_loss::moore -#endif // __MULTI_MARGIN_LOSS_MOORE_KERNEL_H__ \ No newline at end of file +#endif // __MULTI_MARGIN_LOSS_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/multi_margin_loss/multi_margin_loss.h b/src/infiniop/ops/multi_margin_loss/multi_margin_loss.h index d19552855..3170ce600 100644 --- a/src/infiniop/ops/multi_margin_loss/multi_margin_loss.h +++ 
b/src/infiniop/ops/multi_margin_loss/multi_margin_loss.h @@ -5,50 +5,50 @@ #include "info.h" // 引用对应的 MultiMarginLossInfo 定义 // 宏定义:用于生成不同命名空间下的 Descriptor 类 -#define DESCRIPTOR(NAMESPACE) \ - namespace op::multi_margin_loss::NAMESPACE { \ - class Descriptor final : public InfiniopDescriptor { \ - struct Opaque; \ - Opaque *_opaque; \ - MultiMarginLossInfo _info; \ - size_t _workspace_size; \ - \ - Descriptor( \ - Opaque *opaque, \ - MultiMarginLossInfo info, \ - size_t workspace_size, \ - infiniDevice_t device_type, \ - int device_id) \ - : InfiniopDescriptor{device_type, device_id}, \ - _opaque(opaque), \ - _info(info), \ - _workspace_size(workspace_size) {} \ - \ - public: \ - ~Descriptor(); \ - \ - size_t workspaceSize() const { return _workspace_size; } \ - \ - static infiniStatus_t create( \ - infiniopHandle_t handle, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t out_desc, \ - infiniopTensorDescriptor_t input_desc, \ - infiniopTensorDescriptor_t target_desc, \ - infiniopTensorDescriptor_t weight_desc, \ - int p, \ - float margin, \ - int reduction); \ - \ - infiniStatus_t calculate( \ - void *workspace, \ - size_t workspace_size, \ - void *output, \ - const void *input, \ - const void *target, \ - const void *weight, \ - void *stream) const; \ - }; \ +#define DESCRIPTOR(NAMESPACE) \ + namespace op::multi_margin_loss::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + MultiMarginLossInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + MultiMarginLossInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + 
infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t target_desc, \ + infiniopTensorDescriptor_t weight_desc, \ + int p, \ + float margin, \ + int reduction); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + const void *target, \ + const void *weight, \ + void *stream) const; \ + }; \ } -#endif // __MULTI_MARGIN_LOSS_H__ \ No newline at end of file +#endif // __MULTI_MARGIN_LOSS_H__ diff --git a/src/infiniop/ops/multi_margin_loss/nvidia/multi_margin_loss_nvidia.cu b/src/infiniop/ops/multi_margin_loss/nvidia/multi_margin_loss_nvidia.cu index 9cfeeebb1..895b95dfd 100644 --- a/src/infiniop/ops/multi_margin_loss/nvidia/multi_margin_loss_nvidia.cu +++ b/src/infiniop/ops/multi_margin_loss/nvidia/multi_margin_loss_nvidia.cu @@ -1,8 +1,8 @@ -#include "multi_margin_loss_nvidia.cuh" -#include "../cuda/kernel.cuh" #include "../../../handle.h" -#include +#include "../cuda/kernel.cuh" +#include "multi_margin_loss_nvidia.cuh" #include +#include namespace op::multi_margin_loss::nvidia { template @@ -15,12 +15,12 @@ static inline bool is_aligned(const void *ptr, size_t alignment) { // ================================================================== template void launch_kernel( - void *output, - const void *input, - const void *target, + void *output, + const void *input, + const void *target, const void *weight, - void* workspace, - const MultiMarginLossInfo& info, + void *workspace, + const MultiMarginLossInfo &info, void *stream) { // 1. 准备指针 @@ -30,14 +30,14 @@ void launch_kernel( auto tar_ptr = reinterpret_cast(target); // Weight 是可选的 auto w_ptr = (weight != nullptr) ? reinterpret_cast(weight) : nullptr; - + auto cuda_stream = reinterpret_cast(stream); - + // 2. 
准备参数 size_t N = info.batch_size(); size_t C = info.num_classes(); int reduction = info.reduction(); - + op::multi_margin_loss::cuda::MultiMarginLossFunctor functor(info.p(), info.margin()); // ------------------------------------------ @@ -47,28 +47,26 @@ void launch_kernel( // 每个线程处理一个样本 N size_t block_size = 256; size_t grid_size = (N + block_size - 1) / block_size; - + op::multi_margin_loss::cuda::multi_margin_loss_kernel <<>>( - out_ptr, in_ptr, tar_ptr, w_ptr, N, C, functor - ); - } + out_ptr, in_ptr, tar_ptr, w_ptr, N, C, functor); + } // ------------------------------------------ // 模式 2: Reduction (Mean / Sum) // ------------------------------------------ else { // 使用 workspace 作为临时的 float 累加器 (精度更高,且方便 atomicAdd) - float* acc_ptr = reinterpret_cast(workspace); + float *acc_ptr = reinterpret_cast(workspace); cudaMemsetAsync(acc_ptr, 0, sizeof(float), cuda_stream); float scale = (reduction == 1) ? (1.0f / static_cast(N)) : 1.0f; // 1=Mean, 2=Sum - + size_t block_size = 256; size_t grid_size = std::min((N + block_size - 1) / block_size, static_cast(1024)); op::multi_margin_loss::cuda::multi_margin_loss_reduce_kernel <<>>( - acc_ptr, in_ptr, tar_ptr, w_ptr, N, C, functor, scale - ); + acc_ptr, in_ptr, tar_ptr, w_ptr, N, C, functor, scale); op::multi_margin_loss::cuda::cast_float_to_t <<<1, 1, 0, cuda_stream>>>(out_ptr, acc_ptr); } @@ -79,22 +77,26 @@ void launch_kernel( // ================================================================== struct Descriptor::Opaque {}; -Descriptor::~Descriptor() { - if (_opaque) delete _opaque; +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } } infiniStatus_t Descriptor::create( infiniopHandle_t handle, Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t target_desc, infiniopTensorDescriptor_t weight_desc, - int p, - float margin, + int p, + float 
margin, int reduction) { auto info_result = MultiMarginLossInfo::create(out_desc, input_desc, target_desc, weight_desc, p, margin, reduction); - if (!info_result) return info_result.status(); + if (!info_result) { + return info_result.status(); + } size_t workspace_size = 0; if (reduction != 0) { workspace_size = sizeof(float); @@ -105,11 +107,11 @@ infiniStatus_t Descriptor::create( } infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, + void *workspace, + size_t workspace_size, void *output, - const void *input, - const void *target, + const void *input, + const void *target, const void *weight, void *stream) const { @@ -141,4 +143,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::multi_margin_loss::nvidia \ No newline at end of file +} // namespace op::multi_margin_loss::nvidia diff --git a/src/infiniop/ops/multi_margin_loss/nvidia/multi_margin_loss_nvidia.cuh b/src/infiniop/ops/multi_margin_loss/nvidia/multi_margin_loss_nvidia.cuh index 81e20fa53..721681e4b 100644 --- a/src/infiniop/ops/multi_margin_loss/nvidia/multi_margin_loss_nvidia.cuh +++ b/src/infiniop/ops/multi_margin_loss/nvidia/multi_margin_loss_nvidia.cuh @@ -4,4 +4,4 @@ #include "../multi_margin_loss.h" DESCRIPTOR(nvidia) -#endif // __MULTI_MARGIN_LOSS_NVIDIA_CUH__ \ No newline at end of file +#endif // __MULTI_MARGIN_LOSS_NVIDIA_CUH__ diff --git a/src/infiniop/ops/multi_margin_loss/operator.cc b/src/infiniop/ops/multi_margin_loss/operator.cc index a277f2415..ea4946b92 100644 --- a/src/infiniop/ops/multi_margin_loss/operator.cc +++ b/src/infiniop/ops/multi_margin_loss/operator.cc @@ -23,7 +23,7 @@ extern "C" { // ======================================================================= // 1. 
创建算子描述符 // ======================================================================= -__C infiniStatus_t infiniopCreateMultiMarginLossDescriptor( +__INFINI_C infiniStatus_t infiniopCreateMultiMarginLossDescriptor( infiniopHandle_t handle, infiniopMultiMarginLossDescriptor_t *desc_ptr, infiniopTensorDescriptor_t output, @@ -34,83 +34,83 @@ __C infiniStatus_t infiniopCreateMultiMarginLossDescriptor( float margin, int reduction) { - #define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::multi_margin_loss::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output, \ - input, \ - target, \ - weight, \ - p, \ - margin, \ - reduction) +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::multi_margin_loss::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output, \ + input, \ + target, \ + weight, \ + p, \ + margin, \ + reduction) switch (handle->device) { - #ifdef ENABLE_CPU_API +#ifdef ENABLE_CPU_API CREATE(INFINI_DEVICE_CPU, cpu); - #endif - #ifdef ENABLE_NVIDIA_API +#endif +#ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, nvidia); - #endif - #ifdef ENABLE_ILUVATAR_API +#endif +#ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, nvidia); - #endif - #ifdef ENABLE_QY_API +#endif +#ifdef ENABLE_QY_API CREATE(INFINI_DEVICE_QY, nvidia); - #endif - #ifdef ENABLE_METAX_API +#endif +#ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax); - #endif - #ifdef ENABLE_MOORE_API +#endif +#ifdef ENABLE_MOORE_API CREATE(INFINI_DEVICE_MOORE, moore); - #endif +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef CREATE +#undef CREATE } // ======================================================================= // 2. 
获取 Workspace 大小 // ======================================================================= -__C infiniStatus_t infiniopGetMultiMarginLossWorkspaceSize(infiniopMultiMarginLossDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetMultiMarginLossWorkspaceSize(infiniopMultiMarginLossDescriptor_t desc, size_t *size) { - #define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS switch (desc->device_type) { - #ifdef ENABLE_CPU_API +#ifdef ENABLE_CPU_API GET(INFINI_DEVICE_CPU, cpu); - #endif - #ifdef ENABLE_NVIDIA_API +#endif +#ifdef ENABLE_NVIDIA_API GET(INFINI_DEVICE_NVIDIA, nvidia); - #endif - #ifdef ENABLE_ILUVATAR_API +#endif +#ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, nvidia); - #endif - #ifdef ENABLE_QY_API +#endif +#ifdef ENABLE_QY_API GET(INFINI_DEVICE_QY, nvidia); - #endif - #ifdef ENABLE_METAX_API +#endif +#ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax); - #endif - #ifdef ENABLE_MOORE_API +#endif +#ifdef ENABLE_MOORE_API GET(INFINI_DEVICE_MOORE, moore); - #endif +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef GET +#undef GET } // ======================================================================= // 3. 
执行计算 (Calculate) // ======================================================================= -__C infiniStatus_t infiniopMultiMarginLoss( +__INFINI_C infiniStatus_t infiniopMultiMarginLoss( infiniopMultiMarginLossDescriptor_t desc, void *workspace, size_t workspace_size, @@ -120,65 +120,65 @@ __C infiniStatus_t infiniopMultiMarginLoss( const void *weight, void *stream) { - #define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, output, input, target, weight, stream) +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, target, weight, stream) switch (desc->device_type) { - #ifdef ENABLE_CPU_API +#ifdef ENABLE_CPU_API CALCULATE(INFINI_DEVICE_CPU, cpu); - #endif - #ifdef ENABLE_NVIDIA_API +#endif +#ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); - #endif - #ifdef ENABLE_ILUVATAR_API +#endif +#ifdef ENABLE_ILUVATAR_API CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); - #endif - #ifdef ENABLE_QY_API +#endif +#ifdef ENABLE_QY_API CALCULATE(INFINI_DEVICE_QY, nvidia); - #endif - #ifdef ENABLE_METAX_API +#endif +#ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax); - #endif - #ifdef ENABLE_MOORE_API +#endif +#ifdef ENABLE_MOORE_API CALCULATE(INFINI_DEVICE_MOORE, moore); - #endif +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef CALCULATE +#undef CALCULATE } -__C infiniStatus_t infiniopDestroyMultiMarginLossDescriptor(infiniopMultiMarginLossDescriptor_t desc) { +__INFINI_C infiniStatus_t infiniopDestroyMultiMarginLossDescriptor(infiniopMultiMarginLossDescriptor_t desc) { - #define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS switch (desc->device_type) { - #ifdef ENABLE_CPU_API +#ifdef 
ENABLE_CPU_API DELETE(INFINI_DEVICE_CPU, cpu); - #endif - #ifdef ENABLE_NVIDIA_API +#endif +#ifdef ENABLE_NVIDIA_API DELETE(INFINI_DEVICE_NVIDIA, nvidia); - #endif - #ifdef ENABLE_ILUVATAR_API +#endif +#ifdef ENABLE_ILUVATAR_API DELETE(INFINI_DEVICE_ILUVATAR, nvidia); - #endif - #ifdef ENABLE_QY_API +#endif +#ifdef ENABLE_QY_API DELETE(INFINI_DEVICE_QY, nvidia); - #endif - #ifdef ENABLE_METAX_API +#endif +#ifdef ENABLE_METAX_API DELETE(INFINI_DEVICE_METAX, metax); - #endif - #ifdef ENABLE_MOORE_API +#endif +#ifdef ENABLE_MOORE_API DELETE(INFINI_DEVICE_MOORE, moore); - #endif +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef DELETE +#undef DELETE } -} // extern "C" \ No newline at end of file +} // extern "C" diff --git a/src/infiniop/ops/scatter/cpu/scatter_cpu.cc b/src/infiniop/ops/scatter/cpu/scatter_cpu.cc index e3e893e38..52dfbdf1b 100644 --- a/src/infiniop/ops/scatter/cpu/scatter_cpu.cc +++ b/src/infiniop/ops/scatter/cpu/scatter_cpu.cc @@ -2,9 +2,9 @@ #include "../../../devices/cpu/common_cpu.h" #include #include -#include #include // for memcpy #include +#include #include "../../../../utils.h" #include "../../../../utils/custom_types.h" @@ -14,33 +14,39 @@ struct ScatterCpuOpaque { std::vector updates_shape; std::vector updates_strides; std::vector output_strides; - std::vector indices_strides; - size_t input_total_bytes; - - ScatterCpuOpaque(const infiniopTensorDescriptor_t upd, + std::vector indices_strides; + size_t input_total_bytes; + + ScatterCpuOpaque(const infiniopTensorDescriptor_t upd, const infiniopTensorDescriptor_t indices, const infiniopTensorDescriptor_t out) { // 1. 
几何信息 - const auto& u_shape = upd->shape(); + const auto &u_shape = upd->shape(); updates_shape.assign(u_shape.begin(), u_shape.end()); - const auto& u_strides = upd->strides(); + const auto &u_strides = upd->strides(); updates_strides.assign(u_strides.begin(), u_strides.end()); - const auto& i_strides = indices->strides(); + const auto &i_strides = indices->strides(); indices_strides.assign(i_strides.begin(), i_strides.end()); // <--- 记录 indices strides - const auto& o_strides = out->strides(); + const auto &o_strides = out->strides(); output_strides.assign(o_strides.begin(), o_strides.end()); - + size_t total_elements = 1; - for (auto s : out->shape()) total_elements *= s; - + for (auto s : out->shape()) { + total_elements *= s; + } + size_t dtype_size = 0; - if (out->dtype() == INFINI_DTYPE_F32) dtype_size = 4; - else if (out->dtype() == INFINI_DTYPE_F64) dtype_size = 8; - else dtype_size = 2; // f16/bf16 - + if (out->dtype() == INFINI_DTYPE_F32) { + dtype_size = 4; + } else if (out->dtype() == INFINI_DTYPE_F64) { + dtype_size = 8; + } else { + dtype_size = 2; // f16/bf16 + } + input_total_bytes = total_elements * dtype_size; } }; @@ -50,7 +56,10 @@ struct Descriptor::Opaque : public ScatterCpuOpaque { }; Descriptor::~Descriptor() { - if (_opaque) { delete _opaque; _opaque = nullptr; } + if (_opaque) { + delete _opaque; + _opaque = nullptr; + } } infiniStatus_t Descriptor::create( @@ -74,14 +83,14 @@ infiniStatus_t Descriptor::create( return INFINI_STATUS_SUCCESS; } -inline void offset_to_coords(int64_t offset, int ndim, const int64_t* shape, int64_t* coords) { +inline void offset_to_coords(int64_t offset, int ndim, const int64_t *shape, int64_t *coords) { for (int i = ndim - 1; i >= 0; --i) { coords[i] = offset % shape[i]; offset /= shape[i]; } } -inline int64_t coords_to_offset(int ndim, const int64_t* coords, const int64_t* strides) { +inline int64_t coords_to_offset(int ndim, const int64_t *coords, const int64_t *strides) { int64_t offset = 0; for (int i = 0; 
i < ndim; ++i) { offset += coords[i] * strides[i]; @@ -100,18 +109,20 @@ void calculate_cpu_kernel( int axis = info.axis(); int reduction = info.reduction(); size_t ndim = info.ndim(); - - T* out_ptr = reinterpret_cast(output); - const IdxT* idx_ptr = reinterpret_cast(indices); - const T* upd_ptr = reinterpret_cast(updates); - const int64_t* upd_shape_ptr = opaque->updates_shape.data(); - const int64_t* upd_strides_ptr = opaque->updates_strides.data(); - const int64_t* idx_strides_ptr = opaque->indices_strides.data(); // <--- 使用 indices strides - const int64_t* out_strides_ptr = opaque->output_strides.data(); + T *out_ptr = reinterpret_cast(output); + const IdxT *idx_ptr = reinterpret_cast(indices); + const T *upd_ptr = reinterpret_cast(updates); + + const int64_t *upd_shape_ptr = opaque->updates_shape.data(); + const int64_t *upd_strides_ptr = opaque->updates_strides.data(); + const int64_t *idx_strides_ptr = opaque->indices_strides.data(); // <--- 使用 indices strides + const int64_t *out_strides_ptr = opaque->output_strides.data(); size_t total_elements = 1; - for (auto s : opaque->updates_shape) total_elements *= s; + for (auto s : opaque->updates_shape) { + total_elements *= s; + } // Serial loop for (size_t i = 0; i < total_elements; ++i) { @@ -120,7 +131,7 @@ void calculate_cpu_kernel( int64_t upd_offset = coords_to_offset(ndim, coords.data(), upd_strides_ptr); int64_t idx_offset = coords_to_offset(ndim, coords.data(), idx_strides_ptr); - + T upd_val = upd_ptr[upd_offset]; IdxT idx_val = idx_ptr[idx_offset]; @@ -128,13 +139,13 @@ void calculate_cpu_kernel( int64_t out_offset = coords_to_offset(ndim, coords.data(), out_strides_ptr); - if (reduction == 0) { + if (reduction == 0) { out_ptr[out_offset] = upd_val; - } else if (reduction == 1) { + } else if (reduction == 1) { float val_out = utils::cast(out_ptr[out_offset]); float val_upd = utils::cast(upd_val); out_ptr[out_offset] = utils::cast(val_out + val_upd); - } else if (reduction == 2) { + } else if 
(reduction == 2) { float val_out = utils::cast(out_ptr[out_offset]); float val_upd = utils::cast(upd_val); out_ptr[out_offset] = utils::cast(val_out * val_upd); @@ -150,7 +161,7 @@ void calculate_cpu_impl( const void *input, // 需要 input 指针 const void *indices, const void *updates) { - + if (input != output) { std::memcpy(output, input, opaque->input_total_bytes); } @@ -192,4 +203,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::scatter::cpu \ No newline at end of file +} // namespace op::scatter::cpu diff --git a/src/infiniop/ops/scatter/cpu/scatter_cpu.h b/src/infiniop/ops/scatter/cpu/scatter_cpu.h index 6f77c4b8f..ad52c7b91 100644 --- a/src/infiniop/ops/scatter/cpu/scatter_cpu.h +++ b/src/infiniop/ops/scatter/cpu/scatter_cpu.h @@ -5,4 +5,4 @@ DESCRIPTOR(cpu) -#endif // __SCATTER_CPU_H__ \ No newline at end of file +#endif // __SCATTER_CPU_H__ diff --git a/src/infiniop/ops/scatter/cuda/kernel.cuh b/src/infiniop/ops/scatter/cuda/kernel.cuh index 2fe22bc89..a64ad370d 100644 --- a/src/infiniop/ops/scatter/cuda/kernel.cuh +++ b/src/infiniop/ops/scatter/cuda/kernel.cuh @@ -1,15 +1,14 @@ #ifndef __SCATTER_CUDA_CUH__ #define __SCATTER_CUDA_CUH__ -#include -#include #include +#include +#include using nv_bfloat16 = __nv_bfloat16; - #include -#include #include +#include namespace op::scatter::cuda { @@ -20,28 +19,31 @@ struct TensorGeometry { int64_t updates_shape[MAX_DIMS]; int64_t updates_strides[MAX_DIMS]; int64_t output_strides[MAX_DIMS]; - int64_t indices_strides[MAX_DIMS]; + int64_t indices_strides[MAX_DIMS]; }; __device__ __forceinline__ float to_float(float val) { return val; } __device__ __forceinline__ float to_float(double val) { return static_cast(val); } __device__ __forceinline__ float to_float(half val) { return __half2float(val); } __device__ __forceinline__ float to_float(nv_bfloat16 val) { return __bfloat162float(val); } -template __device__ __forceinline__ T from_float(float val) { return static_cast(val); } 
-template <> __device__ __forceinline__ half from_float(float val) { return __float2half(val); } -template <> __device__ __forceinline__ nv_bfloat16 from_float(float val) { return __float2bfloat16(val); } +template +__device__ __forceinline__ T from_float(float val) { return static_cast(val); } +template <> +__device__ __forceinline__ half from_float(float val) { return __float2half(val); } +template <> +__device__ __forceinline__ nv_bfloat16 from_float(float val) { return __float2bfloat16(val); } -__device__ __forceinline__ void offset_to_coords(int64_t offset, int ndim, const int64_t* shape, int64_t* coords) { - #pragma unroll +__device__ __forceinline__ void offset_to_coords(int64_t offset, int ndim, const int64_t *shape, int64_t *coords) { +#pragma unroll for (int i = ndim - 1; i >= 0; --i) { coords[i] = offset % shape[i]; offset /= shape[i]; } } -__device__ __forceinline__ int64_t coords_to_offset(int ndim, const int64_t* coords, const int64_t* strides) { +__device__ __forceinline__ int64_t coords_to_offset(int ndim, const int64_t *coords, const int64_t *strides) { int64_t offset = 0; - #pragma unroll +#pragma unroll for (int i = 0; i < ndim; ++i) { offset += coords[i] * strides[i]; } @@ -50,14 +52,14 @@ __device__ __forceinline__ int64_t coords_to_offset(int ndim, const int64_t* coo template __global__ void scatter_kernel( - T * __restrict__ output, - const T * __restrict__ updates, - const IdxT * __restrict__ indices, + T *__restrict__ output, + const T *__restrict__ updates, + const IdxT *__restrict__ indices, TensorGeometry geometry, int axis, int reduction, size_t num_updates) { - + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = blockDim.x * gridDim.x; @@ -68,7 +70,7 @@ __global__ void scatter_kernel( int64_t upd_offset = coords_to_offset(geometry.ndim, coords, geometry.updates_strides); T upd_val = updates[upd_offset]; - + // FIX: 使用 indices_strides 计算 offset int64_t idx_offset = coords_to_offset(geometry.ndim, coords, 
geometry.indices_strides); IdxT idx_val = indices[idx_offset]; @@ -92,4 +94,4 @@ __global__ void scatter_kernel( } // namespace op::scatter::cuda -#endif // __SCATTER_CUDA_CUH__ \ No newline at end of file +#endif // __SCATTER_CUDA_CUH__ diff --git a/src/infiniop/ops/scatter/info.h b/src/infiniop/ops/scatter/info.h index d0347107c..76dcf985f 100644 --- a/src/infiniop/ops/scatter/info.h +++ b/src/infiniop/ops/scatter/info.h @@ -47,16 +47,16 @@ class ScatterInfo { return INFINI_STATUS_BAD_PARAM; } - const auto& in_shape = input_desc->shape(); - const auto& out_shape = out_desc->shape(); + const auto &in_shape = input_desc->shape(); + const auto &out_shape = out_desc->shape(); for (size_t i = 0; i < ndim; ++i) { if (in_shape[i] != out_shape[i]) { return INFINI_STATUS_BAD_TENSOR_SHAPE; } } - const auto& idx_shape = indices_desc->shape(); - const auto& upd_shape = updates_desc->shape(); + const auto &idx_shape = indices_desc->shape(); + const auto &upd_shape = updates_desc->shape(); for (size_t i = 0; i < ndim; ++i) { if (idx_shape[i] != upd_shape[i]) { return INFINI_STATUS_BAD_TENSOR_SHAPE; @@ -64,13 +64,12 @@ class ScatterInfo { } for (size_t i = 0; i < ndim; ++i) { - if (idx_shape[i] > in_shape[i]) { - return INFINI_STATUS_BAD_TENSOR_SHAPE; - } + if (idx_shape[i] > in_shape[i]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } } - if (input_desc->dtype() != updates_desc->dtype() || - input_desc->dtype() != out_desc->dtype()) { + if (input_desc->dtype() != updates_desc->dtype() || input_desc->dtype() != out_desc->dtype()) { return INFINI_STATUS_BAD_TENSOR_DTYPE; } @@ -79,7 +78,7 @@ class ScatterInfo { } if (reduction < 0 || reduction > 2) { - return INFINI_STATUS_BAD_PARAM; + return INFINI_STATUS_BAD_PARAM; } return utils::Result(ScatterInfo{ @@ -87,11 +86,10 @@ class ScatterInfo { indices_desc->dtype(), canonical_axis, reduction, - ndim - }); + ndim}); } }; } // namespace op::scatter -#endif // __SCATTER_INFO_H__ \ No newline at end of file +#endif // __SCATTER_INFO_H__ 
diff --git a/src/infiniop/ops/scatter/metax/scatter_metax.h b/src/infiniop/ops/scatter/metax/scatter_metax.h index 9ebfae3b2..772289d75 100644 --- a/src/infiniop/ops/scatter/metax/scatter_metax.h +++ b/src/infiniop/ops/scatter/metax/scatter_metax.h @@ -5,4 +5,4 @@ DESCRIPTOR(metax) -#endif // __SCATTER_METAX_API_H__ \ No newline at end of file +#endif // __SCATTER_METAX_API_H__ diff --git a/src/infiniop/ops/scatter/metax/scatter_metax.maca b/src/infiniop/ops/scatter/metax/scatter_metax.maca index 04b8e30e8..bcb9fddfb 100644 --- a/src/infiniop/ops/scatter/metax/scatter_metax.maca +++ b/src/infiniop/ops/scatter/metax/scatter_metax.maca @@ -1,14 +1,14 @@ -#include "scatter_metax.h" #include "../../../devices/metax/metax_common.h" #include "../../../devices/metax/metax_handle.h" -#include -#include +#include "scatter_metax.h" #include -#include #include +#include #include -#include #include +#include +#include +#include using nv_bfloat16 = __maca_bfloat16; namespace op::scatter::metax { @@ -29,22 +29,25 @@ __device__ __forceinline__ float to_float(double val) { return static_cast __device__ __forceinline__ T from_float(float val) { return static_cast(val); } -template <> __device__ __forceinline__ __half from_float<__half>(float val) { return __float2half(val); } -template <> __device__ __forceinline__ nv_bfloat16 from_float(float val) { return __float2bfloat16(val); } +template +__device__ __forceinline__ T from_float(float val) { return static_cast(val); } +template <> +__device__ __forceinline__ __half from_float<__half>(float val) { return __float2half(val); } +template <> +__device__ __forceinline__ nv_bfloat16 from_float(float val) { return __float2bfloat16(val); } // 坐标变换辅助函数 -__device__ __forceinline__ void offset_to_coords(int64_t offset, int ndim, const int64_t* shape, int64_t* coords) { - #pragma unroll +__device__ __forceinline__ void offset_to_coords(int64_t offset, int ndim, const int64_t *shape, int64_t *coords) { +#pragma unroll for (int i = ndim - 1; i 
>= 0; --i) { coords[i] = offset % shape[i]; offset /= shape[i]; } } -__device__ __forceinline__ int64_t coords_to_offset(int ndim, const int64_t* coords, const int64_t* strides) { +__device__ __forceinline__ int64_t coords_to_offset(int ndim, const int64_t *coords, const int64_t *strides) { int64_t offset = 0; - #pragma unroll +#pragma unroll for (int i = 0; i < ndim; ++i) { offset += coords[i] * strides[i]; } @@ -54,14 +57,14 @@ __device__ __forceinline__ int64_t coords_to_offset(int ndim, const int64_t* coo // Scatter Kernel template __global__ void scatter_kernel( - T * __restrict__ output, - const T * __restrict__ updates, - const IdxT * __restrict__ indices, + T *__restrict__ output, + const T *__restrict__ updates, + const IdxT *__restrict__ indices, TensorGeometry geometry, int axis, int reduction, size_t num_updates) { - + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = blockDim.x * gridDim.x; @@ -74,7 +77,7 @@ __global__ void scatter_kernel( // 2. 读取 update 值 int64_t upd_offset = coords_to_offset(geometry.ndim, coords, geometry.updates_strides); T upd_val = updates[upd_offset]; - + // 3. 
读取 index 值 (注意:使用 indices_strides) int64_t idx_offset = coords_to_offset(geometry.ndim, coords, geometry.indices_strides); IdxT idx_val = indices[idx_offset]; @@ -105,32 +108,36 @@ struct ScatterMetaxOpaque { TensorGeometry geometry; size_t input_bytes; - ScatterMetaxOpaque(const infiniopTensorDescriptor_t updates_desc, + ScatterMetaxOpaque(const infiniopTensorDescriptor_t updates_desc, const infiniopTensorDescriptor_t indices_desc, const infiniopTensorDescriptor_t output_desc) { - + geometry.ndim = static_cast(updates_desc->ndim()); - + // 计算 Input 字节数 (用于拷贝) size_t total_elements = 1; - for(size_t i=0; indim(); ++i) { + for (size_t i = 0; i < output_desc->ndim(); ++i) { total_elements *= output_desc->shape()[i]; } - - size_t dt_size = 0; - if (output_desc->dtype() == INFINI_DTYPE_F32) dt_size = 4; - else if (output_desc->dtype() == INFINI_DTYPE_F64) dt_size = 8; - else dt_size = 2; // f16/bf16 - + + size_t dt_size = 0; + if (output_desc->dtype() == INFINI_DTYPE_F32) { + dt_size = 4; + } else if (output_desc->dtype() == INFINI_DTYPE_F64) { + dt_size = 8; + } else { + dt_size = 2; // f16/bf16 + } + input_bytes = total_elements * dt_size; - + // 填充 Geometry int ndim = geometry.ndim; - for(int i=0; ishape()[i]; geometry.updates_strides[i] = updates_desc->strides()[i]; geometry.output_strides[i] = output_desc->strides()[i]; - geometry.indices_strides[i] = indices_desc->strides()[i]; + geometry.indices_strides[i] = indices_desc->strides()[i]; } } }; @@ -139,8 +146,10 @@ struct Descriptor::Opaque : public ScatterMetaxOpaque { using ScatterMetaxOpaque::ScatterMetaxOpaque; }; -Descriptor::~Descriptor() { - if (_opaque) delete _opaque; +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } } // ================================================================== @@ -148,40 +157,41 @@ Descriptor::~Descriptor() { // ================================================================== template void launch_kernel( - void *output, - const void *updates, + void *output, 
+ const void *updates, const void *indices, - const ScatterMetaxOpaque* opaque, - const ScatterInfo& info, + const ScatterMetaxOpaque *opaque, + const ScatterInfo &info, void *stream) { auto out_ptr = reinterpret_cast(output); auto upd_ptr = reinterpret_cast(updates); auto idx_ptr = reinterpret_cast(indices); auto mc_stream = reinterpret_cast(stream); - + size_t num_updates = 1; - for(int i=0; igeometry.ndim; ++i) { + for (int i = 0; i < opaque->geometry.ndim; ++i) { num_updates *= opaque->geometry.updates_shape[i]; } - - if (num_updates == 0) return; + + if (num_updates == 0) { + return; + } size_t block_size = 256; size_t grid_size = (num_updates + block_size - 1) / block_size; // 限制 grid size,防止溢出 - grid_size = std::min(grid_size, static_cast(2147483647)); + grid_size = std::min(grid_size, static_cast(2147483647)); scatter_kernel <<>>( - out_ptr, - upd_ptr, - idx_ptr, - opaque->geometry, - info.axis(), - info.reduction(), - num_updates - ); + out_ptr, + upd_ptr, + idx_ptr, + opaque->geometry, + info.axis(), + info.reduction(), + num_updates); } // ================================================================== @@ -189,19 +199,21 @@ void launch_kernel( // ================================================================== infiniStatus_t Descriptor::create( infiniopHandle_t handle, Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t indices_desc, infiniopTensorDescriptor_t updates_desc, - int axis, + int axis, int reduction) { auto handle_ptr = reinterpret_cast(handle); auto info_result = ScatterInfo::create(out_desc, input_desc, indices_desc, updates_desc, axis, reduction); - if (!info_result) return info_result.status(); - + if (!info_result) { + return info_result.status(); + } + if (out_desc->ndim() > MAX_DIMS) { - return INFINI_STATUS_BAD_TENSOR_SHAPE; + return INFINI_STATUS_BAD_TENSOR_SHAPE; } auto 
opaque = new Opaque(updates_desc, indices_desc, out_desc); @@ -215,11 +227,11 @@ infiniStatus_t Descriptor::create( // 6. Calculate Dispatch // ================================================================== infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, + void *workspace, + size_t workspace_size, void *output, - const void *input, - const void *indices, + const void *input, + const void *indices, const void *updates, void *stream) const { @@ -276,4 +288,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::scatter::metax \ No newline at end of file +} // namespace op::scatter::metax diff --git a/src/infiniop/ops/scatter/moore/scatter_moore.h b/src/infiniop/ops/scatter/moore/scatter_moore.h index e09580c4f..aa55745f6 100644 --- a/src/infiniop/ops/scatter/moore/scatter_moore.h +++ b/src/infiniop/ops/scatter/moore/scatter_moore.h @@ -5,4 +5,4 @@ DESCRIPTOR(moore) -#endif // __SCATTER_MOORE_H__ \ No newline at end of file +#endif // __SCATTER_MOORE_H__ diff --git a/src/infiniop/ops/scatter/moore/scatter_moore.mu b/src/infiniop/ops/scatter/moore/scatter_moore.mu index 82bb7ee60..833c0c3d8 100644 --- a/src/infiniop/ops/scatter/moore/scatter_moore.mu +++ b/src/infiniop/ops/scatter/moore/scatter_moore.mu @@ -1,8 +1,8 @@ +#include "../../../devices/moore/moore_handle.h" #include "scatter_moore.h" #include "scatter_moore_kernel.h" -#include "../../../devices/moore/moore_handle.h" -#include #include +#include #include namespace op::scatter::moore { @@ -14,32 +14,36 @@ struct ScatterMooreOpaque { op::scatter::moore::TensorGeometry geometry; size_t input_bytes; - ScatterMooreOpaque(const infiniopTensorDescriptor_t updates_desc, - const infiniopTensorDescriptor_t indices_desc, - const infiniopTensorDescriptor_t output_desc) { - + ScatterMooreOpaque(const infiniopTensorDescriptor_t updates_desc, + const infiniopTensorDescriptor_t indices_desc, + const infiniopTensorDescriptor_t output_desc) { + 
geometry.ndim = static_cast(updates_desc->ndim()); - + // Calculate Input bytes for copy size_t total_elements = 1; - for(size_t i=0; indim(); ++i) { + for (size_t i = 0; i < output_desc->ndim(); ++i) { total_elements *= output_desc->shape()[i]; } - - size_t dt_size = 0; - if (output_desc->dtype() == INFINI_DTYPE_F32) dt_size = 4; - else if (output_desc->dtype() == INFINI_DTYPE_F64) dt_size = 8; - else dt_size = 2; // f16/bf16 - + + size_t dt_size = 0; + if (output_desc->dtype() == INFINI_DTYPE_F32) { + dt_size = 4; + } else if (output_desc->dtype() == INFINI_DTYPE_F64) { + dt_size = 8; + } else { + dt_size = 2; // f16/bf16 + } + input_bytes = total_elements * dt_size; - + // Fill Geometry int ndim = geometry.ndim; - for(int i=0; ishape()[i]; geometry.updates_strides[i] = updates_desc->strides()[i]; geometry.output_strides[i] = output_desc->strides()[i]; - geometry.indices_strides[i] = indices_desc->strides()[i]; + geometry.indices_strides[i] = indices_desc->strides()[i]; } } }; @@ -48,8 +52,10 @@ struct Descriptor::Opaque : public ScatterMooreOpaque { using ScatterMooreOpaque::ScatterMooreOpaque; }; -Descriptor::~Descriptor() { - if (_opaque) delete _opaque; +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } } // ================================================================== @@ -57,40 +63,41 @@ Descriptor::~Descriptor() { // ================================================================== template void launch_kernel( - void *output, - const void *updates, + void *output, + const void *updates, const void *indices, - const ScatterMooreOpaque* opaque, - const ScatterInfo& info, + const ScatterMooreOpaque *opaque, + const ScatterInfo &info, void *stream) { auto out_ptr = reinterpret_cast(output); auto upd_ptr = reinterpret_cast(updates); auto idx_ptr = reinterpret_cast(indices); auto musa_stream = reinterpret_cast(stream); - + size_t num_updates = 1; - for(int i=0; igeometry.ndim; ++i) { + for (int i = 0; i < opaque->geometry.ndim; ++i) { 
num_updates *= opaque->geometry.updates_shape[i]; } - - if (num_updates == 0) return; + + if (num_updates == 0) { + return; + } size_t block_size = 256; size_t grid_size = (num_updates + block_size - 1) / block_size; // MUSA grid dimension limit check (usually same as CUDA) - grid_size = std::min(grid_size, static_cast(2147483647)); + grid_size = std::min(grid_size, static_cast(2147483647)); op::scatter::moore::scatter_kernel <<>>( - out_ptr, - upd_ptr, - idx_ptr, - opaque->geometry, - info.axis(), - info.reduction(), - num_updates - ); + out_ptr, + upd_ptr, + idx_ptr, + opaque->geometry, + info.axis(), + info.reduction(), + num_updates); } // ================================================================== @@ -98,19 +105,21 @@ void launch_kernel( // ================================================================== infiniStatus_t Descriptor::create( infiniopHandle_t handle_, Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t indices_desc, infiniopTensorDescriptor_t updates_desc, - int axis, + int axis, int reduction) { auto handle = reinterpret_cast(handle_); auto info_result = ScatterInfo::create(out_desc, input_desc, indices_desc, updates_desc, axis, reduction); - if (!info_result) return info_result.status(); - + if (!info_result) { + return info_result.status(); + } + if (out_desc->ndim() > op::scatter::moore::MAX_DIMS) { - return INFINI_STATUS_BAD_TENSOR_SHAPE; + return INFINI_STATUS_BAD_TENSOR_SHAPE; } auto opaque = new Opaque(updates_desc, indices_desc, out_desc); @@ -124,11 +133,11 @@ infiniStatus_t Descriptor::create( // Calculate Dispatch // ================================================================== infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, + void *workspace, + size_t workspace_size, void *output, - const void *input, - const void *indices, + const 
void *input, + const void *indices, const void *updates, void *stream) const { @@ -183,4 +192,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::scatter::moore \ No newline at end of file +} // namespace op::scatter::moore diff --git a/src/infiniop/ops/scatter/moore/scatter_moore_kernel.h b/src/infiniop/ops/scatter/moore/scatter_moore_kernel.h index e346c5164..3b3bb271f 100644 --- a/src/infiniop/ops/scatter/moore/scatter_moore_kernel.h +++ b/src/infiniop/ops/scatter/moore/scatter_moore_kernel.h @@ -1,13 +1,13 @@ #ifndef __SCATTER_MOORE_KERNEL_H__ #define __SCATTER_MOORE_KERNEL_H__ -#include -#include #include +#include +#include #include -#include #include +#include namespace op::scatter::moore { @@ -25,25 +25,28 @@ __device__ __forceinline__ float to_float(double val) { return static_cast __device__ __forceinline__ T from_float(float val) { return static_cast(val); } -template <> __device__ __forceinline__ half from_float(float val) { return __float2half(val); } -template <> __device__ __forceinline__ __mt_bfloat16 from_float<__mt_bfloat16>(float val) { return __float2bfloat16(val); } +template +__device__ __forceinline__ T from_float(float val) { return static_cast(val); } +template <> +__device__ __forceinline__ half from_float(float val) { return __float2half(val); } +template <> +__device__ __forceinline__ __mt_bfloat16 from_float<__mt_bfloat16>(float val) { return __float2bfloat16(val); } // ================================================================== // 坐标/偏移计算逻辑 (保持不变) // ================================================================== -__device__ __forceinline__ void offset_to_coords(int64_t offset, int ndim, const int64_t* shape, int64_t* coords) { - #pragma unroll +__device__ __forceinline__ void offset_to_coords(int64_t offset, int ndim, const int64_t *shape, int64_t *coords) { +#pragma unroll for (int i = ndim - 1; i >= 0; --i) { coords[i] = offset % shape[i]; offset /= shape[i]; } } -__device__ 
__forceinline__ int64_t coords_to_offset(int ndim, const int64_t* coords, const int64_t* strides) { +__device__ __forceinline__ int64_t coords_to_offset(int ndim, const int64_t *coords, const int64_t *strides) { int64_t offset = 0; - #pragma unroll +#pragma unroll for (int i = 0; i < ndim; ++i) { offset += coords[i] * strides[i]; } @@ -56,14 +59,14 @@ __device__ __forceinline__ int64_t coords_to_offset(int ndim, const int64_t* coo template __global__ void scatter_kernel( - T * __restrict__ output, - const T * __restrict__ updates, - const IdxT * __restrict__ indices, + T *__restrict__ output, + const T *__restrict__ updates, + const IdxT *__restrict__ indices, TensorGeometry geometry, int axis, int reduction, size_t num_updates) { - + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = blockDim.x * gridDim.x; @@ -76,7 +79,7 @@ __global__ void scatter_kernel( // 2. 获取 updates 中的值 int64_t upd_offset = coords_to_offset(geometry.ndim, coords, geometry.updates_strides); T upd_val = updates[upd_offset]; - + // 3. 
获取对应的 indices 值 (使用 indices_strides) int64_t idx_offset = coords_to_offset(geometry.ndim, coords, geometry.indices_strides); IdxT idx_val = indices[idx_offset]; @@ -100,4 +103,4 @@ __global__ void scatter_kernel( } // namespace op::scatter::moore -#endif // __SCATTER_MOORE_KERNEL_H__ \ No newline at end of file +#endif // __SCATTER_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu index 6d8836de7..011a0f2bf 100644 --- a/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu +++ b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu @@ -1,8 +1,8 @@ -#include "scatter_nvidia.cuh" -#include "../cuda/kernel.cuh" #include "../../../handle.h" -#include +#include "../cuda/kernel.cuh" +#include "scatter_nvidia.cuh" #include +#include #include namespace op::scatter::nvidia { @@ -14,32 +14,36 @@ struct ScatterNvidiaOpaque { op::scatter::cuda::TensorGeometry geometry; size_t input_bytes; - ScatterNvidiaOpaque(const infiniopTensorDescriptor_t updates_desc, + ScatterNvidiaOpaque(const infiniopTensorDescriptor_t updates_desc, const infiniopTensorDescriptor_t indices_desc, const infiniopTensorDescriptor_t output_desc) { - + geometry.ndim = static_cast(updates_desc->ndim()); - + // 计算 Input 字节数 size_t total_elements = 1; - for(size_t i=0; indim(); ++i) { + for (size_t i = 0; i < output_desc->ndim(); ++i) { total_elements *= output_desc->shape()[i]; } - - size_t dt_size = 0; - if (output_desc->dtype() == INFINI_DTYPE_F32) dt_size = 4; - else if (output_desc->dtype() == INFINI_DTYPE_F64) dt_size = 8; - else dt_size = 2; // f16/bf16 - + + size_t dt_size = 0; + if (output_desc->dtype() == INFINI_DTYPE_F32) { + dt_size = 4; + } else if (output_desc->dtype() == INFINI_DTYPE_F64) { + dt_size = 8; + } else { + dt_size = 2; // f16/bf16 + } + input_bytes = total_elements * dt_size; - + // 填充 Geometry int ndim = geometry.ndim; - for(int i=0; ishape()[i]; geometry.updates_strides[i] = updates_desc->strides()[i]; 
geometry.output_strides[i] = output_desc->strides()[i]; - geometry.indices_strides[i] = indices_desc->strides()[i]; + geometry.indices_strides[i] = indices_desc->strides()[i]; } } }; @@ -48,8 +52,10 @@ struct Descriptor::Opaque : public ScatterNvidiaOpaque { using ScatterNvidiaOpaque::ScatterNvidiaOpaque; }; -Descriptor::~Descriptor() { - if (_opaque) delete _opaque; +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } } // ================================================================== @@ -57,39 +63,40 @@ Descriptor::~Descriptor() { // ================================================================== template void launch_kernel( - void *output, - const void *updates, + void *output, + const void *updates, const void *indices, - const ScatterNvidiaOpaque* opaque, - const ScatterInfo& info, + const ScatterNvidiaOpaque *opaque, + const ScatterInfo &info, void *stream) { auto out_ptr = reinterpret_cast(output); auto upd_ptr = reinterpret_cast(updates); auto idx_ptr = reinterpret_cast(indices); auto cuda_stream = reinterpret_cast(stream); - + size_t num_updates = 1; - for(int i=0; igeometry.ndim; ++i) { + for (int i = 0; i < opaque->geometry.ndim; ++i) { num_updates *= opaque->geometry.updates_shape[i]; } - - if (num_updates == 0) return; + + if (num_updates == 0) { + return; + } size_t block_size = 256; size_t grid_size = (num_updates + block_size - 1) / block_size; - grid_size = std::min(grid_size, static_cast(2147483647)); + grid_size = std::min(grid_size, static_cast(2147483647)); op::scatter::cuda::scatter_kernel <<>>( - out_ptr, - upd_ptr, - idx_ptr, - opaque->geometry, - info.axis(), - info.reduction(), - num_updates - ); + out_ptr, + upd_ptr, + idx_ptr, + opaque->geometry, + info.axis(), + info.reduction(), + num_updates); } // ================================================================== @@ -97,18 +104,20 @@ void launch_kernel( // ================================================================== infiniStatus_t Descriptor::create( 
infiniopHandle_t handle, Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t indices_desc, infiniopTensorDescriptor_t updates_desc, - int axis, + int axis, int reduction) { auto info_result = ScatterInfo::create(out_desc, input_desc, indices_desc, updates_desc, axis, reduction); - if (!info_result) return info_result.status(); - + if (!info_result) { + return info_result.status(); + } + if (out_desc->ndim() > op::scatter::cuda::MAX_DIMS) { - return INFINI_STATUS_BAD_TENSOR_SHAPE; + return INFINI_STATUS_BAD_TENSOR_SHAPE; } // 传入 indices_desc @@ -123,11 +132,11 @@ infiniStatus_t Descriptor::create( // Calculate Dispatch // ================================================================== infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, + void *workspace, + size_t workspace_size, void *output, - const void *input, - const void *indices, + const void *input, + const void *indices, const void *updates, void *stream) const { @@ -182,4 +191,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::scatter::nvidia \ No newline at end of file +} // namespace op::scatter::nvidia diff --git a/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh index 448321cb2..4e133bd99 100644 --- a/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh +++ b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh @@ -5,4 +5,4 @@ DESCRIPTOR(nvidia) -#endif // __SCATTER_NVIDIA_CUH__ \ No newline at end of file +#endif // __SCATTER_NVIDIA_CUH__ diff --git a/src/infiniop/ops/scatter/operator.cc b/src/infiniop/ops/scatter/operator.cc index 4236100b0..de9a10e64 100644 --- a/src/infiniop/ops/scatter/operator.cc +++ b/src/infiniop/ops/scatter/operator.cc @@ -23,7 +23,7 @@ extern "C" { // 
======================================================================= // 1. 创建算子描述符 // ======================================================================= -__C infiniStatus_t infiniopCreateScatterDescriptor( +__INFINI_C infiniStatus_t infiniopCreateScatterDescriptor( infiniopHandle_t handle, infiniopScatterDescriptor_t *desc_ptr, infiniopTensorDescriptor_t output, @@ -33,82 +33,82 @@ __C infiniStatus_t infiniopCreateScatterDescriptor( int axis, int reduction) { - #define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::scatter::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output, \ - input, \ - indices, \ - updates, \ - axis, \ - reduction) +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::scatter::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output, \ + input, \ + indices, \ + updates, \ + axis, \ + reduction) switch (handle->device) { - #ifdef ENABLE_CPU_API +#ifdef ENABLE_CPU_API CREATE(INFINI_DEVICE_CPU, cpu); - #endif - #ifdef ENABLE_NVIDIA_API +#endif +#ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, nvidia); - #endif - #ifdef ENABLE_ILUVATAR_API +#endif +#ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, nvidia); - #endif - #ifdef ENABLE_QY_API +#endif +#ifdef ENABLE_QY_API CREATE(INFINI_DEVICE_QY, nvidia); - #endif - #ifdef ENABLE_METAX_API +#endif +#ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax); - #endif - #ifdef ENABLE_MOORE_API +#endif +#ifdef ENABLE_MOORE_API CREATE(INFINI_DEVICE_MOORE, moore); - #endif +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef CREATE +#undef CREATE } // ======================================================================= // 2. 
获取 Workspace 大小 // ======================================================================= -__C infiniStatus_t infiniopGetScatterWorkspaceSize(infiniopScatterDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetScatterWorkspaceSize(infiniopScatterDescriptor_t desc, size_t *size) { - #define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS switch (desc->device_type) { - #ifdef ENABLE_CPU_API +#ifdef ENABLE_CPU_API GET(INFINI_DEVICE_CPU, cpu); - #endif - #ifdef ENABLE_NVIDIA_API +#endif +#ifdef ENABLE_NVIDIA_API GET(INFINI_DEVICE_NVIDIA, nvidia); - #endif - #ifdef ENABLE_ILUVATAR_API +#endif +#ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, nvidia); - #endif - #ifdef ENABLE_QY_API +#endif +#ifdef ENABLE_QY_API GET(INFINI_DEVICE_QY, nvidia); - #endif - #ifdef ENABLE_METAX_API +#endif +#ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax); - #endif - #ifdef ENABLE_MOORE_API +#endif +#ifdef ENABLE_MOORE_API GET(INFINI_DEVICE_MOORE, moore); - #endif +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef GET +#undef GET } // ======================================================================= // 3. 
执行计算 (Calculate) // ======================================================================= -__C infiniStatus_t infiniopScatter( +__INFINI_C infiniStatus_t infiniopScatter( infiniopScatterDescriptor_t desc, void *workspace, size_t workspace_size, @@ -118,69 +118,69 @@ __C infiniStatus_t infiniopScatter( const void *updates, void *stream) { - #define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, output, input, indices, updates, stream) +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, indices, updates, stream) switch (desc->device_type) { - #ifdef ENABLE_CPU_API +#ifdef ENABLE_CPU_API CALCULATE(INFINI_DEVICE_CPU, cpu); - #endif - #ifdef ENABLE_NVIDIA_API +#endif +#ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); - #endif - #ifdef ENABLE_ILUVATAR_API +#endif +#ifdef ENABLE_ILUVATAR_API CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); - #endif - #ifdef ENABLE_QY_API +#endif +#ifdef ENABLE_QY_API CALCULATE(INFINI_DEVICE_QY, nvidia); - #endif - #ifdef ENABLE_METAX_API +#endif +#ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax); - #endif - #ifdef ENABLE_MOORE_API +#endif +#ifdef ENABLE_MOORE_API CALCULATE(INFINI_DEVICE_MOORE, moore); - #endif +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef CALCULATE +#undef CALCULATE } // ======================================================================= // 4. 
销毁描述符 // ======================================================================= -__C infiniStatus_t infiniopDestroyScatterDescriptor(infiniopScatterDescriptor_t desc) { +__INFINI_C infiniStatus_t infiniopDestroyScatterDescriptor(infiniopScatterDescriptor_t desc) { - #define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS switch (desc->device_type) { - #ifdef ENABLE_CPU_API +#ifdef ENABLE_CPU_API DELETE(INFINI_DEVICE_CPU, cpu); - #endif - #ifdef ENABLE_NVIDIA_API +#endif +#ifdef ENABLE_NVIDIA_API DELETE(INFINI_DEVICE_NVIDIA, nvidia); - #endif - #ifdef ENABLE_ILUVATAR_API +#endif +#ifdef ENABLE_ILUVATAR_API DELETE(INFINI_DEVICE_ILUVATAR, nvidia); - #endif - #ifdef ENABLE_QY_API +#endif +#ifdef ENABLE_QY_API DELETE(INFINI_DEVICE_QY, nvidia); - #endif - #ifdef ENABLE_METAX_API +#endif +#ifdef ENABLE_METAX_API DELETE(INFINI_DEVICE_METAX, metax); - #endif - #ifdef ENABLE_MOORE_API +#endif +#ifdef ENABLE_MOORE_API DELETE(INFINI_DEVICE_MOORE, moore); - #endif +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef DELETE +#undef DELETE } -} // extern "C" \ No newline at end of file +} // extern "C" diff --git a/src/infiniop/ops/scatter/scatter.h b/src/infiniop/ops/scatter/scatter.h index 8cf6c239d..f662f112d 100644 --- a/src/infiniop/ops/scatter/scatter.h +++ b/src/infiniop/ops/scatter/scatter.h @@ -2,52 +2,52 @@ #define __SCATTER_H__ #include "../../operator.h" -#include "info.h" +#include "info.h" // 宏定义:用于生成不同命名空间下的 Descriptor 类 -#define DESCRIPTOR(NAMESPACE) \ - namespace op::scatter::NAMESPACE { \ - class Descriptor final : public InfiniopDescriptor { \ - struct Opaque; \ - Opaque *_opaque; \ - ScatterInfo _info; \ - size_t _workspace_size; \ - \ - Descriptor( \ - Opaque *opaque, \ - ScatterInfo info, \ - size_t workspace_size, \ - infiniDevice_t device_type, \ - int 
device_id) \ - : InfiniopDescriptor{device_type, device_id}, \ - _opaque(opaque), \ - _info(info), \ - _workspace_size(workspace_size) {} \ - \ - public: \ - ~Descriptor(); \ - \ - size_t workspaceSize() const { return _workspace_size; } \ - \ - static infiniStatus_t create( \ - infiniopHandle_t handle, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t output, \ - infiniopTensorDescriptor_t input, \ - infiniopTensorDescriptor_t indices, \ - infiniopTensorDescriptor_t updates, \ - int axis, \ - int reduction); \ - \ - infiniStatus_t calculate( \ - void *workspace, \ - size_t workspace_size, \ - void *output, \ - const void *input, \ - const void *indices, \ - const void *updates, \ - void *stream) const; \ - }; \ +#define DESCRIPTOR(NAMESPACE) \ + namespace op::scatter::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + ScatterInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + ScatterInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output, \ + infiniopTensorDescriptor_t input, \ + infiniopTensorDescriptor_t indices, \ + infiniopTensorDescriptor_t updates, \ + int axis, \ + int reduction); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + const void *indices, \ + const void *updates, \ + void *stream) const; \ + }; \ } -#endif // __SCATTER_H__ \ No newline at end of file +#endif // __SCATTER_H__