From fc2242ed583b2368f39973d7fcc095fdee29d67d Mon Sep 17 00:00:00 2001 From: yudeng <2272776615@qq.com> Date: Wed, 18 Mar 2026 07:11:23 +0000 Subject: [PATCH 1/2] issue/1031 merge T1-1-8 --- include/infinicore/ops.hpp | 2 + .../infinicore/ops/adaptive_avg_pool3d.hpp | 16 ++ include/infinicore/ops/addr.hpp | 16 ++ include/infinicore/ops/argwhere.hpp | 14 ++ include/infinicore/ops/asin.hpp | 18 ++ include/infinicore/ops/fmin.hpp | 18 ++ include/infiniop.h | 5 +- include/infiniop/ops/adaptive_avg_pool3d.h | 30 +++ include/infiniop/ops/addr.h | 30 +++ include/infiniop/ops/argwhere.h | 29 +++ include/infiniop/ops/asin.h | 24 ++ include/infiniop/ops/fmin.h | 28 +++ python/infinicore/__init__.py | 9 + python/infinicore/nn/functional/__init__.py | 3 + .../nn/functional/adaptive_avg_pool3d.py | 16 ++ python/infinicore/ops/addr.py | 28 +++ python/infinicore/ops/argwhere.py | 6 + python/infinicore/ops/asin.py | 11 + python/infinicore/ops/fmin.py | 10 + .../adaptive_avg_pool3d.cc | 36 +++ .../adaptive_avg_pool3d_infiniop.cc | 62 +++++ src/infinicore/ops/addr/addr.cc | 32 +++ src/infinicore/ops/addr/addr_infiniop.cc | 53 +++++ src/infinicore/ops/argwhere/argwhere.cc | 33 +++ .../ops/argwhere/argwhere_infiniop.cc | 52 ++++ src/infinicore/ops/asin/asin.cc | 36 +++ src/infinicore/ops/asin/asin_infiniop.cc | 52 ++++ src/infinicore/ops/fmin/fmin.cc | 27 +++ src/infinicore/ops/fmin/fmin_infiniop.cc | 52 ++++ src/infinicore/pybind11/ops.hpp | 12 +- .../pybind11/ops/adaptive_avg_pool3d.hpp | 22 ++ src/infinicore/pybind11/ops/addr.hpp | 29 +++ src/infinicore/pybind11/ops/argwhere.hpp | 13 + src/infinicore/pybind11/ops/asin.hpp | 24 ++ src/infinicore/pybind11/ops/fmin.hpp | 26 ++ .../adaptive_avg_pool3d/adaptive_avg_pool3d.h | 90 +++++++ .../cpu/adaptive_avg_pool3d_cpu.cc | 113 +++++++++ .../cpu/adaptive_avg_pool3d_cpu.h | 8 + .../ops/adaptive_avg_pool3d/cuda/kernel.cuh | 44 ++++ .../metax/adaptive_avg_pool3d_metax.h | 8 + .../metax/adaptive_avg_pool3d_metax.maca | 88 +++++++ .../moore/adaptive_avg_pool3d_kernel.h | 42 ++++ .../moore/adaptive_avg_pool3d_moore.h | 7 + .../moore/adaptive_avg_pool3d_moore.mu | 89 +++++++ .../nvidia/adaptive_avg_pool3d_nvidia.cu | 93 ++++++++ .../nvidia/adaptive_avg_pool3d_nvidia.cuh | 8 + .../ops/adaptive_avg_pool3d/operator.cc | 216 +++++++++++++++++ src/infiniop/ops/addr/addr.h | 95 ++++++++ src/infiniop/ops/addr/cpu/addr_cpu.cc | 93 ++++++++ src/infiniop/ops/addr/cpu/addr_cpu.h | 7 + src/infiniop/ops/addr/cuda/kernel.cu | 38 +++ src/infiniop/ops/addr/metax/addr_metax.h | 8 + src/infiniop/ops/addr/metax/addr_nvidia.maca | 108 +++++++++ src/infiniop/ops/addr/moore/addr_kernel.h | 36 +++ src/infiniop/ops/addr/moore/addr_moore.h | 8 + src/infiniop/ops/addr/moore/addr_moore.mu | 108 +++++++++ src/infiniop/ops/addr/nvidia/addr_nvidia.cu | 114 +++++++++ src/infiniop/ops/addr/nvidia/addr_nvidia.cuh | 7 + src/infiniop/ops/addr/operator.cc | 222 ++++++++++++++++++ src/infiniop/ops/argwhere/argwhere.h | 75 ++++++ src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc | 80 +++++++ src/infiniop/ops/argwhere/cpu/argwhere_cpu.h | 8 + src/infiniop/ops/argwhere/cuda/kernel.cuh | 215 +++++++++++++++++ .../ops/argwhere/metax/argwhere_metax.h | 8 + .../ops/argwhere/metax/argwhere_metax.maca | 94 ++++++++ .../ops/argwhere/moore/argwhere_kernel.h | 91 +++++++ .../ops/argwhere/moore/argwhere_moore.h | 8 + .../ops/argwhere/moore/argwhere_moore.mu | 107 +++++++++ .../ops/argwhere/nvidia/argwhere_nvidia.cu | 107 +++++++++ .../ops/argwhere/nvidia/argwhere_nvidia.cuh | 8 + src/infiniop/ops/argwhere/operator.cc | 
200 ++++++++++++++++ src/infiniop/ops/asin/cpu/asin_cpu.cc | 52 ++++ src/infiniop/ops/asin/cpu/asin_cpu.h | 21 ++ src/infiniop/ops/asin/cuda/kernel.cuh | 42 ++++ src/infiniop/ops/asin/metax/asin_metax.h | 8 + src/infiniop/ops/asin/metax/asin_metax.maca | 61 +++++ src/infiniop/ops/asin/moore/asin_moore.h | 8 + src/infiniop/ops/asin/moore/asin_moore.mu | 60 +++++ .../ops/asin/moore/asin_moore_kernel.h | 42 ++++ src/infiniop/ops/asin/nvidia/asin_nvidia.cu | 61 +++++ src/infiniop/ops/asin/nvidia/asin_nvidia.cuh | 8 + src/infiniop/ops/asin/operator.cc | 157 +++++++++++++ src/infiniop/ops/fmin/cpu/fmin_cpu.cc | 58 +++++ src/infiniop/ops/fmin/cpu/fmin_cpu.h | 26 ++ src/infiniop/ops/fmin/cuda/kernel.cuh | 33 +++ src/infiniop/ops/fmin/metax/fmin_metax.h | 8 + src/infiniop/ops/fmin/metax/fmin_metax.maca | 69 ++++++ src/infiniop/ops/fmin/moore/fmin_moore.h | 8 + src/infiniop/ops/fmin/moore/fmin_moore.mu | 66 ++++++ .../ops/fmin/moore/fmin_moore_kernel.h | 27 +++ src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu | 66 ++++++ src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh | 8 + src/infiniop/ops/fmin/operator.cc | 202 ++++++++++++++++ test/infinicore/ops/adaptive_avg_pool3d.py | 6 +- test/infinicore/ops/addr.py | 6 +- test/infinicore/ops/argwhere.py | 6 +- test/infinicore/ops/asin.py | 6 +- test/infinicore/ops/fmin.py | 6 +- 98 files changed, 4629 insertions(+), 17 deletions(-) create mode 100644 include/infinicore/ops/adaptive_avg_pool3d.hpp create mode 100644 include/infinicore/ops/addr.hpp create mode 100644 include/infinicore/ops/argwhere.hpp create mode 100644 include/infinicore/ops/asin.hpp create mode 100644 include/infinicore/ops/fmin.hpp create mode 100644 include/infiniop/ops/adaptive_avg_pool3d.h create mode 100644 include/infiniop/ops/addr.h create mode 100644 include/infiniop/ops/argwhere.h create mode 100644 include/infiniop/ops/asin.h create mode 100644 include/infiniop/ops/fmin.h create mode 100644 python/infinicore/nn/functional/adaptive_avg_pool3d.py create mode 100644 python/infinicore/ops/addr.py create mode 100644 python/infinicore/ops/argwhere.py create mode 100644 python/infinicore/ops/asin.py create mode 100644 python/infinicore/ops/fmin.py create mode 100644 src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.cc create mode 100644 src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d_infiniop.cc create mode 100644 src/infinicore/ops/addr/addr.cc create mode 100644 src/infinicore/ops/addr/addr_infiniop.cc create mode 100644 src/infinicore/ops/argwhere/argwhere.cc create mode 100644 src/infinicore/ops/argwhere/argwhere_infiniop.cc create mode 100644 src/infinicore/ops/asin/asin.cc create mode 100644 src/infinicore/ops/asin/asin_infiniop.cc create mode 100644 src/infinicore/ops/fmin/fmin.cc create mode 100644 src/infinicore/ops/fmin/fmin_infiniop.cc create mode 100644 src/infinicore/pybind11/ops/adaptive_avg_pool3d.hpp create mode 100644 src/infinicore/pybind11/ops/addr.hpp create mode 100644 src/infinicore/pybind11/ops/argwhere.hpp create mode 100644 src/infinicore/pybind11/ops/asin.hpp create mode 100644 src/infinicore/pybind11/ops/fmin.hpp create mode 100644 src/infiniop/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.h create mode 100644 src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.cc create mode 100644 src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.h create mode 100644 src/infiniop/ops/adaptive_avg_pool3d/cuda/kernel.cuh create mode 100644 src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.h create mode 100644 
src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.maca create mode 100644 src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_kernel.h create mode 100644 src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.h create mode 100644 src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.mu create mode 100644 src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cu create mode 100644 src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cuh create mode 100644 src/infiniop/ops/adaptive_avg_pool3d/operator.cc create mode 100644 src/infiniop/ops/addr/addr.h create mode 100644 src/infiniop/ops/addr/cpu/addr_cpu.cc create mode 100644 src/infiniop/ops/addr/cpu/addr_cpu.h create mode 100644 src/infiniop/ops/addr/cuda/kernel.cu create mode 100644 src/infiniop/ops/addr/metax/addr_metax.h create mode 100644 src/infiniop/ops/addr/metax/addr_nvidia.maca create mode 100644 src/infiniop/ops/addr/moore/addr_kernel.h create mode 100644 src/infiniop/ops/addr/moore/addr_moore.h create mode 100644 src/infiniop/ops/addr/moore/addr_moore.mu create mode 100644 src/infiniop/ops/addr/nvidia/addr_nvidia.cu create mode 100644 src/infiniop/ops/addr/nvidia/addr_nvidia.cuh create mode 100644 src/infiniop/ops/addr/operator.cc create mode 100644 src/infiniop/ops/argwhere/argwhere.h create mode 100644 src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc create mode 100644 src/infiniop/ops/argwhere/cpu/argwhere_cpu.h create mode 100644 src/infiniop/ops/argwhere/cuda/kernel.cuh create mode 100644 src/infiniop/ops/argwhere/metax/argwhere_metax.h create mode 100644 src/infiniop/ops/argwhere/metax/argwhere_metax.maca create mode 100644 src/infiniop/ops/argwhere/moore/argwhere_kernel.h create mode 100644 src/infiniop/ops/argwhere/moore/argwhere_moore.h create mode 100644 src/infiniop/ops/argwhere/moore/argwhere_moore.mu create mode 100644 src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cu create mode 100644 src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cuh create mode 100644 src/infiniop/ops/argwhere/operator.cc create mode 100644 src/infiniop/ops/asin/cpu/asin_cpu.cc create mode 100644 src/infiniop/ops/asin/cpu/asin_cpu.h create mode 100644 src/infiniop/ops/asin/cuda/kernel.cuh create mode 100644 src/infiniop/ops/asin/metax/asin_metax.h create mode 100644 src/infiniop/ops/asin/metax/asin_metax.maca create mode 100644 src/infiniop/ops/asin/moore/asin_moore.h create mode 100644 src/infiniop/ops/asin/moore/asin_moore.mu create mode 100644 src/infiniop/ops/asin/moore/asin_moore_kernel.h create mode 100644 src/infiniop/ops/asin/nvidia/asin_nvidia.cu create mode 100644 src/infiniop/ops/asin/nvidia/asin_nvidia.cuh create mode 100644 src/infiniop/ops/asin/operator.cc create mode 100644 src/infiniop/ops/fmin/cpu/fmin_cpu.cc create mode 100644 src/infiniop/ops/fmin/cpu/fmin_cpu.h create mode 100644 src/infiniop/ops/fmin/cuda/kernel.cuh create mode 100644 src/infiniop/ops/fmin/metax/fmin_metax.h create mode 100644 src/infiniop/ops/fmin/metax/fmin_metax.maca create mode 100644 src/infiniop/ops/fmin/moore/fmin_moore.h create mode 100644 src/infiniop/ops/fmin/moore/fmin_moore.mu create mode 100644 src/infiniop/ops/fmin/moore/fmin_moore_kernel.h create mode 100644 src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu create mode 100644 src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh create mode 100644 src/infiniop/ops/fmin/operator.cc diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp index 78329a6cd..e9af0357e 100644 --- a/include/infinicore/ops.hpp 
+++ b/include/infinicore/ops.hpp @@ -2,6 +2,7 @@ #include "ops/adaptive_max_pool1d.hpp" #include "ops/add.hpp" +#include "ops/asin.hpp" #include "ops/add_rms_norm.hpp" #include "ops/addcmul.hpp" #include "ops/asinh.hpp" @@ -20,6 +21,7 @@ #include "ops/hardswish.hpp" #include "ops/hardtanh.hpp" #include "ops/kv_caching.hpp" +#include "ops/fmin.hpp" #include "ops/matmul.hpp" #include "ops/ones.hpp" #include "ops/paged_attention.hpp" diff --git a/include/infinicore/ops/adaptive_avg_pool3d.hpp b/include/infinicore/ops/adaptive_avg_pool3d.hpp new file mode 100644 index 000000000..ab1dc9253 --- /dev/null +++ b/include/infinicore/ops/adaptive_avg_pool3d.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class AdaptiveAvgPool3D { +public: + using schema = void (*)(Tensor, Tensor); + static void execute(Tensor y, Tensor x); + static common::OpDispatcher &dispatcher(); +}; + +Tensor adaptive_avg_pool3d(Tensor x, std::vector output_size); +void adaptive_avg_pool3d_(Tensor y, Tensor x); +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/addr.hpp b/include/infinicore/ops/addr.hpp new file mode 100644 index 000000000..92d97fae8 --- /dev/null +++ b/include/infinicore/ops/addr.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Addr { +public: + using schema = void (*)(Tensor, Tensor, Tensor, Tensor, float, float); + static void execute(Tensor out, Tensor input, Tensor vec1, Tensor vec2, float beta = 1.0f, float alpha = 1.0f); + static common::OpDispatcher &dispatcher(); +}; + +Tensor addr(Tensor input, Tensor vec1, Tensor vec2, float beta = 1.0f, float alpha = 1.0f); +void addr_(Tensor out, Tensor input, Tensor vec1, Tensor vec2, float beta = 1.0f, float alpha = 1.0f); +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/argwhere.hpp b/include/infinicore/ops/argwhere.hpp new file mode 100644 index 000000000..f0c629adb --- /dev/null +++ b/include/infinicore/ops/argwhere.hpp @@ -0,0 +1,14 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Argwhere { +public: + using schema = void (*)(void **, size_t *, Tensor); + static void execute(void **, size_t *count, Tensor x); + static common::OpDispatcher &dispatcher(); +}; +Tensor argwhere(Tensor x); +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/asin.hpp b/include/infinicore/ops/asin.hpp new file mode 100644 index 000000000..e8667b037 --- /dev/null +++ b/include/infinicore/ops/asin.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Asin { +public: + using schema = void (*)(Tensor, Tensor); + static void execute(Tensor output, Tensor input); + static common::OpDispatcher &dispatcher(); +}; + +Tensor asin(Tensor input); +void asin_(Tensor output, Tensor input); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/fmin.hpp b/include/infinicore/ops/fmin.hpp new file mode 100644 index 000000000..4ea00787f --- /dev/null +++ b/include/infinicore/ops/fmin.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Fmin { +public: + using schema = void (*)(Tensor, Tensor, Tensor); + static void execute(Tensor c, Tensor a, Tensor b); + static 
common::OpDispatcher &dispatcher(); +}; + +Tensor fmin(Tensor a, Tensor b); +void fmin_(Tensor c, Tensor a, Tensor b); + +} // namespace infinicore::op diff --git a/include/infiniop.h b/include/infiniop.h index 2b30ff4ae..ada498304 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -3,12 +3,15 @@ #include "infiniop/handle.h" #include "infiniop/ops/adaptive_max_pool1d.h" +#include "infiniop/ops/adaptive_avg_pool3d.h" #include "infiniop/ops/add.h" +#include "infiniop/ops/addr.h" #include "infiniop/ops/add_rms_norm.h" #include "infiniop/ops/addcmul.h" #include "infiniop/ops/all.h" #include "infiniop/ops/asinh.h" #include "infiniop/ops/atanh.h" +#include "infiniop/ops/asin.h" #include "infiniop/ops/attention.h" #include "infiniop/ops/avg_pool1d.h" #include "infiniop/ops/binary_cross_entropy_with_logits.h" @@ -19,11 +22,11 @@ #include "infiniop/ops/cross_entropy.h" #include "infiniop/ops/dequant/per_tensor_dequant_int8.h" #include "infiniop/ops/dequantize_awq.h" +#include "infiniop/ops/dequantize_gptq.h" #include "infiniop/ops/embedding.h" #include "infiniop/ops/equal.h" #include "infiniop/ops/flash_attention.h" #include "infiniop/ops/fmod.h" -#include "infiniop/ops/dequantize_gptq.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" #include "infiniop/ops/hardswish.h" diff --git a/include/infiniop/ops/adaptive_avg_pool3d.h b/include/infiniop/ops/adaptive_avg_pool3d.h new file mode 100644 index 000000000..178a509e1 --- /dev/null +++ b/include/infiniop/ops/adaptive_avg_pool3d.h @@ -0,0 +1,30 @@ +#ifndef INFINIOP_ADAPTIVE_AVG_POOL3D_H_ +#define INFINIOP_ADAPTIVE_AVG_POOL3D_H_ + +#include "../operator_descriptor.h" +#include + +typedef struct InfiniopDescriptor *infiniopAdaptiveAvgPool3DDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAdaptiveAvgPool3DDescriptor( + infiniopHandle_t handle, + infiniopAdaptiveAvgPool3DDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + size_t *output_size); + +__C __export infiniStatus_t infiniopGetAdaptiveAvgPool3DWorkspaceSize( + infiniopAdaptiveAvgPool3DDescriptor_t desc, + size_t *size); + +__C __export infiniStatus_t infiniopAdaptiveAvgPool3D( + infiniopAdaptiveAvgPool3DDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAdaptiveAvgPool3DDescriptor(infiniopAdaptiveAvgPool3DDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/addr.h b/include/infiniop/ops/addr.h new file mode 100644 index 000000000..6cae92d97 --- /dev/null +++ b/include/infiniop/ops/addr.h @@ -0,0 +1,30 @@ +#ifndef __INFINIOP_ADDR_API_H__ +#define __INFINIOP_ADDR_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAddrDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAddrDescriptor(infiniopHandle_t handle, + infiniopAddrDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t vec1, + infiniopTensorDescriptor_t vec2, + float beta, + float alpha); + +__C __export infiniStatus_t infiniopGetAddrWorkspaceSize(infiniopAddrDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAddr(infiniopAddrDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *input, + const void *vec1, + const void *vec2, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAddrDescriptor(infiniopAddrDescriptor_t desc); + +#endif \ No newline at end 
of file
diff --git a/include/infiniop/ops/argwhere.h b/include/infiniop/ops/argwhere.h
new file mode 100644
index 000000000..653cbba94
--- /dev/null
+++ b/include/infiniop/ops/argwhere.h
@@ -0,0 +1,29 @@
+#ifndef __INFINIOP_ARGWHERE_API_H__
+#define __INFINIOP_ARGWHERE_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopArgwhereDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateArgwhereDescriptor(
+    infiniopHandle_t handle,
+    infiniopArgwhereDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t input_desc);
+
+__C __export infiniStatus_t infiniopGetArgwhereWorkspaceSize(
+    infiniopArgwhereDescriptor_t desc,
+    size_t *size);
+
+__C __export infiniStatus_t infiniopArgwhere(
+    infiniopArgwhereDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void **output,
+    size_t *count,
+    const void *input,
+    void *stream);
+
+__C __export infiniStatus_t infiniopDestroyArgwhereDescriptor(
+    infiniopArgwhereDescriptor_t desc);
+
+#endif // __INFINIOP_ARGWHERE_API_H__
\ No newline at end of file
diff --git a/include/infiniop/ops/asin.h b/include/infiniop/ops/asin.h
new file mode 100644
index 000000000..ee40514d0
--- /dev/null
+++ b/include/infiniop/ops/asin.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_ASIN_API_H__
+#define __INFINIOP_ASIN_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopAsinDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateAsinDescriptor(infiniopHandle_t handle,
+                                                         infiniopAsinDescriptor_t *desc_ptr,
+                                                         infiniopTensorDescriptor_t output,
+                                                         infiniopTensorDescriptor_t input);
+
+__C __export infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopAsin(infiniopAsinDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
+                                         void *output,
+                                         const void *input,
+                                         void *stream);
+
+__C __export infiniStatus_t infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/fmin.h b/include/infiniop/ops/fmin.h
new file mode 100644
index 000000000..274e06a46
--- /dev/null
+++ b/include/infiniop/ops/fmin.h
@@ -0,0 +1,26 @@
+#ifndef __INFINIOP_FMIN_H__
+#define __INFINIOP_FMIN_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopFminDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateFminDescriptor(infiniopHandle_t handle,
+                                                         infiniopFminDescriptor_t *desc_ptr,
+                                                         infiniopTensorDescriptor_t c,
+                                                         infiniopTensorDescriptor_t a,
+                                                         infiniopTensorDescriptor_t b);
+
+__C __export infiniStatus_t infiniopGetFminWorkspaceSize(infiniopFminDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopFmin(infiniopFminDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
+                                         void *c,
+                                         const void *a,
+                                         const void *b,
+                                         void *stream);
+
+__C __export infiniStatus_t infiniopDestroyFminDescriptor(infiniopFminDescriptor_t desc);
+
+#endif
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index a895b0426..06ee97825 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -53,6 +53,9 @@
 from infinicore.ops.all import all
 from infinicore.ops.asinh import asinh
 from infinicore.ops.atanh import atanh
+from infinicore.ops.addr import addr
+from infinicore.ops.asin import asin
+from infinicore.ops.add_rms_norm import add_rms_norm, add_rms_norm_
 from infinicore.ops.attention import attention
 from infinicore.ops.baddbmm import baddbmm
 from infinicore.ops.bilinear import bilinear
@@ -74,6 +77,8 @@
 from infinicore.ops.paged_caching import paged_caching
 from infinicore.ops.rearrange import rearrange
 from infinicore.ops.reciprocal import reciprocal
+from infinicore.ops.argwhere import argwhere
+from infinicore.ops.fmin import fmin
 from infinicore.ops.squeeze import squeeze
 from infinicore.ops.sum import sum
 from infinicore.ops.topk import topk
@@ -143,8 +148,11 @@
     "cdist",
     "reciprocal",
     "add",
+    "addr",
     "add_rms_norm",
     "add_rms_norm_",
+    "argwhere",
+    "asin",
     "attention",
     "kv_caching",
     "asinh",
@@ -167,6 +175,7 @@
     "from_torch",
     "mha_kvcache",
     "mha_varlen",
+    "fmin",
     "paged_caching",
     "paged_attention",
     "paged_attention_prefill",
diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py
index 815eb8a2b..7bea371e7 100644
--- a/python/infinicore/nn/functional/__init__.py
+++ b/python/infinicore/nn/functional/__init__.py
@@ -14,6 +14,8 @@
 from .silu import silu
 from .silu_and_mul import silu_and_mul
 from .swiglu import swiglu
+from .adaptive_avg_pool3d import adaptive_avg_pool3d
+
 
 __all__ = [
     "adaptive_max_pool1d",
@@ -33,4 +35,5 @@
     "swiglu",
     "linear_w8a8i8",
     "silu_and_mul",
+    "adaptive_avg_pool3d",
 ]
diff --git a/python/infinicore/nn/functional/adaptive_avg_pool3d.py b/python/infinicore/nn/functional/adaptive_avg_pool3d.py
new file mode 100644
index 000000000..9f8a3a4a6
--- /dev/null
+++ b/python/infinicore/nn/functional/adaptive_avg_pool3d.py
@@ -0,0 +1,19 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+from typing import List
+
+
+def adaptive_avg_pool3d(x: Tensor, output_size: List[int] = [1, 1, 1]) -> Tensor:
+    r"""Applies a 3D adaptive average pooling over an input signal composed of several input planes.
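+
+    Note:
+        Illustrative usage, assuming ``x`` is a 5-D infinicore tensor: ``adaptive_avg_pool3d(x, [2, 2, 2])``.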
+
+    Args:
+        x (Tensor): The input tensor of shape (N, C, D, H, W)
+        output_size (List[int]): The target output size of the form (d, h, w)
+
+    Returns:
+        Tensor: The pooled output tensor
+    """
+    return Tensor(_infinicore.adaptive_avg_pool3d(x._underlying, output_size))
diff --git a/python/infinicore/ops/addr.py b/python/infinicore/ops/addr.py
new file mode 100644
index 000000000..f9a607961
--- /dev/null
+++ b/python/infinicore/ops/addr.py
@@ -0,0 +1,28 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def addr(
+    input: Tensor,
+    vec1: Tensor,
+    vec2: Tensor,
+    beta: float = 1.0,
+    alpha: float = 1.0,
+    out=None,
+) -> Tensor:
+    if out is None:
+        return Tensor(
+            _infinicore.addr(
+                input._underlying, vec1._underlying, vec2._underlying, beta, alpha
+            )
+        )
+
+    _infinicore.addr_(
+        out._underlying,
+        input._underlying,
+        vec1._underlying,
+        vec2._underlying,
+        beta,
+        alpha,
+    )
+    return out
diff --git a/python/infinicore/ops/argwhere.py b/python/infinicore/ops/argwhere.py
new file mode 100644
index 000000000..1e558c898
--- /dev/null
+++ b/python/infinicore/ops/argwhere.py
@@ -0,0 +1,6 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def argwhere(x: Tensor) -> Tensor:
+    return Tensor(_infinicore.argwhere(x._underlying))
diff --git a/python/infinicore/ops/asin.py b/python/infinicore/ops/asin.py
new file mode 100644
index 000000000..fbb230737
--- /dev/null
+++ b/python/infinicore/ops/asin.py
@@ -0,0 +1,11 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def asin(input: Tensor, *, out=None):
+    """Arcsin activation function."""
+    if out is None:
+        return Tensor(_infinicore.asin(input._underlying))
+
+    _infinicore.asin_(out._underlying, input._underlying)
+    return out
diff --git a/python/infinicore/ops/fmin.py b/python/infinicore/ops/fmin.py
new file mode 100644
index 000000000..56e274a10
--- /dev/null
+++ b/python/infinicore/ops/fmin.py
@@ -0,0 +1,10 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def fmin(input, other, *, out=None):
+    if out is None:
+        return Tensor(_infinicore.fmin(input._underlying, other._underlying))
+
+    _infinicore.fmin_(out._underlying, input._underlying, other._underlying)
+    return out
diff --git a/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.cc b/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.cc
new file mode 100644
index 000000000..61d2cecfa
--- /dev/null
+++ b/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.cc
@@ -0,0 +1,37 @@
+#include "infinicore/ops/adaptive_avg_pool3d.hpp"
+#include <stdexcept>
+#include <string>
+namespace infinicore::op {
+
+common::OpDispatcher &AdaptiveAvgPool3D::dispatcher() {
+    static common::OpDispatcher dispatcher_;
+    return dispatcher_;
+};
+
+void AdaptiveAvgPool3D::execute(Tensor y, Tensor x) {
+    auto device_type = context::getDevice().getType();
+    auto func = dispatcher().lookup(device_type);
+    if (func == nullptr) {
+        throw std::runtime_error("No AdaptiveAvgPool3D implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+    func(y, x);
+}
+
+Tensor adaptive_avg_pool3d(Tensor x, std::vector<size_t> output_size) {
+
+    // Create output tensor shape
+    Shape y_shape = x->shape();
+    y_shape[2] = output_size[0]; // D dimension
+    y_shape[3] = output_size[1]; // H dimension
+    y_shape[4] = output_size[2]; // W dimension
+
+    auto y = Tensor::empty(y_shape, x->dtype(), x->device());
+    adaptive_avg_pool3d_(y, x);
+    return y;
+}
+
+void adaptive_avg_pool3d_(Tensor y,
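+// In-place variant: writes the pooled result for x into the preallocated tensor y.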
Tensor x) { + AdaptiveAvgPool3D::execute(y, x); +} +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d_infiniop.cc b/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d_infiniop.cc new file mode 100644 index 000000000..1f4ef3211 --- /dev/null +++ b/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d_infiniop.cc @@ -0,0 +1,62 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/adaptive_avg_pool3d.hpp" +#include "infinicore/ops/common/cache.hpp" + +namespace infinicore::op::adaptive_avg_pool3d_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopAdaptiveAvgPool3DDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyAdaptiveAvgPool3DDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopAdaptiveAvgPool3DDescriptor_t desc = nullptr; + + if (!desc_opt) { + // Convert vector to array for output_size + std::vector output_size_vec = {output->size(2), output->size(3), output->size(4)}; + + INFINICORE_CHECK_ERROR(infiniopCreateAdaptiveAvgPool3DDescriptor( + context::getInfiniopHandle(output->device()), + &desc, + output->desc(), + input->desc(), + output_size_vec.data())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + // Get workspace size and allocate if needed + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetAdaptiveAvgPool3DWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR( + infiniopAdaptiveAvgPool3D( + desc, + workspace->data(), workspace_size, + output->data(), + input->data(), + context::getStream())); +} + +static bool registered = []() { + AdaptiveAvgPool3D::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::adaptive_avg_pool3d_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/addr/addr.cc b/src/infinicore/ops/addr/addr.cc new file mode 100644 index 000000000..af7493b6f --- /dev/null +++ b/src/infinicore/ops/addr/addr.cc @@ -0,0 +1,32 @@ +#include "infinicore/ops/addr.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Addr::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Addr::execute(Tensor out, Tensor input, Tensor vec1, Tensor vec2, float beta, float alpha) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, input, vec1, vec2); + infinicore::context::setDevice(out->device()); + dispatcher().lookup(out->device().getType())(out, input, vec1, vec2, beta, alpha); +} + +Tensor addr(Tensor input, Tensor vec1, Tensor vec2, float beta, float alpha) { + + size_t n = vec1->shape()[0]; + size_t m = vec2->shape()[0]; + + // Create output tensor + Tensor out = Tensor::empty({n, m}, input->dtype(), input->device()); + addr_(out, input, vec1, vec2, beta, alpha); + return out; +} + +void addr_(Tensor out, Tensor input, Tensor vec1, Tensor vec2, float beta, float alpha) { + Addr::execute(out, input, vec1, vec2, beta, alpha); +} + +} // namespace infinicore::op \ No newline at end of file diff --git 
a/src/infinicore/ops/addr/addr_infiniop.cc b/src/infinicore/ops/addr/addr_infiniop.cc new file mode 100644 index 000000000..a215a94bf --- /dev/null +++ b/src/infinicore/ops/addr/addr_infiniop.cc @@ -0,0 +1,53 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/addr.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::addr_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopAddrDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyAddrDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor out, Tensor input, Tensor vec1, Tensor vec2, float beta, float alpha) { + // Hash the inputs including beta and alpha + size_t seed = hash_combine(out, input, vec1, vec2, beta, alpha); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopAddrDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateAddrDescriptor( + context::getInfiniopHandle(out->device()), &desc, + out->desc(), input->desc(), vec1->desc(), vec2->desc(), beta, alpha)); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetAddrWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopAddr( + desc, workspace->data(), workspace_size, + out->data(), input->data(), vec1->data(), vec2->data(), context::getStream())); +} + +static bool registered = []() { + Addr::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::addr_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/argwhere/argwhere.cc b/src/infinicore/ops/argwhere/argwhere.cc new file mode 100644 index 000000000..a9d45de48 --- /dev/null +++ b/src/infinicore/ops/argwhere/argwhere.cc @@ -0,0 +1,33 @@ +#include "infinicore/ops/argwhere.hpp" +#include "../../utils.hpp" +#include "infinicore.h" +#include "infinicore/dtype.hpp" +#include "infinicore/tensor.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Argwhere::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void Argwhere::execute(void **y, size_t *count, Tensor x) { + auto device_type = context::getDevice().getType(); + + auto func = dispatcher().lookup(device_type); + if (func == nullptr) { + throw std::runtime_error("Argwhere op not implemented for device type " + std::to_string(static_cast(device_type))); + } + func(y, count, x); +} +Tensor argwhere(Tensor x) { + void *y = nullptr; + size_t count = 0; + Argwhere::execute(&y, &count, x); + auto result = Tensor::from_blob(y, Shape{count, x->ndim()}, DataType::I64, Device::cpu()); + result = result->to(x->device()); + return result; +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/argwhere/argwhere_infiniop.cc b/src/infinicore/ops/argwhere/argwhere_infiniop.cc new file mode 100644 index 000000000..bc6c919db --- /dev/null +++ b/src/infinicore/ops/argwhere/argwhere_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/argwhere.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infiniop/ops/argwhere.h" + +namespace 
infinicore::op::argwhere_impl::infiniop { +thread_local common::OpCache caches( + 100, // capacity + [](infiniopArgwhereDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyArgwhereDescriptor(desc)); + desc = nullptr; + } + }); +void calculate(void **y, size_t *count, Tensor x) { + size_t seed = hash_combine(x); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopArgwhereDescriptor_t desc = nullptr; + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateArgwhereDescriptor( + context::getInfiniopHandle(x->device()), + &desc, + x->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetArgwhereWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopArgwhere( + desc, + workspace->data(), workspace_size, + y, + count, + x->data(), + context::getStream())); +} +static bool registered = []() { + Argwhere::dispatcher().registerAll(&calculate, false); + return true; +}(); +} // namespace infinicore::op::argwhere_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/asin/asin.cc b/src/infinicore/ops/asin/asin.cc new file mode 100644 index 000000000..f3a8efb7c --- /dev/null +++ b/src/infinicore/ops/asin/asin.cc @@ -0,0 +1,36 @@ +#include "infinicore/ops/asin.hpp" + +#include "../../utils.hpp" + +#include + +namespace infinicore::op { + +common::OpDispatcher &Asin::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; +void Asin::execute(Tensor output, Tensor input) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); + infinicore::context::setDevice(output->device()); + auto device_type = output->device().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Asin implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor asin(Tensor input) { + Shape shape = input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + asin_(output, input); + return output; +} + +void asin_(Tensor output, Tensor input) { + Asin::execute(output, input); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/asin/asin_infiniop.cc b/src/infinicore/ops/asin/asin_infiniop.cc new file mode 100644 index 000000000..4dec4c220 --- /dev/null +++ b/src/infinicore/ops/asin/asin_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/asin.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::asin_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopAsinDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyAsinDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopAsinDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateAsinDescriptor( + 
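+            // Descriptor is keyed by the tensor hash and reused from the thread-local cache on later calls.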
context::getInfiniopHandle(output->device()), &desc, + output->desc(), input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetAsinWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopAsin( + desc, workspace->data(), workspace_size, + output->data(), input->data(), context::getStream())); +} + +static bool registered = []() { + Asin::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::asin_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/fmin/fmin.cc b/src/infinicore/ops/fmin/fmin.cc new file mode 100644 index 000000000..54d056c63 --- /dev/null +++ b/src/infinicore/ops/fmin/fmin.cc @@ -0,0 +1,27 @@ +#include "infinicore/ops/fmin.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Fmin::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void Fmin::execute(Tensor c, Tensor a, Tensor b) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b); + infinicore::context::setDevice(c->device()); + dispatcher().lookup(c->device().getType())(c, a, b); +} + +Tensor fmin(Tensor a, Tensor b) { + auto c = Tensor::empty(a->shape(), a->dtype(), a->device()); + fmin_(c, a, b); + return c; +} + +void fmin_(Tensor c, Tensor a, Tensor b) { + Fmin::execute(c, a, b); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/fmin/fmin_infiniop.cc b/src/infinicore/ops/fmin/fmin_infiniop.cc new file mode 100644 index 000000000..c9560479a --- /dev/null +++ b/src/infinicore/ops/fmin/fmin_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/fmin.hpp" +#include + +namespace infinicore::op::fmin_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopFminDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyFminDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor c, Tensor a, Tensor b) { + size_t seed = hash_combine(c, b, a); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopFminDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateFminDescriptor( + context::getInfiniopHandle(c->device()), &desc, + c->desc(), a->desc(), b->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetFminWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopFmin( + desc, workspace->data(), workspace_size, + c->data(), a->data(), b->data(), context::getStream())); +} + +static bool registered = []() { + Fmin::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::fmin_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index a468defd8..ec5fca334 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -3,7 +3,11 @@ #include #include "ops/adaptive_max_pool1d.hpp" +#include "ops/adaptive_avg_pool3d.hpp" #include 
"ops/add.hpp" +#include "ops/addr.hpp" +#include "ops/argwhere.hpp" +#include "ops/asin.hpp" #include "ops/add_rms_norm.hpp" #include "ops/addcmul.hpp" #include "ops/all.hpp" @@ -24,6 +28,7 @@ #include "ops/hardswish.hpp" #include "ops/hardtanh.hpp" #include "ops/kv_caching.hpp" +#include "ops/fmin.hpp" #include "ops/linear.hpp" #include "ops/linear_w8a8i8.hpp" #include "ops/matmul.hpp" @@ -53,6 +58,10 @@ namespace infinicore::ops { inline void bind(py::module &m) { bind_adaptive_max_pool1d(m); bind_add(m); + bind_adaptive_avg_pool3d(m); + bind_argwhere(m); + bind_addr(m); + bind_asin(m); bind_add_rms_norm(m); bind_attention(m); bind_asinh(m); @@ -62,6 +71,7 @@ inline void bind(py::module &m) { bind_flash_attention(m); bind_kv_caching(m); bind_fmod(m); + bind_fmin(m); bind_random_sample(m); bind_linear(m); bind_matmul(m); @@ -97,4 +107,4 @@ inline void bind(py::module &m) { bind_reciprocal(m); } -} // namespace infinicore::ops +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/adaptive_avg_pool3d.hpp b/src/infinicore/pybind11/ops/adaptive_avg_pool3d.hpp new file mode 100644 index 000000000..3b2b8fdc7 --- /dev/null +++ b/src/infinicore/pybind11/ops/adaptive_avg_pool3d.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include + +#include "infinicore/ops/adaptive_avg_pool3d.hpp" + +namespace py = pybind11; +namespace infinicore::ops { +inline void bind_adaptive_avg_pool3d(py::module &m) { + m.def("adaptive_avg_pool3d", + &op::adaptive_avg_pool3d, + py::arg("x"), + py::arg("output_size"), + R"doc( Adaptive Average Pooling 3D.)doc"); + + m.def("adaptive_avg_pool3d_", + &op::adaptive_avg_pool3d_, + py::arg("y"), + py::arg("x"), + R"doc(In-place, Adaptive Average Pooling 3D.)doc"); +} +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/addr.hpp b/src/infinicore/pybind11/ops/addr.hpp new file mode 100644 index 000000000..2777fc513 --- /dev/null +++ b/src/infinicore/pybind11/ops/addr.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include "infinicore/ops/addr.hpp" +#include + +namespace py = pybind11; +namespace infinicore::ops { +inline void bind_addr(py::module_ &m) { + m.def( + "addr", + &op::addr, + py::arg("input"), + py::arg("vec1"), + py::arg("vec2"), + py::arg("alpha"), + py::arg("beta"), + R"doc(Addr.)doc"); + m.def( + "addr_", + &op::addr_, + py::arg("out"), + py::arg("input"), + py::arg("vec1"), + py::arg("vec2"), + py::arg("beta"), + py::arg("alpha"), + R"doc(Addr.)doc"); +} +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/argwhere.hpp b/src/infinicore/pybind11/ops/argwhere.hpp new file mode 100644 index 000000000..0e6e4a4af --- /dev/null +++ b/src/infinicore/pybind11/ops/argwhere.hpp @@ -0,0 +1,13 @@ +#pragma once + +#include "infinicore/ops/argwhere.hpp" +#include +namespace py = pybind11; +namespace infinicore::ops { +inline void bind_argwhere(py::module &m) { + m.def("argwhere", + &op::argwhere, + py::arg("x"), + R"doc(Argwhere.)doc"); +} +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/asin.hpp b/src/infinicore/pybind11/ops/asin.hpp new file mode 100644 index 000000000..ebd14adbc --- /dev/null +++ b/src/infinicore/pybind11/ops/asin.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/asin.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_asin(py::module &m) { + m.def("asin", + &op::asin, + py::arg("input"), + R"doc(Arcsin activation 
function.)doc"); + + m.def("asin_", + &op::asin_, + py::arg("output"), + py::arg("input"), + R"doc(In-place arcsin activation function.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/fmin.hpp b/src/infinicore/pybind11/ops/fmin.hpp new file mode 100644 index 000000000..afdbe4b33 --- /dev/null +++ b/src/infinicore/pybind11/ops/fmin.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include + +#include "infinicore/ops/fmin.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_fmin(py::module &m) { + m.def("fmin", + &op::fmin, + py::arg("a"), + py::arg("b"), + R"doc(fmin of two tensors.)doc"); + + m.def("fmin_", + &op::fmin_, + py::arg("c"), + py::arg("a"), + py::arg("b"), + R"doc(In-place tensor fmin.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infiniop/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.h b/src/infiniop/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.h new file mode 100644 index 000000000..feea6e701 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.h @@ -0,0 +1,90 @@ +#ifndef INFINIOP_ADAPTIVE_AVG_POOL3D_DESCRIPTOR_H_ +#define INFINIOP_ADAPTIVE_AVG_POOL3D_DESCRIPTOR_H_ +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "infiniop/ops/adaptive_avg_pool3d.h" +#include + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::adaptive_avg_pool3d::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + AdaptiveAvgPool3DInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + AdaptiveAvgPool3DInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + size_t *output_size); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *y, \ + const void *x, \ + void *stream) const; \ + }; \ + } + +class AdaptiveAvgPool3DInfo { +private: + AdaptiveAvgPool3DInfo() = default; + +public: + infiniDtype_t dtype; + size_t x_d, x_h, x_w; + size_t y_d, y_h, y_w; + size_t N, C; + std::vector x_strides; + std::vector y_strides; + + static utils::Result + create( + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t *output_size) { + CHECK_OR_RETURN(x_desc != nullptr && output_size != nullptr, + INFINI_STATUS_NULL_POINTER); + + const infiniDtype_t data_type = x_desc->dtype(); + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + const size_t ndim = x_desc->ndim(); + CHECK_OR_RETURN(ndim == 5, INFINI_STATUS_BAD_TENSOR_SHAPE); + + return utils::Result(AdaptiveAvgPool3DInfo{ + data_type, + x_desc->dim(2), + x_desc->dim(3), + x_desc->dim(4), + output_size[0], + output_size[1], + output_size[2], + x_desc->dim(0), + x_desc->dim(1), + x_desc->strides(), + y_desc->strides()}); + } +}; + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.cc b/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.cc new file mode 100644 index 000000000..45b53a973 --- /dev/null +++ 
b/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.cc
@@ -0,0 +1,113 @@
+#include "adaptive_avg_pool3d_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include <array>
+namespace op::adaptive_avg_pool3d::cpu {
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    size_t *output_size) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+
+    auto info = AdaptiveAvgPool3DInfo::create(y_desc, x_desc, output_size);
+    CHECK_RESULT(info);
+
+    *desc_ptr = new Descriptor(
+        info.take(),
+        0,
+        nullptr,
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <typename Tdata>
+infiniStatus_t calculateAdaptiveAvgPool3D(
+    const AdaptiveAvgPool3DInfo &info,
+    Tdata *y,
+    const Tdata *x) {
+    std::array<size_t, 5> y_strides;
+    y_strides[4] = 1;
+    y_strides[3] = info.y_w * y_strides[4];
+    y_strides[2] = info.y_h * y_strides[3];
+    y_strides[1] = info.y_d * y_strides[2];
+    y_strides[0] = info.C * y_strides[1];
+    {
+#pragma omp parallel for collapse(5)
+        for (ptrdiff_t n = 0; n < ptrdiff_t(info.N); n++) {
+            for (ptrdiff_t c = 0; c < ptrdiff_t(info.C); c++) {
+                for (ptrdiff_t od = 0; od < ptrdiff_t(info.y_d); od++) {
+                    for (ptrdiff_t oh = 0; oh < ptrdiff_t(info.y_h); oh++) {
+                        for (ptrdiff_t ow = 0; ow < ptrdiff_t(info.y_w); ow++) {
+                            size_t x_start_d = od * info.x_d / info.y_d;
+                            size_t x_end_d = ((od + 1) * info.x_d + info.y_d - 1) / info.y_d;
+                            size_t x_start_h = oh * info.x_h / info.y_h;
+                            size_t x_end_h = ((oh + 1) * info.x_h + info.y_h - 1) / info.y_h;
+                            size_t x_start_w = ow * info.x_w / info.y_w;
+                            size_t x_end_w = ((ow + 1) * info.x_w + info.y_w - 1) / info.y_w;
+                            size_t count = (x_end_d - x_start_d) * (x_end_h - x_start_h) * (x_end_w - x_start_w);
+
+                            // Half-precision types accumulate in float and cast back
+                            if constexpr (std::is_same<Tdata, fp16_t>::value || std::is_same<Tdata, bf16_t>::value) {
+                                float sum = 0.0f;
+                                for (size_t id = x_start_d; id < x_end_d; id++) {
+                                    for (size_t ih = x_start_h; ih < x_end_h; ih++) {
+                                        for (size_t iw = x_start_w; iw < x_end_w; iw++) {
+                                            size_t x_offset = n * info.x_strides[0] + c * info.x_strides[1] + id * info.x_strides[2] + ih * info.x_strides[3] + iw * info.x_strides[4];
+                                            sum += utils::cast<float>(x[x_offset]);
+                                        }
+                                    }
+                                }
+                                size_t y_offset = n * y_strides[0] + c * y_strides[1] + od * y_strides[2] + oh * y_strides[3] + ow * y_strides[4];
+                                y[y_offset] = utils::cast<Tdata>(sum / static_cast<float>(count));
+                            } else {
+                                Tdata sum = (Tdata)0;
+                                for (size_t id = x_start_d; id < x_end_d; id++) {
+                                    for (size_t ih = x_start_h; ih < x_end_h; ih++) {
+                                        for (size_t iw = x_start_w; iw < x_end_w; iw++) {
+                                            size_t x_offset = n * info.x_strides[0] + c * info.x_strides[1] + id * info.x_strides[2] + ih * info.x_strides[3] + iw * info.x_strides[4];
+                                            sum += x[x_offset];
+                                        }
+                                    }
+                                }
+                                size_t y_offset = n * y_strides[0] + c * y_strides[1] + od * y_strides[2] + oh * y_strides[3] + ow * y_strides[4];
+                                // float and double accumulate and divide natively
+                                y[y_offset] = sum / static_cast<Tdata>(count);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) const {
+
+    switch (_info.dtype) {
+    case INFINI_DTYPE_F16:
+        return calculateAdaptiveAvgPool3D<fp16_t>(_info, reinterpret_cast<fp16_t *>(y), reinterpret_cast<const fp16_t *>(x));
+    case INFINI_DTYPE_F32:
+        return calculateAdaptiveAvgPool3D<float>(_info, reinterpret_cast<float *>(y), reinterpret_cast<const float *>(x));
+    case INFINI_DTYPE_F64:
+        return calculateAdaptiveAvgPool3D<double>(_info, reinterpret_cast<double *>(y), reinterpret_cast<const double *>(x));
+    case INFINI_DTYPE_BF16:
+        return calculateAdaptiveAvgPool3D<bf16_t>(_info, reinterpret_cast<bf16_t *>(y), reinterpret_cast<const bf16_t *>(x));
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::adaptive_avg_pool3d::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.h b/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.h
new file mode 100644
index 000000000..94753979a
--- /dev/null
+++ b/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __INFINIOP_ADAPTIVE_AVG_POOL3D_CPU_H__
+#define __INFINIOP_ADAPTIVE_AVG_POOL3D_CPU_H__
+
+#include "../adaptive_avg_pool3d.h"
+
+DESCRIPTOR(cpu)
+
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/adaptive_avg_pool3d/cuda/kernel.cuh b/src/infiniop/ops/adaptive_avg_pool3d/cuda/kernel.cuh
new file mode 100644
index 000000000..a348c5e30
--- /dev/null
+++ b/src/infiniop/ops/adaptive_avg_pool3d/cuda/kernel.cuh
@@ -0,0 +1,44 @@
+#ifndef __ADAPTIVE_AVG_POOL3D_CUDA_H__
+#define __ADAPTIVE_AVG_POOL3D_CUDA_H__
+
+#include <cstddef>
+
+template <typename T>
+__device__ void adaptiveAvgPool3DKernel(T *y, const T *x, size_t N, size_t C,
+                                        size_t x_d, size_t x_h, size_t x_w,
+                                        size_t y_d, size_t y_h, size_t y_w,
+                                        const ptrdiff_t *x_strides,
+                                        const ptrdiff_t *y_strides) {
+
+    size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t total = N * C * y_d * y_h * y_w;
+    size_t n, c, od, oh, ow;
+    n = index / (C * y_d * y_h * y_w);
+    c = (index / (y_d * y_h * y_w)) % C;
+    od = (index / (y_h * y_w)) % y_d;
+    oh = (index / y_w) % y_h;
+    ow = index % y_w;
+
+    if (index < total) {
+        size_t x_d_start = (od * x_d) / y_d;
+        size_t x_d_end = ((od + 1) * x_d + y_d - 1) / y_d;
+        size_t x_h_start = (oh * x_h) / y_h;
+        size_t x_h_end = ((oh + 1) * x_h + y_h - 1) / y_h;
+        size_t x_w_start = (ow * x_w) / y_w;
+        size_t x_w_end = ((ow + 1) * x_w + y_w - 1) / y_w;
+
+        T sum = static_cast<T>(0);
+        size_t count = (x_d_end - x_d_start) * (x_h_end - x_h_start) * (x_w_end - x_w_start);
+        for (size_t id = x_d_start; id < x_d_end; ++id) {
+            for (size_t ih = x_h_start; ih < x_h_end; ++ih) {
+                for (size_t iw = x_w_start; iw < x_w_end; ++iw) {
+                    size_t x_index = n * x_strides[0] + c * x_strides[1] + id * x_strides[2] + ih * x_strides[3] + iw * x_strides[4];
+                    sum += x[x_index];
+                }
+            }
+        }
+        size_t y_index = n * y_strides[0] + c * y_strides[1] + od * y_strides[2] + oh * y_strides[3] + ow * y_strides[4];
+        y[y_index] = sum / static_cast<T>(static_cast<float>(count));
+    }
+}
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.h b/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.h
new file mode 100644
index 000000000..9421aef35
--- /dev/null
+++ b/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.h
@@ -0,0 +1,8 @@
+#ifndef INFINIOP_OPS_ADAPTIVE_AVG_POOL3D_METAX_ADAPTIVE_AVG_POOL3D_METAX_H_
+#define INFINIOP_OPS_ADAPTIVE_AVG_POOL3D_METAX_ADAPTIVE_AVG_POOL3D_METAX_H_
+
+#include "../adaptive_avg_pool3d.h"
+
+DESCRIPTOR(metax)
+
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.maca b/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.maca
new file mode 100644
index 000000000..48794fe17
--- /dev/null
+++ 
b/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.maca @@ -0,0 +1,88 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include "../cuda/kernel.cuh" +#include "adaptive_avg_pool3d_metax.h" + +template +INFINIOP_METAX_KERNEL +adaptiveAvgPool3D(T *y, const T *x, size_t N, size_t C, size_t x_d, size_t x_h, + size_t x_w, size_t y_d, size_t y_h, size_t y_w, + const ptrdiff_t *x_strides, const ptrdiff_t *y_strides) { + adaptiveAvgPool3DKernel(y, x, N, C, x_d, x_h, x_w, y_d, y_h, + y_w, x_strides, y_strides); +} + +namespace op::adaptive_avg_pool3d::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t *output_size) { + auto info = AdaptiveAvgPool3DInfo::create(y_desc, x_desc, output_size); + CHECK_RESULT(info); + size_t workspace_size = 10 * sizeof(ptrdiff_t); // for x_strides and y_strides + *desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t N, size_t C, size_t x_d, size_t x_h, + size_t x_w, size_t y_d, size_t y_h, size_t y_w, + const ptrdiff_t *x_strides, + const ptrdiff_t *y_strides, hcStream_t stream) { + size_t num_blocks = (N * C * y_d * y_h * y_w + BLOCK_SIZE - 1) / BLOCK_SIZE; + if (dtype == INFINI_DTYPE_F16) { + adaptiveAvgPool3D<<>>( + (half *)y, (const half *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w, + x_strides, y_strides); + } else if (dtype == INFINI_DTYPE_F32) { + adaptiveAvgPool3D<<>>( + (float *)y, (const float *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w, + x_strides, y_strides); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, const void *x, + void *stream_) const { + hcStream_t stream = (hcStream_t)stream_; + + // Prepare strides + ptrdiff_t *x_strides = (ptrdiff_t *)workspace; + ptrdiff_t *y_strides = (ptrdiff_t *)workspace + _info.x_strides.size(); + hcMemcpyAsync(x_strides, _info.x_strides.data(), + _info.x_strides.size() * sizeof(ptrdiff_t), + hcMemcpyHostToDevice, stream); + hcMemcpyAsync(y_strides, _info.y_strides.data(), + _info.y_strides.size() * sizeof(ptrdiff_t), + hcMemcpyHostToDevice, stream); + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::adaptive_avg_pool3d::metax \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_kernel.h b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_kernel.h new file mode 100644 index 000000000..a99c02483 --- /dev/null +++ 
@@ -0,0 +1,42 @@
+#ifndef ADAPTIVE_AVG_POOL3D_KERNEL_H_
+#define ADAPTIVE_AVG_POOL3D_KERNEL_H_
+
+template <typename T>
+__device__ void adaptiveAvgPool3DKernel(T *y, const T *x, size_t N, size_t C,
+                                        size_t x_d, size_t x_h, size_t x_w,
+                                        size_t y_d, size_t y_h, size_t y_w,
+                                        const ptrdiff_t *x_strides,
+                                        const ptrdiff_t *y_strides) {
+
+    size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t total = N * C * y_d * y_h * y_w;
+    // Decompose the flat thread index into (n, c, od, oh, ow).
+    size_t n = index / (C * y_d * y_h * y_w);
+    size_t c = (index / (y_d * y_h * y_w)) % C;
+    size_t od = (index / (y_h * y_w)) % y_d;
+    size_t oh = (index / y_w) % y_h;
+    size_t ow = index % y_w;
+
+    if (index < total) {
+        size_t x_d_start = (od * x_d) / y_d;
+        size_t x_d_end = ((od + 1) * x_d + y_d - 1) / y_d;
+        size_t x_h_start = (oh * x_h) / y_h;
+        size_t x_h_end = ((oh + 1) * x_h + y_h - 1) / y_h;
+        size_t x_w_start = (ow * x_w) / y_w;
+        size_t x_w_end = ((ow + 1) * x_w + y_w - 1) / y_w;
+
+        T sum = static_cast<T>(0);
+        size_t count = (x_d_end - x_d_start) * (x_h_end - x_h_start) * (x_w_end - x_w_start);
+        for (size_t id = x_d_start; id < x_d_end; ++id) {
+            for (size_t ih = x_h_start; ih < x_h_end; ++ih) {
+                for (size_t iw = x_w_start; iw < x_w_end; ++iw) {
+                    size_t x_index = n * x_strides[0] + c * x_strides[1] + id * x_strides[2] + ih * x_strides[3] + iw * x_strides[4];
+                    sum += x[x_index];
+                }
+            }
+        }
+        size_t y_index = n * y_strides[0] + c * y_strides[1] + od * y_strides[2] + oh * y_strides[3] + ow * y_strides[4];
+        y[y_index] = sum / static_cast<T>(count);
+    }
+}
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.h b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.h
new file mode 100644
index 000000000..37967bf16
--- /dev/null
+++ b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.h
@@ -0,0 +1,7 @@
+#ifndef ADAPTIVE_AVG_POOL3D_MOORE_H_
+#define ADAPTIVE_AVG_POOL3D_MOORE_H_
+#include "../adaptive_avg_pool3d.h"
+
+DESCRIPTOR(moore)
+
+#endif // ADAPTIVE_AVG_POOL3D_MOORE_H_
diff --git a/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.mu b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.mu
new file mode 100644
index 000000000..9dd2259bf
--- /dev/null
+++ b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.mu
@@ -0,0 +1,89 @@
+#include "../../../devices/moore/moore_common.h"
+#include "../../../devices/moore/moore_kernel_common.h"
+#include "adaptive_avg_pool3d_moore.h"
+#include "adaptive_avg_pool3d_kernel.h"
+
+template <typename T>
+INFINIOP_MOORE_KERNEL
+adaptiveAvgPool3D(T *y, const T *x, size_t N, size_t C, size_t x_d, size_t x_h,
+                  size_t x_w, size_t y_d, size_t y_h, size_t y_w,
+                  const ptrdiff_t *x_strides, const ptrdiff_t *y_strides) {
+    adaptiveAvgPool3DKernel(y, x, N, C, x_d, x_h, x_w, y_d, y_h,
+                            y_w, x_strides, y_strides);
+}
+
+namespace op::adaptive_avg_pool3d::moore {
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::moore::Handle::Internal> internal;
+};
+
+Descriptor::~Descriptor() { delete _opaque; }
+
+infiniStatus_t Descriptor::create(infiniopHandle_t handle,
+                                  Descriptor **desc_ptr,
+                                  infiniopTensorDescriptor_t y_desc,
+                                  infiniopTensorDescriptor_t x_desc,
+                                  size_t *output_size) {
+    auto info = AdaptiveAvgPool3DInfo::create(y_desc, x_desc, output_size);
+    CHECK_RESULT(info);
+    size_t workspace_size = 10 * sizeof(ptrdiff_t); // for x_strides and y_strides
+    *desc_ptr = new Descriptor(
+        info.take(), workspace_size,
+        new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
+        handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <unsigned int BLOCK_SIZE>
+infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype,
+                            size_t N, size_t C, size_t x_d, size_t x_h,
+                            size_t x_w, size_t y_d, size_t y_h, size_t y_w,
+                            const ptrdiff_t *x_strides,
+                            const ptrdiff_t *y_strides, musaStream_t stream) {
+    size_t num_blocks = (N * C * y_d * y_h * y_w + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    if (dtype == INFINI_DTYPE_F16) {
+        adaptiveAvgPool3D<<<num_blocks, BLOCK_SIZE, 0, stream>>>(
+            (half *)y, (const half *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w,
+            x_strides, y_strides);
+    } else if (dtype == INFINI_DTYPE_F32) {
+        adaptiveAvgPool3D<<<num_blocks, BLOCK_SIZE, 0, stream>>>(
+            (float *)y, (const float *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w,
+            x_strides, y_strides);
+    } else {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *y, const void *x,
+                                     void *stream_) const {
+    musaStream_t stream = (musaStream_t)stream_;
+
+    // Stage the host-side stride arrays into the device workspace.
+    ptrdiff_t *x_strides = (ptrdiff_t *)workspace;
+    ptrdiff_t *y_strides = (ptrdiff_t *)workspace + _info.x_strides.size();
+    musaMemcpyAsync(x_strides, _info.x_strides.data(),
+                    _info.x_strides.size() * sizeof(ptrdiff_t),
+                    musaMemcpyHostToDevice, stream);
+    musaMemcpyAsync(y_strides, _info.y_strides.data(),
+                    _info.y_strides.size() * sizeof(ptrdiff_t),
+                    musaMemcpyHostToDevice, stream);
+
+    if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) {
+        CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_1024>(
+            y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w,
+            _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) {
+        CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_512>(
+            y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w,
+            _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_2048) {
+        CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_2048>(
+            y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w,
+            _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream));
+    } else {
+        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::adaptive_avg_pool3d::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cu b/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cu
new file mode 100644
index 000000000..b522b2da3
--- /dev/null
+++ b/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cu
@@ -0,0 +1,93 @@
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
+#include "../cuda/kernel.cuh"
+#include "adaptive_avg_pool3d_nvidia.cuh"
+#include <cuda_fp16.h>
+
+template <typename T>
+INFINIOP_CUDA_KERNEL
+adaptiveAvgPool3D(T *y, const T *x, size_t N, size_t C, size_t x_d, size_t x_h,
+                  size_t x_w, size_t y_d, size_t y_h, size_t y_w,
+                  const ptrdiff_t *x_strides, const ptrdiff_t *y_strides) {
+    adaptiveAvgPool3DKernel(y, x, N, C, x_d, x_h, x_w, y_d, y_h,
+                            y_w, x_strides, y_strides);
+}
+
+namespace op::adaptive_avg_pool3d::nvidia {
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::nvidia::Handle::Internal> internal;
+};
+
+Descriptor::~Descriptor() { delete _opaque; }
+
+infiniStatus_t Descriptor::create(infiniopHandle_t handle,
+                                  Descriptor **desc_ptr,
+                                  infiniopTensorDescriptor_t y_desc,
+                                  infiniopTensorDescriptor_t x_desc,
+                                  size_t *output_size) {
+    auto info = AdaptiveAvgPool3DInfo::create(y_desc, x_desc, output_size);
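+    // The fixed 10-element workspace below assumes 5-D NCDHW tensors: one
+    // 5-entry stride array for x and one for y, staged to the device
+    // asynchronously before every launch.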
+    CHECK_RESULT(info);
+    size_t workspace_size = 10 * sizeof(ptrdiff_t); // for x_strides and y_strides
+    *desc_ptr = new Descriptor(
+        info.take(), workspace_size,
+        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
+        handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <unsigned int BLOCK_SIZE>
+infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype,
+                            size_t N, size_t C, size_t x_d, size_t x_h,
+                            size_t x_w, size_t y_d, size_t y_h, size_t y_w,
+                            const ptrdiff_t *x_strides,
+                            const ptrdiff_t *y_strides, cudaStream_t stream) {
+    size_t num_blocks = (N * C * y_d * y_h * y_w + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    if (dtype == INFINI_DTYPE_F16) {
+        adaptiveAvgPool3D<<<num_blocks, BLOCK_SIZE, 0, stream>>>(
+            (half *)y, (const half *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w,
+            x_strides, y_strides);
+    } else if (dtype == INFINI_DTYPE_F32) {
+        adaptiveAvgPool3D<<<num_blocks, BLOCK_SIZE, 0, stream>>>(
+            (float *)y, (const float *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w,
+            x_strides, y_strides);
+    } else {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *y, const void *x,
+                                     void *stream_) const {
+    cudaStream_t stream = (cudaStream_t)stream_;
+
+    // Stage the host-side stride arrays into the device workspace.
+    ptrdiff_t *x_strides = (ptrdiff_t *)workspace;
+    ptrdiff_t *y_strides = (ptrdiff_t *)workspace + _info.x_strides.size();
+    cudaMemcpyAsync(x_strides, _info.x_strides.data(),
+                    _info.x_strides.size() * sizeof(ptrdiff_t),
+                    cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(y_strides, _info.y_strides.data(),
+                    _info.y_strides.size() * sizeof(ptrdiff_t),
+                    cudaMemcpyHostToDevice, stream);
+
+    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
+            y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w,
+            _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
+            y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w,
+            _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
+            y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w,
+            _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream));
+    } else {
+        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::adaptive_avg_pool3d::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cuh b/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cuh
new file mode 100644
index 000000000..973a0a54a
--- /dev/null
+++ b/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __ADAPTIVE_AVG_POOL3D_NVIDIA_H__
+#define __ADAPTIVE_AVG_POOL3D_NVIDIA_H__
+
+#include "../adaptive_avg_pool3d.h"
+
+DESCRIPTOR(nvidia)
+
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/adaptive_avg_pool3d/operator.cc b/src/infiniop/ops/adaptive_avg_pool3d/operator.cc
new file mode 100644
index 000000000..738402862
--- /dev/null
+++ b/src/infiniop/ops/adaptive_avg_pool3d/operator.cc
@@ -0,0 +1,216 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infinicore.h"
+#include "infiniop/ops/adaptive_avg_pool3d.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/adaptive_avg_pool3d_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
+#include "nvidia/adaptive_avg_pool3d_nvidia.cuh"
+#endif
+#ifdef ENABLE_ASCEND_API
+#include "ascend/adaptive_avg_pool3d_ascend.h"
+#endif
+#ifdef ENABLE_CAMBRICON_API
+#include "bang/adaptive_avg_pool3d_bang.h"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/adaptive_avg_pool3d_metax.h"
+#endif
+#ifdef ENABLE_KUNLUN_API
+#include "kunlun/adaptive_avg_pool3d_kunlun.h"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/adaptive_avg_pool3d_moore.h"
+#endif
+
+__C infiniStatus_t infiniopCreateAdaptiveAvgPool3DDescriptor(
+    infiniopHandle_t handle,
+    infiniopAdaptiveAvgPool3DDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y,
+    infiniopTensorDescriptor_t x,
+    size_t *output_size) {
+
+#define CREATE(CASE, NAMESPACE) \
+    case CASE: \
+        return op::adaptive_avg_pool3d::NAMESPACE::Descriptor::create( \
+            handle, \
+            reinterpret_cast<op::adaptive_avg_pool3d::NAMESPACE::Descriptor **>(desc_ptr), \
+            y, \
+            x, \
+            output_size)
+
+    switch (handle->device) {
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+        CREATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_ASCEND_API
+        CREATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        CREATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+    }
+
+#undef CREATE
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopGetAdaptiveAvgPool3DWorkspaceSize(
+    infiniopAdaptiveAvgPool3DDescriptor_t desc,
+    size_t *size) {
+
+#define GET(CASE, NAMESPACE) \
+    case CASE: \
+        *size = reinterpret_cast<op::adaptive_avg_pool3d::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+        GET(INFINI_DEVICE_HYGON, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        GET(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        GET(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_ASCEND_API
+        GET(INFINI_DEVICE_ASCEND, ascend);
+#endif
+    }
+
+#undef GET
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopAdaptiveAvgPool3D(
+    infiniopAdaptiveAvgPool3DDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) {
+#define CALCULATE(CASE, NAMESPACE) \
+    case CASE: \
+        return reinterpret_cast<op::adaptive_avg_pool3d::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, y, x, stream);
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+        CALCULATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_ASCEND_API
+        CALCULATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+    }
+#undef CALCULATE
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopDestroyAdaptiveAvgPool3DDescriptor(infiniopAdaptiveAvgPool3DDescriptor_t desc) {
+#define DELETE(CASE, NAMESPACE) \
+    case CASE: \
+        delete reinterpret_cast<op::adaptive_avg_pool3d::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+        DELETE(INFINI_DEVICE_HYGON, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        DELETE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_ASCEND_API
+        DELETE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+    }
+#undef DELETE
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
\ No newline at end of file
diff --git a/src/infiniop/ops/addr/addr.h b/src/infiniop/ops/addr/addr.h
new file mode 100644
index 000000000..6cf5b20fc
--- /dev/null
+++ b/src/infiniop/ops/addr/addr.h
@@ -0,0 +1,95 @@
+#ifndef INFINIOP_ADDR_DESCRIPTOR_H_
+#define INFINIOP_ADDR_DESCRIPTOR_H_
+#include "../../../utils.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+#include "infiniop/ops/addr.h"
+#include <cstddef>
+
+#define DESCRIPTOR(NAMESPACE) \
+ \
+    namespace op::addr::NAMESPACE { \
+    class Descriptor final : public InfiniopDescriptor { \
+        struct Opaque; \
+        Opaque *_opaque; \
+        AddrInfo _info; \
+        size_t _workspace_size; \
+ \
+        Descriptor( \
+            AddrInfo info, \
+            size_t workspace_size_, \
+            Opaque *opaque, \
+            infiniDevice_t device_type, \
+            int device_id) \
+            : InfiniopDescriptor{device_type, device_id}, \
+              _opaque(opaque), \
+              _info(info), \
+              _workspace_size(workspace_size_) {} \
+ \
+    public: \
+        ~Descriptor(); \
+ \
+        size_t workspaceSize() const { return _workspace_size; } \
+ \
+        static infiniStatus_t create( \
+            infiniopHandle_t handle, \
+            Descriptor **desc_ptr, \
+            infiniopTensorDescriptor_t out_desc, \
+            infiniopTensorDescriptor_t input_desc, \
+            infiniopTensorDescriptor_t vec1_desc, \
+            infiniopTensorDescriptor_t vec2_desc, \
+            float beta, \
+            float alpha); \
+ \
+        infiniStatus_t calculate( \
+            void *workspace, \
+            size_t workspace_size, \
+            void *out, \
+            const void *input, \
+            const void *vec1, \
+            const void *vec2, \
+            void *stream) const; \
+    }; \
+    }
+
+struct AddrInfo {
+    infiniDtype_t dtype;
+    size_t vec1_size;
+    size_t vec2_size;
+    float beta;
+    float alpha;
+    ptrdiff_t input_stride0, input_stride1;
+    ptrdiff_t output_stride0, output_stride1;
+    ptrdiff_t vec1_stride;
+    ptrdiff_t vec2_stride;
+
+    static utils::Result<AddrInfo>
+    create(infiniopTensorDescriptor_t input_desc,
+           infiniopTensorDescriptor_t output_desc,
+           infiniopTensorDescriptor_t vec1_desc,
+           infiniopTensorDescriptor_t vec2_desc,
+           float beta = 1.0f, float alpha = 1.0f) {
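+        // Shape contract: input is an (n, m) matrix, vec1 has length n and
+        // vec2 has length m, so out = beta * input + alpha * (vec1 outer vec2)
+        // is well-formed; anything else is rejected below.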
+        CHECK_OR_RETURN(input_desc->ndim() == 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
+        CHECK_OR_RETURN(vec1_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
+        CHECK_OR_RETURN(vec2_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
+        CHECK_OR_RETURN(input_desc->dim(0) == vec1_desc->dim(0) && input_desc->dim(1) == vec2_desc->dim(0), INFINI_STATUS_BAD_TENSOR_SHAPE);
+        const infiniDtype_t data_type = input_desc->dtype();
+        CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+        return utils::Result<AddrInfo>(AddrInfo{
+            data_type,
+            vec1_desc->dim(0),
+            vec2_desc->dim(0),
+            beta,
+            alpha,
+            input_desc->stride(0),
+            input_desc->stride(1),
+            output_desc->stride(0),
+            output_desc->stride(1),
+            vec1_desc->stride(0),
+            vec2_desc->stride(0),
+        });
+    }
+};
+
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/addr/cpu/addr_cpu.cc b/src/infiniop/ops/addr/cpu/addr_cpu.cc
new file mode 100644
index 000000000..52339164c
--- /dev/null
+++ b/src/infiniop/ops/addr/cpu/addr_cpu.cc
@@ -0,0 +1,93 @@
+#include "addr_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include <type_traits>
+namespace op::addr::cpu {
+Descriptor::~Descriptor() = default;
+
+// Template function to handle different data types
+template <typename Tdata>
+infiniStatus_t addr_impl(Tdata *out,
+                         const Tdata *input,
+                         const Tdata *vec1,
+                         const Tdata *vec2,
+                         const AddrInfo &info,
+                         void *workspace,
+                         size_t workspace_size) {
+    size_t n = info.vec1_size;
+    size_t m = info.vec2_size;
+    float beta = info.beta;
+    float alpha = info.alpha;
+
+#pragma omp parallel for collapse(2)
+    for (size_t i = 0; i < n; ++i) {
+        for (size_t j = 0; j < m; ++j) {
+            if constexpr (std::is_same<Tdata, fp16_t>::value || std::is_same<Tdata, bf16_t>::value) {
+                // Accumulate in float to avoid precision loss in half types.
+                float a = utils::cast<float>(vec1[i * info.vec1_stride]);
+                float b = utils::cast<float>(vec2[j * info.vec2_stride]);
+                float c = utils::cast<float>(input[i * info.input_stride0 + j * info.input_stride1]);
+                out[i * info.output_stride0 + j * info.output_stride1] = utils::cast<Tdata>(alpha * a * b + beta * c);
+            } else {
+                float a = vec1[i * info.vec1_stride], b = vec2[j * info.vec2_stride], c = input[i * info.input_stride0 + j * info.input_stride1];
+                out[i * info.output_stride0 + j * info.output_stride1] = utils::cast<Tdata>(alpha * a * b + beta * c);
+            }
+        }
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    infiniopTensorDescriptor_t vec1_desc,
+    infiniopTensorDescriptor_t vec2_desc,
+    float beta,
+    float alpha) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto info = AddrInfo::create(input_desc, out_desc, vec1_desc, vec2_desc, beta, alpha);
+    CHECK_RESULT(info);
+
+    *desc_ptr = new Descriptor(info.take(), 0, nullptr,
+                               INFINI_DEVICE_CPU, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *input,
+    const void *vec1,
+    const void *vec2,
+    void *stream) const {
+
+    switch (_info.dtype) {
+    case INFINI_DTYPE_F32:
+        return addr_impl(reinterpret_cast<float *>(out),
+                         reinterpret_cast<const float *>(input),
+                         reinterpret_cast<const float *>(vec1),
+                         reinterpret_cast<const float *>(vec2),
+                         _info, workspace, workspace_size);
+    case INFINI_DTYPE_F16:
+        return addr_impl(reinterpret_cast<fp16_t *>(out),
+                         reinterpret_cast<const fp16_t *>(input),
+                         reinterpret_cast<const fp16_t *>(vec1),
+                         reinterpret_cast<const fp16_t *>(vec2),
+                         _info, workspace, workspace_size);
+    case INFINI_DTYPE_BF16:
+        return addr_impl(reinterpret_cast<bf16_t *>(out),
+                         reinterpret_cast<const bf16_t *>(input),
+                         reinterpret_cast<const bf16_t *>(vec1),
+                         reinterpret_cast<const bf16_t *>(vec2),
+                         _info, workspace, workspace_size);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::addr::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/addr/cpu/addr_cpu.h b/src/infiniop/ops/addr/cpu/addr_cpu.h
new file mode 100644
index 000000000..e606c1816
--- /dev/null
+++ b/src/infiniop/ops/addr/cpu/addr_cpu.h
@@ -0,0 +1,7 @@
+#ifndef INFINIOP_ADDR_CPU_H_
+#define INFINIOP_ADDR_CPU_H_
+#include "../addr.h"
+
+DESCRIPTOR(cpu)
+
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/addr/cuda/kernel.cu b/src/infiniop/ops/addr/cuda/kernel.cu
new file mode 100644
index 000000000..b6af54e05
--- /dev/null
+++ b/src/infiniop/ops/addr/cuda/kernel.cu
@@ -0,0 +1,38 @@
+#ifndef __ADDR_CUDA_H__
+#define __ADDR_CUDA_H__
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+template <typename T>
+__device__ void addr_kernel(T *out, const T *input, const T *vec1,
+                            const T *vec2, size_t n, size_t m, float beta,
+                            float alpha, ptrdiff_t stride1, ptrdiff_t stride2,
+                            ptrdiff_t out_stride_0, ptrdiff_t out_stride_1,
+                            ptrdiff_t in_stride_0, ptrdiff_t in_stride_1) {
+    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t j = blockIdx.y * blockDim.y + threadIdx.y;
+    if (i >= n || j >= m) {
+        return;
+    }
+    size_t out_idx = i * out_stride_0 + j * out_stride_1;
+    size_t in_idx = i * in_stride_0 + j * in_stride_1;
+    size_t vec1_idx = i * stride1;
+    size_t vec2_idx = j * stride2;
+    T in_val = input[in_idx];
+    T vec1_val = vec1[vec1_idx];
+    T vec2_val = vec2[vec2_idx];
+    T out_val;
+    // out = alpha * vec1 * vec2 + beta * input, in the element's native type.
+    if constexpr (std::is_same_v<T, half>) {
+        out_val = __hadd(__hmul(__hmul(vec1_val, vec2_val), __float2half(alpha)),
+                         __hmul(__float2half(beta), in_val));
+    } else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
+        out_val = __hadd(__hmul(__hmul(vec1_val, vec2_val), __float2bfloat16(alpha)),
+                         __hmul(__float2bfloat16(beta), in_val));
+    } else {
+        out_val = beta * in_val + alpha * vec1_val * vec2_val;
+    }
+
+    out[out_idx] = out_val;
+}
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/addr/metax/addr_metax.h b/src/infiniop/ops/addr/metax/addr_metax.h
new file mode 100644
index 000000000..9a8acdbe2
--- /dev/null
+++ b/src/infiniop/ops/addr/metax/addr_metax.h
@@ -0,0 +1,8 @@
+#ifndef INFINIOP_ADDR_METAX_H_
+#define INFINIOP_ADDR_METAX_H_
+
+#include "../addr.h"
+
+DESCRIPTOR(metax)
+
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/addr/metax/addr_nvidia.maca b/src/infiniop/ops/addr/metax/addr_nvidia.maca
new file mode 100644
index 000000000..443d59cff
--- /dev/null
+++ b/src/infiniop/ops/addr/metax/addr_nvidia.maca
@@ -0,0 +1,108 @@
+
+#include "../../../devices/metax/metax_common.h"
+#include "../../../devices/metax/metax_kernel_common.h"
+#include "../cuda/kernel.cu"
+#include "addr_metax.h"
+#include "infinicore.h"
+#include <algorithm>
+
+template <typename Tdata>
+INFINIOP_METAX_KERNEL Addr(Tdata *output, const Tdata *input, const Tdata *vec1,
+                           const Tdata *vec2, size_t n, size_t m, float beta,
+                           float alpha, ptrdiff_t stride1, ptrdiff_t stride2,
+                           ptrdiff_t out_stride_0, ptrdiff_t out_stride_1,
+                           ptrdiff_t in_stride_0, ptrdiff_t in_stride_1) {
+    addr_kernel(output, input, vec1, vec2, n, m, beta, alpha, stride1,
+                stride2, out_stride_0, out_stride_1, in_stride_0,
+                in_stride_1);
+}
+namespace op::addr::metax {
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::metax::Handle::Internal> internal;
+};
+
+Descriptor::~Descriptor() { delete _opaque; }
+
+infiniStatus_t Descriptor::create(infiniopHandle_t handle,
+                                  Descriptor **desc_ptr,
+                                  infiniopTensorDescriptor_t out_desc,
+                                  infiniopTensorDescriptor_t input_desc,
+                                  infiniopTensorDescriptor_t vec1_desc,
+                                  infiniopTensorDescriptor_t vec2_desc,
+                                  float beta, float alpha) {
+
+    auto info =
+        AddrInfo::create(input_desc, out_desc, vec1_desc, vec2_desc, beta, alpha);
+    CHECK_RESULT(info);
+    size_t workspace_size = 0;
+
+    *desc_ptr = new Descriptor(
+        info.take(), workspace_size,
+        new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
+        handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <unsigned int BLOCK_SIZE>
+infiniStatus_t
+launchKernel(infiniDtype_t dtype, void *output, const void *input,
+             const void *vec1, const void *vec2, size_t n, size_t m, float beta,
+             float alpha, ptrdiff_t stride1, ptrdiff_t stride2,
+             ptrdiff_t out_stride_0, ptrdiff_t out_stride_1,
+             ptrdiff_t in_stride_0, ptrdiff_t in_stride_1, hcStream_t stream) {
+    // NB: block = {dn, dm} assumes dn * dm stays within the per-block
+    // thread limit, i.e. small matrices.
+    unsigned int dn = std::min((unsigned int)n, BLOCK_SIZE),
+                 dm = std::min((unsigned int)m, BLOCK_SIZE);
+    dim3 grid = {(unsigned int)((n + dn - 1) / dn),
+                 (unsigned int)((m + dm - 1) / dm)};
+    dim3 block = {dn, dm};
+    switch (dtype) {
+    case INFINI_DTYPE_F32:
+        Addr<<<grid, block, 0, stream>>>(
+            (float *)output, (const float *)input, (const float *)vec1,
+            (const float *)vec2, n, m, beta, alpha, stride1, stride2, out_stride_0,
+            out_stride_1, in_stride_0, in_stride_1);
+        break;
+    case INFINI_DTYPE_F16:
+        Addr<__half><<<grid, block, 0, stream>>>(
+            (__half *)output, (__half *)input, (__half *)vec1, (__half *)vec2, n, m,
+            beta, alpha, stride1, stride2, out_stride_0, out_stride_1, in_stride_0,
+            in_stride_1);
+        break;
+    case INFINI_DTYPE_BF16:
+        Addr<__hpcc_bfloat16><<<grid, block, 0, stream>>>(
+            (__hpcc_bfloat16 *)output, (__hpcc_bfloat16 *)input,
+            (__hpcc_bfloat16 *)vec1, (__hpcc_bfloat16 *)vec2, n, m, beta, alpha,
+            stride1, stride2, out_stride_0, out_stride_1, in_stride_0, in_stride_1);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *out, const void *input,
+                                     const void *vec1, const void *vec2,
+                                     void *stream_) const {
+    hcStream_t stream = (hcStream_t)stream_;
+
+    if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) {
+        CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_1024>(
+            _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size,
+            _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride,
+            _info.output_stride0, _info.output_stride1, _info.input_stride0,
+            _info.input_stride1, stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) {
+        CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_512>(
+            _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size,
+            _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride,
+            _info.output_stride0, _info.output_stride1, _info.input_stride0,
+            _info.input_stride1, stream));
+    } else {
+        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::addr::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/addr/moore/addr_kernel.h b/src/infiniop/ops/addr/moore/addr_kernel.h
new file mode 100644
index 000000000..49e83aced
--- /dev/null
+++ b/src/infiniop/ops/addr/moore/addr_kernel.h
@@ -0,0 +1,36 @@
+#ifndef __ADDR_MOORE_H__
+#define __ADDR_MOORE_H__
+
+template <typename T>
+__device__ void addr_kernel(T *out, const T *input, const T *vec1,
+                            const T *vec2, size_t n, size_t m, float beta,
+                            float alpha, ptrdiff_t stride1, ptrdiff_t stride2,
+                            ptrdiff_t out_stride_0, ptrdiff_t out_stride_1,
+                            ptrdiff_t in_stride_0, ptrdiff_t in_stride_1) {
+    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t j = blockIdx.y * blockDim.y + threadIdx.y;
+    if (i >= n || j >= m) {
+        return;
+    }
+    size_t out_idx = i * out_stride_0 + j * out_stride_1;
+    size_t in_idx = i * in_stride_0 + j * in_stride_1;
+    size_t vec1_idx = i * stride1;
+    size_t vec2_idx = j * stride2;
+    T in_val = input[in_idx];
+    T vec1_val = vec1[vec1_idx];
+    T vec2_val = vec2[vec2_idx];
+    T out_val;
+    if constexpr (std::is_same_v<T, half>) {
+        out_val = __hadd(__hmul(__hmul(vec1_val, vec2_val), __float2half(alpha)),
+                         __hmul(__float2half(beta), in_val));
+    } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+        // bf16 math goes through float on this backend.
+        float a = __bfloat162float(vec1_val), b = __bfloat162float(vec2_val), in = __bfloat162float(in_val);
+        out_val = __float2bfloat16_rn(a * b * alpha + in * beta);
+    } else {
+        out_val = beta * in_val + alpha * vec1_val * vec2_val;
+    }
+
+    out[out_idx] = out_val;
+}
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/addr/moore/addr_moore.h b/src/infiniop/ops/addr/moore/addr_moore.h
new file mode 100644
index 000000000..d32182597
--- /dev/null
+++ b/src/infiniop/ops/addr/moore/addr_moore.h
@@ -0,0 +1,8 @@
+#ifndef INFINIOP_ADDR_MOORE_H_
+#define INFINIOP_ADDR_MOORE_H_
+
+#include "../addr.h"
+
+DESCRIPTOR(moore)
+
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/addr/moore/addr_moore.mu b/src/infiniop/ops/addr/moore/addr_moore.mu
new file mode 100644
index 000000000..95626044b
--- /dev/null
+++ b/src/infiniop/ops/addr/moore/addr_moore.mu
@@ -0,0 +1,108 @@
+#include "../../../devices/moore/moore_common.h"
+#include "../../../devices/moore/moore_kernel_common.h"
+#include "addr_kernel.h"
+#include "addr_moore.h"
+#include "infinicore.h"
+#include <algorithm>
+
+template <typename Tdata>
+INFINIOP_MOORE_KERNEL Addr(Tdata *output, const Tdata *input, const Tdata *vec1,
+                           const Tdata *vec2, size_t n, size_t m, float beta,
+                           float alpha, ptrdiff_t stride1, ptrdiff_t stride2,
+                           ptrdiff_t out_stride_0, ptrdiff_t out_stride_1,
+                           ptrdiff_t in_stride_0, ptrdiff_t in_stride_1) {
+    addr_kernel(output, input, vec1, vec2, n, m, beta, alpha, stride1,
+                stride2, out_stride_0, out_stride_1, in_stride_0,
+                in_stride_1);
+}
+namespace op::addr::moore {
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::moore::Handle::Internal> internal;
+};
+
+Descriptor::~Descriptor() { delete _opaque; }
+
+infiniStatus_t Descriptor::create(infiniopHandle_t handle,
+                                  Descriptor **desc_ptr,
+                                  infiniopTensorDescriptor_t out_desc,
+                                  infiniopTensorDescriptor_t input_desc,
+                                  infiniopTensorDescriptor_t vec1_desc,
+                                  infiniopTensorDescriptor_t vec2_desc,
+                                  float beta, float alpha) {
+
+    auto info =
+        AddrInfo::create(input_desc, out_desc, vec1_desc, vec2_desc, beta, alpha);
+    CHECK_RESULT(info);
+    size_t workspace_size = 0;
+
+    *desc_ptr = new Descriptor(
+        info.take(), workspace_size,
+        new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
+        handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <unsigned int BLOCK_SIZE>
+infiniStatus_t launchKernel(infiniDtype_t dtype, void *output,
+                            const void *input, const void *vec1,
+                            const void *vec2, size_t n, size_t m, float beta,
+                            float alpha, ptrdiff_t stride1, ptrdiff_t stride2,
+                            ptrdiff_t out_stride_0, ptrdiff_t out_stride_1,
+                            ptrdiff_t in_stride_0, ptrdiff_t in_stride_1,
+                            musaStream_t stream) {
+    // NB: block = {dn, dm} assumes dn * dm stays within the per-block
+    // thread limit, i.e. small matrices.
+    unsigned int dn = std::min((unsigned int)n, BLOCK_SIZE),
+                 dm = std::min((unsigned int)m, BLOCK_SIZE);
+    dim3 grid = {(unsigned int)((n + dn - 1) / dn),
+                 (unsigned int)((m + dm - 1) / dm)};
+    dim3 block = {dn, dm};
+    switch (dtype) {
+    case INFINI_DTYPE_F32:
+        Addr<<<grid, block, 0, stream>>>(
+            (float *)output, (const float *)input, (const float *)vec1,
+            (const float *)vec2,
+            n, m, beta, alpha, stride1, stride2, out_stride_0,
+            out_stride_1, in_stride_0, in_stride_1);
+        break;
+    case INFINI_DTYPE_F16:
+        Addr<__half><<<grid, block, 0, stream>>>(
+            (__half *)output, (__half *)input, (__half *)vec1, (__half *)vec2, n, m,
+            beta, alpha, stride1, stride2, out_stride_0, out_stride_1, in_stride_0,
+            in_stride_1);
+        break;
+    case INFINI_DTYPE_BF16:
+        Addr<<<grid, block, 0, stream>>>(
+            (cuda_bfloat16 *)output, (cuda_bfloat16 *)input, (cuda_bfloat16 *)vec1,
+            (cuda_bfloat16 *)vec2, n, m, beta, alpha, stride1, stride2,
+            out_stride_0, out_stride_1, in_stride_0, in_stride_1);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *out, const void *input,
+                                     const void *vec1, const void *vec2,
+                                     void *stream_) const {
+    musaStream_t stream = (musaStream_t)stream_;
+
+    if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) {
+        CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_1024>(
+            _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size,
+            _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride,
+            _info.output_stride0, _info.output_stride1, _info.input_stride0,
+            _info.input_stride1, stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) {
+        CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_512>(
+            _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size,
+            _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride,
+            _info.output_stride0, _info.output_stride1, _info.input_stride0,
+            _info.input_stride1, stream));
+    } else {
+        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::addr::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/addr/nvidia/addr_nvidia.cu b/src/infiniop/ops/addr/nvidia/addr_nvidia.cu
new file mode 100644
index 000000000..cbd5279ce
--- /dev/null
+++ b/src/infiniop/ops/addr/nvidia/addr_nvidia.cu
@@ -0,0 +1,114 @@
+
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
+#include "../cuda/kernel.cu"
+#include "addr_nvidia.cuh"
+#include "infinicore.h"
+#include <algorithm>
+
+template <typename Tdata>
+INFINIOP_CUDA_KERNEL Addr(Tdata *output, const Tdata *input, const Tdata *vec1,
+                          const Tdata *vec2, size_t n, size_t m, float beta,
+                          float alpha, ptrdiff_t stride1, ptrdiff_t stride2,
+                          ptrdiff_t out_stride_0, ptrdiff_t out_stride_1,
+                          ptrdiff_t in_stride_0, ptrdiff_t in_stride_1) {
+    addr_kernel(output, input, vec1, vec2, n, m, beta, alpha, stride1,
+                stride2, out_stride_0, out_stride_1, in_stride_0,
+                in_stride_1);
+}
+namespace op::addr::nvidia {
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::nvidia::Handle::Internal> internal;
+};
+
+Descriptor::~Descriptor() { delete _opaque; }
+
+infiniStatus_t Descriptor::create(infiniopHandle_t handle,
+                                  Descriptor **desc_ptr,
+                                  infiniopTensorDescriptor_t out_desc,
+                                  infiniopTensorDescriptor_t input_desc,
+                                  infiniopTensorDescriptor_t vec1_desc,
+                                  infiniopTensorDescriptor_t vec2_desc,
+                                  float beta, float alpha) {
+
+    auto info = AddrInfo::create(input_desc, out_desc, vec1_desc, vec2_desc, beta, alpha);
+    CHECK_RESULT(info);
+    size_t workspace_size = 0;
+
+    *desc_ptr = new Descriptor(
+        info.take(), workspace_size,
+        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
+        handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <unsigned int BLOCK_SIZE>
+infiniStatus_t launchKernel(infiniDtype_t dtype, void *output,
+                            const void *input, const void *vec1,
+                            const void *vec2, size_t n, size_t m, float beta,
+                            float alpha,
+                            ptrdiff_t stride1, ptrdiff_t stride2,
+                            ptrdiff_t out_stride_0, ptrdiff_t out_stride_1,
+                            ptrdiff_t in_stride_0, ptrdiff_t in_stride_1,
+                            cudaStream_t stream) {
+    // NB: block = {dn, dm} assumes dn * dm stays within the per-block
+    // thread limit, i.e. small matrices.
+    unsigned int dn = std::min((unsigned int)n, BLOCK_SIZE),
+                 dm = std::min((unsigned int)m, BLOCK_SIZE);
+    dim3 grid = {(unsigned int)((n + dn - 1) / dn),
+                 (unsigned int)((m + dm - 1) / dm)};
+    dim3 block = {dn, dm};
+    switch (dtype) {
+    case INFINI_DTYPE_F32:
+        Addr<<<grid, block, 0, stream>>>(
+            (float *)output, (const float *)input, (const float *)vec1,
+            (const float *)vec2, n, m, beta, alpha, stride1, stride2, out_stride_0,
+            out_stride_1, in_stride_0, in_stride_1);
+        break;
+    case INFINI_DTYPE_F16:
+        Addr<__half><<<grid, block, 0, stream>>>(
+            (__half *)output, (__half *)input, (__half *)vec1, (__half *)vec2, n, m,
+            beta, alpha, stride1, stride2, out_stride_0, out_stride_1, in_stride_0,
+            in_stride_1);
+        break;
+    case INFINI_DTYPE_BF16:
+        Addr<__nv_bfloat16><<<grid, block, 0, stream>>>(
+            (__nv_bfloat16 *)output, (__nv_bfloat16 *)input, (__nv_bfloat16 *)vec1,
+            (__nv_bfloat16 *)vec2, n, m, beta, alpha, stride1, stride2,
+            out_stride_0, out_stride_1, in_stride_0, in_stride_1);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *out, const void *input,
+                                     const void *vec1, const void *vec2,
+                                     void *stream_) const {
+    cudaStream_t stream = (cudaStream_t)stream_;
+
+    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
+            _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size,
+            _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride,
+            _info.output_stride0, _info.output_stride1, _info.input_stride0,
+            _info.input_stride1, stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
+            _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size,
+            _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride,
+            _info.output_stride0, _info.output_stride1, _info.input_stride0,
+            _info.input_stride1, stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
+            _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size,
+            _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride,
+            _info.output_stride0, _info.output_stride1, _info.input_stride0,
+            _info.input_stride1, stream));
+    } else {
+        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::addr::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/addr/nvidia/addr_nvidia.cuh b/src/infiniop/ops/addr/nvidia/addr_nvidia.cuh
new file mode 100644
index 000000000..9ebe7f2c3
--- /dev/null
+++ b/src/infiniop/ops/addr/nvidia/addr_nvidia.cuh
@@ -0,0 +1,7 @@
+#ifndef INFINIOP_ADDR_NVIDIA_H_
+#define INFINIOP_ADDR_NVIDIA_H_
+#include "../addr.h"
+
+DESCRIPTOR(nvidia)
+
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/addr/operator.cc b/src/infiniop/ops/addr/operator.cc
new file mode 100644
index 000000000..6e3724743
--- /dev/null
+++ b/src/infiniop/ops/addr/operator.cc
@@ -0,0 +1,222 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infinicore.h"
+#include "infiniop/ops/addr.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/addr_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
+#include "nvidia/addr_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/addr_metax.h"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/addr_moore.h"
+#endif
+
+__C infiniStatus_t infiniopCreateAddrDescriptor(
+    infiniopHandle_t handle,
+    infiniopAddrDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t out,
+    infiniopTensorDescriptor_t input,
+    infiniopTensorDescriptor_t vec1,
+    infiniopTensorDescriptor_t vec2,
+    float beta,
+    float alpha) {
+
+#define CREATE(CASE, NAMESPACE) \
+    case CASE: \
+        return op::addr::NAMESPACE::Descriptor::create( \
+            handle, \
+            reinterpret_cast<op::addr::NAMESPACE::Descriptor **>(desc_ptr), \
+            out, \
+            input, \
+            vec1, \
+            vec2, \
+            beta, \
+            alpha)
+
+    switch (handle->device) {
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+        CREATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_ASCEND_API
+        CREATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        CREATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+    }
+
+#undef CREATE
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopGetAddrWorkspaceSize(
+    infiniopAddrDescriptor_t desc,
+    size_t *size) {
+
+#define GET(CASE, NAMESPACE) \
+    case CASE: \
+        *size = reinterpret_cast<op::addr::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+        GET(INFINI_DEVICE_HYGON, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        GET(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        GET(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_ASCEND_API
+        GET(INFINI_DEVICE_ASCEND, ascend);
+#endif
+    }
+
+#undef GET
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopAddr(
+    infiniopAddrDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *input,
+    const void *vec1,
+    const void *vec2,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE) \
+    case CASE: \
+        return reinterpret_cast<op::addr::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, out, input, vec1, vec2, stream)
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+        CALCULATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_ASCEND_API
+        CALCULATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+    }
+
+#undef CALCULATE
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopDestroyAddrDescriptor(infiniopAddrDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE) \
+    case CASE: \
+        delete reinterpret_cast<op::addr::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+        DELETE(INFINI_DEVICE_HYGON, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        DELETE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_ASCEND_API
+        DELETE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+    }
+
+#undef DELETE
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
\ No newline at end of file
diff --git a/src/infiniop/ops/argwhere/argwhere.h b/src/infiniop/ops/argwhere/argwhere.h
new file mode 100644
index 000000000..afb067576
--- /dev/null
+++ b/src/infiniop/ops/argwhere/argwhere.h
@@ -0,0 +1,75 @@
+#ifndef __INFINIOP_ARGWHERE_H__
+#define __INFINIOP_ARGWHERE_H__
+#include "../../../utils.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+#include "infinicore.h"
+#include <vector>
+
+#define DESCRIPTOR(NAMESPACE) \
+ \
+    namespace op::argwhere::NAMESPACE { \
+    class Descriptor final : public InfiniopDescriptor { \
+        struct Opaque; \
+        Opaque *_opaque; \
+        ArgwhereInfo _info; \
+        size_t _workspace_size; \
+ \
+        Descriptor( \
+            ArgwhereInfo info, \
+            size_t workspace_size_, \
+            Opaque *opaque, \
+            infiniDevice_t device_type, \
+            int device_id) \
+            : InfiniopDescriptor{device_type, device_id}, \
+              _opaque(opaque), \
+              _info(info), \
+              _workspace_size(workspace_size_) {} \
+ \
+    public: \
+        ~Descriptor(); \
+ \
+        size_t workspaceSize() const { return _workspace_size; } \
+ \
+        static infiniStatus_t create( \
+            infiniopHandle_t handle, \
+            Descriptor **desc_ptr, \
+            infiniopTensorDescriptor_t x_desc); \
+ \
+        infiniStatus_t calculate( \
+            void *workspace, \
+            size_t workspace_size, \
+            void **y, \
+            size_t *count, \
+            const void *x, \
+            void *stream) const; \
+    }; \
+    }
+
+class ArgwhereInfo {
+private:
+    ArgwhereInfo() = default;
+
+public:
+    infiniDtype_t dtype;
+    std::vector<ptrdiff_t> strides;
+    std::vector<size_t> shapes;
+    size_t num_elements;
+
+    static utils::Result<ArgwhereInfo>
+    create(
+        infiniopTensorDescriptor_t x_desc) {
+        CHECK_OR_RETURN(x_desc != nullptr,
+                        INFINI_STATUS_NULL_POINTER);
+
+        const infiniDtype_t data_type = x_desc->dtype();
+        CHECK_DTYPE(data_type, INFINI_DTYPE_F32);
+
+        return utils::Result<ArgwhereInfo>(ArgwhereInfo{
+            data_type,
+            x_desc->strides(),
+            x_desc->shape(),
+            x_desc->numel()});
+    }
+};
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc b/src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc
new file mode 100644
index 000000000..6a92047a2
--- /dev/null
+++ b/src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc
@@ -0,0 +1,80 @@
+#include "argwhere_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include "infinicore.h"
+#include <cmath>
+#include <cstring>
+#include <vector>
+namespace op::argwhere::cpu {
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t x_desc) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+
+    auto info = ArgwhereInfo::create(x_desc);
+    CHECK_RESULT(info);
+
+    *desc_ptr = new Descriptor(
+        info.take(),
+        0,
+        nullptr,
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <typename Tdata>
+infiniStatus_t calculateArgWhere(
+    const ArgwhereInfo &info,
+    void *workspace,
+    size_t workspace_size,
+    void **y,
+    size_t *count,
+    const void *x) {
+
+    const Tdata *x_data = reinterpret_cast<const Tdata *>(x);
+    std::vector<int64_t> positions;
+    // Serial on purpose: positions.push_back is not thread-safe.
+    for (size_t i = 0; i < info.num_elements; i++) {
+        size_t pos = 0, tem = i;
+        std::vector<size_t> position(info.strides.size());
+        for (int j = info.strides.size() - 1; j >= 0; j--) {
+            position[j] = tem % info.shapes[j];
+            tem /= info.shapes[j];
+            pos += position[j] * info.strides[j];
+        }
+        if (fabs(x_data[pos] - 0.0f) > 1e-5) {
+            for (auto p : position) {
+                positions.push_back(p);
+            }
+        }
+    }
+
+    // Caller takes ownership of the index buffer.
+    *y = new int64_t[positions.size()];
+    memcpy(*y, positions.data(), positions.size() * sizeof(int64_t));
+    *count = positions.size() / info.strides.size();
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void **y,
+    size_t *count,
+    const void *x,
+    void *stream) const {
+    switch (_info.dtype) {
+    case INFINI_DTYPE_F32:
+        return calculateArgWhere<float>(_info, workspace, workspace_size, y, count, x);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::argwhere::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/argwhere/cpu/argwhere_cpu.h b/src/infiniop/ops/argwhere/cpu/argwhere_cpu.h
new file mode 100644
index 000000000..1e6461e75
--- /dev/null
+++ b/src/infiniop/ops/argwhere/cpu/argwhere_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __INFINIOP_ARGWHERE_CPU_API_H__
+#define __INFINIOP_ARGWHERE_CPU_API_H__
+
+#include "../argwhere.h"
+
+DESCRIPTOR(cpu)
+
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/argwhere/cuda/kernel.cuh b/src/infiniop/ops/argwhere/cuda/kernel.cuh
new file mode 100644
index 000000000..ff4aae126
--- /dev/null
+++ b/src/infiniop/ops/argwhere/cuda/kernel.cuh
@@ -0,0 +1,215 @@
+#ifndef ARGWHERE_NVIDIA_KERNEL_H
+#define ARGWHERE_NVIDIA_KERNEL_H
+#include <cmath>
+#include <cstddef>
+
+__device__ void Index2Pos(size_t index, size_t ndim, const size_t *shapes,
+                          size_t *pos) {
+    for (int i = ndim - 1; i >= 0; i--) {
+        pos[i] = index % shapes[i];
+        index /= shapes[i];
+    }
+}
+__device__ size_t pos2dest(size_t *pos, size_t ndim, const ptrdiff_t *strides) {
+    size_t dest = 0;
+    for (size_t i = 0; i < ndim; i++) {
+        dest += pos[i] * strides[i];
+    }
+    return dest;
+}
+
+template <typename T>
+__global__ void parallel_block_argwhere_kernel(T *data, int64_t *results,
+                                               size_t N, const size_t *shapes, const ptrdiff_t *strides, size_t ndim, size_t *count) {
+    extern __shared__ size_t tmp[];
+    size_t pos1[5], pos2[5]; // tensor coordinates of the two elements this thread scans
+    bool is_zero1 = false, is_zero2 = false;
+    int tid = threadIdx.x;
+    int leaf_num = blockDim.x * 2; // equals the length of tmp
+
+    if (tid * 2 < N) {
+        Index2Pos(tid * 2, ndim, shapes, pos1);
+        is_zero1 = fabs(data[pos2dest(pos1, ndim, strides)]) <= 1e-5;
+        tmp[tid * 2] = !is_zero1;
+    }
+    if (tid * 2 + 1 < N) {
+        Index2Pos(tid * 2 + 1, ndim, shapes, pos2);
+        is_zero2 = fabs(data[pos2dest(pos2, ndim, strides)]) <= 1e-5;
+        tmp[tid * 2 + 1] = !is_zero2;
+    }
+
+    __syncthreads();
+
+    // Up-sweep (reduce) phase of the Blelloch scan.
+    int offset = 1;
+    for (int d = leaf_num >> 1; d > 0; d >>= 1) {
+        if (tid < d) {
+            int ai = offset * (2 * tid + 1) - 1;
+            int bi = offset * (2 * tid + 2) - 1;
+            tmp[bi] += tmp[ai];
+        }
+        offset *= 2;
+        __syncthreads();
+    }
+
+    if (tid == 0) {
+        tmp[leaf_num - 1] = 0;
+    }
+    __syncthreads();
+
+    // Down-sweep phase turns the reduction tree into an exclusive prefix sum.
+    for (int d = 1; d < leaf_num; d *= 2) {
+        offset >>= 1;
+        if (tid < d) {
+            int ai = offset * (2 * tid + 1) - 1;
+            int bi = offset * (2 * tid + 2) - 1;
+
+            size_t v = tmp[ai];
+            tmp[ai] = tmp[bi];
+            tmp[bi] += v;
+        }
+        __syncthreads();
+    }
+
+    // Write the final results.
+    if (!is_zero1 && tid * 2 < N) {
+        for (int i = 0; i < ndim; i++) {
+            results[tmp[2 * tid] * ndim + i] = pos1[i];
+        }
+    }
+    if (!is_zero2 && tid * 2 + 1 < N) {
+        for (int i = 0; i < ndim; i++) {
+            results[tmp[2 * tid + 1] * ndim + i] = pos2[i];
+        }
+    }
+    if (tid == blockDim.x - 1) {
+        *count = tmp[N - 1] + (N == blockDim.x * 2 ? 1 : tmp[N] != tmp[N - 1]);
+    }
+}
+
+template <typename T>
+__global__ void parallel_block_scan_kernel(size_t N, int64_t *pre_sum) {
+    // single block scan
+    extern __shared__ int64_t tmp[];
+    int tid = threadIdx.x;
+    int leaf_num = blockDim.x * 2; // equals the length of tmp
+
+    tmp[tid * 2] = tid * 2 < N - 1 ? pre_sum[tid * 2] : 0;
+    tmp[tid * 2 + 1] = tid * 2 + 1 < N - 1 ? pre_sum[tid * 2 + 1] : 0;
+    __syncthreads();
+
+    int offset = 1;
+    for (int d = leaf_num >> 1; d > 0; d >>= 1) {
+        if (tid < d) {
+            int ai = offset * (2 * tid + 1) - 1;
+            int bi = offset * (2 * tid + 2) - 1;
+            tmp[bi] += tmp[ai];
+        }
+        offset *= 2;
+        __syncthreads();
+    }
+    if (tid == 0) {
+        tmp[leaf_num - 1] = 0;
+    }
+    __syncthreads();
+    for (int d = 1; d < leaf_num; d <<= 1) {
+        offset >>= 1;
+        if (tid < d) {
+            int ai = offset * (2 * tid + 1) - 1;
+            int bi = offset * (2 * tid + 2) - 1;
+
+            int64_t v = tmp[ai];
+            tmp[ai] = tmp[bi];
+            tmp[bi] += v;
+        }
+        __syncthreads();
+    }
+    if (tid * 2 < N) {
+        pre_sum[tid * 2] = tmp[tid * 2];
+    }
+    if (tid * 2 + 1 < N) {
+        pre_sum[tid * 2 + 1] = tmp[tid * 2 + 1];
+    }
+}
+
+template <typename T>
+__global__ void
+parallel_large_argwhere_kernel(const T *data, int64_t *block_sum,
+                               int64_t *results, size_t N, const size_t *shapes,
+                               const ptrdiff_t *strides, size_t ndim) {
+    extern __shared__ int64_t tmp_argwhere[];
+    int bid = blockIdx.x, tid = threadIdx.x;
+    size_t pos1[5], pos2[5]; // tensor coordinates of the two elements this thread scans
+    bool is_zero1 = false, is_zero2 = false;
+    int block_offset = bid * blockDim.x * 2, leaf_num = blockDim.x * 2;
+    tmp_argwhere[2 * tid] = tmp_argwhere[2 * tid + 1] = 0;
+    if (block_offset + tid * 2 < N) {
+        Index2Pos(block_offset + tid * 2, ndim, shapes, pos1);
+        is_zero1 = fabs(data[pos2dest(pos1, ndim, strides)]) <= 1e-9;
+        tmp_argwhere[2 * tid] = !is_zero1;
+    }
+    if (block_offset + tid * 2 + 1 < N) {
+        Index2Pos(block_offset + tid * 2 + 1, ndim, shapes, pos2);
+        is_zero2 = fabs(data[pos2dest(pos2, ndim, strides)]) <= 1e-9;
+        tmp_argwhere[2 * tid + 1] = !is_zero2;
+    }
+    __syncthreads();
+    int offset = 1;
+    for (int d = leaf_num >> 1; d > 0; d >>= 1) {
+        if (tid < d) {
+            int ai = offset * (2 * tid + 1) - 1;
+            int bi = offset * (2 * tid + 2) - 1;
+            tmp_argwhere[bi] += tmp_argwhere[ai];
+        }
+        offset *= 2;
+        __syncthreads();
+    }
+    if (tid == 0) {
+        block_sum[bid] = tmp_argwhere[leaf_num - 1];
+        tmp_argwhere[leaf_num - 1] = 0;
+    }
+    __syncthreads();
+    for (int d = 1; d < leaf_num; d <<= 1) {
+        offset >>= 1;
+        if (tid < d) {
+            int ai = offset * (2 * tid + 1) - 1;
+            int bi = offset * (2 * tid + 2) - 1;
+            int64_t v = tmp_argwhere[ai];
+            tmp_argwhere[ai] = tmp_argwhere[bi];
+            tmp_argwhere[bi] += v;
+        }
+        __syncthreads();
+    }
+
+    if (!is_zero1 && block_offset + tid * 2 < N) {
+        for (int i = 0; i < ndim; i++) {
+            results[(tmp_argwhere[2 * tid] + block_offset) * ndim + i] = pos1[i];
+        }
+    }
+    if (!is_zero2 && block_offset + tid * 2 + 1 < N) {
+        for (int i = 0; i < ndim; i++) {
+            results[(tmp_argwhere[2 * tid + 1] + block_offset) * ndim + i] = pos2[i];
+        }
+    }
+}
+
+__global__ void add_block_offset_kernel(int64_t *results, int64_t *tmp,
+                                        int64_t *block_sums, size_t ndim) {
+    int bid = blockIdx.x, tid = threadIdx.x;
+    size_t block_offset = block_sums[bid], origin_offset = bid * blockDim.x * 2;
+    if (2 * tid < block_sums[bid + 1] - block_sums[bid]) {
+        for (int i = 0; i < ndim; i++) {
+            results[(block_offset + 2 * tid) * ndim + i] = tmp[(origin_offset + tid * 2) * ndim + i];
+        }
+    }
+    if (2 * tid + 1 < block_sums[bid + 1] - block_sums[bid]) {
+        for (int i = 0; i < ndim; i++) {
+            results[(block_offset + 2 * tid + 1) * ndim + i] = tmp[(origin_offset + tid * 2 + 1) * ndim + i];
+        }
+    }
+}
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/argwhere/metax/argwhere_metax.h b/src/infiniop/ops/argwhere/metax/argwhere_metax.h
new file mode 100644
index 000000000..4e85f870c
--- /dev/null
+++ b/src/infiniop/ops/argwhere/metax/argwhere_metax.h
@@ -0,0 +1,8 @@
+#ifndef ARGWHERE_METAX_H
+#define ARGWHERE_METAX_H
+
+#include "../argwhere.h"
+
+DESCRIPTOR(metax)
+
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/argwhere/metax/argwhere_metax.maca b/src/infiniop/ops/argwhere/metax/argwhere_metax.maca
new file mode 100644
index 000000000..7f3c5fb4a
--- /dev/null
+++ b/src/infiniop/ops/argwhere/metax/argwhere_metax.maca
@@ -0,0 +1,94 @@
+#include "../../../devices/metax/metax_common.h"
+#include "../../../devices/metax/metax_kernel_common.h"
+#include "../cuda/kernel.cuh"
+#include "argwhere_metax.h"
+#include "infinicore.h"
+#include <memory>
+#include <vector>
+
+infiniStatus_t launchKernel(const void *data, int64_t *results, size_t N,
+                            size_t M, const size_t *shapes,
+                            const ptrdiff_t *strides, size_t ndim,
+                            infiniDtype_t dtype, size_t *count) {
+
+    if (dtype == INFINI_DTYPE_F32) {
+        // NB: runs on the default stream; the caller synchronizes the copy
+        // stream around this call.
+        parallel_block_argwhere_kernel<<<1, M / 2, M * sizeof(size_t)>>>(
+            (float *)data, results, N, shapes, strides, ndim, count);
+    } else {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+namespace op::argwhere::metax {
+struct Descriptor::Opaque {
+    std::shared_ptr<device::metax::Handle::Internal> internal;
+};
+
+Descriptor::~Descriptor() { delete _opaque; }
+
+infiniStatus_t Descriptor::create(infiniopHandle_t handle,
+                                  Descriptor **desc_ptr,
+                                  infiniopTensorDescriptor_t x_desc) {
+    auto info = ArgwhereInfo::create(x_desc);
+    CHECK_RESULT(info);
+    size_t workspace_size = x_desc->ndim() * sizeof(size_t) * 2 + x_desc->ndim() * sizeof(int64_t) * x_desc->numel() + sizeof(size_t);
+    *desc_ptr = new Descriptor(
+        info.take(), workspace_size,
+        new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
+        handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+int nextPowerOfTwo(int x) {
+    int power = 1;
+    while (power < x) {
+        power *= 2;
+    }
+    return power;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void **y, size_t *count, const void *x,
+                                     void *stream) const {
+    hcStream_t hc_stream = static_cast<hcStream_t>(stream);
+    size_t ndim = _info.strides.size();
+    ptrdiff_t *strides = static_cast<ptrdiff_t *>(workspace);
+    size_t *shapes = reinterpret_cast<size_t *>(strides + ndim);
+    int64_t *result = reinterpret_cast<int64_t *>(shapes + ndim);
+    size_t *count_cuda = reinterpret_cast<size_t *>(result + _info.num_elements * ndim);
+
+    hcMemcpyAsync(shapes, _info.shapes.data(),
+                  _info.shapes.size() * sizeof(size_t), hcMemcpyHostToDevice,
+                  hc_stream);
+    hcMemcpyAsync(strides, _info.strides.data(),
+                  _info.strides.size() * sizeof(ptrdiff_t), hcMemcpyHostToDevice,
+                  hc_stream);
+    // Ensure the staged shape/stride arrays are visible before the kernel,
+    // which launches on the default stream.
+    hcStreamSynchronize(hc_stream);
+    size_t M = nextPowerOfTwo(_info.num_elements);
+    CHECK_STATUS(launchKernel(x, result, _info.num_elements, M, shapes, strides,
+                              ndim, INFINI_DTYPE_F32, count_cuda));
+    // Read the match count back from device memory.
+    hcMemcpyAsync(count, count_cuda, sizeof(size_t), hcMemcpyDeviceToHost,
+                  hc_stream);
+    hcStreamSynchronize(hc_stream);
+
+    // Copy the result indices back; the caller takes ownership of *y.
+    *y = new int64_t[(*count) * ndim];
+    hcMemcpyAsync(*y, result, sizeof(int64_t) * (*count) * ndim,
+                  hcMemcpyDeviceToHost, hc_stream);
+    hcStreamSynchronize(hc_stream);
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::argwhere::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/argwhere/moore/argwhere_kernel.h b/src/infiniop/ops/argwhere/moore/argwhere_kernel.h
new file mode 100644
index 000000000..dd67c877c
--- /dev/null
+++ b/src/infiniop/ops/argwhere/moore/argwhere_kernel.h
@@ -0,0 +1,91 @@
+#ifndef ARGWHERE_MOORE_KERNEL_H
+#define ARGWHERE_MOORE_KERNEL_H
+
+__device__ void Index2Pos(size_t index, size_t ndim, const size_t *shapes,
+                          size_t *pos) {
+    for (int i = ndim - 1; i >= 0; i--) {
+        pos[i] = index % shapes[i];
+        index /= shapes[i];
+    }
+}
+__device__ size_t pos2dest(size_t *pos, size_t ndim, const size_t *shapes,
+                           const ptrdiff_t *strides) {
+    size_t dest = 0;
+    for (size_t i = 0; i < ndim; i++) {
+        dest += pos[i] * strides[i];
+    }
+    return dest;
+}
+template <typename T>
+__global__ void parallel_block_argwhere_kernel(T *data, int64_t *results,
+                                               size_t N, const size_t *shapes,
+                                               const ptrdiff_t *strides,
+                                               size_t ndim, size_t *count) {
+    extern __shared__ size_t tmp[];
+    size_t pos1[5], pos2[5]; // tensor coordinates of the two elements this thread scans
+    bool is_zero1 = false, is_zero2 = false;
+    int tid = threadIdx.x;
+    int leaf_num = blockDim.x * 2;
+
+    if (tid * 2 < N) {
+        Index2Pos(tid * 2, ndim, shapes, pos1);
+        is_zero1 = fabs(data[pos2dest(pos1, ndim, shapes, strides)]) <= 1e-5;
+        tmp[tid * 2] = !is_zero1;
+    } else {
+        tmp[tid * 2] = 0;
+    }
+
+    if (tid * 2 + 1 < N) {
+        Index2Pos(tid * 2 + 1, ndim, shapes, pos2);
+        is_zero2 = fabs(data[pos2dest(pos2, ndim, shapes, strides)]) <= 1e-5;
+        tmp[tid * 2 + 1] = !is_zero2;
+    } else {
+        tmp[tid * 2 + 1] = 0;
+    }
+
+    __syncthreads();
+
+    // Up-sweep (reduce) phase of the Blelloch scan.
+    int offset = 1;
+    for (int d = leaf_num >> 1; d > 0; d >>= 1) {
+        if (tid < d) {
+            int ai = offset * (2 * tid + 1) - 1;
+            int bi = offset * (2 * tid + 2) - 1;
+            tmp[bi] += tmp[ai];
+        }
+        offset *= 2;
+        __syncthreads();
+    }
+
+    if (tid == 0) {
+        tmp[leaf_num - 1] = 0;
+    }
+    __syncthreads();
+
+    // Down-sweep phase turns the reduction tree into an exclusive prefix sum.
+    for (int d = 1; d < leaf_num; d *= 2) {
+        offset >>= 1;
+        if (tid < d) {
+            int ai = offset * (2 * tid + 1) - 1;
+            int bi = offset * (2 * tid + 2) - 1;
+
+            size_t v = tmp[ai];
+            tmp[ai] = tmp[bi];
+            tmp[bi] += v;
+        }
+        __syncthreads();
+    }
+    if (!is_zero1 && tid * 2 < N) {
+        for (int i = 0; i < ndim; i++) {
tid] * ndim + i] = pos1[i];
+ }
+ }
+ if (!is_zero2 && tid * 2 + 1 < N) {
+ for (int i = 0; i < ndim; i++) {
+ results[tmp[2 * tid + 1] * ndim + i] = pos2[i];
+ }
+ }
+ if (tid == blockDim.x - 1) {
+ // tmp now holds exclusive prefix sums, so tmp[N - 1] counts the nonzeros
+ // before the last element; add the last element's own flag. When N fills
+ // the block exactly, tmp[N] is out of range, so use this thread's saved
+ // is_zero2 flag instead.
+ *count = tmp[N - 1] + (N == blockDim.x * 2 ? (size_t)(!is_zero2) : (size_t)(tmp[N] != tmp[N - 1]));
+ }
+}
+
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/argwhere/moore/argwhere_moore.h b/src/infiniop/ops/argwhere/moore/argwhere_moore.h new file mode 100644 index 000000000..63149291a --- /dev/null +++ b/src/infiniop/ops/argwhere/moore/argwhere_moore.h @@ -0,0 +1,8 @@
+#ifndef ARGWHERE_MOORE_H
+#define ARGWHERE_MOORE_H
+
+#include "../argwhere.h"
+
+DESCRIPTOR(moore)
+
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/argwhere/moore/argwhere_moore.mu b/src/infiniop/ops/argwhere/moore/argwhere_moore.mu new file mode 100644 index 000000000..4d5007731 --- /dev/null +++ b/src/infiniop/ops/argwhere/moore/argwhere_moore.mu @@ -0,0 +1,107 @@
+#include "../../../devices/moore/moore_common.h"
+#include "../../../devices/moore/moore_kernel_common.h"
+#include "argwhere_kernel.h"
+#include "argwhere_moore.h"
+#include "infinicore.h"
+#include
+
+infiniStatus_t launchKernel(const void *data, int64_t *results, size_t N,
+ size_t M, const size_t *shapes,
+ const ptrdiff_t *strides, size_t ndim,
+ infiniDtype_t dtype, size_t *count) {
+
+ if (dtype == INFINI_DTYPE_F32) {
+ parallel_block_argwhere_kernel<<<1, M / 2, M * sizeof(size_t)>>>(
+ (float *)data, results, N, shapes, strides, ndim, count);
+ } else {
+ return INFINI_STATUS_BAD_TENSOR_DTYPE;
+ }
+ return INFINI_STATUS_SUCCESS;
+}
+
+namespace op::argwhere::moore {
+struct Descriptor::Opaque {
+ std::shared_ptr internal;
+};
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(infiniopHandle_t handle,
+ Descriptor **desc_ptr,
+ infiniopTensorDescriptor_t x_desc) {
+ auto info = ArgwhereInfo::create(x_desc);
+ CHECK_RESULT(info);
+ size_t workspace_size = x_desc->ndim() * sizeof(size_t) * 2
+ + x_desc->ndim() * sizeof(int64_t) * x_desc->numel()
+ + sizeof(size_t);
+ *desc_ptr = new Descriptor(
+ info.take(), workspace_size,
+ new Opaque{
+ reinterpret_cast(handle)->internal()},
+ handle->device, handle->device_id);
+ return INFINI_STATUS_SUCCESS;
+}
+
+int nextPowerOfTwo(int x) {
+ int power = 1;
+ while (power < x) {
+ power *= 2;
+ }
+ return power;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+ void **y, size_t *count, const void *x,
+ void *stream) const {
+ musaStream_t moore_stream = static_cast<musaStream_t>(stream);
+ size_t ndim = _info.strides.size();
+ ptrdiff_t *strides = static_cast<ptrdiff_t *>(workspace);
+ size_t *shapes = reinterpret_cast<size_t *>(strides + ndim);
+ int64_t *result = reinterpret_cast<int64_t *>(shapes + ndim);
+ size_t *count_cuda =
+ reinterpret_cast<size_t *>(result + _info.num_elements * ndim);
+
+ musaMemcpyAsync(shapes, _info.shapes.data(),
+ _info.shapes.size() * sizeof(size_t), musaMemcpyHostToDevice,
+ moore_stream);
+ musaMemcpyAsync(strides, _info.strides.data(),
+ _info.strides.size() * sizeof(ptrdiff_t),
+ musaMemcpyHostToDevice, moore_stream);
+ size_t M = nextPowerOfTwo(_info.num_elements);
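+ // Note: the single-block scan kernel assumes a power-of-two leaf count;
+ // e.g. for 1000 elements nextPowerOfTwo yields M == 1024, so launchKernel
+ // starts M / 2 == 512 threads with M * sizeof(size_t) bytes of shared memory.
+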
CHECK_STATUS(launchKernel(x, result, _info.num_elements, M, shapes, strides,
+ ndim, INFINI_DTYPE_F32, count_cuda));
+ // read the element count back from device memory
+ musaMemcpyAsync(count, count_cuda, sizeof(size_t), musaMemcpyDeviceToHost,
+ moore_stream);
+ musaStreamSynchronize(moore_stream);
+
+ // copy the matching indices back to the host
+ *y = new int64_t[(*count) * ndim];
+ musaMemcpyAsync(*y, result, sizeof(int64_t) * (*count) * ndim,
+ musaMemcpyDeviceToHost, moore_stream);
+ musaStreamSynchronize(moore_stream);
+ return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::argwhere::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cu b/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cu new file mode 100644 index 000000000..20e46f7bc --- /dev/null +++ b/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cu @@ -0,0 +1,107 @@
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
+#include "../cuda/kernel.cuh"
+#include "argwhere_nvidia.cuh"
+#include "infinicore.h"
+#include
+#include
+
+int nextPowerOfTwo(int x) {
+ int power = 1;
+ while (power < x) {
+ power *= 2;
+ }
+ return power;
+}
+
+infiniStatus_t launchKernel(const void *data, int64_t *tmp, int64_t *results,
+ int64_t *block_sum, size_t N, const size_t *shapes,
+ const ptrdiff_t *strides, size_t ndim,
+ infiniDtype_t dtype, size_t *count,
+ int maxThreadsPerBlock, cudaStream_t stream) {
+
+ if (dtype == INFINI_DTYPE_F32) {
+ int block_size = maxThreadsPerBlock * 2;
+ int num_blocks = (N + block_size - 1) / block_size;
+ parallel_large_argwhere_kernel
+ <<<num_blocks, maxThreadsPerBlock, sizeof(int64_t) * block_size, stream>>>(
+ (float *)data, block_sum, tmp, N, shapes,
+ strides, ndim);
+ if (num_blocks > 1) {
+ // compute prefix sums of the per-block counts
+ parallel_block_scan_kernel
+ <<<1, maxThreadsPerBlock, sizeof(int64_t) * block_size, stream>>>(
+ num_blocks + 1, block_sum);
+ // compact the per-block results into their final positions
+ add_block_offset_kernel<<<num_blocks, maxThreadsPerBlock, 0, stream>>>(
+ results, tmp, block_sum, ndim);
+ cudaMemcpyAsync(count, &block_sum[num_blocks], sizeof(size_t),
+ cudaMemcpyDeviceToHost, stream);
+ } else {
+ cudaMemcpyAsync(count, &block_sum[0], sizeof(size_t),
+ cudaMemcpyDeviceToHost, stream);
+ // *count is dereferenced below, so the copy must have completed first
+ cudaStreamSynchronize(stream);
+ cudaMemcpyAsync(results, tmp, sizeof(int64_t) * (*count) * ndim,
+ cudaMemcpyDeviceToDevice, stream);
+ }
+ } else {
+ return INFINI_STATUS_BAD_TENSOR_DTYPE;
+ }
+ return INFINI_STATUS_SUCCESS;
+}
+
+namespace op::argwhere::nvidia {
+struct Descriptor::Opaque {
+ std::shared_ptr internal;
+};
+
+Descriptor::~Descriptor() { delete _opaque; }
+
+infiniStatus_t Descriptor::create(infiniopHandle_t handle,
+ Descriptor **desc_ptr,
+ infiniopTensorDescriptor_t x_desc) {
+ auto info = ArgwhereInfo::create(x_desc);
+ CHECK_RESULT(info);
+ Opaque *opaque = new Opaque{
+ reinterpret_cast(handle)->internal()};
+ size_t workspace_size = x_desc->ndim() * sizeof(size_t) * 2 + x_desc->ndim() * sizeof(int64_t) * x_desc->numel() * 2 + sizeof(int64_t) * opaque->internal->maxThreadsPerBlock();
+ *desc_ptr = new Descriptor(info.take(), workspace_size, opaque,
+ handle->device, handle->device_id);
+ return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+ void **y, size_t *count, const void *x,
+ void *stream) const {
+ cudaStream_t cuda_stream = static_cast<cudaStream_t>(stream);
+ size_t ndim = _info.strides.size();
+ ptrdiff_t *strides = 
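/* workspace layout: [ndim strides][ndim shapes][num_elements * ndim tmp rows][num_elements * ndim result rows][per-block sums] */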
static_cast<ptrdiff_t *>(workspace);
+ size_t *shapes = reinterpret_cast<size_t *>(strides + ndim);
+ int64_t *tmp = reinterpret_cast<int64_t *>(shapes + ndim);
+ int64_t *result = reinterpret_cast<int64_t *>(tmp + _info.num_elements * ndim);
+ int64_t *block_sum = reinterpret_cast<int64_t *>(result + _info.num_elements * ndim);
+
+ cudaMemcpyAsync(shapes, _info.shapes.data(),
+ _info.shapes.size() * sizeof(size_t), cudaMemcpyHostToDevice,
+ cuda_stream);
+ cudaMemcpyAsync(strides, _info.strides.data(),
+ _info.strides.size() * sizeof(ptrdiff_t),
+ cudaMemcpyHostToDevice, cuda_stream);
+
+ CHECK_STATUS(launchKernel(x, tmp, result, block_sum, _info.num_elements,
+ shapes, strides, ndim, INFINI_DTYPE_F32, count,
+ _opaque->internal->maxThreadsPerBlock(),
+ cuda_stream));
+ // wait until the element count has been copied back from device memory
+ cudaStreamSynchronize(cuda_stream);
+
+ // copy the matching indices back to the host
+ *y = new int64_t[(*count) * ndim];
+
+ cudaMemcpyAsync(*y, result, sizeof(int64_t) * (*count) * ndim,
+ cudaMemcpyDeviceToHost, cuda_stream);
+ cudaStreamSynchronize(cuda_stream);
+
+ return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::argwhere::nvidia
diff --git a/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cuh b/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cuh new file mode 100644 index 000000000..fdc1dd8a5 --- /dev/null +++ b/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cuh @@ -0,0 +1,8 @@
+#ifndef ARGWHERE_NVIDIA_H
+#define ARGWHERE_NVIDIA_H
+
+#include "../argwhere.h"
+
+DESCRIPTOR(nvidia)
+
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/argwhere/operator.cc b/src/infiniop/ops/argwhere/operator.cc new file mode 100644 index 000000000..d6c5dfa7f --- /dev/null +++ b/src/infiniop/ops/argwhere/operator.cc @@ -0,0 +1,200 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/argwhere.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/argwhere_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/argwhere_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/argwhere_metax.h"
+#endif
+#ifdef ENABLE_KUNLUN_API
+#include "kunlun/argwhere_kunlun.cuh"
+#endif
+#ifdef ENABLE_CAMBRICON_API
+#include "cambricon/argwhere_cambricon.cuh"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/argwhere_moore.h"
+#endif
+
+__C infiniStatus_t infiniopCreateArgwhereDescriptor(
+ infiniopHandle_t handle,
+ infiniopArgwhereDescriptor_t *desc_ptr,
+ infiniopTensorDescriptor_t input_desc) {
+
+#define CREATE(CASE, NAMESPACE) \
+ case CASE: \
+ return op::argwhere::NAMESPACE::Descriptor::create( \
+ handle, \
+ reinterpret_cast(desc_ptr), \
+ input_desc)
+
+ switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+ CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+ CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+ CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+ CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+ CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+ CREATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+ CREATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_MOORE_API
+ CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+ default:
+ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+ }
+
+#undef CREATE
+}
+
+__C infiniStatus_t infiniopGetArgwhereWorkspaceSize(
+ infiniopArgwhereDescriptor_t desc,
+ size_t *size) {
+
+#define GET(CASE, NAMESPACE) \
+ case CASE: \
+ *size = reinterpret_cast(desc)->workspaceSize(); \
+ return INFINI_STATUS_SUCCESS
+
+ switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
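+ // For reference, each GET(CASE, NAMESPACE) entry expands, roughly, to:
+ //   case INFINI_DEVICE_CPU:
+ //       *size = reinterpret_cast<op::argwhere::cpu::Descriptor *>(desc)->workspaceSize();
+ //       return INFINI_STATUS_SUCCESS;
+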
GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopArgwhere( + infiniopArgwhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void **output, + size_t *count, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, count, input, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyArgwhereDescriptor(infiniopArgwhereDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.cc b/src/infiniop/ops/asin/cpu/asin_cpu.cc new file mode 100644 index 000000000..c95eb3c24 --- /dev/null +++ b/src/infiniop/ops/asin/cpu/asin_cpu.cc @@ -0,0 +1,52 @@ +#include "asin_cpu.h" + +namespace op::asin::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise 
descriptor
+ CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+ return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+ void *workspace,
+ size_t workspace_size,
+ void *output,
+ std::vector<const void *> inputs,
+ void *stream) const {
+
+ switch (_dtype) {
+ case INFINI_DTYPE_F16:
+ return _device_info->calculate<AsinOp, fp16_t>(_info, output, inputs, stream);
+ case INFINI_DTYPE_F32:
+ return _device_info->calculate<AsinOp, float>(_info, output, inputs, stream);
+ case INFINI_DTYPE_F64:
+ return _device_info->calculate<AsinOp, double>(_info, output, inputs, stream);
+ case INFINI_DTYPE_BF16:
+ return _device_info->calculate<AsinOp, bf16_t>(_info, output, inputs, stream);
+ default:
+ return INFINI_STATUS_BAD_TENSOR_DTYPE;
+ }
+
+ return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::asin::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.h b/src/infiniop/ops/asin/cpu/asin_cpu.h new file mode 100644 index 000000000..a07404598 --- /dev/null +++ b/src/infiniop/ops/asin/cpu/asin_cpu.h @@ -0,0 +1,21 @@
+#ifndef __ASIN_CPU_H__
+#define __ASIN_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include <cmath>
+
+ELEMENTWISE_DESCRIPTOR(asin, cpu)
+
+namespace op::asin::cpu {
+typedef struct AsinOp {
+public:
+ static constexpr size_t num_inputs = 1;
+
+ template <typename T>
+ T operator()(const T &input) const {
+ return std::asin(input);
+ }
+} AsinOp;
+} // namespace op::asin::cpu
+
+#endif // __ASIN_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/asin/cuda/kernel.cuh b/src/infiniop/ops/asin/cuda/kernel.cuh new file mode 100644 index 000000000..2d4160ca6 --- /dev/null +++ b/src/infiniop/ops/asin/cuda/kernel.cuh @@ -0,0 +1,42 @@
+#ifndef __ASIN_CUDA_H__
+#define __ASIN_CUDA_H__
+
+namespace op::asin::cuda {
+typedef struct AsinOp {
+ static constexpr size_t num_inputs = 1;
+
+ __device__ __forceinline__ float asin_f32_func(float x) const {
+ return asinf(x);
+ }
+ template <typename T>
+ __device__ __forceinline__ T operator()(const T &input) const {
+ if constexpr (std::is_same_v<T, half2>) {
+ float2 vf = __half22float2(input);
+ float2 vr = make_float2(asin_f32_func(vf.x), asin_f32_func(vf.y));
+ return __float22half2_rn(vr);
+ } else if constexpr (std::is_same_v<T, half>) {
+ float xf = __half2float(input);
+ float yf = asin_f32_func(xf);
+ return __float2half_rn(yf);
+ } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+ float f0 = __bfloat162float(__low2bfloat16(input));
+ float f1 = __bfloat162float(__high2bfloat16(input));
+ float r0 = asin_f32_func(f0);
+ float r1 = asin_f32_func(f1);
+ return __floats2bfloat162_rn(r0, r1);
+ } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+ float xf = __bfloat162float(input);
+ float rf = asin_f32_func(xf);
+ return __float2bfloat16_rn(rf);
+ } else if constexpr (std::is_same_v<T, float>) {
+ return asin_f32_func(input);
+ } else if constexpr (std::is_same_v<T, double>) {
+ return std::asin(input);
+ } else {
+ return std::asin(input);
+ }
+ }
+} AsinOp;
+} // namespace op::asin::cuda
+
+#endif // __ASIN_CUDA_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/asin/metax/asin_metax.h b/src/infiniop/ops/asin/metax/asin_metax.h new file mode 100644 index 000000000..e1805508d --- /dev/null +++ b/src/infiniop/ops/asin/metax/asin_metax.h @@ -0,0 +1,8 @@
+#ifndef __ASIN_METAX_API_H__
+#define __ASIN_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(asin, metax)
+
+#endif // __ASIN_METAX_API_H__
diff --git a/src/infiniop/ops/asin/metax/asin_metax.maca b/src/infiniop/ops/asin/metax/asin_metax.maca new file mode 100644 index 000000000..89567ac01 --- 
/dev/null +++ b/src/infiniop/ops/asin/metax/asin_metax.maca @@ -0,0 +1,61 @@ +#include "../../../elementwise/metax/elementwise_metax.h" +#include "asin_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::asin::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t +Descriptor::create(infiniopHandle_t handle_, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, + INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AsinOp, half>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::AsinOp, cuda_bfloat16>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AsinOp, float>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::AsinOp, double>( + _info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asin::metax \ No newline at end of file diff --git a/src/infiniop/ops/asin/moore/asin_moore.h b/src/infiniop/ops/asin/moore/asin_moore.h new file mode 100644 index 000000000..49394ea9d --- /dev/null +++ b/src/infiniop/ops/asin/moore/asin_moore.h @@ -0,0 +1,8 @@ +#ifndef __ASIN_MOORE_API_H__ +#define __ASIN_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(asin, moore) + +#endif // __ASIN_MOORE_API_H__ diff --git a/src/infiniop/ops/asin/moore/asin_moore.mu b/src/infiniop/ops/asin/moore/asin_moore.mu new file mode 100644 index 000000000..b5bbd0eb9 --- /dev/null +++ b/src/infiniop/ops/asin/moore/asin_moore.mu @@ -0,0 +1,60 @@ +#include "asin_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "asin_moore_kernel.h" + +namespace op::asin::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create MOORE elementwise descriptor + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + 
void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::AsinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::AsinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::AsinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::AsinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asin::moore diff --git a/src/infiniop/ops/asin/moore/asin_moore_kernel.h b/src/infiniop/ops/asin/moore/asin_moore_kernel.h new file mode 100644 index 000000000..0401417f2 --- /dev/null +++ b/src/infiniop/ops/asin/moore/asin_moore_kernel.h @@ -0,0 +1,42 @@ +#ifndef __ASIN_MOORE_KERNEL_H__ +#define __ASIN_MOORE_KERNEL_H__ + +namespace op::asin::moore { +typedef struct AsinOp { + static constexpr size_t num_inputs = 1; + + __device__ __forceinline__ float asin_f32_func(float x) const { + return asinf(x); + } + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(asin_f32_func(vf.x), asin_f32_func(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = asin_f32_func(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + float r0 = asin_f32_func(f0); + float r1 = asin_f32_func(f1); + return __floats2bfloat162_rn(r0, r1); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + float rf = asin_f32_func(xf); + return __float2bfloat16_rn(rf); + } else if constexpr (std::is_same_v) { + return asin_f32_func(input); + } else if constexpr (std::is_same_v) { + return std::asin(input); + } else { + return std::asin(input); + } + } +} AsinOp; +} // namespace op::asin::moore + +#endif // __ASIN_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cu b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu new file mode 100644 index 000000000..206c277b1 --- /dev/null +++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu @@ -0,0 +1,61 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "asin_nvidia.cuh" + +namespace op::asin::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t +Descriptor::create(infiniopHandle_t handle_, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, + INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, + 
std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AsinOp, half>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::AsinOp, cuda_bfloat16>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AsinOp, float>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::AsinOp, double>( + _info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asin::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh b/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh new file mode 100644 index 000000000..681879341 --- /dev/null +++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ASIN_CUDA_API_H__ +#define __ASIN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(asin, nvidia) + +#endif // __ASIN_CUDA_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/asin/operator.cc b/src/infiniop/ops/asin/operator.cc new file mode 100644 index 000000000..41d2e7897 --- /dev/null +++ b/src/infiniop/ops/asin/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/asin.h" + +#ifdef ENABLE_CPU_API +#include "cpu/asin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/asin_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/asin_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/asin_moore.h" +#endif + +__C infiniStatus_t infiniopCreateAsinDescriptor( + infiniopHandle_t handle, + infiniopAsinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::asin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t 
infiniopAsin(
+ infiniopAsinDescriptor_t desc,
+ void *workspace,
+ size_t workspace_size,
+ void *output,
+ const void *input,
+ void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE) \
+ case CASE: \
+ return reinterpret_cast(desc) \
+ ->calculate(workspace, workspace_size, output, {input}, stream)
+
+ switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+ CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+ CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+ CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+ CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+ CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+ default:
+ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+ }
+
+#undef CALCULATE
+}
+
+__C infiniStatus_t
+infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE) \
+ case CASE: \
+ delete reinterpret_cast(desc); \
+ return INFINI_STATUS_SUCCESS
+
+ switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+ DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+ DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+ DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+ DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+ DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+ default:
+ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+ }
+
+#undef DELETE
+}
\ No newline at end of file
diff --git a/src/infiniop/ops/fmin/cpu/fmin_cpu.cc b/src/infiniop/ops/fmin/cpu/fmin_cpu.cc new file mode 100644 index 000000000..b9afb2eb4 --- /dev/null +++ b/src/infiniop/ops/fmin/cpu/fmin_cpu.cc @@ -0,0 +1,58 @@
+#include "fmin_cpu.h"
+
+namespace op::fmin::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+ infiniopHandle_t handle_,
+ Descriptor **desc_ptr,
+ infiniopTensorDescriptor_t out_desc,
+ std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+ auto handle = reinterpret_cast(handle_);
+ auto dtype = out_desc->dtype();
+
+ const auto &a_desc = input_desc_vec.at(0);
+ const auto &b_desc = input_desc_vec.at(1);
+ const auto &c_shape = out_desc->shape();
+ const auto &a_shape = a_desc->shape();
+ const auto &b_shape = b_desc->shape();
+
+ CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
+
+ CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+
+ // create CPU elementwise descriptor
+ CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+ return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+ void *workspace,
+ size_t workspace_size,
+ void *output,
+ std::vector<const void *> inputs,
+ void *stream) const {
+
+ switch (_dtype) {
+ case INFINI_DTYPE_F16:
+ return _device_info->calculate<FminOp, fp16_t>(_info, output, inputs, stream);
+ case INFINI_DTYPE_F32:
+ return _device_info->calculate<FminOp, float>(_info, output, inputs, stream);
+ case INFINI_DTYPE_F64:
+ return _device_info->calculate<FminOp, double>(_info, output, inputs, stream);
+ case INFINI_DTYPE_BF16:
+ return _device_info->calculate<FminOp, bf16_t>(_info, output, inputs, stream);
+ case INFINI_DTYPE_I32:
+ return _device_info->calculate<FminOp, int32_t>(_info, output, inputs, stream);
+ case INFINI_DTYPE_I64:
+ return _device_info->calculate<FminOp, int64_t>(_info, output, inputs, stream);
+ default:
+ return INFINI_STATUS_BAD_TENSOR_DTYPE;
+ }
+
+ return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::fmin::cpu
diff --git a/src/infiniop/ops/fmin/cpu/fmin_cpu.h b/src/infiniop/ops/fmin/cpu/fmin_cpu.h new file mode 100644 index 
000000000..f0010feea --- /dev/null +++ b/src/infiniop/ops/fmin/cpu/fmin_cpu.h @@ -0,0 +1,26 @@
+#ifndef __FMIN_CPU_H__
+#define __FMIN_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(fmin, cpu)
+
+namespace op::fmin::cpu {
+typedef struct FminOp {
+public:
+ static constexpr size_t num_inputs = 2;
+ template <typename T>
+ T operator()(const T &a, const T &b) const {
+ if constexpr (std::is_same_v<T, fp16_t> || std::is_same_v<T, bf16_t>) {
+ float a_f = utils::cast<float>(a);
+ float b_f = utils::cast<float>(b);
+ float result = std::fmin(a_f, b_f);
+ return utils::cast<T>(result);
+ } else {
+ return std::fmin(a, b);
+ }
+ }
+} FminOp;
+} // namespace op::fmin::cpu
+
+#endif // __FMIN_CPU_H__
diff --git a/src/infiniop/ops/fmin/cuda/kernel.cuh b/src/infiniop/ops/fmin/cuda/kernel.cuh new file mode 100644 index 000000000..fbe385302 --- /dev/null +++ b/src/infiniop/ops/fmin/cuda/kernel.cuh @@ -0,0 +1,33 @@
+#ifndef __FMIN_CUDA_H__
+#define __FMIN_CUDA_H__
+
+#include
+namespace op::fmin::cuda {
+typedef struct FminOp {
+public:
+ static constexpr size_t num_inputs = 2;
+ template <typename T>
+ __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+#if defined(ENABLE_ILUVATAR_API)
+ if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+ float a_f = __bfloat162float(a), b_f = __bfloat162float(b);
+ return __float2bfloat16(fminf(a_f, b_f));
+ } else if constexpr (std::is_same_v<T, half>) {
+ float a_f = __half2float(a), b_f = __half2float(b);
+ return __float2half(fminf(a_f, b_f));
+ }
+#elif defined(ENABLE_NVIDIA_API)
+ if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
+ return __hmin(a, b);
+ }
+#endif
+ if constexpr (std::is_same_v<T, float>) {
+ return fminf(a, b);
+ } else {
+ return a < b ? a : b;
+ }
+ }
+} FminOp;
+} // namespace op::fmin::cuda
+
+#endif // __FMIN_CUDA_H__
diff --git a/src/infiniop/ops/fmin/metax/fmin_metax.h b/src/infiniop/ops/fmin/metax/fmin_metax.h new file mode 100644 index 000000000..9348fa599 --- /dev/null +++ b/src/infiniop/ops/fmin/metax/fmin_metax.h @@ -0,0 +1,8 @@
+#ifndef __FMIN_METAX_API_H__
+#define __FMIN_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(fmin, metax)
+
+#endif // __FMIN_METAX_API_H__
diff --git a/src/infiniop/ops/fmin/metax/fmin_metax.maca b/src/infiniop/ops/fmin/metax/fmin_metax.maca new file mode 100644 index 000000000..049b3806c --- /dev/null +++ b/src/infiniop/ops/fmin/metax/fmin_metax.maca @@ -0,0 +1,69 @@
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+#include "fmin_metax.h"
+
+namespace op::fmin::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t
+Descriptor::create(infiniopHandle_t handle_, Descriptor **desc_ptr,
+ infiniopTensorDescriptor_t out_desc,
+ std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+ auto handle = reinterpret_cast(handle_);
+ auto dtype = out_desc->dtype();
+
+ const auto &a_desc = input_desc_vec.at(0);
+ const auto &b_desc = input_desc_vec.at(1);
+ const auto &c_shape = out_desc->shape();
+ const auto &a_shape = a_desc->shape();
+ const auto &b_shape = b_desc->shape();
+
+ CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
+ INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
+
+ CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+
+ // create METAX elementwise descriptor
+ CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+ return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+ void *output,
+ std::vector<const void *> inputs,
+ void *stream) const {
+
+ if 
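/* the elementwise descriptor records its required scratch size at creation; calls providing less are rejected */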
(workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::FminOp, half>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::FminOp, cuda_bfloat16>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::FminOp, float>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::FminOp, int32_t>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::FminOp, int64_t>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::FminOp, double>( + _info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::fmin::metax diff --git a/src/infiniop/ops/fmin/moore/fmin_moore.h b/src/infiniop/ops/fmin/moore/fmin_moore.h new file mode 100644 index 000000000..f73d99e7d --- /dev/null +++ b/src/infiniop/ops/fmin/moore/fmin_moore.h @@ -0,0 +1,8 @@ +#ifndef __FMIN_MOORE_API_H__ +#define __FMIN_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(fmin, moore) + +#endif // __FMIN_MOORE_API_H__ diff --git a/src/infiniop/ops/fmin/moore/fmin_moore.mu b/src/infiniop/ops/fmin/moore/fmin_moore.mu new file mode 100644 index 000000000..d19071e06 --- /dev/null +++ b/src/infiniop/ops/fmin/moore/fmin_moore.mu @@ -0,0 +1,66 @@ +#include "fmin_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "fmin_moore_kernel.h" + +namespace op::fmin::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create MOORE elementwise descriptor + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::FminOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::FminOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::FminOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::FminOp, double>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, moore::FminOp, int32_t>(_info, workspace, 
output, inputs, stream);
+ case INFINI_DTYPE_I64:
+ return _device_info->calculate<256, moore::FminOp, int64_t>(_info, workspace, output, inputs, stream);
+ default:
+ return INFINI_STATUS_BAD_TENSOR_DTYPE;
+ }
+
+ return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::fmin::moore
diff --git a/src/infiniop/ops/fmin/moore/fmin_moore_kernel.h b/src/infiniop/ops/fmin/moore/fmin_moore_kernel.h new file mode 100644 index 000000000..cde58f26a --- /dev/null +++ b/src/infiniop/ops/fmin/moore/fmin_moore_kernel.h @@ -0,0 +1,27 @@
+#ifndef __FMIN_MOORE_KERNEL_H__
+#define __FMIN_MOORE_KERNEL_H__
+
+namespace op::fmin::moore {
+typedef struct FminOp {
+public:
+ static constexpr size_t num_inputs = 2;
+ template <typename T>
+ __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+ if constexpr (std::is_same_v<T, half2>) {
+ return __hmin2(a, b);
+ } else if constexpr (std::is_same_v<T, half>) {
+ return __hmin(a, b);
+ } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+ float a_f = __bfloat162float(a);
+ float b_f = __bfloat162float(b);
+ return __float2bfloat16(fminf(a_f, b_f));
+ } else if constexpr (std::is_same_v<T, float>) {
+ return fminf(a, b);
+ } else {
+ return a < b ? a : b;
+ }
+ }
+} FminOp;
+} // namespace op::fmin::moore
+
+#endif // __FMIN_MOORE_KERNEL_H__
diff --git a/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu new file mode 100644 index 000000000..bfe77c7bc --- /dev/null +++ b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu @@ -0,0 +1,66 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "fmin_nvidia.cuh"
+
+namespace op::fmin::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t
+Descriptor::create(infiniopHandle_t handle_, Descriptor **desc_ptr,
+ infiniopTensorDescriptor_t out_desc,
+ std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+ auto handle = reinterpret_cast(handle_);
+ auto dtype = out_desc->dtype();
+
+ const auto &a_desc = input_desc_vec.at(0);
+ const auto &b_desc = input_desc_vec.at(1);
+ const auto &c_shape = out_desc->shape();
+ const auto &a_shape = a_desc->shape();
+ const auto &b_shape = b_desc->shape();
+
+ CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
+ INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
+
+ CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+
+ // create CUDA elementwise descriptor
+ CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+ return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+ void *output,
+ std::vector<const void *> inputs,
+ void *stream) const {
+
+ if (workspace_size < _workspace_size) {
+ return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+ }
+
+ switch (_dtype) {
+ case INFINI_DTYPE_F16:
+ return _device_info->calculate<256, cuda::FminOp, half>(
+ _info, workspace, output, inputs, stream);
+ case INFINI_DTYPE_BF16:
+ return _device_info->calculate<256, cuda::FminOp, cuda_bfloat16>(
+ _info, workspace, output, inputs, stream);
+ case INFINI_DTYPE_F32:
+ return _device_info->calculate<256, cuda::FminOp, float>(
+ _info, workspace, output, inputs, stream);
+ case INFINI_DTYPE_I32:
+ return _device_info->calculate<256, cuda::FminOp, int32_t>(
+ _info, workspace, output, inputs, stream);
+ case INFINI_DTYPE_I64:
+ return _device_info->calculate<256, cuda::FminOp, int64_t>(
+ _info, workspace, output, inputs, stream);
+ case INFINI_DTYPE_F64:
+ return _device_info->calculate<256, cuda::FminOp, double>(
+ _info, workspace, output, inputs, stream);
+ default:
+ return INFINI_STATUS_BAD_TENSOR_DTYPE;
+ }
+
+ return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::fmin::nvidia
diff --git a/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh 
b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh new file mode 100644 index 000000000..f9def467b --- /dev/null +++ b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __FMIN_CUDA_API_H__ +#define __FMIN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(fmin, nvidia) + +#endif // __FMIN_CUDA_API_H__ diff --git a/src/infiniop/ops/fmin/operator.cc b/src/infiniop/ops/fmin/operator.cc new file mode 100644 index 000000000..da17139b9 --- /dev/null +++ b/src/infiniop/ops/fmin/operator.cc @@ -0,0 +1,202 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/fmin.h" + +#ifdef ENABLE_CPU_API +#include "cpu/fmin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/fmin_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/fmin_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/fmin_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/fmin_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/fmin_moore.h" +#endif + +__C infiniStatus_t infiniopCreateFminDescriptor( + infiniopHandle_t handle, + infiniopFminDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::fmin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetFminWorkspaceSize(infiniopFminDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopFmin( + infiniopFminDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef 
ENABLE_CPU_API
+ CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+ CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+ CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+ CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+ CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+ CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+ CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_MOORE_API
+ CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+ default:
+ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+ }
+
+#undef CALCULATE
+}
+
+__C infiniStatus_t
+infiniopDestroyFminDescriptor(infiniopFminDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE) \
+ case CASE: \
+ delete reinterpret_cast(desc); \
+ return INFINI_STATUS_SUCCESS
+
+ switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+ DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+ DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+ DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+ DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+ DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+ DELETE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+ DELETE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_MOORE_API
+ DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+ default:
+ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+ }
+
+#undef DELETE
+}
\ No newline at end of file
diff --git a/test/infinicore/ops/adaptive_avg_pool3d.py b/test/infinicore/ops/adaptive_avg_pool3d.py index 37a0b7998..0fe1726ea 100644 --- a/test/infinicore/ops/adaptive_avg_pool3d.py +++ b/test/infinicore/ops/adaptive_avg_pool3d.py @@ -68,9 +68,9 @@ def get_test_cases(self):
 def torch_operator(self, *args, **kwargs):
 return torch.nn.functional.adaptive_avg_pool3d(*args, **kwargs)
- # def infinicore_operator(self, *args, **kwargs):
- # """InfiniCore implementation (operator not yet available)."""
- # return infinicore.nn.functional.adaptive_avg_pool3d(*args, **kwargs)
+ def infinicore_operator(self, *args, **kwargs):
+ """InfiniCore implementation."""
+ return infinicore.nn.functional.adaptive_avg_pool3d(*args, **kwargs)
 def main():
diff --git a/test/infinicore/ops/addr.py b/test/infinicore/ops/addr.py index 51b172538..b1e705983 100644 --- a/test/infinicore/ops/addr.py +++ b/test/infinicore/ops/addr.py @@ -93,9 +93,9 @@ def get_test_cases(self):
 def torch_operator(self, *args, **kwargs):
 return torch.addr(*args, **kwargs)
- # def infinicore_operator(self, *args, **kwargs):
- # """InfiniCore implementation (operator not yet available)."""
- # return infinicore.addr(*args, **kwargs)
+ def infinicore_operator(self, *args, **kwargs):
+ """InfiniCore implementation."""
+ return infinicore.addr(*args, **kwargs)
 def main():
diff --git a/test/infinicore/ops/argwhere.py b/test/infinicore/ops/argwhere.py index 7f2326a29..c265925eb 100644 --- a/test/infinicore/ops/argwhere.py +++ b/test/infinicore/ops/argwhere.py @@ -52,9 +52,9 @@ def get_test_cases(self):
 def torch_operator(self, *args, **kwargs):
 return torch.argwhere(*args, **kwargs)
- # def infinicore_operator(self, *args, **kwargs):
- # """InfiniCore implementation (operator not yet available)."""
- # return infinicore.argwhere(*args, **kwargs)
+ def infinicore_operator(self, *args, **kwargs):
+ 
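# With the operator now implemented, this method lets the test harness
+ # run the infinicore path and compare it against torch_operator above.
+ 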
"""InfiniCore implementation (operator not yet available).""" + return infinicore.argwhere(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/asin.py b/test/infinicore/ops/asin.py index 1f4d70ea3..d1f227c37 100644 --- a/test/infinicore/ops/asin.py +++ b/test/infinicore/ops/asin.py @@ -97,9 +97,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.asin(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.asin(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.asin(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/fmin.py b/test/infinicore/ops/fmin.py index 49ea01447..2d3abceb9 100644 --- a/test/infinicore/ops/fmin.py +++ b/test/infinicore/ops/fmin.py @@ -96,9 +96,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.fmin(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.fmin(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.fmin(*args, **kwargs) def main(): From b978ee9907a4ed0868dcd734e9c7d83b7e5563e0 Mon Sep 17 00:00:00 2001 From: PanZezhong Date: Thu, 19 Mar 2026 01:37:13 +0000 Subject: [PATCH 2/2] issue/1031 fix T1-1-8 --- include/infinicore/ops.hpp | 4 +- .../infinicore/ops/adaptive_avg_pool3d.hpp | 2 +- include/infinicore/ops/addr.hpp | 2 +- include/infinicore/ops/argwhere.hpp | 2 +- include/infinicore/ops/asin.hpp | 2 +- include/infiniop.h | 7 +- include/infiniop/ops/adaptive_avg_pool3d.h | 10 +- include/infiniop/ops/addr.h | 38 +++--- include/infiniop/ops/argwhere.h | 10 +- include/infiniop/ops/asin.h | 26 ++-- include/infiniop/ops/fmin.h | 36 +++-- python/infinicore/__init__.py | 9 +- python/infinicore/nn/functional/__init__.py | 3 +- .../nn/functional/adaptive_avg_pool3d.py | 3 +- .../adaptive_avg_pool3d.cc | 2 +- .../adaptive_avg_pool3d_infiniop.cc | 2 +- src/infinicore/ops/addr/addr.cc | 2 +- src/infinicore/ops/addr/addr_infiniop.cc | 2 +- src/infinicore/ops/argwhere/argwhere.cc | 2 +- .../ops/argwhere/argwhere_infiniop.cc | 2 +- src/infinicore/ops/asin/asin_infiniop.cc | 2 +- src/infinicore/ops/fmin/fmin.cc | 2 +- src/infinicore/pybind11/ops.hpp | 12 +- .../pybind11/ops/adaptive_avg_pool3d.hpp | 2 +- src/infinicore/pybind11/ops/addr.hpp | 2 +- src/infinicore/pybind11/ops/argwhere.hpp | 2 +- .../adaptive_avg_pool3d/adaptive_avg_pool3d.h | 2 +- .../cpu/adaptive_avg_pool3d_cpu.cc | 121 ++++++++++------- .../cpu/adaptive_avg_pool3d_cpu.h | 2 +- .../ops/adaptive_avg_pool3d/cuda/kernel.cuh | 2 +- .../metax/adaptive_avg_pool3d_metax.h | 2 +- .../metax/adaptive_avg_pool3d_metax.maca | 94 ++++++------- .../moore/adaptive_avg_pool3d_kernel.h | 2 +- .../moore/adaptive_avg_pool3d_moore.mu | 8 +- .../nvidia/adaptive_avg_pool3d_nvidia.cu | 6 +- .../nvidia/adaptive_avg_pool3d_nvidia.cuh | 2 +- .../ops/adaptive_avg_pool3d/operator.cc | 63 ++------- src/infiniop/ops/addr/addr.h | 2 +- src/infiniop/ops/addr/cpu/addr_cpu.cc | 44 ++++-- src/infiniop/ops/addr/cpu/addr_cpu.h | 2 +- src/infiniop/ops/addr/cuda/kernel.cu | 2 +- src/infiniop/ops/addr/metax/addr_metax.h | 2 +- src/infiniop/ops/addr/metax/addr_nvidia.maca | 117 ++++++++-------- src/infiniop/ops/addr/moore/addr_kernel.h | 2 +- 
src/infiniop/ops/addr/moore/addr_moore.h | 2 +- src/infiniop/ops/addr/moore/addr_moore.mu | 125 +++++++++--------- src/infiniop/ops/addr/nvidia/addr_nvidia.cu | 8 +- src/infiniop/ops/addr/nvidia/addr_nvidia.cuh | 2 +- src/infiniop/ops/addr/operator.cc | 54 ++------ src/infiniop/ops/argwhere/argwhere.h | 2 +- src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc | 4 +- src/infiniop/ops/argwhere/cpu/argwhere_cpu.h | 2 +- src/infiniop/ops/argwhere/cuda/kernel.cuh | 10 +- .../ops/argwhere/metax/argwhere_metax.h | 2 +- .../ops/argwhere/metax/argwhere_metax.maca | 2 +- .../ops/argwhere/moore/argwhere_kernel.h | 2 +- .../ops/argwhere/moore/argwhere_moore.h | 2 +- .../ops/argwhere/moore/argwhere_moore.mu | 117 ++++++++-------- .../ops/argwhere/nvidia/argwhere_nvidia.cuh | 2 +- src/infiniop/ops/argwhere/operator.cc | 40 +----- src/infiniop/ops/asin/cpu/asin_cpu.cc | 2 +- src/infiniop/ops/asin/cpu/asin_cpu.h | 2 +- src/infiniop/ops/asin/cuda/kernel.cuh | 2 +- src/infiniop/ops/asin/metax/asin_metax.maca | 68 +++++----- src/infiniop/ops/asin/moore/asin_moore.mu | 2 +- src/infiniop/ops/asin/nvidia/asin_nvidia.cu | 2 +- src/infiniop/ops/asin/nvidia/asin_nvidia.cuh | 2 +- src/infiniop/ops/asin/operator.cc | 12 +- src/infiniop/ops/fmin/cpu/fmin_cpu.h | 11 +- src/infiniop/ops/fmin/cuda/kernel.cuh | 36 ++--- src/infiniop/ops/fmin/metax/fmin_metax.maca | 78 +++++------ src/infiniop/ops/fmin/operator.cc | 40 +----- 72 files changed, 609 insertions(+), 687 deletions(-) diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp index e9af0357e..18741c402 100644 --- a/include/infinicore/ops.hpp +++ b/include/infinicore/ops.hpp @@ -2,9 +2,9 @@ #include "ops/adaptive_max_pool1d.hpp" #include "ops/add.hpp" -#include "ops/asin.hpp" #include "ops/add_rms_norm.hpp" #include "ops/addcmul.hpp" +#include "ops/asin.hpp" #include "ops/asinh.hpp" #include "ops/atanh.hpp" #include "ops/attention.hpp" @@ -17,11 +17,11 @@ #include "ops/cross_entropy.hpp" #include "ops/embedding.hpp" #include "ops/flash_attention.hpp" +#include "ops/fmin.hpp" #include "ops/fmod.hpp" #include "ops/hardswish.hpp" #include "ops/hardtanh.hpp" #include "ops/kv_caching.hpp" -#include "ops/fmin.hpp" #include "ops/matmul.hpp" #include "ops/ones.hpp" #include "ops/paged_attention.hpp" diff --git a/include/infinicore/ops/adaptive_avg_pool3d.hpp b/include/infinicore/ops/adaptive_avg_pool3d.hpp index ab1dc9253..df6a4d2c3 100644 --- a/include/infinicore/ops/adaptive_avg_pool3d.hpp +++ b/include/infinicore/ops/adaptive_avg_pool3d.hpp @@ -13,4 +13,4 @@ class AdaptiveAvgPool3D { Tensor adaptive_avg_pool3d(Tensor x, std::vector output_size); void adaptive_avg_pool3d_(Tensor y, Tensor x); -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infinicore/ops/addr.hpp b/include/infinicore/ops/addr.hpp index 92d97fae8..cff78eade 100644 --- a/include/infinicore/ops/addr.hpp +++ b/include/infinicore/ops/addr.hpp @@ -13,4 +13,4 @@ class Addr { Tensor addr(Tensor input, Tensor vec1, Tensor vec2, float beta = 1.0f, float alpha = 1.0f); void addr_(Tensor out, Tensor input, Tensor vec1, Tensor vec2, float beta = 1.0f, float alpha = 1.0f); -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infinicore/ops/argwhere.hpp b/include/infinicore/ops/argwhere.hpp index f0c629adb..24f4fa65e 100644 --- a/include/infinicore/ops/argwhere.hpp +++ b/include/infinicore/ops/argwhere.hpp @@ -11,4 +11,4 @@ class Argwhere { static common::OpDispatcher &dispatcher(); }; Tensor 
argwhere(Tensor x); -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infinicore/ops/asin.hpp b/include/infinicore/ops/asin.hpp index e8667b037..bc5973978 100644 --- a/include/infinicore/ops/asin.hpp +++ b/include/infinicore/ops/asin.hpp @@ -15,4 +15,4 @@ class Asin { Tensor asin(Tensor input); void asin_(Tensor output, Tensor input); -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infiniop.h b/include/infiniop.h index ada498304..6aaeee6a4 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -2,16 +2,16 @@ #define __INFINIOP_API_H__ #include "infiniop/handle.h" -#include "infiniop/ops/adaptive_max_pool1d.h" #include "infiniop/ops/adaptive_avg_pool3d.h" +#include "infiniop/ops/adaptive_max_pool1d.h" #include "infiniop/ops/add.h" -#include "infiniop/ops/addr.h" #include "infiniop/ops/add_rms_norm.h" #include "infiniop/ops/addcmul.h" +#include "infiniop/ops/addr.h" #include "infiniop/ops/all.h" +#include "infiniop/ops/asin.h" #include "infiniop/ops/asinh.h" #include "infiniop/ops/atanh.h" -#include "infiniop/ops/asin.h" #include "infiniop/ops/attention.h" #include "infiniop/ops/avg_pool1d.h" #include "infiniop/ops/binary_cross_entropy_with_logits.h" @@ -26,6 +26,7 @@ #include "infiniop/ops/embedding.h" #include "infiniop/ops/equal.h" #include "infiniop/ops/flash_attention.h" +#include "infiniop/ops/fmin.h" #include "infiniop/ops/fmod.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" diff --git a/include/infiniop/ops/adaptive_avg_pool3d.h b/include/infiniop/ops/adaptive_avg_pool3d.h index 178a509e1..bd23ebce8 100644 --- a/include/infiniop/ops/adaptive_avg_pool3d.h +++ b/include/infiniop/ops/adaptive_avg_pool3d.h @@ -6,18 +6,18 @@ typedef struct InfiniopDescriptor *infiniopAdaptiveAvgPool3DDescriptor_t; -__C __export infiniStatus_t infiniopCreateAdaptiveAvgPool3DDescriptor( +__INFINI_C __export infiniStatus_t infiniopCreateAdaptiveAvgPool3DDescriptor( infiniopHandle_t handle, infiniopAdaptiveAvgPool3DDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y, infiniopTensorDescriptor_t x, size_t *output_size); -__C __export infiniStatus_t infiniopGetAdaptiveAvgPool3DWorkspaceSize( +__INFINI_C __export infiniStatus_t infiniopGetAdaptiveAvgPool3DWorkspaceSize( infiniopAdaptiveAvgPool3DDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopAdaptiveAvgPool3D( +__INFINI_C __export infiniStatus_t infiniopAdaptiveAvgPool3D( infiniopAdaptiveAvgPool3DDescriptor_t desc, void *workspace, size_t workspace_size, @@ -25,6 +25,6 @@ __C __export infiniStatus_t infiniopAdaptiveAvgPool3D( const void *x, void *stream); -__C __export infiniStatus_t infiniopDestroyAdaptiveAvgPool3DDescriptor(infiniopAdaptiveAvgPool3DDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyAdaptiveAvgPool3DDescriptor(infiniopAdaptiveAvgPool3DDescriptor_t desc); -#endif \ No newline at end of file +#endif diff --git a/include/infiniop/ops/addr.h b/include/infiniop/ops/addr.h index 6cae92d97..a2f26d330 100644 --- a/include/infiniop/ops/addr.h +++ b/include/infiniop/ops/addr.h @@ -5,26 +5,26 @@ typedef struct InfiniopDescriptor *infiniopAddrDescriptor_t; -__C __export infiniStatus_t infiniopCreateAddrDescriptor(infiniopHandle_t handle, - infiniopAddrDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t out, - infiniopTensorDescriptor_t input, - infiniopTensorDescriptor_t vec1, - infiniopTensorDescriptor_t vec2, - float beta, - float alpha); +__INFINI_C __export 
infiniStatus_t infiniopCreateAddrDescriptor(infiniopHandle_t handle, + infiniopAddrDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t vec1, + infiniopTensorDescriptor_t vec2, + float beta, + float alpha); -__C __export infiniStatus_t infiniopGetAddrWorkspaceSize(infiniopAddrDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetAddrWorkspaceSize(infiniopAddrDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopAddr(infiniopAddrDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *out, - const void *input, - const void *vec1, - const void *vec2, - void *stream); +__INFINI_C __export infiniStatus_t infiniopAddr(infiniopAddrDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *input, + const void *vec1, + const void *vec2, + void *stream); -__C __export infiniStatus_t infiniopDestroyAddrDescriptor(infiniopAddrDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyAddrDescriptor(infiniopAddrDescriptor_t desc); -#endif \ No newline at end of file +#endif diff --git a/include/infiniop/ops/argwhere.h b/include/infiniop/ops/argwhere.h index 653cbba94..b1d2f9f99 100644 --- a/include/infiniop/ops/argwhere.h +++ b/include/infiniop/ops/argwhere.h @@ -5,16 +5,16 @@ typedef struct InfiniopDescriptor *infiniopArgwhereDescriptor_t; -__C __export infiniStatus_t infiniopCreateArgwhereDescriptor( +__INFINI_C __export infiniStatus_t infiniopCreateArgwhereDescriptor( infiniopHandle_t handle, infiniopArgwhereDescriptor_t *desc_ptr, infiniopTensorDescriptor_t input_desc); -__C __export infiniStatus_t infiniopGetArgwhereWorkspaceSize( +__INFINI_C __export infiniStatus_t infiniopGetArgwhereWorkspaceSize( infiniopArgwhereDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopArgwhere( +__INFINI_C __export infiniStatus_t infiniopArgwhere( infiniopArgwhereDescriptor_t desc, void *workspace, size_t workspace_size, @@ -23,7 +23,7 @@ __C __export infiniStatus_t infiniopArgwhere( const void *input, void *stream); -__C __export infiniStatus_t infiniopDestroyArgwhereDescriptor( +__INFINI_C __export infiniStatus_t infiniopDestroyArgwhereDescriptor( infiniopArgwhereDescriptor_t desc); -#endif // __INFINIOP_ARGWHERE_API_H__ \ No newline at end of file +#endif // __INFINIOP_ARGWHERE_API_H__ diff --git a/include/infiniop/ops/asin.h b/include/infiniop/ops/asin.h index ee40514d0..b4d16d481 100644 --- a/include/infiniop/ops/asin.h +++ b/include/infiniop/ops/asin.h @@ -5,20 +5,20 @@ typedef struct InfiniopDescriptor *infiniopAsinDescriptor_t; -__C __export infiniStatus_t infiniopCreateAsinDescriptor(infiniopHandle_t handle, - infiniopAsinDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input); +__INFINI_C __export infiniStatus_t infiniopCreateAsinDescriptor(infiniopHandle_t handle, + infiniopAsinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); -__C __export infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopAsin(infiniopAsinDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); +__INFINI_C __export infiniStatus_t infiniopAsin(infiniopAsinDescriptor_t desc, + void *workspace, + size_t workspace_size, + 
void *output, + const void *input, + void *stream); -__C __export infiniStatus_t infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc); -#endif \ No newline at end of file +#endif diff --git a/include/infiniop/ops/fmin.h b/include/infiniop/ops/fmin.h index 274e06a46..c1f9f75bf 100644 --- a/include/infiniop/ops/fmin.h +++ b/include/infiniop/ops/fmin.h @@ -5,24 +5,22 @@ typedef struct InfiniopDescriptor *infiniopFminDescriptor_t; -__C __export infiniStatus_t infiniopCreateFminDescriptor(infiniopHandle_t handle, - infiniopFminDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetFminWorkspaceSize(infiniopFminDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopGetFminWorkspaceSize(infiniopFminDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopFmin(infiniopFminDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyFminDescriptor(infiniopFminDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopCreateFminDescriptor(infiniopHandle_t handle, + infiniopFminDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__INFINI_C __export infiniStatus_t infiniopGetFminWorkspaceSize(infiniopFminDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopFmin(infiniopFminDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyFminDescriptor(infiniopFminDescriptor_t desc); #endif diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index 06ee97825..0cc9c2dda 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -50,12 +50,12 @@ from infinicore.ops.add import add from infinicore.ops.add_rms_norm import add_rms_norm from infinicore.ops.addcmul import addcmul +from infinicore.ops.addr import addr from infinicore.ops.all import all +from infinicore.ops.argwhere import argwhere +from infinicore.ops.asin import asin from infinicore.ops.asinh import asinh from infinicore.ops.atanh import atanh -from infinicore.ops.addr import addr -from infinicore.ops.asin import asin -from infinicore.ops.add_rms_norm import add_rms_norm, add_rms_norm_ from infinicore.ops.attention import attention from infinicore.ops.baddbmm import baddbmm from infinicore.ops.bilinear import bilinear @@ -65,6 +65,7 @@ from infinicore.ops.cdist import cdist from infinicore.ops.cross_entropy import cross_entropy from infinicore.ops.equal import equal +from infinicore.ops.fmin import fmin from infinicore.ops.fmod import fmod from infinicore.ops.kv_caching import kv_caching from infinicore.ops.matmul import matmul @@ -77,8 +78,6 @@ from infinicore.ops.paged_caching import paged_caching from infinicore.ops.rearrange import rearrange from infinicore.ops.reciprocal import reciprocal -from infinicore.ops.argwhere import argwhere -from infinicore.ops.fmin import fmin from infinicore.ops.squeeze import squeeze from infinicore.ops.sum import sum from infinicore.ops.topk import topk diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index 7bea371e7..f59d909b2 100644 --- 
a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -1,3 +1,4 @@ +from .adaptive_avg_pool3d import adaptive_avg_pool3d from .adaptive_max_pool1d import adaptive_max_pool1d from .avg_pool1d import avg_pool1d from .binary_cross_entropy_with_logits import binary_cross_entropy_with_logits @@ -14,8 +15,6 @@ from .silu import silu from .silu_and_mul import silu_and_mul from .swiglu import swiglu -from .adaptive_avg_pool3d import adaptive_avg_pool3d - __all__ = [ "adaptive_max_pool1d", diff --git a/python/infinicore/nn/functional/adaptive_avg_pool3d.py b/python/infinicore/nn/functional/adaptive_avg_pool3d.py index 9f8a3a4a6..797d8015d 100644 --- a/python/infinicore/nn/functional/adaptive_avg_pool3d.py +++ b/python/infinicore/nn/functional/adaptive_avg_pool3d.py @@ -1,6 +1,7 @@ +from typing import List + from infinicore.lib import _infinicore from infinicore.tensor import Tensor -from typing import List def adaptive_avg_pool3d(x: Tensor, output_size: List[int] = (1, 1, 1)) -> Tensor: diff --git a/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.cc b/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.cc index 61d2cecfa..298a5d553 100644 --- a/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.cc +++ b/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.cc @@ -33,4 +33,4 @@ Tensor adaptive_avg_pool3d(Tensor x, std::vector output_size) { void adaptive_avg_pool3d_(Tensor y, Tensor x) { AdaptiveAvgPool3D::execute(y, x); } -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d_infiniop.cc b/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d_infiniop.cc index 1f4ef3211..01341f9fd 100644 --- a/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d_infiniop.cc +++ b/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d_infiniop.cc @@ -59,4 +59,4 @@ static bool registered = []() { return true; }(); -} // namespace infinicore::op::adaptive_avg_pool3d_impl::infiniop \ No newline at end of file +} // namespace infinicore::op::adaptive_avg_pool3d_impl::infiniop diff --git a/src/infinicore/ops/addr/addr.cc b/src/infinicore/ops/addr/addr.cc index af7493b6f..4691e13a8 100644 --- a/src/infinicore/ops/addr/addr.cc +++ b/src/infinicore/ops/addr/addr.cc @@ -29,4 +29,4 @@ void addr_(Tensor out, Tensor input, Tensor vec1, Tensor vec2, float beta, float Addr::execute(out, input, vec1, vec2, beta, alpha); } -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/src/infinicore/ops/addr/addr_infiniop.cc b/src/infinicore/ops/addr/addr_infiniop.cc index a215a94bf..1c0ca3733 100644 --- a/src/infinicore/ops/addr/addr_infiniop.cc +++ b/src/infinicore/ops/addr/addr_infiniop.cc @@ -50,4 +50,4 @@ static bool registered = []() { return true; }(); -} // namespace infinicore::op::addr_impl::infiniop \ No newline at end of file +} // namespace infinicore::op::addr_impl::infiniop diff --git a/src/infinicore/ops/argwhere/argwhere.cc b/src/infinicore/ops/argwhere/argwhere.cc index a9d45de48..6e1f83615 100644 --- a/src/infinicore/ops/argwhere/argwhere.cc +++ b/src/infinicore/ops/argwhere/argwhere.cc @@ -30,4 +30,4 @@ Tensor argwhere(Tensor x) { return result; } -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/src/infinicore/ops/argwhere/argwhere_infiniop.cc b/src/infinicore/ops/argwhere/argwhere_infiniop.cc index bc6c919db..7e04767bf
100644 --- a/src/infinicore/ops/argwhere/argwhere_infiniop.cc +++ b/src/infinicore/ops/argwhere/argwhere_infiniop.cc @@ -49,4 +49,4 @@ static bool registered = []() { Argwhere::dispatcher().registerAll(&calculate, false); return true; }(); -} // namespace infinicore::op::argwhere_impl::infiniop \ No newline at end of file +} // namespace infinicore::op::argwhere_impl::infiniop diff --git a/src/infinicore/ops/asin/asin_infiniop.cc b/src/infinicore/ops/asin/asin_infiniop.cc index 4dec4c220..a794a4682 100644 --- a/src/infinicore/ops/asin/asin_infiniop.cc +++ b/src/infinicore/ops/asin/asin_infiniop.cc @@ -49,4 +49,4 @@ static bool registered = []() { return true; }(); -} // namespace infinicore::op::asin_impl::infiniop \ No newline at end of file +} // namespace infinicore::op::asin_impl::infiniop diff --git a/src/infinicore/ops/fmin/fmin.cc b/src/infinicore/ops/fmin/fmin.cc index 54d056c63..ce6a92bb8 100644 --- a/src/infinicore/ops/fmin/fmin.cc +++ b/src/infinicore/ops/fmin/fmin.cc @@ -24,4 +24,4 @@ void fmin_(Tensor c, Tensor a, Tensor b) { Fmin::execute(c, a, b); } -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index ec5fca334..c950f90e5 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -2,15 +2,15 @@ #include -#include "ops/adaptive_max_pool1d.hpp" #include "ops/adaptive_avg_pool3d.hpp" +#include "ops/adaptive_max_pool1d.hpp" #include "ops/add.hpp" -#include "ops/addr.hpp" -#include "ops/argwhere.hpp" -#include "ops/asin.hpp" #include "ops/add_rms_norm.hpp" #include "ops/addcmul.hpp" +#include "ops/addr.hpp" #include "ops/all.hpp" +#include "ops/argwhere.hpp" +#include "ops/asin.hpp" #include "ops/asinh.hpp" #include "ops/atanh.hpp" #include "ops/attention.hpp" @@ -24,11 +24,11 @@ #include "ops/embedding.hpp" #include "ops/equal.hpp" #include "ops/flash_attention.hpp" +#include "ops/fmin.hpp" #include "ops/fmod.hpp" #include "ops/hardswish.hpp" #include "ops/hardtanh.hpp" #include "ops/kv_caching.hpp" -#include "ops/fmin.hpp" #include "ops/linear.hpp" #include "ops/linear_w8a8i8.hpp" #include "ops/matmul.hpp" @@ -107,4 +107,4 @@ inline void bind(py::module &m) { bind_reciprocal(m); } -} // namespace infinicore::ops \ No newline at end of file +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/adaptive_avg_pool3d.hpp b/src/infinicore/pybind11/ops/adaptive_avg_pool3d.hpp index 3b2b8fdc7..758baf232 100644 --- a/src/infinicore/pybind11/ops/adaptive_avg_pool3d.hpp +++ b/src/infinicore/pybind11/ops/adaptive_avg_pool3d.hpp @@ -19,4 +19,4 @@ inline void bind_adaptive_avg_pool3d(py::module &m) { py::arg("x"), R"doc(In-place, Adaptive Average Pooling 3D.)doc"); } -} // namespace infinicore::ops \ No newline at end of file +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/addr.hpp b/src/infinicore/pybind11/ops/addr.hpp index 2777fc513..912b2b5da 100644 --- a/src/infinicore/pybind11/ops/addr.hpp +++ b/src/infinicore/pybind11/ops/addr.hpp @@ -26,4 +26,4 @@ inline void bind_addr(py::module_ &m) { py::arg("alpha"), R"doc(Addr.)doc"); } -} // namespace infinicore::ops \ No newline at end of file +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/argwhere.hpp b/src/infinicore/pybind11/ops/argwhere.hpp index 0e6e4a4af..cb91142c8 100644 --- a/src/infinicore/pybind11/ops/argwhere.hpp +++ b/src/infinicore/pybind11/ops/argwhere.hpp @@ -10,4 +10,4 @@ inline void bind_argwhere(py::module &m) 
{ py::arg("x"), R"doc(Argwhere.)doc"); } -} // namespace infinicore::ops \ No newline at end of file +} // namespace infinicore::ops diff --git a/src/infiniop/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.h b/src/infiniop/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.h index feea6e701..33280aad9 100644 --- a/src/infiniop/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.h +++ b/src/infiniop/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.h @@ -87,4 +87,4 @@ class AdaptiveAvgPool3DInfo { } }; -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.cc b/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.cc index 45b53a973..6006e8ea6 100644 --- a/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.cc +++ b/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.cc @@ -31,60 +31,83 @@ infiniStatus_t calculateAdaptiveAvgPool3D( const AdaptiveAvgPool3DInfo &info, Tdata *y, const Tdata *x) { + std::array y_strides; - y_strides[size_t(4)] = 1; - y_strides[size_t(3)] = info.y_w * y_strides[size_t(4)]; - y_strides[size_t(2)] = info.y_h * y_strides[size_t(3)]; - y_strides[size_t(1)] = info.y_d * y_strides[size_t(2)]; - y_strides[size_t(0)] = info.C * y_strides[size_t(1)]; - { -#pragma omp for collapse(5) - for (ptrdiff_t n = 0; n < ptrdiff_t(info.N); n++) { - for (ptrdiff_t c = 0; c < ptrdiff_t(info.C); c++) { - for (ptrdiff_t od = 0; od < ptrdiff_t(info.y_d); od++) { - for (ptrdiff_t oh = 0; oh < ptrdiff_t(info.y_h); oh++) { - for (ptrdiff_t ow = 0; ow < ptrdiff_t(info.y_w); ow++) { - size_t x_start_d = od * info.x_d / info.y_d; - size_t x_end_d = ((od + 1) * info.x_d + info.y_d - 1) / info.y_d; - size_t x_start_h = oh * info.x_h / info.y_h; - size_t x_end_h = ((oh + 1) * info.x_h + info.y_h - 1) / info.y_h; - size_t x_start_w = ow * info.x_w / info.y_w; - size_t x_end_w = ((ow + 1) * info.x_w + info.y_w - 1) / info.y_w; - size_t count = (x_end_d - x_start_d) * (x_end_h - x_start_h) * (x_end_w - x_start_w); - - // Handle floating point types with casting - if constexpr (std::is_same::value || std::is_same::value) { - float sum = 0.0f; - for (size_t id = x_start_d; id < x_end_d; id++) { - for (size_t ih = x_start_h; ih < x_end_h; ih++) { - for (size_t iw = x_start_w; iw < x_end_w; iw++) { - size_t x_offset = n * info.x_strides[0] + c * info.x_strides[1] + id * info.x_strides[2] + ih * info.x_strides[3] + iw * info.x_strides[4]; - sum += utils::cast(x[x_offset]); - } - } - } - size_t y_offset = n * y_strides[0] + c * y_strides[1] + od * y_strides[2] + oh * y_strides[3] + ow * y_strides[4]; - y[y_offset] = utils::cast(sum / static_cast(count)); - } else { - Tdata sum = (Tdata)0; - for (size_t id = x_start_d; id < x_end_d; id++) { - for (size_t ih = x_start_h; ih < x_end_h; ih++) { - for (size_t iw = x_start_w; iw < x_end_w; iw++) { - size_t x_offset = n * info.x_strides[0] + c * info.x_strides[1] + id * info.x_strides[2] + ih * info.x_strides[3] + iw * info.x_strides[4]; - sum += x[x_offset]; - } - } - } - size_t y_offset = n * y_strides[0] + c * y_strides[1] + od * y_strides[2] + oh * y_strides[3] + ow * y_strides[4]; - // For integer types, we might want to handle division differently - y[y_offset] = sum / static_cast(count); - } - } + y_strides[4] = 1; + y_strides[3] = info.y_w * y_strides[4]; + y_strides[2] = info.y_h * y_strides[3]; + y_strides[1] = info.y_d * y_strides[2]; + y_strides[0] = info.C * y_strides[1]; + + const size_t total = info.N * info.C * info.y_d * info.y_h * info.y_w; + +#pragma omp parallel 
for + for (ptrdiff_t idx = 0; idx < (ptrdiff_t)total; idx++) { + + // Decode flattened index → (n, c, od, oh, ow) + ptrdiff_t tmp = idx; + + ptrdiff_t ow = tmp % info.y_w; + tmp /= info.y_w; + ptrdiff_t oh = tmp % info.y_h; + tmp /= info.y_h; + ptrdiff_t od = tmp % info.y_d; + tmp /= info.y_d; + ptrdiff_t c = tmp % info.C; + tmp /= info.C; + ptrdiff_t n = tmp; + + // Compute pooling region + size_t x_start_d = od * info.x_d / info.y_d; + size_t x_end_d = ((od + 1) * info.x_d + info.y_d - 1) / info.y_d; + + size_t x_start_h = oh * info.x_h / info.y_h; + size_t x_end_h = ((oh + 1) * info.x_h + info.y_h - 1) / info.y_h; + + size_t x_start_w = ow * info.x_w / info.y_w; + size_t x_end_w = ((ow + 1) * info.x_w + info.y_w - 1) / info.y_w; + + size_t count = (x_end_d - x_start_d) * (x_end_h - x_start_h) * (x_end_w - x_start_w); + + size_t y_offset = n * y_strides[0] + c * y_strides[1] + od * y_strides[2] + oh * y_strides[3] + ow * y_strides[4]; + + // Accumulate + if constexpr (std::is_same<Tdata, fp16_t>::value || std::is_same<Tdata, bf16_t>::value) { + + float sum = 0.0f; + + for (size_t id = x_start_d; id < x_end_d; id++) { + for (size_t ih = x_start_h; ih < x_end_h; ih++) { + for (size_t iw = x_start_w; iw < x_end_w; iw++) { + + size_t x_offset = n * info.x_strides[0] + c * info.x_strides[1] + id * info.x_strides[2] + ih * info.x_strides[3] + iw * info.x_strides[4]; + + sum += utils::cast<float>(x[x_offset]); } } } + + y[y_offset] = utils::cast<Tdata>(sum / (float)count); + + } else { + + Tdata sum = (Tdata)0; + + for (size_t id = x_start_d; id < x_end_d; id++) { + for (size_t ih = x_start_h; ih < x_end_h; ih++) { + for (size_t iw = x_start_w; iw < x_end_w; iw++) { + + size_t x_offset = n * info.x_strides[0] + c * info.x_strides[1] + id * info.x_strides[2] + ih * info.x_strides[3] + iw * info.x_strides[4]; + + sum += x[x_offset]; } } } + + y[y_offset] = sum / (Tdata)count; } } + return INFINI_STATUS_SUCCESS; } @@ -110,4 +133,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::adaptive_avg_pool3d::cpu \ No newline at end of file +} // namespace op::adaptive_avg_pool3d::cpu diff --git a/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.h b/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.h index 94753979a..f20cfcc77 100644 --- a/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.h +++ b/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.h @@ -5,4 +5,4 @@ DESCRIPTOR(cpu) -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/adaptive_avg_pool3d/cuda/kernel.cuh b/src/infiniop/ops/adaptive_avg_pool3d/cuda/kernel.cuh index a348c5e30..6493ea786 100644 --- a/src/infiniop/ops/adaptive_avg_pool3d/cuda/kernel.cuh +++ b/src/infiniop/ops/adaptive_avg_pool3d/cuda/kernel.cuh @@ -41,4 +41,4 @@ __device__ void adaptiveAvgPool3DKernel(T *y, const T *x, size_t N, size_t C, y[y_index] = sum / static_cast<T>(static_cast<float>(count)); } } -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.h b/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.h index 9421aef35..9a1c04384 100644 --- a/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.h +++ b/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.h @@ -5,4 +5,4 @@ DESCRIPTOR(metax) -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.maca
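
// ---------------------------------------------------------------------------
// Note on the window arithmetic used in the CPU hunk above (a minimal
// standalone sketch, not part of the patch; it assumes nothing from the
// repo): start = floor(o * in / out) and end = ceil((o + 1) * in / out)
// tile the input exactly, which is why dividing by `count` never divides
// by zero.
#include <cassert>
#include <cstddef>

int main() {
    for (std::size_t in = 1; in <= 16; ++in) {
        for (std::size_t out = 1; out <= in; ++out) {
            std::size_t prev_end = 0;
            for (std::size_t o = 0; o < out; ++o) {
                std::size_t start = o * in / out;                 // floor division
                std::size_t end = ((o + 1) * in + out - 1) / out; // ceiling division
                assert(start < end);       // window is non-empty, so count > 0
                assert(start <= prev_end); // windows may overlap but never leave gaps
                prev_end = end;
            }
            assert(prev_end == in); // the last window reaches the input edge
        }
    }
    return 0;
}
// ---------------------------------------------------------------------------
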
b/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.maca index 48794fe17..5e49830a2 100644 --- a/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.maca +++ b/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.maca @@ -8,14 +8,14 @@ INFINIOP_METAX_KERNEL adaptiveAvgPool3D(T *y, const T *x, size_t N, size_t C, size_t x_d, size_t x_h, size_t x_w, size_t y_d, size_t y_h, size_t y_w, const ptrdiff_t *x_strides, const ptrdiff_t *y_strides) { - adaptiveAvgPool3DKernel(y, x, N, C, x_d, x_h, x_w, y_d, y_h, - y_w, x_strides, y_strides); + adaptiveAvgPool3DKernel(y, x, N, C, x_d, x_h, x_w, y_d, y_h, + y_w, x_strides, y_strides); } namespace op::adaptive_avg_pool3d::metax { struct Descriptor::Opaque { - std::shared_ptr internal; + std::shared_ptr internal; }; Descriptor::~Descriptor() { delete _opaque; } @@ -25,14 +25,14 @@ infiniStatus_t Descriptor::create(infiniopHandle_t handle, infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, size_t *output_size) { - auto info = AdaptiveAvgPool3DInfo::create(y_desc, x_desc, output_size); - CHECK_RESULT(info); - size_t workspace_size = 10 * sizeof(ptrdiff_t); // for x_strides and y_strides - *desc_ptr = new Descriptor( - info.take(), workspace_size, - new Opaque{reinterpret_cast(handle)->internal()}, - handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; + auto info = AdaptiveAvgPool3DInfo::create(y_desc, x_desc, output_size); + CHECK_RESULT(info); + size_t workspace_size = 10 * sizeof(ptrdiff_t); // for x_strides and y_strides + *desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; } template @@ -41,48 +41,48 @@ infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, size_t x_w, size_t y_d, size_t y_h, size_t y_w, const ptrdiff_t *x_strides, const ptrdiff_t *y_strides, hcStream_t stream) { - size_t num_blocks = (N * C * y_d * y_h * y_w + BLOCK_SIZE - 1) / BLOCK_SIZE; - if (dtype == INFINI_DTYPE_F16) { - adaptiveAvgPool3D<<>>( - (half *)y, (const half *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w, - x_strides, y_strides); - } else if (dtype == INFINI_DTYPE_F32) { - adaptiveAvgPool3D<<>>( - (float *)y, (const float *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w, - x_strides, y_strides); - } else { - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - return INFINI_STATUS_SUCCESS; + size_t num_blocks = (N * C * y_d * y_h * y_w + BLOCK_SIZE - 1) / BLOCK_SIZE; + if (dtype == INFINI_DTYPE_F16) { + adaptiveAvgPool3D<<>>( + (half *)y, (const half *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w, + x_strides, y_strides); + } else if (dtype == INFINI_DTYPE_F32) { + adaptiveAvgPool3D<<>>( + (float *)y, (const float *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w, + x_strides, y_strides); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; } infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, void *y, const void *x, void *stream_) const { - hcStream_t stream = (hcStream_t)stream_; + hcStream_t stream = (hcStream_t)stream_; - // Prepare strides - ptrdiff_t *x_strides = (ptrdiff_t *)workspace; - ptrdiff_t *y_strides = (ptrdiff_t *)workspace + _info.x_strides.size(); - hcMemcpyAsync(x_strides, _info.x_strides.data(), - _info.x_strides.size() * sizeof(ptrdiff_t), - hcMemcpyHostToDevice, stream); - hcMemcpyAsync(y_strides, _info.y_strides.data(), - _info.y_strides.size() * sizeof(ptrdiff_t), - hcMemcpyHostToDevice, stream); + 
// Prepare strides + ptrdiff_t *x_strides = (ptrdiff_t *)workspace; + ptrdiff_t *y_strides = (ptrdiff_t *)workspace + _info.x_strides.size(); + hcMemcpyAsync(x_strides, _info.x_strides.data(), + _info.x_strides.size() * sizeof(ptrdiff_t), + hcMemcpyHostToDevice, stream); + hcMemcpyAsync(y_strides, _info.y_strides.data(), + _info.y_strides.size() * sizeof(ptrdiff_t), + hcMemcpyHostToDevice, stream); - if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { - CHECK_STATUS(launchKernel( - y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, - _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); - } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { - CHECK_STATUS(launchKernel( - y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, - _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); - } else { - return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; - } - return INFINI_STATUS_SUCCESS; + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; } -} // namespace op::adaptive_avg_pool3d::metax \ No newline at end of file +} // namespace op::adaptive_avg_pool3d::metax diff --git a/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_kernel.h b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_kernel.h index a99c02483..f4eccade2 100644 --- a/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_kernel.h +++ b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_kernel.h @@ -39,4 +39,4 @@ __device__ void adaptiveAvgPool3DKernel(T *y, const T *x, size_t N, size_t C, y[y_index] = sum / static_cast(count); } } -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.mu b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.mu index 9dd2259bf..23f7f0dfa 100644 --- a/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.mu +++ b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.mu @@ -1,7 +1,7 @@ #include "../../../devices/moore/moore_common.h" #include "../../../devices/moore/moore_kernel_common.h" -#include "adaptive_avg_pool3d_moore.h" #include "adaptive_avg_pool3d_kernel.h" +#include "adaptive_avg_pool3d_moore.h" template INFINIOP_MOORE_KERNEL @@ -77,13 +77,13 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, CHECK_STATUS(launchKernel( y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); - } else if(_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_2048){ + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_2048) { CHECK_STATUS(launchKernel( y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); - }else{ + } else { return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; } return 
INFINI_STATUS_SUCCESS; } -} \ No newline at end of file +} // namespace op::adaptive_avg_pool3d::moore diff --git a/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cu b/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cu index b522b2da3..b6f3738b1 100644 --- a/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cu +++ b/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cu @@ -81,6 +81,10 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, CHECK_STATUS(launchKernel( y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { CHECK_STATUS(launchKernel( y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, @@ -90,4 +94,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, } return INFINI_STATUS_SUCCESS; } -} // namespace op::adaptive_avg_pool3d::nvidia \ No newline at end of file +} // namespace op::adaptive_avg_pool3d::nvidia diff --git a/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cuh b/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cuh index 973a0a54a..09d2f5cc8 100644 --- a/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cuh +++ b/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cuh @@ -5,4 +5,4 @@ DESCRIPTOR(nvidia) -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/adaptive_avg_pool3d/operator.cc b/src/infiniop/ops/adaptive_avg_pool3d/operator.cc index 738402862..0b2036a6a 100644 --- a/src/infiniop/ops/adaptive_avg_pool3d/operator.cc +++ b/src/infiniop/ops/adaptive_avg_pool3d/operator.cc @@ -9,23 +9,14 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) #include "nvidia/adaptive_avg_pool3d_nvidia.cuh" #endif -#ifdef ENABLE_ASCEND_API -#include "ascend/adaptive_avg_pool3d_ascend.h" -#endif -#ifdef ENABLE_CAMBRICON_API -#include "bang/adaptive_avg_pool3d_bang.h" -#endif #ifdef ENABLE_METAX_API #include "metax/adaptive_avg_pool3d_metax.h" #endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/adaptive_avg_pool3d_kunlun.h" -#endif #ifdef ENABLE_MOORE_API #include "moore/adaptive_avg_pool3d_moore.h" #endif -__C infiniStatus_t infiniopCreateAdaptiveAvgPool3DDescriptor( +__INFINI_C infiniStatus_t infiniopCreateAdaptiveAvgPool3DDescriptor( infiniopHandle_t handle, infiniopAdaptiveAvgPool3DDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y, @@ -63,15 +54,8 @@ __C infiniStatus_t infiniopCreateAdaptiveAvgPool3DDescriptor( #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax); #endif -#ifdef ENABLE_ASCEND_API - CREATE(INFINI_DEVICE_ASCEND, ascend); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CREATE(INFINI_DEVICE_CAMBRICON, bang); -#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } #undef CREATE @@ -79,7 +63,7 @@ __C infiniStatus_t infiniopCreateAdaptiveAvgPool3DDescriptor( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t 
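
// ---------------------------------------------------------------------------
// Note on the dispatch pattern above, where a runtime query of
// maxThreadsPerBlock() selects a compile-time BLOCK_SIZE template argument.
// A minimal standalone sketch; `fakeLaunch` and `dispatch` are illustrative
// stand-ins, not repo APIs.
#include <cstddef>
#include <cstdio>

template <unsigned int BLOCK_SIZE>
int fakeLaunch(std::size_t total) {
    // Mirrors the grid-size computation used by the kernels above.
    std::size_t num_blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE;
    std::printf("BLOCK_SIZE=%u -> %zu blocks\n", BLOCK_SIZE, num_blocks);
    return 0; // stands in for INFINI_STATUS_SUCCESS
}

int dispatch(std::size_t total, unsigned int max_threads_per_block) {
    switch (max_threads_per_block) {
    case 1024: return fakeLaunch<1024>(total);
    case 512:  return fakeLaunch<512>(total);
    default:   return -1; // stands in for ..._ARCHITECTURE_NOT_SUPPORTED
    }
}

int main() { return dispatch(100000, 1024); }
// ---------------------------------------------------------------------------
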
infiniopGetAdaptiveAvgPool3DWorkspaceSize( +__INFINI_C infiniStatus_t infiniopGetAdaptiveAvgPool3DWorkspaceSize( infiniopAdaptiveAvgPool3DDescriptor_t desc, size_t *size) { @@ -110,15 +94,8 @@ __C infiniStatus_t infiniopGetAdaptiveAvgPool3DWorkspaceSize( #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax); #endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - GET(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_ASCEND_API - GET(INFINI_DEVICE_ASCEND, ascend); -#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } #undef GET @@ -126,7 +103,7 @@ __C infiniStatus_t infiniopGetAdaptiveAvgPool3DWorkspaceSize( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopAdaptiveAvgPool3D( +__INFINI_C infiniStatus_t infiniopAdaptiveAvgPool3D( infiniopAdaptiveAvgPool3DDescriptor_t desc, void *workspace, size_t workspace_size, @@ -159,21 +136,14 @@ __C infiniStatus_t infiniopAdaptiveAvgPool3D( #ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax); #endif -#ifdef ENABLE_ASCEND_API - CALCULATE(INFINI_DEVICE_ASCEND, ascend); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CALCULATE(INFINI_DEVICE_CAMBRICON, bang); -#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } #undef CALCULATE return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopDestroyAdaptiveAvgPool3DDescriptor(infiniopAdaptiveAvgPool3DDescriptor_t desc) { +__INFINI_C infiniStatus_t infiniopDestroyAdaptiveAvgPool3DDescriptor(infiniopAdaptiveAvgPool3DDescriptor_t desc) { #define DELETE(CASE, NAMESPACE) \ case CASE: \ delete reinterpret_cast(desc); \ @@ -201,16 +171,9 @@ __C infiniStatus_t infiniopDestroyAdaptiveAvgPool3DDescriptor(infiniopAdaptiveAv #ifdef ENABLE_METAX_API DELETE(INFINI_DEVICE_METAX, metax); #endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - DELETE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_ASCEND_API - DELETE(INFINI_DEVICE_ASCEND, ascend); -#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } #undef DELETE return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} \ No newline at end of file +} diff --git a/src/infiniop/ops/addr/addr.h b/src/infiniop/ops/addr/addr.h index 6cf5b20fc..426bdd6b3 100644 --- a/src/infiniop/ops/addr/addr.h +++ b/src/infiniop/ops/addr/addr.h @@ -92,4 +92,4 @@ struct AddrInfo { } }; -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/addr/cpu/addr_cpu.cc b/src/infiniop/ops/addr/cpu/addr_cpu.cc index 52339164c..a2c4526cf 100644 --- a/src/infiniop/ops/addr/cpu/addr_cpu.cc +++ b/src/infiniop/ops/addr/cpu/addr_cpu.cc @@ -18,20 +18,38 @@ infiniStatus_t addr_impl(Tdata *out, float beta = info.beta; float alpha = info.alpha; -#pragma omp parallel for collapse(2) - for (size_t i = 0; i < n; ++i) { - for (size_t j = 0; j < m; ++j) { - if constexpr (std::is_same::value || std::is_same::value) { - float a = utils::cast(vec1[i * info.vec1_stride]); - float b = utils::cast(vec2[j * info.vec2_stride]); - float c = utils::cast(input[i * info.input_stride0 + j * info.input_stride1]); - out[i * info.output_stride0 + j * info.output_stride1] = utils::cast(alpha * a * b + beta * c); - } else { - float a = vec1[i * info.vec1_stride], b = vec2[j * info.vec2_stride], c = input[i * info.input_stride0 + j * info.input_stride1]; - out[i * info.output_stride0 + j * info.output_stride1] = utils::cast(alpha * a * b 
+ beta * c); - } + size_t total = n * m; + +#pragma omp parallel for + for (ptrdiff_t idx = 0; idx < (ptrdiff_t)total; ++idx) { + + // Decode (i, j) + size_t i = idx / m; + size_t j = idx % m; + + size_t v1_idx = i * info.vec1_stride; + size_t v2_idx = j * info.vec2_stride; + size_t in_idx = i * info.input_stride0 + j * info.input_stride1; + size_t out_idx = i * info.output_stride0 + j * info.output_stride1; + + if constexpr (std::is_same<Tdata, fp16_t>::value || std::is_same<Tdata, bf16_t>::value) { + + float a = utils::cast<float>(vec1[v1_idx]); + float b = utils::cast<float>(vec2[v2_idx]); + float c = utils::cast<float>(input[in_idx]); + + out[out_idx] = utils::cast<Tdata>(alpha * a * b + beta * c); + + } else { + + float a = (float)vec1[v1_idx]; + float b = (float)vec2[v2_idx]; + float c = (float)input[in_idx]; + + out[out_idx] = utils::cast<Tdata>(alpha * a * b + beta * c); } } + return INFINI_STATUS_SUCCESS; } @@ -90,4 +108,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::addr::cpu \ No newline at end of file +} // namespace op::addr::cpu diff --git a/src/infiniop/ops/addr/cpu/addr_cpu.h b/src/infiniop/ops/addr/cpu/addr_cpu.h index e606c1816..e2da1026a 100644 --- a/src/infiniop/ops/addr/cpu/addr_cpu.h +++ b/src/infiniop/ops/addr/cpu/addr_cpu.h @@ -4,4 +4,4 @@ DESCRIPTOR(cpu) -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/addr/cuda/kernel.cu b/src/infiniop/ops/addr/cuda/kernel.cu index b6af54e05..e920c372f 100644 --- a/src/infiniop/ops/addr/cuda/kernel.cu +++ b/src/infiniop/ops/addr/cuda/kernel.cu @@ -35,4 +35,4 @@ __device__ void addr_kernel(T *out, const T *input, const T *vec1, out[out_idx] = out_val; __syncthreads(); } -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/addr/metax/addr_metax.h b/src/infiniop/ops/addr/metax/addr_metax.h index 9a8acdbe2..3c7dad51b 100644 --- a/src/infiniop/ops/addr/metax/addr_metax.h +++ b/src/infiniop/ops/addr/metax/addr_metax.h @@ -5,4 +5,4 @@ DESCRIPTOR(metax) -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/addr/metax/addr_nvidia.maca b/src/infiniop/ops/addr/metax/addr_nvidia.maca index 443d59cff..1187c81e2 100644 --- a/src/infiniop/ops/addr/metax/addr_nvidia.maca +++ b/src/infiniop/ops/addr/metax/addr_nvidia.maca @@ -12,14 +12,14 @@ INFINIOP_METAX_KERNEL Addr(Tdata *output, const Tdata *input, const Tdata *vec1, float alpha, ptrdiff_t stride1, ptrdiff_t stride2, ptrdiff_t out_stride_0, ptrdiff_t out_stride_1, ptrdiff_t in_stride_0, ptrdiff_t in_stride_1) { - addr_kernel(output, input, vec1, vec2, n, m, beta, alpha, stride1, - stride2, out_stride_0, out_stride_1, in_stride_0, - in_stride_1); + addr_kernel(output, input, vec1, vec2, n, m, beta, alpha, stride1, + stride2, out_stride_0, out_stride_1, in_stride_0, + in_stride_1); } namespace op::addr::metax { struct Descriptor::Opaque { - std::shared_ptr internal; + std::shared_ptr internal; }; Descriptor::~Descriptor() { delete _opaque; } @@ -32,16 +32,15 @@ infiniStatus_t Descriptor::create(infiniopHandle_t handle, infiniopTensorDescriptor_t vec2_desc, float beta, float alpha) { - auto info = - AddrInfo::create(input_desc, out_desc, vec1_desc, vec2_desc, beta, alpha); - CHECK_RESULT(info); - size_t workspace_size = 0; + auto info = AddrInfo::create(input_desc, out_desc, vec1_desc, vec2_desc, beta, alpha); + CHECK_RESULT(info); + size_t workspace_size = 0; + 
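
// ---------------------------------------------------------------------------
// Note on the flattened loop in the addr CPU hunk above: decoding
// i = idx / m, j = idx % m from a single parallel index visits exactly the
// (i, j) pairs of the former collapse(2) double loop. A minimal standalone
// check, independent of the repo types:
#include <cassert>
#include <cstddef>

int main() {
    const std::size_t n = 5, m = 7;
    std::size_t visited = 0;
    for (std::size_t idx = 0; idx < n * m; ++idx) {
        std::size_t i = idx / m; // slow (row) index
        std::size_t j = idx % m; // fast (column) index
        assert(i < n && j < m);
        assert(i * m + j == idx); // decode is the exact inverse of flattening
        ++visited;
    }
    assert(visited == n * m); // full coverage, no duplicates
    return 0;
}
// ---------------------------------------------------------------------------
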
*desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; } template @@ -51,58 +50,58 @@ launchKernel(infiniDtype_t dtype, void *output, const void *input, float alpha, ptrdiff_t stride1, ptrdiff_t stride2, ptrdiff_t out_stride_0, ptrdiff_t out_stride_1, ptrdiff_t in_stride_0, ptrdiff_t in_stride_1, hcStream_t stream) { - unsigned int dn = std::min((unsigned int)n, BLOCK_SIZE), - dm = std::min((unsigned int)m, BLOCK_SIZE); - dim3 grid = {(unsigned int)((n + dn - 1) / dn), - (unsigned int)((m + dm - 1) / dm)}; - dim3 block = {dn, dm}; - switch (dtype) { - case INFINI_DTYPE_F32: - Addr<<>>( - (float *)output, (const float *)input, (const float *)vec1, - (const float *)vec2, n, m, beta, alpha, stride1, stride2, out_stride_0, - out_stride_1, in_stride_0, in_stride_1); - break; - case INFINI_DTYPE_F16: - Addr<__half><<>>( - (__half *)output, (__half *)input, (__half *)vec1, (__half *)vec2, n, m, - beta, alpha, stride1, stride2, out_stride_0, out_stride_1, in_stride_0, - in_stride_1); - break; - case INFINI_DTYPE_BF16: - Addr<__hpcc_bfloat16><<>>( - (__hpcc_bfloat16 *)output, (__hpcc_bfloat16 *)input, - (__hpcc_bfloat16 *)vec1, (__hpcc_bfloat16 *)vec2, n, m, beta, alpha, - stride1, stride2, out_stride_0, out_stride_1, in_stride_0, in_stride_1); - break; - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - return INFINI_STATUS_SUCCESS; + unsigned int dn = std::min((unsigned int)n, BLOCK_SIZE), + dm = std::min((unsigned int)m, BLOCK_SIZE); + dim3 grid = {(unsigned int)((n + dn - 1) / dn), + (unsigned int)((m + dm - 1) / dm)}; + dim3 block = {dn, dm}; + switch (dtype) { + case INFINI_DTYPE_F32: + Addr<<>>( + (float *)output, (const float *)input, (const float *)vec1, + (const float *)vec2, n, m, beta, alpha, stride1, stride2, out_stride_0, + out_stride_1, in_stride_0, in_stride_1); + break; + case INFINI_DTYPE_F16: + Addr<__half><<>>( + (__half *)output, (__half *)input, (__half *)vec1, (__half *)vec2, n, m, + beta, alpha, stride1, stride2, out_stride_0, out_stride_1, in_stride_0, + in_stride_1); + break; + case INFINI_DTYPE_BF16: + Addr<__hpcc_bfloat16><<>>( + (__hpcc_bfloat16 *)output, (__hpcc_bfloat16 *)input, + (__hpcc_bfloat16 *)vec1, (__hpcc_bfloat16 *)vec2, n, m, beta, alpha, + stride1, stride2, out_stride_0, out_stride_1, in_stride_0, in_stride_1); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; } infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, void *out, const void *input, const void *vec1, const void *vec2, void *stream_) const { - hcStream_t stream = (hcStream_t)stream_; + hcStream_t stream = (hcStream_t)stream_; - if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { - CHECK_STATUS(launchKernel( - _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, - _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, - _info.output_stride0, _info.output_stride1, _info.input_stride0, - _info.input_stride1, stream)); - } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { - CHECK_STATUS(launchKernel( - _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, - _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, - _info.output_stride0, _info.output_stride1, _info.input_stride0, - _info.input_stride1, stream)); - } else { - return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; - } - return 
INFINI_STATUS_SUCCESS; + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, + _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, + _info.output_stride0, _info.output_stride1, _info.input_stride0, + _info.input_stride1, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, + _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, + _info.output_stride0, _info.output_stride1, _info.input_stride0, + _info.input_stride1, stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; } -} // namespace op::addr::metax \ No newline at end of file +} // namespace op::addr::metax diff --git a/src/infiniop/ops/addr/moore/addr_kernel.h b/src/infiniop/ops/addr/moore/addr_kernel.h index 49e83aced..dcc2f7d49 100644 --- a/src/infiniop/ops/addr/moore/addr_kernel.h +++ b/src/infiniop/ops/addr/moore/addr_kernel.h @@ -33,4 +33,4 @@ __device__ void addr_kernel(T *out, const T *input, const T *vec1, out[out_idx] = out_val; __syncthreads(); } -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/addr/moore/addr_moore.h b/src/infiniop/ops/addr/moore/addr_moore.h index d32182597..e090e6182 100644 --- a/src/infiniop/ops/addr/moore/addr_moore.h +++ b/src/infiniop/ops/addr/moore/addr_moore.h @@ -5,4 +5,4 @@ DESCRIPTOR(moore) -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/addr/moore/addr_moore.mu b/src/infiniop/ops/addr/moore/addr_moore.mu index 95626044b..b00037033 100644 --- a/src/infiniop/ops/addr/moore/addr_moore.mu +++ b/src/infiniop/ops/addr/moore/addr_moore.mu @@ -7,13 +7,13 @@ template INFINIOP_MOORE_KERNEL Addr(Tdata *output, const Tdata *input, const Tdata *vec1, - const Tdata *vec2, size_t n, size_t m, float beta, - float alpha, ptrdiff_t stride1, ptrdiff_t stride2, - ptrdiff_t out_stride_0, ptrdiff_t out_stride_1, - ptrdiff_t in_stride_0, ptrdiff_t in_stride_1) { - addr_kernel(output, input, vec1, vec2, n, m, beta, alpha, stride1, - stride2, out_stride_0, out_stride_1, in_stride_0, - in_stride_1); + const Tdata *vec2, size_t n, size_t m, float beta, + float alpha, ptrdiff_t stride1, ptrdiff_t stride2, + ptrdiff_t out_stride_0, ptrdiff_t out_stride_1, + ptrdiff_t in_stride_0, ptrdiff_t in_stride_1) { + addr_kernel(output, input, vec1, vec2, n, m, beta, alpha, stride1, + stride2, out_stride_0, out_stride_1, in_stride_0, + in_stride_1); } namespace op::addr::moore { @@ -31,17 +31,16 @@ infiniStatus_t Descriptor::create(infiniopHandle_t handle, infiniopTensorDescriptor_t vec2_desc, float beta, float alpha) { - auto info = - AddrInfo::create(input_desc, out_desc, vec1_desc, vec2_desc, beta, alpha); - CHECK_RESULT(info); - size_t workspace_size = 0; + auto info = AddrInfo::create(input_desc, out_desc, vec1_desc, vec2_desc, beta, alpha); + CHECK_RESULT(info); + size_t workspace_size = 0; - *desc_ptr = new Descriptor( - info.take(), workspace_size, - new Opaque{ - reinterpret_cast(handle)->internal()}, - handle->device, handle->device_id); - return INFINI_STATUS_SUCCESS; + *desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{ + reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; } template @@ -52,57 +51,57 @@ infiniStatus_t launchKernel(infiniDtype_t dtype, void 
*output, ptrdiff_t out_stride_0, ptrdiff_t out_stride_1, ptrdiff_t in_stride_0, ptrdiff_t in_stride_1, musaStream_t stream) { - unsigned int dn = std::min((unsigned int)n, BLOCK_SIZE), - dm = std::min((unsigned int)m, BLOCK_SIZE); - dim3 grid = {(unsigned int)((n + dn - 1) / dn), - (unsigned int)((m + dm - 1) / dm)}; - dim3 block = {dn, dm}; - switch (dtype) { - case INFINI_DTYPE_F32: - Addr<<>>( - (float *)output, (const float *)input, (const float *)vec1, - (const float *)vec2, n, m, beta, alpha, stride1, stride2, out_stride_0, - out_stride_1, in_stride_0, in_stride_1); - break; - case INFINI_DTYPE_F16: - Addr<__half><<>>( - (__half *)output, (__half *)input, (__half *)vec1, (__half *)vec2, n, m, - beta, alpha, stride1, stride2, out_stride_0, out_stride_1, in_stride_0, - in_stride_1); - break; - case INFINI_DTYPE_BF16: - Addr<<>>( - (cuda_bfloat16 *)output, (cuda_bfloat16 *)input, (cuda_bfloat16 *)vec1, - (cuda_bfloat16 *)vec2, n, m, beta, alpha, stride1, stride2, - out_stride_0, out_stride_1, in_stride_0, in_stride_1); - break; - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - return INFINI_STATUS_SUCCESS; + unsigned int dn = std::min((unsigned int)n, BLOCK_SIZE), + dm = std::min((unsigned int)m, BLOCK_SIZE); + dim3 grid = {(unsigned int)((n + dn - 1) / dn), + (unsigned int)((m + dm - 1) / dm)}; + dim3 block = {dn, dm}; + switch (dtype) { + case INFINI_DTYPE_F32: + Addr<<>>( + (float *)output, (const float *)input, (const float *)vec1, + (const float *)vec2, n, m, beta, alpha, stride1, stride2, out_stride_0, + out_stride_1, in_stride_0, in_stride_1); + break; + case INFINI_DTYPE_F16: + Addr<__half><<>>( + (__half *)output, (__half *)input, (__half *)vec1, (__half *)vec2, n, m, + beta, alpha, stride1, stride2, out_stride_0, out_stride_1, in_stride_0, + in_stride_1); + break; + case INFINI_DTYPE_BF16: + Addr<<>>( + (cuda_bfloat16 *)output, (cuda_bfloat16 *)input, (cuda_bfloat16 *)vec1, + (cuda_bfloat16 *)vec2, n, m, beta, alpha, stride1, stride2, + out_stride_0, out_stride_1, in_stride_0, in_stride_1); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; } infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, void *out, const void *input, const void *vec1, const void *vec2, void *stream_) const { - musaStream_t stream = (musaStream_t)stream_; + musaStream_t stream = (musaStream_t)stream_; - if (_opaque->internal->maxThreadsPerBlock() ==MOORE_BLOCK_SIZE_1024) { - CHECK_STATUS(launchKernel( - _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, - _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, - _info.output_stride0, _info.output_stride1, _info.input_stride0, - _info.input_stride1, stream)); - } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { - CHECK_STATUS(launchKernel( - _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, - _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, - _info.output_stride0, _info.output_stride1, _info.input_stride0, - _info.input_stride1, stream)); - } else { - return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; - } - return INFINI_STATUS_SUCCESS; + if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, + _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, + _info.output_stride0, _info.output_stride1, _info.input_stride0, + _info.input_stride1, stream)); + } else if 
(_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, + _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, + _info.output_stride0, _info.output_stride1, _info.input_stride0, + _info.input_stride1, stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; } -} // namespace op::addr::moore \ No newline at end of file +} // namespace op::addr::moore diff --git a/src/infiniop/ops/addr/nvidia/addr_nvidia.cu b/src/infiniop/ops/addr/nvidia/addr_nvidia.cu index cbd5279ce..00deeab48 100644 --- a/src/infiniop/ops/addr/nvidia/addr_nvidia.cu +++ b/src/infiniop/ops/addr/nvidia/addr_nvidia.cu @@ -99,6 +99,12 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, _info.output_stride0, _info.output_stride1, _info.input_stride0, _info.input_stride1, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) { + CHECK_STATUS(launchKernel( + _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, + _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, + _info.output_stride0, _info.output_stride1, _info.input_stride0, + _info.input_stride1, stream)); } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { CHECK_STATUS(launchKernel( _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, @@ -111,4 +117,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, return INFINI_STATUS_SUCCESS; } -} // namespace op::addr::nvidia \ No newline at end of file +} // namespace op::addr::nvidia diff --git a/src/infiniop/ops/addr/nvidia/addr_nvidia.cuh b/src/infiniop/ops/addr/nvidia/addr_nvidia.cuh index 9ebe7f2c3..151da410b 100644 --- a/src/infiniop/ops/addr/nvidia/addr_nvidia.cuh +++ b/src/infiniop/ops/addr/nvidia/addr_nvidia.cuh @@ -4,4 +4,4 @@ DESCRIPTOR(nvidia) -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/addr/operator.cc b/src/infiniop/ops/addr/operator.cc index 6e3724743..1b5101352 100644 --- a/src/infiniop/ops/addr/operator.cc +++ b/src/infiniop/ops/addr/operator.cc @@ -16,7 +16,7 @@ #include "moore/addr_moore.h" #endif -__C infiniStatus_t infiniopCreateAddrDescriptor( +__INFINI_C infiniStatus_t infiniopCreateAddrDescriptor( infiniopHandle_t handle, infiniopAddrDescriptor_t *desc_ptr, infiniopTensorDescriptor_t out, @@ -60,15 +60,8 @@ __C infiniStatus_t infiniopCreateAddrDescriptor( #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax); #endif -#ifdef ENABLE_ASCEND_API - CREATE(INFINI_DEVICE_ASCEND, ascend); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CREATE(INFINI_DEVICE_CAMBRICON, bang); -#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } #undef CREATE @@ -76,7 +69,7 @@ __C infiniStatus_t infiniopCreateAddrDescriptor( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopGetAddrWorkspaceSize( +__INFINI_C infiniStatus_t infiniopGetAddrWorkspaceSize( infiniopAddrDescriptor_t desc, size_t *size) { @@ -107,15 +100,8 @@ __C infiniStatus_t infiniopGetAddrWorkspaceSize( #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax); #endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - GET(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef 
ENABLE_ASCEND_API - GET(INFINI_DEVICE_ASCEND, ascend); -#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } #undef GET @@ -123,7 +109,7 @@ __C infiniStatus_t infiniopGetAddrWorkspaceSize( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopAddr( +__INFINI_C infiniStatus_t infiniopAddr( infiniopAddrDescriptor_t desc, void *workspace, size_t workspace_size, @@ -160,15 +146,8 @@ __C infiniStatus_t infiniopAddr( #ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax); #endif -#ifdef ENABLE_ASCEND_API - CALCULATE(INFINI_DEVICE_ASCEND, ascend); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CALCULATE(INFINI_DEVICE_CAMBRICON, bang); -#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } #undef CALCULATE @@ -176,7 +155,7 @@ __C infiniStatus_t infiniopAddr( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopDestroyAddrDescriptor(infiniopAddrDescriptor_t desc) { +__INFINI_C infiniStatus_t infiniopDestroyAddrDescriptor(infiniopAddrDescriptor_t desc) { #define DELETE(CASE, NAMESPACE) \ case CASE: \ @@ -205,18 +184,11 @@ __C infiniStatus_t infiniopDestroyAddrDescriptor(infiniopAddrDescriptor_t desc) #ifdef ENABLE_METAX_API DELETE(INFINI_DEVICE_METAX, metax); #endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - DELETE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_ASCEND_API - DELETE(INFINI_DEVICE_ASCEND, ascend); -#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } #undef DELETE return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} \ No newline at end of file +} diff --git a/src/infiniop/ops/argwhere/argwhere.h b/src/infiniop/ops/argwhere/argwhere.h index afb067576..09fc872f4 100644 --- a/src/infiniop/ops/argwhere/argwhere.h +++ b/src/infiniop/ops/argwhere/argwhere.h @@ -72,4 +72,4 @@ class ArgwhereInfo { x_desc->numel()}); } }; -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc b/src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc index 6a92047a2..95d4b700b 100644 --- a/src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc +++ b/src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc @@ -43,7 +43,7 @@ infiniStatus_t calculateArgWhere( for (size_t i = 0; i < info.num_elements; i++) { size_t pos = 0, tem = i; std::vector<size_t> position(info.strides.size()); - for (int j = info.strides.size() - 1; j >= 0; j--) { + for (size_t j = info.strides.size(); j-- > 0;) { position[j] = tem % info.shapes[j]; tem /= info.shapes[j]; pos += position[j] * info.strides[j]; @@ -77,4 +77,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::argwhere::cpu \ No newline at end of file +} // namespace op::argwhere::cpu diff --git a/src/infiniop/ops/argwhere/cpu/argwhere_cpu.h b/src/infiniop/ops/argwhere/cpu/argwhere_cpu.h index 1e6461e75..46d828fde 100644 --- a/src/infiniop/ops/argwhere/cpu/argwhere_cpu.h +++ b/src/infiniop/ops/argwhere/cpu/argwhere_cpu.h @@ -5,4 +5,4 @@ DESCRIPTOR(cpu) -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/argwhere/cuda/kernel.cuh b/src/infiniop/ops/argwhere/cuda/kernel.cuh index ff4aae126..e562cf88e 100644 --- a/src/infiniop/ops/argwhere/cuda/kernel.cuh +++ b/src/infiniop/ops/argwhere/cuda/kernel.cuh @@ -21,7 +21,9 @@ __device__ size_t pos2dest(size_t *pos, size_t ndim, const ptrdiff_t *strides) { template <typename T> __global__ void parallel_block_argwhere_kernel(T *data,
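
// ---------------------------------------------------------------------------
// Note on the loop fixed in the argwhere CPU hunk above: an unsigned index
// must not be tested with `j >= 0` (size_t wraps around instead of going
// negative, so the condition is always true and the loop never terminates).
// The `j-- > 0` idiom visits ndim-1 .. 0 exactly once. A minimal standalone
// check:
#include <cassert>
#include <cstddef>

int main() {
    const std::size_t ndim = 4;
    std::size_t visits = 0;
    for (std::size_t j = ndim; j-- > 0;) {
        assert(j < ndim); // j takes the values 3, 2, 1, 0
        ++visits;
    }
    assert(visits == ndim);
    static_assert(static_cast<std::size_t>(0) - 1 > 0,
                  "unsigned underflow wraps to a huge positive value");
    return 0;
}
// ---------------------------------------------------------------------------
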
int64_t *results, size_t N, const size_t *shapes, const ptrdiff_t *strides, size_t ndim, size_t *count) { - extern __shared__ size_t tmp[]; + extern __shared__ char smem[]; + size_t *tmp = reinterpret_cast<size_t *>(smem); + size_t pos1[5], pos2[5]; // indices of the two elements within the tensor bool is_zero1 = false, is_zero2 = false; int tid = threadIdx.x; @@ -90,7 +92,9 @@ __global__ void parallel_block_argwhere_kernel(T *data, int64_t *results, template __global__ void parallel_block_scan_kernel(size_t N, int64_t *pre_sum) { // single block scan - extern __shared__ int64_t tmp[]; + extern __shared__ char smem[]; + int64_t *tmp = reinterpret_cast<int64_t *>(smem); + int tid = threadIdx.x; int leaf_num = blockDim.x * 2; // equals to length of tmp @@ -212,4 +216,4 @@ __global__ void add_block_offset_kernel(int64_t *results, int64_t *tmp, } } } -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/argwhere/metax/argwhere_metax.h b/src/infiniop/ops/argwhere/metax/argwhere_metax.h index 4e85f870c..8335f413b 100644 --- a/src/infiniop/ops/argwhere/metax/argwhere_metax.h +++ b/src/infiniop/ops/argwhere/metax/argwhere_metax.h @@ -5,4 +5,4 @@ DESCRIPTOR(metax) -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/argwhere/metax/argwhere_metax.maca b/src/infiniop/ops/argwhere/metax/argwhere_metax.maca index 7f3c5fb4a..41e621c43 100644 --- a/src/infiniop/ops/argwhere/metax/argwhere_metax.maca +++ b/src/infiniop/ops/argwhere/metax/argwhere_metax.maca @@ -91,4 +91,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, return INFINI_STATUS_SUCCESS; } -} // namespace op::argwhere::metax \ No newline at end of file +} // namespace op::argwhere::metax diff --git a/src/infiniop/ops/argwhere/moore/argwhere_kernel.h b/src/infiniop/ops/argwhere/moore/argwhere_kernel.h index dd67c877c..e32727e9a 100644 --- a/src/infiniop/ops/argwhere/moore/argwhere_kernel.h +++ b/src/infiniop/ops/argwhere/moore/argwhere_kernel.h @@ -88,4 +88,4 @@ __global__ void parallel_block_argwhere_kernel(T *data, int64_t *results, } } -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/argwhere/moore/argwhere_moore.h b/src/infiniop/ops/argwhere/moore/argwhere_moore.h index 63149291a..903915893 100644 --- a/src/infiniop/ops/argwhere/moore/argwhere_moore.h +++ b/src/infiniop/ops/argwhere/moore/argwhere_moore.h @@ -5,4 +5,4 @@ DESCRIPTOR(moore) -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/argwhere/moore/argwhere_moore.mu b/src/infiniop/ops/argwhere/moore/argwhere_moore.mu index 4d5007731..59c7b9b88 100644 --- a/src/infiniop/ops/argwhere/moore/argwhere_moore.mu +++ b/src/infiniop/ops/argwhere/moore/argwhere_moore.mu @@ -14,24 +14,23 @@ // data, results, N, shapes, strides, ndim, count); // } - infiniStatus_t launchKernel(const void *data, int64_t *results, size_t N, size_t M, const size_t *shapes, const ptrdiff_t *strides, size_t ndim, infiniDtype_t dtype, size_t *count) { - if (dtype == INFINI_DTYPE_F32) { - parallel_block_argwhere_kernel<<<1, M / 2, M*sizeof(size_t)>>>( - (float *)data, results, N, shapes, strides, ndim, count); - } else { - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - return INFINI_STATUS_SUCCESS; + if (dtype == INFINI_DTYPE_F32) { + parallel_block_argwhere_kernel<<<1, M / 2, M * sizeof(size_t)>>>( + (float *)data, results, N, shapes, strides, ndim, count); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; } namespace op::argwhere::moore { struct Descriptor::Opaque { - std::shared_ptr internal; + std::shared_ptr
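
// ---------------------------------------------------------------------------
// Note on the shared-memory change above: CUDA-style compilers reject two
// `extern __shared__` arrays declared with the same name but different
// element types in one translation unit, so the hunk declares a single raw
// char buffer and casts it to the type each kernel needs. A minimal
// host-side analogue of the same pattern (one raw buffer, one typed view,
// with alignment made explicit):
#include <cassert>
#include <cstddef>

int main() {
    alignas(std::max_align_t) char smem[64]; // raw, untyped storage
    std::size_t *tmp = reinterpret_cast<std::size_t *>(smem); // typed view
    for (std::size_t i = 0; i < 64 / sizeof(std::size_t); ++i) {
        tmp[i] = i;
    }
    assert(tmp[3] == 3);
    return 0;
}
// ---------------------------------------------------------------------------
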
+        parallel_block_argwhere_kernel<<<1, M / 2, M * sizeof(size_t)>>>(
+            (float *)data, results, N, shapes, strides, ndim, count);
+    } else {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    return INFINI_STATUS_SUCCESS;
 }
 
 namespace op::argwhere::moore {
 
 struct Descriptor::Opaque {
-  std::shared_ptr<device::moore::Handle::Internal> internal;
+    std::shared_ptr<device::moore::Handle::Internal>
+        internal;
 };
 
 Descriptor::~Descriptor() = default;
@@ -39,69 +42,68 @@ Descriptor::~Descriptor() = default;
 infiniStatus_t Descriptor::create(infiniopHandle_t handle, Descriptor **desc_ptr,
                                   infiniopTensorDescriptor_t x_desc) {
-  auto info = ArgwhereInfo::create(x_desc);
-  CHECK_RESULT(info);
-  size_t workspace_size = x_desc->ndim() * sizeof(size_t) * 2 +
-                          x_desc->ndim() * sizeof(int64_t) * x_desc->numel() +
-                          sizeof(size_t);
-  *desc_ptr = new Descriptor(
-      info.take(), workspace_size,
-      new Opaque{
-          reinterpret_cast<device::moore::Handle *>(handle)->internal()},
-      handle->device, handle->device_id);
-  return INFINI_STATUS_SUCCESS;
+    auto info = ArgwhereInfo::create(x_desc);
+    CHECK_RESULT(info);
+    size_t workspace_size = x_desc->ndim() * sizeof(size_t) * 2 + x_desc->ndim() * sizeof(int64_t) * x_desc->numel() + sizeof(size_t);
+    *desc_ptr = new Descriptor(
+        info.take(), workspace_size,
+        new Opaque{
+            reinterpret_cast<device::moore::Handle *>(handle)->internal()},
+        handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
 }
 
 int nextPowerOfTwo(int x) {
-  int power = 1;
-  while (power < x) {
-    power *= 2;
-  }
-  return power;
+    int power = 1;
+    while (power < x) {
+        power *= 2;
+    }
+    return power;
 }
 
 infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
                                      void **y, size_t *count, const void *x,
                                      void *stream) const {
-  musaStream_t moore_stream = static_cast<musaStream_t>(stream);
-  size_t ndim = _info.strides.size();
-  ptrdiff_t *strides = static_cast<ptrdiff_t *>(workspace);
-  size_t *shapes = reinterpret_cast<size_t *>(strides + ndim);
-  int64_t *result = reinterpret_cast<int64_t *>(shapes + ndim);
-  size_t *count_cuda =
-      reinterpret_cast<size_t *>(result + _info.num_elements * ndim);
-
-  musaMemcpyAsync(shapes, _info.shapes.data(),
-                  _info.shapes.size() * sizeof(size_t), musaMemcpyHostToDevice,
-                  moore_stream);
-  musaMemcpyAsync(strides, _info.strides.data(),
-                  _info.strides.size() * sizeof(ptrdiff_t),
-                  musaMemcpyHostToDevice, moore_stream);
-  // musaStreamSynchronize(moore_stream);
-  size_t M = nextPowerOfTwo(_info.num_elements);
-  // musaStreamSynchronize(moore_stream);
-  CHECK_STATUS(launchKernel(x, result, _info.num_elements, M, shapes, strides,
-                            ndim, INFINI_DTYPE_F32, count_cuda));
-  // Read the value of count_cuda from device memory
-  musaMemcpyAsync(count, count_cuda, sizeof(size_t), musaMemcpyDeviceToHost,
-                  moore_stream);
-  musaStreamSynchronize(moore_stream);
+    musaStream_t moore_stream = static_cast<musaStream_t>(stream);
+    size_t ndim = _info.strides.size();
+    ptrdiff_t *strides = static_cast<ptrdiff_t *>(workspace);
+    size_t *shapes = reinterpret_cast<size_t *>(strides + ndim);
+    int64_t *result = reinterpret_cast<int64_t *>(shapes + ndim);
+    size_t *count_cuda = reinterpret_cast<size_t *>(result + _info.num_elements * ndim);
 
-  // Write back the results
-  *y = new int64_t[(*count) * ndim];
-  // cudaStreamSynchronize(cuda_stream);
+    musaMemcpyAsync(shapes, _info.shapes.data(),
+                    _info.shapes.size() * sizeof(size_t), musaMemcpyHostToDevice,
+                    moore_stream);
+    musaMemcpyAsync(strides, _info.strides.data(),
+                    _info.strides.size() * sizeof(ptrdiff_t),
+                    musaMemcpyHostToDevice, moore_stream);
+    // musaStreamSynchronize(moore_stream);
+    size_t M = nextPowerOfTwo(_info.num_elements);
+    // musaStreamSynchronize(moore_stream);
+    CHECK_STATUS(launchKernel(x, result, _info.num_elements, M, shapes, strides,
+                              ndim, INFINI_DTYPE_F32, count_cuda));
+    // Read the value of count_cuda from device memory
+    musaMemcpyAsync(count, count_cuda, sizeof(size_t), musaMemcpyDeviceToHost,
+                    moore_stream);
+    musaStreamSynchronize(moore_stream);
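+    // The synchronize above is required: *count is read on the host immediately
+    // below to size the output allocation. Note that *y is allocated with new[]
+    // here, so ownership of the host buffer passes to the caller.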
+    // Write back the results
+    *y = new int64_t[(*count) * ndim];
+    // cudaStreamSynchronize(cuda_stream);
 
-  musaMemcpyAsync(*y, result, sizeof(int64_t) * (*count) * ndim,
-                  musaMemcpyDeviceToHost, moore_stream);
+    musaMemcpyAsync(*y, result, sizeof(int64_t) * (*count) * ndim,
+                    musaMemcpyDeviceToHost, moore_stream);
 
-  // cudaStreamSynchronize(cuda_stream);
-  // for (size_t i = 0; i < (*count) * ndim; i++) {
-  //   spdlog::debug("(*y)[{}]:{}", i, static_cast<int64_t *>(*y)[i]);
-  // }
-  // cudaFreeAsync(result, cuda_stream);
-  // cudaFreeAsync(count_cuda, cuda_stream);
-  return INFINI_STATUS_SUCCESS;
+    // cudaStreamSynchronize(cuda_stream);
+    // for (size_t i = 0; i < (*count) * ndim; i++) {
+    //     spdlog::debug("(*y)[{}]:{}", i, static_cast<int64_t *>(*y)[i]);
+    // }
+    // cudaFreeAsync(result, cuda_stream);
+    // cudaFreeAsync(count_cuda, cuda_stream);
+    return INFINI_STATUS_SUCCESS;
 }
 
-} // namespace op::argwhere::moore
\ No newline at end of file
+} // namespace op::argwhere::moore
diff --git a/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cuh b/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cuh
index fdc1dd8a5..b4e369ad2 100644
--- a/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cuh
+++ b/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cuh
@@ -5,4 +5,4 @@
 
 DESCRIPTOR(nvidia)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/infiniop/ops/argwhere/operator.cc b/src/infiniop/ops/argwhere/operator.cc
index d6c5dfa7f..8b5393aa0 100644
--- a/src/infiniop/ops/argwhere/operator.cc
+++ b/src/infiniop/ops/argwhere/operator.cc
@@ -11,17 +11,11 @@
 #ifdef ENABLE_METAX_API
 #include "metax/argwhere_metax.h"
 #endif
-#ifdef ENABLE_KUNLUN_API
-#include "kunlun/argwhere_kunlun.cuh"
-#endif
-#ifdef ENABLE_CAMBRICON_API
-#include "cambricon/argwhere_cambricon.cuh"
-#endif
 #ifdef ENABLE_MOORE_API
 #include "moore/argwhere_moore.h"
 #endif
 
-__C infiniStatus_t infiniopCreateArgwhereDescriptor(
+__INFINI_C infiniStatus_t infiniopCreateArgwhereDescriptor(
     infiniopHandle_t handle,
     infiniopArgwhereDescriptor_t *desc_ptr,
     infiniopTensorDescriptor_t input_desc) {
@@ -50,12 +44,6 @@ __C infiniStatus_t infiniopCreateArgwhereDescriptor(
 #ifdef ENABLE_METAX_API
         CREATE(INFINI_DEVICE_METAX, metax);
 #endif
-#ifdef ENABLE_KUNLUN_API
-        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        CREATE(INFINI_DEVICE_CAMBRICON, bang);
-#endif
 #ifdef ENABLE_MOORE_API
         CREATE(INFINI_DEVICE_MOORE, moore);
 #endif
@@ -67,7 +55,7 @@ __C infiniStatus_t infiniopCreateArgwhereDescriptor(
 
 #undef CREATE
 }
 
-__C infiniStatus_t infiniopGetArgwhereWorkspaceSize(
+__INFINI_C infiniStatus_t infiniopGetArgwhereWorkspaceSize(
     infiniopArgwhereDescriptor_t desc,
     size_t *size) {
 
@@ -92,12 +80,6 @@ __C infiniStatus_t infiniopGetArgwhereWorkspaceSize(
 #ifdef ENABLE_METAX_API
         GET(INFINI_DEVICE_METAX, metax);
 #endif
-#ifdef ENABLE_KUNLUN_API
-        GET(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        GET(INFINI_DEVICE_CAMBRICON, bang);
-#endif
 #ifdef ENABLE_MOORE_API
         GET(INFINI_DEVICE_MOORE, moore);
 #endif
@@ -109,7 +91,7 @@ __C infiniStatus_t infiniopGetArgwhereWorkspaceSize(
     return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
 }
 
-__C infiniStatus_t infiniopArgwhere(
+__INFINI_C infiniStatus_t infiniopArgwhere(
     infiniopArgwhereDescriptor_t desc,
     void *workspace,
     size_t workspace_size,
@@ -140,12 +122,6 @@ __C infiniStatus_t infiniopArgwhere(
 #ifdef ENABLE_METAX_API
         CALCULATE(INFINI_DEVICE_METAX, metax);
 #endif
-#ifdef ENABLE_KUNLUN_API
-        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
-#endif
 #ifdef ENABLE_MOORE_API
         CALCULATE(INFINI_DEVICE_MOORE, moore);
 #endif
@@ -157,7 +133,12 @@ __C infiniStatus_t infiniopArgwhere(
 
 #undef CALCULATE
 }
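+// For reference, a sketch of the dispatch (assumed; the CALCULATE macro body is
+// defined earlier in this file and is not shown in this hunk):
+//   case INFINI_DEVICE_MOORE:
+//       return reinterpret_cast<op::argwhere::moore::Descriptor *>(desc)
+//           ->calculate(workspace, workspace_size, y, count, x, stream);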
 
-__C infiniStatus_t
+__INFINI_C infiniStatus_t
 infiniopDestroyArgwhereDescriptor(infiniopArgwhereDescriptor_t desc) {
 
 #define DELETE(CASE, NAMESPACE) \
@@ -182,12 +163,6 @@ infiniopDestroyArgwhereDescriptor(infiniopArgwhereDescriptor_t desc) {
 #ifdef ENABLE_METAX_API
         DELETE(INFINI_DEVICE_METAX, metax);
 #endif
-#ifdef ENABLE_KUNLUN_API
-        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        DELETE(INFINI_DEVICE_CAMBRICON, bang);
-#endif
 #ifdef ENABLE_MOORE_API
         DELETE(INFINI_DEVICE_MOORE, moore);
 #endif
@@ -197,4 +172,4 @@ infiniopDestroyArgwhereDescriptor(infiniopArgwhereDescriptor_t desc) {
     }
 
 #undef DELETE
-}
\ No newline at end of file
+}
diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.cc b/src/infiniop/ops/asin/cpu/asin_cpu.cc
index c95eb3c24..48a05d76c 100644
--- a/src/infiniop/ops/asin/cpu/asin_cpu.cc
+++ b/src/infiniop/ops/asin/cpu/asin_cpu.cc
@@ -49,4 +49,4 @@ infiniStatus_t Descriptor::calculate(
     return INFINI_STATUS_SUCCESS;
 }
 
-} // namespace op::asin::cpu
\ No newline at end of file
+} // namespace op::asin::cpu
diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.h b/src/infiniop/ops/asin/cpu/asin_cpu.h
index a07404598..409f15779 100644
--- a/src/infiniop/ops/asin/cpu/asin_cpu.h
+++ b/src/infiniop/ops/asin/cpu/asin_cpu.h
@@ -18,4 +18,4 @@ typedef struct AsinOp {
 } AsinOp;
 } // namespace op::asin::cpu
 
-#endif // __ASIN_CPU_H__
\ No newline at end of file
+#endif // __ASIN_CPU_H__
diff --git a/src/infiniop/ops/asin/cuda/kernel.cuh b/src/infiniop/ops/asin/cuda/kernel.cuh
index 2d4160ca6..00802692b 100644
--- a/src/infiniop/ops/asin/cuda/kernel.cuh
+++ b/src/infiniop/ops/asin/cuda/kernel.cuh
@@ -39,4 +39,4 @@ typedef struct AsinOp {
 } AsinOp;
 } // namespace op::asin::cuda
 
-#endif // __ASIN_CUDA_H__
\ No newline at end of file
+#endif // __ASIN_CUDA_H__
diff --git a/src/infiniop/ops/asin/metax/asin_metax.maca b/src/infiniop/ops/asin/metax/asin_metax.maca
index 89567ac01..b09ebd936 100644
--- a/src/infiniop/ops/asin/metax/asin_metax.maca
+++ b/src/infiniop/ops/asin/metax/asin_metax.maca
@@ -12,22 +12,24 @@ Descriptor::create(infiniopHandle_t handle_, Descriptor **desc_ptr,
                    infiniopTensorDescriptor_t out_desc,
                    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-  auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
-  auto dtype = out_desc->dtype();
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
 
-  const auto &input_desc = input_desc_vec.at(0);
-  const auto &output_shape = out_desc->shape();
-  const auto &input_shape = input_desc->shape();
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
 
-  CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64,
-              INFINI_DTYPE_BF16);
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64,
+                INFINI_DTYPE_BF16);
 
-  CHECK_SAME_SHAPE(output_shape, input_shape);
+    CHECK_SAME_SHAPE(output_shape, input_shape);
 
-  // create CUDA elementwise descriptor
-  CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
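+    // Assumed behavior: this shared elementwise macro builds the op/device info
+    // pair and fills *desc_ptr; its definition lives in the common elementwise helpers.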
 
-  return INFINI_STATUS_SUCCESS;
+    return INFINI_STATUS_SUCCESS;
 }
 
 infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
@@ -35,27 +37,27 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
                                      std::vector<const void *> inputs,
                                      void *stream) const {
-  if (workspace_size < _workspace_size) {
-    return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-  }
-
-  switch (_dtype) {
-  case INFINI_DTYPE_F16:
-    return _device_info->calculate<256, cuda::AsinOp, half>(
-        _info, workspace, output, inputs, stream);
-  case INFINI_DTYPE_BF16:
-    return _device_info->calculate<256, cuda::AsinOp, cuda_bfloat16>(
-        _info, workspace, output, inputs, stream);
-  case INFINI_DTYPE_F32:
-    return _device_info->calculate<256, cuda::AsinOp, float>(
-        _info, workspace, output, inputs, stream);
-  case INFINI_DTYPE_F64:
-    return _device_info->calculate<256, cuda::AsinOp, double>(
-        _info, workspace, output, inputs, stream);
-  default:
-    return INFINI_STATUS_BAD_TENSOR_DTYPE;
-  }
-
-  return INFINI_STATUS_SUCCESS;
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::AsinOp, half>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::AsinOp, cuda_bfloat16>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::AsinOp, float>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::AsinOp, double>(
+            _info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
 }
 
-} // namespace op::asin::metax
\ No newline at end of file
+} // namespace op::asin::metax
diff --git a/src/infiniop/ops/asin/moore/asin_moore.mu b/src/infiniop/ops/asin/moore/asin_moore.mu
index b5bbd0eb9..7e2150e6c 100644
--- a/src/infiniop/ops/asin/moore/asin_moore.mu
+++ b/src/infiniop/ops/asin/moore/asin_moore.mu
@@ -14,7 +14,7 @@ infiniStatus_t Descriptor::create(
     infiniopTensorDescriptor_t out_desc,
     std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
 
-  auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
     auto dtype = out_desc->dtype();
 
     const auto &input_desc = input_desc_vec.at(0);
diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cu b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu
index 206c277b1..6def60963 100644
--- a/src/infiniop/ops/asin/nvidia/asin_nvidia.cu
+++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu
@@ -58,4 +58,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
     return INFINI_STATUS_SUCCESS;
 }
 
-} // namespace op::asin::nvidia
\ No newline at end of file
+} // namespace op::asin::nvidia
diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh b/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh
index 681879341..624676428 100644
--- a/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh
+++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh
@@ -5,4 +5,4 @@
 
 ELEMENTWISE_DESCRIPTOR(asin, nvidia)
 
-#endif // __ASIN_CUDA_API_H__
\ No newline at end of file
+#endif // __ASIN_CUDA_API_H__
diff --git a/src/infiniop/ops/asin/operator.cc b/src/infiniop/ops/asin/operator.cc
index 41d2e7897..3616819ef 100644
--- a/src/infiniop/ops/asin/operator.cc
+++ b/src/infiniop/ops/asin/operator.cc
@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/asin_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
 #include "nvidia/asin_nvidia.cuh"
 #endif
 #ifdef ENABLE_METAX_API
@@ -15,7 +15,7 @@
 #include "moore/asin_moore.h"
 #endif
 
-__C infiniStatus_t infiniopCreateAsinDescriptor(
+__INFINI_C infiniStatus_t infiniopCreateAsinDescriptor(
     infiniopHandle_t handle,
     infiniopAsinDescriptor_t *desc_ptr,
     infiniopTensorDescriptor_t output_desc,
@@ -54,7 +54,7 @@ __C infiniStatus_t infiniopCreateAsinDescriptor(
 
 #undef CREATE
 }
 
-__C infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size) {
+__INFINI_C infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size) {
 
 #define GET(CASE, NAMESPACE) \
     case CASE: \
@@ -85,7 +85,7 @@ __C infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, s
     return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
 }
 
-__C infiniStatus_t infiniopAsin(
+__INFINI_C infiniStatus_t infiniopAsin(
     infiniopAsinDescriptor_t desc,
     void *workspace,
     size_t workspace_size,
@@ -123,7 +123,7 @@ __C infiniStatus_t infiniopAsin(
 
 #undef CALCULATE
 }
 
-__C infiniStatus_t
+__INFINI_C infiniStatus_t
 infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc) {
 
 #define DELETE(CASE, NAMESPACE) \
@@ -154,4 +154,4 @@ infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc) {
     }
 
 #undef DELETE
-}
\ No newline at end of file
+}
diff --git a/src/infiniop/ops/fmin/cpu/fmin_cpu.h b/src/infiniop/ops/fmin/cpu/fmin_cpu.h
index f0010feea..e7619c5ba 100644
--- a/src/infiniop/ops/fmin/cpu/fmin_cpu.h
+++ b/src/infiniop/ops/fmin/cpu/fmin_cpu.h
@@ -12,12 +12,15 @@ typedef struct FminOp {
     template <typename T>
     T operator()(const T &a, const T &b) const {
         if constexpr (std::is_same_v<T, fp16_t> || std::is_same_v<T, bf16_t>) {
-            float a_f = utils::cast<float>(a);
-            float b_f = utils::cast<float>(b);
-            float result = std::fminf(a_f, b_f);
-            return utils::cast<T>(result);
-        } else {
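+            // fp16/bf16 have no native fmin on the CPU path: upcast to float,
+            // take fminf, and round once on the cast back to T.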
+            return utils::cast<T>(std::fminf(
+                utils::cast<float>(a),
+                utils::cast<float>(b)));
+        } else if constexpr (std::is_floating_point_v<T>) {
             return std::fmin(a, b);
+        } else {
+            return std::min(a, b);
         }
     }
 } FminOp;
diff --git a/src/infiniop/ops/fmin/cuda/kernel.cuh b/src/infiniop/ops/fmin/cuda/kernel.cuh
index fbe385302..2915488fc 100644
--- a/src/infiniop/ops/fmin/cuda/kernel.cuh
+++ b/src/infiniop/ops/fmin/cuda/kernel.cuh
@@ -5,28 +5,28 @@
 namespace op::fmin::cuda {
 
 typedef struct FminOp {
 public:
-  static constexpr size_t num_inputs = 2;
-  template <typename T>
-  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+    static constexpr size_t num_inputs = 2;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
 #if defined(ENABLE_ILUVATAR_API)
-    if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-      float a_f = __bfloat162float(a), b_f = __bfloat162float(b);
-      return __float2bfloat16(fminf(a_f, b_f));
-    } else if constexpr (std::is_same_v<T, half>) {
-      float a_f = __half2float(a), b_f = __half2float(b);
-      return __float2half(fminf(a_f, b_f));
-    }
+        if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            float a_f = __bfloat162float(a), b_f = __bfloat162float(b);
+            return __float2bfloat16(fminf(a_f, b_f));
+        } else if constexpr (std::is_same_v<T, half>) {
+            float a_f = __half2float(a), b_f = __half2float(b);
+            return __float2half(fminf(a_f, b_f));
+        }
 #elif defined(ENABLE_NVIDIA_API)
-    if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
-      return __hmin(a, b);
-    }
+        if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
+            return __hmin(a, b);
+        }
 #endif
-    if constexpr (std::is_same_v<T, float>) {
-      return fminf(a, b);
-    } else {
-      return a < b ? a : b;
+        if constexpr (std::is_same_v<T, float>) {
+            return fminf(a, b);
+        } else {
+            return a < b ? a : b;
+        }
     }
-  }
 } FminOp;
 
 } // namespace op::fmin::cuda
diff --git a/src/infiniop/ops/fmin/metax/fmin_metax.maca b/src/infiniop/ops/fmin/metax/fmin_metax.maca
index 049b3806c..cebe4178a 100644
--- a/src/infiniop/ops/fmin/metax/fmin_metax.maca
+++ b/src/infiniop/ops/fmin/metax/fmin_metax.maca
@@ -12,24 +12,26 @@ Descriptor::create(infiniopHandle_t handle_, Descriptor **desc_ptr,
                    infiniopTensorDescriptor_t out_desc,
                    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-  auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
-  auto dtype = out_desc->dtype();
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
 
-  const auto &a_desc = input_desc_vec.at(0);
-  const auto &b_desc = input_desc_vec.at(1);
-  const auto &c_shape = out_desc->shape();
-  const auto &a_shape = a_desc->shape();
-  const auto &b_shape = b_desc->shape();
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &b_desc = input_desc_vec.at(1);
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
 
-  CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
-              INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
+                INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
 
-  CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
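+    // Unlike asin, fmin also accepts I32/I64; integers carry no NaN semantics,
+    // so the kernel's fallback branch reduces to a plain min.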
 
-  // create CUDA elementwise descriptor
-  CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
 
-  return INFINI_STATUS_SUCCESS;
+    return INFINI_STATUS_SUCCESS;
 }
 
 infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
@@ -37,33 +39,33 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
                                      std::vector<const void *> inputs,
                                      void *stream) const {
-  if (workspace_size < _workspace_size) {
-    return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-  }
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
 
-  switch (_dtype) {
-  case INFINI_DTYPE_F16:
-    return _device_info->calculate<256, cuda::FminOp, half>(
-        _info, workspace, output, inputs, stream);
-  case INFINI_DTYPE_BF16:
-    return _device_info->calculate<256, cuda::FminOp, cuda_bfloat16>(
-        _info, workspace, output, inputs, stream);
-  case INFINI_DTYPE_F32:
-    return _device_info->calculate<256, cuda::FminOp, float>(
-        _info, workspace, output, inputs, stream);
-  case INFINI_DTYPE_I32:
-    return _device_info->calculate<256, cuda::FminOp, int32_t>(
-        _info, workspace, output, inputs, stream);
-  case INFINI_DTYPE_I64:
-    return _device_info->calculate<256, cuda::FminOp, int64_t>(
-        _info, workspace, output, inputs, stream);
-  case INFINI_DTYPE_F64:
-    return _device_info->calculate<256, cuda::FminOp, double>(
-        _info, workspace, output, inputs, stream);
-  default:
-    return INFINI_STATUS_BAD_TENSOR_DTYPE;
-  }
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::FminOp, half>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::FminOp, cuda_bfloat16>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::FminOp, float>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I32:
+        return _device_info->calculate<256, cuda::FminOp, int32_t>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I64:
+        return _device_info->calculate<256, cuda::FminOp, int64_t>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::FminOp, double>(
+            _info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
 
-  return INFINI_STATUS_SUCCESS;
+    return INFINI_STATUS_SUCCESS;
 }
 
 } // namespace op::fmin::metax
diff --git a/src/infiniop/ops/fmin/operator.cc b/src/infiniop/ops/fmin/operator.cc
index da17139b9..392052e69 100644
--- a/src/infiniop/ops/fmin/operator.cc
+++ b/src/infiniop/ops/fmin/operator.cc
@@ -11,17 +11,11 @@
 #ifdef ENABLE_METAX_API
 #include "metax/fmin_metax.h"
 #endif
-#ifdef ENABLE_KUNLUN_API
-#include "kunlun/fmin_kunlun.h"
-#endif
-#ifdef ENABLE_CAMBRICON_API
-#include "bang/fmin_bang.h"
-#endif
 #ifdef ENABLE_MOORE_API
 #include "moore/fmin_moore.h"
 #endif
 
-__C infiniStatus_t infiniopCreateFminDescriptor(
+__INFINI_C infiniStatus_t infiniopCreateFminDescriptor(
     infiniopHandle_t handle,
     infiniopFminDescriptor_t *desc_ptr,
     infiniopTensorDescriptor_t c_desc,
@@ -54,12 +48,6 @@ __C infiniStatus_t infiniopCreateFminDescriptor(
 #ifdef ENABLE_METAX_API
         CREATE(INFINI_DEVICE_METAX, metax);
 #endif
-#ifdef ENABLE_KUNLUN_API
-        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        CREATE(INFINI_DEVICE_CAMBRICON, bang);
-#endif
 #ifdef ENABLE_MOORE_API
         CREATE(INFINI_DEVICE_MOORE, moore);
 #endif
@@ -71,7 +59,7 @@ __C infiniStatus_t infiniopCreateFminDescriptor(
 
 #undef CREATE
 }
 
-__C infiniStatus_t infiniopGetFminWorkspaceSize(infiniopFminDescriptor_t desc, size_t *size) {
+__INFINI_C infiniStatus_t infiniopGetFminWorkspaceSize(infiniopFminDescriptor_t desc, size_t *size) {
 
 #define GET(CASE, NAMESPACE) \
     case CASE: \
@@ -94,12 +82,6 @@ __C infiniStatus_t infiniopGetFminWorkspaceSize(infiniopFminDescriptor_t desc, s
 #ifdef ENABLE_METAX_API
         GET(INFINI_DEVICE_METAX, metax);
 #endif
-#ifdef ENABLE_KUNLUN_API
-        GET(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        GET(INFINI_DEVICE_CAMBRICON, bang);
-#endif
 #ifdef ENABLE_MOORE_API
         GET(INFINI_DEVICE_MOORE, moore);
 #endif
@@ -111,7 +93,7 @@ __C infiniStatus_t infiniopGetFminWorkspaceSize(infiniopFminDescriptor_t desc, s
     return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
 }
 
-__C infiniStatus_t infiniopFmin(
+__INFINI_C infiniStatus_t infiniopFmin(
     infiniopFminDescriptor_t desc,
     void *workspace,
     size_t workspace_size,
@@ -142,12 +124,6 @@ __C infiniStatus_t infiniopFmin(
 #ifdef ENABLE_METAX_API
         CALCULATE(INFINI_DEVICE_METAX, metax);
 #endif
-#ifdef ENABLE_KUNLUN_API
-        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
-#endif
 #ifdef ENABLE_MOORE_API
         CALCULATE(INFINI_DEVICE_MOORE, moore);
 #endif
@@ -159,7 +135,7 @@ __C infiniStatus_t infiniopFmin(
 
 #undef CALCULATE
 }
 
-__C infiniStatus_t
+__INFINI_C infiniStatus_t
 infiniopDestroyFminDescriptor(infiniopFminDescriptor_t desc) {
 
 #define DELETE(CASE, NAMESPACE) \
@@ -184,12 +160,6 @@ infiniopDestroyFminDescriptor(infiniopFminDescriptor_t desc) {
 #ifdef ENABLE_METAX_API
         DELETE(INFINI_DEVICE_METAX, metax);
 #endif
-#ifdef ENABLE_KUNLUN_API
-        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
-#endif
-#ifdef ENABLE_CAMBRICON_API
-        DELETE(INFINI_DEVICE_CAMBRICON, bang);
-#endif
 #ifdef ENABLE_MOORE_API
         DELETE(INFINI_DEVICE_MOORE, moore);
 #endif
@@ -199,4 +169,4 @@ infiniopDestroyFminDescriptor(infiniopFminDescriptor_t desc) {
     }
 
 #undef DELETE
-}
\ No newline at end of file
+}