From a199afe4f0392eacdb96a136ecefbf05712a4e8b Mon Sep 17 00:00:00 2001 From: JBFYS-XD Date: Sun, 30 Nov 2025 18:32:11 +0800 Subject: [PATCH 01/12] feature: Added operator support to match torch.cat --- include/infinicore/ops/cat.hpp | 9 +++ python/infinicore/__init__.py | 2 + python/infinicore/ops/cat.py | 12 ++++ src/infinicore/ops/cat/cat.cc | 104 ++++++++++++++++++++++++++++ src/infinicore/pybind11/ops.hpp | 2 + src/infinicore/pybind11/ops/cat.hpp | 26 +++++++ test/infinicore/ops/cat.py | 6 +- 7 files changed, 158 insertions(+), 3 deletions(-) create mode 100644 include/infinicore/ops/cat.hpp create mode 100644 python/infinicore/ops/cat.py create mode 100644 src/infinicore/ops/cat/cat.cc create mode 100644 src/infinicore/pybind11/ops/cat.hpp diff --git a/include/infinicore/ops/cat.hpp b/include/infinicore/ops/cat.hpp new file mode 100644 index 000000000..aa7dafd6c --- /dev/null +++ b/include/infinicore/ops/cat.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "common/op.hpp" + +namespace infinicore::op { + +Tensor cat(std::vector tensors, int dim); +void cat_(std::vector tensors, int dim, Tensor out); +} // namespace infinicore::op \ No newline at end of file diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index 06294bf3e..cdd04a0a8 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -40,6 +40,7 @@ ) from infinicore.ops.add import add from infinicore.ops.attention import attention +from infinicore.ops.cat import cat from infinicore.ops.matmul import matmul from infinicore.ops.mul import mul from infinicore.ops.narrow import narrow @@ -99,6 +100,7 @@ # Operations. 
"add", "attention", + "cat", "matmul", "mul", "narrow", diff --git a/python/infinicore/ops/cat.py b/python/infinicore/ops/cat.py new file mode 100644 index 000000000..a51e23654 --- /dev/null +++ b/python/infinicore/ops/cat.py @@ -0,0 +1,12 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def cat(tensors, dim:int=0, out=None): + print(type(tensors)) + if out is None: + return Tensor(_infinicore.cat([tensor._underlying for tensor in tensors], dim)) + + _infinicore.cat_([tensor._underlying for tensor in tensors], dim, out._underlying) + + return out diff --git a/src/infinicore/ops/cat/cat.cc b/src/infinicore/ops/cat/cat.cc new file mode 100644 index 000000000..d98a5e16d --- /dev/null +++ b/src/infinicore/ops/cat/cat.cc @@ -0,0 +1,104 @@ +#include "infinicore/ops/cat.hpp" +#include "infinicore/context/context.hpp" +#include + +namespace infinicore::op { + +namespace { + + void low_dim_copy(Tensor& tensor, int dim, Tensor& out, std::byte* tensor_ptr, std::byte* out_ptr, int depth) { + if (depth != out->ndim() - 1) { + std::byte* now_tensor_ptr = tensor_ptr; + std::byte* now_out_ptr = out_ptr; + for (int i = 0; i < tensor->shape()[depth]; i ++) { + low_dim_copy(tensor, dim, out, now_tensor_ptr, now_out_ptr, depth + 1); + now_tensor_ptr += tensor->stride(depth) * dsize(tensor->dtype()); + now_out_ptr += out->stride(depth) * dsize(out->dtype()); + } + } else { + Size data_size = tensor->shape()[depth] * dsize(out->dtype()); + if (out->device().getType() == Device::Type::CPU) + std::memcpy(out_ptr, tensor_ptr, data_size); + else + context::memcpyD2D(out_ptr, tensor_ptr, data_size); + } + } + + void high_dim_split(std::vector& tensors, int dim, Tensor& out, std::vector tensors_ptr, std::byte* out_ptr, int depth) { + if (depth != dim) { + std::vector now_tensors_ptr = tensors_ptr; + std::byte* now_out_ptr = out_ptr; + for (int i = 0; i < out->shape()[depth]; i ++) { + high_dim_split(tensors, dim, out, now_tensors_ptr, now_out_ptr, depth + 
1); + for (int i = 0; i < tensors.size(); i ++) { + if (tensors[i]->ndim() == 1) continue; + now_tensors_ptr[i] += tensors[i]->stride(depth) * dsize(tensors[i]->dtype()); + } + now_out_ptr += out->stride(depth) * dsize(out->dtype()); + } + } else { + std::byte* now_out_ptr = out_ptr; + for (int i = 0; i < tensors.size(); i ++) { + if (tensors[i]->ndim() == 1) continue; + low_dim_copy(tensors[i], dim, out, tensors_ptr[i], now_out_ptr, depth); + now_out_ptr += tensors[i]->shape()[depth] * out->stride(depth) * dsize(out->dtype()); + } + } + } + +} // namespace + +Tensor cat(std::vector tensors, int dim) { + assert(tensors.size() >= 2); + int ndim = tensors[0]->ndim(); + assert(-ndim <= dim && dim < ndim); + dim = (dim + ndim) % ndim; + + Shape shape = tensors[0]->shape(); + for (int i = 1; i < tensors.size(); i ++) { + assert(tensors[i]->ndim() == ndim || tensors[i]->ndim() == 1); + if (tensors[i]->ndim() != ndim) continue; + shape[dim] += tensors[i]->shape()[dim]; + } + + auto out = Tensor::empty(shape, tensors[0]->dtype(), tensors[0]->device()); + cat_(tensors, dim, out); + return out; +} + +void cat_(std::vector tensors, int dim, Tensor out) { + assert(tensors.size() >= 2); + int ndim = out->ndim(); + assert(-ndim <= dim && dim < ndim); + dim = (dim + ndim) % ndim; + + + Size dim_shape = 0; + for (auto& tensor : tensors) { + assert(tensor->ndim() == ndim || tensor->ndim() == 1); + if (tensor->ndim() == 1) { + assert(tensor->shape()[0] == 0); + continue; + } + for (int i = 0; i < ndim; i ++) { + if (i != dim) { + assert(tensor->shape()[i] == out->shape()[i]); + } else { + dim_shape += tensor->shape()[i]; + } + } + } + assert(dim_shape == out->shape()[dim]); + + std::vector tensors_ptr(tensors.size()); + + for (int i = 0; i < tensors.size(); i ++) { + tensors_ptr[i] = tensors[i]->data(); + } + std::byte* out_ptr = out->data(); + + high_dim_split(tensors, dim, out, tensors_ptr, out_ptr, 0); + +} + +} // namespace infinicore::op \ No newline at end of file diff
--git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 978defa17..dee2e88f0 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -4,6 +4,7 @@ #include "ops/add.hpp" #include "ops/attention.hpp" +#include "ops/cat.hpp" #include "ops/causal_softmax.hpp" #include "ops/embedding.hpp" #include "ops/linear.hpp" @@ -23,6 +24,7 @@ namespace infinicore::ops { inline void bind(py::module &m) { bind_add(m); bind_attention(m); + bind_cat(m); bind_causal_softmax(m); bind_random_sample(m); bind_linear(m); diff --git a/src/infinicore/pybind11/ops/cat.hpp b/src/infinicore/pybind11/ops/cat.hpp new file mode 100644 index 000000000..8dd06bd88 --- /dev/null +++ b/src/infinicore/pybind11/ops/cat.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include "infinicore/ops/cat.hpp" +#include + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_cat(py::module &m) { + + m.def("cat", + &op::cat, + py::arg("tensors"), + py::arg("dim") = 0, + R"doc(opertor: torch.cat, out-of-place mode)doc"); + + m.def("cat_", + &op::cat_, + py::arg("tensors"), + py::arg("dim") = 0, + py::arg("out"), + R"doc(opertor: torch.cat, in-place mode)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/test/infinicore/ops/cat.py b/test/infinicore/ops/cat.py index 669fbfd1b..e9b85f2e5 100644 --- a/test/infinicore/ops/cat.py +++ b/test/infinicore/ops/cat.py @@ -126,9 +126,9 @@ def torch_operator(self, *args, **kwargs): """PyTorch cat implementation""" return torch.cat(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore cat implementation""" - # return infinicore.cat(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore cat implementation""" + return infinicore.cat(*args, **kwargs) def main(): From 49dfcdf2f321c028d9e285079cbe24d74c3de2b0 Mon Sep 17 00:00:00 2001 From: JBFYS-XD Date: Wed, 3 Dec 2025 22:04:52 +0800 Subject: [PATCH 02/12] feature: Added 
operator support to match torch.masked_select on device CPU, nvidia --- include/infinicore/ops/masked_select.hpp | 16 ++ include/infiniop.h | 1 + include/infiniop/ops/masked_select.h | 28 +++ python/infinicore/__init__.py | 2 + python/infinicore/ops/masked_select.py | 5 + .../ops/masked_select/masked_select.cc | 33 +++ .../masked_select/masked_select_infiniop.cc | 59 +++++ src/infinicore/pybind11/ops.hpp | 2 + src/infinicore/pybind11/ops/masked_select.hpp | 20 ++ .../masked_select/cpu/masked_select_cpu.cc | 56 +++++ .../ops/masked_select/cpu/masked_select_cpu.h | 7 + .../ops/masked_select/cuda/kernel.cuh | 102 +++++++++ src/infiniop/ops/masked_select/info.h | 53 +++++ .../ops/masked_select/masked_select.h | 48 ++++ .../nvidia/masked_select_nvidia.cu | 154 +++++++++++++ .../nvidia/masked_select_nvidia.cuh | 8 + src/infiniop/ops/masked_select/operator.cc | 207 ++++++++++++++++++ test/infinicore/ops/masked_select.py | 7 +- 18 files changed, 805 insertions(+), 3 deletions(-) create mode 100644 include/infinicore/ops/masked_select.hpp create mode 100644 include/infiniop/ops/masked_select.h create mode 100644 python/infinicore/ops/masked_select.py create mode 100644 src/infinicore/ops/masked_select/masked_select.cc create mode 100644 src/infinicore/ops/masked_select/masked_select_infiniop.cc create mode 100644 src/infinicore/pybind11/ops/masked_select.hpp create mode 100644 src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc create mode 100644 src/infiniop/ops/masked_select/cpu/masked_select_cpu.h create mode 100644 src/infiniop/ops/masked_select/cuda/kernel.cuh create mode 100644 src/infiniop/ops/masked_select/info.h create mode 100644 src/infiniop/ops/masked_select/masked_select.h create mode 100644 src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu create mode 100644 src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh create mode 100644 src/infiniop/ops/masked_select/operator.cc diff --git a/include/infinicore/ops/masked_select.hpp 
b/include/infinicore/ops/masked_select.hpp new file mode 100644 index 000000000..89bbfa49d --- /dev/null +++ b/include/infinicore/ops/masked_select.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class MaskedSelect { +public: + using schema = void (*)(Tensor, Tensor, void **, size_t *); + static void execute(Tensor input, Tensor mask, void **data_ptr, size_t *dlen_ptr); + static common::OpDispatcher &dispatcher(); +}; + +Tensor masked_select(Tensor input, Tensor mask); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infiniop.h b/include/infiniop.h index 92e6f5963..21f6064c9 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -13,6 +13,7 @@ #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" +#include "infiniop/ops/masked_select.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/random_sample.h" diff --git a/include/infiniop/ops/masked_select.h b/include/infiniop/ops/masked_select.h new file mode 100644 index 000000000..7adf65415 --- /dev/null +++ b/include/infiniop/ops/masked_select.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_MASKED_SELECT_API_H__ +#define __INFINIOP_MASKED_SELECT_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMaskedSelectDescriptor_t; + +__C __export infiniStatus_t infiniopCreateMaskedSelectDescriptor( + infiniopHandle_t handle, + infiniopMaskedSelectDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc); + +__C __export infiniStatus_t infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMaskedSelect( + infiniopMaskedSelectDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen, + void *stream); + 
+__C __export infiniStatus_t infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index cdd04a0a8..2a09da310 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -41,6 +41,7 @@ from infinicore.ops.add import add from infinicore.ops.attention import attention from infinicore.ops.cat import cat +from infinicore.ops.masked_select import masked_select from infinicore.ops.matmul import matmul from infinicore.ops.mul import mul from infinicore.ops.narrow import narrow @@ -101,6 +102,7 @@ "add", "attention", "cat", + "masked_select", "matmul", "mul", "narrow", diff --git a/python/infinicore/ops/masked_select.py b/python/infinicore/ops/masked_select.py new file mode 100644 index 000000000..d1846edfc --- /dev/null +++ b/python/infinicore/ops/masked_select.py @@ -0,0 +1,5 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def masked_select(input, mask): + return Tensor(_infinicore.masked_select(input._underlying, mask._underlying)) \ No newline at end of file diff --git a/src/infinicore/ops/masked_select/masked_select.cc b/src/infinicore/ops/masked_select/masked_select.cc new file mode 100644 index 000000000..13ac6dfe4 --- /dev/null +++ b/src/infinicore/ops/masked_select/masked_select.cc @@ -0,0 +1,33 @@ +#include +#include + +namespace infinicore::op { + +common::OpDispatcher &MaskedSelect::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void MaskedSelect::execute(Tensor input, Tensor mask, void **data_ptr, size_t *dlen_ptr) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No MaskedSelect implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(input, mask, data_ptr, dlen_ptr); +} + +Tensor 
masked_select(Tensor input, Tensor mask) { + + std::byte *data; + size_t dlen; + MaskedSelect::execute(input, mask, (void **)&data, &dlen); + + auto out = Tensor::from_blob(data, {dlen}, input->dtype(), input->device()); + + return out; +} + +} diff --git a/src/infinicore/ops/masked_select/masked_select_infiniop.cc b/src/infinicore/ops/masked_select/masked_select_infiniop.cc new file mode 100644 index 000000000..352ef0b8f --- /dev/null +++ b/src/infinicore/ops/masked_select/masked_select_infiniop.cc @@ -0,0 +1,59 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/masked_select.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::masked_select_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopMaskedSelectDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyMaskedSelectDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor input, Tensor mask, void **data_ptr, size_t *dlen_ptr) { + size_t seed = hash_combine(input, mask, (std::uintptr_t)data_ptr, (std::uintptr_t)dlen_ptr); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopMaskedSelectDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateMaskedSelectDescriptor( + context::getInfiniopHandle(input->device()), &desc, + input->desc(), mask->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetMaskedSelectWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + + INFINICORE_CHECK_ERROR(infiniopMaskedSelect( + desc, workspace->data(), workspace_size, + input->data(), (const bool *)mask->data(), data_ptr, dlen_ptr, 
context::getStream())); +} + +static bool registered = []() { + MaskedSelect::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA + // Device::Type::METAX, + // Device::Type::MOORE, + // Device::Type::ILUVATAR + }, &calculate, false); + return true; +}(); + +} diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index dee2e88f0..e906ff009 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -8,6 +8,7 @@ #include "ops/causal_softmax.hpp" #include "ops/embedding.hpp" #include "ops/linear.hpp" +#include "ops/masked_select.hpp" #include "ops/matmul.hpp" #include "ops/mul.hpp" #include "ops/random_sample.hpp" @@ -28,6 +29,7 @@ inline void bind(py::module &m) { bind_causal_softmax(m); bind_random_sample(m); bind_linear(m); + bind_masked_select(m); bind_matmul(m); bind_mul(m); bind_rearrange(m); diff --git a/src/infinicore/pybind11/ops/masked_select.hpp b/src/infinicore/pybind11/ops/masked_select.hpp new file mode 100644 index 000000000..2734590e6 --- /dev/null +++ b/src/infinicore/pybind11/ops/masked_select.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include "infinicore/ops/masked_select.hpp" +#include + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_masked_select(py::module &m) { + + m.def("masked_select", + &op::masked_select, + py::arg("input"), + py::arg("mask"), + R"doc(opertor: torch.masked_select, out-of-place mode)doc"); + +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc new file mode 100644 index 000000000..af5a74683 --- /dev/null +++ b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc @@ -0,0 +1,56 @@ +#include "masked_select_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::masked_select::cpu { + +Descriptor::~Descriptor() {} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, 
+ Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + template + infiniStatus_t maskedSelect(const MaskedSelectInfo *info, T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr) { + std::vector res; + for (size_t index = 0; index < info->total_elements; index ++) { + size_t input_offset = op::common_cpu::indexToOffset(index, info->ndim, info->shape.data(), info->input_strides.data()); + size_t mask_offset = op::common_cpu::indexToOffset(index, info->ndim, info->shape.data(), info->mask_strides.data()); + if (mask[mask_offset]) { + res.push_back(input[input_offset]); + } + } + *dlen_ptr = res.size(); + *data_ptr = new T[*dlen_ptr]; + std::memcpy(*data_ptr, res.data(), sizeof(T) * (*dlen_ptr)); + + return INFINI_STATUS_SUCCESS; + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen_ptr, + void *stream) const { + + if (_info.dtype == INFINI_DTYPE_F32) { + CHECK_STATUS(maskedSelect(&_info, (float*)input, (bool*)mask, data_ptr, dlen_ptr)); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h new file mode 100644 index 000000000..9fed9e2e6 --- /dev/null +++ b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h @@ -0,0 +1,7 @@ +#ifndef __MASKED_SELECT_CPU_H__ +#define __MASKED_SELECT_CPU_H__ +#include "../masked_select.h" + +DESCRIPTOR(cpu) + +#endif // __MASKED_SELECT_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/cuda/kernel.cuh 
b/src/infiniop/ops/masked_select/cuda/kernel.cuh new file mode 100644 index 000000000..bd4e4c655 --- /dev/null +++ b/src/infiniop/ops/masked_select/cuda/kernel.cuh @@ -0,0 +1,102 @@ +#ifndef __MASKED_SELECT_KERNEL_CUH__ +#define __MASKED_SELECT_KERNEL_CUH__ +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +template +INFINIOP_CUDA_KERNEL maskedSelectGetMarkScanOnceKernel( + const bool *mask, size_t *mark_scan, size_t total_elements, + size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < total_elements) { + size_t mask_offset = device::nvidia::indexToOffset(index, ndim, shape, mask_strides); + smem[tid] = mask[mask_offset]; + } else { + smem[tid] = 0; + } + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + size_t temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_CUDA_KERNEL maskedSelectScanWithStrideKernel( + size_t *mark_scan, size_t total_elements, size_t stride) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = (blockDim.x * blockIdx.x + threadIdx.x + 1) * stride - 1; + smem[tid] = index < total_elements ?
mark_scan[index] : 0.; + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + size_t temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_CUDA_KERNEL maskedSelectCountScanResultKernel( + size_t *mark_scan, size_t *scan_result, size_t total_elements) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= total_elements) + return; + + size_t val = mark_scan[index]; + + for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { + size_t now_stride_block = (index + 1) / s; + size_t pre_stride_block = now_stride_block * s - 1; + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + continue; + val += mark_scan[pre_stride_block]; + } + + scan_result[index] = val; +} + +template +INFINIOP_CUDA_KERNEL maskedSelectGetDataKernel( + const T *input, const bool *mask, size_t *scan_result, T *data, size_t total_elements, + size_t *shape, ptrdiff_t *input_strides, ptrdiff_t *mask_strides, size_t ndim) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + + if (index >= total_elements) + return; + + size_t input_offset = device::nvidia::indexToOffset(index, ndim, shape, input_strides); + size_t mask_offset = device::nvidia::indexToOffset(index, ndim, shape, mask_strides); + + if (mask[mask_offset]) { + data[scan_result[index] - 1] = input[input_offset]; + } +} + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/info.h b/src/infiniop/ops/masked_select/info.h new file mode 100644 index 000000000..889b32821 --- /dev/null +++ b/src/infiniop/ops/masked_select/info.h @@ -0,0 +1,53 @@ +#ifndef __MASKED_SELECT__INFO_H__ +#define __MASKED_SELECT__INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::masked_select { + +class MaskedSelectInfo { + MaskedSelectInfo() = 
default; + +public: + infiniDtype_t dtype; + size_t ndim; + size_t total_elements; + + std::vector shape; + + std::vector input_strides; + std::vector mask_strides; + + static utils::Result create(infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t mask_desc) { + auto dtype = input_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F32); + CHECK_DTYPE(mask_desc->dtype(), INFINI_DTYPE_BOOL); + + auto shape = input_desc->shape(); + CHECK_SAME_SHAPE(shape, mask_desc->shape()); + + auto ndim = input_desc->ndim(); + + size_t total_elements = 1; + for (auto& dim : shape) { + total_elements *= dim; + } + + auto input_strides = input_desc->strides(); + auto mask_strides = mask_desc->strides(); + + return utils::Result(MaskedSelectInfo{ + dtype, + ndim, + total_elements, + shape, + input_strides, + mask_strides}); + } +}; + +} // namespace op::masked_select + +#endif // __MASKED_SELECT_INFO_H__ diff --git a/src/infiniop/ops/masked_select/masked_select.h b/src/infiniop/ops/masked_select/masked_select.h new file mode 100644 index 000000000..98148ebaf --- /dev/null +++ b/src/infiniop/ops/masked_select/masked_select.h @@ -0,0 +1,48 @@ +#ifndef MASKED_SELECT_H +#define MASKED_SELECT_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::masked_select::NAMESPACE { \ + class Descriptor final: public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + MaskedSelectInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + MaskedSelectInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t input_desc, \ + 
infiniopTensorDescriptor_t mask_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + const void *input, \ + const bool *mask, \ + void **data_ptr, \ + size_t *dlen_ptr, \ + void *stream) const; \ + }; \ + } + +#endif // MASKED_SELECT_H \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu new file mode 100644 index 000000000..8dab98b41 --- /dev/null +++ b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu @@ -0,0 +1,154 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "masked_select_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../cuda/kernel.cuh" + +namespace op::masked_select::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // shape size + workspace_size += info.ndim * sizeof(size_t); + // strides size * 2 + workspace_size += info.ndim * sizeof(ptrdiff_t) * 2; + // size_t mark_scan + workspace_size += info.total_elements * sizeof(size_t); + // size_t mark_result + workspace_size += info.total_elements * sizeof(size_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const MaskedSelectInfo &info, + const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, + cudaStream_t stream, void *workspace, size_t 
workspace_size) { + + size_t ndim = info.ndim; + size_t total_elements = info.total_elements; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *shape_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(size_t); + + ptrdiff_t *input_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + ptrdiff_t *mask_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + size_t *mark_scan = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + size_t *scan_result = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + assert(workspace_offset == workspace_size); + + CHECK_CUDA(cudaMemcpyAsync(shape_cuda, info.shape.data(), ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(mask_strides_cuda, info.mask_strides.data(), ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + block_size - 1) / block_size; + + maskedSelectGetMarkScanOnceKernel<<>>( + mask, mark_scan, total_elements, + shape_cuda, mask_strides_cuda, ndim); + CHECK_CUDA(cudaDeviceSynchronize()); + + for (size_t stride = BLOCK_SIZE; stride < total_elements; stride *= BLOCK_SIZE) { + size_t stride_elements = (total_elements + stride - 1) / stride; + size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + maskedSelectScanWithStrideKernel<<>>( + mark_scan, total_elements, stride); + CHECK_CUDA(cudaDeviceSynchronize()); + } + if (total_elements > BLOCK_SIZE) { + maskedSelectCountScanResultKernel<<>>( + mark_scan, 
scan_result, total_elements); + CHECK_CUDA(cudaDeviceSynchronize()); + } else { + scan_result = mark_scan; + } + + CHECK_CUDA(cudaMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), cudaMemcpyDeviceToHost, stream)); + + CHECK_CUDA(cudaMallocAsync(data_ptr, *dlen_ptr * sizeof(T), stream)); + + maskedSelectGetDataKernel<<>>( + input, mask, scan_result, (T *)*data_ptr, total_elements, + shape_cuda, input_strides_cuda, mask_strides_cuda, ndim); + CHECK_CUDA(cudaDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen_ptr, + void *stream_) const { + + cudaStream_t stream = (cudaStream_t)stream_; +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} diff --git a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh new file mode 100644 index 000000000..bc493edbe --- /dev/null +++ b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh @@ -0,0 
+1,8 @@ +#ifndef __MASKED_SELECT_NVIDIA_H__ +#define __MASKED_SELECT_NVIDIA_H__ + +#include "../masked_select.h" + +DESCRIPTOR(nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/operator.cc b/src/infiniop/ops/masked_select/operator.cc new file mode 100644 index 000000000..bdd40972a --- /dev/null +++ b/src/infiniop/ops/masked_select/operator.cc @@ -0,0 +1,207 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/masked_select.h" + +#ifdef ENABLE_CPU_API +#include "cpu/masked_select_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/masked_select_nvidia.cuh" +#endif +// #ifdef ENABLE_METAX_API +// #include "metax/masked_select_metax.h" +// #endif +// #ifdef ENABLE_ASCEND_API +// #include "ascend/masked_select_ascend.h" +// #endif +// #ifdef ENABLE_CAMBRICON_API +// #include "bang/masked_select_bang.h" +// #endif +// #ifdef ENABLE_KUNLUN_API +// #include "kunlun/masked_select_kunlun.h" +// #endif +// #ifdef ENABLE_MOORE_API +// #include "moore/masked_select_moore.h" +// #endif + +__C infiniStatus_t infiniopCreateMaskedSelectDescriptor( + infiniopHandle_t handle, + infiniopMaskedSelectDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::masked_select::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr),\ + input_desc, \ + mask_desc); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +// #ifdef ENABLE_ILUVATAR_API +// CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +// #endif +// #ifdef ENABLE_QY_API +// CREATE(INFINI_DEVICE_QY, nvidia); +// #endif +// #ifdef ENABLE_HYGON_API +// CREATE(INFINI_DEVICE_HYGON, nvidia); +// #endif +// #ifdef ENABLE_CAMBRICON_API 
+// CREATE(INFINI_DEVICE_CAMBRICON, bang) +// #endif +// #ifdef ENABLE_METAX_API +// CREATE(INFINI_DEVICE_METAX, metax) +// #endif +// #ifdef ENABLE_ASCEND_API +// CREATE(INFINI_DEVICE_ASCEND, ascend) +// #endif +// #ifdef ENABLE_KUNLUN_API +// CREATE(INFINI_DEVICE_KUNLUN, kunlun) +// #endif +// #ifdef ENABLE_MOORE_API +// CREATE(INFINI_DEVICE_MOORE, moore) +// #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +// #ifdef ENABLE_ILUVATAR_API +// GET(INFINI_DEVICE_ILUVATAR, nvidia); +// #endif +// #ifdef ENABLE_QY_API +// GET(INFINI_DEVICE_QY, nvidia); +// #endif +// #ifdef ENABLE_HYGON_API +// GET(INFINI_DEVICE_HYGON, nvidia); +// #endif +// #ifdef ENABLE_METAX_API +// GET(INFINI_DEVICE_METAX, metax) +// #endif +// #ifdef ENABLE_ASCEND_API +// GET(INFINI_DEVICE_ASCEND, ascend) +// #endif +// #ifdef ENABLE_CAMBRICON_API +// GET(INFINI_DEVICE_CAMBRICON, bang) +// #endif +// #ifdef ENABLE_KUNLUN_API +// GET(INFINI_DEVICE_KUNLUN, kunlun) +// #endif +// #ifdef ENABLE_MOORE_API +// GET(INFINI_DEVICE_MOORE, moore) +// #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopMaskedSelect( + infiniopMaskedSelectDescriptor_t desc, + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_size, + size_t *dlen, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, input, mask, data_size, dlen, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) 
+#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +// #ifdef ENABLE_ILUVATAR_API +// CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +// #endif +// #ifdef ENABLE_QY_API +// CALCULATE(INFINI_DEVICE_QY, nvidia); +// #endif +// #ifdef ENABLE_HYGON_API +// CALCULATE(INFINI_DEVICE_HYGON, nvidia); +// #endif +// #ifdef ENABLE_CAMBRICON_API +// CALCULATE(INFINI_DEVICE_CAMBRICON, bang) +// #endif +// #ifdef ENABLE_METAX_API +// CALCULATE(INFINI_DEVICE_METAX, metax) +// #endif +// #ifdef ENABLE_ASCEND_API +// CALCULATE(INFINI_DEVICE_ASCEND, ascend) +// #endif +// #ifdef ENABLE_KUNLUN_API +// CALCULATE(INFINI_DEVICE_KUNLUN, kunlun) +// #endif +// #ifdef ENABLE_MOORE_API +// CALCULATE(INFINI_DEVICE_MOORE, moore) +// #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +// #ifdef ENABLE_ILUVATAR_API +// DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +// #endif +// #ifdef ENABLE_QY_API +// DESTROY(INFINI_DEVICE_QY, nvidia); +// #endif +// #ifdef ENABLE_HYGON_API +// DESTROY(INFINI_DEVICE_HYGON, nvidia); +// #endif +// #ifdef ENABLE_CAMBRICON_API +// DESTROY(INFINI_DEVICE_CAMBRICON, bang) +// #endif +// #ifdef ENABLE_METAX_API +// DESTROY(INFINI_DEVICE_METAX, metax) +// #endif +// #ifdef ENABLE_ASCEND_API +// DESTROY(INFINI_DEVICE_ASCEND, ascend) +// #endif +// #ifdef ENABLE_KUNLUN_API +// DESTROY(INFINI_DEVICE_KUNLUN, kunlun) +// #endif +// #ifdef ENABLE_MOORE_API +// DESTROY(INFINI_DEVICE_MOORE, moore) +// #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/test/infinicore/ops/masked_select.py b/test/infinicore/ops/masked_select.py 
index 711a0cef0..94ca8d02c 100644 --- a/test/infinicore/ops/masked_select.py +++ b/test/infinicore/ops/masked_select.py @@ -15,6 +15,7 @@ ((1, 6), None, (1, 6)), ((4, 4), None, (4, 4)), ((2, 3, 2), None, (2, 3, 2)), + ((100, 100, 100), None, (100, 100, 100)) ] _TOLERANCE_MAP = {infinicore.float32: {"atol": 1e-5, "rtol": 1e-4}} @@ -53,9 +54,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.masked_select(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.masked_select(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + return infinicore.masked_select(*args, **kwargs) def main(): From ffcecd23096d93f689ac9f114a73bdb9d27dbec7 Mon Sep 17 00:00:00 2001 From: JBFYS-XD Date: Fri, 5 Dec 2025 22:40:13 +0800 Subject: [PATCH 03/12] feature: Added operator support to match torch.inner on device CPU, nvidia --- include/infinicore/ops/inner.hpp | 17 ++ include/infiniop.h | 1 + include/infiniop/ops/inner.h | 28 +++ python/infinicore/__init__.py | 2 + python/infinicore/ops/cat.py | 1 - python/infinicore/ops/inner.py | 11 + src/infinicore/ops/cat/cat.cc | 4 +- src/infinicore/ops/inner/inner.cc | 46 ++++ src/infinicore/ops/inner/inner_infiniop.cc | 59 +++++ src/infinicore/pybind11/ops.hpp | 2 + src/infinicore/pybind11/ops/inner.hpp | 26 +++ src/infiniop/ops/inner/cpu/inner_cpu.cc | 88 ++++++++ src/infiniop/ops/inner/cpu/inner_cpu.h | 7 + src/infiniop/ops/inner/cuda/kernel.cuh | 42 ++++ src/infiniop/ops/inner/info.h | 88 ++++++++ src/infiniop/ops/inner/inner.h | 49 +++++ src/infiniop/ops/inner/nvidia/inner_nvidia.cu | 128 +++++++++++ .../ops/inner/nvidia/inner_nvidia.cuh | 7 + src/infiniop/ops/inner/operator.cc | 208 ++++++++++++++++++ .../masked_select/cpu/masked_select_cpu.cc | 32 +-- .../nvidia/masked_select_nvidia.cu | 2 +- test/infinicore/ops/inner.py | 6 +-
test/infinicore/ops/masked_select.py | 1 - 23 files changed, 832 insertions(+), 23 deletions(-) create mode 100644 include/infinicore/ops/inner.hpp create mode 100644 include/infiniop/ops/inner.h create mode 100644 python/infinicore/ops/inner.py create mode 100644 src/infinicore/ops/inner/inner.cc create mode 100644 src/infinicore/ops/inner/inner_infiniop.cc create mode 100644 src/infinicore/pybind11/ops/inner.hpp create mode 100644 src/infiniop/ops/inner/cpu/inner_cpu.cc create mode 100644 src/infiniop/ops/inner/cpu/inner_cpu.h create mode 100644 src/infiniop/ops/inner/cuda/kernel.cuh create mode 100644 src/infiniop/ops/inner/info.h create mode 100644 src/infiniop/ops/inner/inner.h create mode 100644 src/infiniop/ops/inner/nvidia/inner_nvidia.cu create mode 100644 src/infiniop/ops/inner/nvidia/inner_nvidia.cuh create mode 100644 src/infiniop/ops/inner/operator.cc diff --git a/include/infinicore/ops/inner.hpp b/include/infinicore/ops/inner.hpp new file mode 100644 index 000000000..87bdd2919 --- /dev/null +++ b/include/infinicore/ops/inner.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Inner { +public: + using schema = void (*)(Tensor, Tensor, Tensor); + static void execute(Tensor input, Tensor other, Tensor out); + static common::OpDispatcher &dispatcher(); +}; + +Tensor inner(Tensor input, Tensor other); +void inner_(Tensor input, Tensor other, Tensor out); + +} \ No newline at end of file diff --git a/include/infiniop.h b/include/infiniop.h index 21f6064c9..f394dbc28 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -10,6 +10,7 @@ #include "infiniop/ops/dequantize_awq.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" +#include "infiniop/ops/inner.h" #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" diff --git a/include/infiniop/ops/inner.h b/include/infiniop/ops/inner.h new file mode 100644 index 
000000000..14dff5a8a --- /dev/null +++ b/include/infiniop/ops/inner.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_INNER_API_H__ +#define __INFINIOP_INNER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopInnerDescriptor_t; + +__C __export infiniStatus_t infiniopCreateInnerDescriptor( + infiniopHandle_t handle, + infiniopInnerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc, + infiniopTensorDescriptor_t out_desc); + +__C __export infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopInner( + infiniopInnerDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *input, + const void *other, + void *out, + void *stream); + +__C __export infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index 2a09da310..86b2e43ff 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -41,6 +41,7 @@ from infinicore.ops.add import add from infinicore.ops.attention import attention from infinicore.ops.cat import cat +from infinicore.ops.inner import inner from infinicore.ops.masked_select import masked_select from infinicore.ops.matmul import matmul from infinicore.ops.mul import mul @@ -102,6 +103,7 @@ "add", "attention", "cat", + "inner", "masked_select", "matmul", "mul", diff --git a/python/infinicore/ops/cat.py b/python/infinicore/ops/cat.py index a51e23654..5fc7536a9 100644 --- a/python/infinicore/ops/cat.py +++ b/python/infinicore/ops/cat.py @@ -3,7 +3,6 @@ def cat(tensors, dim:int=0, out=None): - print(type(tensors)) if out is None: return Tensor(_infinicore.cat([tensor._underlying for tensor in tensors], dim)) diff --git a/python/infinicore/ops/inner.py b/python/infinicore/ops/inner.py new file mode 100644 index 
000000000..09d456177 --- /dev/null +++ b/python/infinicore/ops/inner.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def inner(input, other, out=None): + if out is None: + return Tensor(_infinicore.inner(input._underlying, other._underlying)) + + _infinicore.inner_(input._underlying, other._underlying, out._underlying) + + return out diff --git a/src/infinicore/ops/cat/cat.cc b/src/infinicore/ops/cat/cat.cc index d98a5e16d..29daa94b3 100644 --- a/src/infinicore/ops/cat/cat.cc +++ b/src/infinicore/ops/cat/cat.cc @@ -7,7 +7,7 @@ namespace infinicore::op { namespace { void low_dim_copy(Tensor& tensor, int dim, Tensor& out, std::byte* tensor_ptr, std::byte* out_ptr, int depth) { - if (depth != out->ndim() - 1) { + if (depth != out->ndim()) { std::byte* now_tensor_ptr = tensor_ptr; std::byte* now_out_ptr = out_ptr; for (int i = 0; i < tensor->shape()[depth]; i ++) { @@ -16,7 +16,7 @@ namespace { now_out_ptr += out->stride(depth) * dsize(out->dtype()); } } else { - Size data_size = tensor->shape()[depth] * dsize(out->dtype()); + Size data_size = dsize(out->dtype()); if (out->device().getType() == Device::Type::CPU) std::memcpy(out_ptr, tensor_ptr, data_size); else diff --git a/src/infinicore/ops/inner/inner.cc b/src/infinicore/ops/inner/inner.cc new file mode 100644 index 000000000..606de26a4 --- /dev/null +++ b/src/infinicore/ops/inner/inner.cc @@ -0,0 +1,46 @@ +#include +#include + +namespace infinicore::op { + + +common::OpDispatcher &Inner::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Inner::execute(Tensor input, Tensor other, Tensor out) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Inner implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(input, other, out); +} + +Tensor inner(Tensor input, Tensor other) { 
+ size_t input_ndim = input->ndim(); + size_t other_ndim = other->ndim(); + + assert(input->shape()[input_ndim - 1] == other->shape()[other_ndim - 1]); + + Shape out_shape; + for (int i = 0; i < input_ndim - 1; i ++) + out_shape.push_back(input->shape()[i]); + for (int i = 0; i < other_ndim - 1; i ++) + out_shape.push_back(other->shape()[i]); + auto out = Tensor::zeros(out_shape, input->dtype(), input->device()); + + inner_(input, other, out); + return out; +} + +void inner_(Tensor input, Tensor other, Tensor out) { + + Inner::execute(input, other, out); + +} + +} \ No newline at end of file diff --git a/src/infinicore/ops/inner/inner_infiniop.cc b/src/infinicore/ops/inner/inner_infiniop.cc new file mode 100644 index 000000000..45cb228bc --- /dev/null +++ b/src/infinicore/ops/inner/inner_infiniop.cc @@ -0,0 +1,59 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/inner.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::inner_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopInnerDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyInnerDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor input, Tensor other, Tensor out) { + size_t seed = hash_combine(input, other, out); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopInnerDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateInnerDescriptor( + context::getInfiniopHandle(input->device()), &desc, + input->desc(), other->desc(), out->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetInnerWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = 
context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopInner( + desc, workspace->data(), workspace_size, + input->data(), other->data(), out->data(), context::getStream())); +} + +static bool registered = []() { + Inner::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA + // Device::Type::METAX, + // Device::Type::MOORE, + // Device::Type::ILUVATAR + }, &calculate, false); + return true; +}(); + +} \ No newline at end of file diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index e906ff009..1dea7b249 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -7,6 +7,7 @@ #include "ops/cat.hpp" #include "ops/causal_softmax.hpp" #include "ops/embedding.hpp" +#include "ops/inner.hpp" #include "ops/linear.hpp" #include "ops/masked_select.hpp" #include "ops/matmul.hpp" @@ -27,6 +28,7 @@ inline void bind(py::module &m) { bind_attention(m); bind_cat(m); bind_causal_softmax(m); + bind_inner(m); bind_random_sample(m); bind_linear(m); bind_masked_select(m); diff --git a/src/infinicore/pybind11/ops/inner.hpp b/src/infinicore/pybind11/ops/inner.hpp new file mode 100644 index 000000000..8a81e59ee --- /dev/null +++ b/src/infinicore/pybind11/ops/inner.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include "infinicore/ops/inner.hpp" +#include + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_inner(py::module &m) { + + m.def("inner", + &op::inner, + py::arg("input"), + py::arg("other"), + R"doc(operator: torch.inner, out-of-place mode)doc"); + + m.def("inner_", + &op::inner_, + py::arg("input"), + py::arg("other"), + py::arg("out"), + R"doc(operator: torch.inner, in-place mode)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infiniop/ops/inner/cpu/inner_cpu.cc b/src/infiniop/ops/inner/cpu/inner_cpu.cc new file mode 100644 index 000000000..4c35036ea --- /dev/null +++ b/src/infiniop/ops/inner/cpu/inner_cpu.cc @@ -0,0 +1,88
@@ +#include "inner_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::inner::cpu { + +Descriptor::~Descriptor() {} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc, + infiniopTensorDescriptor_t out_desc) { + + auto result = InnerInfo::create(input_desc, other_desc, out_desc); + CHECK_RESULT(result); + *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t inner(const InnerInfo *info, const T *input, const T *other, T *out) { +#pragma omp parallel for + for (size_t out_index = 0; out_index < info->total_elements; out_index ++) { + + size_t out_dim_pos = info->out_ndim - 1; + size_t out_offset = op::common_cpu::indexToOffset(out_index, info->out_ndim, info->out_shape.data(), info->out_strides.data()); + size_t index = out_index; + + ptrdiff_t input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)info->other_ndim - 2; i >= 0; i --) { + other_offset += (index % info->out_shape[out_dim_pos]) * info->other_strides[i]; + index /= info->out_shape[out_dim_pos --]; + } + for (int i = (int)info->input_ndim - 2; i >= 0; i --) { + input_offset += (index % info->out_shape[out_dim_pos]) * info->input_strides[i]; + index /= info->out_shape[out_dim_pos --]; + } + + float tmp = 0.; + for (size_t i = 0; i < info->oper_len; i ++) { + if constexpr (std::is_same::value || std::is_same::value) { + tmp += utils::cast(input[input_offset]) * utils::cast(other[other_offset]); + } else { + tmp += input[input_offset] * other[other_offset]; + } + input_offset += info->input_strides[info->input_ndim - 1]; + other_offset += info->other_strides[info->other_ndim - 1]; + } + + if constexpr (std::is_same::value || std::is_same::value) { + out[out_offset] = utils::cast(tmp); + } else { + out[out_offset] = tmp; + } + } + + + return 
INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void* workspace, size_t workspace_size, + const void *input, + const void *other, + void *out, + void *stream) const { + + if (_info.dtype == INFINI_DTYPE_BF16) { + CHECK_STATUS(inner(&_info, (const bf16_t *)input, (const bf16_t *)other, (bf16_t *)out)); + } else if (_info.dtype == INFINI_DTYPE_F16) { + CHECK_STATUS(inner(&_info, (const fp16_t *)input, (const fp16_t *)other, (fp16_t *)out)); + } else if (_info.dtype == INFINI_DTYPE_F32){ + CHECK_STATUS(inner(&_info, (const float *)input, (const float *)other, (float *)out)); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/inner/cpu/inner_cpu.h b/src/infiniop/ops/inner/cpu/inner_cpu.h new file mode 100644 index 000000000..5d4774dfa --- /dev/null +++ b/src/infiniop/ops/inner/cpu/inner_cpu.h @@ -0,0 +1,7 @@ +#ifndef __INNER_CPU_H__ +#define __INNER_CPU_H__ +#include "../inner.h" + +DESCRIPTOR(cpu); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/cuda/kernel.cuh b/src/infiniop/ops/inner/cuda/kernel.cuh new file mode 100644 index 000000000..66661db98 --- /dev/null +++ b/src/infiniop/ops/inner/cuda/kernel.cuh @@ -0,0 +1,42 @@ +#ifndef __INNER_KERNEL_CUH__ +#define __INNER_KERNEL_CUH__ +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +template +INFINIOP_CUDA_KERNEL innerKernel( + const T *input, const T *other, T *out, + size_t total_elements, size_t oper_len, size_t *out_shape, + ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, + size_t input_ndim, size_t other_ndim, size_t out_ndim) { + + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; + if (out_index >= total_elements) return; + + size_t out_offset = device::nvidia::indexToOffset(out_index, out_ndim, out_shape, out_strides); + + size_t out_dim_pos = out_ndim - 1; + size_t index = out_index; + + ptrdiff_t 
input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)other_ndim - 2; i >= 0; i --) { + other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; + index /= out_shape[out_dim_pos --]; + } + for (int i = (int)input_ndim - 2; i >= 0; i --) { + input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; + index /= out_shape[out_dim_pos --]; + } + + T tmp = 0; + for (size_t i = 0; i < oper_len; i ++) { + tmp += input[input_offset] * other[other_offset]; + input_offset += input_strides[input_ndim - 1]; + other_offset += other_strides[other_ndim - 1]; + } + + out[out_offset] = tmp; +} + +#endif // __INNER_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/info.h b/src/infiniop/ops/inner/info.h new file mode 100644 index 000000000..f69348b83 --- /dev/null +++ b/src/infiniop/ops/inner/info.h @@ -0,0 +1,88 @@ +#ifndef __INNER_INFO_H__ +#define __INNER_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::inner { + +class InnerInfo { + InnerInfo() = default; + +public: + infiniDtype_t dtype; + + size_t input_ndim; + size_t other_ndim; + size_t out_ndim; + + size_t oper_len; + size_t total_elements; + + std::vector out_shape; + + std::vector input_strides; + std::vector other_strides; + std::vector out_strides; + + static utils::Result create( + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc, + infiniopTensorDescriptor_t out_desc) { + + auto dtype = out_desc->dtype(); + if (dtype != input_desc->dtype() || dtype != other_desc->dtype()) + return INFINI_STATUS_BAD_TENSOR_DTYPE; + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + auto input_ndim = input_desc->ndim(); + auto other_ndim = other_desc->ndim(); + auto out_ndim = out_desc->ndim(); + if (out_ndim + 2 != input_ndim + other_ndim) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + + auto input_shape = input_desc->shape(); + auto other_shape = other_desc->shape(); + auto 
out_shape = out_desc->shape(); + if (input_ndim && other_ndim) { + if (input_shape[input_ndim - 1] != other_shape[other_ndim - 1]) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + size_t out_dim_pos = 0; + if (input_ndim) { + for (size_t i = 0; i < input_ndim - 1; i ++) + if (input_shape[i] != out_shape[out_dim_pos ++]) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (other_ndim) { + for (size_t i = 0; i < other_ndim - 1; i ++) + if (other_shape[i] != out_shape[out_dim_pos ++]) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t oper_len = 1; + if (input_ndim && other_ndim) { + oper_len = input_shape[input_ndim - 1]; + } + + size_t total_elements = 1; + for (size_t i = 0; i < out_ndim; i ++) + total_elements *= out_shape[i]; + + auto input_strides = input_desc->strides(); + auto other_strides = other_desc->strides(); + auto out_strides = out_desc->strides(); + + return utils::Result(InnerInfo{ + dtype, + input_ndim, other_ndim, out_ndim, + oper_len, total_elements, out_shape, + input_strides, other_strides, out_strides + }); + } +}; + +} // namespace op::inner + +#endif // __INNER_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/inner.h b/src/infiniop/ops/inner/inner.h new file mode 100644 index 000000000..d54daba07 --- /dev/null +++ b/src/infiniop/ops/inner/inner.h @@ -0,0 +1,49 @@ +#ifndef Inner_H +#define Inner_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::inner::NAMESPACE { \ + class Descriptor final: public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + InnerInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + InnerInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } 
\ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t other_desc, \ + infiniopTensorDescriptor_t out_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + const void *input, \ + const void *other, \ + void *out, \ + void *stream) const; \ + }; \ + } + + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/nvidia/inner_nvidia.cu b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu new file mode 100644 index 000000000..e48df7dd4 --- /dev/null +++ b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu @@ -0,0 +1,128 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "inner_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../cuda/kernel.cuh" + +namespace op::inner::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc, + infiniopTensorDescriptor_t out_desc) { + + auto result = InnerInfo::create(input_desc, other_desc, out_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // out_shape + workspace_size += info.out_ndim * sizeof(size_t); + // input, other, out strides + workspace_size += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const InnerInfo &info, + const T *input, const T *other, T *out, + cudaStream_t stream, void *workspace, size_t workspace_size) { + + size_t 
input_ndim = info.input_ndim; + size_t other_ndim = info.other_ndim; + size_t out_ndim = info.out_ndim; + size_t total_elements = info.total_elements; + size_t oper_len = info.oper_len; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *out_shape_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += out_ndim * sizeof(size_t); + + ptrdiff_t *input_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + ptrdiff_t *other_strides_cuda = input_strides_cuda + input_ndim; + ptrdiff_t *out_strides_cuda = other_strides_cuda + other_ndim; + workspace_offset += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + + assert(workspace_offset == workspace_size); + + CHECK_CUDA(cudaMemcpyAsync(out_shape_cuda, info.out_shape.data(), out_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(other_strides_cuda, info.other_strides.data(), other_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(out_strides_cuda, info.out_strides.data(), out_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + innerKernel<<>>( + input, other, out, + total_elements, oper_len, out_shape_cuda, + input_strides_cuda, other_strides_cuda, out_strides_cuda, + input_ndim, other_ndim, out_ndim); + + // CHECK_CUDA(cudaDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const void *other, + void *out, + void *stream_) const { + + cudaStream_t stream = (cudaStream_t)stream_; +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T 
*)input, (const T *)other, (T *)out, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __nv_bfloat16); \ + else if(_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if(_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh b/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh new file mode 100644 index 000000000..11033c2c0 --- /dev/null +++ b/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __INNER_NVIDIA_H__ +#define __INNER_NVIDIA_H__ +#include "../inner.h" + +DESCRIPTOR(nvidia); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/operator.cc b/src/infiniop/ops/inner/operator.cc new file mode 100644 index 000000000..8701eb38a --- /dev/null +++ b/src/infiniop/ops/inner/operator.cc @@ -0,0 +1,208 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/inner.h" + +#ifdef ENABLE_CPU_API +#include "cpu/inner_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/inner_nvidia.cuh" +#endif +// #ifdef ENABLE_METAX_API +// #include "metax/inner_metax.h" +// #endif +// 
#ifdef ENABLE_ASCEND_API +// #include "ascend/inner_ascend.h" +// #endif +// #ifdef ENABLE_CAMBRICON_API +// #include "bang/inner_bang.h" +// #endif +// #ifdef ENABLE_KUNLUN_API +// #include "kunlun/inner_kunlun.h" +// #endif +// #ifdef ENABLE_MOORE_API +// #include "moore/inner_moore.h" +// #endif + +__C infiniStatus_t infiniopCreateInnerDescriptor( + infiniopHandle_t handle, + infiniopInnerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc, + infiniopTensorDescriptor_t out_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::inner::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + input_desc, \ + other_desc, \ + out_desc); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +// #ifdef ENABLE_ILUVATAR_API +// CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +// #endif +// #ifdef ENABLE_QY_API +// CREATE(INFINI_DEVICE_QY, nvidia); +// #endif +// #ifdef ENABLE_HYGON_API +// CREATE(INFINI_DEVICE_HYGON, nvidia); +// #endif +// #ifdef ENABLE_CAMBRICON_API +// CREATE(INFINI_DEVICE_CAMBRICON, bang) +// #endif +// #ifdef ENABLE_METAX_API +// CREATE(INFINI_DEVICE_METAX, metax) +// #endif +// #ifdef ENABLE_ASCEND_API +// CREATE(INFINI_DEVICE_ASCEND, ascend) +// #endif +// #ifdef ENABLE_KUNLUN_API +// CREATE(INFINI_DEVICE_KUNLUN, kunlun) +// #endif +// #ifdef ENABLE_MOORE_API +// CREATE(INFINI_DEVICE_MOORE, moore) +// #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) 
+#endif +// #ifdef ENABLE_ILUVATAR_API +// GET(INFINI_DEVICE_ILUVATAR, nvidia); +// #endif +// #ifdef ENABLE_QY_API +// GET(INFINI_DEVICE_QY, nvidia); +// #endif +// #ifdef ENABLE_HYGON_API +// GET(INFINI_DEVICE_HYGON, nvidia); +// #endif +// #ifdef ENABLE_METAX_API +// GET(INFINI_DEVICE_METAX, metax) +// #endif +// #ifdef ENABLE_ASCEND_API +// GET(INFINI_DEVICE_ASCEND, ascend) +// #endif +// #ifdef ENABLE_CAMBRICON_API +// GET(INFINI_DEVICE_CAMBRICON, bang) +// #endif +// #ifdef ENABLE_KUNLUN_API +// GET(INFINI_DEVICE_KUNLUN, kunlun) +// #endif +// #ifdef ENABLE_MOORE_API +// GET(INFINI_DEVICE_MOORE, moore) +// #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopInner( + infiniopInnerDescriptor_t desc, + void *workspace, size_t workspace_size, + const void *input, + const void *other, + void *out, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, input, other, out, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +// #ifdef ENABLE_ILUVATAR_API +// CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +// #endif +// #ifdef ENABLE_QY_API +// CALCULATE(INFINI_DEVICE_QY, nvidia); +// #endif +// #ifdef ENABLE_HYGON_API +// CALCULATE(INFINI_DEVICE_HYGON, nvidia); +// #endif +// #ifdef ENABLE_CAMBRICON_API +// CALCULATE(INFINI_DEVICE_CAMBRICON, bang) +// #endif +// #ifdef ENABLE_METAX_API +// CALCULATE(INFINI_DEVICE_METAX, metax) +// #endif +// #ifdef ENABLE_ASCEND_API +// CALCULATE(INFINI_DEVICE_ASCEND, ascend) +// #endif +// #ifdef ENABLE_KUNLUN_API +// CALCULATE(INFINI_DEVICE_KUNLUN, kunlun) +// #endif +// #ifdef ENABLE_MOORE_API +// CALCULATE(INFINI_DEVICE_MOORE, moore) +// #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t 
infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +// #ifdef ENABLE_ILUVATAR_API +// DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +// #endif +// #ifdef ENABLE_QY_API +// DESTROY(INFINI_DEVICE_QY, nvidia); +// #endif +// #ifdef ENABLE_HYGON_API +// DESTROY(INFINI_DEVICE_HYGON, nvidia); +// #endif +// #ifdef ENABLE_CAMBRICON_API +// DESTROY(INFINI_DEVICE_CAMBRICON, bang) +// #endif +// #ifdef ENABLE_METAX_API +// DESTROY(INFINI_DEVICE_METAX, metax) +// #endif +// #ifdef ENABLE_ASCEND_API +// DESTROY(INFINI_DEVICE_ASCEND, ascend) +// #endif +// #ifdef ENABLE_KUNLUN_API +// DESTROY(INFINI_DEVICE_KUNLUN, kunlun) +// #endif +// #ifdef ENABLE_MOORE_API +// DESTROY(INFINI_DEVICE_MOORE, moore) +// #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc index af5a74683..ae74587d8 100644 --- a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc +++ b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc @@ -18,22 +18,24 @@ infiniStatus_t Descriptor::create( } namespace { - template - infiniStatus_t maskedSelect(const MaskedSelectInfo *info, T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr) { - std::vector res; - for (size_t index = 0; index < info->total_elements; index ++) { - size_t input_offset = op::common_cpu::indexToOffset(index, info->ndim, info->shape.data(), info->input_strides.data()); - size_t mask_offset = op::common_cpu::indexToOffset(index, info->ndim, info->shape.data(), info->mask_strides.data()); - if (mask[mask_offset]) { - res.push_back(input[input_offset]); - } + +template +infiniStatus_t maskedSelect(const 
MaskedSelectInfo *info, const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr) { + std::vector res; + for (size_t index = 0; index < info->total_elements; index ++) { + size_t input_offset = op::common_cpu::indexToOffset(index, info->ndim, info->shape.data(), info->input_strides.data()); + size_t mask_offset = op::common_cpu::indexToOffset(index, info->ndim, info->shape.data(), info->mask_strides.data()); + if (mask[mask_offset]) { + res.push_back(input[input_offset]); } - *dlen_ptr = res.size(); - *data_ptr = new T[*dlen_ptr]; - std::memcpy(*data_ptr, res.data(), sizeof(T) * (*dlen_ptr)); - - return INFINI_STATUS_SUCCESS; } + *dlen_ptr = res.size(); + *data_ptr = new T[*dlen_ptr]; + std::memcpy(*data_ptr, res.data(), sizeof(T) * (*dlen_ptr)); + + return INFINI_STATUS_SUCCESS; +} + } infiniStatus_t Descriptor::calculate( @@ -45,7 +47,7 @@ infiniStatus_t Descriptor::calculate( void *stream) const { if (_info.dtype == INFINI_DTYPE_F32) { - CHECK_STATUS(maskedSelect(&_info, (float*)input, (bool*)mask, data_ptr, dlen_ptr)); + CHECK_STATUS(maskedSelect(&_info, (const float *)input, mask, data_ptr, dlen_ptr)); } else { return INFINI_STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu index 8dab98b41..3b5d540d6 100644 --- a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu +++ b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu @@ -128,7 +128,7 @@ infiniStatus_t Descriptor::calculate( #define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ launchKernel( \ _info, \ - (const T *)input, mask, data_ptr, dlen_ptr, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ stream, workspace, workspace_size \ ) #define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ diff --git a/test/infinicore/ops/inner.py b/test/infinicore/ops/inner.py index 3f75a6b64..166fabe33 100644 --- a/test/infinicore/ops/inner.py +++ b/test/infinicore/ops/inner.py 
@@ -62,9 +62,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.inner(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.inner(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.inner(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/masked_select.py b/test/infinicore/ops/masked_select.py index 94ca8d02c..27c28842b 100644 --- a/test/infinicore/ops/masked_select.py +++ b/test/infinicore/ops/masked_select.py @@ -15,7 +15,6 @@ ((1, 6), None, (1, 6)), ((4, 4), None, (4, 4)), ((2, 3, 2), None, (2, 3, 2)), - ((100, 100, 100), None, (100, 100, 100)) ] _TOLERANCE_MAP = {infinicore.float32: {"atol": 1e-5, "rtol": 1e-4}} From 622daad4d4d7d306c2e046b706e36d3cda82cfce Mon Sep 17 00:00:00 2001 From: JBFYS-XD Date: Sun, 7 Dec 2025 11:32:29 +0800 Subject: [PATCH 04/12] feature: Added operator support to match torch.tan and torch.tanhshrink on device CPU, nvidia --- include/infinicore/ops/cat.hpp | 2 +- include/infinicore/ops/inner.hpp | 4 +- include/infinicore/ops/tan.hpp | 17 ++ include/infinicore/ops/tanhshrink.hpp | 16 ++ include/infiniop.h | 2 + include/infiniop/ops/inner.h | 6 +- include/infiniop/ops/tan.h | 26 +++ include/infiniop/ops/tanhshrink.h | 24 +++ python/infinicore/__init__.py | 2 + python/infinicore/nn/functional/__init__.py | 2 + python/infinicore/nn/functional/tanhshrink.py | 11 ++ python/infinicore/ops/cat.py | 4 +- python/infinicore/ops/inner.py | 4 +- python/infinicore/ops/tan.py | 11 ++ src/infinicore/ops/cat/cat.cc | 4 +- src/infinicore/ops/inner/inner.cc | 10 +- src/infinicore/ops/inner/inner_infiniop.cc | 8 +- src/infinicore/ops/tan/tan.cc | 32 ++++ src/infinicore/ops/tan/tan_infiniop.cc | 52 ++++++ src/infinicore/ops/tanhshrink/tanhshrink.cc | 32 ++++ .../ops/tanhshrink/tanhshrink_infiniop.cc | 52 ++++++ 
src/infinicore/pybind11/ops.hpp | 4 + src/infinicore/pybind11/ops/cat.hpp | 2 +- src/infinicore/pybind11/ops/inner.hpp | 2 +- src/infinicore/pybind11/ops/tan.hpp | 24 +++ src/infinicore/pybind11/ops/tanhshrink.hpp | 24 +++ src/infiniop/ops/inner/cpu/inner_cpu.cc | 8 +- src/infiniop/ops/inner/info.h | 4 +- src/infiniop/ops/inner/inner.h | 6 +- src/infiniop/ops/inner/nvidia/inner_nvidia.cu | 8 +- src/infiniop/ops/inner/operator.cc | 12 +- src/infiniop/ops/tan/cpu/tan_cpu.cc | 50 ++++++ src/infiniop/ops/tan/cpu/tan_cpu.h | 23 +++ src/infiniop/ops/tan/cuda/kernel.cuh | 30 ++++ src/infiniop/ops/tan/nvidia/tan_nvidia.cu | 57 +++++++ src/infiniop/ops/tan/nvidia/tan_nvidia.cuh | 8 + src/infiniop/ops/tan/operator.cc | 157 ++++++++++++++++++ .../ops/tanhshrink/cpu/tanhshrink_cpu.cc | 50 ++++++ .../ops/tanhshrink/cpu/tanhshrink_cpu.h | 23 +++ src/infiniop/ops/tanhshrink/cuda/kernel.cuh | 30 ++++ .../tanhshrink/nvidia/tanhshrink_nvidia.cu | 59 +++++++ .../tanhshrink/nvidia/tanhshrink_nvidia.cuh | 8 + src/infiniop/ops/tanhshrink/operator.cc | 157 ++++++++++++++++++ test/infinicore/ops/tan.py | 6 +- test/infinicore/ops/tanhshrink.py | 6 +- 45 files changed, 1031 insertions(+), 48 deletions(-) create mode 100644 include/infinicore/ops/tan.hpp create mode 100644 include/infinicore/ops/tanhshrink.hpp create mode 100644 include/infiniop/ops/tan.h create mode 100644 include/infiniop/ops/tanhshrink.h create mode 100644 python/infinicore/nn/functional/tanhshrink.py create mode 100644 python/infinicore/ops/tan.py create mode 100644 src/infinicore/ops/tan/tan.cc create mode 100644 src/infinicore/ops/tan/tan_infiniop.cc create mode 100644 src/infinicore/ops/tanhshrink/tanhshrink.cc create mode 100644 src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc create mode 100644 src/infinicore/pybind11/ops/tan.hpp create mode 100644 src/infinicore/pybind11/ops/tanhshrink.hpp create mode 100644 src/infiniop/ops/tan/cpu/tan_cpu.cc create mode 100644 src/infiniop/ops/tan/cpu/tan_cpu.h create mode 
100644 src/infiniop/ops/tan/cuda/kernel.cuh create mode 100644 src/infiniop/ops/tan/nvidia/tan_nvidia.cu create mode 100644 src/infiniop/ops/tan/nvidia/tan_nvidia.cuh create mode 100644 src/infiniop/ops/tan/operator.cc create mode 100644 src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.cc create mode 100644 src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.h create mode 100644 src/infiniop/ops/tanhshrink/cuda/kernel.cuh create mode 100644 src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cu create mode 100644 src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cuh create mode 100644 src/infiniop/ops/tanhshrink/operator.cc diff --git a/include/infinicore/ops/cat.hpp b/include/infinicore/ops/cat.hpp index aa7dafd6c..5fdfdcce8 100644 --- a/include/infinicore/ops/cat.hpp +++ b/include/infinicore/ops/cat.hpp @@ -5,5 +5,5 @@ namespace infinicore::op { Tensor cat(std::vector tensors, int dim); -void cat_(std::vector tensors, int dim, Tensor out); +void cat_(Tensor out, std::vector tensors, int dim); } // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/inner.hpp b/include/infinicore/ops/inner.hpp index 87bdd2919..017ddc604 100644 --- a/include/infinicore/ops/inner.hpp +++ b/include/infinicore/ops/inner.hpp @@ -7,11 +7,11 @@ namespace infinicore::op { class Inner { public: using schema = void (*)(Tensor, Tensor, Tensor); - static void execute(Tensor input, Tensor other, Tensor out); + static void execute(Tensor out, Tensor input, Tensor other); static common::OpDispatcher &dispatcher(); }; Tensor inner(Tensor input, Tensor other); -void inner_(Tensor input, Tensor other, Tensor out); +void inner_(Tensor out, Tensor input, Tensor other); } \ No newline at end of file diff --git a/include/infinicore/ops/tan.hpp b/include/infinicore/ops/tan.hpp new file mode 100644 index 000000000..9c2d7a860 --- /dev/null +++ b/include/infinicore/ops/tan.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace 
infinicore::op { +class Tan { +public: + using schema = void (*)(Tensor, Tensor); + static void execute(Tensor output, Tensor input); + static common::OpDispatcher &dispatcher(); +}; + +Tensor tan(Tensor input); +void tan_(Tensor output, Tensor input); + +} \ No newline at end of file diff --git a/include/infinicore/ops/tanhshrink.hpp b/include/infinicore/ops/tanhshrink.hpp new file mode 100644 index 000000000..83673b22c --- /dev/null +++ b/include/infinicore/ops/tanhshrink.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Tanhshrink { +public: + using schema = void (*)(Tensor, Tensor); + static void execute(Tensor output, Tensor input); + static common::OpDispatcher &dispatcher(); +}; + +Tensor tanhshrink(Tensor input); +void tanhshrink_(Tensor output, Tensor input); +} // namespace infinicore::op diff --git a/include/infiniop.h b/include/infiniop.h index f394dbc28..594795429 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -28,7 +28,9 @@ #include "infiniop/ops/softplus.h" #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" +#include "infiniop/ops/tan.h" #include "infiniop/ops/tanh.h" +#include "infiniop/ops/tanhshrink.h" #include "infiniop/ops/topkrouter.h" #include "infiniop/ops/topksoftmax.h" #include "infiniop/ops/zeros.h" diff --git a/include/infiniop/ops/inner.h b/include/infiniop/ops/inner.h index 14dff5a8a..f484cde9c 100644 --- a/include/infiniop/ops/inner.h +++ b/include/infiniop/ops/inner.h @@ -8,9 +8,9 @@ typedef struct InfiniopDescriptor *infiniopInnerDescriptor_t; __C __export infiniStatus_t infiniopCreateInnerDescriptor( infiniopHandle_t handle, infiniopInnerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc, - infiniopTensorDescriptor_t other_desc, - infiniopTensorDescriptor_t out_desc); + infiniopTensorDescriptor_t other_desc); __C __export infiniStatus_t 
infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, size_t *size); @@ -18,9 +18,9 @@ __C __export infiniStatus_t infiniopInner( infiniopInnerDescriptor_t desc, void *workspace, size_t workspace_size, + void *out, const void *input, const void *other, - void *out, void *stream); __C __export infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc); diff --git a/include/infiniop/ops/tan.h b/include/infiniop/ops/tan.h new file mode 100644 index 000000000..2e20c61da --- /dev/null +++ b/include/infiniop/ops/tan.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_TAN_API_H__ +#define __INFINIOP_TAN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanDescriptor( + infiniopHandle_t handle, + infiniopTanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t out_desc); + +__C __export infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTan( + infiniopTanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/tanhshrink.h b/include/infiniop/ops/tanhshrink.h new file mode 100644 index 000000000..e136f6523 --- /dev/null +++ b/include/infiniop/ops/tanhshrink.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_TANHSHRINK_API_H__ +#define __INFINIOP_TANHSHRINK_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanhshrinkDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanhshrinkDescriptor(infiniopHandle_t handle, + infiniopTanhshrinkDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t intput); + +__C __export infiniStatus_t 
infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTanhshrink(infiniopTanhshrinkDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *intput, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanhshrinkDescriptor(infiniopTanhshrinkDescriptor_t desc); + +#endif diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index 86b2e43ff..08634156b 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -47,6 +47,7 @@ from infinicore.ops.mul import mul from infinicore.ops.narrow import narrow from infinicore.ops.rearrange import rearrange +from infinicore.ops.tan import tan from infinicore.tensor import ( Tensor, empty, @@ -109,6 +110,7 @@ "mul", "narrow", "rearrange", + "tan", "empty", "empty_like", "from_blob", diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index 255079790..4221bdce5 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -6,6 +6,7 @@ from .rope import RopeAlgo, rope from .silu import silu from .swiglu import swiglu +from .tanhshrink import tanhshrink __all__ = [ "causal_softmax", @@ -17,4 +18,5 @@ "embedding", "rope", "RopeAlgo", + "tanhshrink" ] diff --git a/python/infinicore/nn/functional/tanhshrink.py b/python/infinicore/nn/functional/tanhshrink.py new file mode 100644 index 000000000..10d17077d --- /dev/null +++ b/python/infinicore/nn/functional/tanhshrink.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def tanhshrink(input, *, out=None): + if out is None: + return Tensor(_infinicore.tanhshrink(input._underlying)) + + _infinicore.tanhshrink_(out._underlying, input._underlying) + + return out diff --git a/python/infinicore/ops/cat.py b/python/infinicore/ops/cat.py index 5fc7536a9..7bba9822b 100644 --- 
a/python/infinicore/ops/cat.py +++ b/python/infinicore/ops/cat.py @@ -2,10 +2,10 @@ from infinicore.tensor import Tensor -def cat(tensors, dim:int=0, out=None): +def cat(tensors, dim:int=0, *,out=None): if out is None: return Tensor(_infinicore.cat([tensor._underlying for tensor in tensors], dim)) - _infinicore.cat_([tensor._underlying for tensor in tensors], dim, out._underlying) + _infinicore.cat_(out._underlying, [tensor._underlying for tensor in tensors], dim) return out diff --git a/python/infinicore/ops/inner.py b/python/infinicore/ops/inner.py index 09d456177..df87c5b34 100644 --- a/python/infinicore/ops/inner.py +++ b/python/infinicore/ops/inner.py @@ -2,10 +2,10 @@ from infinicore.tensor import Tensor -def inner(input, other, out=None): +def inner(input, other, *, out=None): if out is None: return Tensor(_infinicore.inner(input._underlying, other._underlying)) - _infinicore.inner_(input._underlying, other._underlying, out._underlying) + _infinicore.inner_(out._underlying, input._underlying, other._underlying) return out diff --git a/python/infinicore/ops/tan.py b/python/infinicore/ops/tan.py new file mode 100644 index 000000000..da438e6d8 --- /dev/null +++ b/python/infinicore/ops/tan.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def tan(input, *, out=None): + if out is None: + return Tensor(_infinicore.tan(input._underlying)) + + _infinicore.tan_(out._underlying, input._underlying) + + return out diff --git a/src/infinicore/ops/cat/cat.cc b/src/infinicore/ops/cat/cat.cc index 29daa94b3..71de1e5f6 100644 --- a/src/infinicore/ops/cat/cat.cc +++ b/src/infinicore/ops/cat/cat.cc @@ -62,11 +62,11 @@ Tensor cat(std::vector tensors, int dim) { } auto out = Tensor::empty(shape, tensors[0]->dtype(), tensors[0]->device()); - cat_(tensors, dim, out); + cat_(out, tensors, dim); return out; } -void cat_(std::vector tensors, int dim, Tensor out) { +void cat_(Tensor out, std::vector tensors, int dim) { 
assert(tensors.size() >= 2); int ndim = out->ndim(); assert(-ndim <= dim && dim < ndim); diff --git a/src/infinicore/ops/inner/inner.cc b/src/infinicore/ops/inner/inner.cc index 606de26a4..271ad101c 100644 --- a/src/infinicore/ops/inner/inner.cc +++ b/src/infinicore/ops/inner/inner.cc @@ -9,7 +9,7 @@ common::OpDispatcher &Inner::dispatcher() { return dispatcher_; }; -void Inner::execute(Tensor input, Tensor other, Tensor out) { +void Inner::execute(Tensor out, Tensor input, Tensor other) { auto device_type = context::getDevice().getType(); auto func = dispatcher().lookup(device_type); @@ -17,7 +17,7 @@ void Inner::execute(Tensor input, Tensor other, Tensor out) { throw std::runtime_error("No Inner implementation found for device type: " + std::to_string(static_cast(device_type))); } - func(input, other, out); + func(out, input, other); } Tensor inner(Tensor input, Tensor other) { @@ -33,13 +33,13 @@ Tensor inner(Tensor input, Tensor other) { out_shape.push_back(other->shape()[i]); auto out = Tensor::zeros(out_shape, input->dtype(), input->device()); - inner_(input, other, out); + inner_(out, input, other); return out; } -void inner_(Tensor input, Tensor other, Tensor out) { +void inner_(Tensor out, Tensor input, Tensor other) { - Inner::execute(input, other, out); + Inner::execute(out, input, other); } diff --git a/src/infinicore/ops/inner/inner_infiniop.cc b/src/infinicore/ops/inner/inner_infiniop.cc index 45cb228bc..eb44062d6 100644 --- a/src/infinicore/ops/inner/inner_infiniop.cc +++ b/src/infinicore/ops/inner/inner_infiniop.cc @@ -15,8 +15,8 @@ thread_local common::OpCache caches( } }); -void calculate(Tensor input, Tensor other, Tensor out) { - size_t seed = hash_combine(input, other, out); +void calculate(Tensor out, Tensor input, Tensor other) { + size_t seed = hash_combine(out, input, other); auto device_type = context::getDevice().getType(); auto device_index = context::getDevice().getIndex(); @@ -29,7 +29,7 @@ void calculate(Tensor input, Tensor other, 
Tensor out) { if (!desc_opt) { INFINICORE_CHECK_ERROR(infiniopCreateInnerDescriptor( context::getInfiniopHandle(input->device()), &desc, - input->desc(), other->desc(), out->desc())); + out->desc(), input->desc(), other->desc())); cache.put(seed, desc); } else { desc = *desc_opt; @@ -42,7 +42,7 @@ void calculate(Tensor input, Tensor other, Tensor out) { INFINICORE_CHECK_ERROR(infiniopInner( desc, workspace->data(), workspace_size, - input->data(), other->data(), out->data(), context::getStream())); + out->data(), input->data(), other->data(), context::getStream())); } static bool registered = []() { diff --git a/src/infinicore/ops/tan/tan.cc b/src/infinicore/ops/tan/tan.cc new file mode 100644 index 000000000..20bcab4b5 --- /dev/null +++ b/src/infinicore/ops/tan/tan.cc @@ -0,0 +1,32 @@ +#include "infinicore/ops/tan.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Tan::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Tan::execute(Tensor output, Tensor input) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Tan implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor tan(Tensor input) { + Shape shape = input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + tan_(output, input); + return output; +} + +void tan_(Tensor output, Tensor input) { + Tan::execute(output, input); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/tan/tan_infiniop.cc b/src/infinicore/ops/tan/tan_infiniop.cc new file mode 100644 index 000000000..cf3fed96c --- /dev/null +++ b/src/infinicore/ops/tan/tan_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/tan.hpp" +#include + +namespace 
infinicore::op::tan_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopTanDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyTanDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopTanDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateTanDescriptor( + context::getInfiniopHandle(output->device()), &desc, + output->desc(), input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetTanWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopTan( + desc, workspace->data(), workspace_size, + output->data(), input->data(), context::getStream())); +} + +static bool registered = []() { + Tan::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::tan_impl::infiniop diff --git a/src/infinicore/ops/tanhshrink/tanhshrink.cc b/src/infinicore/ops/tanhshrink/tanhshrink.cc new file mode 100644 index 000000000..ea96e46f4 --- /dev/null +++ b/src/infinicore/ops/tanhshrink/tanhshrink.cc @@ -0,0 +1,32 @@ +#include "infinicore/ops/tanhshrink.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Tanhshrink::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Tanhshrink::execute(Tensor output, Tensor input) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Tanhshrink implementation found for device 
type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor tanhshrink(Tensor input) { + Shape shape = input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + tanhshrink_(output, input); + return output; +} + +void tanhshrink_(Tensor output, Tensor input) { + Tanhshrink::execute(output, input); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc b/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc new file mode 100644 index 000000000..1a8885386 --- /dev/null +++ b/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/tanhshrink.hpp" +#include + +namespace infinicore::op::tanhshrink_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopTanhshrinkDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyTanhshrinkDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopTanhshrinkDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateTanhshrinkDescriptor( + context::getInfiniopHandle(output->device()), &desc, + output->desc(), input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetTanhshrinkWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopTanhshrink( + desc, workspace->data(), workspace_size, + output->data(), input->data(), 
context::getStream())); +} + +static bool registered = []() { + Tanhshrink::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::tanhshrink_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 1dea7b249..266eb6919 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -18,6 +18,8 @@ #include "ops/rope.hpp" #include "ops/silu.hpp" #include "ops/swiglu.hpp" +#include "ops/tan.hpp" +#include "ops/tanhshrink.hpp" namespace py = pybind11; @@ -38,6 +40,8 @@ inline void bind(py::module &m) { bind_rms_norm(m); bind_silu(m); bind_swiglu(m); + bind_tan(m); + bind_tanhshrink(m); bind_rope(m); bind_embedding(m); } diff --git a/src/infinicore/pybind11/ops/cat.hpp b/src/infinicore/pybind11/ops/cat.hpp index 8dd06bd88..2ef3392b0 100644 --- a/src/infinicore/pybind11/ops/cat.hpp +++ b/src/infinicore/pybind11/ops/cat.hpp @@ -17,9 +17,9 @@ inline void bind_cat(py::module &m) { m.def("cat_", &op::cat_, + py::arg("out"), py::arg("tensors"), py::arg("dim") = 0, - py::arg("out"), R"doc(opertor: torch.cat, in-place mode)doc"); } diff --git a/src/infinicore/pybind11/ops/inner.hpp b/src/infinicore/pybind11/ops/inner.hpp index 8a81e59ee..16ee91932 100644 --- a/src/infinicore/pybind11/ops/inner.hpp +++ b/src/infinicore/pybind11/ops/inner.hpp @@ -17,9 +17,9 @@ inline void bind_inner(py::module &m) { m.def("inner_", &op::inner_, + py::arg("out"), py::arg("input"), py::arg("other"), - py::arg("out"), R"doc(opertor: torch.inner, in-place mode)doc"); } diff --git a/src/infinicore/pybind11/ops/tan.hpp b/src/infinicore/pybind11/ops/tan.hpp new file mode 100644 index 000000000..acd62e048 --- /dev/null +++ b/src/infinicore/pybind11/ops/tan.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/tan.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_tan(py::module &m) { + m.def("tan", + &op::tan, + py::arg("input"), + 
R"doc(opertor: torch.tan, out-of-place mode)doc"); + + m.def("tan_", + &op::tan_, + py::arg("output"), + py::arg("input"), + R"doc(opertor: torch.tan, in-place mode)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/tanhshrink.hpp b/src/infinicore/pybind11/ops/tanhshrink.hpp new file mode 100644 index 000000000..3780555f4 --- /dev/null +++ b/src/infinicore/pybind11/ops/tanhshrink.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/tanhshrink.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_tanhshrink(py::module &m) { + m.def("tanhshrink", + &op::tanhshrink, + py::arg("input"), + R"doc(opertor: torch.tanhshrink, out-of-place mode)doc"); + + m.def("tanhshrink_", + &op::tanhshrink_, + py::arg("output"), + py::arg("input"), + R"doc(opertor: torch.tanhshrink, in-place mode)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infiniop/ops/inner/cpu/inner_cpu.cc b/src/infiniop/ops/inner/cpu/inner_cpu.cc index 4c35036ea..5915cb3fc 100644 --- a/src/infiniop/ops/inner/cpu/inner_cpu.cc +++ b/src/infiniop/ops/inner/cpu/inner_cpu.cc @@ -8,11 +8,11 @@ Descriptor::~Descriptor() {} infiniStatus_t Descriptor::create( infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc, - infiniopTensorDescriptor_t other_desc, - infiniopTensorDescriptor_t out_desc) { + infiniopTensorDescriptor_t other_desc) { - auto result = InnerInfo::create(input_desc, other_desc, out_desc); + auto result = InnerInfo::create(out_desc, input_desc, other_desc); CHECK_RESULT(result); *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); return INFINI_STATUS_SUCCESS; @@ -67,9 +67,9 @@ infiniStatus_t inner(const InnerInfo *info, const T *input, const T *other, T *o infiniStatus_t Descriptor::calculate( void* workspace, size_t workspace_size, + void *out, const void *input, const void *other, - void *out, void 
*stream) const { if (_info.dtype == INFINI_DTYPE_BF16) { diff --git a/src/infiniop/ops/inner/info.h b/src/infiniop/ops/inner/info.h index f69348b83..2484277eb 100644 --- a/src/infiniop/ops/inner/info.h +++ b/src/infiniop/ops/inner/info.h @@ -27,9 +27,9 @@ class InnerInfo { std::vector out_strides; static utils::Result create( + infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc, - infiniopTensorDescriptor_t other_desc, - infiniopTensorDescriptor_t out_desc) { + infiniopTensorDescriptor_t other_desc) { auto dtype = out_desc->dtype(); if (dtype != input_desc->dtype() || dtype != other_desc->dtype()) diff --git a/src/infiniop/ops/inner/inner.h b/src/infiniop/ops/inner/inner.h index d54daba07..5c44262a7 100644 --- a/src/infiniop/ops/inner/inner.h +++ b/src/infiniop/ops/inner/inner.h @@ -32,15 +32,15 @@ static infiniStatus_t create( \ infiniopHandle_t handle, \ Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ infiniopTensorDescriptor_t input_desc, \ - infiniopTensorDescriptor_t other_desc, \ - infiniopTensorDescriptor_t out_desc); \ + infiniopTensorDescriptor_t other_desc); \ \ infiniStatus_t calculate( \ void *workspace, size_t workspace_size, \ + void *out, \ const void *input, \ const void *other, \ - void *out, \ void *stream) const; \ }; \ } diff --git a/src/infiniop/ops/inner/nvidia/inner_nvidia.cu b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu index e48df7dd4..db8927ca7 100644 --- a/src/infiniop/ops/inner/nvidia/inner_nvidia.cu +++ b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu @@ -21,11 +21,11 @@ Descriptor::~Descriptor() { infiniStatus_t Descriptor::create( infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc, - infiniopTensorDescriptor_t other_desc, - infiniopTensorDescriptor_t out_desc) { + infiniopTensorDescriptor_t other_desc) { - auto result = InnerInfo::create(input_desc, other_desc, out_desc); + auto result = InnerInfo::create(out_desc, 
input_desc, other_desc); CHECK_RESULT(result); auto info = result.take(); size_t workspace_size = 0; @@ -89,9 +89,9 @@ infiniStatus_t launchKernel( infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, + void *out, const void *input, const void *other, - void *out, void *stream_) const { cudaStream_t stream = (cudaStream_t)stream_; diff --git a/src/infiniop/ops/inner/operator.cc b/src/infiniop/ops/inner/operator.cc index 8701eb38a..7b9cf20c3 100644 --- a/src/infiniop/ops/inner/operator.cc +++ b/src/infiniop/ops/inner/operator.cc @@ -27,18 +27,18 @@ __C infiniStatus_t infiniopCreateInnerDescriptor( infiniopHandle_t handle, infiniopInnerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc, - infiniopTensorDescriptor_t other_desc, - infiniopTensorDescriptor_t out_desc) { + infiniopTensorDescriptor_t other_desc) { #define CREATE(CASE, NAMESPACE) \ case CASE: \ return op::inner::NAMESPACE::Descriptor::create( \ handle, \ reinterpret_cast(desc_ptr), \ + out_desc, \ input_desc, \ - other_desc, \ - out_desc); + other_desc); switch (handle->device) { #ifdef ENABLE_CPU_API @@ -120,15 +120,15 @@ __C infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, __C infiniStatus_t infiniopInner( infiniopInnerDescriptor_t desc, void *workspace, size_t workspace_size, + void *out, const void *input, const void *other, - void *out, void *stream) { #define CALCULATE(CASE, NAMESPACE) \ case CASE: \ return reinterpret_cast(desc)->calculate( \ - workspace, workspace_size, input, other, out, stream); + workspace, workspace_size, out, input, other, stream); switch (desc->device_type) { #ifdef ENABLE_CPU_API diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.cc b/src/infiniop/ops/tan/cpu/tan_cpu.cc new file mode 100644 index 000000000..90939f9cb --- /dev/null +++ b/src/infiniop/ops/tan/cpu/tan_cpu.cc @@ -0,0 +1,50 @@ +#include "tan_cpu.h" + +namespace op::tan::cpu { + +Descriptor::~Descriptor() = default; + 
+infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::cpu diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.h b/src/infiniop/ops/tan/cpu/tan_cpu.h new file mode 100644 index 000000000..04004dd19 --- /dev/null +++ b/src/infiniop/ops/tan/cpu/tan_cpu.h @@ -0,0 +1,23 @@ +#ifndef __TAN_CPU_H__ +#define __TAN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(tan, cpu) + +#include + +namespace op::tan::cpu { +typedef struct TanOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::tan(x); + } +} TanOp; + +} // namespace op::tan::cpu + +#endif // __TAN_CPU_H__ diff --git a/src/infiniop/ops/tan/cuda/kernel.cuh b/src/infiniop/ops/tan/cuda/kernel.cuh new file mode 100644 index 000000000..12c6d59e7 --- /dev/null +++ 
b/src/infiniop/ops/tan/cuda/kernel.cuh @@ -0,0 +1,30 @@ +#ifndef __TAN_CUDA_H__ +#define __TAN_CUDA_H__ + +#include + +namespace op::tan::cuda { + +typedef struct TanOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // BF16 + const float x_f = __bfloat162float(x); + return __float2bfloat16(__tanf(x_f)); + } else if constexpr (std::is_same_v) { + // FP16 + const float x_f = __half2float(x); + return __float2half(__tanf(x_f)); + } else if constexpr (std::is_same_v) { + // FP32 + return __tanf(x); + } + } +} TanOp; + +} // namespace op::tan::cuda + +#endif // __TAN_CUDA_H__ diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cu b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu new file mode 100644 index 000000000..510f3d416 --- /dev/null +++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu @@ -0,0 +1,57 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "tan_nvidia.cuh" + +namespace op::tan::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + 
return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::nvidia diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh b/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh new file mode 100644 index 000000000..1915cc2b1 --- /dev/null +++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TAN_CUDA_API_H__ +#define __TAN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tan, nvidia) + +#endif // __TAN_CUDA_API_H__ diff --git a/src/infiniop/ops/tan/operator.cc b/src/infiniop/ops/tan/operator.cc new file mode 100644 index 000000000..fc232fb1c --- /dev/null +++ b/src/infiniop/ops/tan/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tan.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tan_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/tan_nvidia.cuh" +#endif +// #ifdef ENABLE_METAX_API +// #include "metax/tan_metax.h" +// #endif +// #ifdef ENABLE_MOORE_API +// #include "moore/tan_moore.h" +// #endif + +__C infiniStatus_t infiniopCreateTanDescriptor( + infiniopHandle_t handle, + infiniopTanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tan::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch 
(handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +// #ifdef ENABLE_ILUVATAR_API +// CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +// #endif +// #ifdef ENABLE_METAX_API +// CREATE(INFINI_DEVICE_METAX, metax); +// #endif +// #ifdef ENABLE_MOORE_API +// CREATE(INFINI_DEVICE_MOORE, moore); +// #endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +// #ifdef ENABLE_ILUVATAR_API +// GET(INFINI_DEVICE_ILUVATAR, nvidia); +// #endif +// #ifdef ENABLE_METAX_API +// GET(INFINI_DEVICE_METAX, metax); +// #endif +// #ifdef ENABLE_MOORE_API +// GET(INFINI_DEVICE_MOORE, moore); +// #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTan( + infiniopTanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +// #ifdef ENABLE_ILUVATAR_API +// CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +// #endif +// #ifdef ENABLE_METAX_API +// CALCULATE(INFINI_DEVICE_METAX, metax); +// #endif +// #ifdef ENABLE_MOORE_API +// CALCULATE(INFINI_DEVICE_MOORE, moore); +// 
#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +// #ifdef ENABLE_ILUVATAR_API +// DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +// #endif +// #ifdef ENABLE_METAX_API +// DELETE(INFINI_DEVICE_METAX, metax); +// #endif +// #ifdef ENABLE_MOORE_API +// DELETE(INFINI_DEVICE_MOORE, moore); +// #endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.cc b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.cc new file mode 100644 index 000000000..1abca6a33 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.cc @@ -0,0 +1,50 @@ +#include "tanhshrink_cpu.h" + +namespace op::tanhshrink::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + 
switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::cpu diff --git a/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.h b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.h new file mode 100644 index 000000000..8c5f722ad --- /dev/null +++ b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.h @@ -0,0 +1,23 @@ +#ifndef __TANHSHRINK_CPU_H__ +#define __TANHSHRINK_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, cpu) + +#include + +namespace op::tanhshrink::cpu { +typedef struct TanhshrinkOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return x - tanh(x); + } +} TanhshrinkOp; + +} // namespace op::tanhshrink::cpu + +#endif // __TANHSHRINK_CPU_H__ diff --git a/src/infiniop/ops/tanhshrink/cuda/kernel.cuh b/src/infiniop/ops/tanhshrink/cuda/kernel.cuh new file mode 100644 index 000000000..32a79757b --- /dev/null +++ b/src/infiniop/ops/tanhshrink/cuda/kernel.cuh @@ -0,0 +1,30 @@ +#ifndef __TANHSHRINK_CUDA_H__ +#define __TANHSHRINK_CUDA_H__ + +#include + +namespace op::tanhshrink::cuda { + +typedef struct TanhshrinkOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // BF16 + const float x_f = __bfloat162float(x); + return __float2bfloat16(x_f - __tanhf(x_f)); + } else if constexpr (std::is_same_v) { + // FP16 + const float x_f = __half2float(x); + return __float2half(x_f - __tanhf(x_f)); + } else if constexpr (std::is_same_v) { + // FP32 + return x - __tanhf(x); + } + } +} TanhshrinkOp; + +} // namespace 
op::tanhshrink::cuda + +#endif // __TANHSHRINK_CUDA_H__ diff --git a/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cu b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cu new file mode 100644 index 000000000..baff908a2 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "tanhshrink_nvidia.cuh" + +namespace op::tanhshrink::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhshrinkOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhshrinkOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhshrinkOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhshrinkOp, double>(_info, 
workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::nvidia diff --git a/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cuh b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cuh new file mode 100644 index 000000000..abe56f444 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TANHSHRINK_CUDA_API_H__ +#define __TANHSHRINK_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, nvidia) + +#endif // __TANHSHRINK_CUDA_API_H__ diff --git a/src/infiniop/ops/tanhshrink/operator.cc b/src/infiniop/ops/tanhshrink/operator.cc new file mode 100644 index 000000000..87de46704 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tanhshrink.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tanhshrink_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/tanhshrink_nvidia.cuh" +#endif +// #ifdef ENABLE_METAX_API +// #include "metax/tanhshrink_metax.h" +// #endif +// #ifdef ENABLE_MOORE_API +// #include "moore/tanhshrink_moore.h" +// #endif + +__C infiniStatus_t infiniopCreateTanhshrinkDescriptor( + infiniopHandle_t handle, + infiniopTanhshrinkDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tanhshrink::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +// #ifdef ENABLE_ILUVATAR_API +// CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +// #endif +// #ifdef ENABLE_METAX_API 
+// CREATE(INFINI_DEVICE_METAX, metax); +// #endif +// #ifdef ENABLE_MOORE_API +// CREATE(INFINI_DEVICE_MOORE, moore); +// #endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +// #ifdef ENABLE_ILUVATAR_API +// GET(INFINI_DEVICE_ILUVATAR, nvidia); +// #endif +// #ifdef ENABLE_METAX_API +// GET(INFINI_DEVICE_METAX, metax); +// #endif +// #ifdef ENABLE_MOORE_API +// GET(INFINI_DEVICE_MOORE, moore); +// #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTanhshrink( + infiniopTanhshrinkDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +// #ifdef ENABLE_ILUVATAR_API +// CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +// #endif +// #ifdef ENABLE_METAX_API +// CALCULATE(INFINI_DEVICE_METAX, metax); +// #endif +// #ifdef ENABLE_MOORE_API +// CALCULATE(INFINI_DEVICE_MOORE, moore); +// #endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanhshrinkDescriptor(infiniopTanhshrinkDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: 
\ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +// #ifdef ENABLE_ILUVATAR_API +// DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +// #endif +// #ifdef ENABLE_METAX_API +// DELETE(INFINI_DEVICE_METAX, metax); +// #endif +// #ifdef ENABLE_MOORE_API +// DELETE(INFINI_DEVICE_MOORE, moore); +// #endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infinicore/ops/tan.py b/test/infinicore/ops/tan.py index c9948c3c6..4c786d114 100644 --- a/test/infinicore/ops/tan.py +++ b/test/infinicore/ops/tan.py @@ -97,9 +97,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.tan(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.tan(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + return infinicore.tan(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/tanhshrink.py b/test/infinicore/ops/tanhshrink.py index ca3559f22..f85e36446 100644 --- a/test/infinicore/ops/tanhshrink.py +++ b/test/infinicore/ops/tanhshrink.py @@ -68,9 +68,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.tanhshrink(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.tanhshrink(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + return infinicore.nn.functional.tanhshrink(*args, **kwargs) def main(): From a9f58453a224b8cfa373e843875fb4db845c78ad Mon Sep 17 00:00:00 2001 From: JBFYS-XD Date: Sun, 7 
Dec 2025 17:57:50 +0800 Subject: [PATCH 05/12] fix: Optimize the cat operator --- src/infinicore/ops/cat/cat.cc | 131 +++++++++++++++++++++++++--------- 1 file changed, 96 insertions(+), 35 deletions(-) diff --git a/src/infinicore/ops/cat/cat.cc b/src/infinicore/ops/cat/cat.cc index 71de1e5f6..908f5f457 100644 --- a/src/infinicore/ops/cat/cat.cc +++ b/src/infinicore/ops/cat/cat.cc @@ -1,50 +1,110 @@ #include "infinicore/ops/cat.hpp" #include "infinicore/context/context.hpp" #include - +#include namespace infinicore::op { namespace { - void low_dim_copy(Tensor& tensor, int dim, Tensor& out, std::byte* tensor_ptr, std::byte* out_ptr, int depth) { - if (depth != out->ndim()) { - std::byte* now_tensor_ptr = tensor_ptr; - std::byte* now_out_ptr = out_ptr; - for (int i = 0; i < tensor->shape()[depth]; i ++) { - low_dim_copy(tensor, dim, out, now_tensor_ptr, now_out_ptr, depth + 1); - now_tensor_ptr += tensor->stride(depth) * dsize(tensor->dtype()); - now_out_ptr += out->stride(depth) * dsize(out->dtype()); +class CatInfo { + + CatInfo() = default; + +public: + + int dim; + int ndim; + size_t tensors_size; + + std::vector contiguous_dim; + std::vector copy_size; + + static CatInfo create(Tensor& out, std::vector& tensors, int dim) { + + int ndim = out->ndim(); + size_t tensors_size = tensors.size(); + + std::vector contiguous_dim(tensors_size, ndim); + std::vector copy_size(tensors_size, dsize(out->dtype())); + + for (int i = 0; i < tensors_size; i ++) { + if (tensors[i]->ndim() == 1) continue; + if (tensors[i]->stride(ndim - 1) == 1) { + contiguous_dim[i] = ndim - 1; + for (int j = ndim - 2; j >= dim; j --) { + if (tensors[i]->stride(j) == tensors[i]->stride(j + 1) * tensors[i]->shape()[j + 1]) { + contiguous_dim[i] = j; + } + } + } + for (int j = contiguous_dim[i]; j < ndim; j ++) { + copy_size[i] *= tensors[i]->shape()[j]; } - } else { - Size data_size = dsize(out->dtype()); - if (out->device().getType() == Device::Type::CPU) - std::memcpy(out_ptr, tensor_ptr, 
data_size); - else - context::memcpyD2D(out_ptr, tensor_ptr, data_size); } + + return CatInfo{dim, ndim, tensors_size, contiguous_dim, copy_size}; } +}; - void high_dim_split(std::vector& tensors, int dim, Tensor& out, std::vector tensors_ptr, std::byte* out_ptr, int depth) { - if (depth != dim) { - std::vector now_tensors_ptr = tensors_ptr; - std::byte* now_out_ptr = out_ptr; - for (int i = 0; i < out->shape()[depth]; i ++) { - high_dim_split(tensors, dim, out, now_tensors_ptr, now_out_ptr, depth + 1); - for (int i = 0; i < tensors.size(); i ++) { - if (tensors[i]->ndim() == 1) continue; - now_tensors_ptr[i] += tensors[i]->stride(depth) * dsize(tensors[i]->dtype()); - } - now_out_ptr += out->stride(depth) * dsize(out->dtype()); - } + +void low_dim_copy( + CatInfo &info, Tensor& tensor, Tensor& out, + std::byte *tensor_ptr, std::byte *out_ptr, + int depth, int tensor_pos) { + + if (depth != info.contiguous_dim[tensor_pos]) { + std::byte* now_tensor_ptr = tensor_ptr; + std::byte* now_out_ptr = out_ptr; + + for (int i = 0; i < tensor->shape()[depth]; i ++) { + + low_dim_copy(info, tensor, out, now_tensor_ptr, now_out_ptr, depth + 1, tensor_pos); + + now_tensor_ptr += tensor->stride(depth) * dsize(tensor->dtype()); + now_out_ptr += out->stride(depth) * dsize(out->dtype()); + } + } else { + if (out->device().getType() == Device::Type::CPU) { + + std::memcpy(out_ptr, tensor_ptr, info.copy_size[tensor_pos]); } else { - std::byte* now_out_ptr = out_ptr; - for (int i = 0; i < tensors.size(); i ++) { + + context::memcpyD2D(out_ptr, tensor_ptr, info.copy_size[tensor_pos]); + } + } +} + +void high_dim_split( + CatInfo &info, std::vector& tensors, Tensor& out, + std::vector tensors_ptr, std::byte* out_ptr, + int depth) { + + if (depth != info.dim) { + std::vector now_tensors_ptr = tensors_ptr; + std::byte* now_out_ptr = out_ptr; + + for (int i = 0; i < out->shape()[depth]; i ++) { + + high_dim_split(info, tensors, out, now_tensors_ptr, now_out_ptr, depth + 1); + + for (int i = 
0; i < info.tensors_size; i ++) { if (tensors[i]->ndim() == 1) continue; - low_dim_copy(tensors[i], dim, out, tensors_ptr[i], now_out_ptr, depth); - now_out_ptr += tensors[i]->shape()[depth] * out->stride(depth) * dsize(out->dtype()); + now_tensors_ptr[i] += tensors[i]->stride(depth) * dsize(tensors[i]->dtype()); } + now_out_ptr += out->stride(depth) * dsize(out->dtype()); + } + } else { + std::byte* now_out_ptr = out_ptr; + + for (int i = 0; i < info.tensors_size; i ++) { + if (tensors[i]->ndim() == 1) continue; + + low_dim_copy(info, tensors[i], out, tensors_ptr[i], now_out_ptr, depth, i); + + now_out_ptr += tensors[i]->shape()[depth] * out->stride(depth) * dsize(out->dtype()); } } +} } // namespace @@ -67,13 +127,13 @@ Tensor cat(std::vector tensors, int dim) { } void cat_(Tensor out, std::vector tensors, int dim) { + // assert the parameter properties are correct. assert(tensors.size() >= 2); int ndim = out->ndim(); assert(-ndim <= dim && dim < ndim); dim = (dim + ndim) % ndim; - - Size dim_shape = 0; + size_t dim_shape = 0; for (auto& tensor : tensors) { assert(tensor->ndim() == ndim || tensors[i]->ndim() == 1); if (tensor->ndim() == 1) { @@ -90,14 +150,15 @@ void cat_(Tensor out, std::vector tensors, int dim) { } assert(dim_shape == out->shape()[dim]); + // Get info + CatInfo info = CatInfo::create(out, tensors, dim); std::vector tensors_ptr(tensors.size()); - for (int i = 0; i < tensors.size(); i ++) { tensors_ptr[i] = tensors[i]->data(); } std::byte* out_ptr = out->data(); - high_dim_split(tensors, dim, out, tensors_ptr, out_ptr, 0); + high_dim_split(info, tensors, out, tensors_ptr, out_ptr, 0); } From 6b2410c86a0377b2dd1746455e9894d14851e849 Mon Sep 17 00:00:00 2001 From: JBFYS-XD Date: Tue, 9 Dec 2025 18:04:44 +0800 Subject: [PATCH 06/12] feature: Added device moore support --- src/infinicore/ops/inner/inner_infiniop.cc | 4 +- .../masked_select/masked_select_infiniop.cc | 4 +- src/infinicore/ops/tan/tan_infiniop.cc | 8 +- 
.../ops/tanhshrink/tanhshrink_infiniop.cc | 8 +- src/infiniop/ops/inner/cuda/kernel.cuh | 1 - src/infiniop/ops/inner/moore/inner_kernel.h | 45 +++++ src/infiniop/ops/inner/moore/inner_moore.h | 7 + src/infiniop/ops/inner/moore/inner_moore.mu | 123 ++++++++++++++ src/infiniop/ops/inner/nvidia/inner_nvidia.cu | 3 - src/infiniop/ops/inner/operator.cc | 30 ++-- .../ops/masked_select/cuda/kernel.cuh | 1 - .../moore/masked_select_kernel.h | 101 ++++++++++++ .../masked_select/moore/masked_select_moore.h | 7 + .../moore/masked_select_moore.mu | 156 ++++++++++++++++++ .../nvidia/masked_select_nvidia.cu | 5 +- src/infiniop/ops/masked_select/operator.cc | 30 ++-- src/infiniop/ops/tan/cuda/kernel.cuh | 2 - src/infiniop/ops/tan/moore/tan_moore.h | 8 + src/infiniop/ops/tan/moore/tan_moore.mu | 56 +++++++ src/infiniop/ops/tan/moore/tan_moore_kernel.h | 28 ++++ src/infiniop/ops/tan/nvidia/tan_nvidia.cu | 2 +- src/infiniop/ops/tan/operator.cc | 30 ++-- src/infiniop/ops/tanhshrink/cuda/kernel.cuh | 2 - .../ops/tanhshrink/moore/tanhshrink_moore.h | 8 + .../ops/tanhshrink/moore/tanhshrink_moore.mu | 56 +++++++ .../moore/tanhshrink_moore_kernel.h | 29 ++++ src/infiniop/ops/tanhshrink/operator.cc | 30 ++-- 27 files changed, 704 insertions(+), 80 deletions(-) create mode 100644 src/infiniop/ops/inner/moore/inner_kernel.h create mode 100644 src/infiniop/ops/inner/moore/inner_moore.h create mode 100644 src/infiniop/ops/inner/moore/inner_moore.mu create mode 100644 src/infiniop/ops/masked_select/moore/masked_select_kernel.h create mode 100644 src/infiniop/ops/masked_select/moore/masked_select_moore.h create mode 100644 src/infiniop/ops/masked_select/moore/masked_select_moore.mu create mode 100644 src/infiniop/ops/tan/moore/tan_moore.h create mode 100644 src/infiniop/ops/tan/moore/tan_moore.mu create mode 100644 src/infiniop/ops/tan/moore/tan_moore_kernel.h create mode 100644 src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.h create mode 100644 
src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu create mode 100644 src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h diff --git a/src/infinicore/ops/inner/inner_infiniop.cc b/src/infinicore/ops/inner/inner_infiniop.cc index eb44062d6..4d60ff1f9 100644 --- a/src/infinicore/ops/inner/inner_infiniop.cc +++ b/src/infinicore/ops/inner/inner_infiniop.cc @@ -48,9 +48,9 @@ void calculate(Tensor out, Tensor input, Tensor other) { static bool registered = []() { Inner::dispatcher().registerDevice({ Device::Type::CPU, - Device::Type::NVIDIA + Device::Type::NVIDIA, // Device::Type::METAX, - // Device::Type::MOORE, + Device::Type::MOORE // Device::Type::ILUVATAR }, &calculate, false); return true; diff --git a/src/infinicore/ops/masked_select/masked_select_infiniop.cc b/src/infinicore/ops/masked_select/masked_select_infiniop.cc index 352ef0b8f..8c3f0e523 100644 --- a/src/infinicore/ops/masked_select/masked_select_infiniop.cc +++ b/src/infinicore/ops/masked_select/masked_select_infiniop.cc @@ -48,9 +48,9 @@ void calculate(Tensor input, Tensor mask, void **data_ptr, size_t *dlen_ptr) { static bool registered = []() { MaskedSelect::dispatcher().registerDevice({ Device::Type::CPU, - Device::Type::NVIDIA + Device::Type::NVIDIA, // Device::Type::METAX, - // Device::Type::MOORE, + Device::Type::MOORE // Device::Type::ILUVATAR }, &calculate, false); return true; diff --git a/src/infinicore/ops/tan/tan_infiniop.cc b/src/infinicore/ops/tan/tan_infiniop.cc index cf3fed96c..faf6898c5 100644 --- a/src/infinicore/ops/tan/tan_infiniop.cc +++ b/src/infinicore/ops/tan/tan_infiniop.cc @@ -45,7 +45,13 @@ void calculate(Tensor output, Tensor input) { } static bool registered = []() { - Tan::dispatcher().registerAll(&calculate, false); + Tan::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA, + // Device::Type::METAX, + Device::Type::MOORE + // Device::Type::ILUVATAR + }, &calculate, false); return true; }(); diff --git 
a/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc b/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc index 1a8885386..be79e2567 100644 --- a/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc +++ b/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc @@ -45,7 +45,13 @@ void calculate(Tensor output, Tensor input) { } static bool registered = []() { - Tanhshrink::dispatcher().registerAll(&calculate, false); + Tanhshrink::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA, + // Device::Type::METAX, + Device::Type::MOORE + // Device::Type::ILUVATAR + }, &calculate, false); return true; }(); diff --git a/src/infiniop/ops/inner/cuda/kernel.cuh b/src/infiniop/ops/inner/cuda/kernel.cuh index 66661db98..1616a4281 100644 --- a/src/infiniop/ops/inner/cuda/kernel.cuh +++ b/src/infiniop/ops/inner/cuda/kernel.cuh @@ -1,6 +1,5 @@ #ifndef __INNER_KERNEL_CUH__ #define __INNER_KERNEL_CUH__ -#include "../../../devices/nvidia/nvidia_kernel_common.cuh" template INFINIOP_CUDA_KERNEL innerKernel( diff --git a/src/infiniop/ops/inner/moore/inner_kernel.h b/src/infiniop/ops/inner/moore/inner_kernel.h new file mode 100644 index 000000000..a9c4150ce --- /dev/null +++ b/src/infiniop/ops/inner/moore/inner_kernel.h @@ -0,0 +1,45 @@ +#ifndef __INNER_KERNEL_H__ +#define __INNER_KERNEL_H__ + +template +INFINIOP_MOORE_KERNEL innerKernel( + const T *input, const T *other, T *out, + size_t total_elements, size_t oper_len, size_t *out_shape, + ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, + size_t input_ndim, size_t other_ndim, size_t out_ndim) { + + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; + if (out_index >= total_elements) return; + + size_t out_offset = device::moore::indexToOffset(out_index, out_ndim, out_shape, out_strides); + + size_t out_dim_pos = out_ndim - 1; + size_t index = out_index; + + ptrdiff_t input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)other_ndim - 2; i >= 0; i --) { + 
other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; + index /= out_shape[out_dim_pos --]; + } + for (int i = (int)input_ndim - 2; i >= 0; i --) { + input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; + index /= out_shape[out_dim_pos --]; + } + + float tmp = 0.; + for (size_t i = 0; i < oper_len; i ++) { + // if constexpr (std::is_same_v || std::is_same_v) { + tmp += static_cast(input[input_offset]) * static_cast(other[other_offset]); + // } else { + // tmp += input[input_offset] * other[other_offset]; + // } + input_offset += input_strides[input_ndim - 1]; + other_offset += other_strides[other_ndim - 1]; + } + + out[out_offset] = static_cast(tmp); +} + +#endif // __INNER_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/moore/inner_moore.h b/src/infiniop/ops/inner/moore/inner_moore.h new file mode 100644 index 000000000..a508a6718 --- /dev/null +++ b/src/infiniop/ops/inner/moore/inner_moore.h @@ -0,0 +1,7 @@ +#ifndef __INNER_MOORE_H__ +#define __INNER_MOORE_H__ +#include "../inner.h" + +DESCRIPTOR(moore); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/moore/inner_moore.mu b/src/infiniop/ops/inner/moore/inner_moore.mu new file mode 100644 index 000000000..530a90713 --- /dev/null +++ b/src/infiniop/ops/inner/moore/inner_moore.mu @@ -0,0 +1,123 @@ +#include "../../../devices/moore/moore_common.h" +#include "inner_moore.h" + +#include "../../../devices/moore/moore_kernel_common.h" + +#include "inner_kernel.h" + +namespace op::inner::moore { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto result = InnerInfo::create(out_desc, input_desc, other_desc); + CHECK_RESULT(result); + auto info = 
result.take(); + size_t workspace_size = 0; + // out_shape + workspace_size += info.out_ndim * sizeof(size_t); + // input, other, out strides + workspace_size += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const InnerInfo &info, + const T *input, const T *other, T *out, + musaStream_t stream, void *workspace, size_t workspace_size) { + + size_t input_ndim = info.input_ndim; + size_t other_ndim = info.other_ndim; + size_t out_ndim = info.out_ndim; + size_t total_elements = info.total_elements; + size_t oper_len = info.oper_len; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *out_shape_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += out_ndim * sizeof(size_t); + + ptrdiff_t *input_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); + ptrdiff_t *other_strides_musa = input_strides_musa + input_ndim; + ptrdiff_t *out_strides_musa = other_strides_musa + other_ndim; + workspace_offset += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + + assert(workspace_offset == workspace_size); + + CHECK_MOORE(musaMemcpyAsync(out_shape_musa, info.out_shape.data(), out_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(input_strides_musa, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(other_strides_musa, info.other_strides.data(), other_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(out_strides_musa, info.out_strides.data(), out_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = 
(total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + innerKernel<<>>( + input, other, out, + total_elements, oper_len, out_shape_musa, + input_strides_musa, other_strides_musa, out_strides_musa, + input_ndim, other_ndim, out_ndim); + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream_) const { + + musaStream_t stream = (musaStream_t)stream_; +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T *)other, (T *)out, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __mt_bfloat16); \ + else if(_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if(_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { + CALCULATE_INNER_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { + CALCULATE_INNER_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_2048) { + CALCULATE_INNER_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_2048) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/inner/nvidia/inner_nvidia.cu b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu index db8927ca7..9152405e2 100644 --- a/src/infiniop/ops/inner/nvidia/inner_nvidia.cu +++ b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu @@ -2,9 +2,6 @@ #include "inner_nvidia.cuh" #include "../../../devices/nvidia/nvidia_kernel_common.cuh" -#include - -#include "../../../reduce/cuda/reduce.cuh" 
#include "../cuda/kernel.cuh" diff --git a/src/infiniop/ops/inner/operator.cc b/src/infiniop/ops/inner/operator.cc index 7b9cf20c3..0d5736d16 100644 --- a/src/infiniop/ops/inner/operator.cc +++ b/src/infiniop/ops/inner/operator.cc @@ -20,9 +20,9 @@ // #ifdef ENABLE_KUNLUN_API // #include "kunlun/inner_kunlun.h" // #endif -// #ifdef ENABLE_MOORE_API -// #include "moore/inner_moore.h" -// #endif +#ifdef ENABLE_MOORE_API +#include "moore/inner_moore.h" +#endif __C infiniStatus_t infiniopCreateInnerDescriptor( infiniopHandle_t handle, @@ -68,9 +68,9 @@ __C infiniStatus_t infiniopCreateInnerDescriptor( // #ifdef ENABLE_KUNLUN_API // CREATE(INFINI_DEVICE_KUNLUN, kunlun) // #endif -// #ifdef ENABLE_MOORE_API -// CREATE(INFINI_DEVICE_MOORE, moore) -// #endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore) +#endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -110,9 +110,9 @@ __C infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, // #ifdef ENABLE_KUNLUN_API // GET(INFINI_DEVICE_KUNLUN, kunlun) // #endif -// #ifdef ENABLE_MOORE_API -// GET(INFINI_DEVICE_MOORE, moore) -// #endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -158,9 +158,9 @@ __C infiniStatus_t infiniopInner( // #ifdef ENABLE_KUNLUN_API // CALCULATE(INFINI_DEVICE_KUNLUN, kunlun) // #endif -// #ifdef ENABLE_MOORE_API -// CALCULATE(INFINI_DEVICE_MOORE, moore) -// #endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore) +#endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -200,9 +200,9 @@ __C infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc // #ifdef ENABLE_KUNLUN_API // DESTROY(INFINI_DEVICE_KUNLUN, kunlun) // #endif -// #ifdef ENABLE_MOORE_API -// DESTROY(INFINI_DEVICE_MOORE, moore) -// #endif +#ifdef ENABLE_MOORE_API + DESTROY(INFINI_DEVICE_MOORE, moore) +#endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } diff --git 
a/src/infiniop/ops/masked_select/cuda/kernel.cuh b/src/infiniop/ops/masked_select/cuda/kernel.cuh index bd4e4c655..fc7485f31 100644 --- a/src/infiniop/ops/masked_select/cuda/kernel.cuh +++ b/src/infiniop/ops/masked_select/cuda/kernel.cuh @@ -1,6 +1,5 @@ #ifndef __MASKED_SELECT_KERNEL_CUH__ #define __MASKED_SELECT_KERNEL_CUH__ -#include "../../../devices/nvidia/nvidia_kernel_common.cuh" template INFINIOP_CUDA_KERNEL maskedSelectGetMarkScanOnceKernel( diff --git a/src/infiniop/ops/masked_select/moore/masked_select_kernel.h b/src/infiniop/ops/masked_select/moore/masked_select_kernel.h new file mode 100644 index 000000000..901cee34a --- /dev/null +++ b/src/infiniop/ops/masked_select/moore/masked_select_kernel.h @@ -0,0 +1,101 @@ +#ifndef __MASKED_SELECT_KERNEL_CUH__ +#define __MASKED_SELECT_KERNEL_CUH__ + +template +INFINIOP_MOORE_KERNEL maskedSelectGetMarkScanOnceKernel( + const bool *mask, size_t *mark_scan, size_t total_elements, + size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < total_elements) { + size_t mask_offset = device::moore::indexToOffset(index, ndim, shape, mask_strides); + smem[tid] = mask[mask_offset]; + } else { + smem[tid] = 0; + } + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + size_t temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_MOORE_KERNEL maskedSelectScanWithStrideKernel( + size_t *mark_scan, size_t total_elements, size_t stride) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = (blockDim.x * blockIdx.x + threadIdx.x + 1) * stride - 1; + smem[tid] = index < total_elements ? 
mark_scan[index] : 0.; + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + size_t temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_MOORE_KERNEL maskedSelectCountScanResultKernel( + size_t *mark_scan, size_t *scan_result, size_t total_elements) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= total_elements) + return; + + size_t val = mark_scan[index]; + + for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { + size_t now_stride_block = (index + 1) / s; + size_t pre_stride_block = now_stride_block * s - 1; + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + continue; + val += mark_scan[pre_stride_block]; + } + + scan_result[index] = val; +} + +template +INFINIOP_MOORE_KERNEL maskedSelectGetDataKernel( + const T *input, const bool *mask, size_t *scan_result, T *data, size_t total_elements, + size_t *shape, ptrdiff_t *input_strides, ptrdiff_t *mask_strides, size_t ndim) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + + if (index >= total_elements) + return; + + size_t input_offset = device::moore::indexToOffset(index, ndim, shape, input_strides); + size_t mask_offset = device::moore::indexToOffset(index, ndim, shape, mask_strides); + + if (mask[mask_offset]) { + data[scan_result[index] - 1] = input[input_offset]; + } +} + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/moore/masked_select_moore.h b/src/infiniop/ops/masked_select/moore/masked_select_moore.h new file mode 100644 index 000000000..24548d8fc --- /dev/null +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore.h @@ -0,0 +1,7 @@ +#ifndef __MASKED_SELECT_MOORE_H__ +#define __MASKED_SELECT_MOORE_H__ +#include "../masked_select.h" + +DESCRIPTOR(moore); + +#endif \ No newline at end of file diff 
--git a/src/infiniop/ops/masked_select/moore/masked_select_moore.mu b/src/infiniop/ops/masked_select/moore/masked_select_moore.mu new file mode 100644 index 000000000..4434ce187 --- /dev/null +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore.mu @@ -0,0 +1,156 @@ +#include "../../../devices/moore/moore_common.h" +#include "masked_select_moore.h" + +#include "../../../devices/moore/moore_kernel_common.h" + +// #include "infinicore/context/context.hpp" +#include "masked_select_kernel.h" + +namespace op::masked_select::moore { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // shape size + workspace_size += info.ndim * sizeof(size_t); + // strides size * 2 + workspace_size += info.ndim * sizeof(ptrdiff_t) * 2; + // size_t mark_scan + workspace_size += info.total_elements * sizeof(size_t); + // size_t mark_result + workspace_size += info.total_elements * sizeof(size_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const MaskedSelectInfo &info, + const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, + musaStream_t stream, void *workspace, size_t workspace_size) { + + size_t ndim = info.ndim; + size_t total_elements = info.total_elements; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *shape_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(size_t); + + ptrdiff_t 
*input_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + ptrdiff_t *mask_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + size_t *mark_scan = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + size_t *scan_result = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + assert(workspace_offset == workspace_size); + + CHECK_MOORE(musaMemcpyAsync(shape_musa, info.shape.data(), ndim * sizeof(size_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(input_strides_musa, info.input_strides.data(), ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(mask_strides_musa, info.mask_strides.data(), ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + block_size - 1) / block_size; + + maskedSelectGetMarkScanOnceKernel<<>>( + mask, mark_scan, total_elements, + shape_musa, mask_strides_musa, ndim); + CHECK_MOORE(musaDeviceSynchronize()); + + for (size_t stride = BLOCK_SIZE; stride < total_elements; stride *= BLOCK_SIZE) { + size_t stride_elements = (total_elements + stride - 1) / stride; + size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + maskedSelectScanWithStrideKernel<<>>( + mark_scan, total_elements, stride); + CHECK_MOORE(musaDeviceSynchronize()); + } + if (total_elements > BLOCK_SIZE) { + maskedSelectCountScanResultKernel<<>>( + mark_scan, scan_result, total_elements); + CHECK_MOORE(musaDeviceSynchronize()); + } else { + scan_result = mark_scan; + } + + + CHECK_MOORE(musaMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), musaMemcpyDeviceToHost, stream)); + + CHECK_MOORE(musaMalloc(data_ptr, *dlen_ptr * sizeof(T))); + + // 
context::setDevice(MOORE); + // context::allocateMemory(data_ptr, *dlen_ptr * sizeof(T)); + + maskedSelectGetDataKernel<<>>( + input, mask, scan_result, (T *)*data_ptr, total_elements, + shape_musa, input_strides_musa, mask_strides_musa, ndim); + CHECK_MOORE(musaDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen_ptr, + void *stream_) const { + + musaStream_t stream = (musaStream_t)stream_; +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_1024); + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_512); + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_2048) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_2048); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} diff --git a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu index 3b5d540d6..44e3592ca 100644 --- a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu +++ b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu @@ -2,9 +2,6 @@ #include "masked_select_nvidia.cuh" #include "../../../devices/nvidia/nvidia_kernel_common.cuh" -#include - -#include "../../../reduce/cuda/reduce.cuh" #include 
"../cuda/kernel.cuh" @@ -103,7 +100,7 @@ infiniStatus_t launchKernel( CHECK_CUDA(cudaMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), cudaMemcpyDeviceToHost, stream)); - CHECK_CUDA(cudaMallocAsync(data_ptr, *dlen_ptr * sizeof(T), stream)); + CHECK_CUDA(cudaMalloc(data_ptr, *dlen_ptr * sizeof(T))); maskedSelectGetDataKernel<<>>( input, mask, scan_result, (T *)*data_ptr, total_elements, diff --git a/src/infiniop/ops/masked_select/operator.cc b/src/infiniop/ops/masked_select/operator.cc index bdd40972a..5afe0f106 100644 --- a/src/infiniop/ops/masked_select/operator.cc +++ b/src/infiniop/ops/masked_select/operator.cc @@ -20,9 +20,9 @@ // #ifdef ENABLE_KUNLUN_API // #include "kunlun/masked_select_kunlun.h" // #endif -// #ifdef ENABLE_MOORE_API -// #include "moore/masked_select_moore.h" -// #endif +#ifdef ENABLE_MOORE_API +#include "moore/masked_select_moore.h" +#endif __C infiniStatus_t infiniopCreateMaskedSelectDescriptor( infiniopHandle_t handle, @@ -66,9 +66,9 @@ __C infiniStatus_t infiniopCreateMaskedSelectDescriptor( // #ifdef ENABLE_KUNLUN_API // CREATE(INFINI_DEVICE_KUNLUN, kunlun) // #endif -// #ifdef ENABLE_MOORE_API -// CREATE(INFINI_DEVICE_MOORE, moore) -// #endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore) +#endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -108,9 +108,9 @@ __C infiniStatus_t infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDesc // #ifdef ENABLE_KUNLUN_API // GET(INFINI_DEVICE_KUNLUN, kunlun) // #endif -// #ifdef ENABLE_MOORE_API -// GET(INFINI_DEVICE_MOORE, moore) -// #endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -157,9 +157,9 @@ __C infiniStatus_t infiniopMaskedSelect( // #ifdef ENABLE_KUNLUN_API // CALCULATE(INFINI_DEVICE_KUNLUN, kunlun) // #endif -// #ifdef ENABLE_MOORE_API -// CALCULATE(INFINI_DEVICE_MOORE, moore) -// #endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore) +#endif 
} return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -199,9 +199,9 @@ __C infiniStatus_t infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDes // #ifdef ENABLE_KUNLUN_API // DESTROY(INFINI_DEVICE_KUNLUN, kunlun) // #endif -// #ifdef ENABLE_MOORE_API -// DESTROY(INFINI_DEVICE_MOORE, moore) -// #endif +#ifdef ENABLE_MOORE_API + DESTROY(INFINI_DEVICE_MOORE, moore) +#endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } diff --git a/src/infiniop/ops/tan/cuda/kernel.cuh b/src/infiniop/ops/tan/cuda/kernel.cuh index 12c6d59e7..757372a93 100644 --- a/src/infiniop/ops/tan/cuda/kernel.cuh +++ b/src/infiniop/ops/tan/cuda/kernel.cuh @@ -1,8 +1,6 @@ #ifndef __TAN_CUDA_H__ #define __TAN_CUDA_H__ -#include - namespace op::tan::cuda { typedef struct TanOp { diff --git a/src/infiniop/ops/tan/moore/tan_moore.h b/src/infiniop/ops/tan/moore/tan_moore.h new file mode 100644 index 000000000..c261e4a69 --- /dev/null +++ b/src/infiniop/ops/tan/moore/tan_moore.h @@ -0,0 +1,8 @@ +#ifndef __TAN_MOORE_API_H__ +#define __TAN_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(tan, moore) + +#endif // __TAN_MOORE_API_H__ diff --git a/src/infiniop/ops/tan/moore/tan_moore.mu b/src/infiniop/ops/tan/moore/tan_moore.mu new file mode 100644 index 000000000..ab35146ff --- /dev/null +++ b/src/infiniop/ops/tan/moore/tan_moore.mu @@ -0,0 +1,56 @@ +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "tan_moore_kernel.h" +#include "tan_moore.h" + +namespace op::tan::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, 
INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::TanOp, mt_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::TanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::TanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::moore diff --git a/src/infiniop/ops/tan/moore/tan_moore_kernel.h b/src/infiniop/ops/tan/moore/tan_moore_kernel.h new file mode 100644 index 000000000..ad97cb4f6 --- /dev/null +++ b/src/infiniop/ops/tan/moore/tan_moore_kernel.h @@ -0,0 +1,28 @@ +#ifndef __TAN_MOORE_KERNEL_H__ +#define __TAN_MOORE_KERNEL_H__ + +namespace op::tan::moore { + +typedef struct TanOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // BF16 + const float x_f = __bfloat162float(x); + return __float2bfloat16(__tanf(x_f)); + } else if constexpr (std::is_same_v) { + // FP16 + const float x_f = __half2float(x); + return __float2half(__tanf(x_f)); + } else if constexpr (std::is_same_v) { + // FP32 + return __tanf(x); + } + } +} TanOp; + +} // namespace op::tan::moore + +#endif // __TAN_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cu b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu index 
510f3d416..d5415cb62 100644 --- a/src/infiniop/ops/tan/nvidia/tan_nvidia.cu +++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu @@ -20,7 +20,7 @@ infiniStatus_t Descriptor::create( const auto &output_shape = out_desc->shape(); const auto &input_shape = input_desc->shape(); - CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); CHECK_SAME_SHAPE(output_shape, input_shape); diff --git a/src/infiniop/ops/tan/operator.cc b/src/infiniop/ops/tan/operator.cc index fc232fb1c..1cd652c73 100644 --- a/src/infiniop/ops/tan/operator.cc +++ b/src/infiniop/ops/tan/operator.cc @@ -11,9 +11,9 @@ // #ifdef ENABLE_METAX_API // #include "metax/tan_metax.h" // #endif -// #ifdef ENABLE_MOORE_API -// #include "moore/tan_moore.h" -// #endif +#ifdef ENABLE_MOORE_API +#include "moore/tan_moore.h" +#endif __C infiniStatus_t infiniopCreateTanDescriptor( infiniopHandle_t handle, @@ -43,9 +43,9 @@ __C infiniStatus_t infiniopCreateTanDescriptor( // #ifdef ENABLE_METAX_API // CREATE(INFINI_DEVICE_METAX, metax); // #endif -// #ifdef ENABLE_MOORE_API -// CREATE(INFINI_DEVICE_MOORE, moore); -// #endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -74,9 +74,9 @@ __C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, siz // #ifdef ENABLE_METAX_API // GET(INFINI_DEVICE_METAX, metax); // #endif -// #ifdef ENABLE_MOORE_API -// GET(INFINI_DEVICE_MOORE, moore); -// #endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -112,9 +112,9 @@ __C infiniStatus_t infiniopTan( // #ifdef ENABLE_METAX_API // CALCULATE(INFINI_DEVICE_METAX, metax); // #endif -// #ifdef ENABLE_MOORE_API -// CALCULATE(INFINI_DEVICE_MOORE, moore); -// #endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif default: 
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -145,9 +145,9 @@ infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc) { // #ifdef ENABLE_METAX_API // DELETE(INFINI_DEVICE_METAX, metax); // #endif -// #ifdef ENABLE_MOORE_API -// DELETE(INFINI_DEVICE_MOORE, moore); -// #endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; diff --git a/src/infiniop/ops/tanhshrink/cuda/kernel.cuh b/src/infiniop/ops/tanhshrink/cuda/kernel.cuh index 32a79757b..83ae43c1a 100644 --- a/src/infiniop/ops/tanhshrink/cuda/kernel.cuh +++ b/src/infiniop/ops/tanhshrink/cuda/kernel.cuh @@ -1,8 +1,6 @@ #ifndef __TANHSHRINK_CUDA_H__ #define __TANHSHRINK_CUDA_H__ -#include - namespace op::tanhshrink::cuda { typedef struct TanhshrinkOp { diff --git a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.h b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.h new file mode 100644 index 000000000..5a01a76ce --- /dev/null +++ b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.h @@ -0,0 +1,8 @@ +#ifndef __TANHSHRINK_MOORE_API_H__ +#define __TANHSHRINK_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, moore) + +#endif // __TANHSHRINK_MOORE_API_H__ diff --git a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu new file mode 100644 index 000000000..37bc31f24 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu @@ -0,0 +1,56 @@ +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "tanhshrink_moore_kernel.h" +#include "tanhshrink_moore.h" + +namespace op::tanhshrink::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const 
auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::TanhshrinkOp, mt_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::TanhshrinkOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::TanhshrinkOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::moore diff --git a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h new file mode 100644 index 000000000..b35f88048 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h @@ -0,0 +1,29 @@ +#ifndef __TANHSHRINK_MOORE_KERNEL_H__ +#define __TANHSHRINK_MOORE_KERNEL_H__ + +namespace op::tanhshrink::moore { + +typedef struct TanhshrinkOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // BF16 + const float x_f = __bfloat162float(x); + + return __float2bfloat16(x_f - tanhf(x_f)); + } else if constexpr (std::is_same_v) { + // FP16 + const float x_f = __half2float(x); + return __float2half(x_f - tanhf(x_f)); 
+ } else if constexpr (std::is_same_v) { + // FP32 + return x - tanhf(x); + } + } +} TanhshrinkOp; + +} // namespace op::tanhshrink::moore + +#endif // __TANHSHRINK_MOORE_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/tanhshrink/operator.cc b/src/infiniop/ops/tanhshrink/operator.cc index 87de46704..f8c9a722c 100644 --- a/src/infiniop/ops/tanhshrink/operator.cc +++ b/src/infiniop/ops/tanhshrink/operator.cc @@ -11,9 +11,9 @@ // #ifdef ENABLE_METAX_API // #include "metax/tanhshrink_metax.h" // #endif -// #ifdef ENABLE_MOORE_API -// #include "moore/tanhshrink_moore.h" -// #endif +#ifdef ENABLE_MOORE_API +#include "moore/tanhshrink_moore.h" +#endif __C infiniStatus_t infiniopCreateTanhshrinkDescriptor( infiniopHandle_t handle, @@ -43,9 +43,9 @@ __C infiniStatus_t infiniopCreateTanhshrinkDescriptor( // #ifdef ENABLE_METAX_API // CREATE(INFINI_DEVICE_METAX, metax); // #endif -// #ifdef ENABLE_MOORE_API -// CREATE(INFINI_DEVICE_MOORE, moore); -// #endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -74,9 +74,9 @@ __C infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescript // #ifdef ENABLE_METAX_API // GET(INFINI_DEVICE_METAX, metax); // #endif -// #ifdef ENABLE_MOORE_API -// GET(INFINI_DEVICE_MOORE, moore); -// #endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -112,9 +112,9 @@ __C infiniStatus_t infiniopTanhshrink( // #ifdef ENABLE_METAX_API // CALCULATE(INFINI_DEVICE_METAX, metax); // #endif -// #ifdef ENABLE_MOORE_API -// CALCULATE(INFINI_DEVICE_MOORE, moore); -// #endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -145,9 +145,9 @@ infiniopDestroyTanhshrinkDescriptor(infiniopTanhshrinkDescriptor_t desc) { // #ifdef ENABLE_METAX_API // DELETE(INFINI_DEVICE_METAX, metax); // 
#endif -// #ifdef ENABLE_MOORE_API -// DELETE(INFINI_DEVICE_MOORE, moore); -// #endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; From 6eaf83952f27beda96ee4f9c77920e9c82c7cef8 Mon Sep 17 00:00:00 2001 From: JBFYS-XD Date: Thu, 11 Dec 2025 07:54:36 +0000 Subject: [PATCH 07/12] feature: Added device metax support --- src/infinicore/ops/inner/inner_infiniop.cc | 6 +- .../masked_select/masked_select_infiniop.cc | 6 +- src/infinicore/ops/tan/tan_infiniop.cc | 6 +- .../ops/tanhshrink/tanhshrink_infiniop.cc | 6 +- src/infiniop/ops/inner/metax/inner_metax.h | 7 + src/infiniop/ops/inner/metax/inner_metax.maca | 123 ++++++++++++++ .../ops/inner/metax/inner_metax_kernel.h | 59 +++++++ src/infiniop/ops/inner/moore/inner_moore.mu | 2 +- .../{inner_kernel.h => inner_moore_kernel.h} | 4 +- src/infiniop/ops/inner/operator.cc | 123 +++----------- .../masked_select/metax/masked_select_metax.h | 7 + .../metax/masked_select_metax.maca | 154 ++++++++++++++++++ .../metax/masked_select_metax_kernel.h | 101 ++++++++++++ .../moore/masked_select_moore.mu | 2 +- ..._kernel.h => masked_select_moore_kernel.h} | 4 +- src/infiniop/ops/masked_select/operator.cc | 123 +++----------- src/infiniop/ops/tan/metax/tan_metax.h | 8 + src/infiniop/ops/tan/metax/tan_metax.maca | 56 +++++++ src/infiniop/ops/tan/moore/tan_moore.mu | 8 +- src/infiniop/ops/tan/operator.cc | 30 ++-- .../ops/tanhshrink/metax/tanhshrink_metax.h | 8 + .../tanhshrink/metax/tanhshrink_metax.maca | 56 +++++++ .../metax/tanhshrink_metax_kernel.h} | 20 +-- .../moore/tanhshrink_moore_kernel.h | 1 - src/infiniop/ops/tanhshrink/operator.cc | 30 ++-- test/infinicore/ops/inner.py | 2 +- 26 files changed, 696 insertions(+), 256 deletions(-) create mode 100644 src/infiniop/ops/inner/metax/inner_metax.h create mode 100644 src/infiniop/ops/inner/metax/inner_metax.maca create mode 100644 src/infiniop/ops/inner/metax/inner_metax_kernel.h rename 
src/infiniop/ops/inner/moore/{inner_kernel.h => inner_moore_kernel.h} (95%) create mode 100644 src/infiniop/ops/masked_select/metax/masked_select_metax.h create mode 100644 src/infiniop/ops/masked_select/metax/masked_select_metax.maca create mode 100644 src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h rename src/infiniop/ops/masked_select/moore/{masked_select_kernel.h => masked_select_moore_kernel.h} (97%) create mode 100644 src/infiniop/ops/tan/metax/tan_metax.h create mode 100644 src/infiniop/ops/tan/metax/tan_metax.maca create mode 100644 src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.h create mode 100644 src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca rename src/infiniop/ops/{tan/moore/tan_moore_kernel.h => tanhshrink/metax/tanhshrink_metax_kernel.h} (56%) diff --git a/src/infinicore/ops/inner/inner_infiniop.cc b/src/infinicore/ops/inner/inner_infiniop.cc index 4d60ff1f9..fe6b8de9a 100644 --- a/src/infinicore/ops/inner/inner_infiniop.cc +++ b/src/infinicore/ops/inner/inner_infiniop.cc @@ -49,9 +49,9 @@ static bool registered = []() { Inner::dispatcher().registerDevice({ Device::Type::CPU, Device::Type::NVIDIA, - // Device::Type::METAX, - Device::Type::MOORE - // Device::Type::ILUVATAR + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR }, &calculate, false); return true; }(); diff --git a/src/infinicore/ops/masked_select/masked_select_infiniop.cc b/src/infinicore/ops/masked_select/masked_select_infiniop.cc index 8c3f0e523..0144ce425 100644 --- a/src/infinicore/ops/masked_select/masked_select_infiniop.cc +++ b/src/infinicore/ops/masked_select/masked_select_infiniop.cc @@ -49,9 +49,9 @@ static bool registered = []() { MaskedSelect::dispatcher().registerDevice({ Device::Type::CPU, Device::Type::NVIDIA, - // Device::Type::METAX, - Device::Type::MOORE - // Device::Type::ILUVATAR + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR }, &calculate, false); return true; }(); diff --git 
a/src/infinicore/ops/tan/tan_infiniop.cc b/src/infinicore/ops/tan/tan_infiniop.cc index faf6898c5..1d7c9c874 100644 --- a/src/infinicore/ops/tan/tan_infiniop.cc +++ b/src/infinicore/ops/tan/tan_infiniop.cc @@ -48,9 +48,9 @@ static bool registered = []() { Tan::dispatcher().registerDevice({ Device::Type::CPU, Device::Type::NVIDIA, - // Device::Type::METAX, - Device::Type::MOORE - // Device::Type::ILUVATAR + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR }, &calculate, false); return true; }(); diff --git a/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc b/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc index be79e2567..ca440f875 100644 --- a/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc +++ b/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc @@ -48,9 +48,9 @@ static bool registered = []() { Tanhshrink::dispatcher().registerDevice({ Device::Type::CPU, Device::Type::NVIDIA, - // Device::Type::METAX, - Device::Type::MOORE - // Device::Type::ILUVATAR + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR }, &calculate, false); return true; }(); diff --git a/src/infiniop/ops/inner/metax/inner_metax.h b/src/infiniop/ops/inner/metax/inner_metax.h new file mode 100644 index 000000000..04a47d445 --- /dev/null +++ b/src/infiniop/ops/inner/metax/inner_metax.h @@ -0,0 +1,7 @@ +#ifndef __INNER_METAX_H__ +#define __INNER_METAX_H__ +#include "../inner.h" + +DESCRIPTOR(metax); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/metax/inner_metax.maca b/src/infiniop/ops/inner/metax/inner_metax.maca new file mode 100644 index 000000000..683f4a1e1 --- /dev/null +++ b/src/infiniop/ops/inner/metax/inner_metax.maca @@ -0,0 +1,123 @@ +#include "../../../devices/metax/metax_common.h" +#include "inner_metax.h" + +#include "../../../devices/metax/metax_kernel_common.h" + +#include "inner_metax_kernel.h" + +namespace op::inner::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + 
+Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto result = InnerInfo::create(out_desc, input_desc, other_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // out_shape + workspace_size += info.out_ndim * sizeof(size_t); + // input, other, out strides + workspace_size += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const InnerInfo &info, + const T *input, const T *other, T *out, + hcStream_t stream, void *workspace, size_t workspace_size) { + + size_t input_ndim = info.input_ndim; + size_t other_ndim = info.other_ndim; + size_t out_ndim = info.out_ndim; + size_t total_elements = info.total_elements; + size_t oper_len = info.oper_len; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *out_shape_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += out_ndim * sizeof(size_t); + + ptrdiff_t *input_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); + ptrdiff_t *other_strides_hc = input_strides_hc + input_ndim; + ptrdiff_t *out_strides_hc = other_strides_hc + other_ndim; + workspace_offset += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + + assert(workspace_offset == workspace_size); + + CHECK_METAX(hcMemcpyAsync(out_shape_hc, info.out_shape.data(), out_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_hc, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, 
stream)); + CHECK_METAX(hcMemcpyAsync(other_strides_hc, info.other_strides.data(), other_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(out_strides_hc, info.out_strides.data(), out_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + innerKernel<<>>( + input, other, out, + total_elements, oper_len, out_shape_hc, + input_strides_hc, other_strides_hc, out_strides_hc, + input_ndim, other_ndim, out_ndim); + + // CHECK_METAX(hcDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream_) const { + + hcStream_t stream = (hcStream_t)stream_; +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T *)other, (T *)out, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __hpcc_bfloat16);\ + else if(_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if(_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CALCULATE_INNER_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CALCULATE_INNER_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_512) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/inner/metax/inner_metax_kernel.h b/src/infiniop/ops/inner/metax/inner_metax_kernel.h new file mode 100644 index 000000000..52f37e1ef --- /dev/null 
+++ b/src/infiniop/ops/inner/metax/inner_metax_kernel.h @@ -0,0 +1,59 @@ +#ifndef __INNER_METAX_KERNEL_H__ +#define __INNER_METAX_KERNEL_H__ + +template +INFINIOP_METAX_KERNEL innerKernel( + const T *input, const T *other, T *out, + size_t total_elements, size_t oper_len, size_t *out_shape, + ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, + size_t input_ndim, size_t other_ndim, size_t out_ndim) { + + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; + if (out_index >= total_elements) return; + + size_t out_offset = device::metax::indexToOffset(out_index, out_ndim, out_shape, out_strides); + + size_t out_dim_pos = out_ndim - 1; + size_t index = out_index; + + ptrdiff_t input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)other_ndim - 2; i >= 0; i --) { + other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; + index /= out_shape[out_dim_pos --]; + } + for (int i = (int)input_ndim - 2; i >= 0; i --) { + input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; + index /= out_shape[out_dim_pos --]; + } + + // T tmp = input[input_offset] * other[other_offset]; + // input_offset += input_strides[input_ndim - 1]; + // other_offset += other_strides[other_ndim - 1]; + T tmp = 0.; + for (size_t i = 0; i < oper_len; i ++) { + if constexpr (std::is_same_v || std::is_same_v) { + tmp += input[input_offset] * other[other_offset]; + } else { + // if (out_index == 0) { + // printf("input: %.10f\tother:%.10f\n", input[input_offset], other[other_offset]); + // } + T mul_a = __fmul_rn(input[input_offset], other[other_offset]); + // if (out_index == 0) { + // printf("mul answer: %.10f\n", mul_a); + // printf("tmp val: %.10f\n", tmp); + // } + tmp = __fadd_rd(tmp, mul_a); + // if (out_index == 0) + // printf("add answer: %.10f\n", tmp); + } + input_offset += input_strides[input_ndim - 1]; + other_offset += other_strides[other_ndim - 1]; + } + + // out[out_offset] = static_cast(tmp); + out[out_offset] = tmp; 
+} + +#endif // __INNER_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/moore/inner_moore.mu b/src/infiniop/ops/inner/moore/inner_moore.mu index 530a90713..61a430d2c 100644 --- a/src/infiniop/ops/inner/moore/inner_moore.mu +++ b/src/infiniop/ops/inner/moore/inner_moore.mu @@ -3,7 +3,7 @@ #include "../../../devices/moore/moore_kernel_common.h" -#include "inner_kernel.h" +#include "inner_moore_kernel.h" namespace op::inner::moore { diff --git a/src/infiniop/ops/inner/moore/inner_kernel.h b/src/infiniop/ops/inner/moore/inner_moore_kernel.h similarity index 95% rename from src/infiniop/ops/inner/moore/inner_kernel.h rename to src/infiniop/ops/inner/moore/inner_moore_kernel.h index a9c4150ce..de82dc8a9 100644 --- a/src/infiniop/ops/inner/moore/inner_kernel.h +++ b/src/infiniop/ops/inner/moore/inner_moore_kernel.h @@ -1,5 +1,5 @@ -#ifndef __INNER_KERNEL_H__ -#define __INNER_KERNEL_H__ +#ifndef __INNER_MOORE_KERNEL_H__ +#define __INNER_MOORE_KERNEL_H__ template INFINIOP_MOORE_KERNEL innerKernel( diff --git a/src/infiniop/ops/inner/operator.cc b/src/infiniop/ops/inner/operator.cc index 0d5736d16..c68762d0e 100644 --- a/src/infiniop/ops/inner/operator.cc +++ b/src/infiniop/ops/inner/operator.cc @@ -8,18 +8,9 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) #include "nvidia/inner_nvidia.cuh" #endif -// #ifdef ENABLE_METAX_API -// #include "metax/inner_metax.h" -// #endif -// #ifdef ENABLE_ASCEND_API -// #include "ascend/inner_ascend.h" -// #endif -// #ifdef ENABLE_CAMBRICON_API -// #include "bang/inner_bang.h" -// #endif -// #ifdef ENABLE_KUNLUN_API -// #include "kunlun/inner_kunlun.h" -// #endif +#ifdef ENABLE_METAX_API +#include "metax/inner_metax.h" +#endif #ifdef ENABLE_MOORE_API #include "moore/inner_moore.h" #endif @@ -47,27 +38,12 @@ __C infiniStatus_t infiniopCreateInnerDescriptor( #ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -// #ifdef 
ENABLE_ILUVATAR_API -// CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -// #endif -// #ifdef ENABLE_QY_API -// CREATE(INFINI_DEVICE_QY, nvidia); -// #endif -// #ifdef ENABLE_HYGON_API -// CREATE(INFINI_DEVICE_HYGON, nvidia); -// #endif -// #ifdef ENABLE_CAMBRICON_API -// CREATE(INFINI_DEVICE_CAMBRICON, bang) -// #endif -// #ifdef ENABLE_METAX_API -// CREATE(INFINI_DEVICE_METAX, metax) -// #endif -// #ifdef ENABLE_ASCEND_API -// CREATE(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_KUNLUN_API -// CREATE(INFINI_DEVICE_KUNLUN, kunlun) -// #endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif #ifdef ENABLE_MOORE_API CREATE(INFINI_DEVICE_MOORE, moore) #endif @@ -89,27 +65,12 @@ __C infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, #ifdef ENABLE_NVIDIA_API GET(INFINI_DEVICE_NVIDIA, nvidia) #endif -// #ifdef ENABLE_ILUVATAR_API -// GET(INFINI_DEVICE_ILUVATAR, nvidia); -// #endif -// #ifdef ENABLE_QY_API -// GET(INFINI_DEVICE_QY, nvidia); -// #endif -// #ifdef ENABLE_HYGON_API -// GET(INFINI_DEVICE_HYGON, nvidia); -// #endif -// #ifdef ENABLE_METAX_API -// GET(INFINI_DEVICE_METAX, metax) -// #endif -// #ifdef ENABLE_ASCEND_API -// GET(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_CAMBRICON_API -// GET(INFINI_DEVICE_CAMBRICON, bang) -// #endif -// #ifdef ENABLE_KUNLUN_API -// GET(INFINI_DEVICE_KUNLUN, kunlun) -// #endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif #ifdef ENABLE_MOORE_API GET(INFINI_DEVICE_MOORE, moore) #endif @@ -137,27 +98,12 @@ __C infiniStatus_t infiniopInner( #ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -// #ifdef ENABLE_ILUVATAR_API -// CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -// #endif -// #ifdef ENABLE_QY_API -// CALCULATE(INFINI_DEVICE_QY, nvidia); -// #endif -// #ifdef 
ENABLE_HYGON_API -// CALCULATE(INFINI_DEVICE_HYGON, nvidia); -// #endif -// #ifdef ENABLE_CAMBRICON_API -// CALCULATE(INFINI_DEVICE_CAMBRICON, bang) -// #endif -// #ifdef ENABLE_METAX_API -// CALCULATE(INFINI_DEVICE_METAX, metax) -// #endif -// #ifdef ENABLE_ASCEND_API -// CALCULATE(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_KUNLUN_API -// CALCULATE(INFINI_DEVICE_KUNLUN, kunlun) -// #endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif #ifdef ENABLE_MOORE_API CALCULATE(INFINI_DEVICE_MOORE, moore) #endif @@ -179,27 +125,12 @@ __C infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc #ifdef ENABLE_NVIDIA_API DESTROY(INFINI_DEVICE_NVIDIA, nvidia) #endif -// #ifdef ENABLE_ILUVATAR_API -// DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); -// #endif -// #ifdef ENABLE_QY_API -// DESTROY(INFINI_DEVICE_QY, nvidia); -// #endif -// #ifdef ENABLE_HYGON_API -// DESTROY(INFINI_DEVICE_HYGON, nvidia); -// #endif -// #ifdef ENABLE_CAMBRICON_API -// DESTROY(INFINI_DEVICE_CAMBRICON, bang) -// #endif -// #ifdef ENABLE_METAX_API -// DESTROY(INFINI_DEVICE_METAX, metax) -// #endif -// #ifdef ENABLE_ASCEND_API -// DESTROY(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_KUNLUN_API -// DESTROY(INFINI_DEVICE_KUNLUN, kunlun) -// #endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif #ifdef ENABLE_MOORE_API DESTROY(INFINI_DEVICE_MOORE, moore) #endif diff --git a/src/infiniop/ops/masked_select/metax/masked_select_metax.h b/src/infiniop/ops/masked_select/metax/masked_select_metax.h new file mode 100644 index 000000000..47460777f --- /dev/null +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax.h @@ -0,0 +1,7 @@ +#ifndef __MASKED_SELECT_METAX_H__ +#define __MASKED_SELECT_METAX_H__ +#include "../masked_select.h" + +DESCRIPTOR(metax); + 
+#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/metax/masked_select_metax.maca b/src/infiniop/ops/masked_select/metax/masked_select_metax.maca new file mode 100644 index 000000000..4a4181153 --- /dev/null +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax.maca @@ -0,0 +1,154 @@ +#include "../../../devices/metax/metax_common.h" +#include "masked_select_metax.h" + +#include "../../../devices/metax/metax_kernel_common.h" + +// #include "infinicore/context/context.hpp" +#include "masked_select_metax_kernel.h" + +namespace op::masked_select::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // shape size + workspace_size += info.ndim * sizeof(size_t); + // strides size * 2 + workspace_size += info.ndim * sizeof(ptrdiff_t) * 2; + // size_t mark_scan + workspace_size += info.total_elements * sizeof(size_t); + // size_t mark_result + workspace_size += info.total_elements * sizeof(size_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const MaskedSelectInfo &info, + const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, + hcStream_t stream, void *workspace, size_t workspace_size) { + + size_t ndim = info.ndim; + size_t total_elements = info.total_elements; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *shape_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset 
+= ndim * sizeof(size_t); + + ptrdiff_t *input_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + ptrdiff_t *mask_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + size_t *mark_scan = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + size_t *scan_result = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + assert(workspace_offset == workspace_size); + + CHECK_METAX(hcMemcpyAsync(shape_hc, info.shape.data(), ndim * sizeof(size_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_hc, info.input_strides.data(), ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(mask_strides_hc, info.mask_strides.data(), ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + block_size - 1) / block_size; + + maskedSelectGetMarkScanOnceKernel<<>>( + mask, mark_scan, total_elements, + shape_hc, mask_strides_hc, ndim); + CHECK_METAX(hcDeviceSynchronize()); + + for (size_t stride = BLOCK_SIZE; stride < total_elements; stride *= BLOCK_SIZE) { + size_t stride_elements = (total_elements + stride - 1) / stride; + size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + maskedSelectScanWithStrideKernel<<>>( + mark_scan, total_elements, stride); + CHECK_METAX(hcDeviceSynchronize()); + } + if (total_elements > BLOCK_SIZE) { + maskedSelectCountScanResultKernel<<>>( + mark_scan, scan_result, total_elements); + CHECK_METAX(hcDeviceSynchronize()); + } else { + scan_result = mark_scan; + } + + + CHECK_METAX(hcMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), hcMemcpyDeviceToHost, stream)); + + CHECK_METAX(hcMalloc(data_ptr, *dlen_ptr * sizeof(T))); + + // 
context::setDevice(METAX); + // context::allocateMemory(data_ptr, *dlen_ptr * sizeof(T)); + + maskedSelectGetDataKernel<<>>( + input, mask, scan_result, (T *)*data_ptr, total_elements, + shape_hc, input_strides_hc, mask_strides_hc, ndim); + CHECK_METAX(hcDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen_ptr, + void *stream_) const { + + hcStream_t stream = (hcStream_t)stream_; +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_1024); + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_512); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} diff --git a/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h b/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h new file mode 100644 index 000000000..e590de955 --- /dev/null +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h @@ -0,0 +1,101 @@ +#ifndef __MASKED_SELECT_METAX_KERNEL_CUH__ +#define __MASKED_SELECT_METAX_KERNEL_CUH__ + +template +INFINIOP_METAX_KERNEL maskedSelectGetMarkScanOnceKernel( + const bool *mask, size_t *mark_scan, size_t total_elements, + size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = 
threadIdx.x; + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < total_elements) { + size_t mask_offset = device::metax::indexToOffset(index, ndim, shape, mask_strides); + smem[tid] = mask[mask_offset]; + } else { + smem[tid] = 0; + } + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + float temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_METAX_KERNEL maskedSelectScanWithStrideKernel( + size_t *mark_scan, size_t total_elements, size_t stride) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = (blockDim.x * blockIdx.x + threadIdx.x + 1) * stride - 1; + smem[tid] = index < total_elements ? mark_scan[index] : 0.; + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + size_t temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_METAX_KERNEL maskedSelectCountScanResultKernel( + size_t *mark_scan, size_t *scan_result, size_t total_elements) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= total_elements) + return; + + size_t val = mark_scan[index]; + + for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { + size_t now_stride_block = (index + 1) / s; + size_t pre_stride_block = now_stride_block * s - 1; + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + continue; + val += mark_scan[pre_stride_block]; + } + + scan_result[index] = val; +} + +template +INFINIOP_METAX_KERNEL maskedSelectGetDataKernel( + const T *input, const bool *mask, size_t *scan_result, T *data, size_t total_elements, + size_t *shape, ptrdiff_t *input_strides, ptrdiff_t *mask_strides, size_t ndim) { + + 
size_t index = blockDim.x * blockIdx.x + threadIdx.x; + + if (index >= total_elements) + return; + + size_t input_offset = device::metax::indexToOffset(index, ndim, shape, input_strides); + size_t mask_offset = device::metax::indexToOffset(index, ndim, shape, mask_strides); + + if (mask[mask_offset]) { + data[scan_result[index] - 1] = input[input_offset]; + } +} + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/moore/masked_select_moore.mu b/src/infiniop/ops/masked_select/moore/masked_select_moore.mu index 4434ce187..3c218bc19 100644 --- a/src/infiniop/ops/masked_select/moore/masked_select_moore.mu +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore.mu @@ -4,7 +4,7 @@ #include "../../../devices/moore/moore_kernel_common.h" // #include "infinicore/context/context.hpp" -#include "masked_select_kernel.h" +#include "masked_select_moore_kernel.h" namespace op::masked_select::moore { diff --git a/src/infiniop/ops/masked_select/moore/masked_select_kernel.h b/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h similarity index 97% rename from src/infiniop/ops/masked_select/moore/masked_select_kernel.h rename to src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h index 901cee34a..dcdd87c6d 100644 --- a/src/infiniop/ops/masked_select/moore/masked_select_kernel.h +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h @@ -1,5 +1,5 @@ -#ifndef __MASKED_SELECT_KERNEL_CUH__ -#define __MASKED_SELECT_KERNEL_CUH__ +#ifndef __MASKED_SELECT_MOORE_KERNEL_CUH__ +#define __MASKED_SELECT_MOORE_KERNEL_CUH__ template INFINIOP_MOORE_KERNEL maskedSelectGetMarkScanOnceKernel( diff --git a/src/infiniop/ops/masked_select/operator.cc b/src/infiniop/ops/masked_select/operator.cc index 5afe0f106..055b67b12 100644 --- a/src/infiniop/ops/masked_select/operator.cc +++ b/src/infiniop/ops/masked_select/operator.cc @@ -8,18 +8,9 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || 
defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) #include "nvidia/masked_select_nvidia.cuh" #endif -// #ifdef ENABLE_METAX_API -// #include "metax/masked_select_metax.h" -// #endif -// #ifdef ENABLE_ASCEND_API -// #include "ascend/masked_select_ascend.h" -// #endif -// #ifdef ENABLE_CAMBRICON_API -// #include "bang/masked_select_bang.h" -// #endif -// #ifdef ENABLE_KUNLUN_API -// #include "kunlun/masked_select_kunlun.h" -// #endif +#ifdef ENABLE_METAX_API +#include "metax/masked_select_metax.h" +#endif #ifdef ENABLE_MOORE_API #include "moore/masked_select_moore.h" #endif @@ -45,27 +36,12 @@ __C infiniStatus_t infiniopCreateMaskedSelectDescriptor( #ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -// #ifdef ENABLE_ILUVATAR_API -// CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -// #endif -// #ifdef ENABLE_QY_API -// CREATE(INFINI_DEVICE_QY, nvidia); -// #endif -// #ifdef ENABLE_HYGON_API -// CREATE(INFINI_DEVICE_HYGON, nvidia); -// #endif -// #ifdef ENABLE_CAMBRICON_API -// CREATE(INFINI_DEVICE_CAMBRICON, bang) -// #endif -// #ifdef ENABLE_METAX_API -// CREATE(INFINI_DEVICE_METAX, metax) -// #endif -// #ifdef ENABLE_ASCEND_API -// CREATE(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_KUNLUN_API -// CREATE(INFINI_DEVICE_KUNLUN, kunlun) -// #endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif #ifdef ENABLE_MOORE_API CREATE(INFINI_DEVICE_MOORE, moore) #endif @@ -87,27 +63,12 @@ __C infiniStatus_t infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDesc #ifdef ENABLE_NVIDIA_API GET(INFINI_DEVICE_NVIDIA, nvidia) #endif -// #ifdef ENABLE_ILUVATAR_API -// GET(INFINI_DEVICE_ILUVATAR, nvidia); -// #endif -// #ifdef ENABLE_QY_API -// GET(INFINI_DEVICE_QY, nvidia); -// #endif -// #ifdef ENABLE_HYGON_API -// GET(INFINI_DEVICE_HYGON, nvidia); -// #endif -// #ifdef ENABLE_METAX_API -// GET(INFINI_DEVICE_METAX, metax) -// #endif -// #ifdef 
ENABLE_ASCEND_API -// GET(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_CAMBRICON_API -// GET(INFINI_DEVICE_CAMBRICON, bang) -// #endif -// #ifdef ENABLE_KUNLUN_API -// GET(INFINI_DEVICE_KUNLUN, kunlun) -// #endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif #ifdef ENABLE_MOORE_API GET(INFINI_DEVICE_MOORE, moore) #endif @@ -136,27 +97,12 @@ __C infiniStatus_t infiniopMaskedSelect( #ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -// #ifdef ENABLE_ILUVATAR_API -// CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -// #endif -// #ifdef ENABLE_QY_API -// CALCULATE(INFINI_DEVICE_QY, nvidia); -// #endif -// #ifdef ENABLE_HYGON_API -// CALCULATE(INFINI_DEVICE_HYGON, nvidia); -// #endif -// #ifdef ENABLE_CAMBRICON_API -// CALCULATE(INFINI_DEVICE_CAMBRICON, bang) -// #endif -// #ifdef ENABLE_METAX_API -// CALCULATE(INFINI_DEVICE_METAX, metax) -// #endif -// #ifdef ENABLE_ASCEND_API -// CALCULATE(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_KUNLUN_API -// CALCULATE(INFINI_DEVICE_KUNLUN, kunlun) -// #endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif #ifdef ENABLE_MOORE_API CALCULATE(INFINI_DEVICE_MOORE, moore) #endif @@ -178,27 +124,12 @@ __C infiniStatus_t infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDes #ifdef ENABLE_NVIDIA_API DESTROY(INFINI_DEVICE_NVIDIA, nvidia) #endif -// #ifdef ENABLE_ILUVATAR_API -// DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); -// #endif -// #ifdef ENABLE_QY_API -// DESTROY(INFINI_DEVICE_QY, nvidia); -// #endif -// #ifdef ENABLE_HYGON_API -// DESTROY(INFINI_DEVICE_HYGON, nvidia); -// #endif -// #ifdef ENABLE_CAMBRICON_API -// DESTROY(INFINI_DEVICE_CAMBRICON, bang) -// #endif -// #ifdef ENABLE_METAX_API -// DESTROY(INFINI_DEVICE_METAX, metax) -// #endif -// #ifdef ENABLE_ASCEND_API -// 
DESTROY(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_KUNLUN_API -// DESTROY(INFINI_DEVICE_KUNLUN, kunlun) -// #endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif #ifdef ENABLE_MOORE_API DESTROY(INFINI_DEVICE_MOORE, moore) #endif diff --git a/src/infiniop/ops/tan/metax/tan_metax.h b/src/infiniop/ops/tan/metax/tan_metax.h new file mode 100644 index 000000000..507fd9d5e --- /dev/null +++ b/src/infiniop/ops/tan/metax/tan_metax.h @@ -0,0 +1,8 @@ +#ifndef __TAN_METAX_API_H__ +#define __TAN_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(tan, metax) + +#endif // __TAN_METAX_API_H__ diff --git a/src/infiniop/ops/tan/metax/tan_metax.maca b/src/infiniop/ops/tan/metax/tan_metax.maca new file mode 100644 index 000000000..85768bec3 --- /dev/null +++ b/src/infiniop/ops/tan/metax/tan_metax.maca @@ -0,0 +1,56 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "tan_metax.h" + +namespace op::tan::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + 
return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanOp, hpcc_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::metax diff --git a/src/infiniop/ops/tan/moore/tan_moore.mu b/src/infiniop/ops/tan/moore/tan_moore.mu index ab35146ff..4caec16ef 100644 --- a/src/infiniop/ops/tan/moore/tan_moore.mu +++ b/src/infiniop/ops/tan/moore/tan_moore.mu @@ -1,6 +1,6 @@ #include "../../../elementwise/moore/elementwise_moore.h" -#include "tan_moore_kernel.h" +#include "../cuda/kernel.cuh" #include "tan_moore.h" namespace op::tan::moore { @@ -42,11 +42,11 @@ infiniStatus_t Descriptor::calculate( switch (_dtype) { case INFINI_DTYPE_BF16: - return _device_info->calculate<256, moore::TanOp, mt_bfloat16>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::TanOp, mt_bfloat16>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F16: - return _device_info->calculate<256, moore::TanOp, half>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F32: - return _device_info->calculate<256, moore::TanOp, float>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream); default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/infiniop/ops/tan/operator.cc b/src/infiniop/ops/tan/operator.cc index 1cd652c73..84f338fd7 100644 --- a/src/infiniop/ops/tan/operator.cc +++ b/src/infiniop/ops/tan/operator.cc @@ 
-8,9 +8,9 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) #include "nvidia/tan_nvidia.cuh" #endif -// #ifdef ENABLE_METAX_API -// #include "metax/tan_metax.h" -// #endif +#ifdef ENABLE_METAX_API +#include "metax/tan_metax.h" +#endif #ifdef ENABLE_MOORE_API #include "moore/tan_moore.h" #endif @@ -40,9 +40,9 @@ __C infiniStatus_t infiniopCreateTanDescriptor( // #ifdef ENABLE_ILUVATAR_API // CREATE(INFINI_DEVICE_ILUVATAR, nvidia); // #endif -// #ifdef ENABLE_METAX_API -// CREATE(INFINI_DEVICE_METAX, metax); -// #endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif #ifdef ENABLE_MOORE_API CREATE(INFINI_DEVICE_MOORE, moore); #endif @@ -71,9 +71,9 @@ __C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, siz // #ifdef ENABLE_ILUVATAR_API // GET(INFINI_DEVICE_ILUVATAR, nvidia); // #endif -// #ifdef ENABLE_METAX_API -// GET(INFINI_DEVICE_METAX, metax); -// #endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif #ifdef ENABLE_MOORE_API GET(INFINI_DEVICE_MOORE, moore); #endif @@ -109,9 +109,9 @@ __C infiniStatus_t infiniopTan( // #ifdef ENABLE_ILUVATAR_API // CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); // #endif -// #ifdef ENABLE_METAX_API -// CALCULATE(INFINI_DEVICE_METAX, metax); -// #endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif #ifdef ENABLE_MOORE_API CALCULATE(INFINI_DEVICE_MOORE, moore); #endif @@ -142,9 +142,9 @@ infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc) { // #ifdef ENABLE_ILUVATAR_API // DELETE(INFINI_DEVICE_ILUVATAR, nvidia); // #endif -// #ifdef ENABLE_METAX_API -// DELETE(INFINI_DEVICE_METAX, metax); -// #endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif #ifdef ENABLE_MOORE_API DELETE(INFINI_DEVICE_MOORE, moore); #endif diff --git a/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.h b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.h new file mode 100644 index 000000000..b79edcbf9 --- /dev/null +++ 
b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.h @@ -0,0 +1,8 @@ +#ifndef __TANHSHRINK_METAX_API_H__ +#define __TANHSHRINK_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, metax) + +#endif // __TANHSHRINK_METAX_API_H__ diff --git a/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca new file mode 100644 index 000000000..597cd9006 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca @@ -0,0 +1,56 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "tanhshrink_metax_kernel.h" +#include "tanhshrink_metax.h" + +namespace op::tanhshrink::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, metax::TanhshrinkOp, hpcc_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, metax::TanhshrinkOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return 
_device_info->calculate<256, metax::TanhshrinkOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::metax diff --git a/src/infiniop/ops/tan/moore/tan_moore_kernel.h b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax_kernel.h similarity index 56% rename from src/infiniop/ops/tan/moore/tan_moore_kernel.h rename to src/infiniop/ops/tanhshrink/metax/tanhshrink_metax_kernel.h index ad97cb4f6..03928adde 100644 --- a/src/infiniop/ops/tan/moore/tan_moore_kernel.h +++ b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax_kernel.h @@ -1,9 +1,9 @@ -#ifndef __TAN_MOORE_KERNEL_H__ -#define __TAN_MOORE_KERNEL_H__ +#ifndef __TANHSHRINK_METAX_KERNEL_H__ +#define __TANHSHRINK_METAX_KERNEL_H__ -namespace op::tan::moore { +namespace op::tanhshrink::metax { -typedef struct TanOp { +typedef struct TanhshrinkOp { public: static constexpr size_t num_inputs = 1; template @@ -11,18 +11,18 @@ typedef struct TanOp { if constexpr (std::is_same_v) { // BF16 const float x_f = __bfloat162float(x); - return __float2bfloat16(__tanf(x_f)); + return __float2bfloat16(x_f - tanhf(x_f)); } else if constexpr (std::is_same_v) { // FP16 const float x_f = __half2float(x); - return __float2half(__tanf(x_f)); + return __float2half(x_f - tanhf(x_f)); } else if constexpr (std::is_same_v) { // FP32 - return __tanf(x); + return x - tanhf(x); } } -} TanOp; +} TanhshrinkOp; -} // namespace op::tan::moore +} // namespace op::tanhshrink::metax -#endif // __TAN_MOORE_KERNEL_H__ +#endif // __TANHSHRINK_METAX_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h index b35f88048..9c75b916e 100644 --- a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h +++ b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h @@ -11,7 +11,6 @@ typedef struct TanhshrinkOp { if 
constexpr (std::is_same_v) { // BF16 const float x_f = __bfloat162float(x); - return __float2bfloat16(x_f - tanhf(x_f)); } else if constexpr (std::is_same_v) { // FP16 diff --git a/src/infiniop/ops/tanhshrink/operator.cc b/src/infiniop/ops/tanhshrink/operator.cc index f8c9a722c..7b70f029d 100644 --- a/src/infiniop/ops/tanhshrink/operator.cc +++ b/src/infiniop/ops/tanhshrink/operator.cc @@ -8,9 +8,9 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) #include "nvidia/tanhshrink_nvidia.cuh" #endif -// #ifdef ENABLE_METAX_API -// #include "metax/tanhshrink_metax.h" -// #endif +#ifdef ENABLE_METAX_API +#include "metax/tanhshrink_metax.h" +#endif #ifdef ENABLE_MOORE_API #include "moore/tanhshrink_moore.h" #endif @@ -40,9 +40,9 @@ __C infiniStatus_t infiniopCreateTanhshrinkDescriptor( // #ifdef ENABLE_ILUVATAR_API // CREATE(INFINI_DEVICE_ILUVATAR, nvidia); // #endif -// #ifdef ENABLE_METAX_API -// CREATE(INFINI_DEVICE_METAX, metax); -// #endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif #ifdef ENABLE_MOORE_API CREATE(INFINI_DEVICE_MOORE, moore); #endif @@ -71,9 +71,9 @@ __C infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescript // #ifdef ENABLE_ILUVATAR_API // GET(INFINI_DEVICE_ILUVATAR, nvidia); // #endif -// #ifdef ENABLE_METAX_API -// GET(INFINI_DEVICE_METAX, metax); -// #endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif #ifdef ENABLE_MOORE_API GET(INFINI_DEVICE_MOORE, moore); #endif @@ -109,9 +109,9 @@ __C infiniStatus_t infiniopTanhshrink( // #ifdef ENABLE_ILUVATAR_API // CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); // #endif -// #ifdef ENABLE_METAX_API -// CALCULATE(INFINI_DEVICE_METAX, metax); -// #endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif #ifdef ENABLE_MOORE_API CALCULATE(INFINI_DEVICE_MOORE, moore); #endif @@ -142,9 +142,9 @@ infiniopDestroyTanhshrinkDescriptor(infiniopTanhshrinkDescriptor_t desc) { // #ifdef ENABLE_ILUVATAR_API // 
DELETE(INFINI_DEVICE_ILUVATAR, nvidia); // #endif -// #ifdef ENABLE_METAX_API -// DELETE(INFINI_DEVICE_METAX, metax); -// #endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif #ifdef ENABLE_MOORE_API DELETE(INFINI_DEVICE_MOORE, moore); #endif diff --git a/test/infinicore/ops/inner.py b/test/infinicore/ops/inner.py index 166fabe33..a66d4f2ef 100644 --- a/test/infinicore/ops/inner.py +++ b/test/infinicore/ops/inner.py @@ -21,7 +21,7 @@ _TOLERANCE_MAP = { infinicore.float16: {"atol": 1e-3, "rtol": 1e-2}, - infinicore.float32: {"atol": 1e-5, "rtol": 1e-4}, + infinicore.float32: {"atol": 1e-5, "rtol": 4.7e-4}, infinicore.bfloat16: {"atol": 1e-2, "rtol": 5e-2}, } From 6dfba296977f20bc1f9f827c5115eab468837a7a Mon Sep 17 00:00:00 2001 From: JBFYS-XD Date: Thu, 11 Dec 2025 17:15:01 +0800 Subject: [PATCH 08/12] fix: replace __tanhf with tanhf --- src/infiniop/ops/tanhshrink/cuda/kernel.cuh | 6 ++-- .../tanhshrink/metax/tanhshrink_metax.maca | 8 +++--- .../metax/tanhshrink_metax_kernel.h | 28 ------------------- .../ops/tanhshrink/moore/tanhshrink_moore.mu | 8 +++--- 4 files changed, 11 insertions(+), 39 deletions(-) delete mode 100644 src/infiniop/ops/tanhshrink/metax/tanhshrink_metax_kernel.h diff --git a/src/infiniop/ops/tanhshrink/cuda/kernel.cuh b/src/infiniop/ops/tanhshrink/cuda/kernel.cuh index 83ae43c1a..26f507b74 100644 --- a/src/infiniop/ops/tanhshrink/cuda/kernel.cuh +++ b/src/infiniop/ops/tanhshrink/cuda/kernel.cuh @@ -11,14 +11,14 @@ public: if constexpr (std::is_same_v) { // BF16 const float x_f = __bfloat162float(x); - return __float2bfloat16(x_f - __tanhf(x_f)); + return __float2bfloat16(x_f - tanhf(x_f)); } else if constexpr (std::is_same_v) { // FP16 const float x_f = __half2float(x); - return __float2half(x_f - __tanhf(x_f)); + return __float2half(x_f - tanhf(x_f)); } else if constexpr (std::is_same_v) { // FP32 - return x - __tanhf(x); + return x - tanhf(x); } } } TanhshrinkOp; diff --git 
a/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca index 597cd9006..c63390ab7 100644 --- a/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca +++ b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca @@ -1,6 +1,6 @@ #include "../../../elementwise/metax/elementwise_metax.h" -#include "tanhshrink_metax_kernel.h" +#include "../cuda/kernel.cuh" #include "tanhshrink_metax.h" namespace op::tanhshrink::metax { @@ -42,11 +42,11 @@ infiniStatus_t Descriptor::calculate( switch (_dtype) { case INFINI_DTYPE_BF16: - return _device_info->calculate<256, metax::TanhshrinkOp, hpcc_bfloat16>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::TanhshrinkOp, hpcc_bfloat16>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F16: - return _device_info->calculate<256, metax::TanhshrinkOp, half>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::TanhshrinkOp, half>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F32: - return _device_info->calculate<256, metax::TanhshrinkOp, float>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::TanhshrinkOp, float>(_info, workspace, output, inputs, stream); default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax_kernel.h b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax_kernel.h deleted file mode 100644 index 03928adde..000000000 --- a/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax_kernel.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef __TANHSHRINK_METAX_KERNEL_H__ -#define __TANHSHRINK_METAX_KERNEL_H__ - -namespace op::tanhshrink::metax { - -typedef struct TanhshrinkOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - // BF16 - const float x_f = __bfloat162float(x); - return 
__float2bfloat16(x_f - tanhf(x_f)); - } else if constexpr (std::is_same_v) { - // FP16 - const float x_f = __half2float(x); - return __float2half(x_f - tanhf(x_f)); - } else if constexpr (std::is_same_v) { - // FP32 - return x - tanhf(x); - } - } -} TanhshrinkOp; - -} // namespace op::tanhshrink::metax - -#endif // __TANHSHRINK_METAX_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu index 37bc31f24..4c4bdee8f 100644 --- a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu +++ b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu @@ -1,6 +1,6 @@ #include "../../../elementwise/moore/elementwise_moore.h" -#include "tanhshrink_moore_kernel.h" +#include "../cuda/kernel.cuh" #include "tanhshrink_moore.h" namespace op::tanhshrink::moore { @@ -42,11 +42,11 @@ infiniStatus_t Descriptor::calculate( switch (_dtype) { case INFINI_DTYPE_BF16: - return _device_info->calculate<256, moore::TanhshrinkOp, mt_bfloat16>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::TanhshrinkOp, mt_bfloat16>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F16: - return _device_info->calculate<256, moore::TanhshrinkOp, half>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::TanhshrinkOp, half>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F32: - return _device_info->calculate<256, moore::TanhshrinkOp, float>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::TanhshrinkOp, float>(_info, workspace, output, inputs, stream); default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } From b4012bb321ef9e6ec01fab52a2e234c7f031c4ec Mon Sep 17 00:00:00 2001 From: JBFYS-XD Date: Fri, 12 Dec 2025 14:37:23 +0800 Subject: [PATCH 09/12] fix:fix some bug on iluvatar --- python/infinicore/ops/cat.py | 3 +- src/infiniop/ops/inner/nvidia/inner_nvidia.cu | 2 -- 
.../nvidia/masked_select_nvidia.cu | 2 -- src/infiniop/ops/tan/cuda/kernel.cuh | 2 ++ src/infiniop/ops/tanhshrink/cuda/kernel.cuh | 2 ++ test/infinicore/framework/base.py | 6 ++-- test/infinicore/ops/cat.py | 32 +++++++++++++------ 7 files changed, 32 insertions(+), 17 deletions(-) diff --git a/python/infinicore/ops/cat.py b/python/infinicore/ops/cat.py index 7bba9822b..3a3e1470f 100644 --- a/python/infinicore/ops/cat.py +++ b/python/infinicore/ops/cat.py @@ -7,5 +7,6 @@ def cat(tensors, dim:int=0, *,out=None): return Tensor(_infinicore.cat([tensor._underlying for tensor in tensors], dim)) _infinicore.cat_(out._underlying, [tensor._underlying for tensor in tensors], dim) - + + # raise RuntimeError("breakpointer!!!") return out diff --git a/src/infiniop/ops/inner/nvidia/inner_nvidia.cu b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu index 9152405e2..e2b3ed0f9 100644 --- a/src/infiniop/ops/inner/nvidia/inner_nvidia.cu +++ b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu @@ -61,8 +61,6 @@ infiniStatus_t launchKernel( ptrdiff_t *out_strides_cuda = other_strides_cuda + other_ndim; workspace_offset += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); - assert(workspace_offset == workspace_size); - CHECK_CUDA(cudaMemcpyAsync(out_shape_cuda, info.out_shape.data(), out_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); CHECK_CUDA(cudaMemcpyAsync(other_strides_cuda, info.other_strides.data(), other_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); diff --git a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu index 44e3592ca..eebba0fd3 100644 --- a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu +++ b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu @@ -68,8 +68,6 @@ infiniStatus_t launchKernel( 
size_t *scan_result = reinterpret_cast(workspace_ptr + workspace_offset); workspace_offset += info.total_elements * sizeof(size_t); - assert(workspace_offset == workspace_size); - CHECK_CUDA(cudaMemcpyAsync(shape_cuda, info.shape.data(), ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); CHECK_CUDA(cudaMemcpyAsync(mask_strides_cuda, info.mask_strides.data(), ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); diff --git a/src/infiniop/ops/tan/cuda/kernel.cuh b/src/infiniop/ops/tan/cuda/kernel.cuh index 757372a93..85c9eca27 100644 --- a/src/infiniop/ops/tan/cuda/kernel.cuh +++ b/src/infiniop/ops/tan/cuda/kernel.cuh @@ -19,6 +19,8 @@ public: } else if constexpr (std::is_same_v) { // FP32 return __tanf(x); + } else { + return __tanf(x); } } } TanOp; diff --git a/src/infiniop/ops/tanhshrink/cuda/kernel.cuh b/src/infiniop/ops/tanhshrink/cuda/kernel.cuh index 26f507b74..dfb88731e 100644 --- a/src/infiniop/ops/tanhshrink/cuda/kernel.cuh +++ b/src/infiniop/ops/tanhshrink/cuda/kernel.cuh @@ -19,6 +19,8 @@ public: } else if constexpr (std::is_same_v) { // FP32 return x - tanhf(x); + } else { + return x - tanhf(x); } } } TanhshrinkOp; diff --git a/test/infinicore/framework/base.py b/test/infinicore/framework/base.py index eceb1472d..9e0449a10 100644 --- a/test/infinicore/framework/base.py +++ b/test/infinicore/framework/base.py @@ -353,7 +353,8 @@ def prepare_infinicore_inputs_and_kwargs(self, inputs, kwargs, comparison_target inp, comparison_target == i ) infini_inputs.append(infini_list) - cloned_tensors.append(cloned_list) + if (cloned_list): + cloned_tensors.append(cloned_list) else: infini_inputs.append(inp) @@ -374,7 +375,8 @@ def prepare_infinicore_inputs_and_kwargs(self, inputs, kwargs, comparison_target infini_list, cloned_list = self.prepare_infinicore_list( value, key == "out" ) - cloned_tensors.append(cloned_list) + if 
cloned_list: + cloned_tensors.append(cloned_list) infini_kwargs[key] = infini_list else: infini_kwargs[key] = value diff --git a/test/infinicore/ops/cat.py b/test/infinicore/ops/cat.py index e9b85f2e5..82e47d502 100644 --- a/test/infinicore/ops/cat.py +++ b/test/infinicore/ops/cat.py @@ -86,16 +86,16 @@ def parse_test_cases(): output_spec = TensorSpec.from_tensor(output_shape, output_strides, dtype) # Out-of-place test case - test_cases.append( - TestCase( - inputs=[tuple(input_specs)], - kwargs={"dim": dim}, - output_spec=None, - comparison_target=None, - tolerance=tolerance, - description="Cat - OUT_OF_PLACE", - ) - ) + # test_cases.append( + # TestCase( + # inputs=[tuple(input_specs)], + # kwargs={"dim": dim}, + # output_spec=None, + # comparison_target=None, + # tolerance=tolerance, + # description="Cat - OUT_OF_PLACE", + # ) + # ) # In-place test case if output_supports_inplace: @@ -124,6 +124,18 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): """PyTorch cat implementation""" + def print_info( + tensors: tuple[torch.Tensor, ...] 
| list[torch.Tensor] | None, + dim, + *, + out: torch.Tensor | None = None, + ): + print(type(tensors)) + print(tensors) + print(dim) + print(out) + + # print_info(*args, **kwargs) return torch.cat(*args, **kwargs) def infinicore_operator(self, *args, **kwargs): From bd50d5737d1e7b5bcf8524bdc7d6ea67a42305ce Mon Sep 17 00:00:00 2001 From: JBFYS-XD Date: Fri, 12 Dec 2025 15:20:49 +0800 Subject: [PATCH 10/12] fix: Eliminate debugging traces --- test/infinicore/ops/cat.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test/infinicore/ops/cat.py b/test/infinicore/ops/cat.py index 82e47d502..81f0a133d 100644 --- a/test/infinicore/ops/cat.py +++ b/test/infinicore/ops/cat.py @@ -86,16 +86,16 @@ def parse_test_cases(): output_spec = TensorSpec.from_tensor(output_shape, output_strides, dtype) # Out-of-place test case - # test_cases.append( - # TestCase( - # inputs=[tuple(input_specs)], - # kwargs={"dim": dim}, - # output_spec=None, - # comparison_target=None, - # tolerance=tolerance, - # description="Cat - OUT_OF_PLACE", - # ) - # ) + test_cases.append( + TestCase( + inputs=[tuple(input_specs)], + kwargs={"dim": dim}, + output_spec=None, + comparison_target=None, + tolerance=tolerance, + description="Cat - OUT_OF_PLACE", + ) + ) # In-place test case if output_supports_inplace: From f1a93d95a63878de05a2562698860d29079a057c Mon Sep 17 00:00:00 2001 From: JBFYS-XD Date: Fri, 12 Dec 2025 18:19:25 +0800 Subject: [PATCH 11/12] Added device iluvatar support --- src/infiniop/ops/tan/operator.cc | 24 ++++++++++++------------ src/infiniop/ops/tanhshrink/operator.cc | 24 ++++++++++++------------ test/infinicore/ops/add.py | 3 ++- test/infinicore/ops/cat.py | 23 ++++++++++++----------- test/infinicore/ops/inner.py | 3 ++- test/infinicore/ops/masked_select.py | 3 ++- test/infinicore/ops/tan.py | 3 ++- test/infinicore/ops/tanhshrink.py | 3 ++- 8 files changed, 46 insertions(+), 40 deletions(-) diff --git a/src/infiniop/ops/tan/operator.cc 
b/src/infiniop/ops/tan/operator.cc index 84f338fd7..1f222a8d1 100644 --- a/src/infiniop/ops/tan/operator.cc +++ b/src/infiniop/ops/tan/operator.cc @@ -37,9 +37,9 @@ __C infiniStatus_t infiniopCreateTanDescriptor( #ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, nvidia); #endif -// #ifdef ENABLE_ILUVATAR_API -// CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -// #endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax); #endif @@ -68,9 +68,9 @@ __C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, siz #ifdef ENABLE_NVIDIA_API GET(INFINI_DEVICE_NVIDIA, nvidia); #endif -// #ifdef ENABLE_ILUVATAR_API -// GET(INFINI_DEVICE_ILUVATAR, nvidia); -// #endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax); #endif @@ -106,9 +106,9 @@ __C infiniStatus_t infiniopTan( #ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); #endif -// #ifdef ENABLE_ILUVATAR_API -// CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -// #endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif #ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax); #endif @@ -139,9 +139,9 @@ infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc) { #ifdef ENABLE_NVIDIA_API DELETE(INFINI_DEVICE_NVIDIA, nvidia); #endif -// #ifdef ENABLE_ILUVATAR_API -// DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -// #endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif #ifdef ENABLE_METAX_API DELETE(INFINI_DEVICE_METAX, metax); #endif diff --git a/src/infiniop/ops/tanhshrink/operator.cc b/src/infiniop/ops/tanhshrink/operator.cc index 7b70f029d..66e62b850 100644 --- a/src/infiniop/ops/tanhshrink/operator.cc +++ b/src/infiniop/ops/tanhshrink/operator.cc @@ -37,9 +37,9 @@ __C infiniStatus_t infiniopCreateTanhshrinkDescriptor( #ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, 
nvidia); #endif -// #ifdef ENABLE_ILUVATAR_API -// CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -// #endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax); #endif @@ -68,9 +68,9 @@ __C infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescript #ifdef ENABLE_NVIDIA_API GET(INFINI_DEVICE_NVIDIA, nvidia); #endif -// #ifdef ENABLE_ILUVATAR_API -// GET(INFINI_DEVICE_ILUVATAR, nvidia); -// #endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax); #endif @@ -106,9 +106,9 @@ __C infiniStatus_t infiniopTanhshrink( #ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); #endif -// #ifdef ENABLE_ILUVATAR_API -// CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -// #endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif #ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax); #endif @@ -139,9 +139,9 @@ infiniopDestroyTanhshrinkDescriptor(infiniopTanhshrinkDescriptor_t desc) { #ifdef ENABLE_NVIDIA_API DELETE(INFINI_DEVICE_NVIDIA, nvidia); #endif -// #ifdef ENABLE_ILUVATAR_API -// DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -// #endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif #ifdef ENABLE_METAX_API DELETE(INFINI_DEVICE_METAX, metax); #endif diff --git a/test/infinicore/ops/add.py b/test/infinicore/ops/add.py index add647be2..98195fd27 100644 --- a/test/infinicore/ops/add.py +++ b/test/infinicore/ops/add.py @@ -3,8 +3,9 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) -import torch import infinicore +import torch +# import infinicore from framework import ( BaseOperatorTest, TensorSpec, diff --git a/test/infinicore/ops/cat.py b/test/infinicore/ops/cat.py index 82e47d502..ff480571e 100644 --- a/test/infinicore/ops/cat.py +++ b/test/infinicore/ops/cat.py @@ -3,8 +3,9 @@ sys.path.insert(0, 
os.path.join(os.path.dirname(__file__), "..")) -import torch import infinicore +import torch +# import infinicore from framework import ( BaseOperatorTest, TensorSpec, @@ -86,16 +87,16 @@ def parse_test_cases(): output_spec = TensorSpec.from_tensor(output_shape, output_strides, dtype) # Out-of-place test case - # test_cases.append( - # TestCase( - # inputs=[tuple(input_specs)], - # kwargs={"dim": dim}, - # output_spec=None, - # comparison_target=None, - # tolerance=tolerance, - # description="Cat - OUT_OF_PLACE", - # ) - # ) + test_cases.append( + TestCase( + inputs=[tuple(input_specs)], + kwargs={"dim": dim}, + output_spec=None, + comparison_target=None, + tolerance=tolerance, + description="Cat - OUT_OF_PLACE", + ) + ) # In-place test case if output_supports_inplace: diff --git a/test/infinicore/ops/inner.py b/test/infinicore/ops/inner.py index a66d4f2ef..7e3fb8ef6 100644 --- a/test/infinicore/ops/inner.py +++ b/test/infinicore/ops/inner.py @@ -3,8 +3,9 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) -import torch import infinicore +import torch +# import infinicore from framework import BaseOperatorTest, TensorSpec, TestCase, GenericTestRunner # Test cases format: (a_shape, b_shape, a_strides_or_None, b_strides_or_None) diff --git a/test/infinicore/ops/masked_select.py b/test/infinicore/ops/masked_select.py index 27c28842b..5885b83d4 100644 --- a/test/infinicore/ops/masked_select.py +++ b/test/infinicore/ops/masked_select.py @@ -3,8 +3,9 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) -import torch import infinicore +import torch +# import infinicore from framework import BaseOperatorTest, TensorSpec, TestCase, GenericTestRunner # Test cases format: (input_shape, input_strides_or_None, mask_shape) diff --git a/test/infinicore/ops/tan.py b/test/infinicore/ops/tan.py index 4c786d114..f1eeb32c2 100644 --- a/test/infinicore/ops/tan.py +++ b/test/infinicore/ops/tan.py @@ -3,8 +3,9 @@ sys.path.insert(0, 
os.path.join(os.path.dirname(__file__), "..")) -import torch import infinicore +import torch +# import infinicore from framework import ( BaseOperatorTest, TensorSpec, diff --git a/test/infinicore/ops/tanhshrink.py b/test/infinicore/ops/tanhshrink.py index f85e36446..4d203fcec 100644 --- a/test/infinicore/ops/tanhshrink.py +++ b/test/infinicore/ops/tanhshrink.py @@ -3,8 +3,9 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) -import torch import infinicore +import torch +# import infinicore from framework import ( BaseOperatorTest, TensorSpec, From fc3c30ece567f74bd6c3c6ecf425500fc996b31c Mon Sep 17 00:00:00 2001 From: JBFYS-XD Date: Fri, 12 Dec 2025 22:51:03 +0800 Subject: [PATCH 12/12] fix: Eliminate test traces second --- test/infinicore/ops/cat.py | 12 ------------ test/infinicore/ops/inner.py | 2 +- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/test/infinicore/ops/cat.py b/test/infinicore/ops/cat.py index 81f0a133d..e9b85f2e5 100644 --- a/test/infinicore/ops/cat.py +++ b/test/infinicore/ops/cat.py @@ -124,18 +124,6 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): """PyTorch cat implementation""" - def print_info( - tensors: tuple[torch.Tensor, ...] | list[torch.Tensor] | None, - dim, - *, - out: torch.Tensor | None = None, - ): - print(type(tensors)) - print(tensors) - print(dim) - print(out) - - # print_info(*args, **kwargs) return torch.cat(*args, **kwargs) def infinicore_operator(self, *args, **kwargs): diff --git a/test/infinicore/ops/inner.py b/test/infinicore/ops/inner.py index a66d4f2ef..166fabe33 100644 --- a/test/infinicore/ops/inner.py +++ b/test/infinicore/ops/inner.py @@ -21,7 +21,7 @@ _TOLERANCE_MAP = { infinicore.float16: {"atol": 1e-3, "rtol": 1e-2}, - infinicore.float32: {"atol": 1e-5, "rtol": 4.7e-4}, + infinicore.float32: {"atol": 1e-5, "rtol": 1e-4}, infinicore.bfloat16: {"atol": 1e-2, "rtol": 5e-2}, }