diff --git a/include/infinicore/ops/cat.hpp b/include/infinicore/ops/cat.hpp new file mode 100644 index 000000000..5fdfdcce8 --- /dev/null +++ b/include/infinicore/ops/cat.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "common/op.hpp" + +namespace infinicore::op { + +Tensor cat(std::vector<Tensor> tensors, int dim); +void cat_(Tensor out, std::vector<Tensor> tensors, int dim); +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/inner.hpp b/include/infinicore/ops/inner.hpp new file mode 100644 index 000000000..017ddc604 --- /dev/null +++ b/include/infinicore/ops/inner.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Inner { +public: + using schema = void (*)(Tensor, Tensor, Tensor); + static void execute(Tensor out, Tensor input, Tensor other); + static common::OpDispatcher &dispatcher(); +}; + +Tensor inner(Tensor input, Tensor other); +void inner_(Tensor out, Tensor input, Tensor other); + +} \ No newline at end of file diff --git a/include/infinicore/ops/masked_select.hpp b/include/infinicore/ops/masked_select.hpp new file mode 100644 index 000000000..89bbfa49d --- /dev/null +++ b/include/infinicore/ops/masked_select.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class MaskedSelect { +public: + using schema = void (*)(Tensor, Tensor, void **, size_t *); + static void execute(Tensor input, Tensor mask, void **data_ptr, size_t *dlen_ptr); + static common::OpDispatcher &dispatcher(); +}; + +Tensor masked_select(Tensor input, Tensor mask); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/tan.hpp b/include/infinicore/ops/tan.hpp new file mode 100644 index 000000000..9c2d7a860 --- /dev/null +++ b/include/infinicore/ops/tan.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class 
Tan { +public: + using schema = void (*)(Tensor, Tensor); + static void execute(Tensor output, Tensor input); + static common::OpDispatcher &dispatcher(); +}; + +Tensor tan(Tensor input); +void tan_(Tensor output, Tensor input); + +} \ No newline at end of file diff --git a/include/infinicore/ops/tanhshrink.hpp b/include/infinicore/ops/tanhshrink.hpp new file mode 100644 index 000000000..83673b22c --- /dev/null +++ b/include/infinicore/ops/tanhshrink.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Tanhshrink { +public: + using schema = void (*)(Tensor, Tensor); + static void execute(Tensor output, Tensor input); + static common::OpDispatcher &dispatcher(); +}; + +Tensor tanhshrink(Tensor input); +void tanhshrink_(Tensor output, Tensor input); +} // namespace infinicore::op diff --git a/include/infiniop.h b/include/infiniop.h index 92e6f5963..594795429 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -10,9 +10,11 @@ #include "infiniop/ops/dequantize_awq.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" +#include "infiniop/ops/inner.h" #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" +#include "infiniop/ops/masked_select.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/random_sample.h" @@ -26,7 +28,9 @@ #include "infiniop/ops/softplus.h" #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" +#include "infiniop/ops/tan.h" #include "infiniop/ops/tanh.h" +#include "infiniop/ops/tanhshrink.h" #include "infiniop/ops/topkrouter.h" #include "infiniop/ops/topksoftmax.h" #include "infiniop/ops/zeros.h" diff --git a/include/infiniop/ops/inner.h b/include/infiniop/ops/inner.h new file mode 100644 index 000000000..f484cde9c --- /dev/null +++ b/include/infiniop/ops/inner.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_INNER_API_H__ +#define __INFINIOP_INNER_API_H__ + +#include 
"../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopInnerDescriptor_t; + +__C __export infiniStatus_t infiniopCreateInnerDescriptor( + infiniopHandle_t handle, + infiniopInnerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc); + +__C __export infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopInner( + infiniopInnerDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream); + +__C __export infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/masked_select.h b/include/infiniop/ops/masked_select.h new file mode 100644 index 000000000..7adf65415 --- /dev/null +++ b/include/infiniop/ops/masked_select.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_MASKED_SELECT_API_H__ +#define __INFINIOP_MASKED_SELECT_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMaskedSelectDescriptor_t; + +__C __export infiniStatus_t infiniopCreateMaskedSelectDescriptor( + infiniopHandle_t handle, + infiniopMaskedSelectDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc); + +__C __export infiniStatus_t infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMaskedSelect( + infiniopMaskedSelectDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen, + void *stream); + +__C __export infiniStatus_t infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/tan.h b/include/infiniop/ops/tan.h new file mode 100644 
index 000000000..2e20c61da --- /dev/null +++ b/include/infiniop/ops/tan.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_TAN_API_H__ +#define __INFINIOP_TAN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanDescriptor( + infiniopHandle_t handle, + infiniopTanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc); + +__C __export infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTan( + infiniopTanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/tanhshrink.h b/include/infiniop/ops/tanhshrink.h new file mode 100644 index 000000000..e136f6523 --- /dev/null +++ b/include/infiniop/ops/tanhshrink.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_TANHSHRINK_API_H__ +#define __INFINIOP_TANHSHRINK_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanhshrinkDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanhshrinkDescriptor(infiniopHandle_t handle, + infiniopTanhshrinkDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTanhshrink(infiniopTanhshrinkDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanhshrinkDescriptor(infiniopTanhshrinkDescriptor_t desc); + +#endif diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index 5c541ec3c..6d1deafe0 100644 --- 
a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -41,10 +41,14 @@ ) from infinicore.ops.add import add from infinicore.ops.attention import attention +from infinicore.ops.cat import cat +from infinicore.ops.inner import inner +from infinicore.ops.masked_select import masked_select from infinicore.ops.matmul import matmul from infinicore.ops.mul import mul from infinicore.ops.narrow import narrow from infinicore.ops.rearrange import rearrange +from infinicore.ops.tan import tan from infinicore.tensor import ( Tensor, empty, @@ -101,10 +105,14 @@ # Operations. "add", "attention", + "cat", + "inner", + "masked_select", "matmul", "mul", "narrow", "rearrange", + "tan", "empty", "empty_like", "from_blob", diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index 255079790..4221bdce5 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -6,6 +6,7 @@ from .rope import RopeAlgo, rope from .silu import silu from .swiglu import swiglu +from .tanhshrink import tanhshrink __all__ = [ "causal_softmax", @@ -17,4 +18,5 @@ "embedding", "rope", "RopeAlgo", + "tanhshrink" ] diff --git a/python/infinicore/nn/functional/tanhshrink.py b/python/infinicore/nn/functional/tanhshrink.py new file mode 100644 index 000000000..10d17077d --- /dev/null +++ b/python/infinicore/nn/functional/tanhshrink.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def tanhshrink(input, *, out=None): + if out is None: + return Tensor(_infinicore.tanhshrink(input._underlying)) + + _infinicore.tanhshrink_(out._underlying, input._underlying) + + return out diff --git a/python/infinicore/ops/cat.py b/python/infinicore/ops/cat.py new file mode 100644 index 000000000..3a3e1470f --- /dev/null +++ b/python/infinicore/ops/cat.py @@ -0,0 +1,12 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def 
cat(tensors, dim: int = 0, *, out=None): + if out is None: + return Tensor(_infinicore.cat([tensor._underlying for tensor in tensors], dim)) + + _infinicore.cat_(out._underlying, [tensor._underlying for tensor in tensors], dim) + + + return out diff --git a/python/infinicore/ops/inner.py b/python/infinicore/ops/inner.py new file mode 100644 index 000000000..df87c5b34 --- /dev/null +++ b/python/infinicore/ops/inner.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def inner(input, other, *, out=None): + if out is None: + return Tensor(_infinicore.inner(input._underlying, other._underlying)) + + _infinicore.inner_(out._underlying, input._underlying, other._underlying) + + return out diff --git a/python/infinicore/ops/masked_select.py b/python/infinicore/ops/masked_select.py new file mode 100644 index 000000000..d1846edfc --- /dev/null +++ b/python/infinicore/ops/masked_select.py @@ -0,0 +1,5 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def masked_select(input, mask): + return Tensor(_infinicore.masked_select(input._underlying, mask._underlying)) \ No newline at end of file diff --git a/python/infinicore/ops/tan.py b/python/infinicore/ops/tan.py new file mode 100644 index 000000000..da438e6d8 --- /dev/null +++ b/python/infinicore/ops/tan.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def tan(input, *, out=None): + if out is None: + return Tensor(_infinicore.tan(input._underlying)) + + _infinicore.tan_(out._underlying, input._underlying) + + return out diff --git a/src/infinicore/ops/cat/cat.cc b/src/infinicore/ops/cat/cat.cc new file mode 100644 index 000000000..908f5f457 --- /dev/null +++ b/src/infinicore/ops/cat/cat.cc @@ -0,0 +1,165 @@ +#include "infinicore/ops/cat.hpp" +#include "infinicore/context/context.hpp" +#include <cassert> +#include <cstring> +namespace infinicore::op { + +namespace { + +class 
CatInfo { + + CatInfo(int dim_, int ndim_, size_t tensors_size_, std::vector<int> contiguous_dim_, std::vector<size_t> copy_size_) : dim(dim_), ndim(ndim_), tensors_size(tensors_size_), contiguous_dim(std::move(contiguous_dim_)), copy_size(std::move(copy_size_)) {} + +public: + + int dim; + int ndim; + size_t tensors_size; + + std::vector<int> contiguous_dim; + std::vector<size_t> copy_size; + + static CatInfo create(Tensor& out, std::vector<Tensor>& tensors, int dim) { + + int ndim = out->ndim(); + size_t tensors_size = tensors.size(); + + std::vector<int> contiguous_dim(tensors_size, ndim); + std::vector<size_t> copy_size(tensors_size, dsize(out->dtype())); + + for (int i = 0; i < tensors_size; i ++) { + if (tensors[i]->ndim() == 1) continue; + if (tensors[i]->stride(ndim - 1) == 1) { + contiguous_dim[i] = ndim - 1; + for (int j = ndim - 2; j >= dim; j --) { + if (tensors[i]->stride(j) == tensors[i]->stride(j + 1) * tensors[i]->shape()[j + 1]) { + contiguous_dim[i] = j; + } + } + } + for (int j = contiguous_dim[i]; j < ndim; j ++) { + copy_size[i] *= tensors[i]->shape()[j]; + } + } + + return CatInfo{dim, ndim, tensors_size, contiguous_dim, copy_size}; + } +}; + + +void low_dim_copy( + CatInfo &info, Tensor& tensor, Tensor& out, + std::byte *tensor_ptr, std::byte *out_ptr, + int depth, int tensor_pos) { + + if (depth != info.contiguous_dim[tensor_pos]) { + std::byte* now_tensor_ptr = tensor_ptr; + std::byte* now_out_ptr = out_ptr; + + for (int i = 0; i < tensor->shape()[depth]; i ++) { + + low_dim_copy(info, tensor, out, now_tensor_ptr, now_out_ptr, depth + 1, tensor_pos); + + now_tensor_ptr += tensor->stride(depth) * dsize(tensor->dtype()); + now_out_ptr += out->stride(depth) * dsize(out->dtype()); + } + } else { + if (out->device().getType() == Device::Type::CPU) { + + std::memcpy(out_ptr, tensor_ptr, info.copy_size[tensor_pos]); + } else { + + context::memcpyD2D(out_ptr, tensor_ptr, info.copy_size[tensor_pos]); + } + } +} + +void high_dim_split( + CatInfo &info, std::vector<Tensor>& tensors, Tensor& out, + std::vector<std::byte*> tensors_ptr, std::byte* out_ptr, + int depth) { + + if (depth != info.dim) { + std::vector<std::byte*> now_tensors_ptr = tensors_ptr; + std::byte* now_out_ptr = out_ptr; + + for (int i = 0; i < out->shape()[depth]; i ++) { + 
+ high_dim_split(info, tensors, out, now_tensors_ptr, now_out_ptr, depth + 1); + + for (int i = 0; i < info.tensors_size; i ++) { + if (tensors[i]->ndim() == 1) continue; + now_tensors_ptr[i] += tensors[i]->stride(depth) * dsize(tensors[i]->dtype()); + } + now_out_ptr += out->stride(depth) * dsize(out->dtype()); + } + } else { + std::byte* now_out_ptr = out_ptr; + + for (int i = 0; i < info.tensors_size; i ++) { + if (tensors[i]->ndim() == 1) continue; + + low_dim_copy(info, tensors[i], out, tensors_ptr[i], now_out_ptr, depth, i); + + now_out_ptr += tensors[i]->shape()[depth] * out->stride(depth) * dsize(out->dtype()); + } + } +} + +} // namespace + +Tensor cat(std::vector<Tensor> tensors, int dim) { + assert(tensors.size() >= 2); + int ndim = tensors[0]->ndim(); + assert(-ndim <= dim && dim < ndim); + dim = (dim + ndim) % ndim; + + Shape shape = tensors[0]->shape(); + for (int i = 1; i < tensors.size(); i ++) { + assert(tensors[i]->ndim() == ndim || tensors[i]->ndim() == 1); + if (tensors[i]->ndim() != ndim) continue; + shape[dim] += tensors[i]->shape()[dim]; + } + + auto out = Tensor::empty(shape, tensors[0]->dtype(), tensors[0]->device()); + cat_(out, tensors, dim); + return out; +} + +void cat_(Tensor out, std::vector<Tensor> tensors, int dim) { + // assert the parameter properties are correct. 
+ assert(tensors.size() >= 2); + int ndim = out->ndim(); + assert(-ndim <= dim && dim < ndim); + dim = (dim + ndim) % ndim; + + size_t dim_shape = 0; + for (auto& tensor : tensors) { + assert(tensor->ndim() == ndim || tensor->ndim() == 1); + if (tensor->ndim() == 1) { + assert(tensor->shape()[0] == 0); + continue; + } + for (int i = 0; i < ndim; i ++) { + if (i != dim) { + assert(tensor->shape()[i] == out->shape()[i]); + } else { + dim_shape += tensor->shape()[i]; + } + } + } + assert(dim_shape == out->shape()[dim]); + + // Get info + CatInfo info = CatInfo::create(out, tensors, dim); + std::vector<std::byte*> tensors_ptr(tensors.size()); + for (int i = 0; i < tensors.size(); i ++) { + tensors_ptr[i] = tensors[i]->data(); + } + std::byte* out_ptr = out->data(); + + high_dim_split(info, tensors, out, tensors_ptr, out_ptr, 0); + +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/inner/inner.cc b/src/infinicore/ops/inner/inner.cc new file mode 100644 index 000000000..271ad101c --- /dev/null +++ b/src/infinicore/ops/inner/inner.cc @@ -0,0 +1,46 @@ +#include "infinicore/ops/inner.hpp" +#include "infinicore/context/context.hpp" + +namespace infinicore::op { + + +common::OpDispatcher &Inner::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Inner::execute(Tensor out, Tensor input, Tensor other) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Inner implementation found for device type: " + std::to_string(static_cast<int>(device_type))); + } + + func(out, input, other); +} + +Tensor inner(Tensor input, Tensor other) { + size_t input_ndim = input->ndim(); + size_t other_ndim = other->ndim(); + + assert(input->shape()[input_ndim - 1] == other->shape()[other_ndim - 1]); + + Shape out_shape; + for (int i = 0; i < input_ndim - 1; i ++) + out_shape.push_back(input->shape()[i]); + for (int i = 0; i < other_ndim - 1; i ++) + 
out_shape.push_back(other->shape()[i]); + auto out = Tensor::zeros(out_shape, input->dtype(), input->device()); + + inner_(out, input, other); + return out; +} + +void inner_(Tensor out, Tensor input, Tensor other) { + + Inner::execute(out, input, other); + +} + +} \ No newline at end of file diff --git a/src/infinicore/ops/inner/inner_infiniop.cc b/src/infinicore/ops/inner/inner_infiniop.cc new file mode 100644 index 000000000..fe6b8de9a --- /dev/null +++ b/src/infinicore/ops/inner/inner_infiniop.cc @@ -0,0 +1,59 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/inner.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::inner_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopInnerDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyInnerDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor out, Tensor input, Tensor other) { + size_t seed = hash_combine(out, input, other); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopInnerDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateInnerDescriptor( + context::getInfiniopHandle(input->device()), &desc, + out->desc(), input->desc(), other->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetInnerWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + + INFINICORE_CHECK_ERROR(infiniopInner( + desc, workspace->data(), workspace_size, + out->data(), input->data(), other->data(), context::getStream())); +} + +static bool registered = []() { + Inner::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA, + 
Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR + }, &calculate, false); + return true; +}(); + +} \ No newline at end of file diff --git a/src/infinicore/ops/masked_select/masked_select.cc b/src/infinicore/ops/masked_select/masked_select.cc new file mode 100644 index 000000000..13ac6dfe4 --- /dev/null +++ b/src/infinicore/ops/masked_select/masked_select.cc @@ -0,0 +1,33 @@ +#include +#include + +namespace infinicore::op { + +common::OpDispatcher &MaskedSelect::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void MaskedSelect::execute(Tensor input, Tensor mask, void **data_ptr, size_t *dlen_ptr) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No MaskedSelect implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(input, mask, data_ptr, dlen_ptr); +} + +Tensor masked_select(Tensor input, Tensor mask) { + + std::byte *data; + size_t dlen; + MaskedSelect::execute(input, mask, (void **)&data, &dlen); + + auto out = Tensor::from_blob(data, {dlen}, input->dtype(), input->device()); + + return out; +} + +} diff --git a/src/infinicore/ops/masked_select/masked_select_infiniop.cc b/src/infinicore/ops/masked_select/masked_select_infiniop.cc new file mode 100644 index 000000000..0144ce425 --- /dev/null +++ b/src/infinicore/ops/masked_select/masked_select_infiniop.cc @@ -0,0 +1,59 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/masked_select.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::masked_select_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopMaskedSelectDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyMaskedSelectDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor input, Tensor mask, void 
**data_ptr, size_t *dlen_ptr) { + size_t seed = hash_combine(input, mask, (std::uintptr_t)data_ptr, (std::uintptr_t)dlen_ptr); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopMaskedSelectDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateMaskedSelectDescriptor( + context::getInfiniopHandle(input->device()), &desc, + input->desc(), mask->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetMaskedSelectWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + + INFINICORE_CHECK_ERROR(infiniopMaskedSelect( + desc, workspace->data(), workspace_size, + input->data(), (const bool *)mask->data(), data_ptr, dlen_ptr, context::getStream())); +} + +static bool registered = []() { + MaskedSelect::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR + }, &calculate, false); + return true; +}(); + +} diff --git a/src/infinicore/ops/tan/tan.cc b/src/infinicore/ops/tan/tan.cc new file mode 100644 index 000000000..20bcab4b5 --- /dev/null +++ b/src/infinicore/ops/tan/tan.cc @@ -0,0 +1,32 @@ +#include "infinicore/ops/tan.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Tan::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Tan::execute(Tensor output, Tensor input) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Tan implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor tan(Tensor input) { + Shape shape = 
input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + tan_(output, input); + return output; +} + +void tan_(Tensor output, Tensor input) { + Tan::execute(output, input); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/tan/tan_infiniop.cc b/src/infinicore/ops/tan/tan_infiniop.cc new file mode 100644 index 000000000..1d7c9c874 --- /dev/null +++ b/src/infinicore/ops/tan/tan_infiniop.cc @@ -0,0 +1,58 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/tan.hpp" +#include + +namespace infinicore::op::tan_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopTanDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyTanDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopTanDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateTanDescriptor( + context::getInfiniopHandle(output->device()), &desc, + output->desc(), input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetTanWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopTan( + desc, workspace->data(), workspace_size, + output->data(), input->data(), context::getStream())); +} + +static bool registered = []() { + Tan::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR + }, &calculate, false); + return true; 
+}(); + +} // namespace infinicore::op::tan_impl::infiniop diff --git a/src/infinicore/ops/tanhshrink/tanhshrink.cc b/src/infinicore/ops/tanhshrink/tanhshrink.cc new file mode 100644 index 000000000..ea96e46f4 --- /dev/null +++ b/src/infinicore/ops/tanhshrink/tanhshrink.cc @@ -0,0 +1,32 @@ +#include "infinicore/ops/tanhshrink.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Tanhshrink::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Tanhshrink::execute(Tensor output, Tensor input) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Tanhshrink implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor tanhshrink(Tensor input) { + Shape shape = input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + tanhshrink_(output, input); + return output; +} + +void tanhshrink_(Tensor output, Tensor input) { + Tanhshrink::execute(output, input); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc b/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc new file mode 100644 index 000000000..ca440f875 --- /dev/null +++ b/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc @@ -0,0 +1,58 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/tanhshrink.hpp" +#include + +namespace infinicore::op::tanhshrink_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopTanhshrinkDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyTanhshrinkDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device_type = 
context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopTanhshrinkDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateTanhshrinkDescriptor( + context::getInfiniopHandle(output->device()), &desc, + output->desc(), input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetTanhshrinkWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopTanhshrink( + desc, workspace->data(), workspace_size, + output->data(), input->data(), context::getStream())); +} + +static bool registered = []() { + Tanhshrink::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR + }, &calculate, false); + return true; +}(); + +} // namespace infinicore::op::tanhshrink_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 978defa17..266eb6919 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -4,9 +4,12 @@ #include "ops/add.hpp" #include "ops/attention.hpp" +#include "ops/cat.hpp" #include "ops/causal_softmax.hpp" #include "ops/embedding.hpp" +#include "ops/inner.hpp" #include "ops/linear.hpp" +#include "ops/masked_select.hpp" #include "ops/matmul.hpp" #include "ops/mul.hpp" #include "ops/random_sample.hpp" @@ -15,6 +18,8 @@ #include "ops/rope.hpp" #include "ops/silu.hpp" #include "ops/swiglu.hpp" +#include "ops/tan.hpp" +#include "ops/tanhshrink.hpp" namespace py = pybind11; @@ -23,15 +28,20 @@ namespace infinicore::ops { inline void bind(py::module &m) { bind_add(m); bind_attention(m); + bind_cat(m); bind_causal_softmax(m); + bind_inner(m); bind_random_sample(m); bind_linear(m); 
+ bind_masked_select(m); bind_matmul(m); bind_mul(m); bind_rearrange(m); bind_rms_norm(m); bind_silu(m); bind_swiglu(m); + bind_tan(m); + bind_tanhshrink(m); bind_rope(m); bind_embedding(m); } diff --git a/src/infinicore/pybind11/ops/cat.hpp b/src/infinicore/pybind11/ops/cat.hpp new file mode 100644 index 000000000..2ef3392b0 --- /dev/null +++ b/src/infinicore/pybind11/ops/cat.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include "infinicore/ops/cat.hpp" +#include + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_cat(py::module &m) { + + m.def("cat", + &op::cat, + py::arg("tensors"), + py::arg("dim") = 0, + R"doc(opertor: torch.cat, out-of-place mode)doc"); + + m.def("cat_", + &op::cat_, + py::arg("out"), + py::arg("tensors"), + py::arg("dim") = 0, + R"doc(opertor: torch.cat, in-place mode)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/inner.hpp b/src/infinicore/pybind11/ops/inner.hpp new file mode 100644 index 000000000..16ee91932 --- /dev/null +++ b/src/infinicore/pybind11/ops/inner.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include "infinicore/ops/inner.hpp" +#include + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_inner(py::module &m) { + + m.def("inner", + &op::inner, + py::arg("input"), + py::arg("other"), + R"doc(opertor: torch.inner, out-of-place mode)doc"); + + m.def("inner_", + &op::inner_, + py::arg("out"), + py::arg("input"), + py::arg("other"), + R"doc(opertor: torch.inner, in-place mode)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/masked_select.hpp b/src/infinicore/pybind11/ops/masked_select.hpp new file mode 100644 index 000000000..2734590e6 --- /dev/null +++ b/src/infinicore/pybind11/ops/masked_select.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include "infinicore/ops/masked_select.hpp" +#include + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void 
bind_masked_select(py::module &m) { + + m.def("masked_select", + &op::masked_select, + py::arg("input"), + py::arg("mask"), + R"doc(opertor: torch.masked_select, out-of-place mode)doc"); + +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/tan.hpp b/src/infinicore/pybind11/ops/tan.hpp new file mode 100644 index 000000000..acd62e048 --- /dev/null +++ b/src/infinicore/pybind11/ops/tan.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/tan.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_tan(py::module &m) { + m.def("tan", + &op::tan, + py::arg("input"), + R"doc(opertor: torch.tan, out-of-place mode)doc"); + + m.def("tan_", + &op::tan_, + py::arg("output"), + py::arg("input"), + R"doc(opertor: torch.tan, in-place mode)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/tanhshrink.hpp b/src/infinicore/pybind11/ops/tanhshrink.hpp new file mode 100644 index 000000000..3780555f4 --- /dev/null +++ b/src/infinicore/pybind11/ops/tanhshrink.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/tanhshrink.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_tanhshrink(py::module &m) { + m.def("tanhshrink", + &op::tanhshrink, + py::arg("input"), + R"doc(opertor: torch.tanhshrink, out-of-place mode)doc"); + + m.def("tanhshrink_", + &op::tanhshrink_, + py::arg("output"), + py::arg("input"), + R"doc(opertor: torch.tanhshrink, in-place mode)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infiniop/ops/inner/cpu/inner_cpu.cc b/src/infiniop/ops/inner/cpu/inner_cpu.cc new file mode 100644 index 000000000..5915cb3fc --- /dev/null +++ b/src/infiniop/ops/inner/cpu/inner_cpu.cc @@ -0,0 +1,88 @@ +#include "inner_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::inner::cpu { + +Descriptor::~Descriptor() {} + +infiniStatus_t Descriptor::create( + infiniopHandle_t 
handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto result = InnerInfo::create(out_desc, input_desc, other_desc); + CHECK_RESULT(result); + *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t inner(const InnerInfo *info, const T *input, const T *other, T *out) { +#pragma omp parallel for + for (size_t out_index = 0; out_index < info->total_elements; out_index ++) { + + size_t out_dim_pos = info->out_ndim - 1; + size_t out_offset = op::common_cpu::indexToOffset(out_index, info->out_ndim, info->out_shape.data(), info->out_strides.data()); + size_t index = out_index; + + ptrdiff_t input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)info->other_ndim - 2; i >= 0; i --) { + other_offset += (index % info->out_shape[out_dim_pos]) * info->other_strides[i]; + index /= info->out_shape[out_dim_pos --]; + } + for (int i = (int)info->input_ndim - 2; i >= 0; i --) { + input_offset += (index % info->out_shape[out_dim_pos]) * info->input_strides[i]; + index /= info->out_shape[out_dim_pos --]; + } + + float tmp = 0.; + for (size_t i = 0; i < info->oper_len; i ++) { + if constexpr (std::is_same::value || std::is_same::value) { + tmp += utils::cast(input[input_offset]) * utils::cast(other[other_offset]); + } else { + tmp += input[input_offset] * other[other_offset]; + } + input_offset += info->input_strides[info->input_ndim - 1]; + other_offset += info->other_strides[info->other_ndim - 1]; + } + + if constexpr (std::is_same::value || std::is_same::value) { + out[out_offset] = utils::cast(tmp); + } else { + out[out_offset] = tmp; + } + } + + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void* workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream) const { + + 
if (_info.dtype == INFINI_DTYPE_BF16) { + CHECK_STATUS(inner(&_info, (const bf16_t *)input, (const bf16_t *)other, (bf16_t *)out)); + } else if (_info.dtype == INFINI_DTYPE_F16) { + CHECK_STATUS(inner(&_info, (const fp16_t *)input, (const fp16_t *)other, (fp16_t *)out)); + } else if (_info.dtype == INFINI_DTYPE_F32){ + CHECK_STATUS(inner(&_info, (const float *)input, (const float *)other, (float *)out)); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/inner/cpu/inner_cpu.h b/src/infiniop/ops/inner/cpu/inner_cpu.h new file mode 100644 index 000000000..5d4774dfa --- /dev/null +++ b/src/infiniop/ops/inner/cpu/inner_cpu.h @@ -0,0 +1,7 @@ +#ifndef __INNER_CPU_H__ +#define __INNER_CPU_H__ +#include "../inner.h" + +DESCRIPTOR(cpu); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/cuda/kernel.cuh b/src/infiniop/ops/inner/cuda/kernel.cuh new file mode 100644 index 000000000..1616a4281 --- /dev/null +++ b/src/infiniop/ops/inner/cuda/kernel.cuh @@ -0,0 +1,41 @@ +#ifndef __INNER_KERNEL_CUH__ +#define __INNER_KERNEL_CUH__ + +template +INFINIOP_CUDA_KERNEL innerKernel( + const T *input, const T *other, T *out, + size_t total_elements, size_t oper_len, size_t *out_shape, + ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, + size_t input_ndim, size_t other_ndim, size_t out_ndim) { + + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; + if (out_index >= total_elements) return; + + size_t out_offset = device::nvidia::indexToOffset(out_index, out_ndim, out_shape, out_strides); + + size_t out_dim_pos = out_ndim - 1; + size_t index = out_index; + + ptrdiff_t input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)other_ndim - 2; i >= 0; i --) { + other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; + index /= out_shape[out_dim_pos --]; + } + for (int i = (int)input_ndim - 2; i >= 0; i --) { + 
input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; + index /= out_shape[out_dim_pos --]; + } + + T tmp = 0; + for (size_t i = 0; i < oper_len; i ++) { + tmp += input[input_offset] * other[other_offset]; + input_offset += input_strides[input_ndim - 1]; + other_offset += other_strides[other_ndim - 1]; + } + + out[out_offset] = tmp; +} + +#endif // __INNER_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/info.h b/src/infiniop/ops/inner/info.h new file mode 100644 index 000000000..2484277eb --- /dev/null +++ b/src/infiniop/ops/inner/info.h @@ -0,0 +1,88 @@ +#ifndef __INNER_INFO_H__ +#define __INNER_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::inner { + +class InnerInfo { + InnerInfo() = default; + +public: + infiniDtype_t dtype; + + size_t input_ndim; + size_t other_ndim; + size_t out_ndim; + + size_t oper_len; + size_t total_elements; + + std::vector out_shape; + + std::vector input_strides; + std::vector other_strides; + std::vector out_strides; + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto dtype = out_desc->dtype(); + if (dtype != input_desc->dtype() || dtype != other_desc->dtype()) + return INFINI_STATUS_BAD_TENSOR_DTYPE; + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + auto input_ndim = input_desc->ndim(); + auto other_ndim = other_desc->ndim(); + auto out_ndim = out_desc->ndim(); + if (out_ndim + 2 != input_ndim + other_ndim) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + + auto input_shape = input_desc->shape(); + auto other_shape = other_desc->shape(); + auto out_shape = out_desc->shape(); + if (input_ndim && other_ndim) { + if (input_shape[input_ndim - 1] != other_shape[other_ndim - 1]) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + size_t out_dim_pos = 0; + if (input_ndim) { + for (size_t i = 0; i < input_ndim - 1; i ++) 
+ if (input_shape[i] != out_shape[out_dim_pos ++]) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (other_ndim) { + for (size_t i = 0; i < other_ndim - 1; i ++) + if (other_shape[i] != out_shape[out_dim_pos ++]) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t oper_len = 1; + if (input_ndim && other_ndim) { + oper_len = input_shape[input_ndim - 1]; + } + + size_t total_elements = 1; + for (size_t i = 0; i < out_ndim; i ++) + total_elements *= out_shape[i]; + + auto input_strides = input_desc->strides(); + auto other_strides = other_desc->strides(); + auto out_strides = out_desc->strides(); + + return utils::Result(InnerInfo{ + dtype, + input_ndim, other_ndim, out_ndim, + oper_len, total_elements, out_shape, + input_strides, other_strides, out_strides + }); + } +}; + +} // namespace op::inner + +#endif // __INNER_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/inner.h b/src/infiniop/ops/inner/inner.h new file mode 100644 index 000000000..5c44262a7 --- /dev/null +++ b/src/infiniop/ops/inner/inner.h @@ -0,0 +1,49 @@ +#ifndef Inner_H +#define Inner_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::inner::NAMESPACE { \ + class Descriptor final: public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + InnerInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + InnerInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t other_desc); \ + \ + infiniStatus_t calculate( \ + void 
*workspace, size_t workspace_size, \ + void *out, \ + const void *input, \ + const void *other, \ + void *stream) const; \ + }; \ + } + + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/metax/inner_metax.h b/src/infiniop/ops/inner/metax/inner_metax.h new file mode 100644 index 000000000..04a47d445 --- /dev/null +++ b/src/infiniop/ops/inner/metax/inner_metax.h @@ -0,0 +1,7 @@ +#ifndef __INNER_METAX_H__ +#define __INNER_METAX_H__ +#include "../inner.h" + +DESCRIPTOR(metax); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/metax/inner_metax.maca b/src/infiniop/ops/inner/metax/inner_metax.maca new file mode 100644 index 000000000..683f4a1e1 --- /dev/null +++ b/src/infiniop/ops/inner/metax/inner_metax.maca @@ -0,0 +1,123 @@ +#include "../../../devices/metax/metax_common.h" +#include "inner_metax.h" + +#include "../../../devices/metax/metax_kernel_common.h" + +#include "inner_metax_kernel.h" + +namespace op::inner::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto result = InnerInfo::create(out_desc, input_desc, other_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // out_shape + workspace_size += info.out_ndim * sizeof(size_t); + // input, other, out strides + workspace_size += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const InnerInfo &info, + const T *input, const T *other, T *out, + hcStream_t stream, void *workspace, size_t 
workspace_size) { + + size_t input_ndim = info.input_ndim; + size_t other_ndim = info.other_ndim; + size_t out_ndim = info.out_ndim; + size_t total_elements = info.total_elements; + size_t oper_len = info.oper_len; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *out_shape_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += out_ndim * sizeof(size_t); + + ptrdiff_t *input_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); + ptrdiff_t *other_strides_hc = input_strides_hc + input_ndim; + ptrdiff_t *out_strides_hc = other_strides_hc + other_ndim; + workspace_offset += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + + assert(workspace_offset == workspace_size); + + CHECK_METAX(hcMemcpyAsync(out_shape_hc, info.out_shape.data(), out_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_hc, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(other_strides_hc, info.other_strides.data(), other_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(out_strides_hc, info.out_strides.data(), out_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + innerKernel<<>>( + input, other, out, + total_elements, oper_len, out_shape_hc, + input_strides_hc, other_strides_hc, out_strides_hc, + input_ndim, other_ndim, out_ndim); + + // CHECK_METAX(hcDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream_) const { + + hcStream_t stream = (hcStream_t)stream_; +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T 
*)other, (T *)out, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __hpcc_bfloat16);\ + else if(_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if(_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CALCULATE_INNER_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CALCULATE_INNER_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_512) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/inner/metax/inner_metax_kernel.h b/src/infiniop/ops/inner/metax/inner_metax_kernel.h new file mode 100644 index 000000000..52f37e1ef --- /dev/null +++ b/src/infiniop/ops/inner/metax/inner_metax_kernel.h @@ -0,0 +1,59 @@ +#ifndef __INNER_METAX_KERNEL_H__ +#define __INNER_METAX_KERNEL_H__ + +template +INFINIOP_METAX_KERNEL innerKernel( + const T *input, const T *other, T *out, + size_t total_elements, size_t oper_len, size_t *out_shape, + ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, + size_t input_ndim, size_t other_ndim, size_t out_ndim) { + + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; + if (out_index >= total_elements) return; + + size_t out_offset = device::metax::indexToOffset(out_index, out_ndim, out_shape, out_strides); + + size_t out_dim_pos = out_ndim - 1; + size_t index = out_index; + + ptrdiff_t input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)other_ndim - 2; i >= 0; i --) { + other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; + index /= out_shape[out_dim_pos --]; + } + for (int i = 
(int)input_ndim - 2; i >= 0; i --) { + input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; + index /= out_shape[out_dim_pos --]; + } + + // T tmp = input[input_offset] * other[other_offset]; + // input_offset += input_strides[input_ndim - 1]; + // other_offset += other_strides[other_ndim - 1]; + T tmp = 0.; + for (size_t i = 0; i < oper_len; i ++) { + if constexpr (std::is_same_v || std::is_same_v) { + tmp += input[input_offset] * other[other_offset]; + } else { + // if (out_index == 0) { + // printf("input: %.10f\tother:%.10f\n", input[input_offset], other[other_offset]); + // } + T mul_a = __fmul_rn(input[input_offset], other[other_offset]); + // if (out_index == 0) { + // printf("mul answer: %.10f\n", mul_a); + // printf("tmp val: %.10f\n", tmp); + // } + tmp = __fadd_rd(tmp, mul_a); + // if (out_index == 0) + // printf("add answer: %.10f\n", tmp); + } + input_offset += input_strides[input_ndim - 1]; + other_offset += other_strides[other_ndim - 1]; + } + + // out[out_offset] = static_cast(tmp); + out[out_offset] = tmp; +} + +#endif // __INNER_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/moore/inner_moore.h b/src/infiniop/ops/inner/moore/inner_moore.h new file mode 100644 index 000000000..a508a6718 --- /dev/null +++ b/src/infiniop/ops/inner/moore/inner_moore.h @@ -0,0 +1,7 @@ +#ifndef __INNER_MOORE_H__ +#define __INNER_MOORE_H__ +#include "../inner.h" + +DESCRIPTOR(moore); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/moore/inner_moore.mu b/src/infiniop/ops/inner/moore/inner_moore.mu new file mode 100644 index 000000000..61a430d2c --- /dev/null +++ b/src/infiniop/ops/inner/moore/inner_moore.mu @@ -0,0 +1,123 @@ +#include "../../../devices/moore/moore_common.h" +#include "inner_moore.h" + +#include "../../../devices/moore/moore_kernel_common.h" + +#include "inner_moore_kernel.h" + +namespace op::inner::moore { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + 
+Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto result = InnerInfo::create(out_desc, input_desc, other_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // out_shape + workspace_size += info.out_ndim * sizeof(size_t); + // input, other, out strides + workspace_size += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const InnerInfo &info, + const T *input, const T *other, T *out, + musaStream_t stream, void *workspace, size_t workspace_size) { + + size_t input_ndim = info.input_ndim; + size_t other_ndim = info.other_ndim; + size_t out_ndim = info.out_ndim; + size_t total_elements = info.total_elements; + size_t oper_len = info.oper_len; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *out_shape_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += out_ndim * sizeof(size_t); + + ptrdiff_t *input_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); + ptrdiff_t *other_strides_musa = input_strides_musa + input_ndim; + ptrdiff_t *out_strides_musa = other_strides_musa + other_ndim; + workspace_offset += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + + assert(workspace_offset == workspace_size); + + CHECK_MOORE(musaMemcpyAsync(out_shape_musa, info.out_shape.data(), out_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(input_strides_musa, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), 
musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(other_strides_musa, info.other_strides.data(), other_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(out_strides_musa, info.out_strides.data(), out_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + innerKernel<<>>( + input, other, out, + total_elements, oper_len, out_shape_musa, + input_strides_musa, other_strides_musa, out_strides_musa, + input_ndim, other_ndim, out_ndim); + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream_) const { + + musaStream_t stream = (musaStream_t)stream_; +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T *)other, (T *)out, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __mt_bfloat16); \ + else if(_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if(_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { + CALCULATE_INNER_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { + CALCULATE_INNER_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_2048) { + CALCULATE_INNER_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_2048) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git 
a/src/infiniop/ops/inner/moore/inner_moore_kernel.h b/src/infiniop/ops/inner/moore/inner_moore_kernel.h new file mode 100644 index 000000000..de82dc8a9 --- /dev/null +++ b/src/infiniop/ops/inner/moore/inner_moore_kernel.h @@ -0,0 +1,45 @@ +#ifndef __INNER_MOORE_KERNEL_H__ +#define __INNER_MOORE_KERNEL_H__ + +template +INFINIOP_MOORE_KERNEL innerKernel( + const T *input, const T *other, T *out, + size_t total_elements, size_t oper_len, size_t *out_shape, + ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, + size_t input_ndim, size_t other_ndim, size_t out_ndim) { + + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; + if (out_index >= total_elements) return; + + size_t out_offset = device::moore::indexToOffset(out_index, out_ndim, out_shape, out_strides); + + size_t out_dim_pos = out_ndim - 1; + size_t index = out_index; + + ptrdiff_t input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)other_ndim - 2; i >= 0; i --) { + other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; + index /= out_shape[out_dim_pos --]; + } + for (int i = (int)input_ndim - 2; i >= 0; i --) { + input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; + index /= out_shape[out_dim_pos --]; + } + + float tmp = 0.; + for (size_t i = 0; i < oper_len; i ++) { + // if constexpr (std::is_same_v || std::is_same_v) { + tmp += static_cast(input[input_offset]) * static_cast(other[other_offset]); + // } else { + // tmp += input[input_offset] * other[other_offset]; + // } + input_offset += input_strides[input_ndim - 1]; + other_offset += other_strides[other_ndim - 1]; + } + + out[out_offset] = static_cast(tmp); +} + +#endif // __INNER_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/nvidia/inner_nvidia.cu b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu new file mode 100644 index 000000000..e2b3ed0f9 --- /dev/null +++ b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu @@ -0,0 +1,123 @@ +#include 
"../../../devices/nvidia/nvidia_common.cuh" +#include "inner_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "../cuda/kernel.cuh" + +namespace op::inner::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto result = InnerInfo::create(out_desc, input_desc, other_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // out_shape + workspace_size += info.out_ndim * sizeof(size_t); + // input, other, out strides + workspace_size += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const InnerInfo &info, + const T *input, const T *other, T *out, + cudaStream_t stream, void *workspace, size_t workspace_size) { + + size_t input_ndim = info.input_ndim; + size_t other_ndim = info.other_ndim; + size_t out_ndim = info.out_ndim; + size_t total_elements = info.total_elements; + size_t oper_len = info.oper_len; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *out_shape_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += out_ndim * sizeof(size_t); + + ptrdiff_t *input_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + ptrdiff_t *other_strides_cuda = input_strides_cuda + input_ndim; + ptrdiff_t *out_strides_cuda = other_strides_cuda + other_ndim; + workspace_offset += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + + 
CHECK_CUDA(cudaMemcpyAsync(out_shape_cuda, info.out_shape.data(), out_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(other_strides_cuda, info.other_strides.data(), other_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(out_strides_cuda, info.out_strides.data(), out_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + innerKernel<<>>( + input, other, out, + total_elements, oper_len, out_shape_cuda, + input_strides_cuda, other_strides_cuda, out_strides_cuda, + input_ndim, other_ndim, out_ndim); + + // CHECK_CUDA(cudaDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream_) const { + + cudaStream_t stream = (cudaStream_t)stream_; +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T *)other, (T *)out, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __nv_bfloat16); \ + else if(_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if(_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + } else if 
(_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh b/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh new file mode 100644 index 000000000..11033c2c0 --- /dev/null +++ b/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __INNER_NVIDIA_H__ +#define __INNER_NVIDIA_H__ +#include "../inner.h" + +DESCRIPTOR(nvidia); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/operator.cc b/src/infiniop/ops/inner/operator.cc new file mode 100644 index 000000000..c68762d0e --- /dev/null +++ b/src/infiniop/ops/inner/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/inner.h" + +#ifdef ENABLE_CPU_API +#include "cpu/inner_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/inner_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/inner_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/inner_moore.h" +#endif + +__C infiniStatus_t infiniopCreateInnerDescriptor( + infiniopHandle_t handle, + infiniopInnerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::inner::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + out_desc, \ + input_desc, \ + other_desc); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API 
+ CREATE(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopInner( + infiniopInnerDescriptor_t desc, + void *workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, out, input, other, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + 
DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + DESTROY(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc new file mode 100644 index 000000000..ae74587d8 --- /dev/null +++ b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc @@ -0,0 +1,58 @@ +#include "masked_select_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::masked_select::cpu { + +Descriptor::~Descriptor() {} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t maskedSelect(const MaskedSelectInfo *info, const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr) { + std::vector res; + for (size_t index = 0; index < info->total_elements; index ++) { + size_t input_offset = op::common_cpu::indexToOffset(index, info->ndim, info->shape.data(), info->input_strides.data()); + size_t mask_offset = op::common_cpu::indexToOffset(index, info->ndim, info->shape.data(), info->mask_strides.data()); + if (mask[mask_offset]) { + res.push_back(input[input_offset]); + } + } + *dlen_ptr = res.size(); + *data_ptr = new T[*dlen_ptr]; + std::memcpy(*data_ptr, res.data(), sizeof(T) * (*dlen_ptr)); + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + 
size_t *dlen_ptr, + void *stream) const { + + if (_info.dtype == INFINI_DTYPE_F32) { + CHECK_STATUS(maskedSelect(&_info, (const float *)input, mask, data_ptr, dlen_ptr)); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h new file mode 100644 index 000000000..9fed9e2e6 --- /dev/null +++ b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h @@ -0,0 +1,7 @@ +#ifndef __MASKED_SELECT_CPU_H__ +#define __MASKED_SELECT_CPU_H__ +#include "../masked_select.h" + +DESCRIPTOR(cpu) + +#endif // __MASKED_SELECT_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/cuda/kernel.cuh b/src/infiniop/ops/masked_select/cuda/kernel.cuh new file mode 100644 index 000000000..fc7485f31 --- /dev/null +++ b/src/infiniop/ops/masked_select/cuda/kernel.cuh @@ -0,0 +1,101 @@ +#ifndef __MASKED_SELECT_KERNEL_CUH__ +#define __MASKED_SELECT_KERNEL_CUH__ + +template +INFINIOP_CUDA_KERNEL maskedSelectGetMarkScanOnceKernel( + const bool *mask, size_t *mark_scan, size_t total_elements, + size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < total_elements) { + size_t mask_offset = device::nvidia::indexToOffset(index, ndim, shape, mask_strides); + smem[tid] = mask[mask_offset]; + } else { + smem[tid] = 0; + } + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + float temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_CUDA_KERNEL maskedSelectScanWithStrideKernel( + size_t *mark_scan, size_t total_elements, size_t stride) { + + __shared__ __align__(128) 
size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = (blockDim.x * blockIdx.x + threadIdx.x + 1) * stride - 1; + smem[tid] = index < total_elements ? mark_scan[index] : 0.; + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + size_t temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_CUDA_KERNEL maskedSelectCountScanResultKernel( + size_t *mark_scan, size_t *scan_result, size_t total_elements) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= total_elements) + return; + + size_t val = mark_scan[index]; + + for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { + size_t now_stride_block = (index + 1) / s; + size_t pre_stride_block = now_stride_block * s - 1; + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + continue; + val += mark_scan[pre_stride_block]; + } + + scan_result[index] = val; +} + +template +INFINIOP_CUDA_KERNEL maskedSelectGetDataKernel( + const T *input, const bool *mask, size_t *scan_result, T *data, size_t total_elements, + size_t *shape, ptrdiff_t *input_strides, ptrdiff_t *mask_strides, size_t ndim) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + + if (index >= total_elements) + return; + + size_t input_offset = device::nvidia::indexToOffset(index, ndim, shape, input_strides); + size_t mask_offset = device::nvidia::indexToOffset(index, ndim, shape, mask_strides); + + if (mask[mask_offset]) { + data[scan_result[index] - 1] = input[input_offset]; + } +} + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/info.h b/src/infiniop/ops/masked_select/info.h new file mode 100644 index 000000000..889b32821 --- /dev/null +++ b/src/infiniop/ops/masked_select/info.h @@ -0,0 +1,53 @@ +#ifndef __MASKED_SELECT__INFO_H__ +#define 
__MASKED_SELECT__INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::masked_select { + +class MaskedSelectInfo { + MaskedSelectInfo() = default; + +public: + infiniDtype_t dtype; + size_t ndim; + size_t total_elements; + + std::vector shape; + + std::vector input_strides; + std::vector mask_strides; + + static utils::Result create(infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t mask_desc) { + auto dtype = input_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F32); + CHECK_DTYPE(mask_desc->dtype(), INFINI_DTYPE_BOOL); + + auto shape = input_desc->shape(); + CHECK_SAME_SHAPE(shape, mask_desc->shape()); + + auto ndim = input_desc->ndim(); + + size_t total_elements = 1; + for (auto& dim : shape) { + total_elements *= dim; + } + + auto input_strides = input_desc->strides(); + auto mask_strides = mask_desc->strides(); + + return utils::Result(MaskedSelectInfo{ + dtype, + ndim, + total_elements, + shape, + input_strides, + mask_strides}); + } +}; + +} // namespace op::masked_select + +#endif // __MASKED_SELECT_INFO_H__ diff --git a/src/infiniop/ops/masked_select/masked_select.h b/src/infiniop/ops/masked_select/masked_select.h new file mode 100644 index 000000000..98148ebaf --- /dev/null +++ b/src/infiniop/ops/masked_select/masked_select.h @@ -0,0 +1,48 @@ +#ifndef MASKED_SELECT_H +#define MASKED_SELECT_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::masked_select::NAMESPACE { \ + class Descriptor final: public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + MaskedSelectInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + MaskedSelectInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + \ + public: \ + ~Descriptor(); \ + size_t 
workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t mask_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + const void *input, \ + const bool *mask, \ + void **data_ptr, \ + size_t *dlen_ptr, \ + void *stream) const; \ + }; \ + } + +#endif // MASKED_SELECT_H \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/metax/masked_select_metax.h b/src/infiniop/ops/masked_select/metax/masked_select_metax.h new file mode 100644 index 000000000..47460777f --- /dev/null +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax.h @@ -0,0 +1,7 @@ +#ifndef __MASKED_SELECT_METAX_H__ +#define __MASKED_SELECT_METAX_H__ +#include "../masked_select.h" + +DESCRIPTOR(metax); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/metax/masked_select_metax.maca b/src/infiniop/ops/masked_select/metax/masked_select_metax.maca new file mode 100644 index 000000000..4a4181153 --- /dev/null +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax.maca @@ -0,0 +1,154 @@ +#include "../../../devices/metax/metax_common.h" +#include "masked_select_metax.h" + +#include "../../../devices/metax/metax_kernel_common.h" + +// #include "infinicore/context/context.hpp" +#include "masked_select_metax_kernel.h" + +namespace op::masked_select::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // shape size + workspace_size += info.ndim * sizeof(size_t); + // 
strides size * 2 + workspace_size += info.ndim * sizeof(ptrdiff_t) * 2; + // size_t mark_scan + workspace_size += info.total_elements * sizeof(size_t); + // size_t mark_result + workspace_size += info.total_elements * sizeof(size_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const MaskedSelectInfo &info, + const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, + hcStream_t stream, void *workspace, size_t workspace_size) { + + size_t ndim = info.ndim; + size_t total_elements = info.total_elements; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *shape_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(size_t); + + ptrdiff_t *input_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + ptrdiff_t *mask_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + size_t *mark_scan = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + size_t *scan_result = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + assert(workspace_offset == workspace_size); + + CHECK_METAX(hcMemcpyAsync(shape_hc, info.shape.data(), ndim * sizeof(size_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_hc, info.input_strides.data(), ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(mask_strides_hc, info.mask_strides.data(), ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + block_size - 1) / block_size; + + 
maskedSelectGetMarkScanOnceKernel<<>>( + mask, mark_scan, total_elements, + shape_hc, mask_strides_hc, ndim); + CHECK_METAX(hcDeviceSynchronize()); + + for (size_t stride = BLOCK_SIZE; stride < total_elements; stride *= BLOCK_SIZE) { + size_t stride_elements = (total_elements + stride - 1) / stride; + size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + maskedSelectScanWithStrideKernel<<>>( + mark_scan, total_elements, stride); + CHECK_METAX(hcDeviceSynchronize()); + } + if (total_elements > BLOCK_SIZE) { + maskedSelectCountScanResultKernel<<>>( + mark_scan, scan_result, total_elements); + CHECK_METAX(hcDeviceSynchronize()); + } else { + scan_result = mark_scan; + } + + + CHECK_METAX(hcMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), hcMemcpyDeviceToHost, stream)); + + CHECK_METAX(hcMalloc(data_ptr, *dlen_ptr * sizeof(T))); + + // context::setDevice(METAX); + // context::allocateMemory(data_ptr, *dlen_ptr * sizeof(T)); + + maskedSelectGetDataKernel<<>>( + input, mask, scan_result, (T *)*data_ptr, total_elements, + shape_hc, input_strides_hc, mask_strides_hc, ndim); + CHECK_METAX(hcDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen_ptr, + void *stream_) const { + + hcStream_t stream = (hcStream_t)stream_; +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_1024); + } else if 
(_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_512); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} diff --git a/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h b/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h new file mode 100644 index 000000000..e590de955 --- /dev/null +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h @@ -0,0 +1,101 @@ +#ifndef __MASKED_SELECT_METAX_KERNEL_CUH__ +#define __MASKED_SELECT_METAX_KERNEL_CUH__ + +template +INFINIOP_METAX_KERNEL maskedSelectGetMarkScanOnceKernel( + const bool *mask, size_t *mark_scan, size_t total_elements, + size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < total_elements) { + size_t mask_offset = device::metax::indexToOffset(index, ndim, shape, mask_strides); + smem[tid] = mask[mask_offset]; + } else { + smem[tid] = 0; + } + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + float temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_METAX_KERNEL maskedSelectScanWithStrideKernel( + size_t *mark_scan, size_t total_elements, size_t stride) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = (blockDim.x * blockIdx.x + threadIdx.x + 1) * stride - 1; + smem[tid] = index < total_elements ? 
mark_scan[index] : 0.; + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + size_t temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_METAX_KERNEL maskedSelectCountScanResultKernel( + size_t *mark_scan, size_t *scan_result, size_t total_elements) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= total_elements) + return; + + size_t val = mark_scan[index]; + + for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { + size_t now_stride_block = (index + 1) / s; + size_t pre_stride_block = now_stride_block * s - 1; + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + continue; + val += mark_scan[pre_stride_block]; + } + + scan_result[index] = val; +} + +template +INFINIOP_METAX_KERNEL maskedSelectGetDataKernel( + const T *input, const bool *mask, size_t *scan_result, T *data, size_t total_elements, + size_t *shape, ptrdiff_t *input_strides, ptrdiff_t *mask_strides, size_t ndim) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + + if (index >= total_elements) + return; + + size_t input_offset = device::metax::indexToOffset(index, ndim, shape, input_strides); + size_t mask_offset = device::metax::indexToOffset(index, ndim, shape, mask_strides); + + if (mask[mask_offset]) { + data[scan_result[index] - 1] = input[input_offset]; + } +} + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/moore/masked_select_moore.h b/src/infiniop/ops/masked_select/moore/masked_select_moore.h new file mode 100644 index 000000000..24548d8fc --- /dev/null +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore.h @@ -0,0 +1,7 @@ +#ifndef __MASKED_SELECT_MOORE_H__ +#define __MASKED_SELECT_MOORE_H__ +#include "../masked_select.h" + +DESCRIPTOR(moore); + +#endif \ No newline at end of file diff 
--git a/src/infiniop/ops/masked_select/moore/masked_select_moore.mu b/src/infiniop/ops/masked_select/moore/masked_select_moore.mu new file mode 100644 index 000000000..3c218bc19 --- /dev/null +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore.mu @@ -0,0 +1,156 @@ +#include "../../../devices/moore/moore_common.h" +#include "masked_select_moore.h" + +#include "../../../devices/moore/moore_kernel_common.h" + +// #include "infinicore/context/context.hpp" +#include "masked_select_moore_kernel.h" + +namespace op::masked_select::moore { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // shape size + workspace_size += info.ndim * sizeof(size_t); + // strides size * 2 + workspace_size += info.ndim * sizeof(ptrdiff_t) * 2; + // size_t mark_scan + workspace_size += info.total_elements * sizeof(size_t); + // size_t mark_result + workspace_size += info.total_elements * sizeof(size_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const MaskedSelectInfo &info, + const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, + musaStream_t stream, void *workspace, size_t workspace_size) { + + size_t ndim = info.ndim; + size_t total_elements = info.total_elements; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *shape_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(size_t); + + ptrdiff_t 
*input_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + ptrdiff_t *mask_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + size_t *mark_scan = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + size_t *scan_result = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + assert(workspace_offset == workspace_size); + + CHECK_MOORE(musaMemcpyAsync(shape_musa, info.shape.data(), ndim * sizeof(size_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(input_strides_musa, info.input_strides.data(), ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(mask_strides_musa, info.mask_strides.data(), ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + block_size - 1) / block_size; + + maskedSelectGetMarkScanOnceKernel<<>>( + mask, mark_scan, total_elements, + shape_musa, mask_strides_musa, ndim); + CHECK_MOORE(musaDeviceSynchronize()); + + for (size_t stride = BLOCK_SIZE; stride < total_elements; stride *= BLOCK_SIZE) { + size_t stride_elements = (total_elements + stride - 1) / stride; + size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + maskedSelectScanWithStrideKernel<<>>( + mark_scan, total_elements, stride); + CHECK_MOORE(musaDeviceSynchronize()); + } + if (total_elements > BLOCK_SIZE) { + maskedSelectCountScanResultKernel<<>>( + mark_scan, scan_result, total_elements); + CHECK_MOORE(musaDeviceSynchronize()); + } else { + scan_result = mark_scan; + } + + + CHECK_MOORE(musaMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), musaMemcpyDeviceToHost, stream)); + + CHECK_MOORE(musaMalloc(data_ptr, *dlen_ptr * sizeof(T))); + + // 
context::setDevice(MOORE); + // context::allocateMemory(data_ptr, *dlen_ptr * sizeof(T)); + + maskedSelectGetDataKernel<<>>( + input, mask, scan_result, (T *)*data_ptr, total_elements, + shape_musa, input_strides_musa, mask_strides_musa, ndim); + CHECK_MOORE(musaDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen_ptr, + void *stream_) const { + + musaStream_t stream = (musaStream_t)stream_; +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_1024); + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_512); + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_2048) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_2048); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} diff --git a/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h b/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h new file mode 100644 index 000000000..dcdd87c6d --- /dev/null +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h @@ -0,0 +1,101 @@ +#ifndef __MASKED_SELECT_MOORE_KERNEL_CUH__ +#define __MASKED_SELECT_MOORE_KERNEL_CUH__ + +template +INFINIOP_MOORE_KERNEL maskedSelectGetMarkScanOnceKernel( + const bool *mask, size_t *mark_scan, 
size_t total_elements, + size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < total_elements) { + size_t mask_offset = device::moore::indexToOffset(index, ndim, shape, mask_strides); + smem[tid] = mask[mask_offset]; + } else { + smem[tid] = 0; + } + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + float temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_MOORE_KERNEL maskedSelectScanWithStrideKernel( + size_t *mark_scan, size_t total_elements, size_t stride) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = (blockDim.x * blockIdx.x + threadIdx.x + 1) * stride - 1; + smem[tid] = index < total_elements ? mark_scan[index] : 0.; + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + size_t temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_MOORE_KERNEL maskedSelectCountScanResultKernel( + size_t *mark_scan, size_t *scan_result, size_t total_elements) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= total_elements) + return; + + size_t val = mark_scan[index]; + + for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { + size_t now_stride_block = (index + 1) / s; + size_t pre_stride_block = now_stride_block * s - 1; + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + continue; + val += mark_scan[pre_stride_block]; + } + + scan_result[index] = val; +} + +template +INFINIOP_MOORE_KERNEL maskedSelectGetDataKernel( + const T *input, const 
bool *mask, size_t *scan_result, T *data, size_t total_elements, + size_t *shape, ptrdiff_t *input_strides, ptrdiff_t *mask_strides, size_t ndim) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + + if (index >= total_elements) + return; + + size_t input_offset = device::moore::indexToOffset(index, ndim, shape, input_strides); + size_t mask_offset = device::moore::indexToOffset(index, ndim, shape, mask_strides); + + if (mask[mask_offset]) { + data[scan_result[index] - 1] = input[input_offset]; + } +} + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu new file mode 100644 index 000000000..eebba0fd3 --- /dev/null +++ b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu @@ -0,0 +1,149 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "masked_select_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "../cuda/kernel.cuh" + +namespace op::masked_select::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // shape size + workspace_size += info.ndim * sizeof(size_t); + // strides size * 2 + workspace_size += info.ndim * sizeof(ptrdiff_t) * 2; + // size_t mark_scan + workspace_size += info.total_elements * sizeof(size_t); + // size_t mark_result + workspace_size += info.total_elements * sizeof(size_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace 
{ + +template +infiniStatus_t launchKernel( + const MaskedSelectInfo &info, + const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, + cudaStream_t stream, void *workspace, size_t workspace_size) { + + size_t ndim = info.ndim; + size_t total_elements = info.total_elements; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *shape_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(size_t); + + ptrdiff_t *input_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + ptrdiff_t *mask_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + size_t *mark_scan = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + size_t *scan_result = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + CHECK_CUDA(cudaMemcpyAsync(shape_cuda, info.shape.data(), ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(mask_strides_cuda, info.mask_strides.data(), ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + block_size - 1) / block_size; + + maskedSelectGetMarkScanOnceKernel<<>>( + mask, mark_scan, total_elements, + shape_cuda, mask_strides_cuda, ndim); + CHECK_CUDA(cudaDeviceSynchronize()); + + for (size_t stride = BLOCK_SIZE; stride < total_elements; stride *= BLOCK_SIZE) { + size_t stride_elements = (total_elements + stride - 1) / stride; + size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + maskedSelectScanWithStrideKernel<<>>( + mark_scan, 
total_elements, stride); + CHECK_CUDA(cudaDeviceSynchronize()); + } + if (total_elements > BLOCK_SIZE) { + maskedSelectCountScanResultKernel<<>>( + mark_scan, scan_result, total_elements); + CHECK_CUDA(cudaDeviceSynchronize()); + } else { + scan_result = mark_scan; + } + + CHECK_CUDA(cudaMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), cudaMemcpyDeviceToHost, stream)); + + CHECK_CUDA(cudaMalloc(data_ptr, *dlen_ptr * sizeof(T))); + + maskedSelectGetDataKernel<<>>( + input, mask, scan_result, (T *)*data_ptr, total_elements, + shape_cuda, input_strides_cuda, mask_strides_cuda, ndim); + CHECK_CUDA(cudaDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen_ptr, + void *stream_) const { + + cudaStream_t stream = (cudaStream_t)stream_; +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} diff --git a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh 
b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh new file mode 100644 index 000000000..bc493edbe --- /dev/null +++ b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MASKED_SELECT_NVIDIA_H__ +#define __MASKED_SELECT_NVIDIA_H__ + +#include "../masked_select.h" + +DESCRIPTOR(nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/operator.cc b/src/infiniop/ops/masked_select/operator.cc new file mode 100644 index 000000000..055b67b12 --- /dev/null +++ b/src/infiniop/ops/masked_select/operator.cc @@ -0,0 +1,138 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/masked_select.h" + +#ifdef ENABLE_CPU_API +#include "cpu/masked_select_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/masked_select_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/masked_select_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/masked_select_moore.h" +#endif + +__C infiniStatus_t infiniopCreateMaskedSelectDescriptor( + infiniopHandle_t handle, + infiniopMaskedSelectDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::masked_select::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr),\ + input_desc, \ + mask_desc); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t 
infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopMaskedSelect( + infiniopMaskedSelectDescriptor_t desc, + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_size, + size_t *dlen, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, input, mask, data_size, dlen, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef 
ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + DESTROY(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.cc b/src/infiniop/ops/tan/cpu/tan_cpu.cc new file mode 100644 index 000000000..90939f9cb --- /dev/null +++ b/src/infiniop/ops/tan/cpu/tan_cpu.cc @@ -0,0 +1,50 @@ +#include "tan_cpu.h" + +namespace op::tan::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::cpu diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.h b/src/infiniop/ops/tan/cpu/tan_cpu.h new file mode 100644 index 000000000..04004dd19 --- /dev/null +++ b/src/infiniop/ops/tan/cpu/tan_cpu.h @@ -0,0 +1,23 @@ +#ifndef __TAN_CPU_H__ +#define __TAN_CPU_H__ + +#include 
"../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(tan, cpu) + +#include + +namespace op::tan::cpu { +typedef struct TanOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::tan(x); + } +} TanOp; + +} // namespace op::tan::cpu + +#endif // __TAN_CPU_H__ diff --git a/src/infiniop/ops/tan/cuda/kernel.cuh b/src/infiniop/ops/tan/cuda/kernel.cuh new file mode 100644 index 000000000..85c9eca27 --- /dev/null +++ b/src/infiniop/ops/tan/cuda/kernel.cuh @@ -0,0 +1,30 @@ +#ifndef __TAN_CUDA_H__ +#define __TAN_CUDA_H__ + +namespace op::tan::cuda { + +typedef struct TanOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // BF16 + const float x_f = __bfloat162float(x); + return __float2bfloat16(__tanf(x_f)); + } else if constexpr (std::is_same_v) { + // FP16 + const float x_f = __half2float(x); + return __float2half(__tanf(x_f)); + } else if constexpr (std::is_same_v) { + // FP32 + return __tanf(x); + } else { + return __tanf(x); + } + } +} TanOp; + +} // namespace op::tan::cuda + +#endif // __TAN_CUDA_H__ diff --git a/src/infiniop/ops/tan/metax/tan_metax.h b/src/infiniop/ops/tan/metax/tan_metax.h new file mode 100644 index 000000000..507fd9d5e --- /dev/null +++ b/src/infiniop/ops/tan/metax/tan_metax.h @@ -0,0 +1,8 @@ +#ifndef __TAN_METAX_API_H__ +#define __TAN_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(tan, metax) + +#endif // __TAN_METAX_API_H__ diff --git a/src/infiniop/ops/tan/metax/tan_metax.maca b/src/infiniop/ops/tan/metax/tan_metax.maca new file mode 100644 index 000000000..85768bec3 --- /dev/null +++ b/src/infiniop/ops/tan/metax/tan_metax.maca @@ -0,0 +1,56 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "tan_metax.h" + +namespace op::tan::metax { + 
+Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanOp, hpcc_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::metax diff --git a/src/infiniop/ops/tan/moore/tan_moore.h b/src/infiniop/ops/tan/moore/tan_moore.h new file mode 100644 index 000000000..c261e4a69 --- /dev/null +++ b/src/infiniop/ops/tan/moore/tan_moore.h @@ -0,0 +1,8 @@ +#ifndef __TAN_MOORE_API_H__ +#define __TAN_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(tan, moore) + +#endif // __TAN_MOORE_API_H__ diff --git a/src/infiniop/ops/tan/moore/tan_moore.mu b/src/infiniop/ops/tan/moore/tan_moore.mu new file mode 100644 index 000000000..4caec16ef 
--- /dev/null +++ b/src/infiniop/ops/tan/moore/tan_moore.mu @@ -0,0 +1,56 @@ +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "../cuda/kernel.cuh" +#include "tan_moore.h" + +namespace op::tan::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanOp, mt_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::moore diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cu b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu new file mode 100644 index 000000000..d5415cb62 --- /dev/null +++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu @@ -0,0 +1,57 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" 
+#include "tan_nvidia.cuh" + +namespace op::tan::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::nvidia diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh b/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh new file mode 100644 index 000000000..1915cc2b1 --- /dev/null +++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TAN_CUDA_API_H__ +#define __TAN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tan, nvidia) + +#endif // __TAN_CUDA_API_H__ diff --git 
a/src/infiniop/ops/tan/operator.cc b/src/infiniop/ops/tan/operator.cc new file mode 100644 index 000000000..1f222a8d1 --- /dev/null +++ b/src/infiniop/ops/tan/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tan.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tan_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/tan_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/tan_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/tan_moore.h" +#endif + +__C infiniStatus_t infiniopCreateTanDescriptor( + infiniopHandle_t handle, + infiniopTanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tan::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + 
GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTan( + infiniopTanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.cc b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.cc new file mode 100644 index 000000000..1abca6a33 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.cc @@ -0,0 +1,50 @@ +#include "tanhshrink_cpu.h" + +namespace op::tanhshrink::cpu { + 
+Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::cpu diff --git a/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.h b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.h new file mode 100644 index 000000000..8c5f722ad --- /dev/null +++ b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.h @@ -0,0 +1,23 @@ +#ifndef __TANHSHRINK_CPU_H__ +#define __TANHSHRINK_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, cpu) + +#include + +namespace op::tanhshrink::cpu { +typedef struct TanhshrinkOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return x - tanh(x); + } +} TanhshrinkOp; + +} // namespace op::tanhshrink::cpu + +#endif // __TANHSHRINK_CPU_H__ 
diff --git a/src/infiniop/ops/tanhshrink/cuda/kernel.cuh b/src/infiniop/ops/tanhshrink/cuda/kernel.cuh new file mode 100644 index 000000000..dfb88731e --- /dev/null +++ b/src/infiniop/ops/tanhshrink/cuda/kernel.cuh @@ -0,0 +1,30 @@ +#ifndef __TANHSHRINK_CUDA_H__ +#define __TANHSHRINK_CUDA_H__ + +namespace op::tanhshrink::cuda { + +typedef struct TanhshrinkOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // BF16 + const float x_f = __bfloat162float(x); + return __float2bfloat16(x_f - tanhf(x_f)); + } else if constexpr (std::is_same_v) { + // FP16 + const float x_f = __half2float(x); + return __float2half(x_f - tanhf(x_f)); + } else if constexpr (std::is_same_v) { + // FP32 + return x - tanhf(x); + } else { + return x - tanhf(x); + } + } +} TanhshrinkOp; + +} // namespace op::tanhshrink::cuda + +#endif // __TANHSHRINK_CUDA_H__ diff --git a/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.h b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.h new file mode 100644 index 000000000..b79edcbf9 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.h @@ -0,0 +1,8 @@ +#ifndef __TANHSHRINK_METAX_API_H__ +#define __TANHSHRINK_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, metax) + +#endif // __TANHSHRINK_METAX_API_H__ diff --git a/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca new file mode 100644 index 000000000..c63390ab7 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca @@ -0,0 +1,56 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "tanhshrink_metax.h" + +namespace op::tanhshrink::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + 
infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhshrinkOp, hpcc_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhshrinkOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhshrinkOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::metax diff --git a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.h b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.h new file mode 100644 index 000000000..5a01a76ce --- /dev/null +++ b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.h @@ -0,0 +1,8 @@ +#ifndef __TANHSHRINK_MOORE_API_H__ +#define __TANHSHRINK_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, moore) + +#endif // __TANHSHRINK_MOORE_API_H__ diff --git a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu new file mode 100644 index 000000000..4c4bdee8f --- 
/dev/null +++ b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu @@ -0,0 +1,56 @@ +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "../cuda/kernel.cuh" +#include "tanhshrink_moore.h" + +namespace op::tanhshrink::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhshrinkOp, mt_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhshrinkOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhshrinkOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::moore diff --git a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h new file mode 100644 index 000000000..9c75b916e --- /dev/null +++ b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h @@ -0,0 +1,28 
@@ +#ifndef __TANHSHRINK_MOORE_KERNEL_H__ +#define __TANHSHRINK_MOORE_KERNEL_H__ + +namespace op::tanhshrink::moore { + +typedef struct TanhshrinkOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // BF16 + const float x_f = __bfloat162float(x); + return __float2bfloat16(x_f - tanhf(x_f)); + } else if constexpr (std::is_same_v) { + // FP16 + const float x_f = __half2float(x); + return __float2half(x_f - tanhf(x_f)); + } else if constexpr (std::is_same_v) { + // FP32 + return x - tanhf(x); + } + } +} TanhshrinkOp; + +} // namespace op::tanhshrink::moore + +#endif // __TANHSHRINK_MOORE_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cu b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cu new file mode 100644 index 000000000..baff908a2 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "tanhshrink_nvidia.cuh" + +namespace op::tanhshrink::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + 
void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhshrinkOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhshrinkOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhshrinkOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhshrinkOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::nvidia diff --git a/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cuh b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cuh new file mode 100644 index 000000000..abe56f444 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TANHSHRINK_CUDA_API_H__ +#define __TANHSHRINK_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, nvidia) + +#endif // __TANHSHRINK_CUDA_API_H__ diff --git a/src/infiniop/ops/tanhshrink/operator.cc b/src/infiniop/ops/tanhshrink/operator.cc new file mode 100644 index 000000000..66e62b850 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tanhshrink.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tanhshrink_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/tanhshrink_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/tanhshrink_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/tanhshrink_moore.h" +#endif + +__C infiniStatus_t 
infiniopCreateTanhshrinkDescriptor( + infiniopHandle_t handle, + infiniopTanhshrinkDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tanhshrink::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTanhshrink( + infiniopTanhshrinkDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef 
ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanhshrinkDescriptor(infiniopTanhshrinkDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infinicore/ops/cat.py b/test/infinicore/ops/cat.py index 669fbfd1b..e9b85f2e5 100644 --- a/test/infinicore/ops/cat.py +++ b/test/infinicore/ops/cat.py @@ -126,9 +126,9 @@ def torch_operator(self, *args, **kwargs): """PyTorch cat implementation""" return torch.cat(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore cat implementation""" - # return infinicore.cat(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore cat implementation""" + return infinicore.cat(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/inner.py b/test/infinicore/ops/inner.py index 3f75a6b64..166fabe33 100644 --- a/test/infinicore/ops/inner.py +++ b/test/infinicore/ops/inner.py @@ -62,9 +62,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return 
torch.inner(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.inner(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.inner(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/masked_select.py b/test/infinicore/ops/masked_select.py index 711a0cef0..27c28842b 100644 --- a/test/infinicore/ops/masked_select.py +++ b/test/infinicore/ops/masked_select.py @@ -53,9 +53,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.masked_select(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.masked_select(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.masked_select(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/tan.py b/test/infinicore/ops/tan.py index c9948c3c6..4c786d114 100644 --- a/test/infinicore/ops/tan.py +++ b/test/infinicore/ops/tan.py @@ -97,9 +97,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.tan(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.tan(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.tan(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/tanhshrink.py b/test/infinicore/ops/tanhshrink.py index ca3559f22..f85e36446 100644 --- a/test/infinicore/ops/tanhshrink.py +++ b/test/infinicore/ops/tanhshrink.py @@ -68,9 +68,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.tanhshrink(*args, 
**kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.tanhshrink(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.nn.functional.tanhshrink(*args, **kwargs) def main():