diff --git a/include/infinicore/ops/cat.hpp b/include/infinicore/ops/cat.hpp new file mode 100644 index 000000000..5fdfdcce8 --- /dev/null +++ b/include/infinicore/ops/cat.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "common/op.hpp" + +namespace infinicore::op { + +Tensor cat(std::vector<Tensor> tensors, int dim); +void cat_(Tensor out, std::vector<Tensor> tensors, int dim); +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/inner.hpp b/include/infinicore/ops/inner.hpp new file mode 100644 index 000000000..017ddc604 --- /dev/null +++ b/include/infinicore/ops/inner.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Inner { +public: + using schema = void (*)(Tensor, Tensor, Tensor); + static void execute(Tensor out, Tensor input, Tensor other); + static common::OpDispatcher &dispatcher(); +}; + +Tensor inner(Tensor input, Tensor other); +void inner_(Tensor out, Tensor input, Tensor other); + +} \ No newline at end of file diff --git a/include/infinicore/ops/masked_select.hpp b/include/infinicore/ops/masked_select.hpp new file mode 100644 index 000000000..89bbfa49d --- /dev/null +++ b/include/infinicore/ops/masked_select.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class MaskedSelect { +public: + using schema = void (*)(Tensor, Tensor, void **, size_t *); + static void execute(Tensor input, Tensor mask, void **data_ptr, size_t *dlen_ptr); + static common::OpDispatcher &dispatcher(); +}; + +Tensor masked_select(Tensor input, Tensor mask); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/tan.hpp b/include/infinicore/ops/tan.hpp new file mode 100644 index 000000000..9c2d7a860 --- /dev/null +++ b/include/infinicore/ops/tan.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class 
Tan { +public: + using schema = void (*)(Tensor, Tensor); + static void execute(Tensor output, Tensor input); + static common::OpDispatcher &dispatcher(); +}; + +Tensor tan(Tensor input); +void tan_(Tensor output, Tensor input); + +} \ No newline at end of file diff --git a/include/infinicore/ops/tanhshrink.hpp b/include/infinicore/ops/tanhshrink.hpp new file mode 100644 index 000000000..83673b22c --- /dev/null +++ b/include/infinicore/ops/tanhshrink.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Tanhshrink { +public: + using schema = void (*)(Tensor, Tensor); + static void execute(Tensor output, Tensor input); + static common::OpDispatcher &dispatcher(); +}; + +Tensor tanhshrink(Tensor input); +void tanhshrink_(Tensor output, Tensor input); +} // namespace infinicore::op diff --git a/include/infiniop.h b/include/infiniop.h index 92e6f5963..594795429 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -10,9 +10,11 @@ #include "infiniop/ops/dequantize_awq.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" +#include "infiniop/ops/inner.h" #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" +#include "infiniop/ops/masked_select.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/random_sample.h" @@ -26,7 +28,9 @@ #include "infiniop/ops/softplus.h" #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" +#include "infiniop/ops/tan.h" #include "infiniop/ops/tanh.h" +#include "infiniop/ops/tanhshrink.h" #include "infiniop/ops/topkrouter.h" #include "infiniop/ops/topksoftmax.h" #include "infiniop/ops/zeros.h" diff --git a/include/infiniop/ops/inner.h b/include/infiniop/ops/inner.h new file mode 100644 index 000000000..f484cde9c --- /dev/null +++ b/include/infiniop/ops/inner.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_INNER_API_H__ +#define __INFINIOP_INNER_API_H__ + +#include 
"../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopInnerDescriptor_t; + +__C __export infiniStatus_t infiniopCreateInnerDescriptor( + infiniopHandle_t handle, + infiniopInnerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc); + +__C __export infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopInner( + infiniopInnerDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream); + +__C __export infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/masked_select.h b/include/infiniop/ops/masked_select.h new file mode 100644 index 000000000..7adf65415 --- /dev/null +++ b/include/infiniop/ops/masked_select.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_MASKED_SELECT_API_H__ +#define __INFINIOP_MASKED_SELECT_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMaskedSelectDescriptor_t; + +__C __export infiniStatus_t infiniopCreateMaskedSelectDescriptor( + infiniopHandle_t handle, + infiniopMaskedSelectDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc); + +__C __export infiniStatus_t infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMaskedSelect( + infiniopMaskedSelectDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen, + void *stream); + +__C __export infiniStatus_t infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/tan.h b/include/infiniop/ops/tan.h new file mode 100644 
index 000000000..2e20c61da --- /dev/null +++ b/include/infiniop/ops/tan.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_TAN_API_H__ +#define __INFINIOP_TAN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanDescriptor( + infiniopHandle_t handle, + infiniopTanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc); + +__C __export infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTan( + infiniopTanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/tanhshrink.h b/include/infiniop/ops/tanhshrink.h new file mode 100644 index 000000000..e136f6523 --- /dev/null +++ b/include/infiniop/ops/tanhshrink.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_TANHSHRINK_API_H__ +#define __INFINIOP_TANHSHRINK_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanhshrinkDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanhshrinkDescriptor(infiniopHandle_t handle, + infiniopTanhshrinkDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTanhshrink(infiniopTanhshrinkDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanhshrinkDescriptor(infiniopTanhshrinkDescriptor_t desc); + +#endif diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index 5c541ec3c..6d1deafe0 100644 --- 
a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -41,10 +41,14 @@ ) from infinicore.ops.add import add from infinicore.ops.attention import attention +from infinicore.ops.cat import cat +from infinicore.ops.inner import inner +from infinicore.ops.masked_select import masked_select from infinicore.ops.matmul import matmul from infinicore.ops.mul import mul from infinicore.ops.narrow import narrow from infinicore.ops.rearrange import rearrange +from infinicore.ops.tan import tan from infinicore.tensor import ( Tensor, empty, @@ -101,10 +105,14 @@ # Operations. "add", "attention", + "cat", + "inner", + "masked_select", "matmul", "mul", "narrow", "rearrange", + "tan", "empty", "empty_like", "from_blob", diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index 255079790..4221bdce5 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -6,6 +6,7 @@ from .rope import RopeAlgo, rope from .silu import silu from .swiglu import swiglu +from .tanhshrink import tanhshrink __all__ = [ "causal_softmax", @@ -17,4 +18,5 @@ "embedding", "rope", "RopeAlgo", + "tanhshrink" ] diff --git a/python/infinicore/nn/functional/tanhshrink.py b/python/infinicore/nn/functional/tanhshrink.py new file mode 100644 index 000000000..10d17077d --- /dev/null +++ b/python/infinicore/nn/functional/tanhshrink.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def tanhshrink(input, *, out=None): + if out is None: + return Tensor(_infinicore.tanhshrink(input._underlying)) + + _infinicore.tanhshrink_(out._underlying, input._underlying) + + return out diff --git a/python/infinicore/ops/cat.py b/python/infinicore/ops/cat.py new file mode 100644 index 000000000..3a3e1470f --- /dev/null +++ b/python/infinicore/ops/cat.py @@ -0,0 +1,12 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def 
cat(tensors, dim: int = 0, *, out=None): + if out is None: + return Tensor(_infinicore.cat([tensor._underlying for tensor in tensors], dim)) + + _infinicore.cat_(out._underlying, [tensor._underlying for tensor in tensors], dim) + + + return out diff --git a/python/infinicore/ops/inner.py b/python/infinicore/ops/inner.py new file mode 100644 index 000000000..df87c5b34 --- /dev/null +++ b/python/infinicore/ops/inner.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def inner(input, other, *, out=None): + if out is None: + return Tensor(_infinicore.inner(input._underlying, other._underlying)) + + _infinicore.inner_(out._underlying, input._underlying, other._underlying) + + return out diff --git a/python/infinicore/ops/masked_select.py b/python/infinicore/ops/masked_select.py new file mode 100644 index 000000000..d1846edfc --- /dev/null +++ b/python/infinicore/ops/masked_select.py @@ -0,0 +1,5 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def masked_select(input, mask): + return Tensor(_infinicore.masked_select(input._underlying, mask._underlying)) \ No newline at end of file diff --git a/python/infinicore/ops/tan.py b/python/infinicore/ops/tan.py new file mode 100644 index 000000000..da438e6d8 --- /dev/null +++ b/python/infinicore/ops/tan.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def tan(input, *, out=None): + if out is None: + return Tensor(_infinicore.tan(input._underlying)) + + _infinicore.tan_(out._underlying, input._underlying) + + return out diff --git a/src/infinicore/ops/cat/cat.cc b/src/infinicore/ops/cat/cat.cc new file mode 100644 index 000000000..908f5f457 --- /dev/null +++ b/src/infinicore/ops/cat/cat.cc @@ -0,0 +1,165 @@ +#include "infinicore/ops/cat.hpp" +#include "infinicore/context/context.hpp" +#include <cassert> +#include <cstring> +namespace infinicore::op { + +namespace { + +class 
CatInfo { + + CatInfo(int dim_, int ndim_, size_t tensors_size_, std::vector<int> contiguous_dim_, std::vector<size_t> copy_size_) : dim(dim_), ndim(ndim_), tensors_size(tensors_size_), contiguous_dim(std::move(contiguous_dim_)), copy_size(std::move(copy_size_)) {} + +public: + + int dim; + int ndim; + size_t tensors_size; + + std::vector<int> contiguous_dim; + std::vector<size_t> copy_size; + + static CatInfo create(Tensor& out, std::vector<Tensor>& tensors, int dim) { + + int ndim = out->ndim(); + size_t tensors_size = tensors.size(); + + std::vector<int> contiguous_dim(tensors_size, ndim); + std::vector<size_t> copy_size(tensors_size, dsize(out->dtype())); + + for (int i = 0; i < tensors_size; i ++) { + if (tensors[i]->ndim() == 1) continue; + if (tensors[i]->stride(ndim - 1) == 1) { + contiguous_dim[i] = ndim - 1; + for (int j = ndim - 2; j >= dim; j --) { + if (tensors[i]->stride(j) == tensors[i]->stride(j + 1) * tensors[i]->shape()[j + 1]) { + contiguous_dim[i] = j; + } + } + } + for (int j = contiguous_dim[i]; j < ndim; j ++) { + copy_size[i] *= tensors[i]->shape()[j]; + } + } + + return CatInfo{dim, ndim, tensors_size, contiguous_dim, copy_size}; + } +}; + + +void low_dim_copy( + CatInfo &info, Tensor& tensor, Tensor& out, + std::byte *tensor_ptr, std::byte *out_ptr, + int depth, int tensor_pos) { + + if (depth != info.contiguous_dim[tensor_pos]) { + std::byte* now_tensor_ptr = tensor_ptr; + std::byte* now_out_ptr = out_ptr; + + for (int i = 0; i < tensor->shape()[depth]; i ++) { + + low_dim_copy(info, tensor, out, now_tensor_ptr, now_out_ptr, depth + 1, tensor_pos); + + now_tensor_ptr += tensor->stride(depth) * dsize(tensor->dtype()); + now_out_ptr += out->stride(depth) * dsize(out->dtype()); + } + } else { + if (out->device().getType() == Device::Type::CPU) { + + std::memcpy(out_ptr, tensor_ptr, info.copy_size[tensor_pos]); + } else { + + context::memcpyD2D(out_ptr, tensor_ptr, info.copy_size[tensor_pos]); + } + } +} + +void high_dim_split( + CatInfo &info, std::vector<Tensor>& tensors, Tensor& out, + std::vector<std::byte*> tensors_ptr, std::byte* out_ptr, + int depth) { + + if (depth != info.dim) { + std::vector<std::byte*> now_tensors_ptr = tensors_ptr; + std::byte* now_out_ptr = out_ptr; + + for (int i = 0; i < out->shape()[depth]; i ++) { + 
+ high_dim_split(info, tensors, out, now_tensors_ptr, now_out_ptr, depth + 1); + + for (int i = 0; i < info.tensors_size; i ++) { + if (tensors[i]->ndim() == 1) continue; + now_tensors_ptr[i] += tensors[i]->stride(depth) * dsize(tensors[i]->dtype()); + } + now_out_ptr += out->stride(depth) * dsize(out->dtype()); + } + } else { + std::byte* now_out_ptr = out_ptr; + + for (int i = 0; i < info.tensors_size; i ++) { + if (tensors[i]->ndim() == 1) continue; + + low_dim_copy(info, tensors[i], out, tensors_ptr[i], now_out_ptr, depth, i); + + now_out_ptr += tensors[i]->shape()[depth] * out->stride(depth) * dsize(out->dtype()); + } + } +} + +} // namespace + +Tensor cat(std::vector<Tensor> tensors, int dim) { + assert(tensors.size() >= 2); + int ndim = tensors[0]->ndim(); + assert(-ndim <= dim && dim < ndim); + dim = (dim + ndim) % ndim; + + Shape shape = tensors[0]->shape(); + for (int i = 1; i < tensors.size(); i ++) { + assert(tensors[i]->ndim() == ndim || tensors[i]->ndim() == 1); + if (tensors[i]->ndim() != ndim) continue; + shape[dim] += tensors[i]->shape()[dim]; + } + + auto out = Tensor::empty(shape, tensors[0]->dtype(), tensors[0]->device()); + cat_(out, tensors, dim); + return out; +} + +void cat_(Tensor out, std::vector<Tensor> tensors, int dim) { + // assert the parameter properties are correct. 
+ assert(tensors.size() >= 2); + int ndim = out->ndim(); + assert(-ndim <= dim && dim < ndim); + dim = (dim + ndim) % ndim; + + size_t dim_shape = 0; + for (auto& tensor : tensors) { + assert(tensor->ndim() == ndim || tensor->ndim() == 1); + if (tensor->ndim() == 1) { + assert(tensor->shape()[0] == 0); + continue; + } + for (int i = 0; i < ndim; i ++) { + if (i != dim) { + assert(tensor->shape()[i] == out->shape()[i]); + } else { + dim_shape += tensor->shape()[i]; + } + } + } + assert(dim_shape == out->shape()[dim]); + + // Get info + CatInfo info = CatInfo::create(out, tensors, dim); + std::vector<std::byte*> tensors_ptr(tensors.size()); + for (int i = 0; i < tensors.size(); i ++) { + tensors_ptr[i] = tensors[i]->data(); + } + std::byte* out_ptr = out->data(); + + high_dim_split(info, tensors, out, tensors_ptr, out_ptr, 0); + +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/inner/inner.cc b/src/infinicore/ops/inner/inner.cc new file mode 100644 index 000000000..271ad101c --- /dev/null +++ b/src/infinicore/ops/inner/inner.cc @@ -0,0 +1,46 @@ +#include "infinicore/ops/inner.hpp" +#include "infinicore/context/context.hpp" + +namespace infinicore::op { + + +common::OpDispatcher &Inner::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Inner::execute(Tensor out, Tensor input, Tensor other) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Inner implementation found for device type: " + std::to_string(static_cast<int>(device_type))); + } + + func(out, input, other); +} + +Tensor inner(Tensor input, Tensor other) { + size_t input_ndim = input->ndim(); + size_t other_ndim = other->ndim(); + + assert(input->shape()[input_ndim - 1] == other->shape()[other_ndim - 1]); + + Shape out_shape; + for (int i = 0; i < input_ndim - 1; i ++) + out_shape.push_back(input->shape()[i]); + for (int i = 0; i < other_ndim - 1; i ++) + 
out_shape.push_back(other->shape()[i]); + auto out = Tensor::zeros(out_shape, input->dtype(), input->device()); + + inner_(out, input, other); + return out; +} + +void inner_(Tensor out, Tensor input, Tensor other) { + + Inner::execute(out, input, other); + +} + +} \ No newline at end of file diff --git a/src/infinicore/ops/inner/inner_infiniop.cc b/src/infinicore/ops/inner/inner_infiniop.cc new file mode 100644 index 000000000..fe6b8de9a --- /dev/null +++ b/src/infinicore/ops/inner/inner_infiniop.cc @@ -0,0 +1,59 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/inner.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::inner_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopInnerDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyInnerDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor out, Tensor input, Tensor other) { + size_t seed = hash_combine(out, input, other); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopInnerDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateInnerDescriptor( + context::getInfiniopHandle(input->device()), &desc, + out->desc(), input->desc(), other->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetInnerWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + + INFINICORE_CHECK_ERROR(infiniopInner( + desc, workspace->data(), workspace_size, + out->data(), input->data(), other->data(), context::getStream())); +} + +static bool registered = []() { + Inner::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA, + 
Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR + }, &calculate, false); + return true; +}(); + +} \ No newline at end of file diff --git a/src/infinicore/ops/masked_select/masked_select.cc b/src/infinicore/ops/masked_select/masked_select.cc new file mode 100644 index 000000000..13ac6dfe4 --- /dev/null +++ b/src/infinicore/ops/masked_select/masked_select.cc @@ -0,0 +1,33 @@ +#include +#include + +namespace infinicore::op { + +common::OpDispatcher &MaskedSelect::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void MaskedSelect::execute(Tensor input, Tensor mask, void **data_ptr, size_t *dlen_ptr) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No MaskedSelect implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(input, mask, data_ptr, dlen_ptr); +} + +Tensor masked_select(Tensor input, Tensor mask) { + + std::byte *data; + size_t dlen; + MaskedSelect::execute(input, mask, (void **)&data, &dlen); + + auto out = Tensor::from_blob(data, {dlen}, input->dtype(), input->device()); + + return out; +} + +} diff --git a/src/infinicore/ops/masked_select/masked_select_infiniop.cc b/src/infinicore/ops/masked_select/masked_select_infiniop.cc new file mode 100644 index 000000000..0144ce425 --- /dev/null +++ b/src/infinicore/ops/masked_select/masked_select_infiniop.cc @@ -0,0 +1,59 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/masked_select.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::masked_select_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopMaskedSelectDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyMaskedSelectDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor input, Tensor mask, void 
**data_ptr, size_t *dlen_ptr) { + size_t seed = hash_combine(input, mask, (std::uintptr_t)data_ptr, (std::uintptr_t)dlen_ptr); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopMaskedSelectDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateMaskedSelectDescriptor( + context::getInfiniopHandle(input->device()), &desc, + input->desc(), mask->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetMaskedSelectWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + + INFINICORE_CHECK_ERROR(infiniopMaskedSelect( + desc, workspace->data(), workspace_size, + input->data(), (const bool *)mask->data(), data_ptr, dlen_ptr, context::getStream())); +} + +static bool registered = []() { + MaskedSelect::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR + }, &calculate, false); + return true; +}(); + +} diff --git a/src/infinicore/ops/tan/tan.cc b/src/infinicore/ops/tan/tan.cc new file mode 100644 index 000000000..20bcab4b5 --- /dev/null +++ b/src/infinicore/ops/tan/tan.cc @@ -0,0 +1,32 @@ +#include "infinicore/ops/tan.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Tan::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Tan::execute(Tensor output, Tensor input) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Tan implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor tan(Tensor input) { + Shape shape = 
input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + tan_(output, input); + return output; +} + +void tan_(Tensor output, Tensor input) { + Tan::execute(output, input); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/tan/tan_infiniop.cc b/src/infinicore/ops/tan/tan_infiniop.cc new file mode 100644 index 000000000..1d7c9c874 --- /dev/null +++ b/src/infinicore/ops/tan/tan_infiniop.cc @@ -0,0 +1,58 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/tan.hpp" +#include + +namespace infinicore::op::tan_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopTanDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyTanDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopTanDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateTanDescriptor( + context::getInfiniopHandle(output->device()), &desc, + output->desc(), input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetTanWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopTan( + desc, workspace->data(), workspace_size, + output->data(), input->data(), context::getStream())); +} + +static bool registered = []() { + Tan::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR + }, &calculate, false); + return true; 
+}(); + +} // namespace infinicore::op::tan_impl::infiniop diff --git a/src/infinicore/ops/tanhshrink/tanhshrink.cc b/src/infinicore/ops/tanhshrink/tanhshrink.cc new file mode 100644 index 000000000..ea96e46f4 --- /dev/null +++ b/src/infinicore/ops/tanhshrink/tanhshrink.cc @@ -0,0 +1,32 @@ +#include "infinicore/ops/tanhshrink.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Tanhshrink::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Tanhshrink::execute(Tensor output, Tensor input) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Tanhshrink implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor tanhshrink(Tensor input) { + Shape shape = input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + tanhshrink_(output, input); + return output; +} + +void tanhshrink_(Tensor output, Tensor input) { + Tanhshrink::execute(output, input); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc b/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc new file mode 100644 index 000000000..ca440f875 --- /dev/null +++ b/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc @@ -0,0 +1,58 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/tanhshrink.hpp" +#include + +namespace infinicore::op::tanhshrink_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopTanhshrinkDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyTanhshrinkDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device_type = 
context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopTanhshrinkDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateTanhshrinkDescriptor( + context::getInfiniopHandle(output->device()), &desc, + output->desc(), input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetTanhshrinkWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopTanhshrink( + desc, workspace->data(), workspace_size, + output->data(), input->data(), context::getStream())); +} + +static bool registered = []() { + Tanhshrink::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR + }, &calculate, false); + return true; +}(); + +} // namespace infinicore::op::tanhshrink_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 978defa17..266eb6919 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -4,9 +4,12 @@ #include "ops/add.hpp" #include "ops/attention.hpp" +#include "ops/cat.hpp" #include "ops/causal_softmax.hpp" #include "ops/embedding.hpp" +#include "ops/inner.hpp" #include "ops/linear.hpp" +#include "ops/masked_select.hpp" #include "ops/matmul.hpp" #include "ops/mul.hpp" #include "ops/random_sample.hpp" @@ -15,6 +18,8 @@ #include "ops/rope.hpp" #include "ops/silu.hpp" #include "ops/swiglu.hpp" +#include "ops/tan.hpp" +#include "ops/tanhshrink.hpp" namespace py = pybind11; @@ -23,15 +28,20 @@ namespace infinicore::ops { inline void bind(py::module &m) { bind_add(m); bind_attention(m); + bind_cat(m); bind_causal_softmax(m); + bind_inner(m); bind_random_sample(m); bind_linear(m); 
+ bind_masked_select(m); bind_matmul(m); bind_mul(m); bind_rearrange(m); bind_rms_norm(m); bind_silu(m); bind_swiglu(m); + bind_tan(m); + bind_tanhshrink(m); bind_rope(m); bind_embedding(m); } diff --git a/src/infinicore/pybind11/ops/cat.hpp b/src/infinicore/pybind11/ops/cat.hpp new file mode 100644 index 000000000..2ef3392b0 --- /dev/null +++ b/src/infinicore/pybind11/ops/cat.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include "infinicore/ops/cat.hpp" +#include + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_cat(py::module &m) { + + m.def("cat", + &op::cat, + py::arg("tensors"), + py::arg("dim") = 0, + R"doc(opertor: torch.cat, out-of-place mode)doc"); + + m.def("cat_", + &op::cat_, + py::arg("out"), + py::arg("tensors"), + py::arg("dim") = 0, + R"doc(opertor: torch.cat, in-place mode)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/inner.hpp b/src/infinicore/pybind11/ops/inner.hpp new file mode 100644 index 000000000..16ee91932 --- /dev/null +++ b/src/infinicore/pybind11/ops/inner.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include "infinicore/ops/inner.hpp" +#include + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_inner(py::module &m) { + + m.def("inner", + &op::inner, + py::arg("input"), + py::arg("other"), + R"doc(opertor: torch.inner, out-of-place mode)doc"); + + m.def("inner_", + &op::inner_, + py::arg("out"), + py::arg("input"), + py::arg("other"), + R"doc(opertor: torch.inner, in-place mode)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/masked_select.hpp b/src/infinicore/pybind11/ops/masked_select.hpp new file mode 100644 index 000000000..2734590e6 --- /dev/null +++ b/src/infinicore/pybind11/ops/masked_select.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include "infinicore/ops/masked_select.hpp" +#include + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void 
bind_masked_select(py::module &m) { + + m.def("masked_select", + &op::masked_select, + py::arg("input"), + py::arg("mask"), + R"doc(opertor: torch.masked_select, out-of-place mode)doc"); + +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/tan.hpp b/src/infinicore/pybind11/ops/tan.hpp new file mode 100644 index 000000000..acd62e048 --- /dev/null +++ b/src/infinicore/pybind11/ops/tan.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/tan.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_tan(py::module &m) { + m.def("tan", + &op::tan, + py::arg("input"), + R"doc(opertor: torch.tan, out-of-place mode)doc"); + + m.def("tan_", + &op::tan_, + py::arg("output"), + py::arg("input"), + R"doc(opertor: torch.tan, in-place mode)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/tanhshrink.hpp b/src/infinicore/pybind11/ops/tanhshrink.hpp new file mode 100644 index 000000000..3780555f4 --- /dev/null +++ b/src/infinicore/pybind11/ops/tanhshrink.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/tanhshrink.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_tanhshrink(py::module &m) { + m.def("tanhshrink", + &op::tanhshrink, + py::arg("input"), + R"doc(opertor: torch.tanhshrink, out-of-place mode)doc"); + + m.def("tanhshrink_", + &op::tanhshrink_, + py::arg("output"), + py::arg("input"), + R"doc(opertor: torch.tanhshrink, in-place mode)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infiniop/ops/inner/cpu/inner_cpu.cc b/src/infiniop/ops/inner/cpu/inner_cpu.cc new file mode 100644 index 000000000..5915cb3fc --- /dev/null +++ b/src/infiniop/ops/inner/cpu/inner_cpu.cc @@ -0,0 +1,88 @@ +#include "inner_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::inner::cpu { + +Descriptor::~Descriptor() {} + +infiniStatus_t Descriptor::create( + infiniopHandle_t 
handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto result = InnerInfo::create(out_desc, input_desc, other_desc); + CHECK_RESULT(result); + *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t inner(const InnerInfo *info, const T *input, const T *other, T *out) { +#pragma omp parallel for + for (size_t out_index = 0; out_index < info->total_elements; out_index ++) { + + size_t out_dim_pos = info->out_ndim - 1; + size_t out_offset = op::common_cpu::indexToOffset(out_index, info->out_ndim, info->out_shape.data(), info->out_strides.data()); + size_t index = out_index; + + ptrdiff_t input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)info->other_ndim - 2; i >= 0; i --) { + other_offset += (index % info->out_shape[out_dim_pos]) * info->other_strides[i]; + index /= info->out_shape[out_dim_pos --]; + } + for (int i = (int)info->input_ndim - 2; i >= 0; i --) { + input_offset += (index % info->out_shape[out_dim_pos]) * info->input_strides[i]; + index /= info->out_shape[out_dim_pos --]; + } + + float tmp = 0.; + for (size_t i = 0; i < info->oper_len; i ++) { + if constexpr (std::is_same::value || std::is_same::value) { + tmp += utils::cast(input[input_offset]) * utils::cast(other[other_offset]); + } else { + tmp += input[input_offset] * other[other_offset]; + } + input_offset += info->input_strides[info->input_ndim - 1]; + other_offset += info->other_strides[info->other_ndim - 1]; + } + + if constexpr (std::is_same::value || std::is_same::value) { + out[out_offset] = utils::cast(tmp); + } else { + out[out_offset] = tmp; + } + } + + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void* workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream) const { + + 
if (_info.dtype == INFINI_DTYPE_BF16) { + CHECK_STATUS(inner(&_info, (const bf16_t *)input, (const bf16_t *)other, (bf16_t *)out)); + } else if (_info.dtype == INFINI_DTYPE_F16) { + CHECK_STATUS(inner(&_info, (const fp16_t *)input, (const fp16_t *)other, (fp16_t *)out)); + } else if (_info.dtype == INFINI_DTYPE_F32){ + CHECK_STATUS(inner(&_info, (const float *)input, (const float *)other, (float *)out)); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/inner/cpu/inner_cpu.h b/src/infiniop/ops/inner/cpu/inner_cpu.h new file mode 100644 index 000000000..5d4774dfa --- /dev/null +++ b/src/infiniop/ops/inner/cpu/inner_cpu.h @@ -0,0 +1,7 @@ +#ifndef __INNER_CPU_H__ +#define __INNER_CPU_H__ +#include "../inner.h" + +DESCRIPTOR(cpu); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/cuda/kernel.cuh b/src/infiniop/ops/inner/cuda/kernel.cuh new file mode 100644 index 000000000..1616a4281 --- /dev/null +++ b/src/infiniop/ops/inner/cuda/kernel.cuh @@ -0,0 +1,41 @@ +#ifndef __INNER_KERNEL_CUH__ +#define __INNER_KERNEL_CUH__ + +template +INFINIOP_CUDA_KERNEL innerKernel( + const T *input, const T *other, T *out, + size_t total_elements, size_t oper_len, size_t *out_shape, + ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, + size_t input_ndim, size_t other_ndim, size_t out_ndim) { + + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; + if (out_index >= total_elements) return; + + size_t out_offset = device::nvidia::indexToOffset(out_index, out_ndim, out_shape, out_strides); + + size_t out_dim_pos = out_ndim - 1; + size_t index = out_index; + + ptrdiff_t input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)other_ndim - 2; i >= 0; i --) { + other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; + index /= out_shape[out_dim_pos --]; + } + for (int i = (int)input_ndim - 2; i >= 0; i --) { + 
input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; + index /= out_shape[out_dim_pos --]; + } + + T tmp = 0; + for (size_t i = 0; i < oper_len; i ++) { + tmp += input[input_offset] * other[other_offset]; + input_offset += input_strides[input_ndim - 1]; + other_offset += other_strides[other_ndim - 1]; + } + + out[out_offset] = tmp; +} + +#endif // __INNER_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/info.h b/src/infiniop/ops/inner/info.h new file mode 100644 index 000000000..2484277eb --- /dev/null +++ b/src/infiniop/ops/inner/info.h @@ -0,0 +1,88 @@ +#ifndef __INNER_INFO_H__ +#define __INNER_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::inner { + +class InnerInfo { + InnerInfo() = default; + +public: + infiniDtype_t dtype; + + size_t input_ndim; + size_t other_ndim; + size_t out_ndim; + + size_t oper_len; + size_t total_elements; + + std::vector out_shape; + + std::vector input_strides; + std::vector other_strides; + std::vector out_strides; + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto dtype = out_desc->dtype(); + if (dtype != input_desc->dtype() || dtype != other_desc->dtype()) + return INFINI_STATUS_BAD_TENSOR_DTYPE; + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + auto input_ndim = input_desc->ndim(); + auto other_ndim = other_desc->ndim(); + auto out_ndim = out_desc->ndim(); + if (out_ndim + 2 != input_ndim + other_ndim) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + + auto input_shape = input_desc->shape(); + auto other_shape = other_desc->shape(); + auto out_shape = out_desc->shape(); + if (input_ndim && other_ndim) { + if (input_shape[input_ndim - 1] != other_shape[other_ndim - 1]) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + size_t out_dim_pos = 0; + if (input_ndim) { + for (size_t i = 0; i < input_ndim - 1; i ++) 
+ if (input_shape[i] != out_shape[out_dim_pos ++]) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (other_ndim) { + for (size_t i = 0; i < other_ndim - 1; i ++) + if (other_shape[i] != out_shape[out_dim_pos ++]) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t oper_len = 1; + if (input_ndim && other_ndim) { + oper_len = input_shape[input_ndim - 1]; + } + + size_t total_elements = 1; + for (size_t i = 0; i < out_ndim; i ++) + total_elements *= out_shape[i]; + + auto input_strides = input_desc->strides(); + auto other_strides = other_desc->strides(); + auto out_strides = out_desc->strides(); + + return utils::Result(InnerInfo{ + dtype, + input_ndim, other_ndim, out_ndim, + oper_len, total_elements, out_shape, + input_strides, other_strides, out_strides + }); + } +}; + +} // namespace op::inner + +#endif // __INNER_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/inner.h b/src/infiniop/ops/inner/inner.h new file mode 100644 index 000000000..5c44262a7 --- /dev/null +++ b/src/infiniop/ops/inner/inner.h @@ -0,0 +1,49 @@ +#ifndef Inner_H +#define Inner_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::inner::NAMESPACE { \ + class Descriptor final: public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + InnerInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + InnerInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t other_desc); \ + \ + infiniStatus_t calculate( \ + void 
*workspace, size_t workspace_size, \ + void *out, \ + const void *input, \ + const void *other, \ + void *stream) const; \ + }; \ + } + + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/metax/inner_metax.h b/src/infiniop/ops/inner/metax/inner_metax.h new file mode 100644 index 000000000..04a47d445 --- /dev/null +++ b/src/infiniop/ops/inner/metax/inner_metax.h @@ -0,0 +1,7 @@ +#ifndef __INNER_METAX_H__ +#define __INNER_METAX_H__ +#include "../inner.h" + +DESCRIPTOR(metax); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/metax/inner_metax.maca b/src/infiniop/ops/inner/metax/inner_metax.maca new file mode 100644 index 000000000..683f4a1e1 --- /dev/null +++ b/src/infiniop/ops/inner/metax/inner_metax.maca @@ -0,0 +1,123 @@ +#include "../../../devices/metax/metax_common.h" +#include "inner_metax.h" + +#include "../../../devices/metax/metax_kernel_common.h" + +#include "inner_metax_kernel.h" + +namespace op::inner::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto result = InnerInfo::create(out_desc, input_desc, other_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // out_shape + workspace_size += info.out_ndim * sizeof(size_t); + // input, other, out strides + workspace_size += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const InnerInfo &info, + const T *input, const T *other, T *out, + hcStream_t stream, void *workspace, size_t 
workspace_size) { + + size_t input_ndim = info.input_ndim; + size_t other_ndim = info.other_ndim; + size_t out_ndim = info.out_ndim; + size_t total_elements = info.total_elements; + size_t oper_len = info.oper_len; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *out_shape_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += out_ndim * sizeof(size_t); + + ptrdiff_t *input_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); + ptrdiff_t *other_strides_hc = input_strides_hc + input_ndim; + ptrdiff_t *out_strides_hc = other_strides_hc + other_ndim; + workspace_offset += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + + assert(workspace_offset == workspace_size); + + CHECK_METAX(hcMemcpyAsync(out_shape_hc, info.out_shape.data(), out_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_hc, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(other_strides_hc, info.other_strides.data(), other_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(out_strides_hc, info.out_strides.data(), out_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + innerKernel<<>>( + input, other, out, + total_elements, oper_len, out_shape_hc, + input_strides_hc, other_strides_hc, out_strides_hc, + input_ndim, other_ndim, out_ndim); + + // CHECK_METAX(hcDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream_) const { + + hcStream_t stream = (hcStream_t)stream_; +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T 
*)other, (T *)out, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __hpcc_bfloat16);\ + else if(_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if(_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CALCULATE_INNER_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CALCULATE_INNER_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_512) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/inner/metax/inner_metax_kernel.h b/src/infiniop/ops/inner/metax/inner_metax_kernel.h new file mode 100644 index 000000000..52f37e1ef --- /dev/null +++ b/src/infiniop/ops/inner/metax/inner_metax_kernel.h @@ -0,0 +1,59 @@ +#ifndef __INNER_METAX_KERNEL_H__ +#define __INNER_METAX_KERNEL_H__ + +template +INFINIOP_METAX_KERNEL innerKernel( + const T *input, const T *other, T *out, + size_t total_elements, size_t oper_len, size_t *out_shape, + ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, + size_t input_ndim, size_t other_ndim, size_t out_ndim) { + + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; + if (out_index >= total_elements) return; + + size_t out_offset = device::metax::indexToOffset(out_index, out_ndim, out_shape, out_strides); + + size_t out_dim_pos = out_ndim - 1; + size_t index = out_index; + + ptrdiff_t input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)other_ndim - 2; i >= 0; i --) { + other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; + index /= out_shape[out_dim_pos --]; + } + for (int i = 
(int)input_ndim - 2; i >= 0; i --) { + input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; + index /= out_shape[out_dim_pos --]; + } + + // T tmp = input[input_offset] * other[other_offset]; + // input_offset += input_strides[input_ndim - 1]; + // other_offset += other_strides[other_ndim - 1]; + T tmp = 0.; + for (size_t i = 0; i < oper_len; i ++) { + if constexpr (std::is_same_v || std::is_same_v) { + tmp += input[input_offset] * other[other_offset]; + } else { + // if (out_index == 0) { + // printf("input: %.10f\tother:%.10f\n", input[input_offset], other[other_offset]); + // } + T mul_a = __fmul_rn(input[input_offset], other[other_offset]); + // if (out_index == 0) { + // printf("mul answer: %.10f\n", mul_a); + // printf("tmp val: %.10f\n", tmp); + // } + tmp = __fadd_rd(tmp, mul_a); + // if (out_index == 0) + // printf("add answer: %.10f\n", tmp); + } + input_offset += input_strides[input_ndim - 1]; + other_offset += other_strides[other_ndim - 1]; + } + + // out[out_offset] = static_cast(tmp); + out[out_offset] = tmp; +} + +#endif // __INNER_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/moore/inner_moore.h b/src/infiniop/ops/inner/moore/inner_moore.h new file mode 100644 index 000000000..a508a6718 --- /dev/null +++ b/src/infiniop/ops/inner/moore/inner_moore.h @@ -0,0 +1,7 @@ +#ifndef __INNER_MOORE_H__ +#define __INNER_MOORE_H__ +#include "../inner.h" + +DESCRIPTOR(moore); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/moore/inner_moore.mu b/src/infiniop/ops/inner/moore/inner_moore.mu new file mode 100644 index 000000000..61a430d2c --- /dev/null +++ b/src/infiniop/ops/inner/moore/inner_moore.mu @@ -0,0 +1,123 @@ +#include "../../../devices/moore/moore_common.h" +#include "inner_moore.h" + +#include "../../../devices/moore/moore_kernel_common.h" + +#include "inner_moore_kernel.h" + +namespace op::inner::moore { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + 
+Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto result = InnerInfo::create(out_desc, input_desc, other_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // out_shape + workspace_size += info.out_ndim * sizeof(size_t); + // input, other, out strides + workspace_size += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const InnerInfo &info, + const T *input, const T *other, T *out, + musaStream_t stream, void *workspace, size_t workspace_size) { + + size_t input_ndim = info.input_ndim; + size_t other_ndim = info.other_ndim; + size_t out_ndim = info.out_ndim; + size_t total_elements = info.total_elements; + size_t oper_len = info.oper_len; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *out_shape_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += out_ndim * sizeof(size_t); + + ptrdiff_t *input_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); + ptrdiff_t *other_strides_musa = input_strides_musa + input_ndim; + ptrdiff_t *out_strides_musa = other_strides_musa + other_ndim; + workspace_offset += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + + assert(workspace_offset == workspace_size); + + CHECK_MOORE(musaMemcpyAsync(out_shape_musa, info.out_shape.data(), out_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(input_strides_musa, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), 
musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(other_strides_musa, info.other_strides.data(), other_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(out_strides_musa, info.out_strides.data(), out_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + innerKernel<<>>( + input, other, out, + total_elements, oper_len, out_shape_musa, + input_strides_musa, other_strides_musa, out_strides_musa, + input_ndim, other_ndim, out_ndim); + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream_) const { + + musaStream_t stream = (musaStream_t)stream_; +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T *)other, (T *)out, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __mt_bfloat16); \ + else if(_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if(_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { + CALCULATE_INNER_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { + CALCULATE_INNER_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_2048) { + CALCULATE_INNER_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_2048) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git 
a/src/infiniop/ops/inner/moore/inner_moore_kernel.h b/src/infiniop/ops/inner/moore/inner_moore_kernel.h new file mode 100644 index 000000000..de82dc8a9 --- /dev/null +++ b/src/infiniop/ops/inner/moore/inner_moore_kernel.h @@ -0,0 +1,45 @@ +#ifndef __INNER_MOORE_KERNEL_H__ +#define __INNER_MOORE_KERNEL_H__ + +template +INFINIOP_MOORE_KERNEL innerKernel( + const T *input, const T *other, T *out, + size_t total_elements, size_t oper_len, size_t *out_shape, + ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, + size_t input_ndim, size_t other_ndim, size_t out_ndim) { + + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; + if (out_index >= total_elements) return; + + size_t out_offset = device::moore::indexToOffset(out_index, out_ndim, out_shape, out_strides); + + size_t out_dim_pos = out_ndim - 1; + size_t index = out_index; + + ptrdiff_t input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)other_ndim - 2; i >= 0; i --) { + other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; + index /= out_shape[out_dim_pos --]; + } + for (int i = (int)input_ndim - 2; i >= 0; i --) { + input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; + index /= out_shape[out_dim_pos --]; + } + + float tmp = 0.; + for (size_t i = 0; i < oper_len; i ++) { + // if constexpr (std::is_same_v || std::is_same_v) { + tmp += static_cast(input[input_offset]) * static_cast(other[other_offset]); + // } else { + // tmp += input[input_offset] * other[other_offset]; + // } + input_offset += input_strides[input_ndim - 1]; + other_offset += other_strides[other_ndim - 1]; + } + + out[out_offset] = static_cast(tmp); +} + +#endif // __INNER_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/nvidia/inner_nvidia.cu b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu new file mode 100644 index 000000000..e2b3ed0f9 --- /dev/null +++ b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu @@ -0,0 +1,123 @@ +#include 
"../../../devices/nvidia/nvidia_common.cuh" +#include "inner_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "../cuda/kernel.cuh" + +namespace op::inner::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto result = InnerInfo::create(out_desc, input_desc, other_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // out_shape + workspace_size += info.out_ndim * sizeof(size_t); + // input, other, out strides + workspace_size += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const InnerInfo &info, + const T *input, const T *other, T *out, + cudaStream_t stream, void *workspace, size_t workspace_size) { + + size_t input_ndim = info.input_ndim; + size_t other_ndim = info.other_ndim; + size_t out_ndim = info.out_ndim; + size_t total_elements = info.total_elements; + size_t oper_len = info.oper_len; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *out_shape_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += out_ndim * sizeof(size_t); + + ptrdiff_t *input_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + ptrdiff_t *other_strides_cuda = input_strides_cuda + input_ndim; + ptrdiff_t *out_strides_cuda = other_strides_cuda + other_ndim; + workspace_offset += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + + 
CHECK_CUDA(cudaMemcpyAsync(out_shape_cuda, info.out_shape.data(), out_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(other_strides_cuda, info.other_strides.data(), other_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(out_strides_cuda, info.out_strides.data(), out_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + innerKernel<<>>( + input, other, out, + total_elements, oper_len, out_shape_cuda, + input_strides_cuda, other_strides_cuda, out_strides_cuda, + input_ndim, other_ndim, out_ndim); + + // CHECK_CUDA(cudaDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream_) const { + + cudaStream_t stream = (cudaStream_t)stream_; +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T *)other, (T *)out, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __nv_bfloat16); \ + else if(_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if(_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + } else if 
(_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh b/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh new file mode 100644 index 000000000..11033c2c0 --- /dev/null +++ b/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __INNER_NVIDIA_H__ +#define __INNER_NVIDIA_H__ +#include "../inner.h" + +DESCRIPTOR(nvidia); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/operator.cc b/src/infiniop/ops/inner/operator.cc new file mode 100644 index 000000000..c68762d0e --- /dev/null +++ b/src/infiniop/ops/inner/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/inner.h" + +#ifdef ENABLE_CPU_API +#include "cpu/inner_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/inner_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/inner_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/inner_moore.h" +#endif + +__C infiniStatus_t infiniopCreateInnerDescriptor( + infiniopHandle_t handle, + infiniopInnerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::inner::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + out_desc, \ + input_desc, \ + other_desc); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API 
+ CREATE(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopInner( + infiniopInnerDescriptor_t desc, + void *workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, out, input, other, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + 
DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + DESTROY(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc new file mode 100644 index 000000000..ae74587d8 --- /dev/null +++ b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc @@ -0,0 +1,58 @@ +#include "masked_select_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::masked_select::cpu { + +Descriptor::~Descriptor() {} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t maskedSelect(const MaskedSelectInfo *info, const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr) { + std::vector res; + for (size_t index = 0; index < info->total_elements; index ++) { + size_t input_offset = op::common_cpu::indexToOffset(index, info->ndim, info->shape.data(), info->input_strides.data()); + size_t mask_offset = op::common_cpu::indexToOffset(index, info->ndim, info->shape.data(), info->mask_strides.data()); + if (mask[mask_offset]) { + res.push_back(input[input_offset]); + } + } + *dlen_ptr = res.size(); + *data_ptr = new T[*dlen_ptr]; + std::memcpy(*data_ptr, res.data(), sizeof(T) * (*dlen_ptr)); + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + 
size_t *dlen_ptr, + void *stream) const { + + if (_info.dtype == INFINI_DTYPE_F32) { + CHECK_STATUS(maskedSelect(&_info, (const float *)input, mask, data_ptr, dlen_ptr)); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h new file mode 100644 index 000000000..9fed9e2e6 --- /dev/null +++ b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h @@ -0,0 +1,7 @@ +#ifndef __MASKED_SELECT_CPU_H__ +#define __MASKED_SELECT_CPU_H__ +#include "../masked_select.h" + +DESCRIPTOR(cpu) + +#endif // __MASKED_SELECT_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/cuda/kernel.cuh b/src/infiniop/ops/masked_select/cuda/kernel.cuh new file mode 100644 index 000000000..fc7485f31 --- /dev/null +++ b/src/infiniop/ops/masked_select/cuda/kernel.cuh @@ -0,0 +1,101 @@ +#ifndef __MASKED_SELECT_KERNEL_CUH__ +#define __MASKED_SELECT_KERNEL_CUH__ + +template +INFINIOP_CUDA_KERNEL maskedSelectGetMarkScanOnceKernel( + const bool *mask, size_t *mark_scan, size_t total_elements, + size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < total_elements) { + size_t mask_offset = device::nvidia::indexToOffset(index, ndim, shape, mask_strides); + smem[tid] = mask[mask_offset]; + } else { + smem[tid] = 0; + } + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + float temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_CUDA_KERNEL maskedSelectScanWithStrideKernel( + size_t *mark_scan, size_t total_elements, size_t stride) { + + __shared__ __align__(128) 
size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = (blockDim.x * blockIdx.x + threadIdx.x + 1) * stride - 1; + smem[tid] = index < total_elements ? mark_scan[index] : 0.; + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + size_t temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_CUDA_KERNEL maskedSelectCountScanResultKernel( + size_t *mark_scan, size_t *scan_result, size_t total_elements) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= total_elements) + return; + + size_t val = mark_scan[index]; + + for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { + size_t now_stride_block = (index + 1) / s; + size_t pre_stride_block = now_stride_block * s - 1; + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + continue; + val += mark_scan[pre_stride_block]; + } + + scan_result[index] = val; +} + +template +INFINIOP_CUDA_KERNEL maskedSelectGetDataKernel( + const T *input, const bool *mask, size_t *scan_result, T *data, size_t total_elements, + size_t *shape, ptrdiff_t *input_strides, ptrdiff_t *mask_strides, size_t ndim) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + + if (index >= total_elements) + return; + + size_t input_offset = device::nvidia::indexToOffset(index, ndim, shape, input_strides); + size_t mask_offset = device::nvidia::indexToOffset(index, ndim, shape, mask_strides); + + if (mask[mask_offset]) { + data[scan_result[index] - 1] = input[input_offset]; + } +} + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/info.h b/src/infiniop/ops/masked_select/info.h new file mode 100644 index 000000000..889b32821 --- /dev/null +++ b/src/infiniop/ops/masked_select/info.h @@ -0,0 +1,53 @@ +#ifndef __MASKED_SELECT__INFO_H__ +#define 
__MASKED_SELECT__INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::masked_select { + +class MaskedSelectInfo { + MaskedSelectInfo() = default; + +public: + infiniDtype_t dtype; + size_t ndim; + size_t total_elements; + + std::vector shape; + + std::vector input_strides; + std::vector mask_strides; + + static utils::Result create(infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t mask_desc) { + auto dtype = input_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F32); + CHECK_DTYPE(mask_desc->dtype(), INFINI_DTYPE_BOOL); + + auto shape = input_desc->shape(); + CHECK_SAME_SHAPE(shape, mask_desc->shape()); + + auto ndim = input_desc->ndim(); + + size_t total_elements = 1; + for (auto& dim : shape) { + total_elements *= dim; + } + + auto input_strides = input_desc->strides(); + auto mask_strides = mask_desc->strides(); + + return utils::Result(MaskedSelectInfo{ + dtype, + ndim, + total_elements, + shape, + input_strides, + mask_strides}); + } +}; + +} // namespace op::masked_select + +#endif // __MASKED_SELECT_INFO_H__ diff --git a/src/infiniop/ops/masked_select/masked_select.h b/src/infiniop/ops/masked_select/masked_select.h new file mode 100644 index 000000000..98148ebaf --- /dev/null +++ b/src/infiniop/ops/masked_select/masked_select.h @@ -0,0 +1,48 @@ +#ifndef MASKED_SELECT_H +#define MASKED_SELECT_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::masked_select::NAMESPACE { \ + class Descriptor final: public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + MaskedSelectInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + MaskedSelectInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + \ + public: \ + ~Descriptor(); \ + size_t 
workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t mask_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + const void *input, \ + const bool *mask, \ + void **data_ptr, \ + size_t *dlen_ptr, \ + void *stream) const; \ + }; \ + } + +#endif // MASKED_SELECT_H \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/metax/masked_select_metax.h b/src/infiniop/ops/masked_select/metax/masked_select_metax.h new file mode 100644 index 000000000..47460777f --- /dev/null +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax.h @@ -0,0 +1,7 @@ +#ifndef __MASKED_SELECT_METAX_H__ +#define __MASKED_SELECT_METAX_H__ +#include "../masked_select.h" + +DESCRIPTOR(metax); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/metax/masked_select_metax.maca b/src/infiniop/ops/masked_select/metax/masked_select_metax.maca new file mode 100644 index 000000000..4a4181153 --- /dev/null +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax.maca @@ -0,0 +1,154 @@ +#include "../../../devices/metax/metax_common.h" +#include "masked_select_metax.h" + +#include "../../../devices/metax/metax_kernel_common.h" + +// #include "infinicore/context/context.hpp" +#include "masked_select_metax_kernel.h" + +namespace op::masked_select::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // shape size + workspace_size += info.ndim * sizeof(size_t); + // 
strides size * 2 + workspace_size += info.ndim * sizeof(ptrdiff_t) * 2; + // size_t mark_scan + workspace_size += info.total_elements * sizeof(size_t); + // size_t mark_result + workspace_size += info.total_elements * sizeof(size_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const MaskedSelectInfo &info, + const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, + hcStream_t stream, void *workspace, size_t workspace_size) { + + size_t ndim = info.ndim; + size_t total_elements = info.total_elements; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *shape_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(size_t); + + ptrdiff_t *input_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + ptrdiff_t *mask_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + size_t *mark_scan = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + size_t *scan_result = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + assert(workspace_offset == workspace_size); + + CHECK_METAX(hcMemcpyAsync(shape_hc, info.shape.data(), ndim * sizeof(size_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_hc, info.input_strides.data(), ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(mask_strides_hc, info.mask_strides.data(), ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + block_size - 1) / block_size; + + 
maskedSelectGetMarkScanOnceKernel<<>>( + mask, mark_scan, total_elements, + shape_hc, mask_strides_hc, ndim); + CHECK_METAX(hcDeviceSynchronize()); + + for (size_t stride = BLOCK_SIZE; stride < total_elements; stride *= BLOCK_SIZE) { + size_t stride_elements = (total_elements + stride - 1) / stride; + size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + maskedSelectScanWithStrideKernel<<>>( + mark_scan, total_elements, stride); + CHECK_METAX(hcDeviceSynchronize()); + } + if (total_elements > BLOCK_SIZE) { + maskedSelectCountScanResultKernel<<>>( + mark_scan, scan_result, total_elements); + CHECK_METAX(hcDeviceSynchronize()); + } else { + scan_result = mark_scan; + } + + + CHECK_METAX(hcMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), hcMemcpyDeviceToHost, stream)); + + CHECK_METAX(hcMalloc(data_ptr, *dlen_ptr * sizeof(T))); + + // context::setDevice(METAX); + // context::allocateMemory(data_ptr, *dlen_ptr * sizeof(T)); + + maskedSelectGetDataKernel<<>>( + input, mask, scan_result, (T *)*data_ptr, total_elements, + shape_hc, input_strides_hc, mask_strides_hc, ndim); + CHECK_METAX(hcDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen_ptr, + void *stream_) const { + + hcStream_t stream = (hcStream_t)stream_; +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_1024); + } else if 
(_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_512); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} diff --git a/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h b/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h new file mode 100644 index 000000000..e590de955 --- /dev/null +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h @@ -0,0 +1,101 @@ +#ifndef __MASKED_SELECT_METAX_KERNEL_CUH__ +#define __MASKED_SELECT_METAX_KERNEL_CUH__ + +template +INFINIOP_METAX_KERNEL maskedSelectGetMarkScanOnceKernel( + const bool *mask, size_t *mark_scan, size_t total_elements, + size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < total_elements) { + size_t mask_offset = device::metax::indexToOffset(index, ndim, shape, mask_strides); + smem[tid] = mask[mask_offset]; + } else { + smem[tid] = 0; + } + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + float temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_METAX_KERNEL maskedSelectScanWithStrideKernel( + size_t *mark_scan, size_t total_elements, size_t stride) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = (blockDim.x * blockIdx.x + threadIdx.x + 1) * stride - 1; + smem[tid] = index < total_elements ? 
mark_scan[index] : 0.; + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + size_t temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_METAX_KERNEL maskedSelectCountScanResultKernel( + size_t *mark_scan, size_t *scan_result, size_t total_elements) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= total_elements) + return; + + size_t val = mark_scan[index]; + + for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { + size_t now_stride_block = (index + 1) / s; + size_t pre_stride_block = now_stride_block * s - 1; + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + continue; + val += mark_scan[pre_stride_block]; + } + + scan_result[index] = val; +} + +template +INFINIOP_METAX_KERNEL maskedSelectGetDataKernel( + const T *input, const bool *mask, size_t *scan_result, T *data, size_t total_elements, + size_t *shape, ptrdiff_t *input_strides, ptrdiff_t *mask_strides, size_t ndim) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + + if (index >= total_elements) + return; + + size_t input_offset = device::metax::indexToOffset(index, ndim, shape, input_strides); + size_t mask_offset = device::metax::indexToOffset(index, ndim, shape, mask_strides); + + if (mask[mask_offset]) { + data[scan_result[index] - 1] = input[input_offset]; + } +} + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/moore/masked_select_moore.h b/src/infiniop/ops/masked_select/moore/masked_select_moore.h new file mode 100644 index 000000000..24548d8fc --- /dev/null +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore.h @@ -0,0 +1,7 @@ +#ifndef __MASKED_SELECT_MOORE_H__ +#define __MASKED_SELECT_MOORE_H__ +#include "../masked_select.h" + +DESCRIPTOR(moore); + +#endif \ No newline at end of file diff 
--git a/src/infiniop/ops/masked_select/moore/masked_select_moore.mu b/src/infiniop/ops/masked_select/moore/masked_select_moore.mu new file mode 100644 index 000000000..3c218bc19 --- /dev/null +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore.mu @@ -0,0 +1,156 @@ +#include "../../../devices/moore/moore_common.h" +#include "masked_select_moore.h" + +#include "../../../devices/moore/moore_kernel_common.h" + +// #include "infinicore/context/context.hpp" +#include "masked_select_moore_kernel.h" + +namespace op::masked_select::moore { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // shape size + workspace_size += info.ndim * sizeof(size_t); + // strides size * 2 + workspace_size += info.ndim * sizeof(ptrdiff_t) * 2; + // size_t mark_scan + workspace_size += info.total_elements * sizeof(size_t); + // size_t mark_result + workspace_size += info.total_elements * sizeof(size_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const MaskedSelectInfo &info, + const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, + musaStream_t stream, void *workspace, size_t workspace_size) { + + size_t ndim = info.ndim; + size_t total_elements = info.total_elements; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *shape_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(size_t); + + ptrdiff_t 
*input_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + ptrdiff_t *mask_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + size_t *mark_scan = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + size_t *scan_result = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + assert(workspace_offset == workspace_size); + + CHECK_MOORE(musaMemcpyAsync(shape_musa, info.shape.data(), ndim * sizeof(size_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(input_strides_musa, info.input_strides.data(), ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(mask_strides_musa, info.mask_strides.data(), ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + block_size - 1) / block_size; + + maskedSelectGetMarkScanOnceKernel<<>>( + mask, mark_scan, total_elements, + shape_musa, mask_strides_musa, ndim); + CHECK_MOORE(musaDeviceSynchronize()); + + for (size_t stride = BLOCK_SIZE; stride < total_elements; stride *= BLOCK_SIZE) { + size_t stride_elements = (total_elements + stride - 1) / stride; + size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + maskedSelectScanWithStrideKernel<<>>( + mark_scan, total_elements, stride); + CHECK_MOORE(musaDeviceSynchronize()); + } + if (total_elements > BLOCK_SIZE) { + maskedSelectCountScanResultKernel<<>>( + mark_scan, scan_result, total_elements); + CHECK_MOORE(musaDeviceSynchronize()); + } else { + scan_result = mark_scan; + } + + + CHECK_MOORE(musaMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), musaMemcpyDeviceToHost, stream)); + + CHECK_MOORE(musaMalloc(data_ptr, *dlen_ptr * sizeof(T))); + + // 
context::setDevice(MOORE); + // context::allocateMemory(data_ptr, *dlen_ptr * sizeof(T)); + + maskedSelectGetDataKernel<<>>( + input, mask, scan_result, (T *)*data_ptr, total_elements, + shape_musa, input_strides_musa, mask_strides_musa, ndim); + CHECK_MOORE(musaDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen_ptr, + void *stream_) const { + + musaStream_t stream = (musaStream_t)stream_; +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_1024); + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_512); + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_2048) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_2048); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} diff --git a/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h b/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h new file mode 100644 index 000000000..dcdd87c6d --- /dev/null +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h @@ -0,0 +1,101 @@ +#ifndef __MASKED_SELECT_MOORE_KERNEL_CUH__ +#define __MASKED_SELECT_MOORE_KERNEL_CUH__ + +template +INFINIOP_MOORE_KERNEL maskedSelectGetMarkScanOnceKernel( + const bool *mask, size_t *mark_scan, 
size_t total_elements, + size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < total_elements) { + size_t mask_offset = device::moore::indexToOffset(index, ndim, shape, mask_strides); + smem[tid] = mask[mask_offset]; + } else { + smem[tid] = 0; + } + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + float temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_MOORE_KERNEL maskedSelectScanWithStrideKernel( + size_t *mark_scan, size_t total_elements, size_t stride) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = (blockDim.x * blockIdx.x + threadIdx.x + 1) * stride - 1; + smem[tid] = index < total_elements ? mark_scan[index] : 0.; + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + size_t temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_MOORE_KERNEL maskedSelectCountScanResultKernel( + size_t *mark_scan, size_t *scan_result, size_t total_elements) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= total_elements) + return; + + size_t val = mark_scan[index]; + + for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { + size_t now_stride_block = (index + 1) / s; + size_t pre_stride_block = now_stride_block * s - 1; + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + continue; + val += mark_scan[pre_stride_block]; + } + + scan_result[index] = val; +} + +template +INFINIOP_MOORE_KERNEL maskedSelectGetDataKernel( + const T *input, const 
bool *mask, size_t *scan_result, T *data, size_t total_elements, + size_t *shape, ptrdiff_t *input_strides, ptrdiff_t *mask_strides, size_t ndim) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + + if (index >= total_elements) + return; + + size_t input_offset = device::moore::indexToOffset(index, ndim, shape, input_strides); + size_t mask_offset = device::moore::indexToOffset(index, ndim, shape, mask_strides); + + if (mask[mask_offset]) { + data[scan_result[index] - 1] = input[input_offset]; + } +} + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu new file mode 100644 index 000000000..eebba0fd3 --- /dev/null +++ b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu @@ -0,0 +1,149 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "masked_select_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "../cuda/kernel.cuh" + +namespace op::masked_select::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // shape size + workspace_size += info.ndim * sizeof(size_t); + // strides size * 2 + workspace_size += info.ndim * sizeof(ptrdiff_t) * 2; + // size_t mark_scan + workspace_size += info.total_elements * sizeof(size_t); + // size_t mark_result + workspace_size += info.total_elements * sizeof(size_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace 
{ + +template +infiniStatus_t launchKernel( + const MaskedSelectInfo &info, + const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, + cudaStream_t stream, void *workspace, size_t workspace_size) { + + size_t ndim = info.ndim; + size_t total_elements = info.total_elements; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *shape_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(size_t); + + ptrdiff_t *input_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + ptrdiff_t *mask_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + size_t *mark_scan = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + size_t *scan_result = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + CHECK_CUDA(cudaMemcpyAsync(shape_cuda, info.shape.data(), ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(mask_strides_cuda, info.mask_strides.data(), ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + block_size - 1) / block_size; + + maskedSelectGetMarkScanOnceKernel<<>>( + mask, mark_scan, total_elements, + shape_cuda, mask_strides_cuda, ndim); + CHECK_CUDA(cudaDeviceSynchronize()); + + for (size_t stride = BLOCK_SIZE; stride < total_elements; stride *= BLOCK_SIZE) { + size_t stride_elements = (total_elements + stride - 1) / stride; + size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + maskedSelectScanWithStrideKernel<<>>( + mark_scan, 
total_elements, stride); + CHECK_CUDA(cudaDeviceSynchronize()); + } + if (total_elements > BLOCK_SIZE) { + maskedSelectCountScanResultKernel<<>>( + mark_scan, scan_result, total_elements); + CHECK_CUDA(cudaDeviceSynchronize()); + } else { + scan_result = mark_scan; + } + + CHECK_CUDA(cudaMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), cudaMemcpyDeviceToHost, stream)); + + CHECK_CUDA(cudaMalloc(data_ptr, *dlen_ptr * sizeof(T))); + + maskedSelectGetDataKernel<<>>( + input, mask, scan_result, (T *)*data_ptr, total_elements, + shape_cuda, input_strides_cuda, mask_strides_cuda, ndim); + CHECK_CUDA(cudaDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen_ptr, + void *stream_) const { + + cudaStream_t stream = (cudaStream_t)stream_; +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} diff --git a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh 
b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh new file mode 100644 index 000000000..bc493edbe --- /dev/null +++ b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MASKED_SELECT_NVIDIA_H__ +#define __MASKED_SELECT_NVIDIA_H__ + +#include "../masked_select.h" + +DESCRIPTOR(nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/operator.cc b/src/infiniop/ops/masked_select/operator.cc new file mode 100644 index 000000000..055b67b12 --- /dev/null +++ b/src/infiniop/ops/masked_select/operator.cc @@ -0,0 +1,138 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/masked_select.h" + +#ifdef ENABLE_CPU_API +#include "cpu/masked_select_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/masked_select_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/masked_select_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/masked_select_moore.h" +#endif + +__C infiniStatus_t infiniopCreateMaskedSelectDescriptor( + infiniopHandle_t handle, + infiniopMaskedSelectDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::masked_select::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr),\ + input_desc, \ + mask_desc); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t 
infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopMaskedSelect( + infiniopMaskedSelectDescriptor_t desc, + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_size, + size_t *dlen, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, input, mask, data_size, dlen, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef 
ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + DESTROY(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.cc b/src/infiniop/ops/tan/cpu/tan_cpu.cc new file mode 100644 index 000000000..90939f9cb --- /dev/null +++ b/src/infiniop/ops/tan/cpu/tan_cpu.cc @@ -0,0 +1,50 @@ +#include "tan_cpu.h" + +namespace op::tan::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::cpu diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.h b/src/infiniop/ops/tan/cpu/tan_cpu.h new file mode 100644 index 000000000..04004dd19 --- /dev/null +++ b/src/infiniop/ops/tan/cpu/tan_cpu.h @@ -0,0 +1,23 @@ +#ifndef __TAN_CPU_H__ +#define __TAN_CPU_H__ + +#include 
"../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(tan, cpu) + +#include + +namespace op::tan::cpu { +typedef struct TanOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::tan(x); + } +} TanOp; + +} // namespace op::tan::cpu + +#endif // __TAN_CPU_H__ diff --git a/src/infiniop/ops/tan/cuda/kernel.cuh b/src/infiniop/ops/tan/cuda/kernel.cuh new file mode 100644 index 000000000..85c9eca27 --- /dev/null +++ b/src/infiniop/ops/tan/cuda/kernel.cuh @@ -0,0 +1,30 @@ +#ifndef __TAN_CUDA_H__ +#define __TAN_CUDA_H__ + +namespace op::tan::cuda { + +typedef struct TanOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // BF16 + const float x_f = __bfloat162float(x); + return __float2bfloat16(__tanf(x_f)); + } else if constexpr (std::is_same_v) { + // FP16 + const float x_f = __half2float(x); + return __float2half(__tanf(x_f)); + } else if constexpr (std::is_same_v) { + // FP32 + return __tanf(x); + } else { + return __tanf(x); + } + } +} TanOp; + +} // namespace op::tan::cuda + +#endif // __TAN_CUDA_H__ diff --git a/src/infiniop/ops/tan/metax/tan_metax.h b/src/infiniop/ops/tan/metax/tan_metax.h new file mode 100644 index 000000000..507fd9d5e --- /dev/null +++ b/src/infiniop/ops/tan/metax/tan_metax.h @@ -0,0 +1,8 @@ +#ifndef __TAN_METAX_API_H__ +#define __TAN_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(tan, metax) + +#endif // __TAN_METAX_API_H__ diff --git a/src/infiniop/ops/tan/metax/tan_metax.maca b/src/infiniop/ops/tan/metax/tan_metax.maca new file mode 100644 index 000000000..85768bec3 --- /dev/null +++ b/src/infiniop/ops/tan/metax/tan_metax.maca @@ -0,0 +1,56 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "tan_metax.h" + +namespace op::tan::metax { + 
+Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanOp, hpcc_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::metax diff --git a/src/infiniop/ops/tan/moore/tan_moore.h b/src/infiniop/ops/tan/moore/tan_moore.h new file mode 100644 index 000000000..c261e4a69 --- /dev/null +++ b/src/infiniop/ops/tan/moore/tan_moore.h @@ -0,0 +1,8 @@ +#ifndef __TAN_MOORE_API_H__ +#define __TAN_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(tan, moore) + +#endif // __TAN_MOORE_API_H__ diff --git a/src/infiniop/ops/tan/moore/tan_moore.mu b/src/infiniop/ops/tan/moore/tan_moore.mu new file mode 100644 index 000000000..4caec16ef 
--- /dev/null +++ b/src/infiniop/ops/tan/moore/tan_moore.mu @@ -0,0 +1,56 @@ +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "../cuda/kernel.cuh" +#include "tan_moore.h" + +namespace op::tan::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanOp, mt_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::moore diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cu b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu new file mode 100644 index 000000000..d5415cb62 --- /dev/null +++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu @@ -0,0 +1,57 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" 
+#include "tan_nvidia.cuh" + +namespace op::tan::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::nvidia diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh b/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh new file mode 100644 index 000000000..1915cc2b1 --- /dev/null +++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TAN_CUDA_API_H__ +#define __TAN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tan, nvidia) + +#endif // __TAN_CUDA_API_H__ diff --git 
a/src/infiniop/ops/tan/operator.cc b/src/infiniop/ops/tan/operator.cc new file mode 100644 index 000000000..1f222a8d1 --- /dev/null +++ b/src/infiniop/ops/tan/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tan.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tan_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/tan_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/tan_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/tan_moore.h" +#endif + +__C infiniStatus_t infiniopCreateTanDescriptor( + infiniopHandle_t handle, + infiniopTanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tan::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + 
GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTan( + infiniopTanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.cc b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.cc new file mode 100644 index 000000000..1abca6a33 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.cc @@ -0,0 +1,50 @@ +#include "tanhshrink_cpu.h" + +namespace op::tanhshrink::cpu { + 
+Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::cpu diff --git a/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.h b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.h new file mode 100644 index 000000000..8c5f722ad --- /dev/null +++ b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.h @@ -0,0 +1,23 @@ +#ifndef __TANHSHRINK_CPU_H__ +#define __TANHSHRINK_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, cpu) + +#include + +namespace op::tanhshrink::cpu { +typedef struct TanhshrinkOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return x - tanh(x); + } +} TanhshrinkOp; + +} // namespace op::tanhshrink::cpu + +#endif // __TANHSHRINK_CPU_H__ 
diff --git a/src/infiniop/ops/tanhshrink/cuda/kernel.cuh b/src/infiniop/ops/tanhshrink/cuda/kernel.cuh new file mode 100644 index 000000000..dfb88731e --- /dev/null +++ b/src/infiniop/ops/tanhshrink/cuda/kernel.cuh @@ -0,0 +1,30 @@ +#ifndef __TANHSHRINK_CUDA_H__ +#define __TANHSHRINK_CUDA_H__ + +namespace op::tanhshrink::cuda { + +typedef struct TanhshrinkOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // BF16 + const float x_f = __bfloat162float(x); + return __float2bfloat16(x_f - tanhf(x_f)); + } else if constexpr (std::is_same_v) { + // FP16 + const float x_f = __half2float(x); + return __float2half(x_f - tanhf(x_f)); + } else if constexpr (std::is_same_v) { + // FP32 + return x - tanhf(x); + } else { + return x - tanhf(x); + } + } +} TanhshrinkOp; + +} // namespace op::tanhshrink::cuda + +#endif // __TANHSHRINK_CUDA_H__ diff --git a/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.h b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.h new file mode 100644 index 000000000..b79edcbf9 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.h @@ -0,0 +1,8 @@ +#ifndef __TANHSHRINK_METAX_API_H__ +#define __TANHSHRINK_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, metax) + +#endif // __TANHSHRINK_METAX_API_H__ diff --git a/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca new file mode 100644 index 000000000..c63390ab7 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca @@ -0,0 +1,56 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "tanhshrink_metax.h" + +namespace op::tanhshrink::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + 
infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhshrinkOp, hpcc_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhshrinkOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhshrinkOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::metax diff --git a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.h b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.h new file mode 100644 index 000000000..5a01a76ce --- /dev/null +++ b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.h @@ -0,0 +1,8 @@ +#ifndef __TANHSHRINK_MOORE_API_H__ +#define __TANHSHRINK_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, moore) + +#endif // __TANHSHRINK_MOORE_API_H__ diff --git a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu new file mode 100644 index 000000000..4c4bdee8f --- 
/dev/null +++ b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu @@ -0,0 +1,56 @@ +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "../cuda/kernel.cuh" +#include "tanhshrink_moore.h" + +namespace op::tanhshrink::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhshrinkOp, mt_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhshrinkOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhshrinkOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::moore diff --git a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h new file mode 100644 index 000000000..9c75b916e --- /dev/null +++ b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h @@ -0,0 +1,28 
@@ +#ifndef __TANHSHRINK_MOORE_KERNEL_H__ +#define __TANHSHRINK_MOORE_KERNEL_H__ + +namespace op::tanhshrink::moore { + +typedef struct TanhshrinkOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // BF16 + const float x_f = __bfloat162float(x); + return __float2bfloat16(x_f - tanhf(x_f)); + } else if constexpr (std::is_same_v) { + // FP16 + const float x_f = __half2float(x); + return __float2half(x_f - tanhf(x_f)); + } else if constexpr (std::is_same_v) { + // FP32 + return x - tanhf(x); + } + } +} TanhshrinkOp; + +} // namespace op::tanhshrink::moore + +#endif // __TANHSHRINK_MOORE_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cu b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cu new file mode 100644 index 000000000..baff908a2 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "tanhshrink_nvidia.cuh" + +namespace op::tanhshrink::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + 
void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhshrinkOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhshrinkOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhshrinkOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhshrinkOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::nvidia diff --git a/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cuh b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cuh new file mode 100644 index 000000000..abe56f444 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TANHSHRINK_CUDA_API_H__ +#define __TANHSHRINK_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, nvidia) + +#endif // __TANHSHRINK_CUDA_API_H__ diff --git a/src/infiniop/ops/tanhshrink/operator.cc b/src/infiniop/ops/tanhshrink/operator.cc new file mode 100644 index 000000000..66e62b850 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tanhshrink.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tanhshrink_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/tanhshrink_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/tanhshrink_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/tanhshrink_moore.h" +#endif + +__C infiniStatus_t 
infiniopCreateTanhshrinkDescriptor( + infiniopHandle_t handle, + infiniopTanhshrinkDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tanhshrink::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTanhshrink( + infiniopTanhshrinkDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef 
ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanhshrinkDescriptor(infiniopTanhshrinkDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infinicore/ops/cat.py b/test/infinicore/ops/cat.py index 669fbfd1b..e9b85f2e5 100644 --- a/test/infinicore/ops/cat.py +++ b/test/infinicore/ops/cat.py @@ -126,9 +126,9 @@ def torch_operator(self, *args, **kwargs): """PyTorch cat implementation""" return torch.cat(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore cat implementation""" - # return infinicore.cat(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore cat implementation""" + return infinicore.cat(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/inner.py b/test/infinicore/ops/inner.py index 3f75a6b64..166fabe33 100644 --- a/test/infinicore/ops/inner.py +++ b/test/infinicore/ops/inner.py @@ -62,9 +62,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return 
torch.inner(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.inner(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.inner(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/masked_select.py b/test/infinicore/ops/masked_select.py index 711a0cef0..27c28842b 100644 --- a/test/infinicore/ops/masked_select.py +++ b/test/infinicore/ops/masked_select.py @@ -53,9 +53,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.masked_select(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.masked_select(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.masked_select(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/tan.py b/test/infinicore/ops/tan.py index c9948c3c6..4c786d114 100644 --- a/test/infinicore/ops/tan.py +++ b/test/infinicore/ops/tan.py @@ -97,9 +97,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.tan(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.tan(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.tan(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/tanhshrink.py b/test/infinicore/ops/tanhshrink.py index ca3559f22..f85e36446 100644 --- a/test/infinicore/ops/tanhshrink.py +++ b/test/infinicore/ops/tanhshrink.py @@ -68,9 +68,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.tanhshrink(*args, 
**kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.tanhshrink(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.nn.functional.tanhshrink(*args, **kwargs) def main():