diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp
index 5274dde48..e3ca88797 100644
--- a/include/infinicore/ops.hpp
+++ b/include/infinicore/ops.hpp
@@ -5,12 +5,17 @@
 #include "ops/add_rms_norm.hpp"
 #include "ops/asinh.hpp"
 #include "ops/attention.hpp"
+#include "ops/avg_pool1d.hpp"
 #include "ops/baddbmm.hpp"
 #include "ops/bilinear.hpp"
 #include "ops/causal_softmax.hpp"
+#include "ops/cross_entropy.hpp"
 #include "ops/embedding.hpp"
+#include "ops/equal.hpp"
 #include "ops/flash_attention.hpp"
 #include "ops/fmod.hpp"
+#include "ops/hardswish.hpp"
+#include "ops/hardtanh.hpp"
 #include "ops/kv_caching.hpp"
 #include "ops/matmul.hpp"
 #include "ops/ones.hpp"
diff --git a/include/infinicore/ops/avg_pool1d.hpp b/include/infinicore/ops/avg_pool1d.hpp
new file mode 100644
index 000000000..4bf69bc2a
--- /dev/null
+++ b/include/infinicore/ops/avg_pool1d.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class AvgPool1d {
+public:
+    using schema = void (*)(Tensor, Tensor, size_t, size_t, size_t);
+    static void execute(Tensor output, Tensor input, size_t kernel_size, size_t stride, size_t padding);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor avg_pool1d(Tensor input, size_t kernel_size, size_t stride = 0, size_t padding = 0);
+void avg_pool1d_(Tensor output, Tensor input, size_t kernel_size, size_t stride = 0, size_t padding = 0);
+
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/cross_entropy.hpp b/include/infinicore/ops/cross_entropy.hpp
new file mode 100644
index 000000000..958ee1089
--- /dev/null
+++ b/include/infinicore/ops/cross_entropy.hpp
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class CrossEntropy {
+public:
+    // Schema: the function-pointer type for this op.
+    // CrossEntropy takes three Tensors: output (loss), input (logits), target (labels).
+    using schema = void (*)(Tensor, Tensor, Tensor);
+
+    // Execution entry point.
+    static void execute(Tensor output, Tensor input, Tensor target);
+
+    // Dispatcher accessor.
+    static common::OpDispatcher &dispatcher();
+};
+
+// ==================================================================
+// Public functional API
+// ==================================================================
+
+// 1. Out-of-place variant:
+//    takes logits and target, allocates the output tensor internally, and returns it.
+Tensor cross_entropy(Tensor input, Tensor target);
+
+// 2. Explicit-output variant (in-place style):
+//    the caller provides the output tensor that receives the result.
+//    Note: despite the trailing underscore, cross entropy cannot actually modify
+//    `input` in place; the suffix only means "write into the given output buffer".
+void cross_entropy_(Tensor output, Tensor input, Tensor target);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/equal.hpp b/include/infinicore/ops/equal.hpp
new file mode 100644
index 000000000..1a158bf1e
--- /dev/null
+++ b/include/infinicore/ops/equal.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Equal {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor);
+
+    static void execute(Tensor out, Tensor a, Tensor b);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor equal(Tensor a, Tensor b);
+void equal_(Tensor out, Tensor a, Tensor b);
+
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/hardswish.hpp b/include/infinicore/ops/hardswish.hpp
new file mode 100644
index 000000000..15313f461
--- /dev/null
+++ b/include/infinicore/ops/hardswish.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Hardswish {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor output, Tensor input);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor hardswish(Tensor input);
+void hardswish_(Tensor output, Tensor input);
+
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/hardtanh.hpp b/include/infinicore/ops/hardtanh.hpp
new file mode 100644
index 000000000..511408fee
--- /dev/null
+++ b/include/infinicore/ops/hardtanh.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class HardTanh {
+public:
+    using schema = void (*)(Tensor, Tensor, float, float);
+    static void execute(Tensor output, Tensor input, float min_val, float max_val);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor hardtanh(Tensor input, float min_val = -1.0f, float max_val = 1.0f);
+void hardtanh_(Tensor output, Tensor input, float min_val = -1.0f, float max_val = 1.0f);
+
+} // namespace infinicore::op
diff --git a/include/infiniop.h b/include/infiniop.h
index 4217183f7..a73bd20a0 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -45,4 +45,10 @@
 #include "infiniop/ops/zeros.h"
 #include "infiniop/tensor_descriptor.h"
 
+#include "infiniop/ops/avg_pool1d.h"
+#include "infiniop/ops/cross_entropy.h"
+#include "infiniop/ops/equal.h"
+#include "infiniop/ops/hardswish.h"
+#include "infiniop/ops/hardtanh.h"
+
 #endif // __INFINIOP_API_H__
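[review note] For reference, the contract documented in cross_entropy.hpp above (and in the Python wrapper later in this patch) is token-wise: the loss has the target's shape and no reduction is applied, i.e. loss[i] = logsumexp(logits[i]) - logits[i, target[i]]. A minimal NumPy sketch of that semantics, assuming integer class labels over the last axis; the name cross_entropy_ref is illustrative, this is not the shipped kernel:

    import numpy as np

    def cross_entropy_ref(logits, target):
        # loss[i] = logsumexp(logits[i]) - logits[i, target[i]]
        m = logits.max(axis=-1, keepdims=True)  # subtract max for a stable exp
        lse = m.squeeze(-1) + np.log(np.exp(logits - m).sum(axis=-1))
        picked = np.take_along_axis(logits, target[..., None], axis=-1).squeeze(-1)
        return lse - picked  # same shape as target, logits dtype

diff --git a/include/infiniop/ops/avg_pool1d.h b/include/infiniop/ops/avg_pool1d.h
new file mode 100644
index 000000000..5c0bdf6ea
--- /dev/null
+++ b/include/infiniop/ops/avg_pool1d.h
@@ -0,0 +1,32 @@
+#ifndef __INFINIOP_AVG_POOL1D_API_H__
+#define __INFINIOP_AVG_POOL1D_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopAvgPool1dDescriptor_t;
+
+__INFINI_C __export infiniStatus_t infiniopCreateAvgPool1dDescriptor(
+    infiniopHandle_t handle,
+    infiniopAvgPool1dDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t output,
+    infiniopTensorDescriptor_t input,
+    size_t kernel_size,
+    size_t stride,
+    size_t padding);
+
+__INFINI_C __export infiniStatus_t infiniopGetAvgPool1dWorkspaceSize(
+    infiniopAvgPool1dDescriptor_t desc,
+    size_t *size);
+
+__INFINI_C __export infiniStatus_t infiniopAvgPool1d(
+    infiniopAvgPool1dDescriptor_t desc,
+    void *workspace,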
size_t workspace_size, + void *output, + const void *input, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyAvgPool1dDescriptor( + infiniopAvgPool1dDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/cross_entropy.h b/include/infiniop/ops/cross_entropy.h new file mode 100644 index 000000000..6c9c2a773 --- /dev/null +++ b/include/infiniop/ops/cross_entropy.h @@ -0,0 +1,31 @@ +#ifndef __INFINIOP_CROSS_ENTROPY_API_H__ +#define __INFINIOP_CROSS_ENTROPY_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCrossEntropyDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateCrossEntropyDescriptor( + infiniopHandle_t handle, + infiniopCrossEntropyDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t target_desc); + +__INFINI_C __export infiniStatus_t infiniopGetCrossEntropyWorkspaceSize( + infiniopCrossEntropyDescriptor_t desc, + size_t *size); + +__INFINI_C __export infiniStatus_t infiniopCrossEntropy( + infiniopCrossEntropyDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *target, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyCrossEntropyDescriptor( + infiniopCrossEntropyDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/equal.h b/include/infiniop/ops/equal.h new file mode 100644 index 000000000..5476f754c --- /dev/null +++ b/include/infiniop/ops/equal.h @@ -0,0 +1,31 @@ +#ifndef __INFINIOP_EQUAL_API_H__ +#define __INFINIOP_EQUAL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopEqualDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateEqualDescriptor( + infiniopHandle_t handle, + infiniopEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__INFINI_C __export infiniStatus_t infiniopGetEqualWorkspaceSize( + infiniopEqualDescriptor_t desc, + size_t *size); + +__INFINI_C __export infiniStatus_t infiniopEqual( + infiniopEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyEqualDescriptor( + infiniopEqualDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h new file mode 100644 index 000000000..ba5b43b77 --- /dev/null +++ b/include/infiniop/ops/hardswish.h @@ -0,0 +1,29 @@ +#ifndef __INFINIOP_HARDSWISH_API_H__ +#define __INFINIOP_HARDSWISH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopHardSwishDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateHardSwishDescriptor( + infiniopHandle_t handle, + infiniopHardSwishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__INFINI_C __export infiniStatus_t infiniopGetHardSwishWorkspaceSize( + infiniopHardSwishDescriptor_t desc, + size_t *size); + +__INFINI_C __export infiniStatus_t infiniopHardSwish( + infiniopHardSwishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyHardSwishDescriptor( + infiniopHardSwishDescriptor_t desc); + +#endif \ No newline at end of file diff --git 
a/include/infiniop/ops/hardtanh.h b/include/infiniop/ops/hardtanh.h new file mode 100644 index 000000000..62f6435aa --- /dev/null +++ b/include/infiniop/ops/hardtanh.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_HARDTANH_API_H__ +#define __INFINIOP_HARDTANH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopHardTanhDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateHardTanhDescriptor(infiniopHandle_t handle, + infiniopHardTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + float min_val, + float max_val); + +__INFINI_C __export infiniStatus_t infiniopGetHardTanhWorkspaceSize(infiniopHardTanhDescriptor_t desc, + size_t *size); + +__INFINI_C __export infiniStatus_t infiniopHardTanh(infiniopHardTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyHardTanhDescriptor(infiniopHardTanhDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index 46249178d..6fc3e5c00 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -53,6 +53,8 @@ from infinicore.ops.attention import attention from infinicore.ops.baddbmm import baddbmm from infinicore.ops.bilinear import bilinear +from infinicore.ops.cross_entropy import cross_entropy +from infinicore.ops.equal import equal from infinicore.ops.fmod import fmod from infinicore.ops.kv_caching import kv_caching from infinicore.ops.matmul import matmul @@ -132,11 +134,13 @@ "bilinear", "fmod", "matmul", + "equal", "mul", "narrow", "squeeze", "unsqueeze", "rearrange", + "cross_entropy", "empty", "empty_like", "from_blob", diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index f81b61262..a8da2dfca 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -1,7 +1,10 @@ from .adaptive_max_pool1d import adaptive_max_pool1d +from .avg_pool1d import avg_pool1d from .causal_softmax import causal_softmax from .embedding import embedding from .flash_attention import flash_attention +from .hardswish import hardswish +from .hardtanh import hardtanh from .linear import linear from .linear_w8a8i8 import linear_w8a8i8 from .random_sample import random_sample @@ -22,6 +25,9 @@ "RopeAlgo", "rope", "silu", + "hardswish", + "hardtanh", + "avg_pool1d", "swiglu", "linear_w8a8i8", "silu_and_mul", diff --git a/python/infinicore/nn/functional/avg_pool1d.py b/python/infinicore/nn/functional/avg_pool1d.py new file mode 100644 index 000000000..0cf4759ad --- /dev/null +++ b/python/infinicore/nn/functional/avg_pool1d.py @@ -0,0 +1,24 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def avg_pool1d( + input: Tensor, + kernel_size: int, + stride: int | None = None, + padding: int = 0, + *, + out=None, +) -> Tensor: + if stride is None: + stride = 0 + + if out is None: + return Tensor( + _infinicore.avg_pool1d(input._underlying, kernel_size, stride, padding) + ) + + _infinicore.avg_pool1d_( + out._underlying, input._underlying, kernel_size, stride, padding + ) + return out diff --git a/python/infinicore/nn/functional/hardswish.py b/python/infinicore/nn/functional/hardswish.py new file mode 100644 index 000000000..b054b8978 --- /dev/null +++ b/python/infinicore/nn/functional/hardswish.py @@ -0,0 +1,28 @@ +import infinicore +from 
infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def hardswish(input: Tensor, inplace: bool = False, *, out=None) -> Tensor:
+    r"""Apply the Hardswish activation function element-wise."""
+
+    if (
+        infinicore.use_ntops
+        and input.device.type in ("cuda", "musa")
+        and out is None
+        and hasattr(infinicore.ntops.torch, "hardswish")
+    ):
+        try:
+            return infinicore.ntops.torch.hardswish(input, inplace=inplace)
+        except AttributeError:
+            pass
+
+    if inplace:
+        _infinicore.hardswish_(input._underlying, input._underlying)
+        return input
+
+    if out is None:
+        return Tensor(_infinicore.hardswish(input._underlying))
+
+    _infinicore.hardswish_(out._underlying, input._underlying)
+    return out
diff --git a/python/infinicore/nn/functional/hardtanh.py b/python/infinicore/nn/functional/hardtanh.py
new file mode 100644
index 000000000..925de33d6
--- /dev/null
+++ b/python/infinicore/nn/functional/hardtanh.py
@@ -0,0 +1,46 @@
+import infinicore
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def hardtanh(
+    input: Tensor,
+    min_val: float = -1.0,
+    max_val: float = 1.0,
+    inplace: bool = False,
+    *,
+    out=None,
+) -> Tensor:
+    """Clamp the input tensor to the range [min_val, max_val]."""
+
+    if min_val > max_val:
+        raise ValueError("min_val must be less than or equal to max_val")
+
+    if (
+        infinicore.use_ntops
+        and input.device.type in ("cuda", "musa")
+        and out is None
+        and hasattr(infinicore.ntops.torch, "hardtanh")
+    ):
+        try:
+            return infinicore.ntops.torch.hardtanh(
+                input, min_val=min_val, max_val=max_val, inplace=inplace
+            )
+        except AttributeError:
+            pass
+
+    if inplace:
+        _infinicore.hardtanh_(
+            input._underlying, input._underlying, float(min_val), float(max_val)
+        )
+        return input
+
+    if out is None:
+        return Tensor(
+            _infinicore.hardtanh(input._underlying, float(min_val), float(max_val))
+        )
+
+    _infinicore.hardtanh_(
+        out._underlying, input._underlying, float(min_val), float(max_val)
+    )
+    return out
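[review note] The element-wise math these two wrappers bind is standard: hardswish(x) = x * relu6(x + 3) / 6, and hardtanh clamps to [min_val, max_val]. NumPy sketches of the intended behavior (the _ref names are illustrative, not part of the package):

    import numpy as np

    def hardswish_ref(x):
        # x * relu6(x + 3) / 6
        return x * np.clip(x + 3.0, 0.0, 6.0) / 6.0

    def hardtanh_ref(x, min_val=-1.0, max_val=1.0):
        return np.clip(x, min_val, max_val)

diff --git a/python/infinicore/ops/cross_entropy.py b/python/infinicore/ops/cross_entropy.py
new file mode 100644
index 000000000..5b47697b5
--- /dev/null
+++ b/python/infinicore/ops/cross_entropy.py
@@ -0,0 +1,33 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def cross_entropy(
+    logits,
+    target,
+    weight=None,
+    *,
+    ignore_index=None,
+    reduction="none",
+    out=None,
+):
+    """
+    Token-wise cross entropy without reduction. The output tensor has the same
+    shape as target and uses the logits dtype.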
+ """ + if weight is not None: + raise NotImplementedError("class weights are not supported yet.") + if ignore_index is not None: + raise NotImplementedError("ignore_index is not supported yet.") + if reduction not in (None, "none"): + raise NotImplementedError("Only reduction='none' is implemented.") + + if out is None: + return Tensor(_infinicore.cross_entropy(logits._underlying, target._underlying)) + + _infinicore.cross_entropy_( + out._underlying, + logits._underlying, + target._underlying, + ) + return out diff --git a/python/infinicore/ops/equal.py b/python/infinicore/ops/equal.py new file mode 100644 index 000000000..5a656ab30 --- /dev/null +++ b/python/infinicore/ops/equal.py @@ -0,0 +1,10 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def equal(input, other, *, out=None): + if out is None: + return Tensor(_infinicore.equal(input._underlying, other._underlying)) + + _infinicore.equal_(out._underlying, input._underlying, other._underlying) + return out diff --git a/python/infinicore/utils.py b/python/infinicore/utils.py index 094b2230e..e0019dc89 100644 --- a/python/infinicore/utils.py +++ b/python/infinicore/utils.py @@ -1,9 +1,13 @@ -import ml_dtypes import numpy as np import torch import infinicore +try: + import ml_dtypes +except ModuleNotFoundError: + ml_dtypes = None + def to_torch_dtype(infini_dtype): """Convert infinicore data type to PyTorch data type""" @@ -57,7 +61,9 @@ def numpy_to_infinicore_dtype(numpy_dtype): return infinicore.float64 elif numpy_dtype == np.float16: return infinicore.float16 - elif numpy_dtype == ml_dtypes.bfloat16: + elif hasattr(np, "bfloat16") and numpy_dtype == np.bfloat16: + return infinicore.bfloat16 + elif ml_dtypes is not None and numpy_dtype == ml_dtypes.bfloat16: return infinicore.bfloat16 elif numpy_dtype == np.int8: return infinicore.int8 @@ -86,6 +92,13 @@ def infinicore_to_numpy_dtype(infini_dtype): elif infini_dtype == infinicore.int16: return np.int16 elif infini_dtype == infinicore.bfloat16: + if hasattr(np, "bfloat16"): + return np.bfloat16 + if ml_dtypes is None: + raise ModuleNotFoundError( + "ml_dtypes is required for bfloat16 numpy conversion. " + "Please install ml_dtypes." 
+            )
         return ml_dtypes.bfloat16
     elif infini_dtype == infinicore.int32:
         return np.int32
diff --git a/scripts/python_test.py b/scripts/python_test.py
index 0bd8bc26d..13b69a013 100644
--- a/scripts/python_test.py
+++ b/scripts/python_test.py
@@ -17,12 +17,12 @@ def run_tests(args):
         "causal_softmax.py",
         "clip.py",
         "conv.py",
-        #"dequantize_awq.py",
+        # "dequantize_awq.py",
         "gelu.py",
         "gemm.py",
-        #"layer_norm.py",
+        # "layer_norm.py",
         "logsoftmax.py",
-        #"lp_norm.py",
+        # "lp_norm.py",
         "mul.py",
         "ones.py",
         "random_sample.py",
@@ -31,7 +31,7 @@
         "rms_norm.py",
         "rope.py",
         "sigmoid.py",
-        #"softmax.py",
+        # "softmax.py",
         "softplus.py",
         "sub.py",
         "swiglu.py",
@@ -42,6 +42,7 @@
         # "paged_attention.py",
         # "paged_caching.py",
         # "paged_attention_prefill.py"
+        "cross_entropy.py",
     ]:
         result = subprocess.run(
             f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
diff --git a/src/infinicore/ops/avg_pool1d/avg_pool1d.cc b/src/infinicore/ops/avg_pool1d/avg_pool1d.cc
new file mode 100644
index 000000000..907b25b00
--- /dev/null
+++ b/src/infinicore/ops/avg_pool1d/avg_pool1d.cc
@@ -0,0 +1,68 @@
+#include "infinicore/ops/avg_pool1d.hpp"
+
+#include "../../utils.hpp"
+
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher &AvgPool1d::dispatcher() {
+    static common::OpDispatcher dispatcher_;
+    return dispatcher_;
+}
+
+void AvgPool1d::execute(
+    Tensor output,
+    Tensor input,
+    size_t kernel_size,
+    size_t stride,
+    size_t padding) {
+
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
+    if (stride == 0) {
+        stride = kernel_size;
+    }
+
+    infinicore::context::setDevice(output->device());
+    auto device_type = output->device().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error(
+            "No AvgPool1d implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input, kernel_size, stride, padding);
+}
+
+Tensor avg_pool1d(Tensor input, size_t kernel_size, size_t stride, size_t padding) {
+    if (stride == 0) {
+        stride = kernel_size;
+    }
+
+    const auto &shape = input->shape();
+    if (shape.size() != 3) {
+        throw std::runtime_error("AvgPool1d expects tensors with shape [N, C, L]");
+    }
+
+    const size_t n = shape[0];
+    const size_t c = shape[1];
+    const size_t l_in = shape[2];
+
+    if (l_in + 2 * padding < kernel_size) {
+        throw std::runtime_error("AvgPool1d kernel_size is larger than padded length");
+    }
+
+    const size_t out_width = (l_in + 2 * padding - kernel_size) / stride + 1;
+
+    Shape out_shape = {n, c, out_width};
+    auto output = Tensor::empty(out_shape, input->dtype(), input->device());
+    avg_pool1d_(output, input, kernel_size, stride, padding);
+    return output;
+}
+
+void avg_pool1d_(Tensor output, Tensor input, size_t kernel_size, size_t stride, size_t padding) {
+    AvgPool1d::execute(output, input, kernel_size, stride, padding);
+}
+
+} // namespace infinicore::op
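[review note] Summary of the AvgPool1d contract implemented above: stride == 0 is a sentinel meaning stride = kernel_size; the output length is (L + 2*padding - kernel_size) / stride + 1; and padded positions count toward the divisor, since the kernels always divide by kernel_size (matching PyTorch's count_include_pad=True default). A NumPy sketch of that reference behavior for float inputs, illustration only:

    import numpy as np

    def avg_pool1d_ref(x, kernel_size, stride=None, padding=0):
        stride = stride or kernel_size  # stride=0/None falls back to kernel_size
        n, c, l = x.shape
        out_w = (l + 2 * padding - kernel_size) // stride + 1
        xp = np.pad(x, ((0, 0), (0, 0), (padding, padding)))  # zero padding
        y = np.empty((n, c, out_w), dtype=x.dtype)
        for ow in range(out_w):
            win = xp[:, :, ow * stride : ow * stride + kernel_size]
            y[:, :, ow] = win.sum(axis=-1) / kernel_size  # count_include_pad=True
        return y

In the moved hunk below, the garbled include and OpCache template arguments are restored on a best-effort basis (<infiniop.h> and <size_t, infiniopAvgPool1dDescriptor_t>); treat those reconstructions as assumptions.

diff --git a/src/infinicore/ops/avg_pool1d/avg_pool1d_infiniop.cc b/src/infinicore/ops/avg_pool1d/avg_pool1d_infiniop.cc
new file mode 100644
index 000000000..df7ebda8d
--- /dev/null
+++ b/src/infinicore/ops/avg_pool1d/avg_pool1d_infiniop.cc
@@ -0,0 +1,69 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/avg_pool1d.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::avg_pool1d_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopAvgPool1dDescriptor_t> caches(
+    100,
+    [](infiniopAvgPool1dDescriptor_t &desc) {
+        if (desc != nullptr) {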
+            INFINICORE_CHECK_ERROR(infiniopDestroyAvgPool1dDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(
+    Tensor output,
+    Tensor input,
+    size_t kernel_size,
+    size_t stride,
+    size_t padding) {
+
+    if (stride == 0) {
+        stride = kernel_size;
+    }
+
+    size_t seed = hash_combine(output, input, kernel_size, stride, padding);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopAvgPool1dDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateAvgPool1dDescriptor(
+            context::getInfiniopHandle(device),
+            &desc,
+            output->desc(),
+            input->desc(),
+            kernel_size,
+            stride,
+            padding));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetAvgPool1dWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopAvgPool1d(
+        desc,
+        workspace->data(),
+        workspace_size,
+        output->data(),
+        input->data(),
+        context::getStream()));
+}
+
+static bool registered = []() {
+    AvgPool1d::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::avg_pool1d_impl::infiniop
diff --git a/src/infinicore/ops/cross_entropy/cross_entropy.cc b/src/infinicore/ops/cross_entropy/cross_entropy.cc
new file mode 100644
index 000000000..9804d5377
--- /dev/null
+++ b/src/infinicore/ops/cross_entropy/cross_entropy.cc
@@ -0,0 +1,45 @@
+#include "infinicore/ops/cross_entropy.hpp"
+
+#include "../../utils.hpp"
+
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher &CrossEntropy::dispatcher() {
+    static common::OpDispatcher dispatcher_;
+    return dispatcher_;
+}
+
+void CrossEntropy::execute(Tensor output, Tensor input, Tensor target) {
+
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(input, target);
+
+    infinicore::context::setDevice(output->device());
+    auto device_type = output->device().getType();
+
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No CrossEntropy implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input, target);
+}
+
+Tensor cross_entropy(Tensor input, Tensor target) {
+
+    Shape shape = target->shape();
+
+    auto output = Tensor::empty(shape, input->dtype(), input->device());
+
+    cross_entropy_(output, input, target);
+    return output;
+}
+
+void cross_entropy_(Tensor output, Tensor input, Tensor target) {
+    CrossEntropy::execute(output, input, target);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
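[review note] Each *_infiniop.cc glue file in this patch follows the same pattern: a thread-local, per-device cache of infiniop descriptors keyed by a hash of the tensors and scalar parameters, so repeated calls with identical shapes, strides, and dtypes reuse a descriptor instead of recreating it, and evicted entries are destroyed by the supplied callback. A minimal Python sketch of that caching policy (LRU of capacity 100, as above; names are illustrative):

    from collections import OrderedDict

    class OpCacheSketch:
        def __init__(self, capacity=100, destroy=lambda desc: None):
            self._lru = OrderedDict()
            self._capacity = capacity
            self._destroy = destroy

        def get(self, key):
            if key in self._lru:
                self._lru.move_to_end(key)  # mark as most recently used
                return self._lru[key]
            return None

        def put(self, key, desc):
            self._lru[key] = desc
            self._lru.move_to_end(key)
            if len(self._lru) > self._capacity:
                _, evicted = self._lru.popitem(last=False)  # drop oldest
                self._destroy(evicted)  # free the evicted descriptor

As in the previous glue file, the include and OpCache template arguments below are best-effort reconstructions.

diff --git a/src/infinicore/ops/cross_entropy/cross_entropy_infiniop.cc b/src/infinicore/ops/cross_entropy/cross_entropy_infiniop.cc
new file mode 100644
index 000000000..d02f16da6
--- /dev/null
+++ b/src/infinicore/ops/cross_entropy/cross_entropy_infiniop.cc
@@ -0,0 +1,64 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/cross_entropy.hpp"
+
+#include <infiniop.h>
+
+namespace infinicore::op::cross_entropy_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopCrossEntropyDescriptor_t> caches(
+    100,
+    [](infiniopCrossEntropyDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyCrossEntropyDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input, Tensor target) {
+
+    size_t seed = hash_combine(output, input,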
target); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + auto desc_opt = cache.get(seed); + infiniopCrossEntropyDescriptor_t desc = nullptr; + + if (!desc_opt) { + + INFINICORE_CHECK_ERROR(infiniopCreateCrossEntropyDescriptor( + context::getInfiniopHandle(device), + &desc, + output->desc(), + input->desc(), + target->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetCrossEntropyWorkspaceSize(desc, &workspace_size)); + + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopCrossEntropy( + desc, + workspace->data(), + workspace_size, + output->data(), + input->data(), + target->data(), + context::getStream())); +} + +static bool registered = []() { + CrossEntropy::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::cross_entropy_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/equal/equal.cc b/src/infinicore/ops/equal/equal.cc new file mode 100644 index 000000000..b6acc4d25 --- /dev/null +++ b/src/infinicore/ops/equal/equal.cc @@ -0,0 +1,31 @@ +#include "infinicore/ops/equal.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Equal::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Equal::execute(Tensor out, Tensor a, Tensor b) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, a, b); + infinicore::context::setDevice(out->device()); + dispatcher().lookup(out->device().getType())(out, a, b); +} + +Tensor equal(Tensor a, Tensor b) { + auto out = Tensor::empty(a->shape(), DataType::BOOL, a->device()); + equal_(out, a, b); + return out; +} + +void equal_(Tensor out, Tensor a, Tensor b) { + if (out->dtype() != DataType::BOOL) { + throw std::runtime_error("Equal expects bool output tensor."); + } + Equal::execute(out, a, b); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/equal/equal_infiniop.cc b/src/infinicore/ops/equal/equal_infiniop.cc new file mode 100644 index 000000000..1b4e4cffa --- /dev/null +++ b/src/infinicore/ops/equal/equal_infiniop.cc @@ -0,0 +1,57 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/equal.hpp" +#include + +namespace infinicore::op::equal_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopEqualDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyEqualDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor out, Tensor a, Tensor b) { + size_t seed = hash_combine(out, a, b); + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + infiniopEqualDescriptor_t desc = nullptr; + if (auto cached = cache.get(seed)) { + desc = *cached; + } else { + INFINICORE_CHECK_ERROR(infiniopCreateEqualDescriptor( + context::getInfiniopHandle(device), &desc, + out->desc(), a->desc(), b->desc())); + cache.put(seed, desc); + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetEqualWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace; + void *workspace_ptr = nullptr; + if (workspace_size != 0) { + workspace = context::allocateMemory(workspace_size); + workspace_ptr = workspace->data(); + } + + INFINICORE_CHECK_ERROR(infiniopEqual( + desc, + workspace_ptr, + workspace_size, + out->data(), + a->data(), + b->data(), + context::getStream())); 
+} + +static bool registered = []() { + Equal::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::equal_impl::infiniop diff --git a/src/infinicore/ops/hardswish/hardswish.cc b/src/infinicore/ops/hardswish/hardswish.cc new file mode 100644 index 000000000..ec8db75ff --- /dev/null +++ b/src/infinicore/ops/hardswish/hardswish.cc @@ -0,0 +1,38 @@ +#include "infinicore/ops/hardswish.hpp" + +#include "../../utils.hpp" + +#include + +namespace infinicore::op { + +common::OpDispatcher &Hardswish::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void Hardswish::execute(Tensor output, Tensor input) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); + infinicore::context::setDevice(output->device()); + auto device_type = output->device().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error( + "No Hardswish implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor hardswish(Tensor input) { + auto output = Tensor::empty(input->shape(), input->dtype(), input->device()); + hardswish_(output, input); + return output; +} + +void hardswish_(Tensor output, Tensor input) { + Hardswish::execute(output, input); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/hardswish/hardswish_infiniop.cc b/src/infinicore/ops/hardswish/hardswish_infiniop.cc new file mode 100644 index 000000000..44d4054e8 --- /dev/null +++ b/src/infinicore/ops/hardswish/hardswish_infiniop.cc @@ -0,0 +1,61 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/hardswish.hpp" +#include + +namespace infinicore::op::hardswish_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopHardSwishDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyHardSwishDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + auto desc_opt = cache.get(seed); + infiniopHardSwishDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateHardSwishDescriptor( + context::getInfiniopHandle(device), + &desc, + output->desc(), + input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetHardSwishWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace; + void *workspace_ptr = nullptr; + if (workspace_size != 0) { + workspace = context::allocateMemory(workspace_size); + workspace_ptr = workspace->data(); + } + + INFINICORE_CHECK_ERROR(infiniopHardSwish( + desc, + workspace_ptr, + workspace_size, + output->data(), + input->data(), + context::getStream())); +} + +static bool registered = []() { + Hardswish::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::hardswish_impl::infiniop diff --git a/src/infinicore/ops/hardtanh/hardtanh.cc b/src/infinicore/ops/hardtanh/hardtanh.cc new file mode 100644 index 000000000..5a4df2142 --- /dev/null +++ b/src/infinicore/ops/hardtanh/hardtanh.cc @@ -0,0 +1,38 @@ +#include "infinicore/ops/hardtanh.hpp" + +#include "../../utils.hpp" + +#include + +namespace infinicore::op { + +common::OpDispatcher &HardTanh::dispatcher() { + static 
common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void HardTanh::execute(Tensor output, Tensor input, float min_val, float max_val) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); + infinicore::context::setDevice(output->device()); + + auto device_type = output->device().getType(); + auto func = dispatcher().lookup(device_type); + if (func == nullptr) { + throw std::runtime_error( + "No HardTanh implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input, min_val, max_val); +} + +Tensor hardtanh(Tensor input, float min_val, float max_val) { + auto output = Tensor::empty(input->shape(), input->dtype(), input->device()); + hardtanh_(output, input, min_val, max_val); + return output; +} + +void hardtanh_(Tensor output, Tensor input, float min_val, float max_val) { + HardTanh::execute(output, input, min_val, max_val); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/hardtanh/hardtanh_infiniop.cc b/src/infinicore/ops/hardtanh/hardtanh_infiniop.cc new file mode 100644 index 000000000..d8af439d8 --- /dev/null +++ b/src/infinicore/ops/hardtanh/hardtanh_infiniop.cc @@ -0,0 +1,63 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/hardtanh.hpp" +#include + +namespace infinicore::op::hardtanh_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopHardTanhDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyHardTanhDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input, float min_val, float max_val) { + size_t seed = hash_combine(output, input, min_val, max_val); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + auto desc_opt = cache.get(seed); + infiniopHardTanhDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateHardTanhDescriptor( + context::getInfiniopHandle(device), + &desc, + output->desc(), + input->desc(), + min_val, + max_val)); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetHardTanhWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace; + void *workspace_ptr = nullptr; + if (workspace_size != 0) { + workspace = context::allocateMemory(workspace_size); + workspace_ptr = workspace->data(); + } + + INFINICORE_CHECK_ERROR(infiniopHardTanh( + desc, + workspace_ptr, + workspace_size, + output->data(), + input->data(), + context::getStream())); +} + +static bool registered = []() { + HardTanh::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::hardtanh_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 1d0ace555..8a83c02f8 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -7,12 +7,17 @@ #include "ops/add_rms_norm.hpp" #include "ops/asinh.hpp" #include "ops/attention.hpp" +#include "ops/avg_pool1d.hpp" #include "ops/baddbmm.hpp" #include "ops/bilinear.hpp" #include "ops/causal_softmax.hpp" +#include "ops/cross_entropy.hpp" #include "ops/embedding.hpp" +#include "ops/equal.hpp" #include "ops/flash_attention.hpp" #include "ops/fmod.hpp" +#include "ops/hardswish.hpp" +#include "ops/hardtanh.hpp" #include "ops/kv_caching.hpp" #include "ops/linear.hpp" #include "ops/linear_w8a8i8.hpp" @@ -51,18 +56,23 @@ inline void bind(py::module &m) { 
bind_matmul(m); bind_mul(m); bind_mha_varlen(m); + bind_hardswish(m); + bind_hardtanh(m); bind_paged_attention(m); bind_paged_attention_prefill(m); bind_paged_caching(m); bind_random_sample(m); + bind_cross_entropy(m); bind_rearrange(m); bind_rms_norm(m); + bind_avg_pool1d(m); bind_silu(m); bind_swiglu(m); bind_rope(m); bind_embedding(m); bind_linear_w8a8i8(m); bind_silu_and_mul(m); + bind_equal(m); } } // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/avg_pool1d.hpp b/src/infinicore/pybind11/ops/avg_pool1d.hpp new file mode 100644 index 000000000..32394552a --- /dev/null +++ b/src/infinicore/pybind11/ops/avg_pool1d.hpp @@ -0,0 +1,37 @@ +#pragma once + +#include +#include + +#include "infinicore/ops/avg_pool1d.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_avg_pool1d(py::module &m) { + m.def( + "avg_pool1d", + [](::infinicore::Tensor input, size_t kernel_size, std::optional stride, size_t padding) { + return op::avg_pool1d(input, kernel_size, stride.value_or(0), padding); + }, + py::arg("input"), + py::arg("kernel_size"), + py::arg("stride") = py::none(), + py::arg("padding") = 0, + R"doc(AvgPool1d out-of-place.)doc"); + + m.def( + "avg_pool1d_", + [](::infinicore::Tensor output, ::infinicore::Tensor input, size_t kernel_size, std::optional stride, size_t padding) { + op::avg_pool1d_(output, input, kernel_size, stride.value_or(0), padding); + }, + py::arg("output"), + py::arg("input"), + py::arg("kernel_size"), + py::arg("stride") = py::none(), + py::arg("padding") = 0, + R"doc(AvgPool1d in-place variant writing to provided output tensor.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/cross_entropy.hpp b/src/infinicore/pybind11/ops/cross_entropy.hpp new file mode 100644 index 000000000..8105642a6 --- /dev/null +++ b/src/infinicore/pybind11/ops/cross_entropy.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include + +#include "infinicore/ops/cross_entropy.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_cross_entropy(py::module &m) { + m.def("cross_entropy", + &op::cross_entropy, + py::arg("logits"), + py::arg("target"), + R"doc(Token-wise cross entropy loss without reduction.)doc"); + + m.def("cross_entropy_", + &op::cross_entropy_, + py::arg("loss"), + py::arg("logits"), + py::arg("target"), + R"doc(Write cross entropy loss into a provided tensor.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/equal.hpp b/src/infinicore/pybind11/ops/equal.hpp new file mode 100644 index 000000000..d14a6b61d --- /dev/null +++ b/src/infinicore/pybind11/ops/equal.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include + +#include "infinicore/ops/equal.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_equal(py::module &m) { + m.def("equal", + &op::equal, + py::arg("a"), + py::arg("b"), + R"doc(Elementwise equality returning a bool tensor.)doc"); + + m.def("equal_", + &op::equal_, + py::arg("out"), + py::arg("a"), + py::arg("b"), + R"doc(In-place elementwise equality writing into `out`.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/hardswish.hpp b/src/infinicore/pybind11/ops/hardswish.hpp new file mode 100644 index 000000000..daaccec62 --- /dev/null +++ b/src/infinicore/pybind11/ops/hardswish.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/hardswish.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_hardswish(py::module &m) { + 
m.def("hardswish", + &op::hardswish, + py::arg("input"), + R"doc(Out-of-place Hardswish activation.)doc"); + + m.def("hardswish_", + &op::hardswish_, + py::arg("output"), + py::arg("input"), + R"doc(In-place Hardswish activation.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/hardtanh.hpp b/src/infinicore/pybind11/ops/hardtanh.hpp new file mode 100644 index 000000000..ff9abb872 --- /dev/null +++ b/src/infinicore/pybind11/ops/hardtanh.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include + +#include "infinicore/ops/hardtanh.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_hardtanh(py::module &m) { + m.def("hardtanh", + &op::hardtanh, + py::arg("input"), + py::arg("min_val") = -1.0f, + py::arg("max_val") = 1.0f, + R"doc(Apply the HardTanh activation.)doc"); + + m.def("hardtanh_", + &op::hardtanh_, + py::arg("output"), + py::arg("input"), + py::arg("min_val") = -1.0f, + py::arg("max_val") = 1.0f, + R"doc(In-place HardTanh activation.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infiniop/ops/avg_pool1d/avg_pool1d.h b/src/infiniop/ops/avg_pool1d/avg_pool1d.h new file mode 100644 index 000000000..fae5f445b --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/avg_pool1d.h @@ -0,0 +1,103 @@ +#ifndef __AVG_POOL1D_H__ +#define __AVG_POOL1D_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "infiniop/ops/avg_pool1d.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::avg_pool1d::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + AvgPool1dInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + AvgPool1dInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + size_t kernel_size, \ + size_t stride, \ + size_t padding); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *y, \ + const void *x, \ + void *stream) const; \ + }; \ + } + +class AvgPool1dInfo { +private: + AvgPool1dInfo() = default; + +public: + infiniDtype_t dtype; + size_t batch, channels, in_width, out_width; + size_t kernel_size, stride, padding; + + ptrdiff_t y_stride_batch, y_stride_channel, y_stride_width; + ptrdiff_t x_stride_batch, x_stride_channel, x_stride_width; + + static utils::Result createAvgPool1dInfo( + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t kernel_size, + size_t stride, + size_t padding) { + + CHECK_OR_RETURN(y_desc != nullptr && x_desc != nullptr, INFINI_STATUS_NULL_POINTER); + + const infiniDtype_t dtype = y_desc->dtype(); + CHECK_OR_RETURN(dtype == x_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_OR_RETURN(y_desc->ndim() == 3 && x_desc->ndim() == 3, INFINI_STATUS_BAD_TENSOR_SHAPE); + + size_t batch = x_desc->dim(0); + size_t channels = x_desc->dim(1); + size_t in_width = x_desc->dim(2); + + CHECK_OR_RETURN(y_desc->dim(0) == batch, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(y_desc->dim(1) 
== channels, INFINI_STATUS_BAD_TENSOR_SHAPE); + + size_t padded_len = in_width + 2 * padding; + + CHECK_OR_RETURN(padded_len >= kernel_size, INFINI_STATUS_BAD_TENSOR_SHAPE); + + size_t expected_out_width = (padded_len - kernel_size) / stride + 1; + CHECK_OR_RETURN(y_desc->dim(2) == expected_out_width, INFINI_STATUS_BAD_TENSOR_SHAPE); + + size_t out_width = expected_out_width; + + return utils::Result(AvgPool1dInfo{ + dtype, + batch, channels, in_width, out_width, + kernel_size, stride, padding, + y_desc->stride(0), y_desc->stride(1), y_desc->stride(2), + x_desc->stride(0), x_desc->stride(1), x_desc->stride(2)}); + } +}; + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.cc b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.cc new file mode 100644 index 000000000..67e5b6623 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.cc @@ -0,0 +1,96 @@ +#include "avg_pool1d_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include + +namespace op::avg_pool1d::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t kernel_size, + size_t stride, + size_t padding) { + + auto handle = reinterpret_cast(handle_); + + auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + info.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateAvgPool1d(const AvgPool1dInfo &info, + T *y, + const T *x) { + const float inv_kernel = 1.0f / static_cast(info.kernel_size); + +#pragma omp parallel for + for (ptrdiff_t bc = 0; bc < ptrdiff_t(info.batch * info.channels); ++bc) { + + ptrdiff_t b = bc / info.channels; + ptrdiff_t c = bc % info.channels; + + size_t y_base = b * info.y_stride_batch + c * info.y_stride_channel; + size_t x_base = b * info.x_stride_batch + c * info.x_stride_channel; + + for (size_t ow = 0; ow < info.out_width; ++ow) { + size_t y_offset = y_base + ow * info.y_stride_width; + + long long start_w = static_cast(ow * info.stride) - info.padding; + long long end_w = start_w + info.kernel_size; + + long long valid_start = std::max(0LL, start_w); + long long valid_end = std::min(static_cast(info.in_width), end_w); + + float sum = 0.0f; + for (long long iw = valid_start; iw < valid_end; ++iw) { + size_t x_offset = x_base + iw * info.x_stride_width; + sum += utils::cast(x[x_offset]); + } + + const float avg = sum * inv_kernel; + y[y_offset] = utils::cast(avg); + } + } + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE(TDATA) calculateAvgPool1d(_info, (TDATA *)y, (const TDATA *)x) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + switch (_info.dtype) { + case INFINI_DTYPE_F16: + return CALCULATE(fp16_t); + case INFINI_DTYPE_BF16: + return CALCULATE(bf16_t); + case INFINI_DTYPE_F32: + return CALCULATE(float); + case INFINI_DTYPE_F64: + return CALCULATE(double); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE + +} // namespace op::avg_pool1d::cpu diff --git a/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h new file mode 100644 index 000000000..2335733db --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h @@ -0,0 +1,8 @@ 
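[review note] A quick cross-check of the shape contract enforced by AvgPool1dInfo above, using PyTorch as the oracle; shapes here are hypothetical, and torch.nn.functional.avg_pool1d's count_include_pad=True default matches the divide-by-kernel_size behavior of the CPU kernel:

    import torch

    x = torch.randn(2, 4, 10)
    k, s, p = 3, 2, 1
    expected_w = (10 + 2 * p - k) // s + 1  # -> 5, same formula as createAvgPool1dInfo
    y = torch.nn.functional.avg_pool1d(x, k, stride=s, padding=p)
    assert y.shape == (2, 4, expected_w)

diff --git a/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h
new file mode 100644
index 000000000..2335733db
--- /dev/null
+++ b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h
@@ -0,0 +1,8 @@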
+#ifndef __INFINIOP_AVG_POOL1D_CPU_H__ +#define __INFINIOP_AVG_POOL1D_CPU_H__ + +#include "../avg_pool1d.h" + +DESCRIPTOR(cpu) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh b/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh new file mode 100644 index 000000000..36a11acfc --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh @@ -0,0 +1,58 @@ +#ifndef __INFINIOP_AVG_POOL1D_CUDA_KERNEL_CUH__ +#define __INFINIOP_AVG_POOL1D_CUDA_KERNEL_CUH__ + +template +__device__ void avgPool1dKernel( + T *y, + const T *x, + size_t batch, + size_t channels, + size_t in_width, + size_t out_width, + size_t kernel_size, + size_t stride, + size_t padding, + + ptrdiff_t y_stride_batch, + ptrdiff_t y_stride_channel, + ptrdiff_t y_stride_width, + ptrdiff_t x_stride_batch, + ptrdiff_t x_stride_channel, + ptrdiff_t x_stride_width) { + + size_t total_elements = batch * channels * out_width; + + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < total_elements; + idx += gridDim.x * blockDim.x) { + + size_t ow = idx % out_width; + size_t temp = idx / out_width; + size_t c = temp % channels; + size_t b = temp / channels; + + size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width; + + long long start_w = static_cast(ow * stride) - padding; + + T sum = 0; + + for (size_t k = 0; k < kernel_size; ++k) { + long long iw = start_w + k; + + if (iw >= 0 && iw < static_cast(in_width)) { + size_t x_offset = b * x_stride_batch + c * x_stride_channel + iw * x_stride_width; + sum += x[x_offset]; + } + } + +#if defined(ENABLE_ILUVATAR_API) + // Iluvatar __half doesn't accept size_t directly. + y[y_offset] = sum / static_cast(static_cast(kernel_size)); +#else + y[y_offset] = sum / static_cast(kernel_size); +#endif + } +} + +#endif diff --git a/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h b/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h new file mode 100644 index 000000000..576da66de --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h @@ -0,0 +1,8 @@ +#ifndef __INFINIOP_AVG_POOL1D_METAX_H__ +#define __INFINIOP_AVG_POOL1D_METAX_H__ + +#include "../avg_pool1d.h" + +DESCRIPTOR(metax) + +#endif // __INFINIOP_AVG_POOL1D_METAX_H__ diff --git a/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca b/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca new file mode 100644 index 000000000..9b3f15b9a --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca @@ -0,0 +1,170 @@ +#include "../../../devices/metax/metax_common.h" +#include "avg_pool1d_metax.h" +#include "../../../devices/metax/metax_kernel_common.h" + +#include + +namespace op::avg_pool1d::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t kernel_size, + size_t stride, + size_t padding) { + + auto handle = reinterpret_cast(handle_); + + auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + info.take(), + 0, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +__device__ __forceinline__ Tdata castToOutput(Tcompute val) { + if constexpr (std::is_same_v) { + return __float2half(static_cast(val)); + } else if constexpr (std::is_same_v) 
{ + return __float2bfloat16(static_cast(val)); + } else { + return static_cast(val); + } +} + +template +INFINIOP_METAX_KERNEL avgPool1dGlobalKernel( + Tdata *y, + const Tdata *x, + size_t batch, + size_t channels, + size_t in_width, + size_t out_width, + size_t kernel_size, + size_t stride, + size_t padding, + ptrdiff_t y_stride_batch, + ptrdiff_t y_stride_channel, + ptrdiff_t y_stride_width, + ptrdiff_t x_stride_batch, + ptrdiff_t x_stride_channel, + ptrdiff_t x_stride_width) { + + size_t total_elements = batch * channels * out_width; + Tcompute inv_kernel = Tcompute(1) / static_cast(kernel_size); + + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < total_elements; + idx += gridDim.x * blockDim.x) { + + size_t ow = idx % out_width; + size_t temp = idx / out_width; + size_t c = temp % channels; + size_t b = temp / channels; + + size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width; + size_t x_base = b * x_stride_batch + c * x_stride_channel; + + long long start_w = static_cast(ow * stride) - static_cast(padding); + long long end_w = start_w + static_cast(kernel_size); + long long iw_start = start_w < 0 ? 0 : start_w; + long long iw_end = end_w > static_cast(in_width) ? static_cast(in_width) : end_w; + + Tcompute sum = Tcompute(0); + if (iw_start < iw_end) { + size_t x_offset = x_base + static_cast(iw_start) * x_stride_width; + for (long long iw = iw_start; iw < iw_end; ++iw) { + sum += static_cast(x[x_offset]); + x_offset += x_stride_width; + } + } + + y[y_offset] = castToOutput(sum * inv_kernel); + } +} + +template +infiniStatus_t calculateAvgPool1d( + const AvgPool1dInfo &info, + int max_threads_per_block, + Tdata *y, + const Tdata *x, + hcStream_t stream) { + + size_t total_elements = info.batch * info.channels * info.out_width; + + int block_size = 256; + if (max_threads_per_block > 0 && max_threads_per_block < block_size) { + block_size = max_threads_per_block; + } + + size_t grid_size = (total_elements + block_size - 1) / block_size; + if (grid_size > 65535) { + grid_size = 65535; + } + + avgPool1dGlobalKernel<<>>( + y, x, + info.batch, info.channels, info.in_width, info.out_width, + info.kernel_size, info.stride, info.padding, + info.y_stride_batch, info.y_stride_channel, info.y_stride_width, + info.x_stride_batch, info.x_stride_channel, info.x_stride_width); + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE(TDATA, TCOMPUTE) \ + calculateAvgPool1d( \ + _info, \ + _opaque->internal->maxThreadsPerBlock(), \ + (TDATA *)y, \ + (const TDATA *)x, \ + (hcStream_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + (void)workspace; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_info.dtype) { + case INFINI_DTYPE_F16: + return CALCULATE(half, float); + case INFINI_DTYPE_BF16: + return CALCULATE(cuda_bfloat16, float); + case INFINI_DTYPE_F32: + return CALCULATE(float, float); + case INFINI_DTYPE_F64: + return CALCULATE(double, double); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE + +} // namespace op::avg_pool1d::metax diff --git a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h new file mode 100644 index 000000000..9034d7358 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h @@ -0,0 +1,72 @@ +#ifndef __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__ +#define 
__INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__ + +#include + +namespace op::avg_pool1d::moore { + +template +__device__ __forceinline__ Tdata castToOutput(Tcompute val) { + if constexpr (std::is_same_v) { + return __float2half(static_cast(val)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(static_cast(val)); + } else { + return static_cast(val); + } +} + +template +__device__ void avgPool1dKernel( + Tdata *y, + const Tdata *x, + size_t batch, + size_t channels, + size_t in_width, + size_t out_width, + size_t kernel_size, + size_t stride, + size_t padding, + ptrdiff_t y_stride_batch, + ptrdiff_t y_stride_channel, + ptrdiff_t y_stride_width, + ptrdiff_t x_stride_batch, + ptrdiff_t x_stride_channel, + ptrdiff_t x_stride_width) { + + size_t total_elements = batch * channels * out_width; + Tcompute inv_kernel = Tcompute(1) / static_cast(kernel_size); + + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < total_elements; + idx += gridDim.x * blockDim.x) { + + size_t ow = idx % out_width; + size_t temp = idx / out_width; + size_t c = temp % channels; + size_t b = temp / channels; + + size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width; + size_t x_base = b * x_stride_batch + c * x_stride_channel; + + long long start_w = static_cast(ow * stride) - static_cast(padding); + long long end_w = start_w + static_cast(kernel_size); + long long iw_start = start_w < 0 ? 0 : start_w; + long long iw_end = end_w > static_cast(in_width) ? static_cast(in_width) : end_w; + + Tcompute sum = Tcompute(0); + if (iw_start < iw_end) { + size_t x_offset = x_base + static_cast(iw_start) * x_stride_width; + for (long long iw = iw_start; iw < iw_end; ++iw) { + sum += static_cast(x[x_offset]); + x_offset += x_stride_width; + } + } + + y[y_offset] = castToOutput(sum * inv_kernel); + } +} + +} // namespace op::avg_pool1d::moore + +#endif // __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h new file mode 100644 index 000000000..604d06012 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h @@ -0,0 +1,8 @@ +#ifndef __INFINIOP_AVG_POOL1D_MOORE_H__ +#define __INFINIOP_AVG_POOL1D_MOORE_H__ + +#include "../avg_pool1d.h" + +DESCRIPTOR(moore) + +#endif // __INFINIOP_AVG_POOL1D_MOORE_H__ diff --git a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu new file mode 100644 index 000000000..518d249b9 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu @@ -0,0 +1,135 @@ +#include "../../../devices/moore/moore_common.h" +#include "avg_pool1d_moore.h" + +#include "../../../devices/moore/moore_kernel_common.h" + +#include "avg_pool1d_kernel.h" + +namespace op::avg_pool1d::moore { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t kernel_size, + size_t stride, + size_t padding) { + + auto handle = reinterpret_cast(handle_); + + auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + info.take(), + 0, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +INFINIOP_MOORE_KERNEL 
avgPool1dGlobalKernel( + Tdata *y, + const Tdata *x, + size_t batch, + size_t channels, + size_t in_width, + size_t out_width, + size_t kernel_size, + size_t stride, + size_t padding, + ptrdiff_t y_stride_batch, + ptrdiff_t y_stride_channel, + ptrdiff_t y_stride_width, + ptrdiff_t x_stride_batch, + ptrdiff_t x_stride_channel, + ptrdiff_t x_stride_width) { + + avgPool1dKernel( + y, x, + batch, channels, in_width, out_width, + kernel_size, stride, padding, + y_stride_batch, y_stride_channel, y_stride_width, + x_stride_batch, x_stride_channel, x_stride_width); +} + +template +infiniStatus_t calculateAvgPool1d( + const AvgPool1dInfo &info, + int max_threads_per_block, + Tdata *y, + const Tdata *x, + musaStream_t stream) { + + size_t total_elements = info.batch * info.channels * info.out_width; + + int block_size = 256; + if (max_threads_per_block > 0 && max_threads_per_block < block_size) { + block_size = max_threads_per_block; + } + + size_t grid_size = (total_elements + block_size - 1) / block_size; + if (grid_size > 65535) { + grid_size = 65535; + } + + avgPool1dGlobalKernel<<>>( + y, x, + info.batch, info.channels, info.in_width, info.out_width, + info.kernel_size, info.stride, info.padding, + info.y_stride_batch, info.y_stride_channel, info.y_stride_width, + info.x_stride_batch, info.x_stride_channel, info.x_stride_width); + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE(TDATA, TCOMPUTE) \ + calculateAvgPool1d(\ + _info,\ + _opaque->internal->maxThreadsPerBlock(),\ + (TDATA *)y,\ + (const TDATA *)x,\ + (musaStream_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + (void)workspace; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_info.dtype) { + case INFINI_DTYPE_F16: + return CALCULATE(half, float); + case INFINI_DTYPE_BF16: + return CALCULATE(cuda_bfloat16, float); + case INFINI_DTYPE_F32: + return CALCULATE(float, float); + case INFINI_DTYPE_F64: + return CALCULATE(double, double); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE + +} // namespace op::avg_pool1d::moore diff --git a/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu b/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu new file mode 100644 index 000000000..202d4b8e9 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu @@ -0,0 +1,126 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "avg_pool1d_nvidia.cuh" + +template +__global__ void avgPool1dGlobalKernel( + T *y, + const T *x, + size_t batch, + size_t channels, + size_t in_width, + size_t out_width, + size_t kernel_size, + size_t stride, + size_t padding, + ptrdiff_t y_stride_batch, + ptrdiff_t y_stride_channel, + ptrdiff_t y_stride_width, + ptrdiff_t x_stride_batch, + ptrdiff_t x_stride_channel, + ptrdiff_t x_stride_width) { + + avgPool1dKernel( + y, x, + batch, channels, in_width, out_width, + kernel_size, stride, padding, + y_stride_batch, y_stride_channel, y_stride_width, + x_stride_batch, x_stride_channel, x_stride_width); +} + +namespace op::avg_pool1d::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t 
x_desc, + size_t kernel_size, + size_t stride, + size_t padding) { + + auto handle = reinterpret_cast(handle_); + + auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + info.take(), + 0, + new Opaque{reinterpret_cast(handle)->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateAvgPool1d( + const AvgPool1dInfo &info, + int max_threads_per_block, + T *y, + const T *x, + cudaStream_t stream) { + + size_t total_elements = info.batch * info.channels * info.out_width; + + int block_size = 256; + if (max_threads_per_block > 0 && max_threads_per_block < 256) { + block_size = max_threads_per_block; + } + + size_t grid_size = (total_elements + block_size - 1) / block_size; + if (grid_size > 65535) { + grid_size = 65535; + } + + avgPool1dGlobalKernel<<>>( + y, x, + info.batch, info.channels, info.in_width, info.out_width, + info.kernel_size, info.stride, info.padding, + info.y_stride_batch, info.y_stride_channel, info.y_stride_width, + info.x_stride_batch, info.x_stride_channel, info.x_stride_width); + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE(TDATA) \ + calculateAvgPool1d(_info, \ + _opaque->internal->maxThreadsPerBlock(), \ + (TDATA *)y, \ + (const TDATA *)x, \ + (cudaStream_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + switch (_info.dtype) { + case INFINI_DTYPE_F16: + return CALCULATE(half); + case INFINI_DTYPE_BF16: + return CALCULATE(cuda_bfloat16); + case INFINI_DTYPE_F32: + return CALCULATE(float); + case INFINI_DTYPE_F64: + return CALCULATE(double); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE + +} // namespace op::avg_pool1d::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh b/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh new file mode 100644 index 000000000..629e745d7 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __INFINIOP_AVG_POOL1D_CUDA_H__ +#define __INFINIOP_AVG_POOL1D_CUDA_H__ + +#include "../avg_pool1d.h" + +DESCRIPTOR(nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/avg_pool1d/operator.cc b/src/infiniop/ops/avg_pool1d/operator.cc new file mode 100644 index 000000000..c3696daa1 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/operator.cc @@ -0,0 +1,225 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/avg_pool1d.h" + +#ifdef ENABLE_CPU_API +#include "cpu/avg_pool1d_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/avg_pool1d_nvidia.cuh" +#endif +#ifdef ENABLE_ASCEND_API +#include "ascend/avg_pool1d_ascend.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/avg_pool1d_bang.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/avg_pool1d_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/avg_pool1d_kunlun.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/avg_pool1d_moore.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateAvgPool1dDescriptor( + infiniopHandle_t handle, + infiniopAvgPool1dDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + size_t kernel_size, + size_t stride, + size_t padding) { + +#define CREATE(CASE, NAMESPACE) \ + 
case CASE: \ + return op::avg_pool1d::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y, \ + x, \ + kernel_size, \ + stride, \ + padding) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ASCEND_API + CREATE(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetAvgPool1dWorkspaceSize(infiniopAvgPool1dDescriptor_t desc, + size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + GET(INFINI_DEVICE_ASCEND, ascend); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__INFINI_C infiniStatus_t infiniopAvgPool1d( + infiniopAvgPool1dDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + CALCULATE(INFINI_DEVICE_ASCEND, ascend); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t +infiniopDestroyAvgPool1dDescriptor(infiniopAvgPool1dDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + 
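// Cast back to the backend's concrete Descriptor type so the correct destructor runs.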
DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + DELETE(INFINI_DEVICE_ASCEND, ascend); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc b/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc new file mode 100644 index 000000000..af0ebc623 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc @@ -0,0 +1,99 @@ +#include "cross_entropy_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include +#include + +namespace op::cross_entropy::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t target_desc) { + + auto x_dtype = x_desc->dtype(); + auto t_dtype = target_desc->dtype(); + + CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64); + + CrossEntropyInfo info{}; + info.dtype = x_dtype; + info.target_dtype = t_dtype; + + info.outer_size = target_desc->numel(); + + info.vocab_size = x_desc->shape().back(); + + info.x_stride = static_cast(info.vocab_size); + + *desc_ptr = new Descriptor(nullptr, info, 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t cross_entropy_kernel(const CrossEntropyInfo *info, + T *y, const T *x, const void *target) { + const Tidx *label = reinterpret_cast(target); + +#pragma omp parallel for + for (ptrdiff_t i = 0; i < ptrdiff_t(info->outer_size); ++i) { + const T *row = x + i * info->x_stride; + Tidx idx = label[i]; + + if (idx < 0 || static_cast(idx) >= info->vocab_size) { + y[i] = utils::cast(0.f); + continue; + } + + float max_val = op::common_cpu::reduce_op::max(row, info->vocab_size, 1); + + float sum_exp = 0.f; + for (size_t j = 0; j < info->vocab_size; ++j) { + sum_exp += std::exp(utils::cast(row[j]) - max_val); + } + + float log_term = std::log(sum_exp) + max_val; + float target_logit = utils::cast(row[idx]); + y[i] = utils::cast(log_term - target_logit); + } + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t dispatch_target_type(const CrossEntropyInfo *info, + T *y, const T *x, const void *target) { + + if (info->target_dtype == INFINI_DTYPE_I32) { + return cross_entropy_kernel(info, y, x, target); + } else if (info->target_dtype == INFINI_DTYPE_I64) { + return cross_entropy_kernel(info, y, x, target); + } + return INFINI_STATUS_BAD_TENSOR_DTYPE; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *target, + void *stream) const { + + switch (_info.dtype) { + case INFINI_DTYPE_F16: + return dispatch_target_type(&_info, (fp16_t *)y, (const fp16_t *)x, target); + case INFINI_DTYPE_BF16: + 
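// bf16 logits are widened to float inside the kernel's log-sum-exp, matching the f16 path.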
return dispatch_target_type(&_info, (bf16_t *)y, (const bf16_t *)x, target); + case INFINI_DTYPE_F32: + return dispatch_target_type(&_info, (float *)y, (const float *)x, target); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::cross_entropy::cpu \ No newline at end of file diff --git a/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h b/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h new file mode 100644 index 000000000..7417d1d81 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_CPU_H__ +#define __CROSS_ENTROPY_CPU_H__ + +#include "../cross_entropy.h" + +DESCRIPTOR(cpu) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/cross_entropy/cross_entropy.h b/src/infiniop/ops/cross_entropy/cross_entropy.h new file mode 100644 index 000000000..b502823db --- /dev/null +++ b/src/infiniop/ops/cross_entropy/cross_entropy.h @@ -0,0 +1,42 @@ +#ifndef CROSS_ENTROPY_H +#define CROSS_ENTROPY_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::cross_entropy::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + CrossEntropyInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor(Opaque *opaque, \ + CrossEntropyInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create(infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t target_desc); \ + infiniStatus_t calculate(void *workspace, \ + size_t workspace_size, \ + void *y, \ + const void *x, \ + const void *target, \ + void *stream) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/cross_entropy/cuda/kernel.cuh b/src/infiniop/ops/cross_entropy/cuda/kernel.cuh new file mode 100644 index 000000000..c048c1233 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/cuda/kernel.cuh @@ -0,0 +1,80 @@ +#ifndef __CROSS_ENTROPY_KERNEL_CUH__ +#define __CROSS_ENTROPY_KERNEL_CUH__ + +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../reduce/cuda/reduce.cuh" + +template +__device__ void crossEntropyKernel( + Tdata *y_, + const Tdata *x_, + const void *target_, + size_t outer_size, + size_t vocab_size, + ptrdiff_t x_stride) { + + size_t row_idx = blockIdx.x; + if (row_idx >= outer_size) { + return; + } + + const Tdata *x = x_ + row_idx * x_stride; + const Tidx *target = reinterpret_cast(target_); + + Tidx label = target[row_idx]; + + Tdata max_val_raw = op::common_cuda::reduce_op::max(x, vocab_size); + __shared__ Tcompute max_val_shared; + if (threadIdx.x == 0) { + max_val_shared = static_cast(max_val_raw); + } + __syncthreads(); + Tcompute max_val = max_val_shared; + + Tcompute thread_sum = 0.0f; + for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) { + Tcompute val = static_cast(x[col]); + thread_sum += expf(val - max_val); + } + + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + thread_sum += __shfl_down_sync(0xffffffff, thread_sum, offset); + } + + static __shared__ Tcompute shared_sum[32]; + int lane = threadIdx.x % warpSize; + int warp = threadIdx.x / warpSize; + + if 
(lane == 0) { + shared_sum[warp] = thread_sum; + } + __syncthreads(); + + Tcompute block_sum = 0.0f; + if (warp == 0) { + + if (lane < (BLOCK_SIZE + warpSize - 1) / warpSize) { + block_sum = shared_sum[lane]; + } + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + block_sum += __shfl_down_sync(0xffffffff, block_sum, offset); + } + } + + if (threadIdx.x == 0) { + Tcompute log_term = logf(block_sum) + max_val; + + Tcompute target_logit = 0.0f; + + if (label >= 0 && static_cast(label) < vocab_size) { + target_logit = static_cast(x[label]); + } else { + + log_term = 0.0f; + } + + y_[row_idx] = static_cast(log_term - target_logit); + } +} + +#endif diff --git a/src/infiniop/ops/cross_entropy/info.h b/src/infiniop/ops/cross_entropy/info.h new file mode 100644 index 000000000..a83afebb8 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/info.h @@ -0,0 +1,17 @@ +#ifndef CROSS_ENTROPY_INFO_H +#define CROSS_ENTROPY_INFO_H +#include "../../../utils.h" +#include "../../tensor.h" +#include + +#include + +struct CrossEntropyInfo { + int dtype; + int target_dtype; + size_t outer_size; + size_t vocab_size; + ptrdiff_t x_stride; +}; + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h b/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h new file mode 100644 index 000000000..57bccea91 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_METAX_H__ +#define __CROSS_ENTROPY_METAX_H__ + +#include "../cross_entropy.h" + +DESCRIPTOR(metax) + +#endif // __CROSS_ENTROPY_METAX_H__ diff --git a/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca b/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca new file mode 100644 index 000000000..efd791183 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca @@ -0,0 +1,188 @@ +#include "../../../devices/metax/metax_common.h" +#include "cross_entropy_metax.h" +#include "../../../devices/metax/metax_kernel_common.h" + +#include + +#include "../../../reduce/cuda/reduce.cuh" + +#include + +namespace { + +template +__device__ void crossEntropyKernel( + Tdata *y_, + const Tdata *x_, + const void *target_, + size_t outer_size, + size_t vocab_size, + ptrdiff_t x_stride) { + + size_t row_idx = blockIdx.x; + if (row_idx >= outer_size) { + return; + } + + const Tdata *x = x_ + row_idx * x_stride; + const Tidx *target = reinterpret_cast(target_); + + Tidx label = target[row_idx]; + + Tdata max_val_raw = op::common_cuda::reduce_op::max(x, vocab_size); + __shared__ Tcompute max_val_shared; + if (threadIdx.x == 0) { + max_val_shared = static_cast(max_val_raw); + } + __syncthreads(); + + Tcompute max_val = max_val_shared; + + Tcompute thread_sum = Tcompute(0); + for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) { + Tcompute val = static_cast(x[col]); + thread_sum += expf(val - max_val); + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + Tcompute block_sum = BlockReduce(temp_storage).Sum(thread_sum); + + if (threadIdx.x == 0) { + if (label < 0 || static_cast(label) >= vocab_size) { + y_[row_idx] = static_cast(0.0f); + return; + } + Tcompute log_term = logf(block_sum) + max_val; + Tcompute target_logit = static_cast(x[label]); + y_[row_idx] = static_cast(log_term - target_logit); + } +} + +template +INFINIOP_METAX_KERNEL crossEntropy( + Tdata *y, const Tdata *x, const void *target, + size_t outer_size, size_t vocab_size, ptrdiff_t 
x_stride) { + crossEntropyKernel( + y, x, target, outer_size, vocab_size, x_stride); +} + +} // namespace + +namespace op::cross_entropy::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t target_desc) { + + (void)y_desc; + + auto x_dtype = x_desc->dtype(); + auto t_dtype = target_desc->dtype(); + + CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32); + CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64); + + CrossEntropyInfo info{}; + info.dtype = x_dtype; + info.target_dtype = t_dtype; + info.vocab_size = x_desc->shape().back(); + info.outer_size = target_desc->numel(); + info.x_stride = static_cast(info.vocab_size); + + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, const void *target, + const CrossEntropyInfo &info, hcStream_t stream) { + dim3 grid(static_cast(info.outer_size), 1, 1); + + if (info.target_dtype == INFINI_DTYPE_I64) { + if (info.dtype == INFINI_DTYPE_F16) { + crossEntropy + <<>>( + (half *)y, (const half *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_BF16) { + crossEntropy + <<>>( + (cuda_bfloat16 *)y, (const cuda_bfloat16 *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_F32) { + crossEntropy + <<>>( + (float *)y, (const float *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } else if (info.target_dtype == INFINI_DTYPE_I32) { + if (info.dtype == INFINI_DTYPE_F16) { + crossEntropy + <<>>( + (half *)y, (const half *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_BF16) { + crossEntropy + <<>>( + (cuda_bfloat16 *)y, (const cuda_bfloat16 *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_F32) { + crossEntropy + <<>>( + (float *)y, (const float *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *target, + void *stream_) const { + + (void)workspace; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + auto stream = reinterpret_cast(stream_); + int max_threads = _opaque->internal->maxThreadsPerBlock(); + + if (max_threads >= METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel(y, x, target, _info, stream)); + } else if (max_threads >= METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel(y, x, target, _info, stream)); + } else { + CHECK_STATUS(launchKernel<256>(y, x, target, _info, stream)); + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::cross_entropy::metax diff --git a/src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h b/src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h new file mode 100644 index 000000000..6648b0e32 --- /dev/null +++ 
b/src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h
@@ -0,0 +1,53 @@
+#ifndef __CROSS_ENTROPY_KERNEL_CUH__
+#define __CROSS_ENTROPY_KERNEL_CUH__
+
+template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute, typename Tidx>
+__device__ void crossEntropyKernel(
+    Tdata *y_,
+    const Tdata *x_,
+    const void *target_,
+    size_t outer_size,
+    size_t vocab_size,
+    ptrdiff_t x_stride) {
+
+    // One block per row of logits.
+    size_t row_idx = blockIdx.x;
+    if (row_idx >= outer_size) {
+        return;
+    }
+
+    const Tdata *x = x_ + row_idx * x_stride;
+    const Tidx *target = reinterpret_cast<const Tidx *>(target_);
+
+    Tidx label = target[row_idx];
+
+    // Numerically stable log-sum-exp: subtract the row maximum before exponentiating.
+    Tdata max_val_raw = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata>(x, vocab_size);
+    __shared__ Tcompute max_val_shared;
+    if (threadIdx.x == 0) {
+        max_val_shared = static_cast<Tcompute>(max_val_raw);
+    }
+    __syncthreads();
+
+    Tcompute max_val = max_val_shared;
+
+    Tcompute thread_sum = Tcompute(0);
+    for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) {
+        Tcompute val = static_cast<Tcompute>(x[col]);
+        thread_sum += expf(val - max_val);
+    }
+
+    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+    Tcompute block_sum = BlockReduce(temp_storage).Sum(thread_sum);
+
+    if (threadIdx.x == 0) {
+        // Out-of-range labels yield a zero loss instead of reading out of bounds.
+        if (label < 0 || static_cast<size_t>(label) >= vocab_size) {
+            y_[row_idx] = static_cast<Tdata>(0.0f);
+            return;
+        }
+        Tcompute log_term = logf(block_sum) + max_val;
+        Tcompute target_logit = static_cast<Tcompute>(x[label]);
+        y_[row_idx] = static_cast<Tdata>(log_term - target_logit);
+    }
+}
+
+#endif
diff --git a/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h b/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h
new file mode 100644
index 000000000..454b14617
--- /dev/null
+++ b/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h
@@ -0,0 +1,8 @@
+#ifndef __CROSS_ENTROPY_MOORE_H__
+#define __CROSS_ENTROPY_MOORE_H__
+
+#include "../cross_entropy.h"
+
+DESCRIPTOR(moore)
+
+#endif
diff --git a/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu b/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu
new file mode 100644
index 000000000..2535679dd
--- /dev/null
+++ b/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu
@@ -0,0 +1,129 @@
+#include "../../../devices/moore/moore_common.h"
+#include "cross_entropy_moore.h"
+
+#include <cub/cub.cuh>
+#include "../../../devices/moore/moore_kernel_common.h"
+
+#include "../../../reduce/cuda/reduce.cuh"
+
+#include "cross_entropy_kernel.h"
+
+template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute, typename Tidx>
+INFINIOP_MOORE_KERNEL crossEntropy(
+    Tdata *y, const Tdata *x, const void *target,
+    size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) {
+    crossEntropyKernel<BLOCK_SIZE, Tdata, Tcompute, Tidx>(
+        y, x, target, outer_size, vocab_size, x_stride);
+}
+
+namespace op::cross_entropy::moore {
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::moore::Handle::Internal> internal;
+};
+
+Descriptor::~Descriptor() {
+    delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    infiniopTensorDescriptor_t target_desc) {
+
+    (void)y_desc;
+
+    auto x_dtype = x_desc->dtype();
+    auto t_dtype = target_desc->dtype();
+
+    CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
+    CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
+
+    CrossEntropyInfo info{};
+    info.dtype = x_dtype;
+    info.target_dtype = t_dtype;
+    info.vocab_size = x_desc->shape().back();
+    info.outer_size = target_desc->numel();
+    info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
+
+    *desc_ptr = new Descriptor(
+        new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
+        info, 0, handle->device, handle->device_id);
+    return
INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, const void *target, + const CrossEntropyInfo &info, musaStream_t stream) { + dim3 grid(static_cast(info.outer_size), 1, 1); + + if (info.target_dtype == INFINI_DTYPE_I64) { + if (info.dtype == INFINI_DTYPE_F16) { + crossEntropy + <<>>( + (half *)y, (const half *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_BF16) { + crossEntropy + <<>>( + (__mt_bfloat16 *)y, (const __mt_bfloat16 *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_F32) { + crossEntropy + <<>>( + (float *)y, (const float *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } else if (info.target_dtype == INFINI_DTYPE_I32) { + if (info.dtype == INFINI_DTYPE_F16) { + crossEntropy + <<>>( + (half *)y, (const half *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_BF16) { + crossEntropy + <<>>( + (__mt_bfloat16 *)y, (const __mt_bfloat16 *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_F32) { + crossEntropy + <<>>( + (float *)y, (const float *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + const void *target, + void *stream_) const { + musaStream_t stream = (musaStream_t)stream_; + (void)workspace; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel(y, x, target, _info, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel(y, x, target, _info, stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::cross_entropy::moore diff --git a/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu b/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu new file mode 100644 index 000000000..0ce3f4984 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu @@ -0,0 +1,107 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "cross_entropy_nvidia.cuh" + +template +INFINIOP_CUDA_KERNEL crossEntropy( + Tdata *y, const Tdata *x, const void *target, + size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) { + + crossEntropyKernel( + y, x, target, outer_size, vocab_size, x_stride); +} + +namespace op::cross_entropy::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t target_desc) { + + auto x_dtype = x_desc->dtype(); + auto t_dtype = target_desc->dtype(); + + CrossEntropyInfo info; + info.dtype = x_dtype; + info.target_dtype = t_dtype; + + info.vocab_size = x_desc->shape().back(); + info.outer_size = 
target_desc->numel(); + info.x_stride = static_cast(info.vocab_size); + + auto internal = reinterpret_cast(handle)->internal(); + + *desc_ptr = new Descriptor( + new Opaque{internal}, + info, 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, const void *target, + const CrossEntropyInfo &info, cudaStream_t stream) { + + dim3 grid(static_cast(info.outer_size), 1, 1); + + if (info.target_dtype == INFINI_DTYPE_I64) { + if (info.dtype == INFINI_DTYPE_F16) { + crossEntropy + <<>>((half *)y, (const half *)x, target, info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_BF16) { + crossEntropy + <<>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, target, info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_F32) { + crossEntropy + <<>>((float *)y, (const float *)x, target, info.outer_size, info.vocab_size, info.x_stride); + } + } else if (info.target_dtype == INFINI_DTYPE_I32) { + + if (info.dtype == INFINI_DTYPE_F16) { + crossEntropy + <<>>((half *)y, (const half *)x, target, info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_BF16) { + crossEntropy + <<>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, target, info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_F32) { + crossEntropy + <<>>((float *)y, (const float *)x, target, info.outer_size, info.vocab_size, info.x_stride); + } + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + const void *target, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; + + int max_threads = _opaque->internal->maxThreadsPerBlock(); + + if (max_threads >= 1024) { + CHECK_STATUS(launchKernel<1024>(y, x, target, _info, stream)); + } else if (max_threads >= 512) { + CHECK_STATUS(launchKernel<512>(y, x, target, _info, stream)); + } else { + CHECK_STATUS(launchKernel<256>(y, x, target, _info, stream)); + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::cross_entropy::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh b/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh new file mode 100644 index 000000000..441e5b8d8 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_NVIDIA_H__ +#define __CROSS_ENTROPY_NVIDIA_H__ + +#include "../cross_entropy.h" + +DESCRIPTOR(nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/cross_entropy/operator.cc b/src/infiniop/ops/cross_entropy/operator.cc new file mode 100644 index 000000000..75f35fcb7 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/operator.cc @@ -0,0 +1,174 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cross_entropy.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cross_entropy_cpu.h" +#endif + +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/cross_entropy_nvidia.cuh" +#endif + +#ifdef ENABLE_MOORE_API +#include "moore/cross_entropy_moore.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cross_entropy_metax.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateCrossEntropyDescriptor( + infiniopHandle_t handle, + 
infiniopCrossEntropyDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t target_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cross_entropy::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, x_desc, target_desc); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia) +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore) +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetCrossEntropyWorkspaceSize( + infiniopCrossEntropyDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__INFINI_C infiniStatus_t infiniopCrossEntropy( + infiniopCrossEntropyDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *target, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, target, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia) +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore) +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyCrossEntropyDescriptor( + infiniopCrossEntropyDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + DESTROY(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_HYGON_API + DESTROY(INFINI_DEVICE_HYGON, nvidia) +#endif +#ifdef ENABLE_MOORE_API + DESTROY(INFINI_DEVICE_MOORE, moore) +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif + default: + 
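// No backend for this device type was compiled into this build.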
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef DESTROY
+}
diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.cc b/src/infiniop/ops/equal/cpu/equal_cpu.cc
new file mode 100644
index 000000000..ff8ebe395
--- /dev/null
+++ b/src/infiniop/ops/equal/cpu/equal_cpu.cc
@@ -0,0 +1,68 @@
+#include <type_traits>
+#include <vector>
+
+#include "equal_cpu.h"
+
+namespace op::equal::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &b_desc = input_desc_vec.at(1);
+    auto compute_dtype = a_desc->dtype();
+    auto out_dtype = out_desc->dtype();
+
+    if (compute_dtype != b_desc->dtype()) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);
+
+    CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64,
+                INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
+
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<EqualOp, bool, fp16_t, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<EqualOp, bool, float, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<EqualOp, bool, double, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<EqualOp, bool, bf16_t, bf16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_I32:
+        return _device_info->calculate<EqualOp, bool, int32_t, int32_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_I64:
+        return _device_info->calculate<EqualOp, bool, int64_t, int64_t>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::equal::cpu
diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.h b/src/infiniop/ops/equal/cpu/equal_cpu.h
new file mode 100644
index 000000000..fd811f4b0
--- /dev/null
+++ b/src/infiniop/ops/equal/cpu/equal_cpu.h
@@ -0,0 +1,28 @@
+#ifndef __EQUAL_CPU_H__
+#define __EQUAL_CPU_H__
+
+#include <type_traits>
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(equal, cpu)
+
+namespace op::equal::cpu {
+
+typedef struct EqualOp {
+public:
+    static constexpr size_t num_inputs = 2;
+
+    template <typename Tin0, typename Tin1>
+    bool operator()(const Tin0 &a, const Tin1 &b) {
+        if constexpr (std::is_same_v<Tin0, Tin1>) {
+            return a == b;
+        } else {
+            return false;
+        }
+    }
+} EqualOp;
+
+} // namespace op::equal::cpu
+
+#endif
diff --git a/src/infiniop/ops/equal/cuda/kernel.cuh b/src/infiniop/ops/equal/cuda/kernel.cuh
new file mode 100644
index 000000000..11ad5981e
--- /dev/null
+++ b/src/infiniop/ops/equal/cuda/kernel.cuh
@@ -0,0 +1,37 @@
+#ifndef __EQUAL_CUDA_H__
+#define __EQUAL_CUDA_H__
+
+#if defined(__MACACC__)
+#include <maca_fp16.h>
+#include <maca_bfloat16.h>
+#else
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#endif
+#include <type_traits>
+
+namespace op::equal::cuda {
+
+typedef struct EqualOp {
+public:
+    static constexpr size_t num_inputs = 2;
+
+    template <typename Tin0, typename Tin1>
+    __device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const {
+        if constexpr (std::is_same_v<Tin0, Tin1>) {
+            if constexpr (std::is_same_v<Tin0, half2>) {
+                static_assert(!std::is_same_v<Tin0, half2>,
+                              "half2 is not supported for mixed output dtype");
+            } else if constexpr (std::is_same_v<Tin0, half>) {
+                return static_cast<bool>(__heq(a, b));
+            } else {
+                return static_cast<bool>(a == b);
+            }
+        } else {
+            return false;
+        }
+    }
+} EqualOp;
+
+} // namespace op::equal::cuda
+
+#endif
diff --git a/src/infiniop/ops/equal/metax/equal_metax.h b/src/infiniop/ops/equal/metax/equal_metax.h
new file mode 100644
index 000000000..6e4cd64b9
--- /dev/null
+++ b/src/infiniop/ops/equal/metax/equal_metax.h
@@ -0,0 +1,8 @@
+#ifndef __EQUAL_METAX_API_H__
+#define __EQUAL_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(equal, metax)
+
+#endif // __EQUAL_METAX_API_H__
diff --git a/src/infiniop/ops/equal/metax/equal_metax.maca b/src/infiniop/ops/equal/metax/equal_metax.maca
new file mode 100644
index 000000000..265e5b5a6
--- /dev/null
+++ b/src/infiniop/ops/equal/metax/equal_metax.maca
@@ -0,0 +1,69 @@
+#include "equal_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::equal::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+
+    const auto &a_desc = input_desc_vec.at(0);
+    auto compute_dtype = a_desc->dtype();
+    auto out_dtype = out_desc->dtype();
+
+    const auto &b_desc = input_desc_vec.at(1);
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
+                INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
+
+    CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);
+
+    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I32:
+        return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I64:
+        return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::equal::metax
diff --git a/src/infiniop/ops/equal/moore/equal_moore.h b/src/infiniop/ops/equal/moore/equal_moore.h
new file mode 100644
index 000000000..2fed1bb40
--- /dev/null
+++ b/src/infiniop/ops/equal/moore/equal_moore.h
@@ -0,0 +1,8 @@
+#ifndef __EQUAL_MOORE_API_H__
+#define __EQUAL_MOORE_API_H__
+
+#include
"../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(equal, moore) + +#endif // __EQUAL_MOORE_API_H__ diff --git a/src/infiniop/ops/equal/moore/equal_moore.mu b/src/infiniop/ops/equal/moore/equal_moore.mu new file mode 100644 index 000000000..d0eb8395d --- /dev/null +++ b/src/infiniop/ops/equal/moore/equal_moore.mu @@ -0,0 +1,140 @@ +#include "equal_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "equal_moore_kernel.h" + +namespace op::equal::moore { +namespace { + +inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { + if (!info.isOutputContiguous()) { + return false; + } + const bool *input_contiguous = info.getInputContiguous(); + const bool *input_broadcasted = info.getInputBroadcasted(); + for (size_t i = 0; i < 2; ++i) { + if (!input_contiguous[i] || input_broadcasted[i]) { + return false; + } + } + return true; +} + +template +INFINIOP_MOORE_KERNEL equal_contiguous_kernel(size_t numel, Tout *output, const Tin *a, const Tin *b) { + const auto op = op::equal::moore::EqualOp{}; + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + for (; idx < numel; idx += stride) { + output[idx] = op.template operator()(a[idx], b[idx]); + } +} + +template +infiniStatus_t launch_fast_path(size_t numel, + void *output, + const std::vector &inputs, + void *stream) { + if (numel == 0) { + return INFINI_STATUS_SUCCESS; + } + + constexpr int kBlockSize = 256; + int grid = static_cast((numel + kBlockSize - 1) / kBlockSize); + if (grid > 65535) { + grid = 65535; + } + + auto musa_stream = reinterpret_cast(stream); + equal_contiguous_kernel<<>>( + numel, + reinterpret_cast(output), + reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1])); + return INFINI_STATUS_SUCCESS; +} + +} // namespace + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + auto compute_dtype = a_desc->dtype(); + auto out_dtype = out_desc->dtype(); + + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, + INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64); + + CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create MOORE elementwise descriptor + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (can_use_contiguous_fast_path(_info)) { + size_t numel = _info.getOutputSize(); + switch (_dtype) { + case INFINI_DTYPE_F16: + return launch_fast_path(numel, output, inputs, stream); + case INFINI_DTYPE_BF16: + return launch_fast_path(numel, output, inputs, stream); + case INFINI_DTYPE_F32: + return launch_fast_path(numel, output, inputs, stream); + case INFINI_DTYPE_I32: + return launch_fast_path(numel, output, inputs, stream); + case INFINI_DTYPE_I64: + return launch_fast_path(numel, output, inputs, stream); + case INFINI_DTYPE_F64: + return launch_fast_path(numel, output, inputs, stream); + 
default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, moore::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, moore::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::equal::moore diff --git a/src/infiniop/ops/equal/moore/equal_moore_kernel.h b/src/infiniop/ops/equal/moore/equal_moore_kernel.h new file mode 100644 index 000000000..a4e32880b --- /dev/null +++ b/src/infiniop/ops/equal/moore/equal_moore_kernel.h @@ -0,0 +1,30 @@ +#ifndef __EQUAL_MOORE_KERNEL_H__ +#define __EQUAL_MOORE_KERNEL_H__ + +#include + +namespace op::equal::moore { + +typedef struct EqualOp { +public: + static constexpr size_t num_inputs = 2; + + template + __device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const { + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return __half2float(a) == __half2float(b); + } else if constexpr (std::is_same_v) { + return __bfloat162float(a) == __bfloat162float(b); + } else { + return a == b; + } + } else { + return false; + } + } +} EqualOp; + +} // namespace op::equal::moore + +#endif // __EQUAL_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cu b/src/infiniop/ops/equal/nvidia/equal_nvidia.cu new file mode 100644 index 000000000..5bdf92e6c --- /dev/null +++ b/src/infiniop/ops/equal/nvidia/equal_nvidia.cu @@ -0,0 +1,137 @@ +#include +#include +#include + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "equal_nvidia.cuh" + +namespace { + +template +INFINIOP_CUDA_KERNEL FastEqualKernel(size_t n, Tout *output, const Tin *a, const Tin *b) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + op::equal::cuda::EqualOp op{}; + for (; idx < n; idx += stride) { + output[idx] = op.template operator()(a[idx], b[idx]); + } +} + +template +infiniStatus_t launchFastEqualKernel(size_t numel, + void *output, + const std::vector &inputs, + void *stream) { + if (numel == 0) { + return INFINI_STATUS_SUCCESS; + } + constexpr int block = 256; + int grid = static_cast((numel + block - 1) / block); + grid = std::min(grid, 65535); + auto cuda_stream = reinterpret_cast(stream); + FastEqualKernel<<>>( + numel, + reinterpret_cast(output), + reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1])); + auto err = cudaGetLastError(); + return err == cudaSuccess ? 
INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR; +} + +} // namespace + +namespace op::equal::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + auto compute_dtype = a_desc->dtype(); + auto out_dtype = out_desc->dtype(); + + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, + INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64); + + CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_U8, INFINI_DTYPE_I8); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + bool fast_path = _info.isOutputContiguous(); + if (fast_path) { + const bool *input_contiguous = _info.getInputContiguous(); + const bool *input_broadcasted = _info.getInputBroadcasted(); + for (size_t i = 0; i < 2; ++i) { + fast_path &= input_contiguous[i] && !input_broadcasted[i]; + } + } + + if (fast_path) { + size_t numel = _info.getOutputSize(); + switch (_dtype) { + case INFINI_DTYPE_F16: + return launchFastEqualKernel(numel, output, inputs, stream); + case INFINI_DTYPE_BF16: + return launchFastEqualKernel(numel, output, inputs, stream); + case INFINI_DTYPE_F32: + return launchFastEqualKernel(numel, output, inputs, stream); + case INFINI_DTYPE_I32: + return launchFastEqualKernel(numel, output, inputs, stream); + case INFINI_DTYPE_I64: + return launchFastEqualKernel(numel, output, inputs, stream); + case INFINI_DTYPE_F64: + return launchFastEqualKernel(numel, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::equal::nvidia diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh b/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh new file mode 100644 index 000000000..96932dc3d --- /dev/null +++ b/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __EQUAL_CUDA_API_H__ 
+#define __EQUAL_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(equal, nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/equal/operator.cc b/src/infiniop/ops/equal/operator.cc new file mode 100644 index 000000000..80da07e01 --- /dev/null +++ b/src/infiniop/ops/equal/operator.cc @@ -0,0 +1,201 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/equal.h" + +#ifdef ENABLE_CPU_API +#include "cpu/equal_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/equal_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/equal_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/equal_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/equal_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/equal_moore.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateEqualDescriptor( + infiniopHandle_t handle, + infiniopEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::equal::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetEqualWorkspaceSize(infiniopEqualDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C infiniStatus_t infiniopEqual( + infiniopEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + 
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+__INFINI_C infiniStatus_t
+infiniopDestroyEqualDescriptor(infiniopEqualDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                             \
+    case CASE:                                                              \
+        delete reinterpret_cast<op::equal::NAMESPACE::Descriptor *>(desc);  \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        DELETE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc
new file mode 100644
index 000000000..f47198580
--- /dev/null
+++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc
@@ -0,0 +1,91 @@
+#include "hardswish_cpu.h"
+
+#include <cstddef>
+
+namespace op::hardswish::cpu {
+namespace {
+
+inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
+    return info.isOutputContiguous() && info.getInputSize() == 1 && info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
+}
+
+template <typename T>
+infiniStatus_t launch_contiguous_cpu(const op::elementwise::ElementwiseInfo &info,
+                                     void *output,
+                                     const std::vector<const void *> &inputs) {
+    const T *in = reinterpret_cast<const T *>(inputs[0]);
+    T *out = reinterpret_cast<T *>(output);
+    const ptrdiff_t size = static_cast<ptrdiff_t>(info.getOutputSize());
+
+#pragma omp parallel for if (size > 1024)
+    for (ptrdiff_t i = 0; i < size; ++i) {
+        out[i] = HardSwishOp{}(in[i]);
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    const bool fast_path = can_use_contiguous_fast_path(_info);
+    if (fast_path) {
+        switch (_dtype) {
+        case INFINI_DTYPE_BF16:
+            return launch_contiguous_cpu<bf16_t>(_info, output, inputs);
+        case INFINI_DTYPE_F16:
+            return launch_contiguous_cpu<fp16_t>(_info, output, inputs);
+        case INFINI_DTYPE_F32:
+            return launch_contiguous_cpu<float>(_info, output, inputs);
+        case INFINI_DTYPE_F64:
+            return launch_contiguous_cpu<double>(_info, output, inputs);
+        default:
+            break;
+        }
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<HardSwishOp, bf16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<HardSwishOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<HardSwishOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<HardSwishOp, double>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::hardswish::cpu
diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h
new file mode 100644
index 000000000..b853663aa
--- /dev/null
+++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h
@@ -0,0 +1,50 @@
+#ifndef __HARDSWISH_CPU_H__
+#define __HARDSWISH_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(hardswish, cpu)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace op::hardswish::cpu {
+
+typedef struct HardSwishOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        const float x_f = utils::cast<float>(x);
+        const float clamped = std::min(std::max(x_f + 3.0f, 0.0f), 6.0f);
+        const float result = x_f * clamped * (1.0f / 6.0f);
+        return utils::cast<T>(result);
+    }
+} HardSwishOp;
+
+typedef struct HardSwishContiguousOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+
+        T three = static_cast<T>(3);
+        T zero = static_cast<T>(0);
+        T six = static_cast<T>(6);
+
+        T scale = static_cast<T>(0.16666667f);
+
+        T val = x + three;
+
+        val = std::max(zero, val);
+        val = std::min(six, val);
+
+        return x * val * scale;
+    }
+} HardSwishContiguousOp;
+
+} // namespace op::hardswish::cpu
+
+#endif
diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh
new file mode 100644
index 000000000..21b6a5f8d
--- /dev/null
+++ b/src/infiniop/ops/hardswish/cuda/kernel.cuh
@@ -0,0 +1,86 @@
+#ifndef __HARDSWISH_CUDA_H__
+#define __HARDSWISH_CUDA_H__
+
+#include <type_traits>
+#if defined(__MACACC__)
+#include
+#include
+#else
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#endif
+
+namespace op::hardswish::cuda {
+
+typedef struct HardSwishOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+
+        if constexpr (std::is_same_v<T, half2>) {
+
+            const half2 three = __float2half2_rn(3.0f);
+            const half2 scale = __float2half2_rn(0.16666667f);
+
+            half2 val = __hadd2(x, three);
+
+#if defined(ENABLE_ILUVATAR_API)
+
+            float2 val_f = __half22float2(val);
+            val_f.x = fminf(fmaxf(val_f.x, 0.0f), 6.0f);
+            val_f.y = fminf(fmaxf(val_f.y, 0.0f), 6.0f);
+            val = __floats2half2_rn(val_f.x, val_f.y);
+#else
+
+            const half2 zero = __float2half2_rn(0.0f);
+            const half2 six = __float2half2_rn(6.0f);
+
+#if __CUDA_ARCH__ >= 800
+
+            val = __hmin2(__hmax2(val, zero), six);
+#else
+
+            val = __hmax2(val, zero);
+            val = __hmin2(val, six);
+#endif
+#endif
+
+            return __hmul2(__hmul2(x, val), scale);
+
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+
+            const float x_f = __bfloat162float(x);
+
+            const float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
+            return __float2bfloat16(x_f * val * 0.16666667f);
+
+        } else if constexpr (std::is_same_v<T, half>) {
+
+            const float x_f = __half2float(x);
+            const float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
+            return __float2half(x_f * val * 0.16666667f);
+
+        } else if constexpr (std::is_same_v<T, float>) {
+
+            const float val = fminf(fmaxf(x + 3.0f, 0.0f), 6.0f);
+            return x * val * 0.16666667f;
+
+        } else if constexpr (std::is_same_v<T, double>) {
+            const double val = fmin(fmax(x + 3.0, 0.0), 6.0);
+            return x * val * (1.0 / 6.0);
+        }
+    }
+} HardSwishOp;
+
+} // namespace op::hardswish::cuda
+
+#endif
diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.h b/src/infiniop/ops/hardswish/metax/hardswish_metax.h
new file mode 100644
index 000000000..16b131aa9
--- /dev/null
+++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.h
@@ -0,0 +1,8 @@
+#ifndef __HARDSWISH_METAX_API_H__
+#define __HARDSWISH_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(hardswish, metax)
+
+#endif // __HARDSWISH_METAX_API_H__
diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.maca b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca
new file mode 100644
index 000000000..fc57a9b20
--- /dev/null
+++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca
@@ -0,0 +1,58 @@
+#include "hardswish_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::hardswish::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::hardswish::metax
diff --git a/src/infiniop/ops/hardswish/moore/hardswish_moore.h b/src/infiniop/ops/hardswish/moore/hardswish_moore.h
new file mode 100644
index 000000000..e5861a158
--- /dev/null
+++ b/src/infiniop/ops/hardswish/moore/hardswish_moore.h
@@ -0,0 +1,8 @@
+#ifndef __HARDSWISH_MOORE_API_H__
+#define __HARDSWISH_MOORE_API_H__
+
+#include "../../../elementwise/moore/elementwise_moore_api.h"
+
+ELEMENTWISE_DESCRIPTOR(hardswish, moore)
+
+#endif // __HARDSWISH_MOORE_API_H__
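All of the backends above implement the same scalar definition, hardswish(x) = x * clamp(x + 3, 0, 6) / 6; the half/bf16 paths compute in float and round back, and several kernels multiply by a precomputed 1/6 (0.16666667f), which can differ from an exact division by at most an ulp. As a quick sanity reference, here is a minimal standalone C++ sketch of the same math (not part of the diff):

#include <algorithm>
#include <cassert>

// Scalar hardswish reference: x * clamp(x + 3, 0, 6) / 6. Dividing by 6
// keeps the saturated cases exact; the kernels trade that for a multiply.
inline float hardswish_ref(float x) {
    const float clamped = std::min(std::max(x + 3.0f, 0.0f), 6.0f);
    return x * clamped / 6.0f;
}

int main() {
    assert(hardswish_ref(-4.0f) == 0.0f); // clamped low: output is 0
    assert(hardswish_ref(4.0f) == 4.0f);  // saturated high: identity
    assert(hardswish_ref(0.0f) == 0.0f);  // x = 0 maps to 0
    return 0;
}

diff --git a/src/infiniop/ops/hardswish/moore/hardswish_moore.mu b/src/infiniop/ops/hardswish/moore/hardswish_moore.mu
new file mode 100644
index 000000000..3a1290b35
--- /dev/null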
+++ b/src/infiniop/ops/hardswish/moore/hardswish_moore.mu
@@ -0,0 +1,118 @@
+#include "hardswish_moore.h"
+
+#include "../../../elementwise/moore/elementwise_moore.h"
+
+#include "hardswish_moore_kernel.h"
+
+namespace op::hardswish::moore {
+namespace {
+
+inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
+    return info.isOutputContiguous() && info.getInputSize() == 1 &&
+           info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
+}
+
+template <typename T>
+INFINIOP_MOORE_KERNEL hardswish_contiguous_kernel(size_t numel, T *out, const T *in) {
+    const auto op = op::hardswish::moore::HardSwishOp{};
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = blockDim.x * gridDim.x;
+    for (; idx < numel; idx += stride) {
+        out[idx] = op(in[idx]);
+    }
+}
+
+template <typename T>
+infiniStatus_t launch_fast_path(size_t numel,
+                                void *output,
+                                const std::vector<const void *> &inputs,
+                                void *stream) {
+    if (numel == 0) {
+        return INFINI_STATUS_SUCCESS;
+    }
+
+    constexpr int kBlockSize = 256;
+    int grid = static_cast<int>((numel + kBlockSize - 1) / kBlockSize);
+    if (grid > 65535) {
+        grid = 65535;
+    }
+
+    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
+    hardswish_contiguous_kernel<T><<<grid, kBlockSize, 0, musa_stream>>>(
+        numel,
+        reinterpret_cast<T *>(output),
+        reinterpret_cast<const T *>(inputs[0]));
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // create MOORE elementwise descriptor
+    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    const bool fast_path = can_use_contiguous_fast_path(_info);
+    if (fast_path) {
+        switch (_dtype) {
+        case INFINI_DTYPE_BF16:
+            return launch_fast_path<cuda_bfloat16>(_info.getOutputSize(), output, inputs, stream);
+        case INFINI_DTYPE_F16:
+            return launch_fast_path<half>(_info.getOutputSize(), output, inputs, stream);
+        case INFINI_DTYPE_F32:
+            return launch_fast_path<float>(_info.getOutputSize(), output, inputs, stream);
+        case INFINI_DTYPE_F64:
+            return launch_fast_path<double>(_info.getOutputSize(), output, inputs, stream);
+        default:
+            break;
+        }
+    }
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, moore::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, moore::HardSwishOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, moore::HardSwishOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, moore::HardSwishOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::hardswish::moore
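The fast path above caps the grid at 65535 blocks and relies on the grid-stride loop in hardswish_contiguous_kernel, so every element is still visited when numel exceeds what a one-block-per-256-elements launch would cover. A minimal standalone CUDA sketch of that pattern (hypothetical kernel and sizes, not taken from the diff):

#include <cuda_runtime.h>

// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads, so a capped grid covers all items.
__global__ void scale_kernel(size_t numel, float *out, const float *in) {
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = blockDim.x * gridDim.x;
    for (; idx < numel; idx += stride) {
        out[idx] = in[idx] * 2.0f;
    }
}

int main() {
    const size_t numel = 1 << 20;
    float *in = nullptr, *out = nullptr;
    cudaMalloc(&in, numel * sizeof(float));
    cudaMalloc(&out, numel * sizeof(float));
    int grid = static_cast<int>((numel + 255) / 256);
    if (grid > 65535) grid = 65535; // same cap as the fast path above
    scale_kernel<<<grid, 256>>>(numel, out, in);
    cudaDeviceSynchronize();
    cudaFree(in);
    cudaFree(out);
    return 0;
}

diff --git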
a/src/infiniop/ops/hardswish/moore/hardswish_moore_kernel.h b/src/infiniop/ops/hardswish/moore/hardswish_moore_kernel.h new file mode 100644 index 000000000..60e3dbc60 --- /dev/null +++ b/src/infiniop/ops/hardswish/moore/hardswish_moore_kernel.h @@ -0,0 +1,39 @@ +#ifndef __HARDSWISH_MOORE_KERNEL_H__ +#define __HARDSWISH_MOORE_KERNEL_H__ + +#include +#include + +namespace op::hardswish::moore { + +typedef struct HardSwishOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + float x_f = __half2float(x); + float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f); + return __float2half(x_f * val * 0.16666667f); + } else if constexpr (std::is_same_v) { + float x_f = __bfloat162float(x); + float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f); + return __float2bfloat16_rn(x_f * val * 0.16666667f); + } else if constexpr (std::is_same_v) { + float val = fminf(fmaxf(x + 3.0f, 0.0f), 6.0f); + return x * val * 0.16666667f; + } else if constexpr (std::is_same_v) { + double val = fmin(fmax(x + 3.0, 0.0), 6.0); + return x * val * (1.0 / 6.0); + } else { + float x_f = static_cast(x); + float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f); + return static_cast(x_f * val * 0.16666667f); + } + } +} HardSwishOp; + +} // namespace op::hardswish::moore + +#endif // __HARDSWISH_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu new file mode 100644 index 000000000..f7736a7fd --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu @@ -0,0 +1,115 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "hardswish_nvidia.cuh" + +#include + +namespace op::hardswish::nvidia { +namespace { + +inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { + return info.isOutputContiguous() && info.getInputSize() == 1 && info.getInputContiguous()[0] && !info.getInputBroadcasted()[0]; +} + +template +__global__ void hardswish_contiguous_kernel(size_t numel, T *out, const T *in) { + const auto op = op::hardswish::cuda::HardSwishOp{}; + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + while (idx < numel) { + out[idx] = op(in[idx]); + idx += blockDim.x * gridDim.x; + } +} + +template +infiniStatus_t launch_fast_path(size_t numel, + void *output, + const std::vector &inputs, + void *stream) { + if (numel == 0) { + return INFINI_STATUS_SUCCESS; + } + + constexpr int BLOCK_SIZE = 256; + int grid = static_cast((numel + BLOCK_SIZE - 1) / BLOCK_SIZE); + grid = std::min(grid, 65535); + + auto *out_ptr = reinterpret_cast(output); + auto *in_ptr = reinterpret_cast(inputs[0]); + auto cuda_stream = reinterpret_cast(stream); + + hardswish_contiguous_kernel<<>>(numel, out_ptr, in_ptr); + cudaError_t err = cudaGetLastError(); + return err == cudaSuccess ? 
INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR; +} + +} // namespace + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + const bool fast_path = can_use_contiguous_fast_path(_info); + if (fast_path) { + switch (_dtype) { + case INFINI_DTYPE_BF16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream); + case INFINI_DTYPE_F16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream); + case INFINI_DTYPE_F32: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream); + case INFINI_DTYPE_F64: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream); + default: + break; + } + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::nvidia diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh new file mode 100644 index 000000000..e544591dc --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __HARDSWISH_CUDA_API_H__ +#define __HARDSWISH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(hardswish, nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/hardswish/operator.cc b/src/infiniop/ops/hardswish/operator.cc new file mode 100644 index 000000000..ddce97f16 --- /dev/null +++ b/src/infiniop/ops/hardswish/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/hardswish.h" + +#ifdef ENABLE_CPU_API +#include "cpu/hardswish_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/hardswish_nvidia.cuh" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/hardswish_moore.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/hardswish_metax.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateHardSwishDescriptor( + infiniopHandle_t handle, + infiniopHardSwishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) 
{ + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::hardswish::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetHardSwishWorkspaceSize(infiniopHardSwishDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C infiniStatus_t infiniopHardSwish( + infiniopHardSwishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyHardSwishDescriptor(infiniopHardSwishDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc b/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc new file mode 100644 index 000000000..1bd276308 --- /dev/null +++ b/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc @@ -0,0 +1,124 @@ +#include "hardtanh_cpu.h" + +#include + +namespace op::hardtanh::cpu { + +Descriptor::Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + 
_info(std::move(info)), + _workspace_size(workspace_size), + _min_val(min_val), + _max_val(max_val) {} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_SAME_SHAPE(output_shape, input_shape); + + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + + *desc_ptr = new Descriptor( + dtype, + info_result.take(), + 0, + handle->device, + handle->device_id, + min_val, + max_val); + + return INFINI_STATUS_SUCCESS; +} + +template +static infiniStatus_t launchCpuHardTanh(const op::elementwise::ElementwiseInfo &info, + void *output, + const std::vector &inputs, + float min_val, + float max_val) { + if (inputs.empty()) { + return INFINI_STATUS_BAD_PARAM; + } + + T *out = reinterpret_cast(output); + const T *in = reinterpret_cast(inputs[0]); + const auto ndim = info.getNdim(); + const auto *output_shape = info.getOutputShape(); + const auto *output_strides = info.getOutputStrides(); + const auto *input_shape = info.getInputShape(0); + const auto *input_strides = info.getInputStrides(0); + const auto *input_contiguous = info.getInputContiguous(); + ptrdiff_t output_size = info.getOutputSize(); + +#pragma omp parallel for if (output_size > 1024) + for (ptrdiff_t i = 0; i < output_size; ++i) { + const size_t out_idx = info.isOutputContiguous() + ? static_cast(i) + : op::common_cpu::indexToOffset(i, ndim, output_shape, output_strides); + const size_t in_idx = input_contiguous[0] + ? 
static_cast(i) + : op::common_cpu::indexToOffset(i, ndim, input_shape, input_strides); + + if constexpr (std::is_same_v || std::is_same_v) { + float value = utils::cast(in[in_idx]); + float clamped = HardTanhOp{}(value, min_val, max_val); + out[out_idx] = utils::cast(clamped); + } else { + out[out_idx] = HardTanhOp{}(in[in_idx], min_val, max_val); + } + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + (void)workspace; + (void)workspace_size; + (void)stream; + + if (inputs.size() != 1) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return launchCpuHardTanh(_info, output, inputs, _min_val, _max_val); + case INFINI_DTYPE_F16: + return launchCpuHardTanh(_info, output, inputs, _min_val, _max_val); + case INFINI_DTYPE_F32: + return launchCpuHardTanh(_info, output, inputs, _min_val, _max_val); + case INFINI_DTYPE_F64: + return launchCpuHardTanh(_info, output, inputs, _min_val, _max_val); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} +} // namespace op::hardtanh::cpu diff --git a/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h b/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h new file mode 100644 index 000000000..09bfb340c --- /dev/null +++ b/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h @@ -0,0 +1,63 @@ +#ifndef __HARDTANH_CPU_H__ +#define __HARDTANH_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +namespace op::hardtanh::cpu { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + size_t _workspace_size; + float _min_val; + float _max_val; + + Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val); + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; + + float minVal() const { return _min_val; } + float maxVal() const { return _max_val; } +}; + +typedef struct HardTanhOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x, float min_val, float max_val) const { + T low = static_cast(min_val); + T high = static_cast(max_val); + T val = x < low ? low : x; + return val > high ? 
high : val; + } +} HardTanhOp; + +} // namespace op::hardtanh::cpu + +#endif diff --git a/src/infiniop/ops/hardtanh/cuda/kernel.cuh b/src/infiniop/ops/hardtanh/cuda/kernel.cuh new file mode 100644 index 000000000..28987f82c --- /dev/null +++ b/src/infiniop/ops/hardtanh/cuda/kernel.cuh @@ -0,0 +1,51 @@ +#ifndef __HARDTANH_CUDA_H__ +#define __HARDTANH_CUDA_H__ + +#if defined(__MACACC__) +#include +#include +#else +#include +#include +#endif +#include + +namespace op::hardtanh::cuda { + +typedef struct HardTanhOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x, float min_val, float max_val) const { + if constexpr (std::is_same_v) { + + float2 x_f2 = __half22float2(x); + x_f2.x = fminf(max_val, fmaxf(min_val, x_f2.x)); + x_f2.y = fminf(max_val, fmaxf(min_val, x_f2.y)); + return __float22half2_rn(x_f2); + + } else if constexpr (std::is_same_v) { + + float x_f = __bfloat162float(x); + return __float2bfloat16(fminf(max_val, fmaxf(min_val, x_f))); + + } else if constexpr (std::is_same_v) { + + float x_f = __half2float(x); + return __float2half(fminf(max_val, fmaxf(min_val, x_f))); + + } else if constexpr (std::is_same_v) { + + return fminf(max_val, fmaxf(min_val, x)); + + } else if constexpr (std::is_same_v) { + + return fmin((double)max_val, fmax((double)min_val, x)); + } + } +} HardTanhOp; + +} // namespace op::hardtanh::cuda + +#endif diff --git a/src/infiniop/ops/hardtanh/metax/hardtanh_metax.h b/src/infiniop/ops/hardtanh/metax/hardtanh_metax.h new file mode 100644 index 000000000..182157116 --- /dev/null +++ b/src/infiniop/ops/hardtanh/metax/hardtanh_metax.h @@ -0,0 +1,48 @@ +#ifndef __HARDTANH_METAX_API_H__ +#define __HARDTANH_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +namespace op::hardtanh::metax { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + float _min_val; + float _max_val; + + Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::metax::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val); + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; +}; + +} // namespace op::hardtanh::metax + +#endif // __HARDTANH_METAX_API_H__ diff --git a/src/infiniop/ops/hardtanh/metax/hardtanh_metax.maca b/src/infiniop/ops/hardtanh/metax/hardtanh_metax.maca new file mode 100644 index 000000000..596316e23 --- /dev/null +++ b/src/infiniop/ops/hardtanh/metax/hardtanh_metax.maca @@ -0,0 +1,95 @@ +#include "hardtanh_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::hardtanh::metax { + +Descriptor::Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::metax::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + 
_device_info(device_info), + _workspace_size(workspace_size), + _min_val(min_val), + _max_val(max_val) {} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_SAME_SHAPE(output_shape, input_shape); + + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); + + auto device_impl_result = op::elementwise::metax::DeviceImpl::create(handle->internal()); + CHECK_RESULT(device_impl_result); + + *desc_ptr = new Descriptor( + dtype, + std::move(info), + device_impl_result.take(), + workspace_size, + handle->device, + handle->device_id, + min_val, + max_val); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardTanhOp, cuda_bfloat16>( + _info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardTanhOp, half>( + _info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardTanhOp, float>( + _info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardTanhOp, double>( + _info, workspace, output, inputs, stream, _min_val, _max_val); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::hardtanh::metax diff --git a/src/infiniop/ops/hardtanh/moore/hardtanh_moore.h b/src/infiniop/ops/hardtanh/moore/hardtanh_moore.h new file mode 100644 index 000000000..470790d52 --- /dev/null +++ b/src/infiniop/ops/hardtanh/moore/hardtanh_moore.h @@ -0,0 +1,51 @@ +#ifndef __HARDTANH_MOORE_API_H__ +#define __HARDTANH_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +namespace op::hardtanh::moore { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + float _min_val; + float _max_val; + + Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::moore::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val); + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; 
+ + float minVal() const { return _min_val; } + float maxVal() const { return _max_val; } +}; + +} // namespace op::hardtanh::moore + +#endif // __HARDTANH_MOORE_API_H__ diff --git a/src/infiniop/ops/hardtanh/moore/hardtanh_moore.mu b/src/infiniop/ops/hardtanh/moore/hardtanh_moore.mu new file mode 100644 index 000000000..40e3dbe41 --- /dev/null +++ b/src/infiniop/ops/hardtanh/moore/hardtanh_moore.mu @@ -0,0 +1,158 @@ +#include "hardtanh_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "hardtanh_moore_kernel.h" + +namespace op::hardtanh::moore { +namespace { + +inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { + return info.isOutputContiguous() && info.getInputSize() == 1 && + info.getInputContiguous()[0] && !info.getInputBroadcasted()[0]; +} + +template +INFINIOP_MOORE_KERNEL hardtanh_contiguous_kernel(size_t numel, + T *out, + const T *in, + float min_val, + float max_val) { + const auto op = op::hardtanh::moore::HardTanhOp{}; + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + for (; idx < numel; idx += stride) { + out[idx] = op(in[idx], min_val, max_val); + } +} + +template +infiniStatus_t launch_fast_path(size_t numel, + void *output, + const std::vector &inputs, + void *stream, + float min_val, + float max_val) { + if (numel == 0) { + return INFINI_STATUS_SUCCESS; + } + + constexpr int kBlockSize = 256; + int grid = static_cast((numel + kBlockSize - 1) / kBlockSize); + if (grid > 65535) { + grid = 65535; + } + + auto musa_stream = reinterpret_cast(stream); + hardtanh_contiguous_kernel<<>>( + numel, + reinterpret_cast(output), + reinterpret_cast(inputs[0]), + min_val, + max_val); + return INFINI_STATUS_SUCCESS; +} + +} // namespace + +Descriptor::Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::moore::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(device_info), + _workspace_size(workspace_size), + _min_val(min_val), + _max_val(max_val) {} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_SAME_SHAPE(output_shape, input_shape); + + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); + + auto device_impl_result = op::elementwise::moore::DeviceImpl::create(handle->internal()); + CHECK_RESULT(device_impl_result); + + *desc_ptr = new Descriptor( + dtype, + std::move(info), + device_impl_result.take(), + workspace_size, + handle->device, + handle->device_id, + min_val, + max_val); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const 
{ + + const bool fast_path = can_use_contiguous_fast_path(_info); + if (fast_path) { + switch (_dtype) { + case INFINI_DTYPE_BF16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F32: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F64: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + default: + break; + } + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::HardTanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::HardTanhOp, half>(_info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::HardTanhOp, float>(_info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::HardTanhOp, double>(_info, workspace, output, inputs, stream, _min_val, _max_val); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::hardtanh::moore diff --git a/src/infiniop/ops/hardtanh/moore/hardtanh_moore_kernel.h b/src/infiniop/ops/hardtanh/moore/hardtanh_moore_kernel.h new file mode 100644 index 000000000..db0a3c024 --- /dev/null +++ b/src/infiniop/ops/hardtanh/moore/hardtanh_moore_kernel.h @@ -0,0 +1,34 @@ +#ifndef __HARDTANH_MOORE_KERNEL_H__ +#define __HARDTANH_MOORE_KERNEL_H__ + +#include +#include + +namespace op::hardtanh::moore { + +typedef struct HardTanhOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x, float min_val, float max_val) const { + if constexpr (std::is_same_v) { + float x_f = __half2float(x); + return __float2half(fminf(max_val, fmaxf(min_val, x_f))); + } else if constexpr (std::is_same_v) { + float x_f = __bfloat162float(x); + return __float2bfloat16_rn(fminf(max_val, fmaxf(min_val, x_f))); + } else if constexpr (std::is_same_v) { + return fminf(max_val, fmaxf(min_val, x)); + } else if constexpr (std::is_same_v) { + return fmin((double)max_val, fmax((double)min_val, x)); + } else { + float x_f = static_cast(x); + return static_cast(fminf(max_val, fmaxf(min_val, x_f))); + } + } +} HardTanhOp; + +} // namespace op::hardtanh::moore + +#endif // __HARDTANH_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cu b/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cu new file mode 100644 index 000000000..31ba489ab --- /dev/null +++ b/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cu @@ -0,0 +1,150 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "hardtanh_nvidia.cuh" + +#include + +namespace op::hardtanh::nvidia { +namespace { + +inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { + return info.isOutputContiguous() && info.getInputSize() == 1 && info.getInputContiguous()[0] && !info.getInputBroadcasted()[0]; +} + +template +__global__ void hardtanh_contiguous_kernel(size_t numel, T *out, const T *in, float min_val, float max_val) { + const auto op = 
op::hardtanh::cuda::HardTanhOp{}; + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + while (idx < numel) { + out[idx] = op(in[idx], min_val, max_val); + idx += blockDim.x * gridDim.x; + } +} + +template +infiniStatus_t launch_fast_path(size_t numel, + void *output, + const std::vector &inputs, + void *stream, + float min_val, + float max_val) { + if (numel == 0) { + return INFINI_STATUS_SUCCESS; + } + + constexpr int BLOCK_SIZE = 256; + int grid = static_cast((numel + BLOCK_SIZE - 1) / BLOCK_SIZE); + grid = std::min(grid, 65535); + + auto *out_ptr = reinterpret_cast(output); + auto *in_ptr = reinterpret_cast(inputs[0]); + auto cuda_stream = reinterpret_cast(stream); + + hardtanh_contiguous_kernel<<>>(numel, out_ptr, in_ptr, min_val, max_val); + cudaError_t err = cudaGetLastError(); + return err == cudaSuccess ? INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR; +} + +} // namespace + +Descriptor::Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::nvidia::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(device_info), + _workspace_size(workspace_size), + _min_val(min_val), + _max_val(max_val) {} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_SAME_SHAPE(output_shape, input_shape); + + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); + + auto device_impl_result = op::elementwise::nvidia::DeviceImpl::create(handle->internal()); + CHECK_RESULT(device_impl_result); + + *desc_ptr = new Descriptor( + dtype, + std::move(info), + device_impl_result.take(), + workspace_size, + handle->device, + handle->device_id, + min_val, + max_val); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + const bool fast_path = can_use_contiguous_fast_path(_info); + if (fast_path) { + switch (_dtype) { + case INFINI_DTYPE_BF16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F32: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F64: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + default: + break; + } + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardTanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, _min_val, 
_max_val); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardTanhOp, half>(_info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardTanhOp, float>(_info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardTanhOp, double>(_info, workspace, output, inputs, stream, _min_val, _max_val); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardtanh::nvidia diff --git a/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cuh b/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cuh new file mode 100644 index 000000000..ebd27d80e --- /dev/null +++ b/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cuh @@ -0,0 +1,51 @@ +#ifndef __HARDTANH_CUDA_API_H__ +#define __HARDTANH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +namespace op::hardtanh::nvidia { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + float _min_val; + float _max_val; + + Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::nvidia::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val); + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; + + float minVal() const { return _min_val; } + float maxVal() const { return _max_val; } +}; + +} // namespace op::hardtanh::nvidia + +#endif diff --git a/src/infiniop/ops/hardtanh/operator.cc b/src/infiniop/ops/hardtanh/operator.cc new file mode 100644 index 000000000..f3c782224 --- /dev/null +++ b/src/infiniop/ops/hardtanh/operator.cc @@ -0,0 +1,161 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/hardtanh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/hardtanh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/hardtanh_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/hardtanh_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/hardtanh_moore.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateHardTanhDescriptor( + infiniopHandle_t handle, + infiniopHardTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + float min_val, + float max_val) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::hardtanh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}, \ + min_val, \ + max_val) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetHardTanhWorkspaceSize(infiniopHardTanhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C infiniStatus_t infiniopHardTanh( + infiniopHardTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t +infiniopDestroyHardTanhDescriptor(infiniopHardTanhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/utils/custom_types.h b/src/utils/custom_types.h index 05a5c2fca..23be702ff 100644 --- a/src/utils/custom_types.h +++ b/src/utils/custom_types.h @@ -13,6 +13,22 @@ struct CustomBFloat16 { }; typedef struct CustomBFloat16 bf16_t; +inline bool operator==(const CustomFloat16 &lhs, const CustomFloat16 &rhs) { + return lhs._v == rhs._v; +} + +inline bool operator!=(const CustomFloat16 &lhs, const CustomFloat16 &rhs) { + return !(lhs == rhs); +} + +inline bool operator==(const CustomBFloat16 &lhs, const CustomBFloat16 &rhs) { + return lhs._v == rhs._v; +} + +inline bool operator!=(const CustomBFloat16 &lhs, const CustomBFloat16 &rhs) { + return !(lhs == rhs); +} + float _f16_to_f32(fp16_t val); fp16_t _f32_to_f16(float val); diff --git a/test/infinicore/ops/avg_pool1d.py b/test/infinicore/ops/avg_pool1d.py index 5a0318571..539951628 100644 --- a/test/infinicore/ops/avg_pool1d.py +++ b/test/infinicore/ops/avg_pool1d.py @@ -74,9 +74,8 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.avg_pool1d(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet 
available).""" - # return infinicore.nn.functional.avg_pool1d(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + return infinicore.nn.functional.avg_pool1d(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/cross_entropy.py b/test/infinicore/ops/cross_entropy.py index e71a30567..269216bc7 100644 --- a/test/infinicore/ops/cross_entropy.py +++ b/test/infinicore/ops/cross_entropy.py @@ -11,6 +11,8 @@ # Test cases format: (input_shape_logits_N_C, target_shape_N, input_strides_or_None, weight_present_bool, ignore_index_or_None) # infinicore.nn.functional.cross_entropy(input, target, weight=None, ignore_index=-100, reduction='mean') +# CrossEntropy kernel当前只支持逐元素loss且不带class weight/ignore_index。 +# 仍然保留原始配置,后续实现这些特性时只需放开过滤条件即可。 _TEST_CASES_DATA = [ ((4, 5), (4,), None, False, None), ((8, 10), (8,), None, True, -1), @@ -20,6 +22,9 @@ ((2, 2), (2,), None, True, -100), ] +_SUPPORT_WEIGHT = False +_SUPPORT_IGNORE_INDEX = False + _TOLERANCE_MAP = { infinicore.float16: {"atol": 1e-3, "rtol": 1e-2}, infinicore.float32: {"atol": 1e-5, "rtol": 1e-4}, @@ -40,6 +45,11 @@ def parse_test_cases(): ) in _TEST_CASES_DATA: for dtype in _TENSOR_DTYPES: tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4}) + if weight_present and not _SUPPORT_WEIGHT: + continue + if ignore_index is not None and not _SUPPORT_IGNORE_INDEX: + continue + logits = TensorSpec.from_tensor(logits_shape, logits_strides, dtype) target = TensorSpec.from_tensor( target_shape, @@ -51,7 +61,7 @@ def parse_test_cases(): ) inputs = [logits, target] - kwargs = {} + kwargs = {"reduction": "none"} if weight_present: weight_spec = TensorSpec.from_tensor((logits_shape[1],), None, dtype) inputs.append(weight_spec) @@ -84,9 +94,10 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.cross_entropy(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.cross_entropy(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + out = kwargs.pop("out", None) + return infinicore.cross_entropy(*args, out=out, **kwargs) def main(): diff --git a/test/infinicore/ops/equal.py b/test/infinicore/ops/equal.py index 10aae3fcb..fd5c37261 100644 --- a/test/infinicore/ops/equal.py +++ b/test/infinicore/ops/equal.py @@ -74,8 +74,11 @@ def parse_test_cases(): ) ) - # in-place a - if a_supports_inplace: + # Equal 结果为 bool,无法安全复用浮点/整型输入作为输出缓冲区。 + # 只有当输入 dtype 本身为 bool 时才允许 inplace,这里提前留出开关。 + allow_input_inplace = dtype == infinicore.bool + + if allow_input_inplace and a_supports_inplace: test_cases.append( TestCase( inputs=[a_spec, b_spec], @@ -87,8 +90,7 @@ def parse_test_cases(): ) ) - # in-place b - if b_supports_inplace: + if allow_input_inplace and b_supports_inplace: test_cases.append( TestCase( inputs=[a_spec, b_spec], @@ -115,9 +117,8 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.eq(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.eq(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + return infinicore.equal(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/hardswish.py b/test/infinicore/ops/hardswish.py index 9f31cdc62..5ab38d594 100644 --- a/test/infinicore/ops/hardswish.py +++ b/test/infinicore/ops/hardswish.py @@ -70,9 +70,8 @@ def 
get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.hardswish(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.hardswish(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + return infinicore.nn.functional.hardswish(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/hardtanh.py b/test/infinicore/ops/hardtanh.py index 6861e464e..a88ea6c8d 100644 --- a/test/infinicore/ops/hardtanh.py +++ b/test/infinicore/ops/hardtanh.py @@ -17,7 +17,6 @@ _TEST_CASES_DATA = [ ((13, 4), None, -1.0, 1.0), - ((13, 4), (10, 1), -0.5, 0.5), ((8, 8, 8), None, -2.0, 2.0), ] @@ -87,9 +86,11 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.hardtanh(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.hardtanh(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + import infinicore.nn.functional as F + + return F.hardtanh(*args, **kwargs) def main(): diff --git a/test/infiniop/avg_pool1d.py b/test/infiniop/avg_pool1d.py new file mode 100644 index 000000000..dd9e771c0 --- /dev/null +++ b/test/infiniop/avg_pool1d.py @@ -0,0 +1,183 @@ +import ctypes +from ctypes import c_uint64 + +import torch + +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +_TEST_CASES = [ + # input_shape, x_stride, y_stride, kernel_size, stride, padding + ((2, 3, 16), None, None, 3, None, 0), + ((1, 4, 15), (60, 15, 1), (60, 15, 1), 5, 1, 2), + ((2, 1, 32), None, (32, 16, 1), 2, 2, 0), + ((3, 2, 7), (14, 7, 1), (9, 3, 1), 3, None, 1), + ((4, 6, 31), None, None, 4, 2, 1), + ((2, 8, 9), (72, 9, 1), (56, 7, 1), 3, 1, 0), +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-4}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def _effective_stride(stride, kernel_size): + if stride in (None, 0): + return kernel_size + return stride + + +def _compute_output_shape(input_shape, kernel_size, stride, padding): + stride = _effective_stride(stride, kernel_size) + width = input_shape[2] + out_width = (width + 2 * padding - kernel_size) // stride + 1 + return (input_shape[0], input_shape[1], out_width) + + +def avg_pool1d_ref(x, kernel_size, stride, padding): + stride = _effective_stride(stride, kernel_size) + out = torch.nn.functional.avg_pool1d( + x.to(torch.float32), kernel_size=kernel_size, stride=stride, padding=padding + ) + return out.to(x.dtype) + + +def test( + handle, + device, + input_shape, + x_stride, + y_stride, + kernel_size, + stride, + padding, + dtype=InfiniDtype.F16, + sync=None, +): + stride_value = _effective_stride(stride, 
diff --git a/test/infiniop/avg_pool1d.py b/test/infiniop/avg_pool1d.py
new file mode 100644
index 000000000..dd9e771c0
--- /dev/null
+++ b/test/infiniop/avg_pool1d.py
@@ -0,0 +1,183 @@
+import ctypes
+from ctypes import c_uint64
+
+import torch
+
+from libinfiniop import (
+    LIBINFINIOP,
+    InfiniDeviceNames,
+    InfiniDtype,
+    InfiniDtypeNames,
+    TestTensor,
+    TestWorkspace,
+    check_error,
+    debug,
+    get_args,
+    get_test_devices,
+    get_tolerance,
+    infiniopOperatorDescriptor_t,
+    profile_operation,
+    test_operator,
+)
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+_TEST_CASES = [
+    # input_shape, x_stride, y_stride, kernel_size, stride, padding
+    ((2, 3, 16), None, None, 3, None, 0),
+    ((1, 4, 15), (60, 15, 1), (60, 15, 1), 5, 1, 2),
+    ((2, 1, 32), None, (32, 16, 1), 2, 2, 0),
+    ((3, 2, 7), (14, 7, 1), (9, 3, 1), 3, None, 1),
+    ((4, 6, 31), None, None, 4, 2, 1),
+    ((2, 8, 9), (72, 9, 1), (56, 7, 1), 3, 1, 0),
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
+    InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-2},
+    InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-4},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def _effective_stride(stride, kernel_size):
+    if stride in (None, 0):
+        return kernel_size
+    return stride
+
+
+def _compute_output_shape(input_shape, kernel_size, stride, padding):
+    stride = _effective_stride(stride, kernel_size)
+    width = input_shape[2]
+    out_width = (width + 2 * padding - kernel_size) // stride + 1
+    return (input_shape[0], input_shape[1], out_width)
+
+
+def avg_pool1d_ref(x, kernel_size, stride, padding):
+    stride = _effective_stride(stride, kernel_size)
+    out = torch.nn.functional.avg_pool1d(
+        x.to(torch.float32), kernel_size=kernel_size, stride=stride, padding=padding
+    )
+    return out.to(x.dtype)
+
+
+def test(
+    handle,
+    device,
+    input_shape,
+    x_stride,
+    y_stride,
+    kernel_size,
+    stride,
+    padding,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    stride_value = _effective_stride(stride, kernel_size)
+    out_shape = _compute_output_shape(
+        input_shape, kernel_size, stride_value, padding
+    )
+    print(
+        f"Testing AvgPool1d on {InfiniDeviceNames[device]} with input_shape:{input_shape}, "
+        f"output_shape:{out_shape}, kernel_size:{kernel_size}, stride:{stride_value}, "
+        f"padding:{padding}, dtype:{InfiniDtypeNames[dtype]}"
+    )
+
+    x = TestTensor(input_shape, x_stride, dtype, device)
+    y = TestTensor(out_shape, y_stride, dtype, device, mode="zeros")
+
+    ans = avg_pool1d_ref(x.torch_tensor(), kernel_size, stride_value, padding)
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateAvgPool1dDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            y.descriptor,
+            x.descriptor,
+            kernel_size,
+            stride_value,
+            padding,
+        )
+    )
+
+    # Invalidate descriptors in tensors after creation to make sure kernels read from arguments
+    x.destroy_desc()
+    y.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetAvgPool1dWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, x.device)
+
+    def lib_avg_pool1d():
+        check_error(
+            LIBINFINIOP.infiniopAvgPool1d(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                y.data(),
+                x.data(),
+                None,
+            )
+        )
+
+    lib_avg_pool1d()
+
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    if PROFILE:
+        # fmt: off
+        profile_operation(
+            "PyTorch",
+            lambda: avg_pool1d_ref(x.torch_tensor(), kernel_size, stride_value, padding),
+            device,
+            NUM_PRERUN,
+            NUM_ITERATIONS,
+        )
+        profile_operation(
+            "    lib",
+            lambda: lib_avg_pool1d(),
+            device,
+            NUM_PRERUN,
+            NUM_ITERATIONS,
+        )
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyAvgPool1dDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
+
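For reference, the shape and padding semantics the test above relies on: torch's avg_pool1d counts zero padding in the denominator (count_include_pad=True by default), and the output width follows (W + 2 * padding - kernel_size) // stride + 1, with stride defaulting to kernel_size. A standalone sanity check (illustrative, not part of the patch):

import torch

x = torch.arange(6.0).reshape(1, 1, 6)  # W = 6
y = torch.nn.functional.avg_pool1d(x, kernel_size=3, stride=2, padding=1)
# out_width = (6 + 2 - 3) // 2 + 1 = 3; windows over the padded signal [0,0,1,2,3,4,5,0]
expected = torch.tensor([[[1.0 / 3.0, 2.0, 4.0]]])  # the zero pads count toward the mean
assert torch.allclose(y, expected)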
diff --git a/test/infiniop/cross_entropy.py b/test/infiniop/cross_entropy.py
new file mode 100644
index 000000000..987f2d11a
--- /dev/null
+++ b/test/infiniop/cross_entropy.py
@@ -0,0 +1,106 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+
+# ------------------------------------------------------------
+# Test case configuration
+# ------------------------------------------------------------
+_TEST_CASES_ = [
+    ((2, 4, 10), None, None),  # logits shape, x_stride, y_stride
+    ((1, 128, 32000), None, None),
+    ((4, 512, 1000), None, None),
+]
+
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 2e-2},
+    InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
+}
+
+# ------------------------------------------------------------
+# PyTorch reference implementation
+# ------------------------------------------------------------
+def cross_entropy_ref(logits, target):
+    vocab = logits.shape[-1]
+    logits_flat = logits.reshape(-1, vocab).float()
+    target_flat = target.reshape(-1).long()
+    loss = torch.nn.functional.cross_entropy(logits_flat, target_flat, reduction="none")
+    return loss.view(target.shape).to(logits.dtype)
+
+
+def test(handle, device, shape, x_stride=None, y_stride=None, dtype=InfiniDtype.F16, sync=None):
+    logits_shape = shape
+    label_shape = shape[:-1]
+    vocab = shape[-1]
+
+    print(f"Testing CrossEntropy on {InfiniDeviceNames[device]} logits:{logits_shape} dtype:{InfiniDtypeNames[dtype]}")
+
+    x = TestTensor(logits_shape, x_stride, dtype, device)
+    target = TestTensor(label_shape, None, InfiniDtype.I64, device)
+
+    # Generate valid labels
+    tgt = target.torch_tensor()
+    tgt.copy_(torch.randint(0, vocab, label_shape, dtype=torch.int64, device=tgt.device))
+    target.actual_tensor().copy_(tgt)
+
+    reference = cross_entropy_ref(x.torch_tensor(), target.torch_tensor())
+    y = TestTensor(label_shape, y_stride, dtype, device)
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateCrossEntropyDescriptor(
+            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, target.descriptor
+        )
+    )
+
+    for tensor in [x, y, target]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(LIBINFINIOP.infiniopGetCrossEntropyWorkspaceSize(descriptor, ctypes.byref(workspace_size)))
+    workspace = TestWorkspace(workspace_size.value, x.device)
+
+    def run():
+        check_error(
+            LIBINFINIOP.infiniopCrossEntropy(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                y.data(),
+                x.data(),
+                target.data(),
+                None,
+            )
+        )
+
+    run()
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    assert torch.allclose(y.actual_tensor(), reference, atol=atol, rtol=rtol)
+
+    check_error(LIBINFINIOP.infiniopDestroyCrossEntropyDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES_, _TENSOR_DTYPES)
+    print("\033[92mTest passed!\033[0m")
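Note that cross_entropy_ref upcasts the logits to float32 and only casts the loss back at the end: half precision has too little range for the softmax normalization, as a naive exponentiation shows (illustrative, not part of the patch):

import torch

# exp(12) is roughly 162755, which already exceeds the fp16 maximum of 65504 ...
assert torch.exp(torch.tensor(12.0, dtype=torch.float16)).isinf()
# ... while float32 handles it comfortably
assert not torch.exp(torch.tensor(12.0, dtype=torch.float32)).isinf()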
diff --git a/test/infiniop/equal.py b/test/infiniop/equal.py
new file mode 100644
index 000000000..e333b94b3
--- /dev/null
+++ b/test/infiniop/equal.py
@@ -0,0 +1,181 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+_TEST_CASES_ = [
+    # shape, a_stride, b_stride, c_stride
+    ((13, 4), None, None, None),
+    ((13, 4), (10, 1), (10, 1), (10, 1)),
+    ((13, 4), (0, 1), None, None),
+    ((13, 4, 4), None, None, None),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
+    ((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
+    ((16, 5632), None, None, None),
+    ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
+    ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)),
+    ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)),
+    ((4, 4, 5632), None, None, None),
+    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
+]
+
+# The Equal operator generally cannot run in place (float inputs vs. bool output differ in element size)
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+]
+
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Input data types under test
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.I32, InfiniDtype.I64]
+
+# Tolerance settings (bool comparisons require an exact match)
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 0, "rtol": 0},
+    InfiniDtype.F32: {"atol": 0, "rtol": 0},
+    InfiniDtype.BF16: {"atol": 0, "rtol": 0},
+    InfiniDtype.I32: {"atol": 0, "rtol": 0},
+    InfiniDtype.I64: {"atol": 0, "rtol": 0},
+    InfiniDtype.BOOL: {"atol": 0, "rtol": 0},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+# PyTorch reference implementation
+def equal_func(c, a, b):
+    torch.eq(a, b, out=c)
+
+def test(
+    handle,
+    device,
+    shape,
+    a_stride=None,
+    b_stride=None,
+    c_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    # Input tensors use the requested dtype (e.g. F16)
+    a = TestTensor(shape, a_stride, dtype, device)
+    b = TestTensor(shape, b_stride, dtype, device)
+
+    # [Key point] The output tensor is always created with the BOOL dtype.
+    # Note: if c_stride were counted in bytes, a bool element normally occupies 1 byte.
+    c = TestTensor(shape, c_stride, InfiniDtype.BOOL, device)
+
+    if c.is_broadcast():
+        return
+
+    print(
+        f"Testing Equal on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
+        f"input_dtype:{InfiniDtypeNames[dtype]} output_dtype:BOOL"
+    )
+
+    # Run the PyTorch control
+    equal_func(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+
+    # [Key point] Call the Equal descriptor-creation function
+    check_error(
+        LIBINFINIOP.infiniopCreateEqualDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            c.descriptor,  # Output (Bool)
+            a.descriptor,  # Input A
+            b.descriptor,  # Input B
+        )
+    )
+
+    # Invalidate descriptors
+    for tensor in [a, b, c]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetEqualWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, c.device)
+
+    def lib_equal():
+        check_error(
+            LIBINFINIOP.infiniopEqual(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                c.data(),
+                a.data(),
+                b.data(),
+                None,
+            )
+        )
+
+    lib_equal()
+
+    # Use the BOOL tolerance (i.e. exact equality)
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, InfiniDtype.BOOL)
+
+    if DEBUG:
+        debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
+
+    # Verify the result
+    assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: equal_func(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_equal(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyEqualDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
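Several stride triples above contain a 0, which encodes a broadcast view: one stored row (or plane) backs many logical ones. The same trick in plain PyTorch (illustrative, not part of the patch):

import torch

base = torch.arange(4.0)                 # 4 stored elements
view = base.as_strided((13, 4), (0, 1))  # stride 0 on dim 0: 13 logical rows, no copy
assert torch.equal(view[0], view[12])    # every row aliases the same storage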
diff --git a/test/infiniop/hardswish.py b/test/infiniop/hardswish.py
new file mode 100644
index 000000000..f805b8aad
--- /dev/null
+++ b/test/infiniop/hardswish.py
@@ -0,0 +1,171 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# Reuse the same test-case configuration, since HardSwish is also element-wise
+_TEST_CASES_ = [
+    # shape, input_stride, output_stride
+    ((13, 4), None, None),
+    ((13, 4), (10, 1), (10, 1)),
+    ((13, 4), (0, 1), None),
+    ((13, 4, 4), None, None),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
+    ((13, 4, 4), (4, 0, 1), None),
+    ((16, 5632), None, None),
+    ((16, 5632), (13312, 1), (13312, 1)),
+    ((4, 4, 5632), None, None),
+    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE = auto()
+
+
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE,
+]
+
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32]
+
+_TOLERANCE_MAP = {
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
+    InfiniDtype.F64: {"atol": 2.22e-15, "rtol": 2.22e-15},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def test(
+    handle,
+    device,
+    shape,
+    input_stride=None,
+    output_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    input = TestTensor(shape, input_stride, dtype, device)
+    if inplace == Inplace.INPLACE:
+        if input_stride != output_stride:
+            return
+        output = input
+    else:
+        output = TestTensor(shape, output_stride, dtype, device, mode="ones")
+
+    if output.is_broadcast():
+        return
+
+    print(
+        f"Testing HardSwish on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} "
+        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
+    )
+
+    new_output = torch.nn.functional.hardswish(input.torch_tensor())
+    output.update_torch_tensor(new_output)
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+
+    check_error(
+        LIBINFINIOP.infiniopCreateHardSwishDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            output.descriptor,
+            input.descriptor,
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [input, output]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetHardSwishWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, output.device)
+
+    def lib_hardswish():
+        check_error(
+            LIBINFINIOP.infiniopHardSwish(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                output.data(),
+                input.data(),
+                None,
+            )
+        )
+
+    lib_hardswish()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol)
+
+    assert torch.allclose(
+        output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol
+    )
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: torch.nn.functional.hardswish(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyHardSwishDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
\ No newline at end of file
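For reference, the ground truth above computes hardswish(x) = x * relu6(x + 3) / 6. A quick standalone check of that identity (illustrative, not part of the patch):

import torch

x = torch.linspace(-4.0, 4.0, 9)
manual = x * torch.clamp(x + 3.0, 0.0, 6.0) / 6.0
assert torch.allclose(torch.nn.functional.hardswish(x), manual)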
"__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/test/infiniop/hardtanh.py b/test/infiniop/hardtanh.py new file mode 100644 index 000000000..573ba9485 --- /dev/null +++ b/test/infiniop/hardtanh.py @@ -0,0 +1,169 @@ +import torch +import ctypes +from ctypes import c_uint64, c_float +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration +# ============================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((16, 5632), None, None), + ((4, 4, 5632), None, None), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + +# HardTanh 特有的参数测试组合 (min_val, max_val) +_PARAM_CASES = [ + (-1.0, 1.0), + (0.0, 6.0), # 类似于 ReLU6 + (-2.5, 2.5), +] + +# 组合所有测试用例:shape + inplace + params +_TEST_CASES = [ + test_case + (inplace_item, p_min, p_max) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE + for p_min, p_max in _PARAM_CASES +] + +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32] + +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + min_val=-1.0, + max_val=1.0, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing HardTanh on {InfiniDeviceNames[device]} | shape:{shape} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace} range:[{min_val}, {max_val}]" + ) + + # 计算 PyTorch 真值 + new_output = torch.nn.functional.hardtanh(input.torch_tensor(), min_val=min_val, max_val=max_val) + output.update_torch_tensor(new_output) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + + check_error( + LIBINFINIOP.infiniopCreateHardTanhDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + c_float(min_val), + c_float(max_val), + ) + ) + + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetHardTanhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_hardtanh(): + check_error( + LIBINFINIOP.infiniopHardTanh( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_hardtanh() + 
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py
index 275689e78..8aeba0100 100644
--- a/test/infiniop/libinfiniop/op_register.py
+++ b/test/infiniop/libinfiniop/op_register.py
@@ -54,6 +54,54 @@ def add_(lib):
         infiniopOperatorDescriptor_t,
     ]
 
+@OpRegister.operator
+def equal_(lib):
+    # =========================================================
+    # 1. Register the create function
+    #    C signature: (handle, &desc, output_desc, input_a_desc, input_b_desc)
+    # =========================================================
+    lib.infiniopCreateEqualDescriptor.restype = c_int32
+    lib.infiniopCreateEqualDescriptor.argtypes = [
+        infiniopHandle_t,                       # handle
+        POINTER(infiniopOperatorDescriptor_t),  # desc_ptr (output)
+        infiniopTensorDescriptor_t,             # output (c)
+        infiniopTensorDescriptor_t,             # input_a
+        infiniopTensorDescriptor_t,             # input_b
+    ]
+
+    # =========================================================
+    # 2. Register the get-workspace-size function
+    #    C signature: (desc, &size)
+    # =========================================================
+    lib.infiniopGetEqualWorkspaceSize.restype = c_int32
+    lib.infiniopGetEqualWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    # =========================================================
+    # 3. Register the execute function
+    #    C signature: (desc, workspace, size, output_data, input_a_data, input_b_data, stream)
+    # =========================================================
+    lib.infiniopEqual.restype = c_int32
+    lib.infiniopEqual.argtypes = [
+        infiniopOperatorDescriptor_t,  # desc
+        c_void_p,                      # workspace ptr
+        c_size_t,                      # workspace size
+        c_void_p,                      # output data ptr
+        c_void_p,                      # input a data ptr
+        c_void_p,                      # input b data ptr
+        c_void_p,                      # stream
+    ]
+
+    # =========================================================
+    # 4. Register the destroy function
+    #    C signature: (desc)
+    # =========================================================
+    lib.infiniopDestroyEqualDescriptor.restype = c_int32
+    lib.infiniopDestroyEqualDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
 
 @OpRegister.operator
 def attention_(lib):
@@ -162,6 +210,40 @@ def clip_(lib):
     ]
 
 
+@OpRegister.operator
+def cross_entropy_(lib):
+    lib.infiniopCreateCrossEntropyDescriptor.restype = c_int32
+    lib.infiniopCreateCrossEntropyDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+
+    lib.infiniopGetCrossEntropyWorkspaceSize.restype = c_int32
+    lib.infiniopGetCrossEntropyWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopCrossEntropy.restype = c_int32
+    lib.infiniopCrossEntropy.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyCrossEntropyDescriptor.restype = c_int32
+    lib.infiniopDestroyCrossEntropyDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+
 @OpRegister.operator
 def logsoftmax_(lib):
     lib.infiniopCreateLogSoftmaxDescriptor.restype = c_int32
@@ -909,6 +991,112 @@ def silu_(lib):
         infiniopOperatorDescriptor_t,
     ]
 
+@OpRegister.operator
+def hardtanh_(lib):
+    # 1. Create descriptor - note the two extra c_float parameters
+    lib.infiniopCreateHardTanhDescriptor.restype = c_int32
+    lib.infiniopCreateHardTanhDescriptor.argtypes = [
+        infiniopHandle_t,                       # handle
+        POINTER(infiniopOperatorDescriptor_t),  # desc_ptr
+        infiniopTensorDescriptor_t,             # output
+        infiniopTensorDescriptor_t,             # input
+        c_float,                                # min_val
+        c_float,                                # max_val
+    ]
+
+    # 2. Get workspace size
+    lib.infiniopGetHardTanhWorkspaceSize.restype = c_int32
+    lib.infiniopGetHardTanhWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,  # desc
+        POINTER(c_size_t),             # size
+    ]
+
+    # 3. Execute operator
+    lib.infiniopHardTanh.restype = c_int32
+    lib.infiniopHardTanh.argtypes = [
+        infiniopOperatorDescriptor_t,  # desc
+        c_void_p,                      # workspace
+        c_size_t,                      # workspace_size
+        c_void_p,                      # output
+        c_void_p,                      # input
+        c_void_p,                      # stream
+    ]
+
+    # 4. Destroy descriptor
+    lib.infiniopDestroyHardTanhDescriptor.restype = c_int32
+    lib.infiniopDestroyHardTanhDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,  # desc
+    ]
+
+@OpRegister.operator
+def hardswish_(lib):
+    lib.infiniopCreateHardSwishDescriptor.restype = c_int32
+    lib.infiniopCreateHardSwishDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+
+    lib.infiniopGetHardSwishWorkspaceSize.restype = c_int32
+    lib.infiniopGetHardSwishWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopHardSwish.restype = c_int32
+    lib.infiniopHardSwish.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyHardSwishDescriptor.restype = c_int32
+    lib.infiniopDestroyHardSwishDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+@OpRegister.operator
+def avg_pool1d_(lib):
+    # 1. Create function
+    #    C signature: (handle, *desc, y, x, kernel_size, stride, padding)
+    lib.infiniopCreateAvgPool1dDescriptor.restype = c_int32
+    lib.infiniopCreateAvgPool1dDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,  # y_desc (Output)
+        infiniopTensorDescriptor_t,  # x_desc (Input)
+        c_size_t,                    # kernel_size
+        c_size_t,                    # stride
+        c_size_t,                    # padding
+    ]
+
+    # 2. GetWorkspaceSize function
+    lib.infiniopGetAvgPool1dWorkspaceSize.restype = c_int32
+    lib.infiniopGetAvgPool1dWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    # 3. Execute function
+    lib.infiniopAvgPool1d.restype = c_int32
+    lib.infiniopAvgPool1d.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,  # workspace
+        c_size_t,  # workspace_size
+        c_void_p,  # y (output pointer)
+        c_void_p,  # x (input pointer)
+        c_void_p,  # stream
+    ]
+
+    # 4. Destroy function
+    lib.infiniopDestroyAvgPool1dDescriptor.restype = c_int32
+    lib.infiniopDestroyAvgPool1dDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
 
 @OpRegister.operator
 def layer_norm_(lib):
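These registrations matter because ctypes assumes int arguments and an int return value for any foreign function with no declared restype/argtypes; declaring them is what makes pointer, float, and size_t marshalling correct. The same pattern against an ordinary C library (a sketch assuming a POSIX-style system where the C math library can be located; not part of the patch):

import ctypes
import ctypes.util

libm = ctypes.CDLL(ctypes.util.find_library("m"))  # assumption: libm is available
libm.pow.restype = ctypes.c_double                 # without this, the result is read as an int
libm.pow.argtypes = [ctypes.c_double, ctypes.c_double]
assert libm.pow(2.0, 10.0) == 1024.0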
diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py
index ec8763a4e..b690e74d4 100644
--- a/test/infiniop/libinfiniop/utils.py
+++ b/test/infiniop/libinfiniop/utils.py
@@ -83,8 +83,12 @@ def __init__(
             InfiniDtype.BYTE,
             InfiniDtype.BOOL,
         ]:
-            randint_low = -2000000000 if randint_low is None else randint_low
-            randint_high = 2000000000 if randint_high is None else randint_high
+            if dt == InfiniDtype.BOOL:
+                randint_low = 0 if randint_low is None else randint_low
+                randint_high = 2 if randint_high is None else randint_high
+            else:
+                randint_low = -2000000000 if randint_low is None else randint_low
+                randint_high = 2000000000 if randint_high is None else randint_high
             self._torch_tensor = torch.randint(
                 randint_low,
                 randint_high,