diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp
index 5274dde48..e3ca88797 100644
--- a/include/infinicore/ops.hpp
+++ b/include/infinicore/ops.hpp
@@ -5,12 +5,17 @@
 #include "ops/add_rms_norm.hpp"
 #include "ops/asinh.hpp"
 #include "ops/attention.hpp"
+#include "ops/avg_pool1d.hpp"
 #include "ops/baddbmm.hpp"
 #include "ops/bilinear.hpp"
 #include "ops/causal_softmax.hpp"
+#include "ops/cross_entropy.hpp"
 #include "ops/embedding.hpp"
+#include "ops/equal.hpp"
 #include "ops/flash_attention.hpp"
 #include "ops/fmod.hpp"
+#include "ops/hardswish.hpp"
+#include "ops/hardtanh.hpp"
 #include "ops/kv_caching.hpp"
 #include "ops/matmul.hpp"
 #include "ops/ones.hpp"
diff --git a/include/infinicore/ops/avg_pool1d.hpp b/include/infinicore/ops/avg_pool1d.hpp
new file mode 100644
index 000000000..4bf69bc2a
--- /dev/null
+++ b/include/infinicore/ops/avg_pool1d.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class AvgPool1d {
+public:
+    using schema = void (*)(Tensor, Tensor, size_t, size_t, size_t);
+    static void execute(Tensor output, Tensor input, size_t kernel_size, size_t stride, size_t padding);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor avg_pool1d(Tensor input, size_t kernel_size, size_t stride = 0, size_t padding = 0);
+void avg_pool1d_(Tensor output, Tensor input, size_t kernel_size, size_t stride = 0, size_t padding = 0);
+
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/cross_entropy.hpp b/include/infinicore/ops/cross_entropy.hpp
new file mode 100644
index 000000000..958ee1089
--- /dev/null
+++ b/include/infinicore/ops/cross_entropy.hpp
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class CrossEntropy {
+public:
+    // Schema: the function-pointer type for this op.
+    // CrossEntropy takes three Tensors: output (loss), input (logits), target (labels).
+    using schema = void (*)(Tensor, Tensor, Tensor);
+
+    // Execution entry point.
+    static void execute(Tensor output, Tensor input, Tensor target);
+
+    // Dispatcher accessor.
+    static common::OpDispatcher &dispatcher();
+};
+
+// ==================================================================
+// Public functional API
+// ==================================================================
+
+// 1. Out-of-place variant:
+//    takes logits and target, allocates the output tensor internally, and returns it.
+Tensor cross_entropy(Tensor input, Tensor target);
+
+// 2. Explicit-output variant (in-place style):
+//    the caller provides the output tensor that receives the result.
+//    Note: despite the trailing underscore, cross entropy cannot actually modify
+//    `input` in place; the suffix only means "write into the given output buffer".
+void cross_entropy_(Tensor output, Tensor input, Tensor target);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/equal.hpp b/include/infinicore/ops/equal.hpp
new file mode 100644
index 000000000..1a158bf1e
--- /dev/null
+++ b/include/infinicore/ops/equal.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Equal {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor);
+
+    static void execute(Tensor out, Tensor a, Tensor b);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor equal(Tensor a, Tensor b);
+void equal_(Tensor out, Tensor a, Tensor b);
+
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/hardswish.hpp b/include/infinicore/ops/hardswish.hpp
new file mode 100644
index 000000000..15313f461
--- /dev/null
+++ b/include/infinicore/ops/hardswish.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Hardswish {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor output, Tensor input);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor hardswish(Tensor input);
+void hardswish_(Tensor output, Tensor input);
+
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/hardtanh.hpp b/include/infinicore/ops/hardtanh.hpp
new file mode 100644
index 000000000..511408fee
--- /dev/null
+++ b/include/infinicore/ops/hardtanh.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class HardTanh {
+public:
+    using schema = void (*)(Tensor, Tensor, float, float);
+    static void execute(Tensor output, Tensor input, float min_val, float max_val);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor hardtanh(Tensor input, float min_val = -1.0f, float max_val = 1.0f);
+void hardtanh_(Tensor output, Tensor input, float min_val = -1.0f, float max_val = 1.0f);
+
+} // namespace infinicore::op
diff --git a/include/infiniop.h b/include/infiniop.h
index 4217183f7..a73bd20a0 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -45,4 +45,10 @@
 #include "infiniop/ops/zeros.h"
 #include "infiniop/tensor_descriptor.h"
 
+#include "infiniop/ops/avg_pool1d.h"
+#include "infiniop/ops/cross_entropy.h"
+#include "infiniop/ops/equal.h"
+#include "infiniop/ops/hardswish.h"
+#include "infiniop/ops/hardtanh.h"
+
 #endif // __INFINIOP_API_H__
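[review note] For reference, the contract documented in cross_entropy.hpp above (and in the Python wrapper later in this patch) is token-wise: the loss has the target's shape and no reduction is applied, i.e. loss[i] = logsumexp(logits[i]) - logits[i, target[i]]. A minimal NumPy sketch of that semantics, assuming integer class labels over the last axis; the name cross_entropy_ref is illustrative, this is not the shipped kernel:

    import numpy as np

    def cross_entropy_ref(logits, target):
        # loss[i] = logsumexp(logits[i]) - logits[i, target[i]]
        m = logits.max(axis=-1, keepdims=True)  # subtract max for a stable exp
        lse = m.squeeze(-1) + np.log(np.exp(logits - m).sum(axis=-1))
        picked = np.take_along_axis(logits, target[..., None], axis=-1).squeeze(-1)
        return lse - picked  # same shape as target, logits dtype

diff --git a/include/infiniop/ops/avg_pool1d.h b/include/infiniop/ops/avg_pool1d.h
new file mode 100644
index 000000000..5c0bdf6ea
--- /dev/null
+++ b/include/infiniop/ops/avg_pool1d.h
@@ -0,0 +1,32 @@
+#ifndef __INFINIOP_AVG_POOL1D_API_H__
+#define __INFINIOP_AVG_POOL1D_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopAvgPool1dDescriptor_t;
+
+__INFINI_C __export infiniStatus_t infiniopCreateAvgPool1dDescriptor(
+    infiniopHandle_t handle,
+    infiniopAvgPool1dDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t output,
+    infiniopTensorDescriptor_t input,
+    size_t kernel_size,
+    size_t stride,
+    size_t padding);
+
+__INFINI_C __export infiniStatus_t infiniopGetAvgPool1dWorkspaceSize(
+    infiniopAvgPool1dDescriptor_t desc,
+    size_t *size);
+
+__INFINI_C __export infiniStatus_t infiniopAvgPool1d(
+    infiniopAvgPool1dDescriptor_t desc,
+    void *workspace,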
size_t workspace_size, + void *output, + const void *input, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyAvgPool1dDescriptor( + infiniopAvgPool1dDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/cross_entropy.h b/include/infiniop/ops/cross_entropy.h new file mode 100644 index 000000000..6c9c2a773 --- /dev/null +++ b/include/infiniop/ops/cross_entropy.h @@ -0,0 +1,31 @@ +#ifndef __INFINIOP_CROSS_ENTROPY_API_H__ +#define __INFINIOP_CROSS_ENTROPY_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCrossEntropyDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateCrossEntropyDescriptor( + infiniopHandle_t handle, + infiniopCrossEntropyDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t target_desc); + +__INFINI_C __export infiniStatus_t infiniopGetCrossEntropyWorkspaceSize( + infiniopCrossEntropyDescriptor_t desc, + size_t *size); + +__INFINI_C __export infiniStatus_t infiniopCrossEntropy( + infiniopCrossEntropyDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *target, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyCrossEntropyDescriptor( + infiniopCrossEntropyDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/equal.h b/include/infiniop/ops/equal.h new file mode 100644 index 000000000..5476f754c --- /dev/null +++ b/include/infiniop/ops/equal.h @@ -0,0 +1,31 @@ +#ifndef __INFINIOP_EQUAL_API_H__ +#define __INFINIOP_EQUAL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopEqualDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateEqualDescriptor( + infiniopHandle_t handle, + infiniopEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__INFINI_C __export infiniStatus_t infiniopGetEqualWorkspaceSize( + infiniopEqualDescriptor_t desc, + size_t *size); + +__INFINI_C __export infiniStatus_t infiniopEqual( + infiniopEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyEqualDescriptor( + infiniopEqualDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h new file mode 100644 index 000000000..ba5b43b77 --- /dev/null +++ b/include/infiniop/ops/hardswish.h @@ -0,0 +1,29 @@ +#ifndef __INFINIOP_HARDSWISH_API_H__ +#define __INFINIOP_HARDSWISH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopHardSwishDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateHardSwishDescriptor( + infiniopHandle_t handle, + infiniopHardSwishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__INFINI_C __export infiniStatus_t infiniopGetHardSwishWorkspaceSize( + infiniopHardSwishDescriptor_t desc, + size_t *size); + +__INFINI_C __export infiniStatus_t infiniopHardSwish( + infiniopHardSwishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyHardSwishDescriptor( + infiniopHardSwishDescriptor_t desc); + +#endif \ No newline at end of file diff --git 
a/include/infiniop/ops/hardtanh.h b/include/infiniop/ops/hardtanh.h new file mode 100644 index 000000000..62f6435aa --- /dev/null +++ b/include/infiniop/ops/hardtanh.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_HARDTANH_API_H__ +#define __INFINIOP_HARDTANH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopHardTanhDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateHardTanhDescriptor(infiniopHandle_t handle, + infiniopHardTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + float min_val, + float max_val); + +__INFINI_C __export infiniStatus_t infiniopGetHardTanhWorkspaceSize(infiniopHardTanhDescriptor_t desc, + size_t *size); + +__INFINI_C __export infiniStatus_t infiniopHardTanh(infiniopHardTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyHardTanhDescriptor(infiniopHardTanhDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index 46249178d..6fc3e5c00 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -53,6 +53,8 @@ from infinicore.ops.attention import attention from infinicore.ops.baddbmm import baddbmm from infinicore.ops.bilinear import bilinear +from infinicore.ops.cross_entropy import cross_entropy +from infinicore.ops.equal import equal from infinicore.ops.fmod import fmod from infinicore.ops.kv_caching import kv_caching from infinicore.ops.matmul import matmul @@ -132,11 +134,13 @@ "bilinear", "fmod", "matmul", + "equal", "mul", "narrow", "squeeze", "unsqueeze", "rearrange", + "cross_entropy", "empty", "empty_like", "from_blob", diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index f81b61262..a8da2dfca 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -1,7 +1,10 @@ from .adaptive_max_pool1d import adaptive_max_pool1d +from .avg_pool1d import avg_pool1d from .causal_softmax import causal_softmax from .embedding import embedding from .flash_attention import flash_attention +from .hardswish import hardswish +from .hardtanh import hardtanh from .linear import linear from .linear_w8a8i8 import linear_w8a8i8 from .random_sample import random_sample @@ -22,6 +25,9 @@ "RopeAlgo", "rope", "silu", + "hardswish", + "hardtanh", + "avg_pool1d", "swiglu", "linear_w8a8i8", "silu_and_mul", diff --git a/python/infinicore/nn/functional/avg_pool1d.py b/python/infinicore/nn/functional/avg_pool1d.py new file mode 100644 index 000000000..0cf4759ad --- /dev/null +++ b/python/infinicore/nn/functional/avg_pool1d.py @@ -0,0 +1,24 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def avg_pool1d( + input: Tensor, + kernel_size: int, + stride: int | None = None, + padding: int = 0, + *, + out=None, +) -> Tensor: + if stride is None: + stride = 0 + + if out is None: + return Tensor( + _infinicore.avg_pool1d(input._underlying, kernel_size, stride, padding) + ) + + _infinicore.avg_pool1d_( + out._underlying, input._underlying, kernel_size, stride, padding + ) + return out diff --git a/python/infinicore/nn/functional/hardswish.py b/python/infinicore/nn/functional/hardswish.py new file mode 100644 index 000000000..b054b8978 --- /dev/null +++ b/python/infinicore/nn/functional/hardswish.py @@ -0,0 +1,28 @@ +import infinicore +from 
infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def hardswish(input: Tensor, inplace: bool = False, *, out=None) -> Tensor:
+    r"""Apply the Hardswish activation function element-wise."""
+
+    if (
+        infinicore.use_ntops
+        and input.device.type in ("cuda", "musa")
+        and out is None
+        and hasattr(infinicore.ntops.torch, "hardswish")
+    ):
+        try:
+            return infinicore.ntops.torch.hardswish(input, inplace=inplace)
+        except AttributeError:
+            pass
+
+    if inplace:
+        _infinicore.hardswish_(input._underlying, input._underlying)
+        return input
+
+    if out is None:
+        return Tensor(_infinicore.hardswish(input._underlying))
+
+    _infinicore.hardswish_(out._underlying, input._underlying)
+    return out
diff --git a/python/infinicore/nn/functional/hardtanh.py b/python/infinicore/nn/functional/hardtanh.py
new file mode 100644
index 000000000..925de33d6
--- /dev/null
+++ b/python/infinicore/nn/functional/hardtanh.py
@@ -0,0 +1,46 @@
+import infinicore
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def hardtanh(
+    input: Tensor,
+    min_val: float = -1.0,
+    max_val: float = 1.0,
+    inplace: bool = False,
+    *,
+    out=None,
+) -> Tensor:
+    """Clamp the input tensor to the range [min_val, max_val]."""
+
+    if min_val > max_val:
+        raise ValueError("min_val must be less than or equal to max_val")
+
+    if (
+        infinicore.use_ntops
+        and input.device.type in ("cuda", "musa")
+        and out is None
+        and hasattr(infinicore.ntops.torch, "hardtanh")
+    ):
+        try:
+            return infinicore.ntops.torch.hardtanh(
+                input, min_val=min_val, max_val=max_val, inplace=inplace
+            )
+        except AttributeError:
+            pass
+
+    if inplace:
+        _infinicore.hardtanh_(
+            input._underlying, input._underlying, float(min_val), float(max_val)
+        )
+        return input
+
+    if out is None:
+        return Tensor(
+            _infinicore.hardtanh(input._underlying, float(min_val), float(max_val))
+        )
+
+    _infinicore.hardtanh_(
+        out._underlying, input._underlying, float(min_val), float(max_val)
+    )
+    return out
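[review note] The element-wise math these two wrappers bind is standard: hardswish(x) = x * relu6(x + 3) / 6, and hardtanh clamps to [min_val, max_val]. NumPy sketches of the intended behavior (the _ref names are illustrative, not part of the package):

    import numpy as np

    def hardswish_ref(x):
        # x * relu6(x + 3) / 6
        return x * np.clip(x + 3.0, 0.0, 6.0) / 6.0

    def hardtanh_ref(x, min_val=-1.0, max_val=1.0):
        return np.clip(x, min_val, max_val)

diff --git a/python/infinicore/ops/cross_entropy.py b/python/infinicore/ops/cross_entropy.py
new file mode 100644
index 000000000..5b47697b5
--- /dev/null
+++ b/python/infinicore/ops/cross_entropy.py
@@ -0,0 +1,33 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def cross_entropy(
+    logits,
+    target,
+    weight=None,
+    *,
+    ignore_index=None,
+    reduction="none",
+    out=None,
+):
+    """
+    Token-wise cross entropy without reduction. The output tensor has the same
+    shape as target and uses the logits dtype.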
+ """ + if weight is not None: + raise NotImplementedError("class weights are not supported yet.") + if ignore_index is not None: + raise NotImplementedError("ignore_index is not supported yet.") + if reduction not in (None, "none"): + raise NotImplementedError("Only reduction='none' is implemented.") + + if out is None: + return Tensor(_infinicore.cross_entropy(logits._underlying, target._underlying)) + + _infinicore.cross_entropy_( + out._underlying, + logits._underlying, + target._underlying, + ) + return out diff --git a/python/infinicore/ops/equal.py b/python/infinicore/ops/equal.py new file mode 100644 index 000000000..5a656ab30 --- /dev/null +++ b/python/infinicore/ops/equal.py @@ -0,0 +1,10 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def equal(input, other, *, out=None): + if out is None: + return Tensor(_infinicore.equal(input._underlying, other._underlying)) + + _infinicore.equal_(out._underlying, input._underlying, other._underlying) + return out diff --git a/python/infinicore/utils.py b/python/infinicore/utils.py index 094b2230e..e0019dc89 100644 --- a/python/infinicore/utils.py +++ b/python/infinicore/utils.py @@ -1,9 +1,13 @@ -import ml_dtypes import numpy as np import torch import infinicore +try: + import ml_dtypes +except ModuleNotFoundError: + ml_dtypes = None + def to_torch_dtype(infini_dtype): """Convert infinicore data type to PyTorch data type""" @@ -57,7 +61,9 @@ def numpy_to_infinicore_dtype(numpy_dtype): return infinicore.float64 elif numpy_dtype == np.float16: return infinicore.float16 - elif numpy_dtype == ml_dtypes.bfloat16: + elif hasattr(np, "bfloat16") and numpy_dtype == np.bfloat16: + return infinicore.bfloat16 + elif ml_dtypes is not None and numpy_dtype == ml_dtypes.bfloat16: return infinicore.bfloat16 elif numpy_dtype == np.int8: return infinicore.int8 @@ -86,6 +92,13 @@ def infinicore_to_numpy_dtype(infini_dtype): elif infini_dtype == infinicore.int16: return np.int16 elif infini_dtype == infinicore.bfloat16: + if hasattr(np, "bfloat16"): + return np.bfloat16 + if ml_dtypes is None: + raise ModuleNotFoundError( + "ml_dtypes is required for bfloat16 numpy conversion. " + "Please install ml_dtypes." 
+            )
         return ml_dtypes.bfloat16
     elif infini_dtype == infinicore.int32:
         return np.int32
diff --git a/scripts/python_test.py b/scripts/python_test.py
index 0bd8bc26d..13b69a013 100644
--- a/scripts/python_test.py
+++ b/scripts/python_test.py
@@ -17,12 +17,12 @@ def run_tests(args):
         "causal_softmax.py",
         "clip.py",
         "conv.py",
-        #"dequantize_awq.py",
+        # "dequantize_awq.py",
         "gelu.py",
         "gemm.py",
-        #"layer_norm.py",
+        # "layer_norm.py",
         "logsoftmax.py",
-        #"lp_norm.py",
+        # "lp_norm.py",
         "mul.py",
         "ones.py",
         "random_sample.py",
@@ -31,7 +31,7 @@
         "rms_norm.py",
         "rope.py",
         "sigmoid.py",
-        #"softmax.py",
+        # "softmax.py",
         "softplus.py",
         "sub.py",
         "swiglu.py",
@@ -42,6 +42,7 @@
         # "paged_attention.py",
         # "paged_caching.py",
         # "paged_attention_prefill.py"
+        "cross_entropy.py",
     ]:
         result = subprocess.run(
             f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
diff --git a/src/infinicore/ops/avg_pool1d/avg_pool1d.cc b/src/infinicore/ops/avg_pool1d/avg_pool1d.cc
new file mode 100644
index 000000000..907b25b00
--- /dev/null
+++ b/src/infinicore/ops/avg_pool1d/avg_pool1d.cc
@@ -0,0 +1,68 @@
+#include "infinicore/ops/avg_pool1d.hpp"
+
+#include "../../utils.hpp"
+
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher &AvgPool1d::dispatcher() {
+    static common::OpDispatcher dispatcher_;
+    return dispatcher_;
+}
+
+void AvgPool1d::execute(
+    Tensor output,
+    Tensor input,
+    size_t kernel_size,
+    size_t stride,
+    size_t padding) {
+
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
+    if (stride == 0) {
+        stride = kernel_size;
+    }
+
+    infinicore::context::setDevice(output->device());
+    auto device_type = output->device().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error(
+            "No AvgPool1d implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input, kernel_size, stride, padding);
+}
+
+Tensor avg_pool1d(Tensor input, size_t kernel_size, size_t stride, size_t padding) {
+    if (stride == 0) {
+        stride = kernel_size;
+    }
+
+    const auto &shape = input->shape();
+    if (shape.size() != 3) {
+        throw std::runtime_error("AvgPool1d expects tensors with shape [N, C, L]");
+    }
+
+    const size_t n = shape[0];
+    const size_t c = shape[1];
+    const size_t l_in = shape[2];
+
+    if (l_in + 2 * padding < kernel_size) {
+        throw std::runtime_error("AvgPool1d kernel_size is larger than padded length");
+    }
+
+    const size_t out_width = (l_in + 2 * padding - kernel_size) / stride + 1;
+
+    Shape out_shape = {n, c, out_width};
+    auto output = Tensor::empty(out_shape, input->dtype(), input->device());
+    avg_pool1d_(output, input, kernel_size, stride, padding);
+    return output;
+}
+
+void avg_pool1d_(Tensor output, Tensor input, size_t kernel_size, size_t stride, size_t padding) {
+    AvgPool1d::execute(output, input, kernel_size, stride, padding);
+}
+
+} // namespace infinicore::op
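[review note] Summary of the AvgPool1d contract implemented above: stride == 0 is a sentinel meaning stride = kernel_size; the output length is (L + 2*padding - kernel_size) / stride + 1; and padded positions count toward the divisor, since the kernels always divide by kernel_size (matching PyTorch's count_include_pad=True default). A NumPy sketch of that reference behavior for float inputs, illustration only:

    import numpy as np

    def avg_pool1d_ref(x, kernel_size, stride=None, padding=0):
        stride = stride or kernel_size  # stride=0/None falls back to kernel_size
        n, c, l = x.shape
        out_w = (l + 2 * padding - kernel_size) // stride + 1
        xp = np.pad(x, ((0, 0), (0, 0), (padding, padding)))  # zero padding
        y = np.empty((n, c, out_w), dtype=x.dtype)
        for ow in range(out_w):
            win = xp[:, :, ow * stride : ow * stride + kernel_size]
            y[:, :, ow] = win.sum(axis=-1) / kernel_size  # count_include_pad=True
        return y

In the moved hunk below, the garbled include and OpCache template arguments are restored on a best-effort basis (<infiniop.h> and <size_t, infiniopAvgPool1dDescriptor_t>); treat those reconstructions as assumptions.

diff --git a/src/infinicore/ops/avg_pool1d/avg_pool1d_infiniop.cc b/src/infinicore/ops/avg_pool1d/avg_pool1d_infiniop.cc
new file mode 100644
index 000000000..df7ebda8d
--- /dev/null
+++ b/src/infinicore/ops/avg_pool1d/avg_pool1d_infiniop.cc
@@ -0,0 +1,69 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/avg_pool1d.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::avg_pool1d_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopAvgPool1dDescriptor_t> caches(
+    100,
+    [](infiniopAvgPool1dDescriptor_t &desc) {
+        if (desc != nullptr) {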
+            INFINICORE_CHECK_ERROR(infiniopDestroyAvgPool1dDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(
+    Tensor output,
+    Tensor input,
+    size_t kernel_size,
+    size_t stride,
+    size_t padding) {
+
+    if (stride == 0) {
+        stride = kernel_size;
+    }
+
+    size_t seed = hash_combine(output, input, kernel_size, stride, padding);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopAvgPool1dDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateAvgPool1dDescriptor(
+            context::getInfiniopHandle(device),
+            &desc,
+            output->desc(),
+            input->desc(),
+            kernel_size,
+            stride,
+            padding));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetAvgPool1dWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopAvgPool1d(
+        desc,
+        workspace->data(),
+        workspace_size,
+        output->data(),
+        input->data(),
+        context::getStream()));
+}
+
+static bool registered = []() {
+    AvgPool1d::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::avg_pool1d_impl::infiniop
diff --git a/src/infinicore/ops/cross_entropy/cross_entropy.cc b/src/infinicore/ops/cross_entropy/cross_entropy.cc
new file mode 100644
index 000000000..9804d5377
--- /dev/null
+++ b/src/infinicore/ops/cross_entropy/cross_entropy.cc
@@ -0,0 +1,45 @@
+#include "infinicore/ops/cross_entropy.hpp"
+
+#include "../../utils.hpp"
+
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher &CrossEntropy::dispatcher() {
+    static common::OpDispatcher dispatcher_;
+    return dispatcher_;
+}
+
+void CrossEntropy::execute(Tensor output, Tensor input, Tensor target) {
+
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(input, target);
+
+    infinicore::context::setDevice(output->device());
+    auto device_type = output->device().getType();
+
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No CrossEntropy implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input, target);
+}
+
+Tensor cross_entropy(Tensor input, Tensor target) {
+
+    Shape shape = target->shape();
+
+    auto output = Tensor::empty(shape, input->dtype(), input->device());
+
+    cross_entropy_(output, input, target);
+    return output;
+}
+
+void cross_entropy_(Tensor output, Tensor input, Tensor target) {
+    CrossEntropy::execute(output, input, target);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
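[review note] Each *_infiniop.cc glue file in this patch follows the same pattern: a thread-local, per-device cache of infiniop descriptors keyed by a hash of the tensors and scalar parameters, so repeated calls with identical shapes, strides, and dtypes reuse a descriptor instead of recreating it, and evicted entries are destroyed by the supplied callback. A minimal Python sketch of that caching policy (LRU of capacity 100, as above; names are illustrative):

    from collections import OrderedDict

    class OpCacheSketch:
        def __init__(self, capacity=100, destroy=lambda desc: None):
            self._lru = OrderedDict()
            self._capacity = capacity
            self._destroy = destroy

        def get(self, key):
            if key in self._lru:
                self._lru.move_to_end(key)  # mark as most recently used
                return self._lru[key]
            return None

        def put(self, key, desc):
            self._lru[key] = desc
            self._lru.move_to_end(key)
            if len(self._lru) > self._capacity:
                _, evicted = self._lru.popitem(last=False)  # drop oldest
                self._destroy(evicted)  # free the evicted descriptor

As in the previous glue file, the include and OpCache template arguments below are best-effort reconstructions.

diff --git a/src/infinicore/ops/cross_entropy/cross_entropy_infiniop.cc b/src/infinicore/ops/cross_entropy/cross_entropy_infiniop.cc
new file mode 100644
index 000000000..d02f16da6
--- /dev/null
+++ b/src/infinicore/ops/cross_entropy/cross_entropy_infiniop.cc
@@ -0,0 +1,64 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/cross_entropy.hpp"
+
+#include <infiniop.h>
+
+namespace infinicore::op::cross_entropy_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopCrossEntropyDescriptor_t> caches(
+    100,
+    [](infiniopCrossEntropyDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyCrossEntropyDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input, Tensor target) {
+
+    size_t seed = hash_combine(output, input,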
target); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + auto desc_opt = cache.get(seed); + infiniopCrossEntropyDescriptor_t desc = nullptr; + + if (!desc_opt) { + + INFINICORE_CHECK_ERROR(infiniopCreateCrossEntropyDescriptor( + context::getInfiniopHandle(device), + &desc, + output->desc(), + input->desc(), + target->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetCrossEntropyWorkspaceSize(desc, &workspace_size)); + + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopCrossEntropy( + desc, + workspace->data(), + workspace_size, + output->data(), + input->data(), + target->data(), + context::getStream())); +} + +static bool registered = []() { + CrossEntropy::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::cross_entropy_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/equal/equal.cc b/src/infinicore/ops/equal/equal.cc new file mode 100644 index 000000000..b6acc4d25 --- /dev/null +++ b/src/infinicore/ops/equal/equal.cc @@ -0,0 +1,31 @@ +#include "infinicore/ops/equal.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Equal::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Equal::execute(Tensor out, Tensor a, Tensor b) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, a, b); + infinicore::context::setDevice(out->device()); + dispatcher().lookup(out->device().getType())(out, a, b); +} + +Tensor equal(Tensor a, Tensor b) { + auto out = Tensor::empty(a->shape(), DataType::BOOL, a->device()); + equal_(out, a, b); + return out; +} + +void equal_(Tensor out, Tensor a, Tensor b) { + if (out->dtype() != DataType::BOOL) { + throw std::runtime_error("Equal expects bool output tensor."); + } + Equal::execute(out, a, b); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/equal/equal_infiniop.cc b/src/infinicore/ops/equal/equal_infiniop.cc new file mode 100644 index 000000000..1b4e4cffa --- /dev/null +++ b/src/infinicore/ops/equal/equal_infiniop.cc @@ -0,0 +1,57 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/equal.hpp" +#include + +namespace infinicore::op::equal_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopEqualDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyEqualDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor out, Tensor a, Tensor b) { + size_t seed = hash_combine(out, a, b); + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + infiniopEqualDescriptor_t desc = nullptr; + if (auto cached = cache.get(seed)) { + desc = *cached; + } else { + INFINICORE_CHECK_ERROR(infiniopCreateEqualDescriptor( + context::getInfiniopHandle(device), &desc, + out->desc(), a->desc(), b->desc())); + cache.put(seed, desc); + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetEqualWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace; + void *workspace_ptr = nullptr; + if (workspace_size != 0) { + workspace = context::allocateMemory(workspace_size); + workspace_ptr = workspace->data(); + } + + INFINICORE_CHECK_ERROR(infiniopEqual( + desc, + workspace_ptr, + workspace_size, + out->data(), + a->data(), + b->data(), + context::getStream())); 
+} + +static bool registered = []() { + Equal::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::equal_impl::infiniop diff --git a/src/infinicore/ops/hardswish/hardswish.cc b/src/infinicore/ops/hardswish/hardswish.cc new file mode 100644 index 000000000..ec8db75ff --- /dev/null +++ b/src/infinicore/ops/hardswish/hardswish.cc @@ -0,0 +1,38 @@ +#include "infinicore/ops/hardswish.hpp" + +#include "../../utils.hpp" + +#include + +namespace infinicore::op { + +common::OpDispatcher &Hardswish::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void Hardswish::execute(Tensor output, Tensor input) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); + infinicore::context::setDevice(output->device()); + auto device_type = output->device().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error( + "No Hardswish implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor hardswish(Tensor input) { + auto output = Tensor::empty(input->shape(), input->dtype(), input->device()); + hardswish_(output, input); + return output; +} + +void hardswish_(Tensor output, Tensor input) { + Hardswish::execute(output, input); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/hardswish/hardswish_infiniop.cc b/src/infinicore/ops/hardswish/hardswish_infiniop.cc new file mode 100644 index 000000000..44d4054e8 --- /dev/null +++ b/src/infinicore/ops/hardswish/hardswish_infiniop.cc @@ -0,0 +1,61 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/hardswish.hpp" +#include + +namespace infinicore::op::hardswish_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopHardSwishDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyHardSwishDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + auto desc_opt = cache.get(seed); + infiniopHardSwishDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateHardSwishDescriptor( + context::getInfiniopHandle(device), + &desc, + output->desc(), + input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetHardSwishWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace; + void *workspace_ptr = nullptr; + if (workspace_size != 0) { + workspace = context::allocateMemory(workspace_size); + workspace_ptr = workspace->data(); + } + + INFINICORE_CHECK_ERROR(infiniopHardSwish( + desc, + workspace_ptr, + workspace_size, + output->data(), + input->data(), + context::getStream())); +} + +static bool registered = []() { + Hardswish::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::hardswish_impl::infiniop diff --git a/src/infinicore/ops/hardtanh/hardtanh.cc b/src/infinicore/ops/hardtanh/hardtanh.cc new file mode 100644 index 000000000..5a4df2142 --- /dev/null +++ b/src/infinicore/ops/hardtanh/hardtanh.cc @@ -0,0 +1,38 @@ +#include "infinicore/ops/hardtanh.hpp" + +#include "../../utils.hpp" + +#include + +namespace infinicore::op { + +common::OpDispatcher &HardTanh::dispatcher() { + static 
common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void HardTanh::execute(Tensor output, Tensor input, float min_val, float max_val) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); + infinicore::context::setDevice(output->device()); + + auto device_type = output->device().getType(); + auto func = dispatcher().lookup(device_type); + if (func == nullptr) { + throw std::runtime_error( + "No HardTanh implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input, min_val, max_val); +} + +Tensor hardtanh(Tensor input, float min_val, float max_val) { + auto output = Tensor::empty(input->shape(), input->dtype(), input->device()); + hardtanh_(output, input, min_val, max_val); + return output; +} + +void hardtanh_(Tensor output, Tensor input, float min_val, float max_val) { + HardTanh::execute(output, input, min_val, max_val); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/hardtanh/hardtanh_infiniop.cc b/src/infinicore/ops/hardtanh/hardtanh_infiniop.cc new file mode 100644 index 000000000..d8af439d8 --- /dev/null +++ b/src/infinicore/ops/hardtanh/hardtanh_infiniop.cc @@ -0,0 +1,63 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/hardtanh.hpp" +#include + +namespace infinicore::op::hardtanh_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopHardTanhDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyHardTanhDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input, float min_val, float max_val) { + size_t seed = hash_combine(output, input, min_val, max_val); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + auto desc_opt = cache.get(seed); + infiniopHardTanhDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateHardTanhDescriptor( + context::getInfiniopHandle(device), + &desc, + output->desc(), + input->desc(), + min_val, + max_val)); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetHardTanhWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace; + void *workspace_ptr = nullptr; + if (workspace_size != 0) { + workspace = context::allocateMemory(workspace_size); + workspace_ptr = workspace->data(); + } + + INFINICORE_CHECK_ERROR(infiniopHardTanh( + desc, + workspace_ptr, + workspace_size, + output->data(), + input->data(), + context::getStream())); +} + +static bool registered = []() { + HardTanh::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::hardtanh_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 1d0ace555..8a83c02f8 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -7,12 +7,17 @@ #include "ops/add_rms_norm.hpp" #include "ops/asinh.hpp" #include "ops/attention.hpp" +#include "ops/avg_pool1d.hpp" #include "ops/baddbmm.hpp" #include "ops/bilinear.hpp" #include "ops/causal_softmax.hpp" +#include "ops/cross_entropy.hpp" #include "ops/embedding.hpp" +#include "ops/equal.hpp" #include "ops/flash_attention.hpp" #include "ops/fmod.hpp" +#include "ops/hardswish.hpp" +#include "ops/hardtanh.hpp" #include "ops/kv_caching.hpp" #include "ops/linear.hpp" #include "ops/linear_w8a8i8.hpp" @@ -51,18 +56,23 @@ inline void bind(py::module &m) { 
bind_matmul(m); bind_mul(m); bind_mha_varlen(m); + bind_hardswish(m); + bind_hardtanh(m); bind_paged_attention(m); bind_paged_attention_prefill(m); bind_paged_caching(m); bind_random_sample(m); + bind_cross_entropy(m); bind_rearrange(m); bind_rms_norm(m); + bind_avg_pool1d(m); bind_silu(m); bind_swiglu(m); bind_rope(m); bind_embedding(m); bind_linear_w8a8i8(m); bind_silu_and_mul(m); + bind_equal(m); } } // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/avg_pool1d.hpp b/src/infinicore/pybind11/ops/avg_pool1d.hpp new file mode 100644 index 000000000..32394552a --- /dev/null +++ b/src/infinicore/pybind11/ops/avg_pool1d.hpp @@ -0,0 +1,37 @@ +#pragma once + +#include +#include + +#include "infinicore/ops/avg_pool1d.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_avg_pool1d(py::module &m) { + m.def( + "avg_pool1d", + [](::infinicore::Tensor input, size_t kernel_size, std::optional stride, size_t padding) { + return op::avg_pool1d(input, kernel_size, stride.value_or(0), padding); + }, + py::arg("input"), + py::arg("kernel_size"), + py::arg("stride") = py::none(), + py::arg("padding") = 0, + R"doc(AvgPool1d out-of-place.)doc"); + + m.def( + "avg_pool1d_", + [](::infinicore::Tensor output, ::infinicore::Tensor input, size_t kernel_size, std::optional stride, size_t padding) { + op::avg_pool1d_(output, input, kernel_size, stride.value_or(0), padding); + }, + py::arg("output"), + py::arg("input"), + py::arg("kernel_size"), + py::arg("stride") = py::none(), + py::arg("padding") = 0, + R"doc(AvgPool1d in-place variant writing to provided output tensor.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/cross_entropy.hpp b/src/infinicore/pybind11/ops/cross_entropy.hpp new file mode 100644 index 000000000..8105642a6 --- /dev/null +++ b/src/infinicore/pybind11/ops/cross_entropy.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include + +#include "infinicore/ops/cross_entropy.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_cross_entropy(py::module &m) { + m.def("cross_entropy", + &op::cross_entropy, + py::arg("logits"), + py::arg("target"), + R"doc(Token-wise cross entropy loss without reduction.)doc"); + + m.def("cross_entropy_", + &op::cross_entropy_, + py::arg("loss"), + py::arg("logits"), + py::arg("target"), + R"doc(Write cross entropy loss into a provided tensor.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/equal.hpp b/src/infinicore/pybind11/ops/equal.hpp new file mode 100644 index 000000000..d14a6b61d --- /dev/null +++ b/src/infinicore/pybind11/ops/equal.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include + +#include "infinicore/ops/equal.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_equal(py::module &m) { + m.def("equal", + &op::equal, + py::arg("a"), + py::arg("b"), + R"doc(Elementwise equality returning a bool tensor.)doc"); + + m.def("equal_", + &op::equal_, + py::arg("out"), + py::arg("a"), + py::arg("b"), + R"doc(In-place elementwise equality writing into `out`.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/hardswish.hpp b/src/infinicore/pybind11/ops/hardswish.hpp new file mode 100644 index 000000000..daaccec62 --- /dev/null +++ b/src/infinicore/pybind11/ops/hardswish.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/hardswish.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_hardswish(py::module &m) { + 
m.def("hardswish", + &op::hardswish, + py::arg("input"), + R"doc(Out-of-place Hardswish activation.)doc"); + + m.def("hardswish_", + &op::hardswish_, + py::arg("output"), + py::arg("input"), + R"doc(In-place Hardswish activation.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/hardtanh.hpp b/src/infinicore/pybind11/ops/hardtanh.hpp new file mode 100644 index 000000000..ff9abb872 --- /dev/null +++ b/src/infinicore/pybind11/ops/hardtanh.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include + +#include "infinicore/ops/hardtanh.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_hardtanh(py::module &m) { + m.def("hardtanh", + &op::hardtanh, + py::arg("input"), + py::arg("min_val") = -1.0f, + py::arg("max_val") = 1.0f, + R"doc(Apply the HardTanh activation.)doc"); + + m.def("hardtanh_", + &op::hardtanh_, + py::arg("output"), + py::arg("input"), + py::arg("min_val") = -1.0f, + py::arg("max_val") = 1.0f, + R"doc(In-place HardTanh activation.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infiniop/ops/avg_pool1d/avg_pool1d.h b/src/infiniop/ops/avg_pool1d/avg_pool1d.h new file mode 100644 index 000000000..fae5f445b --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/avg_pool1d.h @@ -0,0 +1,103 @@ +#ifndef __AVG_POOL1D_H__ +#define __AVG_POOL1D_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "infiniop/ops/avg_pool1d.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::avg_pool1d::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + AvgPool1dInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + AvgPool1dInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + size_t kernel_size, \ + size_t stride, \ + size_t padding); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *y, \ + const void *x, \ + void *stream) const; \ + }; \ + } + +class AvgPool1dInfo { +private: + AvgPool1dInfo() = default; + +public: + infiniDtype_t dtype; + size_t batch, channels, in_width, out_width; + size_t kernel_size, stride, padding; + + ptrdiff_t y_stride_batch, y_stride_channel, y_stride_width; + ptrdiff_t x_stride_batch, x_stride_channel, x_stride_width; + + static utils::Result createAvgPool1dInfo( + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t kernel_size, + size_t stride, + size_t padding) { + + CHECK_OR_RETURN(y_desc != nullptr && x_desc != nullptr, INFINI_STATUS_NULL_POINTER); + + const infiniDtype_t dtype = y_desc->dtype(); + CHECK_OR_RETURN(dtype == x_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_OR_RETURN(y_desc->ndim() == 3 && x_desc->ndim() == 3, INFINI_STATUS_BAD_TENSOR_SHAPE); + + size_t batch = x_desc->dim(0); + size_t channels = x_desc->dim(1); + size_t in_width = x_desc->dim(2); + + CHECK_OR_RETURN(y_desc->dim(0) == batch, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(y_desc->dim(1) 
== channels, INFINI_STATUS_BAD_TENSOR_SHAPE); + + size_t padded_len = in_width + 2 * padding; + + CHECK_OR_RETURN(padded_len >= kernel_size, INFINI_STATUS_BAD_TENSOR_SHAPE); + + size_t expected_out_width = (padded_len - kernel_size) / stride + 1; + CHECK_OR_RETURN(y_desc->dim(2) == expected_out_width, INFINI_STATUS_BAD_TENSOR_SHAPE); + + size_t out_width = expected_out_width; + + return utils::Result(AvgPool1dInfo{ + dtype, + batch, channels, in_width, out_width, + kernel_size, stride, padding, + y_desc->stride(0), y_desc->stride(1), y_desc->stride(2), + x_desc->stride(0), x_desc->stride(1), x_desc->stride(2)}); + } +}; + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.cc b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.cc new file mode 100644 index 000000000..67e5b6623 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.cc @@ -0,0 +1,96 @@ +#include "avg_pool1d_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include + +namespace op::avg_pool1d::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t kernel_size, + size_t stride, + size_t padding) { + + auto handle = reinterpret_cast(handle_); + + auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + info.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateAvgPool1d(const AvgPool1dInfo &info, + T *y, + const T *x) { + const float inv_kernel = 1.0f / static_cast(info.kernel_size); + +#pragma omp parallel for + for (ptrdiff_t bc = 0; bc < ptrdiff_t(info.batch * info.channels); ++bc) { + + ptrdiff_t b = bc / info.channels; + ptrdiff_t c = bc % info.channels; + + size_t y_base = b * info.y_stride_batch + c * info.y_stride_channel; + size_t x_base = b * info.x_stride_batch + c * info.x_stride_channel; + + for (size_t ow = 0; ow < info.out_width; ++ow) { + size_t y_offset = y_base + ow * info.y_stride_width; + + long long start_w = static_cast(ow * info.stride) - info.padding; + long long end_w = start_w + info.kernel_size; + + long long valid_start = std::max(0LL, start_w); + long long valid_end = std::min(static_cast(info.in_width), end_w); + + float sum = 0.0f; + for (long long iw = valid_start; iw < valid_end; ++iw) { + size_t x_offset = x_base + iw * info.x_stride_width; + sum += utils::cast(x[x_offset]); + } + + const float avg = sum * inv_kernel; + y[y_offset] = utils::cast(avg); + } + } + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE(TDATA) calculateAvgPool1d(_info, (TDATA *)y, (const TDATA *)x) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + switch (_info.dtype) { + case INFINI_DTYPE_F16: + return CALCULATE(fp16_t); + case INFINI_DTYPE_BF16: + return CALCULATE(bf16_t); + case INFINI_DTYPE_F32: + return CALCULATE(float); + case INFINI_DTYPE_F64: + return CALCULATE(double); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE + +} // namespace op::avg_pool1d::cpu diff --git a/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h new file mode 100644 index 000000000..2335733db --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h @@ -0,0 +1,8 @@ 
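[review note] A quick cross-check of the shape contract enforced by AvgPool1dInfo above, using PyTorch as the oracle; shapes here are hypothetical, and torch.nn.functional.avg_pool1d's count_include_pad=True default matches the divide-by-kernel_size behavior of the CPU kernel:

    import torch

    x = torch.randn(2, 4, 10)
    k, s, p = 3, 2, 1
    expected_w = (10 + 2 * p - k) // s + 1  # -> 5, same formula as createAvgPool1dInfo
    y = torch.nn.functional.avg_pool1d(x, k, stride=s, padding=p)
    assert y.shape == (2, 4, expected_w)

diff --git a/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h
new file mode 100644
index 000000000..2335733db
--- /dev/null
+++ b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h
@@ -0,0 +1,8 @@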
+#ifndef __INFINIOP_AVG_POOL1D_CPU_H__ +#define __INFINIOP_AVG_POOL1D_CPU_H__ + +#include "../avg_pool1d.h" + +DESCRIPTOR(cpu) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh b/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh new file mode 100644 index 000000000..36a11acfc --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh @@ -0,0 +1,58 @@ +#ifndef __INFINIOP_AVG_POOL1D_CUDA_KERNEL_CUH__ +#define __INFINIOP_AVG_POOL1D_CUDA_KERNEL_CUH__ + +template +__device__ void avgPool1dKernel( + T *y, + const T *x, + size_t batch, + size_t channels, + size_t in_width, + size_t out_width, + size_t kernel_size, + size_t stride, + size_t padding, + + ptrdiff_t y_stride_batch, + ptrdiff_t y_stride_channel, + ptrdiff_t y_stride_width, + ptrdiff_t x_stride_batch, + ptrdiff_t x_stride_channel, + ptrdiff_t x_stride_width) { + + size_t total_elements = batch * channels * out_width; + + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < total_elements; + idx += gridDim.x * blockDim.x) { + + size_t ow = idx % out_width; + size_t temp = idx / out_width; + size_t c = temp % channels; + size_t b = temp / channels; + + size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width; + + long long start_w = static_cast(ow * stride) - padding; + + T sum = 0; + + for (size_t k = 0; k < kernel_size; ++k) { + long long iw = start_w + k; + + if (iw >= 0 && iw < static_cast(in_width)) { + size_t x_offset = b * x_stride_batch + c * x_stride_channel + iw * x_stride_width; + sum += x[x_offset]; + } + } + +#if defined(ENABLE_ILUVATAR_API) + // Iluvatar __half doesn't accept size_t directly. + y[y_offset] = sum / static_cast(static_cast(kernel_size)); +#else + y[y_offset] = sum / static_cast(kernel_size); +#endif + } +} + +#endif diff --git a/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h b/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h new file mode 100644 index 000000000..576da66de --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h @@ -0,0 +1,8 @@ +#ifndef __INFINIOP_AVG_POOL1D_METAX_H__ +#define __INFINIOP_AVG_POOL1D_METAX_H__ + +#include "../avg_pool1d.h" + +DESCRIPTOR(metax) + +#endif // __INFINIOP_AVG_POOL1D_METAX_H__ diff --git a/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca b/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca new file mode 100644 index 000000000..9b3f15b9a --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca @@ -0,0 +1,170 @@ +#include "../../../devices/metax/metax_common.h" +#include "avg_pool1d_metax.h" +#include "../../../devices/metax/metax_kernel_common.h" + +#include + +namespace op::avg_pool1d::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t kernel_size, + size_t stride, + size_t padding) { + + auto handle = reinterpret_cast(handle_); + + auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + info.take(), + 0, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +__device__ __forceinline__ Tdata castToOutput(Tcompute val) { + if constexpr (std::is_same_v) { + return __float2half(static_cast(val)); + } else if constexpr (std::is_same_v) 
{ + return __float2bfloat16(static_cast(val)); + } else { + return static_cast(val); + } +} + +template +INFINIOP_METAX_KERNEL avgPool1dGlobalKernel( + Tdata *y, + const Tdata *x, + size_t batch, + size_t channels, + size_t in_width, + size_t out_width, + size_t kernel_size, + size_t stride, + size_t padding, + ptrdiff_t y_stride_batch, + ptrdiff_t y_stride_channel, + ptrdiff_t y_stride_width, + ptrdiff_t x_stride_batch, + ptrdiff_t x_stride_channel, + ptrdiff_t x_stride_width) { + + size_t total_elements = batch * channels * out_width; + Tcompute inv_kernel = Tcompute(1) / static_cast(kernel_size); + + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < total_elements; + idx += gridDim.x * blockDim.x) { + + size_t ow = idx % out_width; + size_t temp = idx / out_width; + size_t c = temp % channels; + size_t b = temp / channels; + + size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width; + size_t x_base = b * x_stride_batch + c * x_stride_channel; + + long long start_w = static_cast(ow * stride) - static_cast(padding); + long long end_w = start_w + static_cast(kernel_size); + long long iw_start = start_w < 0 ? 0 : start_w; + long long iw_end = end_w > static_cast(in_width) ? static_cast(in_width) : end_w; + + Tcompute sum = Tcompute(0); + if (iw_start < iw_end) { + size_t x_offset = x_base + static_cast(iw_start) * x_stride_width; + for (long long iw = iw_start; iw < iw_end; ++iw) { + sum += static_cast(x[x_offset]); + x_offset += x_stride_width; + } + } + + y[y_offset] = castToOutput(sum * inv_kernel); + } +} + +template +infiniStatus_t calculateAvgPool1d( + const AvgPool1dInfo &info, + int max_threads_per_block, + Tdata *y, + const Tdata *x, + hcStream_t stream) { + + size_t total_elements = info.batch * info.channels * info.out_width; + + int block_size = 256; + if (max_threads_per_block > 0 && max_threads_per_block < block_size) { + block_size = max_threads_per_block; + } + + size_t grid_size = (total_elements + block_size - 1) / block_size; + if (grid_size > 65535) { + grid_size = 65535; + } + + avgPool1dGlobalKernel<<>>( + y, x, + info.batch, info.channels, info.in_width, info.out_width, + info.kernel_size, info.stride, info.padding, + info.y_stride_batch, info.y_stride_channel, info.y_stride_width, + info.x_stride_batch, info.x_stride_channel, info.x_stride_width); + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE(TDATA, TCOMPUTE) \ + calculateAvgPool1d( \ + _info, \ + _opaque->internal->maxThreadsPerBlock(), \ + (TDATA *)y, \ + (const TDATA *)x, \ + (hcStream_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + (void)workspace; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_info.dtype) { + case INFINI_DTYPE_F16: + return CALCULATE(half, float); + case INFINI_DTYPE_BF16: + return CALCULATE(cuda_bfloat16, float); + case INFINI_DTYPE_F32: + return CALCULATE(float, float); + case INFINI_DTYPE_F64: + return CALCULATE(double, double); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE + +} // namespace op::avg_pool1d::metax diff --git a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h new file mode 100644 index 000000000..9034d7358 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h @@ -0,0 +1,72 @@ +#ifndef __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__ +#define 
__INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__ + +#include + +namespace op::avg_pool1d::moore { + +template +__device__ __forceinline__ Tdata castToOutput(Tcompute val) { + if constexpr (std::is_same_v) { + return __float2half(static_cast(val)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(static_cast(val)); + } else { + return static_cast(val); + } +} + +template +__device__ void avgPool1dKernel( + Tdata *y, + const Tdata *x, + size_t batch, + size_t channels, + size_t in_width, + size_t out_width, + size_t kernel_size, + size_t stride, + size_t padding, + ptrdiff_t y_stride_batch, + ptrdiff_t y_stride_channel, + ptrdiff_t y_stride_width, + ptrdiff_t x_stride_batch, + ptrdiff_t x_stride_channel, + ptrdiff_t x_stride_width) { + + size_t total_elements = batch * channels * out_width; + Tcompute inv_kernel = Tcompute(1) / static_cast(kernel_size); + + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < total_elements; + idx += gridDim.x * blockDim.x) { + + size_t ow = idx % out_width; + size_t temp = idx / out_width; + size_t c = temp % channels; + size_t b = temp / channels; + + size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width; + size_t x_base = b * x_stride_batch + c * x_stride_channel; + + long long start_w = static_cast(ow * stride) - static_cast(padding); + long long end_w = start_w + static_cast(kernel_size); + long long iw_start = start_w < 0 ? 0 : start_w; + long long iw_end = end_w > static_cast(in_width) ? static_cast(in_width) : end_w; + + Tcompute sum = Tcompute(0); + if (iw_start < iw_end) { + size_t x_offset = x_base + static_cast(iw_start) * x_stride_width; + for (long long iw = iw_start; iw < iw_end; ++iw) { + sum += static_cast(x[x_offset]); + x_offset += x_stride_width; + } + } + + y[y_offset] = castToOutput(sum * inv_kernel); + } +} + +} // namespace op::avg_pool1d::moore + +#endif // __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h new file mode 100644 index 000000000..604d06012 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h @@ -0,0 +1,8 @@ +#ifndef __INFINIOP_AVG_POOL1D_MOORE_H__ +#define __INFINIOP_AVG_POOL1D_MOORE_H__ + +#include "../avg_pool1d.h" + +DESCRIPTOR(moore) + +#endif // __INFINIOP_AVG_POOL1D_MOORE_H__ diff --git a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu new file mode 100644 index 000000000..518d249b9 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu @@ -0,0 +1,135 @@ +#include "../../../devices/moore/moore_common.h" +#include "avg_pool1d_moore.h" + +#include "../../../devices/moore/moore_kernel_common.h" + +#include "avg_pool1d_kernel.h" + +namespace op::avg_pool1d::moore { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t kernel_size, + size_t stride, + size_t padding) { + + auto handle = reinterpret_cast(handle_); + + auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + info.take(), + 0, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +INFINIOP_MOORE_KERNEL 
avgPool1dGlobalKernel( + Tdata *y, + const Tdata *x, + size_t batch, + size_t channels, + size_t in_width, + size_t out_width, + size_t kernel_size, + size_t stride, + size_t padding, + ptrdiff_t y_stride_batch, + ptrdiff_t y_stride_channel, + ptrdiff_t y_stride_width, + ptrdiff_t x_stride_batch, + ptrdiff_t x_stride_channel, + ptrdiff_t x_stride_width) { + + avgPool1dKernel( + y, x, + batch, channels, in_width, out_width, + kernel_size, stride, padding, + y_stride_batch, y_stride_channel, y_stride_width, + x_stride_batch, x_stride_channel, x_stride_width); +} + +template +infiniStatus_t calculateAvgPool1d( + const AvgPool1dInfo &info, + int max_threads_per_block, + Tdata *y, + const Tdata *x, + musaStream_t stream) { + + size_t total_elements = info.batch * info.channels * info.out_width; + + int block_size = 256; + if (max_threads_per_block > 0 && max_threads_per_block < block_size) { + block_size = max_threads_per_block; + } + + size_t grid_size = (total_elements + block_size - 1) / block_size; + if (grid_size > 65535) { + grid_size = 65535; + } + + avgPool1dGlobalKernel<<>>( + y, x, + info.batch, info.channels, info.in_width, info.out_width, + info.kernel_size, info.stride, info.padding, + info.y_stride_batch, info.y_stride_channel, info.y_stride_width, + info.x_stride_batch, info.x_stride_channel, info.x_stride_width); + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE(TDATA, TCOMPUTE) \ + calculateAvgPool1d(\ + _info,\ + _opaque->internal->maxThreadsPerBlock(),\ + (TDATA *)y,\ + (const TDATA *)x,\ + (musaStream_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + (void)workspace; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_info.dtype) { + case INFINI_DTYPE_F16: + return CALCULATE(half, float); + case INFINI_DTYPE_BF16: + return CALCULATE(cuda_bfloat16, float); + case INFINI_DTYPE_F32: + return CALCULATE(float, float); + case INFINI_DTYPE_F64: + return CALCULATE(double, double); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE + +} // namespace op::avg_pool1d::moore diff --git a/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu b/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu new file mode 100644 index 000000000..202d4b8e9 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu @@ -0,0 +1,126 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "avg_pool1d_nvidia.cuh" + +template +__global__ void avgPool1dGlobalKernel( + T *y, + const T *x, + size_t batch, + size_t channels, + size_t in_width, + size_t out_width, + size_t kernel_size, + size_t stride, + size_t padding, + ptrdiff_t y_stride_batch, + ptrdiff_t y_stride_channel, + ptrdiff_t y_stride_width, + ptrdiff_t x_stride_batch, + ptrdiff_t x_stride_channel, + ptrdiff_t x_stride_width) { + + avgPool1dKernel( + y, x, + batch, channels, in_width, out_width, + kernel_size, stride, padding, + y_stride_batch, y_stride_channel, y_stride_width, + x_stride_batch, x_stride_channel, x_stride_width); +} + +namespace op::avg_pool1d::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t 
x_desc, + size_t kernel_size, + size_t stride, + size_t padding) { + + auto handle = reinterpret_cast(handle_); + + auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + info.take(), + 0, + new Opaque{reinterpret_cast(handle)->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateAvgPool1d( + const AvgPool1dInfo &info, + int max_threads_per_block, + T *y, + const T *x, + cudaStream_t stream) { + + size_t total_elements = info.batch * info.channels * info.out_width; + + int block_size = 256; + if (max_threads_per_block > 0 && max_threads_per_block < 256) { + block_size = max_threads_per_block; + } + + size_t grid_size = (total_elements + block_size - 1) / block_size; + if (grid_size > 65535) { + grid_size = 65535; + } + + avgPool1dGlobalKernel<<>>( + y, x, + info.batch, info.channels, info.in_width, info.out_width, + info.kernel_size, info.stride, info.padding, + info.y_stride_batch, info.y_stride_channel, info.y_stride_width, + info.x_stride_batch, info.x_stride_channel, info.x_stride_width); + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE(TDATA) \ + calculateAvgPool1d(_info, \ + _opaque->internal->maxThreadsPerBlock(), \ + (TDATA *)y, \ + (const TDATA *)x, \ + (cudaStream_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + switch (_info.dtype) { + case INFINI_DTYPE_F16: + return CALCULATE(half); + case INFINI_DTYPE_BF16: + return CALCULATE(cuda_bfloat16); + case INFINI_DTYPE_F32: + return CALCULATE(float); + case INFINI_DTYPE_F64: + return CALCULATE(double); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE + +} // namespace op::avg_pool1d::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh b/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh new file mode 100644 index 000000000..629e745d7 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __INFINIOP_AVG_POOL1D_CUDA_H__ +#define __INFINIOP_AVG_POOL1D_CUDA_H__ + +#include "../avg_pool1d.h" + +DESCRIPTOR(nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/avg_pool1d/operator.cc b/src/infiniop/ops/avg_pool1d/operator.cc new file mode 100644 index 000000000..c3696daa1 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/operator.cc @@ -0,0 +1,225 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/avg_pool1d.h" + +#ifdef ENABLE_CPU_API +#include "cpu/avg_pool1d_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/avg_pool1d_nvidia.cuh" +#endif +#ifdef ENABLE_ASCEND_API +#include "ascend/avg_pool1d_ascend.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/avg_pool1d_bang.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/avg_pool1d_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/avg_pool1d_kunlun.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/avg_pool1d_moore.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateAvgPool1dDescriptor( + infiniopHandle_t handle, + infiniopAvgPool1dDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + size_t kernel_size, + size_t stride, + size_t padding) { + +#define CREATE(CASE, NAMESPACE) \ + 
case CASE: \ + return op::avg_pool1d::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y, \ + x, \ + kernel_size, \ + stride, \ + padding) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ASCEND_API + CREATE(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetAvgPool1dWorkspaceSize(infiniopAvgPool1dDescriptor_t desc, + size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + GET(INFINI_DEVICE_ASCEND, ascend); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__INFINI_C infiniStatus_t infiniopAvgPool1d( + infiniopAvgPool1dDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + CALCULATE(INFINI_DEVICE_ASCEND, ascend); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t +infiniopDestroyAvgPool1dDescriptor(infiniopAvgPool1dDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + 
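// Cast back to the backend's concrete Descriptor type so the correct destructor runs.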
DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + DELETE(INFINI_DEVICE_ASCEND, ascend); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc b/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc new file mode 100644 index 000000000..af0ebc623 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc @@ -0,0 +1,99 @@ +#include "cross_entropy_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include +#include + +namespace op::cross_entropy::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t target_desc) { + + auto x_dtype = x_desc->dtype(); + auto t_dtype = target_desc->dtype(); + + CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64); + + CrossEntropyInfo info{}; + info.dtype = x_dtype; + info.target_dtype = t_dtype; + + info.outer_size = target_desc->numel(); + + info.vocab_size = x_desc->shape().back(); + + info.x_stride = static_cast(info.vocab_size); + + *desc_ptr = new Descriptor(nullptr, info, 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t cross_entropy_kernel(const CrossEntropyInfo *info, + T *y, const T *x, const void *target) { + const Tidx *label = reinterpret_cast(target); + +#pragma omp parallel for + for (ptrdiff_t i = 0; i < ptrdiff_t(info->outer_size); ++i) { + const T *row = x + i * info->x_stride; + Tidx idx = label[i]; + + if (idx < 0 || static_cast(idx) >= info->vocab_size) { + y[i] = utils::cast(0.f); + continue; + } + + float max_val = op::common_cpu::reduce_op::max(row, info->vocab_size, 1); + + float sum_exp = 0.f; + for (size_t j = 0; j < info->vocab_size; ++j) { + sum_exp += std::exp(utils::cast(row[j]) - max_val); + } + + float log_term = std::log(sum_exp) + max_val; + float target_logit = utils::cast(row[idx]); + y[i] = utils::cast(log_term - target_logit); + } + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t dispatch_target_type(const CrossEntropyInfo *info, + T *y, const T *x, const void *target) { + + if (info->target_dtype == INFINI_DTYPE_I32) { + return cross_entropy_kernel(info, y, x, target); + } else if (info->target_dtype == INFINI_DTYPE_I64) { + return cross_entropy_kernel(info, y, x, target); + } + return INFINI_STATUS_BAD_TENSOR_DTYPE; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *target, + void *stream) const { + + switch (_info.dtype) { + case INFINI_DTYPE_F16: + return dispatch_target_type(&_info, (fp16_t *)y, (const fp16_t *)x, target); + case INFINI_DTYPE_BF16: + 
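// bf16 logits are widened to float inside the kernel's log-sum-exp, matching the f16 path.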
return dispatch_target_type(&_info, (bf16_t *)y, (const bf16_t *)x, target); + case INFINI_DTYPE_F32: + return dispatch_target_type(&_info, (float *)y, (const float *)x, target); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::cross_entropy::cpu \ No newline at end of file diff --git a/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h b/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h new file mode 100644 index 000000000..7417d1d81 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_CPU_H__ +#define __CROSS_ENTROPY_CPU_H__ + +#include "../cross_entropy.h" + +DESCRIPTOR(cpu) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/cross_entropy/cross_entropy.h b/src/infiniop/ops/cross_entropy/cross_entropy.h new file mode 100644 index 000000000..b502823db --- /dev/null +++ b/src/infiniop/ops/cross_entropy/cross_entropy.h @@ -0,0 +1,42 @@ +#ifndef CROSS_ENTROPY_H +#define CROSS_ENTROPY_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::cross_entropy::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + CrossEntropyInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor(Opaque *opaque, \ + CrossEntropyInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create(infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t target_desc); \ + infiniStatus_t calculate(void *workspace, \ + size_t workspace_size, \ + void *y, \ + const void *x, \ + const void *target, \ + void *stream) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/cross_entropy/cuda/kernel.cuh b/src/infiniop/ops/cross_entropy/cuda/kernel.cuh new file mode 100644 index 000000000..c048c1233 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/cuda/kernel.cuh @@ -0,0 +1,80 @@ +#ifndef __CROSS_ENTROPY_KERNEL_CUH__ +#define __CROSS_ENTROPY_KERNEL_CUH__ + +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../reduce/cuda/reduce.cuh" + +template +__device__ void crossEntropyKernel( + Tdata *y_, + const Tdata *x_, + const void *target_, + size_t outer_size, + size_t vocab_size, + ptrdiff_t x_stride) { + + size_t row_idx = blockIdx.x; + if (row_idx >= outer_size) { + return; + } + + const Tdata *x = x_ + row_idx * x_stride; + const Tidx *target = reinterpret_cast(target_); + + Tidx label = target[row_idx]; + + Tdata max_val_raw = op::common_cuda::reduce_op::max(x, vocab_size); + __shared__ Tcompute max_val_shared; + if (threadIdx.x == 0) { + max_val_shared = static_cast(max_val_raw); + } + __syncthreads(); + Tcompute max_val = max_val_shared; + + Tcompute thread_sum = 0.0f; + for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) { + Tcompute val = static_cast(x[col]); + thread_sum += expf(val - max_val); + } + + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + thread_sum += __shfl_down_sync(0xffffffff, thread_sum, offset); + } + + static __shared__ Tcompute shared_sum[32]; + int lane = threadIdx.x % warpSize; + int warp = threadIdx.x / warpSize; + + if 
(lane == 0) { + shared_sum[warp] = thread_sum; + } + __syncthreads(); + + Tcompute block_sum = 0.0f; + if (warp == 0) { + + if (lane < (BLOCK_SIZE + warpSize - 1) / warpSize) { + block_sum = shared_sum[lane]; + } + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + block_sum += __shfl_down_sync(0xffffffff, block_sum, offset); + } + } + + if (threadIdx.x == 0) { + Tcompute log_term = logf(block_sum) + max_val; + + Tcompute target_logit = 0.0f; + + if (label >= 0 && static_cast(label) < vocab_size) { + target_logit = static_cast(x[label]); + } else { + + log_term = 0.0f; + } + + y_[row_idx] = static_cast(log_term - target_logit); + } +} + +#endif diff --git a/src/infiniop/ops/cross_entropy/info.h b/src/infiniop/ops/cross_entropy/info.h new file mode 100644 index 000000000..a83afebb8 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/info.h @@ -0,0 +1,17 @@ +#ifndef CROSS_ENTROPY_INFO_H +#define CROSS_ENTROPY_INFO_H +#include "../../../utils.h" +#include "../../tensor.h" +#include + +#include + +struct CrossEntropyInfo { + int dtype; + int target_dtype; + size_t outer_size; + size_t vocab_size; + ptrdiff_t x_stride; +}; + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h b/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h new file mode 100644 index 000000000..57bccea91 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_METAX_H__ +#define __CROSS_ENTROPY_METAX_H__ + +#include "../cross_entropy.h" + +DESCRIPTOR(metax) + +#endif // __CROSS_ENTROPY_METAX_H__ diff --git a/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca b/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca new file mode 100644 index 000000000..efd791183 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca @@ -0,0 +1,188 @@ +#include "../../../devices/metax/metax_common.h" +#include "cross_entropy_metax.h" +#include "../../../devices/metax/metax_kernel_common.h" + +#include + +#include "../../../reduce/cuda/reduce.cuh" + +#include + +namespace { + +template +__device__ void crossEntropyKernel( + Tdata *y_, + const Tdata *x_, + const void *target_, + size_t outer_size, + size_t vocab_size, + ptrdiff_t x_stride) { + + size_t row_idx = blockIdx.x; + if (row_idx >= outer_size) { + return; + } + + const Tdata *x = x_ + row_idx * x_stride; + const Tidx *target = reinterpret_cast(target_); + + Tidx label = target[row_idx]; + + Tdata max_val_raw = op::common_cuda::reduce_op::max(x, vocab_size); + __shared__ Tcompute max_val_shared; + if (threadIdx.x == 0) { + max_val_shared = static_cast(max_val_raw); + } + __syncthreads(); + + Tcompute max_val = max_val_shared; + + Tcompute thread_sum = Tcompute(0); + for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) { + Tcompute val = static_cast(x[col]); + thread_sum += expf(val - max_val); + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + Tcompute block_sum = BlockReduce(temp_storage).Sum(thread_sum); + + if (threadIdx.x == 0) { + if (label < 0 || static_cast(label) >= vocab_size) { + y_[row_idx] = static_cast(0.0f); + return; + } + Tcompute log_term = logf(block_sum) + max_val; + Tcompute target_logit = static_cast(x[label]); + y_[row_idx] = static_cast(log_term - target_logit); + } +} + +template +INFINIOP_METAX_KERNEL crossEntropy( + Tdata *y, const Tdata *x, const void *target, + size_t outer_size, size_t vocab_size, ptrdiff_t 
x_stride) { + crossEntropyKernel( + y, x, target, outer_size, vocab_size, x_stride); +} + +} // namespace + +namespace op::cross_entropy::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t target_desc) { + + (void)y_desc; + + auto x_dtype = x_desc->dtype(); + auto t_dtype = target_desc->dtype(); + + CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32); + CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64); + + CrossEntropyInfo info{}; + info.dtype = x_dtype; + info.target_dtype = t_dtype; + info.vocab_size = x_desc->shape().back(); + info.outer_size = target_desc->numel(); + info.x_stride = static_cast(info.vocab_size); + + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, const void *target, + const CrossEntropyInfo &info, hcStream_t stream) { + dim3 grid(static_cast(info.outer_size), 1, 1); + + if (info.target_dtype == INFINI_DTYPE_I64) { + if (info.dtype == INFINI_DTYPE_F16) { + crossEntropy + <<>>( + (half *)y, (const half *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_BF16) { + crossEntropy + <<>>( + (cuda_bfloat16 *)y, (const cuda_bfloat16 *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_F32) { + crossEntropy + <<>>( + (float *)y, (const float *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } else if (info.target_dtype == INFINI_DTYPE_I32) { + if (info.dtype == INFINI_DTYPE_F16) { + crossEntropy + <<>>( + (half *)y, (const half *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_BF16) { + crossEntropy + <<>>( + (cuda_bfloat16 *)y, (const cuda_bfloat16 *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_F32) { + crossEntropy + <<>>( + (float *)y, (const float *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *target, + void *stream_) const { + + (void)workspace; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + auto stream = reinterpret_cast(stream_); + int max_threads = _opaque->internal->maxThreadsPerBlock(); + + if (max_threads >= METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel(y, x, target, _info, stream)); + } else if (max_threads >= METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel(y, x, target, _info, stream)); + } else { + CHECK_STATUS(launchKernel<256>(y, x, target, _info, stream)); + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::cross_entropy::metax diff --git a/src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h b/src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h new file mode 100644 index 000000000..6648b0e32 --- /dev/null +++ 
b/src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h
@@ -0,0 +1,53 @@
+#ifndef __CROSS_ENTROPY_KERNEL_CUH__
+#define __CROSS_ENTROPY_KERNEL_CUH__
+
+template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute, typename Tidx>
+__device__ void crossEntropyKernel(
+    Tdata *y_,
+    const Tdata *x_,
+    const void *target_,
+    size_t outer_size,
+    size_t vocab_size,
+    ptrdiff_t x_stride) {
+
+    // One block per row of logits.
+    size_t row_idx = blockIdx.x;
+    if (row_idx >= outer_size) {
+        return;
+    }
+
+    const Tdata *x = x_ + row_idx * x_stride;
+    const Tidx *target = reinterpret_cast<const Tidx *>(target_);
+
+    Tidx label = target[row_idx];
+
+    // Numerically stable log-sum-exp: subtract the row maximum before exponentiating.
+    Tdata max_val_raw = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata>(x, vocab_size);
+    __shared__ Tcompute max_val_shared;
+    if (threadIdx.x == 0) {
+        max_val_shared = static_cast<Tcompute>(max_val_raw);
+    }
+    __syncthreads();
+
+    Tcompute max_val = max_val_shared;
+
+    Tcompute thread_sum = Tcompute(0);
+    for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) {
+        Tcompute val = static_cast<Tcompute>(x[col]);
+        thread_sum += expf(val - max_val);
+    }
+
+    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+    Tcompute block_sum = BlockReduce(temp_storage).Sum(thread_sum);
+
+    if (threadIdx.x == 0) {
+        // Out-of-range labels yield a zero loss instead of reading out of bounds.
+        if (label < 0 || static_cast<size_t>(label) >= vocab_size) {
+            y_[row_idx] = static_cast<Tdata>(0.0f);
+            return;
+        }
+        Tcompute log_term = logf(block_sum) + max_val;
+        Tcompute target_logit = static_cast<Tcompute>(x[label]);
+        y_[row_idx] = static_cast<Tdata>(log_term - target_logit);
+    }
+}
+
+#endif
diff --git a/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h b/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h
new file mode 100644
index 000000000..454b14617
--- /dev/null
+++ b/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h
@@ -0,0 +1,8 @@
+#ifndef __CROSS_ENTROPY_MOORE_H__
+#define __CROSS_ENTROPY_MOORE_H__
+
+#include "../cross_entropy.h"
+
+DESCRIPTOR(moore)
+
+#endif
diff --git a/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu b/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu
new file mode 100644
index 000000000..2535679dd
--- /dev/null
+++ b/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu
@@ -0,0 +1,129 @@
+#include "../../../devices/moore/moore_common.h"
+#include "cross_entropy_moore.h"
+
+#include <cub/cub.cuh>
+#include "../../../devices/moore/moore_kernel_common.h"
+
+#include "../../../reduce/cuda/reduce.cuh"
+
+#include "cross_entropy_kernel.h"
+
+template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute, typename Tidx>
+INFINIOP_MOORE_KERNEL crossEntropy(
+    Tdata *y, const Tdata *x, const void *target,
+    size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) {
+    crossEntropyKernel<BLOCK_SIZE, Tdata, Tcompute, Tidx>(
+        y, x, target, outer_size, vocab_size, x_stride);
+}
+
+namespace op::cross_entropy::moore {
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::moore::Handle::Internal> internal;
+};
+
+Descriptor::~Descriptor() {
+    delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    infiniopTensorDescriptor_t target_desc) {
+
+    (void)y_desc;
+
+    auto x_dtype = x_desc->dtype();
+    auto t_dtype = target_desc->dtype();
+
+    CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
+    CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
+
+    CrossEntropyInfo info{};
+    info.dtype = x_dtype;
+    info.target_dtype = t_dtype;
+    info.vocab_size = x_desc->shape().back();
+    info.outer_size = target_desc->numel();
+    info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
+
+    *desc_ptr = new Descriptor(
+        new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
+        info, 0, handle->device, handle->device_id);
+    return
INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, const void *target, + const CrossEntropyInfo &info, musaStream_t stream) { + dim3 grid(static_cast(info.outer_size), 1, 1); + + if (info.target_dtype == INFINI_DTYPE_I64) { + if (info.dtype == INFINI_DTYPE_F16) { + crossEntropy + <<>>( + (half *)y, (const half *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_BF16) { + crossEntropy + <<>>( + (__mt_bfloat16 *)y, (const __mt_bfloat16 *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_F32) { + crossEntropy + <<>>( + (float *)y, (const float *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } else if (info.target_dtype == INFINI_DTYPE_I32) { + if (info.dtype == INFINI_DTYPE_F16) { + crossEntropy + <<>>( + (half *)y, (const half *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_BF16) { + crossEntropy + <<>>( + (__mt_bfloat16 *)y, (const __mt_bfloat16 *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_F32) { + crossEntropy + <<>>( + (float *)y, (const float *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + const void *target, + void *stream_) const { + musaStream_t stream = (musaStream_t)stream_; + (void)workspace; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel(y, x, target, _info, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel(y, x, target, _info, stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::cross_entropy::moore diff --git a/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu b/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu new file mode 100644 index 000000000..0ce3f4984 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu @@ -0,0 +1,107 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "cross_entropy_nvidia.cuh" + +template +INFINIOP_CUDA_KERNEL crossEntropy( + Tdata *y, const Tdata *x, const void *target, + size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) { + + crossEntropyKernel( + y, x, target, outer_size, vocab_size, x_stride); +} + +namespace op::cross_entropy::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t target_desc) { + + auto x_dtype = x_desc->dtype(); + auto t_dtype = target_desc->dtype(); + + CrossEntropyInfo info; + info.dtype = x_dtype; + info.target_dtype = t_dtype; + + info.vocab_size = x_desc->shape().back(); + info.outer_size = 
target_desc->numel(); + info.x_stride = static_cast(info.vocab_size); + + auto internal = reinterpret_cast(handle)->internal(); + + *desc_ptr = new Descriptor( + new Opaque{internal}, + info, 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, const void *target, + const CrossEntropyInfo &info, cudaStream_t stream) { + + dim3 grid(static_cast(info.outer_size), 1, 1); + + if (info.target_dtype == INFINI_DTYPE_I64) { + if (info.dtype == INFINI_DTYPE_F16) { + crossEntropy + <<>>((half *)y, (const half *)x, target, info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_BF16) { + crossEntropy + <<>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, target, info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_F32) { + crossEntropy + <<>>((float *)y, (const float *)x, target, info.outer_size, info.vocab_size, info.x_stride); + } + } else if (info.target_dtype == INFINI_DTYPE_I32) { + + if (info.dtype == INFINI_DTYPE_F16) { + crossEntropy + <<>>((half *)y, (const half *)x, target, info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_BF16) { + crossEntropy + <<>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, target, info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_F32) { + crossEntropy + <<>>((float *)y, (const float *)x, target, info.outer_size, info.vocab_size, info.x_stride); + } + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + const void *target, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; + + int max_threads = _opaque->internal->maxThreadsPerBlock(); + + if (max_threads >= 1024) { + CHECK_STATUS(launchKernel<1024>(y, x, target, _info, stream)); + } else if (max_threads >= 512) { + CHECK_STATUS(launchKernel<512>(y, x, target, _info, stream)); + } else { + CHECK_STATUS(launchKernel<256>(y, x, target, _info, stream)); + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::cross_entropy::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh b/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh new file mode 100644 index 000000000..441e5b8d8 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_NVIDIA_H__ +#define __CROSS_ENTROPY_NVIDIA_H__ + +#include "../cross_entropy.h" + +DESCRIPTOR(nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/cross_entropy/operator.cc b/src/infiniop/ops/cross_entropy/operator.cc new file mode 100644 index 000000000..75f35fcb7 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/operator.cc @@ -0,0 +1,174 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cross_entropy.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cross_entropy_cpu.h" +#endif + +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/cross_entropy_nvidia.cuh" +#endif + +#ifdef ENABLE_MOORE_API +#include "moore/cross_entropy_moore.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cross_entropy_metax.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateCrossEntropyDescriptor( + infiniopHandle_t handle, + 
infiniopCrossEntropyDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t target_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cross_entropy::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, x_desc, target_desc); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia) +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore) +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetCrossEntropyWorkspaceSize( + infiniopCrossEntropyDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__INFINI_C infiniStatus_t infiniopCrossEntropy( + infiniopCrossEntropyDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *target, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, target, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia) +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore) +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyCrossEntropyDescriptor( + infiniopCrossEntropyDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + DESTROY(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_HYGON_API + DESTROY(INFINI_DEVICE_HYGON, nvidia) +#endif +#ifdef ENABLE_MOORE_API + DESTROY(INFINI_DEVICE_MOORE, moore) +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif + default: + 
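// No backend for this device type was compiled into this build.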
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef DESTROY
+}
diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.cc b/src/infiniop/ops/equal/cpu/equal_cpu.cc
new file mode 100644
index 000000000..ff8ebe395
--- /dev/null
+++ b/src/infiniop/ops/equal/cpu/equal_cpu.cc
@@ -0,0 +1,68 @@
+#include <type_traits>
+#include <vector>
+
+#include "equal_cpu.h"
+
+namespace op::equal::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &b_desc = input_desc_vec.at(1);
+    auto compute_dtype = a_desc->dtype();
+    auto out_dtype = out_desc->dtype();
+
+    if (compute_dtype != b_desc->dtype()) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);
+
+    CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64,
+                INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
+
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<EqualOp, bool, fp16_t, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<EqualOp, bool, float, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<EqualOp, bool, double, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<EqualOp, bool, bf16_t, bf16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_I32:
+        return _device_info->calculate<EqualOp, bool, int32_t, int32_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_I64:
+        return _device_info->calculate<EqualOp, bool, int64_t, int64_t>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::equal::cpu
diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.h b/src/infiniop/ops/equal/cpu/equal_cpu.h
new file mode 100644
index 000000000..fd811f4b0
--- /dev/null
+++ b/src/infiniop/ops/equal/cpu/equal_cpu.h
@@ -0,0 +1,28 @@
+#ifndef __EQUAL_CPU_H__
+#define __EQUAL_CPU_H__
+
+#include <type_traits>
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(equal, cpu)
+
+namespace op::equal::cpu {
+
+typedef struct EqualOp {
+public:
+    static constexpr size_t num_inputs = 2;
+
+    template <typename Tin0, typename Tin1>
+    bool operator()(const Tin0 &a, const Tin1 &b) {
+        if constexpr (std::is_same_v<Tin0, Tin1>) {
+            return a == b;
+        } else {
+            return false;
+        }
+    }
+} EqualOp;
+
+} // namespace op::equal::cpu
+
+#endif
diff --git a/src/infiniop/ops/equal/cuda/kernel.cuh b/src/infiniop/ops/equal/cuda/kernel.cuh
new file mode 100644
index 000000000..11ad5981e
--- /dev/null
+++ b/src/infiniop/ops/equal/cuda/kernel.cuh
@@ -0,0 +1,37 @@
+#ifndef __EQUAL_CUDA_H__
+#define __EQUAL_CUDA_H__
+
+#if defined(__MACACC__)
+#include <maca_fp16.h>
+#include <maca_bfloat16.h>
+#else
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#endif
+#include <type_traits>
+
+namespace op::equal::cuda {
+
+typedef struct EqualOp {
+public:
+    static constexpr size_t num_inputs = 2;
+
+    template <typename Tin0, typename Tin1>
+    __device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const {
+        if constexpr (std::is_same_v<Tin0, Tin1>) {
+            if constexpr (std::is_same_v<Tin0, half2>) {
+                static_assert(!std::is_same_v<Tin0, half2>,
+                              "half2 is not supported for mixed output dtype");
+            } else if constexpr (std::is_same_v<Tin0, half>) {
+                return static_cast<bool>(__heq(a, b));
+            } else {
+                return static_cast<bool>(a == b);
+            }
+        } else {
+            return false;
+        }
+    }
+} EqualOp;
+
+} // namespace op::equal::cuda
+
+#endif
diff --git a/src/infiniop/ops/equal/metax/equal_metax.h b/src/infiniop/ops/equal/metax/equal_metax.h
new file mode 100644
index 000000000..6e4cd64b9
--- /dev/null
+++ b/src/infiniop/ops/equal/metax/equal_metax.h
@@ -0,0 +1,8 @@
+#ifndef __EQUAL_METAX_API_H__
+#define __EQUAL_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(equal, metax)
+
+#endif // __EQUAL_METAX_API_H__
diff --git a/src/infiniop/ops/equal/metax/equal_metax.maca b/src/infiniop/ops/equal/metax/equal_metax.maca
new file mode 100644
index 000000000..265e5b5a6
--- /dev/null
+++ b/src/infiniop/ops/equal/metax/equal_metax.maca
@@ -0,0 +1,69 @@
+#include "equal_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::equal::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+
+    const auto &a_desc = input_desc_vec.at(0);
+    auto compute_dtype = a_desc->dtype();
+    auto out_dtype = out_desc->dtype();
+
+    const auto &b_desc = input_desc_vec.at(1);
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
+                INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
+
+    CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);
+
+    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I32:
+        return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I64:
+        return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::equal::metax
diff --git a/src/infiniop/ops/equal/moore/equal_moore.h b/src/infiniop/ops/equal/moore/equal_moore.h
new file mode 100644
index 000000000..2fed1bb40
--- /dev/null
+++ b/src/infiniop/ops/equal/moore/equal_moore.h
@@ -0,0 +1,8 @@
+#ifndef __EQUAL_MOORE_API_H__
+#define __EQUAL_MOORE_API_H__
+
+#include
"../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(equal, moore) + +#endif // __EQUAL_MOORE_API_H__ diff --git a/src/infiniop/ops/equal/moore/equal_moore.mu b/src/infiniop/ops/equal/moore/equal_moore.mu new file mode 100644 index 000000000..d0eb8395d --- /dev/null +++ b/src/infiniop/ops/equal/moore/equal_moore.mu @@ -0,0 +1,140 @@ +#include "equal_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "equal_moore_kernel.h" + +namespace op::equal::moore { +namespace { + +inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { + if (!info.isOutputContiguous()) { + return false; + } + const bool *input_contiguous = info.getInputContiguous(); + const bool *input_broadcasted = info.getInputBroadcasted(); + for (size_t i = 0; i < 2; ++i) { + if (!input_contiguous[i] || input_broadcasted[i]) { + return false; + } + } + return true; +} + +template +INFINIOP_MOORE_KERNEL equal_contiguous_kernel(size_t numel, Tout *output, const Tin *a, const Tin *b) { + const auto op = op::equal::moore::EqualOp{}; + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + for (; idx < numel; idx += stride) { + output[idx] = op.template operator()(a[idx], b[idx]); + } +} + +template +infiniStatus_t launch_fast_path(size_t numel, + void *output, + const std::vector &inputs, + void *stream) { + if (numel == 0) { + return INFINI_STATUS_SUCCESS; + } + + constexpr int kBlockSize = 256; + int grid = static_cast((numel + kBlockSize - 1) / kBlockSize); + if (grid > 65535) { + grid = 65535; + } + + auto musa_stream = reinterpret_cast(stream); + equal_contiguous_kernel<<>>( + numel, + reinterpret_cast(output), + reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1])); + return INFINI_STATUS_SUCCESS; +} + +} // namespace + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + auto compute_dtype = a_desc->dtype(); + auto out_dtype = out_desc->dtype(); + + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, + INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64); + + CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create MOORE elementwise descriptor + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (can_use_contiguous_fast_path(_info)) { + size_t numel = _info.getOutputSize(); + switch (_dtype) { + case INFINI_DTYPE_F16: + return launch_fast_path(numel, output, inputs, stream); + case INFINI_DTYPE_BF16: + return launch_fast_path(numel, output, inputs, stream); + case INFINI_DTYPE_F32: + return launch_fast_path(numel, output, inputs, stream); + case INFINI_DTYPE_I32: + return launch_fast_path(numel, output, inputs, stream); + case INFINI_DTYPE_I64: + return launch_fast_path(numel, output, inputs, stream); + case INFINI_DTYPE_F64: + return launch_fast_path(numel, output, inputs, stream); + 
default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, moore::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, moore::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::equal::moore diff --git a/src/infiniop/ops/equal/moore/equal_moore_kernel.h b/src/infiniop/ops/equal/moore/equal_moore_kernel.h new file mode 100644 index 000000000..a4e32880b --- /dev/null +++ b/src/infiniop/ops/equal/moore/equal_moore_kernel.h @@ -0,0 +1,30 @@ +#ifndef __EQUAL_MOORE_KERNEL_H__ +#define __EQUAL_MOORE_KERNEL_H__ + +#include + +namespace op::equal::moore { + +typedef struct EqualOp { +public: + static constexpr size_t num_inputs = 2; + + template + __device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const { + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return __half2float(a) == __half2float(b); + } else if constexpr (std::is_same_v) { + return __bfloat162float(a) == __bfloat162float(b); + } else { + return a == b; + } + } else { + return false; + } + } +} EqualOp; + +} // namespace op::equal::moore + +#endif // __EQUAL_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cu b/src/infiniop/ops/equal/nvidia/equal_nvidia.cu new file mode 100644 index 000000000..5bdf92e6c --- /dev/null +++ b/src/infiniop/ops/equal/nvidia/equal_nvidia.cu @@ -0,0 +1,137 @@ +#include +#include +#include + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "equal_nvidia.cuh" + +namespace { + +template +INFINIOP_CUDA_KERNEL FastEqualKernel(size_t n, Tout *output, const Tin *a, const Tin *b) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + op::equal::cuda::EqualOp op{}; + for (; idx < n; idx += stride) { + output[idx] = op.template operator()(a[idx], b[idx]); + } +} + +template +infiniStatus_t launchFastEqualKernel(size_t numel, + void *output, + const std::vector &inputs, + void *stream) { + if (numel == 0) { + return INFINI_STATUS_SUCCESS; + } + constexpr int block = 256; + int grid = static_cast((numel + block - 1) / block); + grid = std::min(grid, 65535); + auto cuda_stream = reinterpret_cast(stream); + FastEqualKernel<<>>( + numel, + reinterpret_cast(output), + reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1])); + auto err = cudaGetLastError(); + return err == cudaSuccess ? 
INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR; +} + +} // namespace + +namespace op::equal::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + auto compute_dtype = a_desc->dtype(); + auto out_dtype = out_desc->dtype(); + + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, + INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64); + + CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_U8, INFINI_DTYPE_I8); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + bool fast_path = _info.isOutputContiguous(); + if (fast_path) { + const bool *input_contiguous = _info.getInputContiguous(); + const bool *input_broadcasted = _info.getInputBroadcasted(); + for (size_t i = 0; i < 2; ++i) { + fast_path &= input_contiguous[i] && !input_broadcasted[i]; + } + } + + if (fast_path) { + size_t numel = _info.getOutputSize(); + switch (_dtype) { + case INFINI_DTYPE_F16: + return launchFastEqualKernel(numel, output, inputs, stream); + case INFINI_DTYPE_BF16: + return launchFastEqualKernel(numel, output, inputs, stream); + case INFINI_DTYPE_F32: + return launchFastEqualKernel(numel, output, inputs, stream); + case INFINI_DTYPE_I32: + return launchFastEqualKernel(numel, output, inputs, stream); + case INFINI_DTYPE_I64: + return launchFastEqualKernel(numel, output, inputs, stream); + case INFINI_DTYPE_F64: + return launchFastEqualKernel(numel, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::equal::nvidia diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh b/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh new file mode 100644 index 000000000..96932dc3d --- /dev/null +++ b/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __EQUAL_CUDA_API_H__ 
+#define __EQUAL_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(equal, nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/equal/operator.cc b/src/infiniop/ops/equal/operator.cc new file mode 100644 index 000000000..80da07e01 --- /dev/null +++ b/src/infiniop/ops/equal/operator.cc @@ -0,0 +1,201 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/equal.h" + +#ifdef ENABLE_CPU_API +#include "cpu/equal_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/equal_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/equal_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/equal_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/equal_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/equal_moore.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateEqualDescriptor( + infiniopHandle_t handle, + infiniopEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::equal::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetEqualWorkspaceSize(infiniopEqualDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C infiniStatus_t infiniopEqual( + infiniopEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + 
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+__INFINI_C infiniStatus_t
+infiniopDestroyEqualDescriptor(infiniopEqualDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                             \
+    case CASE:                                                              \
+        delete reinterpret_cast<op::equal::NAMESPACE::Descriptor *>(desc);  \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        DELETE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc
new file mode 100644
index 000000000..f47198580
--- /dev/null
+++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc
@@ -0,0 +1,91 @@
+#include "hardswish_cpu.h"
+
+#include <cstddef>
+
+namespace op::hardswish::cpu {
+namespace {
+
+inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
+    return info.isOutputContiguous() && info.getInputSize() == 1 && info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
+}
+
+template <typename T>
+infiniStatus_t launch_contiguous_cpu(const op::elementwise::ElementwiseInfo &info,
+                                     void *output,
+                                     const std::vector<const void *> &inputs) {
+    const T *in = reinterpret_cast<const T *>(inputs[0]);
+    T *out = reinterpret_cast<T *>(output);
+    const ptrdiff_t size = static_cast<ptrdiff_t>(info.getOutputSize());
+
+#pragma omp parallel for if (size > 1024)
+    for (ptrdiff_t i = 0; i < size; ++i) {
+        out[i] = HardSwishOp{}(in[i]);
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    const bool fast_path = can_use_contiguous_fast_path(_info);
+    if (fast_path) {
+        switch (_dtype) {
+        case INFINI_DTYPE_BF16:
+            return launch_contiguous_cpu<bf16_t>(_info, output, inputs);
+        case INFINI_DTYPE_F16:
+            return launch_contiguous_cpu<fp16_t>(_info, output, inputs);
+        case INFINI_DTYPE_F32:
+            return launch_contiguous_cpu<float>(_info, output, inputs);
+        case INFINI_DTYPE_F64:
+            return launch_contiguous_cpu<double>(_info, output, inputs);
+        default:
+            break;
+        }
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<HardSwishOp, bf16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<HardSwishOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<HardSwishOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<HardSwishOp, double>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::hardswish::cpu
diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h
new file mode 100644
index 000000000..b853663aa
--- /dev/null
+++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h
@@ -0,0 +1,50 @@
+#ifndef __HARDSWISH_CPU_H__
+#define __HARDSWISH_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(hardswish, cpu)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace op::hardswish::cpu {
+
+typedef struct HardSwishOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        const float x_f = utils::cast<float>(x);
+        const float clamped = std::min(std::max(x_f + 3.0f, 0.0f), 6.0f);
+        const float result = x_f * clamped * (1.0f / 6.0f);
+        return utils::cast<T>(result);
+    }
+} HardSwishOp;
+
+typedef struct HardSwishContiguousOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+
+        T three = static_cast<T>(3);
+        T zero = static_cast<T>(0);
+        T six = static_cast<T>(6);
+
+        T scale = static_cast<T>(0.16666667f);
+
+        T val = x + three;
+
+        val = std::max(zero, val);
+        val = std::min(six, val);
+
+        return x * val * scale;
+    }
+} HardSwishContiguousOp;
+
+} // namespace op::hardswish::cpu
+
+#endif
diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh
new file mode 100644
index 000000000..21b6a5f8d
--- /dev/null
+++ b/src/infiniop/ops/hardswish/cuda/kernel.cuh
@@ -0,0 +1,86 @@
+#ifndef __HARDSWISH_CUDA_H__
+#define __HARDSWISH_CUDA_H__
+
+#include <type_traits>
+#if defined(__MACACC__)
+#include
+#include
+#else
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#endif
+
+namespace op::hardswish::cuda {
+
+typedef struct HardSwishOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+
+        if constexpr (std::is_same_v<T, half2>) {
+
+            const half2 three = __float2half2_rn(3.0f);
+            const half2 scale = __float2half2_rn(0.16666667f);
+
+            half2 val = __hadd2(x, three);
+
+#if defined(ENABLE_ILUVATAR_API)
+
+            float2 val_f = __half22float2(val);
+            val_f.x = fminf(fmaxf(val_f.x, 0.0f), 6.0f);
+            val_f.y = fminf(fmaxf(val_f.y, 0.0f), 6.0f);
+            val = __floats2half2_rn(val_f.x, val_f.y);
+#else
+
+            const half2 zero = __float2half2_rn(0.0f);
+            const half2 six = __float2half2_rn(6.0f);
+
+#if __CUDA_ARCH__ >= 800
+
+            val = __hmin2(__hmax2(val, zero), six);
+#else
+
+            val = __hmax2(val, zero);
+            val = __hmin2(val, six);
+#endif
+#endif
+
+            return __hmul2(__hmul2(x, val), scale);
+
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+
+            const float x_f = __bfloat162float(x);
+
+            const float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
+            return __float2bfloat16(x_f * val * 0.16666667f);
+
+        } else if constexpr (std::is_same_v<T, half>) {
+
+            const float x_f = __half2float(x);
+            const float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
+            return __float2half(x_f * val * 0.16666667f);
+
+        } else if constexpr (std::is_same_v<T, float>) {
+
+            const float val = fminf(fmaxf(x + 3.0f, 0.0f), 6.0f);
+            return x * val * 0.16666667f;
+
+        } else if constexpr (std::is_same_v<T, double>) {
+            const double val = fmin(fmax(x + 3.0, 0.0), 6.0);
+            return x * val * (1.0 / 6.0);
+        }
+    }
+} HardSwishOp;
+
+} // namespace op::hardswish::cuda
+
+#endif
diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.h b/src/infiniop/ops/hardswish/metax/hardswish_metax.h
new file mode 100644
index 000000000..16b131aa9
--- /dev/null
+++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.h
@@ -0,0 +1,8 @@
+#ifndef __HARDSWISH_METAX_API_H__
+#define __HARDSWISH_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(hardswish, metax)
+
+#endif // __HARDSWISH_METAX_API_H__
diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.maca b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca
new file mode 100644
index 000000000..fc57a9b20
--- /dev/null
+++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca
@@ -0,0 +1,58 @@
+#include "hardswish_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::hardswish::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::hardswish::metax
diff --git a/src/infiniop/ops/hardswish/moore/hardswish_moore.h b/src/infiniop/ops/hardswish/moore/hardswish_moore.h
new file mode 100644
index 000000000..e5861a158
--- /dev/null
+++ b/src/infiniop/ops/hardswish/moore/hardswish_moore.h
@@ -0,0 +1,8 @@
+#ifndef __HARDSWISH_MOORE_API_H__
+#define __HARDSWISH_MOORE_API_H__
+
+#include "../../../elementwise/moore/elementwise_moore_api.h"
+
+ELEMENTWISE_DESCRIPTOR(hardswish, moore)
+
+#endif // __HARDSWISH_MOORE_API_H__
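All of the backends above implement the same scalar definition, hardswish(x) = x * clamp(x + 3, 0, 6) / 6; the half/bf16 paths compute in float and round back, and several kernels multiply by a precomputed 1/6 (0.16666667f), which can differ from an exact division by at most an ulp. As a quick sanity reference, here is a minimal standalone C++ sketch of the same math (not part of the diff):

#include <algorithm>
#include <cassert>

// Scalar hardswish reference: x * clamp(x + 3, 0, 6) / 6. Dividing by 6
// keeps the saturated cases exact; the kernels trade that for a multiply.
inline float hardswish_ref(float x) {
    const float clamped = std::min(std::max(x + 3.0f, 0.0f), 6.0f);
    return x * clamped / 6.0f;
}

int main() {
    assert(hardswish_ref(-4.0f) == 0.0f); // clamped low: output is 0
    assert(hardswish_ref(4.0f) == 4.0f);  // saturated high: identity
    assert(hardswish_ref(0.0f) == 0.0f);  // x = 0 maps to 0
    return 0;
}

diff --git a/src/infiniop/ops/hardswish/moore/hardswish_moore.mu b/src/infiniop/ops/hardswish/moore/hardswish_moore.mu
new file mode 100644
index 000000000..3a1290b35
--- /dev/null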
+++ b/src/infiniop/ops/hardswish/moore/hardswish_moore.mu
@@ -0,0 +1,118 @@
+#include "hardswish_moore.h"
+
+#include "../../../elementwise/moore/elementwise_moore.h"
+
+#include "hardswish_moore_kernel.h"
+
+namespace op::hardswish::moore {
+namespace {
+
+inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
+    return info.isOutputContiguous() && info.getInputSize() == 1 &&
+           info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
+}
+
+template <typename T>
+INFINIOP_MOORE_KERNEL hardswish_contiguous_kernel(size_t numel, T *out, const T *in) {
+    const auto op = op::hardswish::moore::HardSwishOp{};
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = blockDim.x * gridDim.x;
+    for (; idx < numel; idx += stride) {
+        out[idx] = op(in[idx]);
+    }
+}
+
+template <typename T>
+infiniStatus_t launch_fast_path(size_t numel,
+                                void *output,
+                                const std::vector<const void *> &inputs,
+                                void *stream) {
+    if (numel == 0) {
+        return INFINI_STATUS_SUCCESS;
+    }
+
+    constexpr int kBlockSize = 256;
+    int grid = static_cast<int>((numel + kBlockSize - 1) / kBlockSize);
+    if (grid > 65535) {
+        grid = 65535;
+    }
+
+    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
+    hardswish_contiguous_kernel<T><<<grid, kBlockSize, 0, musa_stream>>>(
+        numel,
+        reinterpret_cast<T *>(output),
+        reinterpret_cast<const T *>(inputs[0]));
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // create MOORE elementwise descriptor
+    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    const bool fast_path = can_use_contiguous_fast_path(_info);
+    if (fast_path) {
+        switch (_dtype) {
+        case INFINI_DTYPE_BF16:
+            return launch_fast_path<cuda_bfloat16>(_info.getOutputSize(), output, inputs, stream);
+        case INFINI_DTYPE_F16:
+            return launch_fast_path<half>(_info.getOutputSize(), output, inputs, stream);
+        case INFINI_DTYPE_F32:
+            return launch_fast_path<float>(_info.getOutputSize(), output, inputs, stream);
+        case INFINI_DTYPE_F64:
+            return launch_fast_path<double>(_info.getOutputSize(), output, inputs, stream);
+        default:
+            break;
+        }
+    }
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, moore::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, moore::HardSwishOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, moore::HardSwishOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, moore::HardSwishOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::hardswish::moore
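The fast path above caps the grid at 65535 blocks and relies on the grid-stride loop in hardswish_contiguous_kernel, so every element is still visited when numel exceeds what a one-block-per-256-elements launch would cover. A minimal standalone CUDA sketch of that pattern (hypothetical kernel and sizes, not taken from the diff):

#include <cuda_runtime.h>

// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads, so a capped grid covers all items.
__global__ void scale_kernel(size_t numel, float *out, const float *in) {
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = blockDim.x * gridDim.x;
    for (; idx < numel; idx += stride) {
        out[idx] = in[idx] * 2.0f;
    }
}

int main() {
    const size_t numel = 1 << 20;
    float *in = nullptr, *out = nullptr;
    cudaMalloc(&in, numel * sizeof(float));
    cudaMalloc(&out, numel * sizeof(float));
    int grid = static_cast<int>((numel + 255) / 256);
    if (grid > 65535) grid = 65535; // same cap as the fast path above
    scale_kernel<<<grid, 256>>>(numel, out, in);
    cudaDeviceSynchronize();
    cudaFree(in);
    cudaFree(out);
    return 0;
}

diff --git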
a/src/infiniop/ops/hardswish/moore/hardswish_moore_kernel.h b/src/infiniop/ops/hardswish/moore/hardswish_moore_kernel.h new file mode 100644 index 000000000..60e3dbc60 --- /dev/null +++ b/src/infiniop/ops/hardswish/moore/hardswish_moore_kernel.h @@ -0,0 +1,39 @@ +#ifndef __HARDSWISH_MOORE_KERNEL_H__ +#define __HARDSWISH_MOORE_KERNEL_H__ + +#include +#include + +namespace op::hardswish::moore { + +typedef struct HardSwishOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + float x_f = __half2float(x); + float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f); + return __float2half(x_f * val * 0.16666667f); + } else if constexpr (std::is_same_v) { + float x_f = __bfloat162float(x); + float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f); + return __float2bfloat16_rn(x_f * val * 0.16666667f); + } else if constexpr (std::is_same_v) { + float val = fminf(fmaxf(x + 3.0f, 0.0f), 6.0f); + return x * val * 0.16666667f; + } else if constexpr (std::is_same_v) { + double val = fmin(fmax(x + 3.0, 0.0), 6.0); + return x * val * (1.0 / 6.0); + } else { + float x_f = static_cast(x); + float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f); + return static_cast(x_f * val * 0.16666667f); + } + } +} HardSwishOp; + +} // namespace op::hardswish::moore + +#endif // __HARDSWISH_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu new file mode 100644 index 000000000..f7736a7fd --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu @@ -0,0 +1,115 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "hardswish_nvidia.cuh" + +#include + +namespace op::hardswish::nvidia { +namespace { + +inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { + return info.isOutputContiguous() && info.getInputSize() == 1 && info.getInputContiguous()[0] && !info.getInputBroadcasted()[0]; +} + +template +__global__ void hardswish_contiguous_kernel(size_t numel, T *out, const T *in) { + const auto op = op::hardswish::cuda::HardSwishOp{}; + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + while (idx < numel) { + out[idx] = op(in[idx]); + idx += blockDim.x * gridDim.x; + } +} + +template +infiniStatus_t launch_fast_path(size_t numel, + void *output, + const std::vector &inputs, + void *stream) { + if (numel == 0) { + return INFINI_STATUS_SUCCESS; + } + + constexpr int BLOCK_SIZE = 256; + int grid = static_cast((numel + BLOCK_SIZE - 1) / BLOCK_SIZE); + grid = std::min(grid, 65535); + + auto *out_ptr = reinterpret_cast(output); + auto *in_ptr = reinterpret_cast(inputs[0]); + auto cuda_stream = reinterpret_cast(stream); + + hardswish_contiguous_kernel<<>>(numel, out_ptr, in_ptr); + cudaError_t err = cudaGetLastError(); + return err == cudaSuccess ? 
INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR; +} + +} // namespace + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + const bool fast_path = can_use_contiguous_fast_path(_info); + if (fast_path) { + switch (_dtype) { + case INFINI_DTYPE_BF16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream); + case INFINI_DTYPE_F16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream); + case INFINI_DTYPE_F32: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream); + case INFINI_DTYPE_F64: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream); + default: + break; + } + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::nvidia diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh new file mode 100644 index 000000000..e544591dc --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __HARDSWISH_CUDA_API_H__ +#define __HARDSWISH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(hardswish, nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/hardswish/operator.cc b/src/infiniop/ops/hardswish/operator.cc new file mode 100644 index 000000000..ddce97f16 --- /dev/null +++ b/src/infiniop/ops/hardswish/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/hardswish.h" + +#ifdef ENABLE_CPU_API +#include "cpu/hardswish_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/hardswish_nvidia.cuh" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/hardswish_moore.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/hardswish_metax.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateHardSwishDescriptor( + infiniopHandle_t handle, + infiniopHardSwishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) 
{ + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::hardswish::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetHardSwishWorkspaceSize(infiniopHardSwishDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C infiniStatus_t infiniopHardSwish( + infiniopHardSwishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyHardSwishDescriptor(infiniopHardSwishDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc b/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc new file mode 100644 index 000000000..1bd276308 --- /dev/null +++ b/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc @@ -0,0 +1,124 @@ +#include "hardtanh_cpu.h" + +#include + +namespace op::hardtanh::cpu { + +Descriptor::Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + 
_info(std::move(info)), + _workspace_size(workspace_size), + _min_val(min_val), + _max_val(max_val) {} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_SAME_SHAPE(output_shape, input_shape); + + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + + *desc_ptr = new Descriptor( + dtype, + info_result.take(), + 0, + handle->device, + handle->device_id, + min_val, + max_val); + + return INFINI_STATUS_SUCCESS; +} + +template +static infiniStatus_t launchCpuHardTanh(const op::elementwise::ElementwiseInfo &info, + void *output, + const std::vector &inputs, + float min_val, + float max_val) { + if (inputs.empty()) { + return INFINI_STATUS_BAD_PARAM; + } + + T *out = reinterpret_cast(output); + const T *in = reinterpret_cast(inputs[0]); + const auto ndim = info.getNdim(); + const auto *output_shape = info.getOutputShape(); + const auto *output_strides = info.getOutputStrides(); + const auto *input_shape = info.getInputShape(0); + const auto *input_strides = info.getInputStrides(0); + const auto *input_contiguous = info.getInputContiguous(); + ptrdiff_t output_size = info.getOutputSize(); + +#pragma omp parallel for if (output_size > 1024) + for (ptrdiff_t i = 0; i < output_size; ++i) { + const size_t out_idx = info.isOutputContiguous() + ? static_cast(i) + : op::common_cpu::indexToOffset(i, ndim, output_shape, output_strides); + const size_t in_idx = input_contiguous[0] + ? 
static_cast(i) + : op::common_cpu::indexToOffset(i, ndim, input_shape, input_strides); + + if constexpr (std::is_same_v || std::is_same_v) { + float value = utils::cast(in[in_idx]); + float clamped = HardTanhOp{}(value, min_val, max_val); + out[out_idx] = utils::cast(clamped); + } else { + out[out_idx] = HardTanhOp{}(in[in_idx], min_val, max_val); + } + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + (void)workspace; + (void)workspace_size; + (void)stream; + + if (inputs.size() != 1) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return launchCpuHardTanh(_info, output, inputs, _min_val, _max_val); + case INFINI_DTYPE_F16: + return launchCpuHardTanh(_info, output, inputs, _min_val, _max_val); + case INFINI_DTYPE_F32: + return launchCpuHardTanh(_info, output, inputs, _min_val, _max_val); + case INFINI_DTYPE_F64: + return launchCpuHardTanh(_info, output, inputs, _min_val, _max_val); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} +} // namespace op::hardtanh::cpu diff --git a/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h b/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h new file mode 100644 index 000000000..09bfb340c --- /dev/null +++ b/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h @@ -0,0 +1,63 @@ +#ifndef __HARDTANH_CPU_H__ +#define __HARDTANH_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +namespace op::hardtanh::cpu { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + size_t _workspace_size; + float _min_val; + float _max_val; + + Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val); + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; + + float minVal() const { return _min_val; } + float maxVal() const { return _max_val; } +}; + +typedef struct HardTanhOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x, float min_val, float max_val) const { + T low = static_cast(min_val); + T high = static_cast(max_val); + T val = x < low ? low : x; + return val > high ? 
high : val; + } +} HardTanhOp; + +} // namespace op::hardtanh::cpu + +#endif diff --git a/src/infiniop/ops/hardtanh/cuda/kernel.cuh b/src/infiniop/ops/hardtanh/cuda/kernel.cuh new file mode 100644 index 000000000..28987f82c --- /dev/null +++ b/src/infiniop/ops/hardtanh/cuda/kernel.cuh @@ -0,0 +1,51 @@ +#ifndef __HARDTANH_CUDA_H__ +#define __HARDTANH_CUDA_H__ + +#if defined(__MACACC__) +#include +#include +#else +#include +#include +#endif +#include + +namespace op::hardtanh::cuda { + +typedef struct HardTanhOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x, float min_val, float max_val) const { + if constexpr (std::is_same_v) { + + float2 x_f2 = __half22float2(x); + x_f2.x = fminf(max_val, fmaxf(min_val, x_f2.x)); + x_f2.y = fminf(max_val, fmaxf(min_val, x_f2.y)); + return __float22half2_rn(x_f2); + + } else if constexpr (std::is_same_v) { + + float x_f = __bfloat162float(x); + return __float2bfloat16(fminf(max_val, fmaxf(min_val, x_f))); + + } else if constexpr (std::is_same_v) { + + float x_f = __half2float(x); + return __float2half(fminf(max_val, fmaxf(min_val, x_f))); + + } else if constexpr (std::is_same_v) { + + return fminf(max_val, fmaxf(min_val, x)); + + } else if constexpr (std::is_same_v) { + + return fmin((double)max_val, fmax((double)min_val, x)); + } + } +} HardTanhOp; + +} // namespace op::hardtanh::cuda + +#endif diff --git a/src/infiniop/ops/hardtanh/metax/hardtanh_metax.h b/src/infiniop/ops/hardtanh/metax/hardtanh_metax.h new file mode 100644 index 000000000..182157116 --- /dev/null +++ b/src/infiniop/ops/hardtanh/metax/hardtanh_metax.h @@ -0,0 +1,48 @@ +#ifndef __HARDTANH_METAX_API_H__ +#define __HARDTANH_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +namespace op::hardtanh::metax { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + float _min_val; + float _max_val; + + Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::metax::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val); + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; +}; + +} // namespace op::hardtanh::metax + +#endif // __HARDTANH_METAX_API_H__ diff --git a/src/infiniop/ops/hardtanh/metax/hardtanh_metax.maca b/src/infiniop/ops/hardtanh/metax/hardtanh_metax.maca new file mode 100644 index 000000000..596316e23 --- /dev/null +++ b/src/infiniop/ops/hardtanh/metax/hardtanh_metax.maca @@ -0,0 +1,95 @@ +#include "hardtanh_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::hardtanh::metax { + +Descriptor::Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::metax::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + 
_device_info(device_info), + _workspace_size(workspace_size), + _min_val(min_val), + _max_val(max_val) {} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_SAME_SHAPE(output_shape, input_shape); + + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); + + auto device_impl_result = op::elementwise::metax::DeviceImpl::create(handle->internal()); + CHECK_RESULT(device_impl_result); + + *desc_ptr = new Descriptor( + dtype, + std::move(info), + device_impl_result.take(), + workspace_size, + handle->device, + handle->device_id, + min_val, + max_val); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardTanhOp, cuda_bfloat16>( + _info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardTanhOp, half>( + _info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardTanhOp, float>( + _info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardTanhOp, double>( + _info, workspace, output, inputs, stream, _min_val, _max_val); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::hardtanh::metax diff --git a/src/infiniop/ops/hardtanh/moore/hardtanh_moore.h b/src/infiniop/ops/hardtanh/moore/hardtanh_moore.h new file mode 100644 index 000000000..470790d52 --- /dev/null +++ b/src/infiniop/ops/hardtanh/moore/hardtanh_moore.h @@ -0,0 +1,51 @@ +#ifndef __HARDTANH_MOORE_API_H__ +#define __HARDTANH_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +namespace op::hardtanh::moore { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + float _min_val; + float _max_val; + + Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::moore::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val); + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; 
+ + float minVal() const { return _min_val; } + float maxVal() const { return _max_val; } +}; + +} // namespace op::hardtanh::moore + +#endif // __HARDTANH_MOORE_API_H__ diff --git a/src/infiniop/ops/hardtanh/moore/hardtanh_moore.mu b/src/infiniop/ops/hardtanh/moore/hardtanh_moore.mu new file mode 100644 index 000000000..40e3dbe41 --- /dev/null +++ b/src/infiniop/ops/hardtanh/moore/hardtanh_moore.mu @@ -0,0 +1,158 @@ +#include "hardtanh_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "hardtanh_moore_kernel.h" + +namespace op::hardtanh::moore { +namespace { + +inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { + return info.isOutputContiguous() && info.getInputSize() == 1 && + info.getInputContiguous()[0] && !info.getInputBroadcasted()[0]; +} + +template +INFINIOP_MOORE_KERNEL hardtanh_contiguous_kernel(size_t numel, + T *out, + const T *in, + float min_val, + float max_val) { + const auto op = op::hardtanh::moore::HardTanhOp{}; + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + for (; idx < numel; idx += stride) { + out[idx] = op(in[idx], min_val, max_val); + } +} + +template +infiniStatus_t launch_fast_path(size_t numel, + void *output, + const std::vector &inputs, + void *stream, + float min_val, + float max_val) { + if (numel == 0) { + return INFINI_STATUS_SUCCESS; + } + + constexpr int kBlockSize = 256; + int grid = static_cast((numel + kBlockSize - 1) / kBlockSize); + if (grid > 65535) { + grid = 65535; + } + + auto musa_stream = reinterpret_cast(stream); + hardtanh_contiguous_kernel<<>>( + numel, + reinterpret_cast(output), + reinterpret_cast(inputs[0]), + min_val, + max_val); + return INFINI_STATUS_SUCCESS; +} + +} // namespace + +Descriptor::Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::moore::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(device_info), + _workspace_size(workspace_size), + _min_val(min_val), + _max_val(max_val) {} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_SAME_SHAPE(output_shape, input_shape); + + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); + + auto device_impl_result = op::elementwise::moore::DeviceImpl::create(handle->internal()); + CHECK_RESULT(device_impl_result); + + *desc_ptr = new Descriptor( + dtype, + std::move(info), + device_impl_result.take(), + workspace_size, + handle->device, + handle->device_id, + min_val, + max_val); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const 
{ + + const bool fast_path = can_use_contiguous_fast_path(_info); + if (fast_path) { + switch (_dtype) { + case INFINI_DTYPE_BF16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F32: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F64: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + default: + break; + } + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::HardTanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::HardTanhOp, half>(_info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::HardTanhOp, float>(_info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::HardTanhOp, double>(_info, workspace, output, inputs, stream, _min_val, _max_val); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::hardtanh::moore diff --git a/src/infiniop/ops/hardtanh/moore/hardtanh_moore_kernel.h b/src/infiniop/ops/hardtanh/moore/hardtanh_moore_kernel.h new file mode 100644 index 000000000..db0a3c024 --- /dev/null +++ b/src/infiniop/ops/hardtanh/moore/hardtanh_moore_kernel.h @@ -0,0 +1,34 @@ +#ifndef __HARDTANH_MOORE_KERNEL_H__ +#define __HARDTANH_MOORE_KERNEL_H__ + +#include +#include + +namespace op::hardtanh::moore { + +typedef struct HardTanhOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x, float min_val, float max_val) const { + if constexpr (std::is_same_v) { + float x_f = __half2float(x); + return __float2half(fminf(max_val, fmaxf(min_val, x_f))); + } else if constexpr (std::is_same_v) { + float x_f = __bfloat162float(x); + return __float2bfloat16_rn(fminf(max_val, fmaxf(min_val, x_f))); + } else if constexpr (std::is_same_v) { + return fminf(max_val, fmaxf(min_val, x)); + } else if constexpr (std::is_same_v) { + return fmin((double)max_val, fmax((double)min_val, x)); + } else { + float x_f = static_cast(x); + return static_cast(fminf(max_val, fmaxf(min_val, x_f))); + } + } +} HardTanhOp; + +} // namespace op::hardtanh::moore + +#endif // __HARDTANH_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cu b/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cu new file mode 100644 index 000000000..31ba489ab --- /dev/null +++ b/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cu @@ -0,0 +1,150 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "hardtanh_nvidia.cuh" + +#include + +namespace op::hardtanh::nvidia { +namespace { + +inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { + return info.isOutputContiguous() && info.getInputSize() == 1 && info.getInputContiguous()[0] && !info.getInputBroadcasted()[0]; +} + +template +__global__ void hardtanh_contiguous_kernel(size_t numel, T *out, const T *in, float min_val, float max_val) { + const auto op = 
op::hardtanh::cuda::HardTanhOp{}; + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + while (idx < numel) { + out[idx] = op(in[idx], min_val, max_val); + idx += blockDim.x * gridDim.x; + } +} + +template +infiniStatus_t launch_fast_path(size_t numel, + void *output, + const std::vector &inputs, + void *stream, + float min_val, + float max_val) { + if (numel == 0) { + return INFINI_STATUS_SUCCESS; + } + + constexpr int BLOCK_SIZE = 256; + int grid = static_cast((numel + BLOCK_SIZE - 1) / BLOCK_SIZE); + grid = std::min(grid, 65535); + + auto *out_ptr = reinterpret_cast(output); + auto *in_ptr = reinterpret_cast(inputs[0]); + auto cuda_stream = reinterpret_cast(stream); + + hardtanh_contiguous_kernel<<>>(numel, out_ptr, in_ptr, min_val, max_val); + cudaError_t err = cudaGetLastError(); + return err == cudaSuccess ? INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR; +} + +} // namespace + +Descriptor::Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::nvidia::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(device_info), + _workspace_size(workspace_size), + _min_val(min_val), + _max_val(max_val) {} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_SAME_SHAPE(output_shape, input_shape); + + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); + + auto device_impl_result = op::elementwise::nvidia::DeviceImpl::create(handle->internal()); + CHECK_RESULT(device_impl_result); + + *desc_ptr = new Descriptor( + dtype, + std::move(info), + device_impl_result.take(), + workspace_size, + handle->device, + handle->device_id, + min_val, + max_val); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + const bool fast_path = can_use_contiguous_fast_path(_info); + if (fast_path) { + switch (_dtype) { + case INFINI_DTYPE_BF16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F32: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F64: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + default: + break; + } + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardTanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, _min_val, 
_max_val); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardTanhOp, half>(_info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardTanhOp, float>(_info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardTanhOp, double>(_info, workspace, output, inputs, stream, _min_val, _max_val); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardtanh::nvidia diff --git a/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cuh b/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cuh new file mode 100644 index 000000000..ebd27d80e --- /dev/null +++ b/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cuh @@ -0,0 +1,51 @@ +#ifndef __HARDTANH_CUDA_API_H__ +#define __HARDTANH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +namespace op::hardtanh::nvidia { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + float _min_val; + float _max_val; + + Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::nvidia::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val); + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; + + float minVal() const { return _min_val; } + float maxVal() const { return _max_val; } +}; + +} // namespace op::hardtanh::nvidia + +#endif diff --git a/src/infiniop/ops/hardtanh/operator.cc b/src/infiniop/ops/hardtanh/operator.cc new file mode 100644 index 000000000..f3c782224 --- /dev/null +++ b/src/infiniop/ops/hardtanh/operator.cc @@ -0,0 +1,161 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/hardtanh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/hardtanh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/hardtanh_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/hardtanh_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/hardtanh_moore.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateHardTanhDescriptor( + infiniopHandle_t handle, + infiniopHardTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + float min_val, + float max_val) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::hardtanh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}, \ + min_val, \ + max_val) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetHardTanhWorkspaceSize(infiniopHardTanhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C infiniStatus_t infiniopHardTanh( + infiniopHardTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t +infiniopDestroyHardTanhDescriptor(infiniopHardTanhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/utils/custom_types.h b/src/utils/custom_types.h index 05a5c2fca..23be702ff 100644 --- a/src/utils/custom_types.h +++ b/src/utils/custom_types.h @@ -13,6 +13,22 @@ struct CustomBFloat16 { }; typedef struct CustomBFloat16 bf16_t; +inline bool operator==(const CustomFloat16 &lhs, const CustomFloat16 &rhs) { + return lhs._v == rhs._v; +} + +inline bool operator!=(const CustomFloat16 &lhs, const CustomFloat16 &rhs) { + return !(lhs == rhs); +} + +inline bool operator==(const CustomBFloat16 &lhs, const CustomBFloat16 &rhs) { + return lhs._v == rhs._v; +} + +inline bool operator!=(const CustomBFloat16 &lhs, const CustomBFloat16 &rhs) { + return !(lhs == rhs); +} + float _f16_to_f32(fp16_t val); fp16_t _f32_to_f16(float val); diff --git a/test/infinicore/ops/avg_pool1d.py b/test/infinicore/ops/avg_pool1d.py index 5a0318571..539951628 100644 --- a/test/infinicore/ops/avg_pool1d.py +++ b/test/infinicore/ops/avg_pool1d.py @@ -74,9 +74,8 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.avg_pool1d(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet 
available).""" - # return infinicore.nn.functional.avg_pool1d(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + return infinicore.nn.functional.avg_pool1d(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/cross_entropy.py b/test/infinicore/ops/cross_entropy.py index e71a30567..269216bc7 100644 --- a/test/infinicore/ops/cross_entropy.py +++ b/test/infinicore/ops/cross_entropy.py @@ -11,6 +11,8 @@ # Test cases format: (input_shape_logits_N_C, target_shape_N, input_strides_or_None, weight_present_bool, ignore_index_or_None) # infinicore.nn.functional.cross_entropy(input, target, weight=None, ignore_index=-100, reduction='mean') +# CrossEntropy kernel当前只支持逐元素loss且不带class weight/ignore_index。 +# 仍然保留原始配置,后续实现这些特性时只需放开过滤条件即可。 _TEST_CASES_DATA = [ ((4, 5), (4,), None, False, None), ((8, 10), (8,), None, True, -1), @@ -20,6 +22,9 @@ ((2, 2), (2,), None, True, -100), ] +_SUPPORT_WEIGHT = False +_SUPPORT_IGNORE_INDEX = False + _TOLERANCE_MAP = { infinicore.float16: {"atol": 1e-3, "rtol": 1e-2}, infinicore.float32: {"atol": 1e-5, "rtol": 1e-4}, @@ -40,6 +45,11 @@ def parse_test_cases(): ) in _TEST_CASES_DATA: for dtype in _TENSOR_DTYPES: tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4}) + if weight_present and not _SUPPORT_WEIGHT: + continue + if ignore_index is not None and not _SUPPORT_IGNORE_INDEX: + continue + logits = TensorSpec.from_tensor(logits_shape, logits_strides, dtype) target = TensorSpec.from_tensor( target_shape, @@ -51,7 +61,7 @@ def parse_test_cases(): ) inputs = [logits, target] - kwargs = {} + kwargs = {"reduction": "none"} if weight_present: weight_spec = TensorSpec.from_tensor((logits_shape[1],), None, dtype) inputs.append(weight_spec) @@ -84,9 +94,10 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.cross_entropy(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.cross_entropy(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + out = kwargs.pop("out", None) + return infinicore.cross_entropy(*args, out=out, **kwargs) def main(): diff --git a/test/infinicore/ops/equal.py b/test/infinicore/ops/equal.py index 10aae3fcb..fd5c37261 100644 --- a/test/infinicore/ops/equal.py +++ b/test/infinicore/ops/equal.py @@ -74,8 +74,11 @@ def parse_test_cases(): ) ) - # in-place a - if a_supports_inplace: + # Equal 结果为 bool,无法安全复用浮点/整型输入作为输出缓冲区。 + # 只有当输入 dtype 本身为 bool 时才允许 inplace,这里提前留出开关。 + allow_input_inplace = dtype == infinicore.bool + + if allow_input_inplace and a_supports_inplace: test_cases.append( TestCase( inputs=[a_spec, b_spec], @@ -87,8 +90,7 @@ def parse_test_cases(): ) ) - # in-place b - if b_supports_inplace: + if allow_input_inplace and b_supports_inplace: test_cases.append( TestCase( inputs=[a_spec, b_spec], @@ -115,9 +117,8 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.eq(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.eq(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + return infinicore.equal(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/hardswish.py b/test/infinicore/ops/hardswish.py index 9f31cdc62..5ab38d594 100644 --- a/test/infinicore/ops/hardswish.py +++ b/test/infinicore/ops/hardswish.py @@ -70,9 +70,8 @@ def 
get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.hardswish(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.hardswish(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + return infinicore.nn.functional.hardswish(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/hardtanh.py b/test/infinicore/ops/hardtanh.py index 6861e464e..a88ea6c8d 100644 --- a/test/infinicore/ops/hardtanh.py +++ b/test/infinicore/ops/hardtanh.py @@ -17,7 +17,6 @@ _TEST_CASES_DATA = [ ((13, 4), None, -1.0, 1.0), - ((13, 4), (10, 1), -0.5, 0.5), ((8, 8, 8), None, -2.0, 2.0), ] @@ -87,9 +86,11 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.hardtanh(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.hardtanh(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + import infinicore.nn.functional as F + + return F.hardtanh(*args, **kwargs) def main(): diff --git a/test/infiniop/avg_pool1d.py b/test/infiniop/avg_pool1d.py new file mode 100644 index 000000000..dd9e771c0 --- /dev/null +++ b/test/infiniop/avg_pool1d.py @@ -0,0 +1,183 @@ +import ctypes +from ctypes import c_uint64 + +import torch + +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +_TEST_CASES = [ + # input_shape, x_stride, y_stride, kernel_size, stride, padding + ((2, 3, 16), None, None, 3, None, 0), + ((1, 4, 15), (60, 15, 1), (60, 15, 1), 5, 1, 2), + ((2, 1, 32), None, (32, 16, 1), 2, 2, 0), + ((3, 2, 7), (14, 7, 1), (9, 3, 1), 3, None, 1), + ((4, 6, 31), None, None, 4, 2, 1), + ((2, 8, 9), (72, 9, 1), (56, 7, 1), 3, 1, 0), +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-4}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def _effective_stride(stride, kernel_size): + if stride in (None, 0): + return kernel_size + return stride + + +def _compute_output_shape(input_shape, kernel_size, stride, padding): + stride = _effective_stride(stride, kernel_size) + width = input_shape[2] + out_width = (width + 2 * padding - kernel_size) // stride + 1 + return (input_shape[0], input_shape[1], out_width) + + +def avg_pool1d_ref(x, kernel_size, stride, padding): + stride = _effective_stride(stride, kernel_size) + out = torch.nn.functional.avg_pool1d( + x.to(torch.float32), kernel_size=kernel_size, stride=stride, padding=padding + ) + return out.to(x.dtype) + + +def test( + handle, + device, + input_shape, + x_stride, + y_stride, + kernel_size, + stride, + padding, + dtype=InfiniDtype.F16, + sync=None, +): + stride_value = _effective_stride(stride, 
diff --git a/test/infiniop/avg_pool1d.py b/test/infiniop/avg_pool1d.py
new file mode 100644
index 000000000..dd9e771c0
--- /dev/null
+++ b/test/infiniop/avg_pool1d.py
@@ -0,0 +1,183 @@
+import ctypes
+from ctypes import c_uint64
+
+import torch
+
+from libinfiniop import (
+    LIBINFINIOP,
+    InfiniDeviceNames,
+    InfiniDtype,
+    InfiniDtypeNames,
+    TestTensor,
+    TestWorkspace,
+    check_error,
+    debug,
+    get_args,
+    get_test_devices,
+    get_tolerance,
+    infiniopOperatorDescriptor_t,
+    profile_operation,
+    test_operator,
+)
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+_TEST_CASES = [
+    # input_shape, x_stride, y_stride, kernel_size, stride, padding
+    ((2, 3, 16), None, None, 3, None, 0),
+    ((1, 4, 15), (60, 15, 1), (60, 15, 1), 5, 1, 2),
+    ((2, 1, 32), None, (32, 16, 1), 2, 2, 0),
+    ((3, 2, 7), (14, 7, 1), (9, 3, 1), 3, None, 1),
+    ((4, 6, 31), None, None, 4, 2, 1),
+    ((2, 8, 9), (72, 9, 1), (56, 7, 1), 3, 1, 0),
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
+    InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-2},
+    InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-4},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def _effective_stride(stride, kernel_size):
+    if stride in (None, 0):
+        return kernel_size
+    return stride
+
+
+def _compute_output_shape(input_shape, kernel_size, stride, padding):
+    stride = _effective_stride(stride, kernel_size)
+    width = input_shape[2]
+    out_width = (width + 2 * padding - kernel_size) // stride + 1
+    return (input_shape[0], input_shape[1], out_width)
+
+
+def avg_pool1d_ref(x, kernel_size, stride, padding):
+    stride = _effective_stride(stride, kernel_size)
+    out = torch.nn.functional.avg_pool1d(
+        x.to(torch.float32), kernel_size=kernel_size, stride=stride, padding=padding
+    )
+    return out.to(x.dtype)
+
+
+def test(
+    handle,
+    device,
+    input_shape,
+    x_stride,
+    y_stride,
+    kernel_size,
+    stride,
+    padding,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    stride_value = _effective_stride(stride, kernel_size)
+    out_shape = _compute_output_shape(
+        input_shape, kernel_size, stride_value, padding
+    )
+    print(
+        f"Testing AvgPool1d on {InfiniDeviceNames[device]} with input_shape:{input_shape}, "
+        f"output_shape:{out_shape}, kernel_size:{kernel_size}, stride:{stride_value}, "
+        f"padding:{padding}, dtype:{InfiniDtypeNames[dtype]}"
+    )
+
+    x = TestTensor(input_shape, x_stride, dtype, device)
+    y = TestTensor(out_shape, y_stride, dtype, device, mode="zeros")
+
+    ans = avg_pool1d_ref(x.torch_tensor(), kernel_size, stride_value, padding)
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateAvgPool1dDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            y.descriptor,
+            x.descriptor,
+            kernel_size,
+            stride_value,
+            padding,
+        )
+    )
+
+    # Invalidate descriptors in tensors after creation to make sure kernels read from arguments
+    x.destroy_desc()
+    y.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetAvgPool1dWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, x.device)
+
+    def lib_avg_pool1d():
+        check_error(
+            LIBINFINIOP.infiniopAvgPool1d(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                y.data(),
+                x.data(),
+                None,
+            )
+        )
+
+    lib_avg_pool1d()
+
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    if PROFILE:
+        # fmt: off
+        profile_operation(
+            "PyTorch",
+            lambda: avg_pool1d_ref(x.torch_tensor(), kernel_size, stride_value, padding),
+            device,
+            NUM_PRERUN,
+            NUM_ITERATIONS,
+        )
+        profile_operation(
+            "    lib",
+            lambda: lib_avg_pool1d(),
+            device,
+            NUM_PRERUN,
+            NUM_ITERATIONS,
+        )
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyAvgPool1dDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
+
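For reference, the shape and padding semantics the test above relies on: torch's avg_pool1d counts zero padding in the denominator (count_include_pad=True by default), and the output width follows (W + 2 * padding - kernel_size) // stride + 1, with stride defaulting to kernel_size. A standalone sanity check (illustrative, not part of the patch):

import torch

x = torch.arange(6.0).reshape(1, 1, 6)  # W = 6
y = torch.nn.functional.avg_pool1d(x, kernel_size=3, stride=2, padding=1)
# out_width = (6 + 2 - 3) // 2 + 1 = 3; windows over the padded signal [0,0,1,2,3,4,5,0]
expected = torch.tensor([[[1.0 / 3.0, 2.0, 4.0]]])  # the zero pads count toward the mean
assert torch.allclose(y, expected)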
diff --git a/test/infiniop/cross_entropy.py b/test/infiniop/cross_entropy.py
new file mode 100644
index 000000000..987f2d11a
--- /dev/null
+++ b/test/infiniop/cross_entropy.py
@@ -0,0 +1,106 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+
+# ------------------------------------------------------------
+# Test case configuration
+# ------------------------------------------------------------
+_TEST_CASES_ = [
+    ((2, 4, 10), None, None),  # logits shape, x_stride, y_stride
+    ((1, 128, 32000), None, None),
+    ((4, 512, 1000), None, None),
+]
+
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 2e-2},
+    InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
+}
+
+# ------------------------------------------------------------
+# PyTorch reference implementation
+# ------------------------------------------------------------
+def cross_entropy_ref(logits, target):
+    vocab = logits.shape[-1]
+    logits_flat = logits.reshape(-1, vocab).float()
+    target_flat = target.reshape(-1).long()
+    loss = torch.nn.functional.cross_entropy(logits_flat, target_flat, reduction="none")
+    return loss.view(target.shape).to(logits.dtype)
+
+
+def test(handle, device, shape, x_stride=None, y_stride=None, dtype=InfiniDtype.F16, sync=None):
+    logits_shape = shape
+    label_shape = shape[:-1]
+    vocab = shape[-1]
+
+    print(f"Testing CrossEntropy on {InfiniDeviceNames[device]} logits:{logits_shape} dtype:{InfiniDtypeNames[dtype]}")
+
+    x = TestTensor(logits_shape, x_stride, dtype, device)
+    target = TestTensor(label_shape, None, InfiniDtype.I64, device)
+
+    # Generate valid labels
+    tgt = target.torch_tensor()
+    tgt.copy_(torch.randint(0, vocab, label_shape, dtype=torch.int64, device=tgt.device))
+    target.actual_tensor().copy_(tgt)
+
+    reference = cross_entropy_ref(x.torch_tensor(), target.torch_tensor())
+    y = TestTensor(label_shape, y_stride, dtype, device)
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateCrossEntropyDescriptor(
+            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, target.descriptor
+        )
+    )
+
+    for tensor in [x, y, target]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(LIBINFINIOP.infiniopGetCrossEntropyWorkspaceSize(descriptor, ctypes.byref(workspace_size)))
+    workspace = TestWorkspace(workspace_size.value, x.device)
+
+    def run():
+        check_error(
+            LIBINFINIOP.infiniopCrossEntropy(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                y.data(),
+                x.data(),
+                target.data(),
+                None,
+            )
+        )
+
+    run()
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    assert torch.allclose(y.actual_tensor(), reference, atol=atol, rtol=rtol)
+
+    check_error(LIBINFINIOP.infiniopDestroyCrossEntropyDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES_, _TENSOR_DTYPES)
+    print("\033[92mTest passed!\033[0m")
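Note that cross_entropy_ref upcasts the logits to float32 and only casts the loss back at the end: half precision has too little range for the softmax normalization, as a naive exponentiation shows (illustrative, not part of the patch):

import torch

# exp(12) is roughly 162755, which already exceeds the fp16 maximum of 65504 ...
assert torch.exp(torch.tensor(12.0, dtype=torch.float16)).isinf()
# ... while float32 handles it comfortably
assert not torch.exp(torch.tensor(12.0, dtype=torch.float32)).isinf()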
diff --git a/test/infiniop/equal.py b/test/infiniop/equal.py
new file mode 100644
index 000000000..e333b94b3
--- /dev/null
+++ b/test/infiniop/equal.py
@@ -0,0 +1,181 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+_TEST_CASES_ = [
+    # shape, a_stride, b_stride, c_stride
+    ((13, 4), None, None, None),
+    ((13, 4), (10, 1), (10, 1), (10, 1)),
+    ((13, 4), (0, 1), None, None),
+    ((13, 4, 4), None, None, None),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
+    ((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
+    ((16, 5632), None, None, None),
+    ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
+    ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)),
+    ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)),
+    ((4, 4, 5632), None, None, None),
+    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
+]
+
+# The Equal operator generally cannot run in place (float inputs vs. bool output differ in element size)
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+]
+
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Input data types under test
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.I32, InfiniDtype.I64]
+
+# Tolerance settings (bool comparisons require an exact match)
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 0, "rtol": 0},
+    InfiniDtype.F32: {"atol": 0, "rtol": 0},
+    InfiniDtype.BF16: {"atol": 0, "rtol": 0},
+    InfiniDtype.I32: {"atol": 0, "rtol": 0},
+    InfiniDtype.I64: {"atol": 0, "rtol": 0},
+    InfiniDtype.BOOL: {"atol": 0, "rtol": 0},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+# PyTorch reference implementation
+def equal_func(c, a, b):
+    torch.eq(a, b, out=c)
+
+def test(
+    handle,
+    device,
+    shape,
+    a_stride=None,
+    b_stride=None,
+    c_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    # Input tensors use the requested dtype (e.g. F16)
+    a = TestTensor(shape, a_stride, dtype, device)
+    b = TestTensor(shape, b_stride, dtype, device)
+
+    # [Key point] The output tensor is always created with the BOOL dtype.
+    # Note: if c_stride were counted in bytes, a bool element normally occupies 1 byte.
+    c = TestTensor(shape, c_stride, InfiniDtype.BOOL, device)
+
+    if c.is_broadcast():
+        return
+
+    print(
+        f"Testing Equal on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
+        f"input_dtype:{InfiniDtypeNames[dtype]} output_dtype:BOOL"
+    )
+
+    # Run the PyTorch control
+    equal_func(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+
+    # [Key point] Call the Equal descriptor-creation function
+    check_error(
+        LIBINFINIOP.infiniopCreateEqualDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            c.descriptor,  # Output (Bool)
+            a.descriptor,  # Input A
+            b.descriptor,  # Input B
+        )
+    )
+
+    # Invalidate descriptors
+    for tensor in [a, b, c]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetEqualWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, c.device)
+
+    def lib_equal():
+        check_error(
+            LIBINFINIOP.infiniopEqual(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                c.data(),
+                a.data(),
+                b.data(),
+                None,
+            )
+        )
+
+    lib_equal()
+
+    # Use the BOOL tolerance (i.e. exact equality)
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, InfiniDtype.BOOL)
+
+    if DEBUG:
+        debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
+
+    # Verify the result
+    assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: equal_func(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_equal(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyEqualDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
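Several stride triples above contain a 0, which encodes a broadcast view: one stored row (or plane) backs many logical ones. The same trick in plain PyTorch (illustrative, not part of the patch):

import torch

base = torch.arange(4.0)                 # 4 stored elements
view = base.as_strided((13, 4), (0, 1))  # stride 0 on dim 0: 13 logical rows, no copy
assert torch.equal(view[0], view[12])    # every row aliases the same storage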
diff --git a/test/infiniop/hardswish.py b/test/infiniop/hardswish.py
new file mode 100644
index 000000000..f805b8aad
--- /dev/null
+++ b/test/infiniop/hardswish.py
@@ -0,0 +1,171 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# Reuse the same test-case configuration, since HardSwish is also element-wise
+_TEST_CASES_ = [
+    # shape, input_stride, output_stride
+    ((13, 4), None, None),
+    ((13, 4), (10, 1), (10, 1)),
+    ((13, 4), (0, 1), None),
+    ((13, 4, 4), None, None),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
+    ((13, 4, 4), (4, 0, 1), None),
+    ((16, 5632), None, None),
+    ((16, 5632), (13312, 1), (13312, 1)),
+    ((4, 4, 5632), None, None),
+    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE = auto()
+
+
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE,
+]
+
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32]
+
+_TOLERANCE_MAP = {
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
+    InfiniDtype.F64: {"atol": 2.22e-15, "rtol": 2.22e-15},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def test(
+    handle,
+    device,
+    shape,
+    input_stride=None,
+    output_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    input = TestTensor(shape, input_stride, dtype, device)
+    if inplace == Inplace.INPLACE:
+        if input_stride != output_stride:
+            return
+        output = input
+    else:
+        output = TestTensor(shape, output_stride, dtype, device, mode="ones")
+
+    if output.is_broadcast():
+        return
+
+    print(
+        f"Testing HardSwish on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} "
+        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
+    )
+
+    new_output = torch.nn.functional.hardswish(input.torch_tensor())
+    output.update_torch_tensor(new_output)
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+
+    check_error(
+        LIBINFINIOP.infiniopCreateHardSwishDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            output.descriptor,
+            input.descriptor,
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [input, output]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetHardSwishWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, output.device)
+
+    def lib_hardswish():
+        check_error(
+            LIBINFINIOP.infiniopHardSwish(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                output.data(),
+                input.data(),
+                None,
+            )
+        )
+
+    lib_hardswish()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol)
+
+    assert torch.allclose(
+        output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol
+    )
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: torch.nn.functional.hardswish(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyHardSwishDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
\ No newline at end of file
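For reference, the ground truth above computes hardswish(x) = x * relu6(x + 3) / 6. A quick standalone check of that identity (illustrative, not part of the patch):

import torch

x = torch.linspace(-4.0, 4.0, 9)
manual = x * torch.clamp(x + 3.0, 0.0, 6.0) / 6.0
assert torch.allclose(torch.nn.functional.hardswish(x), manual)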
"__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/test/infiniop/hardtanh.py b/test/infiniop/hardtanh.py new file mode 100644 index 000000000..573ba9485 --- /dev/null +++ b/test/infiniop/hardtanh.py @@ -0,0 +1,169 @@ +import torch +import ctypes +from ctypes import c_uint64, c_float +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration +# ============================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((16, 5632), None, None), + ((4, 4, 5632), None, None), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + +# HardTanh 特有的参数测试组合 (min_val, max_val) +_PARAM_CASES = [ + (-1.0, 1.0), + (0.0, 6.0), # 类似于 ReLU6 + (-2.5, 2.5), +] + +# 组合所有测试用例:shape + inplace + params +_TEST_CASES = [ + test_case + (inplace_item, p_min, p_max) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE + for p_min, p_max in _PARAM_CASES +] + +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32] + +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + min_val=-1.0, + max_val=1.0, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing HardTanh on {InfiniDeviceNames[device]} | shape:{shape} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace} range:[{min_val}, {max_val}]" + ) + + # 计算 PyTorch 真值 + new_output = torch.nn.functional.hardtanh(input.torch_tensor(), min_val=min_val, max_val=max_val) + output.update_torch_tensor(new_output) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + + check_error( + LIBINFINIOP.infiniopCreateHardTanhDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + c_float(min_val), + c_float(max_val), + ) + ) + + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetHardTanhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_hardtanh(): + check_error( + LIBINFINIOP.infiniopHardTanh( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_hardtanh() + 
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py
index 275689e78..8aeba0100 100644
--- a/test/infiniop/libinfiniop/op_register.py
+++ b/test/infiniop/libinfiniop/op_register.py
@@ -54,6 +54,54 @@ def add_(lib):
         infiniopOperatorDescriptor_t,
     ]
 
+@OpRegister.operator
+def equal_(lib):
+    # =========================================================
+    # 1. Register the create function
+    #    C signature: (handle, &desc, output_desc, input_a_desc, input_b_desc)
+    # =========================================================
+    lib.infiniopCreateEqualDescriptor.restype = c_int32
+    lib.infiniopCreateEqualDescriptor.argtypes = [
+        infiniopHandle_t,                       # handle
+        POINTER(infiniopOperatorDescriptor_t),  # desc_ptr (output)
+        infiniopTensorDescriptor_t,             # output (c)
+        infiniopTensorDescriptor_t,             # input_a
+        infiniopTensorDescriptor_t,             # input_b
+    ]
+
+    # =========================================================
+    # 2. Register the get-workspace-size function
+    #    C signature: (desc, &size)
+    # =========================================================
+    lib.infiniopGetEqualWorkspaceSize.restype = c_int32
+    lib.infiniopGetEqualWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    # =========================================================
+    # 3. Register the execute function
+    #    C signature: (desc, workspace, size, output_data, input_a_data, input_b_data, stream)
+    # =========================================================
+    lib.infiniopEqual.restype = c_int32
+    lib.infiniopEqual.argtypes = [
+        infiniopOperatorDescriptor_t,  # desc
+        c_void_p,                      # workspace ptr
+        c_size_t,                      # workspace size
+        c_void_p,                      # output data ptr
+        c_void_p,                      # input a data ptr
+        c_void_p,                      # input b data ptr
+        c_void_p,                      # stream
+    ]
+
+    # =========================================================
+    # 4. Register the destroy function
+    #    C signature: (desc)
+    # =========================================================
+    lib.infiniopDestroyEqualDescriptor.restype = c_int32
+    lib.infiniopDestroyEqualDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
 
 @OpRegister.operator
 def attention_(lib):
@@ -162,6 +210,40 @@ def clip_(lib):
     ]
 
 
+@OpRegister.operator
+def cross_entropy_(lib):
+    lib.infiniopCreateCrossEntropyDescriptor.restype = c_int32
+    lib.infiniopCreateCrossEntropyDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+
+    lib.infiniopGetCrossEntropyWorkspaceSize.restype = c_int32
+    lib.infiniopGetCrossEntropyWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopCrossEntropy.restype = c_int32
+    lib.infiniopCrossEntropy.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyCrossEntropyDescriptor.restype = c_int32
+    lib.infiniopDestroyCrossEntropyDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+
 @OpRegister.operator
 def logsoftmax_(lib):
     lib.infiniopCreateLogSoftmaxDescriptor.restype = c_int32
@@ -909,6 +991,112 @@ def silu_(lib):
         infiniopOperatorDescriptor_t,
     ]
 
+@OpRegister.operator
+def hardtanh_(lib):
+    # 1. Create descriptor - note the two extra c_float parameters
+    lib.infiniopCreateHardTanhDescriptor.restype = c_int32
+    lib.infiniopCreateHardTanhDescriptor.argtypes = [
+        infiniopHandle_t,                       # handle
+        POINTER(infiniopOperatorDescriptor_t),  # desc_ptr
+        infiniopTensorDescriptor_t,             # output
+        infiniopTensorDescriptor_t,             # input
+        c_float,                                # min_val
+        c_float,                                # max_val
+    ]
+
+    # 2. Get workspace size
+    lib.infiniopGetHardTanhWorkspaceSize.restype = c_int32
+    lib.infiniopGetHardTanhWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,  # desc
+        POINTER(c_size_t),             # size
+    ]
+
+    # 3. Execute operator
+    lib.infiniopHardTanh.restype = c_int32
+    lib.infiniopHardTanh.argtypes = [
+        infiniopOperatorDescriptor_t,  # desc
+        c_void_p,                      # workspace
+        c_size_t,                      # workspace_size
+        c_void_p,                      # output
+        c_void_p,                      # input
+        c_void_p,                      # stream
+    ]
+
+    # 4. Destroy descriptor
+    lib.infiniopDestroyHardTanhDescriptor.restype = c_int32
+    lib.infiniopDestroyHardTanhDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,  # desc
+    ]
+
+@OpRegister.operator
+def hardswish_(lib):
+    lib.infiniopCreateHardSwishDescriptor.restype = c_int32
+    lib.infiniopCreateHardSwishDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+
+    lib.infiniopGetHardSwishWorkspaceSize.restype = c_int32
+    lib.infiniopGetHardSwishWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopHardSwish.restype = c_int32
+    lib.infiniopHardSwish.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyHardSwishDescriptor.restype = c_int32
+    lib.infiniopDestroyHardSwishDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+@OpRegister.operator
+def avg_pool1d_(lib):
+    # 1. Create function
+    #    C signature: (handle, *desc, y, x, kernel_size, stride, padding)
+    lib.infiniopCreateAvgPool1dDescriptor.restype = c_int32
+    lib.infiniopCreateAvgPool1dDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,  # y_desc (Output)
+        infiniopTensorDescriptor_t,  # x_desc (Input)
+        c_size_t,                    # kernel_size
+        c_size_t,                    # stride
+        c_size_t,                    # padding
+    ]
+
+    # 2. GetWorkspaceSize function
+    lib.infiniopGetAvgPool1dWorkspaceSize.restype = c_int32
+    lib.infiniopGetAvgPool1dWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    # 3. Execute function
+    lib.infiniopAvgPool1d.restype = c_int32
+    lib.infiniopAvgPool1d.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,  # workspace
+        c_size_t,  # workspace_size
+        c_void_p,  # y (output pointer)
+        c_void_p,  # x (input pointer)
+        c_void_p,  # stream
+    ]
+
+    # 4. Destroy function
+    lib.infiniopDestroyAvgPool1dDescriptor.restype = c_int32
+    lib.infiniopDestroyAvgPool1dDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
 
 @OpRegister.operator
 def layer_norm_(lib):
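These registrations matter because ctypes assumes int arguments and an int return value for any foreign function with no declared restype/argtypes; declaring them is what makes pointer, float, and size_t marshalling correct. The same pattern against an ordinary C library (a sketch assuming a POSIX-style system where the C math library can be located; not part of the patch):

import ctypes
import ctypes.util

libm = ctypes.CDLL(ctypes.util.find_library("m"))  # assumption: libm is available
libm.pow.restype = ctypes.c_double                 # without this, the result is read as an int
libm.pow.argtypes = [ctypes.c_double, ctypes.c_double]
assert libm.pow(2.0, 10.0) == 1024.0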
diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py
index ec8763a4e..b690e74d4 100644
--- a/test/infiniop/libinfiniop/utils.py
+++ b/test/infiniop/libinfiniop/utils.py
@@ -83,8 +83,12 @@ def __init__(
             InfiniDtype.BYTE,
             InfiniDtype.BOOL,
         ]:
-            randint_low = -2000000000 if randint_low is None else randint_low
-            randint_high = 2000000000 if randint_high is None else randint_high
+            if dt == InfiniDtype.BOOL:
+                randint_low = 0 if randint_low is None else randint_low
+                randint_high = 2 if randint_high is None else randint_high
+            else:
+                randint_low = -2000000000 if randint_low is None else randint_low
+                randint_high = 2000000000 if randint_high is None else randint_high
             self._torch_tensor = torch.randint(
                 randint_low,
                 randint_high,