From f7c5939980170e30dc0332cd62dfaa4802d68f1f Mon Sep 17 00:00:00 2001 From: PanZezhong Date: Fri, 6 Mar 2026 07:30:28 +0000 Subject: [PATCH 1/6] issue/1031 T1-1-9 --- include/infinicore/ops.hpp | 5 + .../infinicore/ops/adaptive_max_pool1d.hpp | 16 ++ include/infinicore/ops/asinh.hpp | 16 ++ include/infinicore/ops/baddbmm.hpp | 15 ++ include/infinicore/ops/bilinear.hpp | 12 ++ include/infinicore/ops/fmod.hpp | 16 ++ include/infiniop.h | 3 + include/infiniop/ops/adaptive_max_pool1d.h | 22 +++ include/infiniop/ops/asinh.h | 24 +++ include/infiniop/ops/fmod.h | 26 +++ python/infinicore/__init__.py | 8 + python/infinicore/nn/functional/__init__.py | 2 + .../nn/functional/adaptive_max_pool1d.py | 39 +++++ python/infinicore/ops/asinh.py | 11 ++ python/infinicore/ops/baddbmm.py | 25 +++ python/infinicore/ops/bilinear.py | 23 +++ python/infinicore/ops/fmod.py | 11 ++ .../adaptive_max_pool1d.cc | 30 ++++ .../adaptive_max_pool1d_infiniop.cc | 52 ++++++ src/infinicore/ops/asinh/asinh.cc | 27 ++++ src/infinicore/ops/asinh/asinh_infiniop.cc | 52 ++++++ src/infinicore/ops/baddbmm/baddbmm.cc | 100 ++++++++++++ src/infinicore/ops/bilinear/bilinear.cc | 119 ++++++++++++++ src/infinicore/ops/fmod/fmod.cc | 28 ++++ src/infinicore/ops/fmod/fmod_infiniop.cc | 52 ++++++ src/infinicore/pybind11/ops.hpp | 11 ++ .../pybind11/ops/adaptive_max_pool1d.hpp | 39 +++++ src/infinicore/pybind11/ops/asinh.hpp | 24 +++ src/infinicore/pybind11/ops/baddbmm.hpp | 56 +++++++ src/infinicore/pybind11/ops/bilinear.hpp | 61 +++++++ src/infinicore/pybind11/ops/fmod.hpp | 26 +++ .../adaptive_max_pool1d/adaptive_max_pool1d.h | 47 ++++++ .../cpu/adaptive_max_pool1d_cpu.cc | 98 +++++++++++ .../cpu/adaptive_max_pool1d_cpu.h | 8 + .../ops/adaptive_max_pool1d/cuda/kernel.cuh | 54 +++++++ src/infiniop/ops/adaptive_max_pool1d/info.h | 65 ++++++++ .../metax/adaptive_max_pool1d_metax.cuh | 8 + .../metax/adaptive_max_pool1d_metax.maca | 130 +++++++++++++++ .../moore/adaptive_max_pool1d_moore.h | 8 + .../moore/adaptive_max_pool1d_moore.mu | 144 +++++++++++++++++ .../nvidia/adaptive_max_pool1d_nvidia.cu | 144 +++++++++++++++++ .../nvidia/adaptive_max_pool1d_nvidia.cuh | 8 + .../ops/adaptive_max_pool1d/operator.cc | 147 +++++++++++++++++ src/infiniop/ops/asinh/cpu/asinh_cpu.cc | 50 ++++++ src/infiniop/ops/asinh/cpu/asinh_cpu.h | 22 +++ src/infiniop/ops/asinh/cuda/kernel.cuh | 29 ++++ src/infiniop/ops/asinh/metax/asinh.maca | 58 +++++++ src/infiniop/ops/asinh/metax/asinh_metax.h | 8 + src/infiniop/ops/asinh/moore/asinh_moore.h | 8 + src/infiniop/ops/asinh/moore/asinh_moore.mu | 59 +++++++ src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu | 56 +++++++ .../ops/asinh/nvidia/asinh_nvidia.cuh | 8 + src/infiniop/ops/asinh/operator.cc | 141 ++++++++++++++++ src/infiniop/ops/fmod/cpu/fmod_cpu.cc | 53 ++++++ src/infiniop/ops/fmod/cpu/fmod_cpu.h | 19 +++ src/infiniop/ops/fmod/cuda/kernel.cuh | 48 ++++++ src/infiniop/ops/fmod/metax/fmod_metax.h | 8 + src/infiniop/ops/fmod/metax/mul_metax.maca | 61 +++++++ src/infiniop/ops/fmod/moore/fmod_moore.h | 8 + src/infiniop/ops/fmod/moore/fmod_moore.mu | 63 ++++++++ src/infiniop/ops/fmod/nvidia/fmod_nvidia.cu | 59 +++++++ src/infiniop/ops/fmod/nvidia/fmod_nvidia.cuh | 8 + src/infiniop/ops/fmod/operator.cc | 152 ++++++++++++++++++ src/infiniop/ops/gemm/cpu/gemm_cpu.cc | 6 +- src/infiniop/ops/gemm/nvidia/gemm_nvidia.cu | 11 +- test/infinicore/ops/adaptive_max_pool1d.py | 6 +- test/infinicore/ops/asinh.py | 6 +- test/infinicore/ops/baddbmm.py | 6 +- test/infinicore/ops/bilinear.py | 15 +- test/infinicore/ops/fmod.py | 6 +- 70 files changed, 2768 insertions(+), 18 deletions(-) create mode 100644 include/infinicore/ops/adaptive_max_pool1d.hpp create mode 100644 include/infinicore/ops/asinh.hpp create mode 100644 include/infinicore/ops/baddbmm.hpp create mode 100644 include/infinicore/ops/bilinear.hpp create mode 100644 include/infinicore/ops/fmod.hpp create mode 100644 include/infiniop/ops/adaptive_max_pool1d.h create mode 100644 include/infiniop/ops/asinh.h create mode 100644 include/infiniop/ops/fmod.h create mode 100644 python/infinicore/nn/functional/adaptive_max_pool1d.py create mode 100644 python/infinicore/ops/asinh.py create mode 100644 python/infinicore/ops/baddbmm.py create mode 100644 python/infinicore/ops/bilinear.py create mode 100644 python/infinicore/ops/fmod.py create mode 100644 src/infinicore/ops/adaptive_max_pool1d/adaptive_max_pool1d.cc create mode 100644 src/infinicore/ops/adaptive_max_pool1d/adaptive_max_pool1d_infiniop.cc create mode 100644 src/infinicore/ops/asinh/asinh.cc create mode 100644 src/infinicore/ops/asinh/asinh_infiniop.cc create mode 100644 src/infinicore/ops/baddbmm/baddbmm.cc create mode 100644 src/infinicore/ops/bilinear/bilinear.cc create mode 100644 src/infinicore/ops/fmod/fmod.cc create mode 100644 src/infinicore/ops/fmod/fmod_infiniop.cc create mode 100644 src/infinicore/pybind11/ops/adaptive_max_pool1d.hpp create mode 100644 src/infinicore/pybind11/ops/asinh.hpp create mode 100644 src/infinicore/pybind11/ops/baddbmm.hpp create mode 100644 src/infinicore/pybind11/ops/bilinear.hpp create mode 100644 src/infinicore/pybind11/ops/fmod.hpp create mode 100644 src/infiniop/ops/adaptive_max_pool1d/adaptive_max_pool1d.h create mode 100644 src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.cc create mode 100644 src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.h create mode 100644 src/infiniop/ops/adaptive_max_pool1d/cuda/kernel.cuh create mode 100644 src/infiniop/ops/adaptive_max_pool1d/info.h create mode 100644 src/infiniop/ops/adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.cuh create mode 100644 src/infiniop/ops/adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.maca create mode 100644 src/infiniop/ops/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.h create mode 100644 src/infiniop/ops/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.mu create mode 100644 src/infiniop/ops/adaptive_max_pool1d/nvidia/adaptive_max_pool1d_nvidia.cu create mode 100644 src/infiniop/ops/adaptive_max_pool1d/nvidia/adaptive_max_pool1d_nvidia.cuh create mode 100644 src/infiniop/ops/adaptive_max_pool1d/operator.cc create mode 100644 src/infiniop/ops/asinh/cpu/asinh_cpu.cc create mode 100644 src/infiniop/ops/asinh/cpu/asinh_cpu.h create mode 100644 src/infiniop/ops/asinh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/asinh/metax/asinh.maca create mode 100644 src/infiniop/ops/asinh/metax/asinh_metax.h create mode 100644 src/infiniop/ops/asinh/moore/asinh_moore.h create mode 100644 src/infiniop/ops/asinh/moore/asinh_moore.mu create mode 100644 src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu create mode 100644 src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh create mode 100644 src/infiniop/ops/asinh/operator.cc create mode 100644 src/infiniop/ops/fmod/cpu/fmod_cpu.cc create mode 100644 src/infiniop/ops/fmod/cpu/fmod_cpu.h create mode 100644 src/infiniop/ops/fmod/cuda/kernel.cuh create mode 100644 src/infiniop/ops/fmod/metax/fmod_metax.h create mode 100644 src/infiniop/ops/fmod/metax/mul_metax.maca create mode 100644 src/infiniop/ops/fmod/moore/fmod_moore.h create mode 100644 src/infiniop/ops/fmod/moore/fmod_moore.mu create mode 100644 src/infiniop/ops/fmod/nvidia/fmod_nvidia.cu create mode 100644 src/infiniop/ops/fmod/nvidia/fmod_nvidia.cuh create mode 100644 src/infiniop/ops/fmod/operator.cc diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp index 53b3a2f10..5274dde48 100644 --- a/include/infinicore/ops.hpp +++ b/include/infinicore/ops.hpp @@ -1,11 +1,16 @@ #pragma once +#include "ops/adaptive_max_pool1d.hpp" #include "ops/add.hpp" #include "ops/add_rms_norm.hpp" +#include "ops/asinh.hpp" #include "ops/attention.hpp" +#include "ops/baddbmm.hpp" +#include "ops/bilinear.hpp" #include "ops/causal_softmax.hpp" #include "ops/embedding.hpp" #include "ops/flash_attention.hpp" +#include "ops/fmod.hpp" #include "ops/kv_caching.hpp" #include "ops/matmul.hpp" #include "ops/ones.hpp" diff --git a/include/infinicore/ops/adaptive_max_pool1d.hpp b/include/infinicore/ops/adaptive_max_pool1d.hpp new file mode 100644 index 000000000..05e49b490 --- /dev/null +++ b/include/infinicore/ops/adaptive_max_pool1d.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class AdaptiveMaxPool1d { +public: + using schema = void (*)(Tensor, Tensor, size_t); + static void execute(Tensor y, Tensor x, size_t output_size); + static common::OpDispatcher &dispatcher(); +}; + +Tensor adaptive_max_pool1d(Tensor x, size_t output_size); +void adaptive_max_pool1d_(Tensor y, Tensor x, size_t output_size); +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/asinh.hpp b/include/infinicore/ops/asinh.hpp new file mode 100644 index 000000000..505eb97d9 --- /dev/null +++ b/include/infinicore/ops/asinh.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Asinh { +public: + using schema = void (*)(Tensor, Tensor); + static void execute(Tensor y, Tensor x); + static common::OpDispatcher &dispatcher(); +}; + +Tensor asinh(Tensor x); +void asinh_(Tensor y, Tensor x); +} // namespace infinicore::op diff --git a/include/infinicore/ops/baddbmm.hpp b/include/infinicore/ops/baddbmm.hpp new file mode 100644 index 000000000..3c08b98d9 --- /dev/null +++ b/include/infinicore/ops/baddbmm.hpp @@ -0,0 +1,15 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" +#include + +namespace infinicore::op { + +Tensor baddbmm(Tensor input, Tensor batch1, Tensor batch2, + float beta = 1.0f, + float alpha = 1.0f); +void baddbmm_(Tensor out, Tensor input, Tensor batch1, Tensor batch2, + float beta = 1.0f, + float alpha = 1.0f); +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/bilinear.hpp b/include/infinicore/ops/bilinear.hpp new file mode 100644 index 000000000..3f5f44aac --- /dev/null +++ b/include/infinicore/ops/bilinear.hpp @@ -0,0 +1,12 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" +#include + +namespace infinicore::op { + +Tensor bilinear(Tensor x1, Tensor x2, Tensor weight, std::optional bias); +void bilinear_(Tensor out, Tensor x1, Tensor x2, Tensor weight, std::optional bias); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/fmod.hpp b/include/infinicore/ops/fmod.hpp new file mode 100644 index 000000000..87b90d515 --- /dev/null +++ b/include/infinicore/ops/fmod.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Fmod { +public: + using schema = void (*)(Tensor, Tensor, Tensor); + static void execute(Tensor c, Tensor a, Tensor b); + static common::OpDispatcher &dispatcher(); +}; + +Tensor fmod(Tensor a, Tensor b); +void fmod_(Tensor c, Tensor a, Tensor b); +} // namespace infinicore::op diff --git a/include/infiniop.h b/include/infiniop.h index 11d42c1d1..4217183f7 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -2,8 +2,10 @@ #define __INFINIOP_API_H__ #include "infiniop/handle.h" +#include "infiniop/ops/adaptive_max_pool1d.h" #include "infiniop/ops/add.h" #include "infiniop/ops/add_rms_norm.h" +#include "infiniop/ops/asinh.h" #include "infiniop/ops/attention.h" #include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/clip.h" @@ -11,6 +13,7 @@ #include "infiniop/ops/dequantize_awq.h" #include "infiniop/ops/embedding.h" #include "infiniop/ops/flash_attention.h" +#include "infiniop/ops/fmod.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" #include "infiniop/ops/int8_gemm.h" diff --git a/include/infiniop/ops/adaptive_max_pool1d.h b/include/infiniop/ops/adaptive_max_pool1d.h new file mode 100644 index 000000000..484413e21 --- /dev/null +++ b/include/infiniop/ops/adaptive_max_pool1d.h @@ -0,0 +1,22 @@ +#ifndef __INFINIOP_ADAPTIVE_MAX_POOL1D_H__ +#define __INFINIOP_ADAPTIVE_MAX_POOL1D_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAdaptiveMaxPool1dDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAdaptiveMaxPool1dDescriptor( + infiniopHandle_t handle, + infiniopAdaptiveMaxPool1dDescriptor_t *desc, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t output_size); + +__C __export infiniStatus_t infiniopGetAdaptiveMaxPool1dWorkspaceSize(infiniopAdaptiveMaxPool1dDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAdaptiveMaxPool1d(infiniopAdaptiveMaxPool1dDescriptor_t desc, void *workspace, size_t workspace_size, + void *y, const void *x, void *stream); + +__C __export infiniStatus_t infiniopDestroyAdaptiveMaxPool1dDescriptor(infiniopAdaptiveMaxPool1dDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/asinh.h b/include/infiniop/ops/asinh.h new file mode 100644 index 000000000..98cfa3a0e --- /dev/null +++ b/include/infiniop/ops/asinh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ASINH_API_H__ +#define __INFINIOP_ASINH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAsinhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAsinhDescriptor(infiniopHandle_t handle, + infiniopAsinhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAsinh(infiniopAsinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/fmod.h b/include/infiniop/ops/fmod.h new file mode 100644 index 000000000..f2a64ecf9 --- /dev/null +++ b/include/infiniop/ops/fmod.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_FMOD_API_H__ +#define __INFINIOP_FMOD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopFmodDescriptor_t; + +__C __export infiniStatus_t infiniopCreateFmodDescriptor(infiniopHandle_t handle, + infiniopFmodDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetFmodWorkspaceSize(infiniopFmodDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopFmod(infiniopFmodDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyFmodDescriptor(infiniopFmodDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index 2f0ef56ea..5765db719 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -51,6 +51,7 @@ from infinicore.ops.add_rms_norm import add_rms_norm from infinicore.ops.attention import attention from infinicore.ops.kv_caching import kv_caching +from infinicore.ops.asinh import asinh from infinicore.ops.matmul import matmul from infinicore.ops.mha_varlen import mha_varlen from infinicore.ops.mul import mul @@ -61,6 +62,9 @@ from infinicore.ops.rearrange import rearrange from infinicore.ops.squeeze import squeeze from infinicore.ops.unsqueeze import unsqueeze +from infinicore.ops.baddbmm import baddbmm +from infinicore.ops.bilinear import bilinear +from infinicore.ops.fmod import fmod from infinicore.tensor import ( Tensor, empty, @@ -123,6 +127,10 @@ "add_rms_norm_", "attention", "kv_caching", + "asinh", + "baddbmm", + "bilinear", + "fmod", "matmul", "mul", "narrow", diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index 934930d56..f81b61262 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -1,3 +1,4 @@ +from .adaptive_max_pool1d import adaptive_max_pool1d from .causal_softmax import causal_softmax from .embedding import embedding from .flash_attention import flash_attention @@ -11,6 +12,7 @@ from .swiglu import swiglu __all__ = [ + "adaptive_max_pool1d", "causal_softmax", "embedding", "flash_attention", diff --git a/python/infinicore/nn/functional/adaptive_max_pool1d.py b/python/infinicore/nn/functional/adaptive_max_pool1d.py new file mode 100644 index 000000000..74a8c56e9 --- /dev/null +++ b/python/infinicore/nn/functional/adaptive_max_pool1d.py @@ -0,0 +1,39 @@ +from typing import List + +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def adaptive_max_pool1d( + input: Tensor, + output_size: int, + *, + out=None, +) -> Tensor: + r"""Applies a 1D adaptive max pooling over an input signal composed of + several input planes. + + The output size is H_out. The algorithm used is fairly simple: + + .. math:: + \text{start} = \left\lfloor \frac{i \cdot L_{in}}{L_{out}} \right\rfloor + + \text{end} = \left\lceil \frac{(i + 1) \cdot L_{in}}{L_{out}} \right\rceil + + where :math:`L_{in}` is the size of the input dimension, and :math:`L_{out}` is the size of the output dimension. + + Args: + input (Tensor): Input tensor of shape (N, C, L_in) + output_size (int): The target output size (L_out) + out (Tensor, optional): Output tensor. + + Returns: + Tensor: The result of the adaptive max pooling operation. + """ + + if out is None: + return Tensor(_infinicore.adaptive_max_pool1d(input._underlying, output_size)) + + _infinicore.adaptive_max_pool1d_(out._underlying, input._underlying, output_size) + + return out diff --git a/python/infinicore/ops/asinh.py b/python/infinicore/ops/asinh.py new file mode 100644 index 000000000..05ec58779 --- /dev/null +++ b/python/infinicore/ops/asinh.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def asinh(input, *, out=None): + if out is None: + return Tensor(_infinicore.asinh(input._underlying)) + + _infinicore.asinh_(out._underlying, input._underlying) + + return out diff --git a/python/infinicore/ops/baddbmm.py b/python/infinicore/ops/baddbmm.py new file mode 100644 index 000000000..4a34cbb64 --- /dev/null +++ b/python/infinicore/ops/baddbmm.py @@ -0,0 +1,25 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def baddbmm(input, batch1, batch2, *, beta=1.0, alpha=1.0, out=None): + if out is None: + return Tensor( + _infinicore.baddbmm( + input._underlying, + batch1._underlying, + batch2._underlying, + float(beta), + float(alpha), + ) + ) + _infinicore.baddbmm_( + out._underlying, + input._underlying, + batch1._underlying, + batch2._underlying, + float(beta), + float(alpha), + ) + + return out diff --git a/python/infinicore/ops/bilinear.py b/python/infinicore/ops/bilinear.py new file mode 100644 index 000000000..4773dd825 --- /dev/null +++ b/python/infinicore/ops/bilinear.py @@ -0,0 +1,23 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def bilinear(input1, input2, weight, bias=None, *, out=None): + if out is None: + return Tensor( + _infinicore.bilinear( + input1._underlying, + input2._underlying, + weight._underlying, + bias._underlying if bias is not None else None, + ) + ) + _infinicore.bilinear_( + out._underlying, + input1._underlying, + input2._underlying, + weight._underlying, + bias._underlying if bias is not None else None, + ) + + return out diff --git a/python/infinicore/ops/fmod.py b/python/infinicore/ops/fmod.py new file mode 100644 index 000000000..e52be82cb --- /dev/null +++ b/python/infinicore/ops/fmod.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def fmod(input, other, *, out=None): + if out is None: + return Tensor(_infinicore.fmod(input._underlying, other._underlying)) + + _infinicore.fmod_(out._underlying, input._underlying, other._underlying) + + return out diff --git a/src/infinicore/ops/adaptive_max_pool1d/adaptive_max_pool1d.cc b/src/infinicore/ops/adaptive_max_pool1d/adaptive_max_pool1d.cc new file mode 100644 index 000000000..bd80b0771 --- /dev/null +++ b/src/infinicore/ops/adaptive_max_pool1d/adaptive_max_pool1d.cc @@ -0,0 +1,30 @@ +#include "infinicore/ops/adaptive_max_pool1d.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &AdaptiveMaxPool1d::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void AdaptiveMaxPool1d::execute(Tensor y, Tensor x, size_t output_size) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x); + infinicore::context::setDevice(y->device()); + dispatcher().lookup(y->device().getType())(y, x, output_size); +} + +Tensor adaptive_max_pool1d(Tensor x, size_t output_size) { + infinicore::Shape y_shape = x->shape(); + y_shape.back() = output_size; + auto y = Tensor::empty(y_shape, x->dtype(), x->device()); + adaptive_max_pool1d_(y, x, output_size); + return y; +} + +void adaptive_max_pool1d_(Tensor y, Tensor x, size_t output_size) { + AdaptiveMaxPool1d::execute(y, x, output_size); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/adaptive_max_pool1d/adaptive_max_pool1d_infiniop.cc b/src/infinicore/ops/adaptive_max_pool1d/adaptive_max_pool1d_infiniop.cc new file mode 100644 index 000000000..451489e15 --- /dev/null +++ b/src/infinicore/ops/adaptive_max_pool1d/adaptive_max_pool1d_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/adaptive_max_pool1d.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::adaptive_max_pool1d_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopAdaptiveMaxPool1dDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyAdaptiveMaxPool1dDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor y, Tensor x, size_t out) { + size_t seed = hash_combine(y, x, out); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopAdaptiveMaxPool1dDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateAdaptiveMaxPool1dDescriptor( + context::getInfiniopHandle(y->device()), &desc, + y->desc(), x->desc(), out)); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetAdaptiveMaxPool1dWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopAdaptiveMaxPool1d( + desc, workspace->data(), workspace_size, + y->data(), x->data(), context::getStream())); +} + +static bool registered = []() { + AdaptiveMaxPool1d::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::adaptive_max_pool1d_impl::infiniop diff --git a/src/infinicore/ops/asinh/asinh.cc b/src/infinicore/ops/asinh/asinh.cc new file mode 100644 index 000000000..fbf131d99 --- /dev/null +++ b/src/infinicore/ops/asinh/asinh.cc @@ -0,0 +1,27 @@ +#include "infinicore/ops/asinh.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Asinh::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Asinh::execute(Tensor y, Tensor x) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x); + infinicore::context::setDevice(y->device()); + dispatcher().lookup(y->device().getType())(y, x); +} + +Tensor asinh(Tensor x) { + auto y = Tensor::empty(x->shape(), x->dtype(), x->device()); + asinh_(y, x); + return y; +} + +void asinh_(Tensor y, Tensor x) { + Asinh::execute(y, x); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/asinh/asinh_infiniop.cc b/src/infinicore/ops/asinh/asinh_infiniop.cc new file mode 100644 index 000000000..ceed8d5a2 --- /dev/null +++ b/src/infinicore/ops/asinh/asinh_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/asinh.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::asinh_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopAsinhDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyAsinhDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor y, Tensor x) { + size_t seed = hash_combine(y, x); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopAsinhDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateAsinhDescriptor( + context::getInfiniopHandle(y->device()), &desc, + y->desc(), x->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetAsinhWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopAsinh( + desc, workspace->data(), workspace_size, + y->data(), x->data(), context::getStream())); +} + +static bool registered = []() { + Asinh::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::asinh_impl::infiniop diff --git a/src/infinicore/ops/baddbmm/baddbmm.cc b/src/infinicore/ops/baddbmm/baddbmm.cc new file mode 100644 index 000000000..3a8ee1518 --- /dev/null +++ b/src/infinicore/ops/baddbmm/baddbmm.cc @@ -0,0 +1,100 @@ +#include "infinicore/ops/baddbmm.hpp" +#include "infinicore/ops/gemm.hpp" +#include "infinicore/ops/rearrange.hpp" + +namespace infinicore::op { + +// 内联的 BLAS 兼容性检查,减少函数调用开销 +inline bool is_blas_compatible(const Tensor &t) { + const auto ndim = t->ndim(); + if (ndim == 2) { + const auto rs = t->stride(0); + const auto cs = t->stride(1); + if (rs != 1 && cs != 1) { + return false; + } + if (rs == 1 && cs == 1) { + return t->shape()[0] == 1 || t->shape()[1] == 1; + } + return true; + } else if (ndim == 3) { + const auto rs = t->stride(1); + const auto cs = t->stride(2); + if (t->shape()[0] > 1 && t->stride(0) == 0) { + return false; + } + if (rs != 1 && cs != 1) { + return false; + } + if (rs == 1 && cs == 1) { + return t->shape()[1] == 1 || t->shape()[2] == 1; + } + return true; + } + return false; +} + +inline void prepare_gemm_input(Tensor &output, Tensor &input, const size_t batch_size, const size_t m, const size_t n) { + const auto input_ndim = input->ndim(); + if (input_ndim == 2) { + rearrange_(output, input->as_strided( + {batch_size, m, n}, + {0, input->stride(0), input->stride(1)})); + } else if (input_ndim == 3 && input->shape()[0] == 1 && batch_size > 1) { + rearrange_(output, input->as_strided( + {batch_size, m, n}, + {0, input->stride(1), input->stride(2)})); + } else { + rearrange_(output, input); + } +} + +Tensor baddbmm(Tensor input, Tensor batch1, Tensor batch2, + float beta, + float alpha) { + const size_t batch_size = batch1->shape()[0]; + const size_t m = batch1->shape()[1]; + const size_t n = batch2->shape()[2]; + + const Tensor &a = is_blas_compatible(batch1) ? batch1 : rearrange(batch1); + const Tensor &b = is_blas_compatible(batch2) ? batch2 : rearrange(batch2); + + if (beta == 0.0f) { + return gemm(a, b, alpha, 0.0f); + } + + Tensor result = Tensor::empty({batch_size, m, n}, a->dtype(), a->device()); + + prepare_gemm_input(result, input, batch_size, m, n); + + gemm_(result, a, b, alpha, beta); + return result; +} + +void baddbmm_(Tensor out, Tensor input, Tensor batch1, Tensor batch2, + float beta, + float alpha) { + const size_t batch_size = batch1->shape()[0]; + const size_t m = batch1->shape()[1]; + const size_t n = batch2->shape()[2]; + + const Tensor &a = is_blas_compatible(batch1) ? batch1 : rearrange(batch1); + const Tensor &b = is_blas_compatible(batch2) ? batch2 : rearrange(batch2); + + const bool out_is_usable = out->is_contiguous() && out->ndim() == 3 && out->shape()[0] == batch_size && out->shape()[1] == m && out->shape()[2] == n; + + if (out_is_usable) { + if (beta != 0.0f && input->data() != out->data()) { + prepare_gemm_input(out, input, batch_size, m, n); + } + gemm_(out, a, b, alpha, beta); + } else { + Tensor result = Tensor::empty({batch_size, m, n}, a->dtype(), a->device()); + if (beta != 0.0f) { + prepare_gemm_input(result, input, batch_size, m, n); + } + gemm_(result, a, b, alpha, beta); + rearrange_(out, result); + } +} +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/bilinear/bilinear.cc b/src/infinicore/ops/bilinear/bilinear.cc new file mode 100644 index 000000000..ab88a28f9 --- /dev/null +++ b/src/infinicore/ops/bilinear/bilinear.cc @@ -0,0 +1,119 @@ +#include "infinicore/ops/bilinear.hpp" +#include "infinicore/ops/add.hpp" +#include "infinicore/ops/matmul.hpp" +#include "infinicore/ops/rearrange.hpp" + +#ifdef ENABLE_NVIDIA_API +namespace op::gemm::nvidia { +void set_tf32_enabled(bool); +} +#endif + +namespace infinicore::op { + +namespace { +// RAII 守卫:作用域内禁用 TF32 +struct ScopedTF32Disable { + ScopedTF32Disable() { +#ifdef ENABLE_NVIDIA_API + // 实际项目中建议添加检查,仅在 NVIDIA 设备上调用 + // 使用 ::op 强制从全局命名空间查找,避免被当前的 infinicore::op 遮蔽 + ::op::gemm::nvidia::set_tf32_enabled(false); +#endif + } + ~ScopedTF32Disable() { +#ifdef ENABLE_NVIDIA_API + ::op::gemm::nvidia::set_tf32_enabled(true); +#endif + } +}; + +inline bool is_gemm_compatible_3d(const Tensor &t) { + if (t->ndim() != 3) { + return false; + } + + const auto batch = t->shape()[0]; + const auto rows = t->shape()[1]; + const auto cols = t->shape()[2]; + const auto bs = t->stride(0); + const auto rs = t->stride(1); + const auto cs = t->stride(2); + + if (rs != 1 && cs != 1) { + return false; + } + + if (cs == 1) { + if (rs < static_cast(cols)) { + return false; + } + } else { + if (cs < static_cast(rows)) { + return false; + } + } + + if (batch > 1 && bs == 0) { + return false; + } + + return true; +} + +inline Tensor ensure_gemm_compatible(const Tensor &t) { + if (t->ndim() == 2) { + return t->is_contiguous() ? t : rearrange(t); + } else if (t->ndim() == 3) { + return is_gemm_compatible_3d(t) ? t : rearrange(t); + } + return t->is_contiguous() ? t : rearrange(t); +} + +} // anonymous namespace + +Tensor bilinear(Tensor x1, Tensor x2, Tensor weight, std::optional bias) { + ScopedTF32Disable tf32_guard; + + const size_t batch_size = x1->shape()[0]; + const size_t in1_features = x1->shape()[1]; + const size_t in2_features = x2->shape()[1]; + const size_t out_features = weight->shape()[0]; + + Tensor x1_compat = ensure_gemm_compatible(x1); + Tensor x2_compat = ensure_gemm_compatible(x2); + Tensor weight_cont = weight->is_contiguous() ? weight : weight->contiguous(); + + Tensor weight_permuted = weight_cont->permute({1, 0, 2}); + Tensor weight_permuted_cont = weight_permuted->is_contiguous() + ? weight_permuted + : weight_permuted->contiguous(); + Tensor weight_matrix = weight_permuted_cont->view({in1_features, out_features * in2_features}); + + Tensor intermediate = matmul(x1_compat, weight_matrix, 1.0f); + + Tensor intermediate_3d = intermediate->view({batch_size, out_features, in2_features}); + Tensor intermediate_transposed = intermediate_3d->permute({0, 2, 1}); + Tensor intermediate_compat = ensure_gemm_compatible(intermediate_transposed); + + Tensor x2_row = x2_compat->view({batch_size, 1, in2_features}); + Tensor x2_row_compat = ensure_gemm_compatible(x2_row); + + Tensor out_3d = matmul(x2_row_compat, intermediate_compat, 1.0f); + Tensor out = out_3d->view({batch_size, out_features}); + + if (bias) { + Tensor bias_broadcast = (*bias)->as_strided( + {batch_size, out_features}, + {0, (*bias)->strides()[0]}); + out = add(out, bias_broadcast); + } + return out; +} + +void bilinear_(Tensor out, Tensor x1, Tensor x2, Tensor weight, std::optional bias) { + Tensor result = bilinear(x1, x2, weight, bias); + rearrange_(out, result); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/fmod/fmod.cc b/src/infinicore/ops/fmod/fmod.cc new file mode 100644 index 000000000..30bee17d6 --- /dev/null +++ b/src/infinicore/ops/fmod/fmod.cc @@ -0,0 +1,28 @@ +#include "infinicore/ops/fmod.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Fmod::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Fmod::execute(Tensor c, Tensor a, Tensor b) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b); + infinicore::context::setDevice(c->device()); + dispatcher().lookup(c->device().getType())(c, a, b); +} + +Tensor fmod(Tensor a, Tensor b) { + auto c = Tensor::empty(a->shape(), a->dtype(), a->device()); + fmod_(c, a, b); + return c; +} + +void fmod_(Tensor c, Tensor a, Tensor b) { + Fmod::execute(c, a, b); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/fmod/fmod_infiniop.cc b/src/infinicore/ops/fmod/fmod_infiniop.cc new file mode 100644 index 000000000..e796090d0 --- /dev/null +++ b/src/infinicore/ops/fmod/fmod_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/fmod.hpp" +#include + +namespace infinicore::op::fmod_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopFmodDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyFmodDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor c, Tensor a, Tensor b) { + size_t seed = hash_combine(c, b, a); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopFmodDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateFmodDescriptor( + context::getInfiniopHandle(c->device()), &desc, + c->desc(), a->desc(), b->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetFmodWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopFmod( + desc, workspace->data(), workspace_size, + c->data(), a->data(), b->data(), context::getStream())); +} + +static bool registered = []() { + Fmod::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::fmod_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index b781fa843..1d0ace555 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -2,12 +2,17 @@ #include +#include "ops/adaptive_max_pool1d.hpp" #include "ops/add.hpp" #include "ops/add_rms_norm.hpp" +#include "ops/asinh.hpp" #include "ops/attention.hpp" +#include "ops/baddbmm.hpp" +#include "ops/bilinear.hpp" #include "ops/causal_softmax.hpp" #include "ops/embedding.hpp" #include "ops/flash_attention.hpp" +#include "ops/fmod.hpp" #include "ops/kv_caching.hpp" #include "ops/linear.hpp" #include "ops/linear_w8a8i8.hpp" @@ -30,12 +35,18 @@ namespace py = pybind11; namespace infinicore::ops { inline void bind(py::module &m) { + bind_adaptive_max_pool1d(m); bind_add(m); bind_add_rms_norm(m); bind_attention(m); + bind_asinh(m); + bind_baddbmm(m); + bind_bilinear(m); bind_causal_softmax(m); bind_flash_attention(m); bind_kv_caching(m); + bind_fmod(m); + bind_random_sample(m); bind_linear(m); bind_matmul(m); bind_mul(m); diff --git a/src/infinicore/pybind11/ops/adaptive_max_pool1d.hpp b/src/infinicore/pybind11/ops/adaptive_max_pool1d.hpp new file mode 100644 index 000000000..747d92b9a --- /dev/null +++ b/src/infinicore/pybind11/ops/adaptive_max_pool1d.hpp @@ -0,0 +1,39 @@ +#pragma once + +#include + +#include "infinicore/ops/adaptive_max_pool1d.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_adaptive_max_pool1d(py::module &m) { + m.def("adaptive_max_pool1d", + &op::adaptive_max_pool1d, + py::arg("x"), + py::arg("output_size"), + R"doc(1D Adaptive Max Pooling. + +Args: + x: Input tensor of shape (N, C, L_in) or (N, L_in) + output_size: Target output size L_out +Returns: + Output tensor of shape (N, C, L_out) or (N, L_out) +)doc"); + + m.def("adaptive_max_pool1d_", + &op::adaptive_max_pool1d_, + py::arg("y"), + py::arg("x"), + py::arg("output_size"), + R"doc(In-place 1D Adaptive Max Pooling. + +Args: + y: Output tensor of shape (N, C, L_out) or (N, L_out) + x: Input tensor of shape (N, C, L_in) or (N, L_in) + output_size: Target output size L_out +)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/asinh.hpp b/src/infinicore/pybind11/ops/asinh.hpp new file mode 100644 index 000000000..bf1fcca23 --- /dev/null +++ b/src/infinicore/pybind11/ops/asinh.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/asinh.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_asinh(py::module &m) { + m.def("asinh", + &op::asinh, + py::arg("x"), + R"doc(Element-wise inverse hyperbolic sine function.)doc"); + + m.def("asinh_", + &op::asinh_, + py::arg("y"), + py::arg("x"), + R"doc(In-place element-wise inverse hyperbolic sine function.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/baddbmm.hpp b/src/infinicore/pybind11/ops/baddbmm.hpp new file mode 100644 index 000000000..3aef0ce20 --- /dev/null +++ b/src/infinicore/pybind11/ops/baddbmm.hpp @@ -0,0 +1,56 @@ +#pragma once + +#include + +#include "infinicore/ops/baddbmm.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +Tensor py_baddbmm(Tensor input, Tensor batch1, Tensor batch2, float beta = 1.0f, float alpha = 1.0f) { + return op::baddbmm(input, batch1, batch2, beta, alpha); +} + +void py_baddbmm_(Tensor out, Tensor input, Tensor batch1, Tensor batch2, float beta = 1.0f, float alpha = 1.0f) { + op::baddbmm_(out, input, batch1, batch2, beta, alpha); +} + +inline void bind_baddbmm(py::module &m) { + m.def("baddbmm", + &py_baddbmm, + py::arg("input"), + py::arg("batch1"), + py::arg("batch2"), + py::arg("beta") = 1.0f, + py::arg("alpha") = 1.0f, + R"doc(Batched matrix-matrix product with addition. +Args: + input: Input tensor + batch1: First batch of matrices + batch2: Second batch of matrices + beta: Scaling factor for input tensor + alpha: Scaling factor for the product of batch1 and batch2 +Returns: + Output tensor after baddbmm operation +)doc"); + m.def("baddbmm_", + &py_baddbmm_, + py::arg("out"), + py::arg("input"), + py::arg("batch1"), + py::arg("batch2"), + py::arg("beta") = 1.0f, + py::arg("alpha") = 1.0f, + R"doc(In-place batched matrix-matrix product with addition. +Args: + out: Output tensor + input: Input tensor + batch1: First batch of matrices + batch2: Second batch of matrices + beta: Scaling factor for input tensor + alpha: Scaling factor for the product of batch1 and batch2 +)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/bilinear.hpp b/src/infinicore/pybind11/ops/bilinear.hpp new file mode 100644 index 000000000..9c8ff80d6 --- /dev/null +++ b/src/infinicore/pybind11/ops/bilinear.hpp @@ -0,0 +1,61 @@ +#pragma once + +#include + +#include "infinicore/ops/bilinear.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +Tensor py_bilinear(Tensor x1, Tensor x2, Tensor weight, pybind11::object bias) { + std::optional bias_tensor = std::nullopt; + if (!bias.is_none()) { + bias_tensor = bias.cast(); + } + return op::bilinear(x1, x2, weight, bias_tensor); +} + +void py_bilinear_(Tensor out, Tensor x1, Tensor x2, Tensor weight, pybind11::object bias) { + std::optional bias_tensor = std::nullopt; + if (!bias.is_none()) { + bias_tensor = bias.cast(); + } + op::bilinear_(out, x1, x2, weight, bias_tensor); +} + +inline void bind_bilinear(py::module &m) { + m.def("bilinear", + &py_bilinear, + py::arg("x1"), + py::arg("x2"), + py::arg("weight"), + py::arg("bias"), + R"doc(Bilinear transformation of two input tensors. +Args: + x1: First input tensor + x2: Second input tensor + weight: Weight tensor + bias: Bias tensor (optional) +Returns: + Output tensor after bilinear transformation +)doc"); + + m.def("bilinear_", + &py_bilinear_, + py::arg("out"), + py::arg("x1"), + py::arg("x2"), + py::arg("weight"), + py::arg("bias"), + R"doc(In-place bilinear transformation of two input tensors. +Args: + out: Output tensor + x1: First input tensor + x2: Second input tensor + weight: Weight tensor + bias: Bias tensor (optional) +)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/fmod.hpp b/src/infinicore/pybind11/ops/fmod.hpp new file mode 100644 index 000000000..97af57da2 --- /dev/null +++ b/src/infinicore/pybind11/ops/fmod.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include + +#include "infinicore/ops/fmod.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_fmod(py::module &m) { + m.def("fmod", + &op::fmod, + py::arg("a"), + py::arg("b"), + R"doc(Element-wise floating point remainder of division of two tensors.)doc"); + + m.def("fmod_", + &op::fmod_, + py::arg("c"), + py::arg("a"), + py::arg("b"), + R"doc(In-place element-wise floating point remainder of division of two tensors.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infiniop/ops/adaptive_max_pool1d/adaptive_max_pool1d.h b/src/infiniop/ops/adaptive_max_pool1d/adaptive_max_pool1d.h new file mode 100644 index 000000000..288c2ece4 --- /dev/null +++ b/src/infiniop/ops/adaptive_max_pool1d/adaptive_max_pool1d.h @@ -0,0 +1,47 @@ +#ifndef ADAPTIVE_MAX_POOL1D_H +#define ADAPTIVE_MAX_POOL1D_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::adaptive_max_pool1d::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + AdaptiveMaxPool1dInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + AdaptiveMaxPool1dInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + size_t output_size); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *y, \ + const void *x, \ + void *stream) const; \ + }; \ + } + +#endif // ADAPTIVE_MAX_POOL1D_H \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.cc b/src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.cc new file mode 100644 index 000000000..69edf83bc --- /dev/null +++ b/src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.cc @@ -0,0 +1,98 @@ +#include "adaptive_max_pool1d_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include +#include + +namespace op::adaptive_max_pool1d::cpu { + +Descriptor::~Descriptor() {} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t output_size) { + auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size); + CHECK_RESULT(result); + *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t adaptiveMaxPool1d(const AdaptiveMaxPool1dInfo *info, T *y, const T *x) { + + const size_t ndim = info->ndim(); + const size_t batch_size = info->shape[0]; + const size_t channels = ndim > 2 ? info->shape[1] : 1; + + const size_t input_length = info->input_length(); + const size_t output_length = info->output_length(); + + // 计算总的任务块数 (Batch * Channels) + const ptrdiff_t total_blocks = static_cast(batch_size * channels); + + const ptrdiff_t x_stride_last = info->x_strides.back(); + +#pragma omp parallel for + for (ptrdiff_t block_idx = 0; block_idx < total_blocks; ++block_idx) { + const size_t i = block_idx / channels; // batch index + const size_t j = block_idx % channels; // channel index + + const T *x_ptr_base; + T *y_ptr_base; + + if (ndim > 2) { // (N, C, L) + x_ptr_base = x + i * info->x_strides[0] + j * info->x_strides[1]; + y_ptr_base = y + i * info->y_strides[0] + j * info->y_strides[1]; + } else { // (N, L) + x_ptr_base = x + i * info->x_strides[0]; + y_ptr_base = y + i * info->y_strides[0]; + } + + for (size_t out_idx = 0; out_idx < output_length; ++out_idx) { + // 计算池化窗口范围 [start_index, end_index) + // 公式参考 PyTorch: + // start = floor(out_idx * L_in / L_out) + // end = ceil((out_idx + 1) * L_in / L_out) + int start_index = std::floor((float)out_idx * input_length / output_length); + int end_index = std::ceil((float)(out_idx + 1) * input_length / output_length); + + start_index = std::max(start_index, 0); + end_index = std::min(end_index, (int)input_length); + int window_len = end_index - start_index; + + if (window_len <= 0) { + continue; + } + + const T *window_ptr = x_ptr_base + start_index * x_stride_last; + + auto max_val = op::common_cpu::reduce_op::max(window_ptr, window_len, x_stride_last); + y_ptr_base[out_idx] = utils::cast(max_val); + } + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *y, const void *x, + void *stream) const { + + if (_info.atype == INFINI_DTYPE_F32) { + return adaptiveMaxPool1d(&_info, (float *)y, (const float *)x); + } else if (_info.atype == INFINI_DTYPE_F16) { + return adaptiveMaxPool1d(&_info, (fp16_t *)y, (const fp16_t *)x); + } else if (_info.atype == INFINI_DTYPE_BF16) { + return adaptiveMaxPool1d(&_info, (bf16_t *)y, (const bf16_t *)x); + } else if (_info.atype == INFINI_DTYPE_F64) { + return adaptiveMaxPool1d(&_info, (double *)y, (const double *)x); + } + + return INFINI_STATUS_BAD_TENSOR_DTYPE; +} + +} // namespace op::adaptive_max_pool1d::cpu \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.h b/src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.h new file mode 100644 index 000000000..f3e8ced3c --- /dev/null +++ b/src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.h @@ -0,0 +1,8 @@ +#ifndef __ADAPTIVE_MAX_POOL1D_CPU_H__ +#define __ADAPTIVE_MAX_POOL1D_CPU_H__ + +#include "../adaptive_max_pool1d.h" + +DESCRIPTOR(cpu) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_max_pool1d/cuda/kernel.cuh b/src/infiniop/ops/adaptive_max_pool1d/cuda/kernel.cuh new file mode 100644 index 000000000..814688846 --- /dev/null +++ b/src/infiniop/ops/adaptive_max_pool1d/cuda/kernel.cuh @@ -0,0 +1,54 @@ +#ifndef __ADAPTIVE_MAX_POOL1D_CUDA_KERNEL_H__ +#define __ADAPTIVE_MAX_POOL1D_CUDA_KERNEL_H__ + +#include +#include + +template +__device__ void adaptiveMaxPool1dBlock( + Tdata *__restrict__ y, + ptrdiff_t stride_y_batch, + ptrdiff_t stride_y_channel, + const Tdata *__restrict__ x, + ptrdiff_t stride_x_batch, + ptrdiff_t stride_x_channel, + ptrdiff_t stride_x_length, + size_t channels, + size_t input_length, + size_t output_length, + size_t ndim) { + + size_t block_idx = blockIdx.x; + size_t batch_idx = block_idx / channels; + size_t channel_idx = block_idx % channels; + + const Tdata *x_ptr; + Tdata *y_ptr; + + if (ndim > 2) { + x_ptr = x + batch_idx * stride_x_batch + channel_idx * stride_x_channel; + y_ptr = y + batch_idx * stride_y_batch + channel_idx * stride_y_channel; + } else { + x_ptr = x + batch_idx * stride_x_batch; + y_ptr = y + batch_idx * stride_y_batch; + } + + for (size_t out_idx = threadIdx.x; out_idx < output_length; out_idx += BLOCK_SIZE) { + int start_index = static_cast(floorf((float)out_idx * input_length / output_length)); + int end_index = static_cast(ceilf((float)(out_idx + 1) * input_length / output_length)); + + if (end_index <= start_index) { + continue; + } + + Tcompute max_val = Tcompute(x_ptr[start_index * stride_x_length]); + for (int i = start_index + 1; i < end_index; ++i) { + Tcompute val = Tcompute(x_ptr[i * stride_x_length]); + max_val = max(max_val, val); + } + + y_ptr[out_idx] = Tdata(max_val); + } +} + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_max_pool1d/info.h b/src/infiniop/ops/adaptive_max_pool1d/info.h new file mode 100644 index 000000000..7194d2d93 --- /dev/null +++ b/src/infiniop/ops/adaptive_max_pool1d/info.h @@ -0,0 +1,65 @@ +#ifndef __ADAPATIVE_MAX_POOL1D_H__ +#define __ADAPATIVE_MAX_POOL1D_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::adaptive_max_pool1d { + +class AdaptiveMaxPool1dInfo { + AdaptiveMaxPool1dInfo() = default; + +public: + infiniDtype_t atype; + std::vector shape; + std::vector y_strides; + std::vector x_strides; + size_t input_size; + size_t output_size; + size_t ndim() const { return shape.size(); } + size_t input_length() const { return input_size; } + size_t output_length() const { return output_size; } + + static utils::Result create( + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t output_size) { + + auto atype = y_desc->dtype(); + if (x_desc->dtype() != atype) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + if (atype != INFINI_DTYPE_F16 && atype != INFINI_DTYPE_BF16 && atype != INFINI_DTYPE_F32 && atype != INFINI_DTYPE_F64) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + const size_t y_ndim = y_desc->ndim(); + const size_t x_ndim = x_desc->ndim(); + + if (y_ndim != x_ndim) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + for (size_t i = 0; i < y_ndim - 1; ++i) { + if (x_desc->dim(i) != y_desc->dim(i)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + + if (y_desc->dim(y_ndim - 1) != output_size) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + return utils::Result(AdaptiveMaxPool1dInfo{ + atype, + y_desc->shape(), + y_desc->strides(), + x_desc->strides(), + x_desc->dim(x_ndim - 1), + output_size}); + } +}; +} // namespace op::adaptive_max_pool1d + +#endif // __ADAPATIVE_MAX_POOL1D_H__ \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.cuh b/src/infiniop/ops/adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.cuh new file mode 100644 index 000000000..fcd068b6d --- /dev/null +++ b/src/infiniop/ops/adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.cuh @@ -0,0 +1,8 @@ +#ifndef __ADAPTIVE_MAX_POOL1D_METAX_CUH__ +#define __ADAPTIVE_MAX_POOL1D_METAX_CUH__ + +#include "../adaptive_max_pool1d.h" + +DESCRIPTOR(metax) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.maca b/src/infiniop/ops/adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.maca new file mode 100644 index 000000000..f72aae852 --- /dev/null +++ b/src/infiniop/ops/adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.maca @@ -0,0 +1,130 @@ +#include "../../../devices/metax/metax_common.h" +#include "adaptive_max_pool1d_metax.cuh" + +#include "../../../devices/metax/metax_kernel_common.h" + +#include "../cuda/kernel.cuh" + +template +INFINIOP_METAX_KERNEL adaptiveMaxPool1dKernel( + Tdata *__restrict__ y, + ptrdiff_t stride_y_batch, + ptrdiff_t stride_y_channel, + const Tdata *__restrict__ x, + ptrdiff_t stride_x_batch, + ptrdiff_t stride_x_channel, + ptrdiff_t stride_x_length, + size_t channels, + size_t input_length, + size_t output_length, + size_t ndim) { + + adaptiveMaxPool1dBlock( + y, stride_y_batch, stride_y_channel, + x, stride_x_batch, stride_x_channel, stride_x_length, + channels, input_length, output_length,ndim); +} + +namespace op::adaptive_max_pool1d::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor(){ + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t output_size) { + + auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size); + CHECK_RESULT(result); + auto info = result.take(); + + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + std::move(info), + 0, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel( + uint32_t numblock, + void *y, infiniDtype_t dtype, + ptrdiff_t stride_y_batch, ptrdiff_t stride_y_channel, + const void *x, + ptrdiff_t stride_x_batch, ptrdiff_t stride_x_channel, ptrdiff_t stride_x_length, + size_t channels, size_t input_length, size_t output_length, size_t ndim, + hcStream_t stream){ + +#define LAUNCH_KERNEL(Tdata, Tcompute) \ + adaptiveMaxPool1dKernel<<>> ( \ + reinterpret_cast(y), \ + stride_y_batch, stride_y_channel, \ + reinterpret_cast(x), \ + stride_x_batch, stride_x_channel, stride_x_length, \ + channels, input_length, output_length, ndim) + + if (dtype == INFINI_DTYPE_F16) { + LAUNCH_KERNEL(half, float); + } else if (dtype == INFINI_DTYPE_BF16) { + LAUNCH_KERNEL(__hpcc_bfloat16, float); + } else if (dtype == INFINI_DTYPE_F32) { + LAUNCH_KERNEL(float, float); + } else if (dtype == INFINI_DTYPE_F64) { + LAUNCH_KERNEL(double, double); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +#undef LAUNCH_KERNEL + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *y, const void *x, + void *stream_) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + const size_t ndim = _info.ndim(); + const size_t batch_size = _info.shape[0]; + const size_t channels = ndim > 2 ? _info.shape[1] : 1; + const size_t input_length = _info.input_length(); + const size_t output_length = _info.output_length(); + + ptrdiff_t stride_x_batch = _info.x_strides[0]; + ptrdiff_t stride_x_channel = ndim > 2 ? _info.x_strides[1] : 0; + ptrdiff_t stride_x_length = _info.x_strides.back(); + + ptrdiff_t stride_y_batch = _info.y_strides[0]; + ptrdiff_t stride_y_channel = ndim > 2 ? _info.y_strides[1] : 0; + + uint32_t num_blocks = static_cast(batch_size * channels); + auto stream = reinterpret_cast(stream_); + + if (_opaque->internal->maxThreadsPerBlock() >= METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + num_blocks, y, _info.atype, + stride_y_batch, stride_y_channel, + x, stride_x_batch, stride_x_channel, stride_x_length, + channels, input_length, output_length, ndim, + stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::adaptive_max_pool1d::metax \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.h b/src/infiniop/ops/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.h new file mode 100644 index 000000000..c56ad6fd4 --- /dev/null +++ b/src/infiniop/ops/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.h @@ -0,0 +1,8 @@ +#ifndef __ADAPTIVE_MAX_POOL1D_MOOORE_H__ +#define __ADAPTIVE_MAX_POOL1D_MOOORE_H__ + +#include "../adaptive_max_pool1d.h" + +DESCRIPTOR(moore) + +#endif diff --git a/src/infiniop/ops/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.mu b/src/infiniop/ops/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.mu new file mode 100644 index 000000000..256392f78 --- /dev/null +++ b/src/infiniop/ops/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.mu @@ -0,0 +1,144 @@ +#include "../../../devices/moore/moore_common.h" +#include "adaptive_max_pool1d_moore.h" + +#include "../../../devices/moore/moore_kernel_common.h" + +#include "../cuda/kernel.cuh" + +template +INFINIOP_MOORE_KERNEL adaptiveMaxPool1dKernel( + Tdata *__restrict__ y, + ptrdiff_t stride_y_batch, + ptrdiff_t stride_y_channel, + const Tdata *__restrict__ x, + ptrdiff_t stride_x_batch, + ptrdiff_t stride_x_channel, + ptrdiff_t stride_x_length, + size_t channels, + size_t input_length, + size_t output_length, + size_t ndim){ + + adaptiveMaxPool1dBlock( + y, stride_y_batch, stride_y_channel, + x, stride_x_batch, stride_x_channel, stride_x_length, + channels, input_length, output_length, ndim); +} + +namespace op::adaptive_max_pool1d::moore { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t output_size) { + auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size); + CHECK_RESULT(result); + auto info = result.take(); + + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + std::move(info), + 0, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel( + uint32_t num_blocks, + void *y, infiniDtype_t dtype, + ptrdiff_t stride_y_batch, ptrdiff_t stride_y_channel, + const void *x, + ptrdiff_t stride_x_batch, ptrdiff_t stride_x_channel, ptrdiff_t stride_x_length, + size_t channels, size_t input_length, size_t output_length, size_t ndim, + musaStream_t musa_stream) { + +#define LAUNCH_KERNEL(Tdata, Tcompute) \ + adaptiveMaxPool1dKernel<<>>( \ + reinterpret_cast(y), \ + stride_y_batch, stride_y_channel, \ + reinterpret_cast(x), \ + stride_x_batch, stride_x_channel, stride_x_length, \ + channels, input_length, output_length, ndim) + + if (dtype == INFINI_DTYPE_F16) { + LAUNCH_KERNEL(half, float); + } else if (dtype == INFINI_DTYPE_BF16) { + LAUNCH_KERNEL(__mt_bfloat16, float); + } else if (dtype == INFINI_DTYPE_F32) { + LAUNCH_KERNEL(float, float); + } else if (dtype == INFINI_DTYPE_F64) { + LAUNCH_KERNEL(double, double); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + +#undef LAUNCH_KERNEL + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *y, const void *x, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + const size_t ndim = _info.ndim(); + const size_t batch_size = _info.shape[0]; + const size_t channels = ndim > 2 ? _info.shape[1] : 1; + const size_t input_length = _info.input_length(); + const size_t output_length = _info.output_length(); + + ptrdiff_t stride_x_batch = _info.x_strides[0]; + ptrdiff_t stride_x_channel = ndim > 2 ? _info.x_strides[1] : 0; + ptrdiff_t stride_x_length = _info.x_strides.back(); + + ptrdiff_t stride_y_batch = _info.y_strides[0]; + ptrdiff_t stride_y_channel = ndim > 2 ? _info.y_strides[1] : 0; + + uint32_t num_blocks = static_cast(batch_size * channels); + auto musa_stream = reinterpret_cast(stream); + + if (_opaque->internal->maxThreadsPerBlock() >= MOORE_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + num_blocks, y, _info.atype, + stride_y_batch, stride_y_channel, + x, stride_x_batch, stride_x_channel, stride_x_length, + channels, input_length, output_length, ndim, + musa_stream)); + } else if (_opaque->internal->maxThreadsPerBlock() >= MOORE_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + num_blocks, y, _info.atype, + stride_y_batch, stride_y_channel, + x, stride_x_batch, stride_x_channel, stride_x_length, + channels, input_length, output_length, ndim, + musa_stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_2048) { + CHECK_STATUS(launchKernel( + num_blocks, y, _info.atype, + stride_y_batch, stride_y_channel, + x, stride_x_batch, stride_x_channel, stride_x_length, + channels, input_length, output_length, ndim, + musa_stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::adaptive_max_pool1d::moore \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_max_pool1d/nvidia/adaptive_max_pool1d_nvidia.cu b/src/infiniop/ops/adaptive_max_pool1d/nvidia/adaptive_max_pool1d_nvidia.cu new file mode 100644 index 000000000..96ffe573f --- /dev/null +++ b/src/infiniop/ops/adaptive_max_pool1d/nvidia/adaptive_max_pool1d_nvidia.cu @@ -0,0 +1,144 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "adaptive_max_pool1d_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "../cuda/kernel.cuh" + +template +INFINIOP_CUDA_KERNEL adaptiveMaxPool1dKernel( + Tdata *__restrict__ y, + ptrdiff_t stride_y_batch, + ptrdiff_t stride_y_channel, + const Tdata *__restrict__ x, + ptrdiff_t stride_x_batch, + ptrdiff_t stride_x_channel, + ptrdiff_t stride_x_length, + size_t channels, + size_t input_length, + size_t output_length, + size_t ndim) { + + adaptiveMaxPool1dBlock( + y, stride_y_batch, stride_y_channel, + x, stride_x_batch, stride_x_channel, stride_x_length, + channels, input_length, output_length, ndim); +} + +namespace op::adaptive_max_pool1d::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t output_size) { + auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size); + CHECK_RESULT(result); + auto info = result.take(); + + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + std::move(info), + 0, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel( + uint32_t num_blocks, + void *y, infiniDtype_t dtype, + ptrdiff_t stride_y_batch, ptrdiff_t stride_y_channel, + const void *x, + ptrdiff_t stride_x_batch, ptrdiff_t stride_x_channel, ptrdiff_t stride_x_length, + size_t channels, size_t input_length, size_t output_length, size_t ndim, + cudaStream_t cuda_stream) { + +#define LAUNCH_KERNEL(Tdata, Tcompute) \ + adaptiveMaxPool1dKernel<<>>( \ + reinterpret_cast(y), \ + stride_y_batch, stride_y_channel, \ + reinterpret_cast(x), \ + stride_x_batch, stride_x_channel, stride_x_length, \ + channels, input_length, output_length, ndim) + + if (dtype == INFINI_DTYPE_F16) { + LAUNCH_KERNEL(half, float); + } else if (dtype == INFINI_DTYPE_BF16) { + LAUNCH_KERNEL(__nv_bfloat16, float); + } else if (dtype == INFINI_DTYPE_F32) { + LAUNCH_KERNEL(float, float); + } else if (dtype == INFINI_DTYPE_F64) { + LAUNCH_KERNEL(double, double); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + +#undef LAUNCH_KERNEL + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *y, const void *x, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + const size_t ndim = _info.ndim(); + const size_t batch_size = _info.shape[0]; + const size_t channels = ndim > 2 ? _info.shape[1] : 1; + const size_t input_length = _info.input_length(); + const size_t output_length = _info.output_length(); + + ptrdiff_t stride_x_batch = _info.x_strides[0]; + ptrdiff_t stride_x_channel = ndim > 2 ? _info.x_strides[1] : 0; + ptrdiff_t stride_x_length = _info.x_strides.back(); + + ptrdiff_t stride_y_batch = _info.y_strides[0]; + ptrdiff_t stride_y_channel = ndim > 2 ? _info.y_strides[1] : 0; + + uint32_t num_blocks = static_cast(batch_size * channels); + auto cuda_stream = reinterpret_cast(stream); + + if (_opaque->internal->maxThreadsPerBlock() >= CUDA_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + num_blocks, y, _info.atype, + stride_y_batch, stride_y_channel, + x, stride_x_batch, stride_x_channel, stride_x_length, + channels, input_length, output_length, ndim, + cuda_stream)); + } else if (_opaque->internal->maxThreadsPerBlock() >= CUDA_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + num_blocks, y, _info.atype, + stride_y_batch, stride_y_channel, + x, stride_x_batch, stride_x_channel, stride_x_length, + channels, input_length, output_length, ndim, + cuda_stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CHECK_STATUS(launchKernel( + num_blocks, y, _info.atype, + stride_y_batch, stride_y_channel, + x, stride_x_batch, stride_x_channel, stride_x_length, + channels, input_length, output_length, ndim, + cuda_stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::adaptive_max_pool1d::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_max_pool1d/nvidia/adaptive_max_pool1d_nvidia.cuh b/src/infiniop/ops/adaptive_max_pool1d/nvidia/adaptive_max_pool1d_nvidia.cuh new file mode 100644 index 000000000..b980ce269 --- /dev/null +++ b/src/infiniop/ops/adaptive_max_pool1d/nvidia/adaptive_max_pool1d_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ADAPTIVE_MAX_POOL1D_CUDA_H__ +#define __ADAPTIVE_MAX_POOL1D_CUDA_H__ + +#include "../adaptive_max_pool1d.h" + +DESCRIPTOR(nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_max_pool1d/operator.cc b/src/infiniop/ops/adaptive_max_pool1d/operator.cc new file mode 100644 index 000000000..7048a1033 --- /dev/null +++ b/src/infiniop/ops/adaptive_max_pool1d/operator.cc @@ -0,0 +1,147 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/adaptive_max_pool1d.h" + +#ifdef ENABLE_CPU_API +#include "cpu/adaptive_max_pool1d_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/adaptive_max_pool1d_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/adaptive_max_pool1d_metax.cuh" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/adaptive_max_pool1d_moore.h" +#endif + +__C infiniStatus_t infiniopCreateAdaptiveMaxPool1dDescriptor( + infiniopHandle_t handle, + infiniopAdaptiveMaxPool1dDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t output_size) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::adaptive_max_pool1d::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + output_size) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + } +#undef CREATE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetAdaptiveMaxPool1dWorkspaceSize( + infiniopAdaptiveMaxPool1dDescriptor_t desc, + size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAdaptiveMaxPool1d( + infiniopAdaptiveMaxPool1dDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, y, x, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + } +#undef CALCULATE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyAdaptiveMaxPool1dDescriptor( + infiniopAdaptiveMaxPool1dDescriptor_t desc) { +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DESTROY(INFINI_DEVICE_MOORE, moore); +#endif + } +#undef DESTROY + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} \ No newline at end of file diff --git a/src/infiniop/ops/asinh/cpu/asinh_cpu.cc b/src/infiniop/ops/asinh/cpu/asinh_cpu.cc new file mode 100644 index 000000000..4d7627473 --- /dev/null +++ b/src/infiniop/ops/asinh/cpu/asinh_cpu.cc @@ -0,0 +1,50 @@ +#include "asinh_cpu.h" + +namespace op::asinh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::asinh::cpu \ No newline at end of file diff --git a/src/infiniop/ops/asinh/cpu/asinh_cpu.h b/src/infiniop/ops/asinh/cpu/asinh_cpu.h new file mode 100644 index 000000000..076fcb30a --- /dev/null +++ b/src/infiniop/ops/asinh/cpu/asinh_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ASINH_CPU_H__ +#define __ASINH_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(asinh, cpu) + +namespace op::asinh::cpu { +typedef struct AsinhOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::asinh(x); + } +} AsinhOp; +} // namespace op::asinh::cpu + +#endif // __ASINH_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/asinh/cuda/kernel.cuh b/src/infiniop/ops/asinh/cuda/kernel.cuh new file mode 100644 index 000000000..2bd6dcbf0 --- /dev/null +++ b/src/infiniop/ops/asinh/cuda/kernel.cuh @@ -0,0 +1,29 @@ +#ifndef __ASINH_CUDA_KERNEL_H__ +#define __ASINH_CUDA_KERNEL_H__ + +namespace op::asinh::cuda { + +typedef struct AsinhOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + + if constexpr (std::is_same_v) { + float x_f = __half2float(x); + return __float2half(asinhf(x_f)); + } else if constexpr (std::is_same_v) { + float x_f = __bfloat162float(x); + return __float2bfloat16(asinhf(x_f)); + } else if constexpr (std::is_same_v) { + return asinhf(x); + } else { + return ::asinh(x); + } + } + +} AsinhOp; + +} // namespace op::asinh::cuda + +#endif // __ASINH_CUDA_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/asinh/metax/asinh.maca b/src/infiniop/ops/asinh/metax/asinh.maca new file mode 100644 index 000000000..f6f4ac3f9 --- /dev/null +++ b/src/infiniop/ops/asinh/metax/asinh.maca @@ -0,0 +1,58 @@ +#include "asinh_metax.h" +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::asinh::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::AsinhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::AsinhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::asinh::metax diff --git a/src/infiniop/ops/asinh/metax/asinh_metax.h b/src/infiniop/ops/asinh/metax/asinh_metax.h new file mode 100644 index 000000000..dacb77f0d --- /dev/null +++ b/src/infiniop/ops/asinh/metax/asinh_metax.h @@ -0,0 +1,8 @@ +#ifndef __ASINH_METAX_API_H__ +#define __ASINH_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(asinh, metax) + +#endif // __ASINH_METAX_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/asinh/moore/asinh_moore.h b/src/infiniop/ops/asinh/moore/asinh_moore.h new file mode 100644 index 000000000..36c93d53a --- /dev/null +++ b/src/infiniop/ops/asinh/moore/asinh_moore.h @@ -0,0 +1,8 @@ +#ifndef __ASINH_MOORE_API_H__ +#define __ASINH_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(asinh, moore) + +#endif // __ASINH_MOORE_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/asinh/moore/asinh_moore.mu b/src/infiniop/ops/asinh/moore/asinh_moore.mu new file mode 100644 index 000000000..35a8d6475 --- /dev/null +++ b/src/infiniop/ops/asinh/moore/asinh_moore.mu @@ -0,0 +1,59 @@ +#include "asinh_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "../cuda/kernel.cuh" + +namespace op::asinh::moore { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create MOORE elementwise descriptor + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::AsinhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::AsinhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asinh::moore \ No newline at end of file diff --git a/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu new file mode 100644 index 000000000..77a4652bc --- /dev/null +++ b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu @@ -0,0 +1,56 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "asinh_nvidia.cuh" + +namespace op::asinh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::AsinhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::AsinhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::asinh::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh new file mode 100644 index 000000000..5b75a553c --- /dev/null +++ b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ASINH_NVIDIA_API_H__ +#define __ASINH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(asinh, nvidia) + +#endif // __ASINH_NVIDIA_API_H \ No newline at end of file diff --git a/src/infiniop/ops/asinh/operator.cc b/src/infiniop/ops/asinh/operator.cc new file mode 100644 index 000000000..5c48902c7 --- /dev/null +++ b/src/infiniop/ops/asinh/operator.cc @@ -0,0 +1,141 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/asinh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/asinh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/asinh_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/asinh_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/asinh_moore.h" +#endif + +__C infiniStatus_t infiniopCreateAsinhDescriptor( + infiniopHandle_t handle, + infiniopAsinhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::asinh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__C infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAsinh(infiniopAsinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream); + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc) { +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DESTROY(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DESTROY +} \ No newline at end of file diff --git a/src/infiniop/ops/fmod/cpu/fmod_cpu.cc b/src/infiniop/ops/fmod/cpu/fmod_cpu.cc new file mode 100644 index 000000000..1f27290de --- /dev/null +++ b/src/infiniop/ops/fmod/cpu/fmod_cpu.cc @@ -0,0 +1,53 @@ +#include "fmod_cpu.h" + +namespace op::fmod::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &out_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::fmod::cpu diff --git a/src/infiniop/ops/fmod/cpu/fmod_cpu.h b/src/infiniop/ops/fmod/cpu/fmod_cpu.h new file mode 100644 index 000000000..54af25540 --- /dev/null +++ b/src/infiniop/ops/fmod/cpu/fmod_cpu.h @@ -0,0 +1,19 @@ +#ifndef _FMOD_CPU_H__ +#define _FMOD_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(fmod, cpu) + +namespace op::fmod::cpu { +typedef struct FmodOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return std::fmod(a, b); + } +} FmodOp; +} // namespace op::fmod::cpu + +#endif // _FMOD_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/fmod/cuda/kernel.cuh b/src/infiniop/ops/fmod/cuda/kernel.cuh new file mode 100644 index 000000000..6e30ed25e --- /dev/null +++ b/src/infiniop/ops/fmod/cuda/kernel.cuh @@ -0,0 +1,48 @@ +#ifndef __FMOD_CUDA_H__ +#define __FMOD_CUDA_H__ + +namespace op::fmod::cuda { +typedef struct FmodOp { + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + // fmod(a, b) = a - b * trunc(a / b) + if constexpr (std::is_same_v) { + // 对于 half2,转换为 float 计算后再转回 + float2 af = __half22float2(a); + float2 bf = __half22float2(b); + float2 result; + result.x = ::fmodf(af.x, bf.x); + result.y = ::fmodf(af.y, bf.y); + return __float22half2_rn(result); + } else if constexpr (std::is_same_v) { + // 对于 bfloat162,转换为 float 计算后再转回 + float af_low = __bfloat162float(__low2bfloat16(a)); + float af_high = __bfloat162float(__high2bfloat16(a)); + float bf_low = __bfloat162float(__low2bfloat16(b)); + float bf_high = __bfloat162float(__high2bfloat16(b)); + return __floats2bfloat162_rn(::fmodf(af_low, bf_low), ::fmodf(af_high, bf_high)); + } else if constexpr (std::is_same_v) { + // 对于 half,转换为 float 计算后再转回 + float af = __half2float(a); + float bf = __half2float(b); + return __float2half(::fmodf(af, bf)); + } else if constexpr (std::is_same_v) { + // 对于 bfloat16,转换为 float 计算后再转回 + float af = __bfloat162float(a); + float bf = __bfloat162float(b); + return __float2bfloat16(::fmodf(af, bf)); + } else if constexpr (std::is_same_v) { + return ::fmodf(a, b); + } else if constexpr (std::is_same_v) { + return ::fmod(a, b); + } else { + // 整数类型使用 % 运算符 + return a % b; + } + } +} FmodOp; + +} // namespace op::fmod::cuda + +#endif // __FMOD_CUDA_H__ \ No newline at end of file diff --git a/src/infiniop/ops/fmod/metax/fmod_metax.h b/src/infiniop/ops/fmod/metax/fmod_metax.h new file mode 100644 index 000000000..ad5769231 --- /dev/null +++ b/src/infiniop/ops/fmod/metax/fmod_metax.h @@ -0,0 +1,8 @@ +#ifndef __FMOD_METAX_API_H__ +#define __FMOD_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(fmod, metax) + +#endif // __FMOD_METAX_API_H__ diff --git a/src/infiniop/ops/fmod/metax/mul_metax.maca b/src/infiniop/ops/fmod/metax/mul_metax.maca new file mode 100644 index 000000000..c9d54ad62 --- /dev/null +++ b/src/infiniop/ops/fmod/metax/mul_metax.maca @@ -0,0 +1,61 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +#include "fmod_metax.h" + +namespace op::fmod::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::FmodOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::FmodOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::FmodOp, double>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::FmodOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::fmod::metax diff --git a/src/infiniop/ops/fmod/moore/fmod_moore.h b/src/infiniop/ops/fmod/moore/fmod_moore.h new file mode 100644 index 000000000..b24c337a8 --- /dev/null +++ b/src/infiniop/ops/fmod/moore/fmod_moore.h @@ -0,0 +1,8 @@ +#ifndef __FMOD_MOORE_API_H__ +#define __FMOD_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(fmod, moore) + +#endif // __FMOD_MOORE_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/fmod/moore/fmod_moore.mu b/src/infiniop/ops/fmod/moore/fmod_moore.mu new file mode 100644 index 000000000..0c37da459 --- /dev/null +++ b/src/infiniop/ops/fmod/moore/fmod_moore.mu @@ -0,0 +1,63 @@ +#include "fmod_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "../cuda/kernel.cuh" + +namespace op::fmod::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create MOORE elementwise descriptor + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::FmodOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::FmodOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::FmodOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::FmodOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::fmod::moore \ No newline at end of file diff --git a/src/infiniop/ops/fmod/nvidia/fmod_nvidia.cu b/src/infiniop/ops/fmod/nvidia/fmod_nvidia.cu new file mode 100644 index 000000000..a74295264 --- /dev/null +++ b/src/infiniop/ops/fmod/nvidia/fmod_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "fmod_nvidia.cuh" + +namespace op::fmod::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::FmodOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::FmodOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::FmodOp, double>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::FmodOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::fmod::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/fmod/nvidia/fmod_nvidia.cuh b/src/infiniop/ops/fmod/nvidia/fmod_nvidia.cuh new file mode 100644 index 000000000..e40d0088d --- /dev/null +++ b/src/infiniop/ops/fmod/nvidia/fmod_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MUL_CUDA_API_H__ +#define __MUL_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(fmod, nvidia) + +#endif // __MUL_CUDA_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/fmod/operator.cc b/src/infiniop/ops/fmod/operator.cc new file mode 100644 index 000000000..1fd433c4a --- /dev/null +++ b/src/infiniop/ops/fmod/operator.cc @@ -0,0 +1,152 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/fmod.h" + +#ifdef ENABLE_CPU_API +#include "cpu/fmod_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/fmod_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/fmod_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/fmod_moore.h" +#endif + +__C infiniStatus_t infiniopCreateFmodDescriptor( + infiniopHandle_t handle, + infiniopFmodDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::fmod::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetFmodWorkspaceSize(infiniopFmodDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopFmod( + infiniopFmodDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyFmodDescriptor(infiniopFmodDescriptor_t desc) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DELETE +} diff --git a/src/infiniop/ops/gemm/cpu/gemm_cpu.cc b/src/infiniop/ops/gemm/cpu/gemm_cpu.cc index d19965614..6f7a2e3e0 100644 --- a/src/infiniop/ops/gemm/cpu/gemm_cpu.cc +++ b/src/infiniop/ops/gemm/cpu/gemm_cpu.cc @@ -64,7 +64,11 @@ void calculate( *c_ = utils::cast(beta * utils::cast(*c_) + alpha * sum); } } else { - *c_ = beta * (*c_) + alpha * sum; + if (beta == 0) { + *c_ = alpha * sum; + } else { + *c_ = beta * (*c_) + alpha * sum; + } } } } diff --git a/src/infiniop/ops/gemm/nvidia/gemm_nvidia.cu b/src/infiniop/ops/gemm/nvidia/gemm_nvidia.cu index 0e0c65f2b..580cca658 100644 --- a/src/infiniop/ops/gemm/nvidia/gemm_nvidia.cu +++ b/src/infiniop/ops/gemm/nvidia/gemm_nvidia.cu @@ -3,6 +3,14 @@ namespace op::gemm::nvidia { +// 添加线程局部控制开关 +thread_local bool g_tf32_enabled = true; + +// 暴露设置函数(非静态,以便外部链接) +void set_tf32_enabled(bool enabled) { + g_tf32_enabled = enabled; +} + struct Descriptor::Opaque { std::shared_ptr internal; }; @@ -71,7 +79,8 @@ infiniStatus_t Descriptor::calculate( #if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) compute_type = CUDA_R_32F; #else - compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; + // compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; + compute_type = g_tf32_enabled ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; #endif break; diff --git a/test/infinicore/ops/adaptive_max_pool1d.py b/test/infinicore/ops/adaptive_max_pool1d.py index 00fb332fd..d8605c18d 100644 --- a/test/infinicore/ops/adaptive_max_pool1d.py +++ b/test/infinicore/ops/adaptive_max_pool1d.py @@ -67,9 +67,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.adaptive_max_pool1d(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.adaptive_max_pool1d(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.nn.functional.adaptive_max_pool1d(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/asinh.py b/test/infinicore/ops/asinh.py index 79452d336..977123df2 100644 --- a/test/infinicore/ops/asinh.py +++ b/test/infinicore/ops/asinh.py @@ -97,9 +97,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.asinh(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.asinh(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.asinh(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/baddbmm.py b/test/infinicore/ops/baddbmm.py index 61bb97864..a5524e2fa 100644 --- a/test/infinicore/ops/baddbmm.py +++ b/test/infinicore/ops/baddbmm.py @@ -99,9 +99,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.baddbmm(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.baddbmm(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.baddbmm(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/bilinear.py b/test/infinicore/ops/bilinear.py index f0c606399..deccc9e01 100644 --- a/test/infinicore/ops/bilinear.py +++ b/test/infinicore/ops/bilinear.py @@ -44,11 +44,17 @@ def parse_test_cases(): in2 = TensorSpec.from_tensor(in2_shape, in2_strides, dtype) weight = TensorSpec.from_tensor(weight_shape, weight_strides, dtype) + inputs = [in1, in2, weight] + if bias_present: + bias_shape = (weight_shape[0],) + bias = TensorSpec.from_tensor(bias_shape, None, dtype) + inputs.append(bias) + kwargs = {} test_cases.append( TestCase( - inputs=[in1, in2, weight], + inputs=inputs, kwargs=kwargs, output_spec=None, comparison_target=None, @@ -72,9 +78,10 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.bilinear(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.bilinear(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + from infinicore.ops.bilinear import bilinear + + return bilinear(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/fmod.py b/test/infinicore/ops/fmod.py index 66bdee38d..8543e0f5b 100644 --- a/test/infinicore/ops/fmod.py +++ b/test/infinicore/ops/fmod.py @@ -103,9 +103,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.fmod(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.fmod(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.fmod(*args, **kwargs) def main(): From 6a0503fc966f788d718c6be5a36b8d12bb64c459 Mon Sep 17 00:00:00 2001 From: PanZezhong Date: Fri, 6 Mar 2026 06:52:49 +0000 Subject: [PATCH 2/6] issue/1031 fix xamke cpp api link --- xmake.lua | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xmake.lua b/xmake.lua index dc17cdfa3..ca2712ff6 100644 --- a/xmake.lua +++ b/xmake.lua @@ -452,8 +452,7 @@ target("infinicore_cpp_api") set_kind("shared") add_deps("infiniop", "infinirt", "infiniccl") set_languages("cxx17") - set_symbols("visibility") - + set_policy("build.optimization.lto", false) local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini") add_includedirs("include") From 2904ec00cc8faf1b528b3b78901ecaff40909dc1 Mon Sep 17 00:00:00 2001 From: PanZezhong Date: Fri, 6 Mar 2026 07:07:13 +0000 Subject: [PATCH 3/6] issue/1031 fix T1-1-9 format --- include/infiniop/ops/adaptive_max_pool1d.h | 10 +++---- include/infiniop/ops/asinh.h | 24 ++++++++-------- include/infiniop/ops/fmod.h | 28 +++++++++---------- python/infinicore/__init__.py | 8 +++--- .../nn/functional/adaptive_max_pool1d.py | 2 -- .../ops/adaptive_max_pool1d/operator.cc | 8 +++--- src/infiniop/ops/asinh/operator.cc | 18 ++++++------ src/infiniop/ops/fmod/operator.cc | 8 +++--- 8 files changed, 52 insertions(+), 54 deletions(-) diff --git a/include/infiniop/ops/adaptive_max_pool1d.h b/include/infiniop/ops/adaptive_max_pool1d.h index 484413e21..67876d488 100644 --- a/include/infiniop/ops/adaptive_max_pool1d.h +++ b/include/infiniop/ops/adaptive_max_pool1d.h @@ -5,18 +5,18 @@ typedef struct InfiniopDescriptor *infiniopAdaptiveMaxPool1dDescriptor_t; -__C __export infiniStatus_t infiniopCreateAdaptiveMaxPool1dDescriptor( +__INFINI_C __export infiniStatus_t infiniopCreateAdaptiveMaxPool1dDescriptor( infiniopHandle_t handle, infiniopAdaptiveMaxPool1dDescriptor_t *desc, infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, size_t output_size); -__C __export infiniStatus_t infiniopGetAdaptiveMaxPool1dWorkspaceSize(infiniopAdaptiveMaxPool1dDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetAdaptiveMaxPool1dWorkspaceSize(infiniopAdaptiveMaxPool1dDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopAdaptiveMaxPool1d(infiniopAdaptiveMaxPool1dDescriptor_t desc, void *workspace, size_t workspace_size, - void *y, const void *x, void *stream); +__INFINI_C __export infiniStatus_t infiniopAdaptiveMaxPool1d(infiniopAdaptiveMaxPool1dDescriptor_t desc, void *workspace, size_t workspace_size, + void *y, const void *x, void *stream); -__C __export infiniStatus_t infiniopDestroyAdaptiveMaxPool1dDescriptor(infiniopAdaptiveMaxPool1dDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyAdaptiveMaxPool1dDescriptor(infiniopAdaptiveMaxPool1dDescriptor_t desc); #endif \ No newline at end of file diff --git a/include/infiniop/ops/asinh.h b/include/infiniop/ops/asinh.h index 98cfa3a0e..99bf0363e 100644 --- a/include/infiniop/ops/asinh.h +++ b/include/infiniop/ops/asinh.h @@ -5,20 +5,20 @@ typedef struct InfiniopDescriptor *infiniopAsinhDescriptor_t; -__C __export infiniStatus_t infiniopCreateAsinhDescriptor(infiniopHandle_t handle, - infiniopAsinhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); +__INFINI_C __export infiniStatus_t infiniopCreateAsinhDescriptor(infiniopHandle_t handle, + infiniopAsinhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); -__C __export infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopAsinh(infiniopAsinhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); +__INFINI_C __export infiniStatus_t infiniopAsinh(infiniopAsinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); -__C __export infiniStatus_t infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc); #endif \ No newline at end of file diff --git a/include/infiniop/ops/fmod.h b/include/infiniop/ops/fmod.h index f2a64ecf9..ec989e38e 100644 --- a/include/infiniop/ops/fmod.h +++ b/include/infiniop/ops/fmod.h @@ -5,22 +5,22 @@ typedef struct InfiniopDescriptor *infiniopFmodDescriptor_t; -__C __export infiniStatus_t infiniopCreateFmodDescriptor(infiniopHandle_t handle, - infiniopFmodDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); +__INFINI_C __export infiniStatus_t infiniopCreateFmodDescriptor(infiniopHandle_t handle, + infiniopFmodDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); -__C __export infiniStatus_t infiniopGetFmodWorkspaceSize(infiniopFmodDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetFmodWorkspaceSize(infiniopFmodDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopFmod(infiniopFmodDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); +__INFINI_C __export infiniStatus_t infiniopFmod(infiniopFmodDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); -__C __export infiniStatus_t infiniopDestroyFmodDescriptor(infiniopFmodDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyFmodDescriptor(infiniopFmodDescriptor_t desc); #endif \ No newline at end of file diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index 5765db719..46249178d 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -49,9 +49,12 @@ ) from infinicore.ops.add import add from infinicore.ops.add_rms_norm import add_rms_norm +from infinicore.ops.asinh import asinh from infinicore.ops.attention import attention +from infinicore.ops.baddbmm import baddbmm +from infinicore.ops.bilinear import bilinear +from infinicore.ops.fmod import fmod from infinicore.ops.kv_caching import kv_caching -from infinicore.ops.asinh import asinh from infinicore.ops.matmul import matmul from infinicore.ops.mha_varlen import mha_varlen from infinicore.ops.mul import mul @@ -62,9 +65,6 @@ from infinicore.ops.rearrange import rearrange from infinicore.ops.squeeze import squeeze from infinicore.ops.unsqueeze import unsqueeze -from infinicore.ops.baddbmm import baddbmm -from infinicore.ops.bilinear import bilinear -from infinicore.ops.fmod import fmod from infinicore.tensor import ( Tensor, empty, diff --git a/python/infinicore/nn/functional/adaptive_max_pool1d.py b/python/infinicore/nn/functional/adaptive_max_pool1d.py index 74a8c56e9..8271a231f 100644 --- a/python/infinicore/nn/functional/adaptive_max_pool1d.py +++ b/python/infinicore/nn/functional/adaptive_max_pool1d.py @@ -1,5 +1,3 @@ -from typing import List - from infinicore.lib import _infinicore from infinicore.tensor import Tensor diff --git a/src/infiniop/ops/adaptive_max_pool1d/operator.cc b/src/infiniop/ops/adaptive_max_pool1d/operator.cc index 7048a1033..e538829d9 100644 --- a/src/infiniop/ops/adaptive_max_pool1d/operator.cc +++ b/src/infiniop/ops/adaptive_max_pool1d/operator.cc @@ -15,7 +15,7 @@ #include "moore/adaptive_max_pool1d_moore.h" #endif -__C infiniStatus_t infiniopCreateAdaptiveMaxPool1dDescriptor( +__INFINI_C infiniStatus_t infiniopCreateAdaptiveMaxPool1dDescriptor( infiniopHandle_t handle, infiniopAdaptiveMaxPool1dDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc, @@ -53,7 +53,7 @@ __C infiniStatus_t infiniopCreateAdaptiveMaxPool1dDescriptor( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopGetAdaptiveMaxPool1dWorkspaceSize( +__INFINI_C infiniStatus_t infiniopGetAdaptiveMaxPool1dWorkspaceSize( infiniopAdaptiveMaxPool1dDescriptor_t desc, size_t *size) { #define GET(CASE, NAMESPACE) \ @@ -83,7 +83,7 @@ __C infiniStatus_t infiniopGetAdaptiveMaxPool1dWorkspaceSize( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopAdaptiveMaxPool1d( +__INFINI_C infiniStatus_t infiniopAdaptiveMaxPool1d( infiniopAdaptiveMaxPool1dDescriptor_t desc, void *workspace, size_t workspace_size, @@ -117,7 +117,7 @@ __C infiniStatus_t infiniopAdaptiveMaxPool1d( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopDestroyAdaptiveMaxPool1dDescriptor( +__INFINI_C infiniStatus_t infiniopDestroyAdaptiveMaxPool1dDescriptor( infiniopAdaptiveMaxPool1dDescriptor_t desc) { #define DESTROY(CASE, NAMESPACE) \ case CASE: \ diff --git a/src/infiniop/ops/asinh/operator.cc b/src/infiniop/ops/asinh/operator.cc index 5c48902c7..63d818c76 100644 --- a/src/infiniop/ops/asinh/operator.cc +++ b/src/infiniop/ops/asinh/operator.cc @@ -15,7 +15,7 @@ #include "moore/asinh_moore.h" #endif -__C infiniStatus_t infiniopCreateAsinhDescriptor( +__INFINI_C infiniStatus_t infiniopCreateAsinhDescriptor( infiniopHandle_t handle, infiniopAsinhDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc, @@ -50,7 +50,7 @@ __C infiniStatus_t infiniopCreateAsinhDescriptor( #undef CREATE } -__C infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size) { #define GET(CASE, NAMESPACE) \ case CASE: \ @@ -80,12 +80,12 @@ __C infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopAsinh(infiniopAsinhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { +__INFINI_C infiniStatus_t infiniopAsinh(infiniopAsinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { #define CALCULATE(CASE, NAMESPACE) \ case CASE: \ return reinterpret_cast(desc) \ @@ -112,7 +112,7 @@ __C infiniStatus_t infiniopAsinh(infiniopAsinhDescriptor_t desc, #undef CALCULATE } -__C infiniStatus_t infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc) { +__INFINI_C infiniStatus_t infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc) { #define DESTROY(CASE, NAMESPACE) \ case CASE: \ delete reinterpret_cast(desc); \ diff --git a/src/infiniop/ops/fmod/operator.cc b/src/infiniop/ops/fmod/operator.cc index 1fd433c4a..da934f6be 100644 --- a/src/infiniop/ops/fmod/operator.cc +++ b/src/infiniop/ops/fmod/operator.cc @@ -15,7 +15,7 @@ #include "moore/fmod_moore.h" #endif -__C infiniStatus_t infiniopCreateFmodDescriptor( +__INFINI_C infiniStatus_t infiniopCreateFmodDescriptor( infiniopHandle_t handle, infiniopFmodDescriptor_t *desc_ptr, infiniopTensorDescriptor_t c_desc, @@ -54,7 +54,7 @@ __C infiniStatus_t infiniopCreateFmodDescriptor( #undef CREATE } -__C infiniStatus_t infiniopGetFmodWorkspaceSize(infiniopFmodDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetFmodWorkspaceSize(infiniopFmodDescriptor_t desc, size_t *size) { #define GET(CASE, NAMESPACE) \ case CASE: \ @@ -84,7 +84,7 @@ __C infiniStatus_t infiniopGetFmodWorkspaceSize(infiniopFmodDescriptor_t desc, s return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopFmod( +__INFINI_C infiniStatus_t infiniopFmod( infiniopFmodDescriptor_t desc, void *workspace, size_t workspace_size, @@ -119,7 +119,7 @@ __C infiniStatus_t infiniopFmod( #undef CALCULATE } -__C infiniStatus_t infiniopDestroyFmodDescriptor(infiniopFmodDescriptor_t desc) { +__INFINI_C infiniStatus_t infiniopDestroyFmodDescriptor(infiniopFmodDescriptor_t desc) { #define GET(CASE, NAMESPACE) \ case CASE: \ From af0a14757635ec99f6defd9045aba9c7915822d8 Mon Sep 17 00:00:00 2001 From: PanZezhong Date: Fri, 6 Mar 2026 07:58:31 +0000 Subject: [PATCH 4/6] issue/1031 fix T1-1-9 compile --- .../cpu/adaptive_max_pool1d_cpu.cc | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.cc b/src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.cc index 69edf83bc..272ef541f 100644 --- a/src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.cc +++ b/src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.cc @@ -52,16 +52,12 @@ infiniStatus_t adaptiveMaxPool1d(const AdaptiveMaxPool1dInfo *info, T *y, const } for (size_t out_idx = 0; out_idx < output_length; ++out_idx) { - // 计算池化窗口范围 [start_index, end_index) - // 公式参考 PyTorch: - // start = floor(out_idx * L_in / L_out) - // end = ceil((out_idx + 1) * L_in / L_out) - int start_index = std::floor((float)out_idx * input_length / output_length); - int end_index = std::ceil((float)(out_idx + 1) * input_length / output_length); - - start_index = std::max(start_index, 0); - end_index = std::min(end_index, (int)input_length); - int window_len = end_index - start_index; + size_t start_index = (out_idx * input_length) / output_length; + size_t end_index = ((out_idx + 1) * input_length + output_length - 1) / output_length; + + start_index = std::max(start_index, size_t(0)); + end_index = std::min(end_index, input_length); + size_t window_len = end_index - start_index; if (window_len <= 0) { continue; From d6af9c90f81a949ff5dc72bc19bdfd58a789ce83 Mon Sep 17 00:00:00 2001 From: PanZezhong1725 <141193946+PanZezhong1725@users.noreply.github.com> Date: Mon, 9 Mar 2026 11:16:21 +0800 Subject: [PATCH 5/6] issue/1031 T1-1-17 --- include/infinicore/ops.hpp | 4 + include/infinicore/ops/avg_pool1d.hpp | 18 ++ include/infinicore/ops/cross_entropy.hpp | 35 +++ include/infinicore/ops/equal.hpp | 19 ++ include/infinicore/ops/hardswish.hpp | 18 ++ include/infinicore/ops/hardtanh.hpp | 18 ++ include/infiniop.h | 6 + include/infiniop/ops/avg_pool1d.h | 32 +++ include/infiniop/ops/cross_entropy.h | 31 +++ include/infiniop/ops/equal.h | 31 +++ include/infiniop/ops/hardswish.h | 29 +++ include/infiniop/ops/hardtanh.h | 27 +++ python/infinicore/__init__.py | 4 + python/infinicore/nn/functional/__init__.py | 6 + python/infinicore/nn/functional/avg_pool1d.py | 24 ++ python/infinicore/nn/functional/hardswish.py | 28 +++ python/infinicore/nn/functional/hardtanh.py | 46 ++++ python/infinicore/ops/cross_entropy.py | 33 +++ python/infinicore/ops/equal.py | 10 + python/infinicore/utils.py | 17 +- scripts/python_test.py | 9 +- src/infinicore/ops/avg_pool1d/avg_pool1d.cc | 68 ++++++ .../ops/avg_pool1d/avg_pool1d_infiniop.cc | 69 ++++++ .../ops/cross_entropy/cross_entropy.cc | 45 ++++ .../cross_entropy/cross_entropy_infiniop.cc | 64 +++++ src/infinicore/ops/equal/equal.cc | 31 +++ src/infinicore/ops/equal/equal_infiniop.cc | 57 +++++ src/infinicore/ops/hardswish/hardswish.cc | 38 +++ .../ops/hardswish/hardswish_infiniop.cc | 61 +++++ src/infinicore/ops/hardtanh/hardtanh.cc | 38 +++ .../ops/hardtanh/hardtanh_infiniop.cc | 63 +++++ src/infinicore/pybind11/ops.hpp | 10 + src/infinicore/pybind11/ops/avg_pool1d.hpp | 37 +++ src/infinicore/pybind11/ops/cross_entropy.hpp | 26 ++ src/infinicore/pybind11/ops/equal.hpp | 26 ++ src/infinicore/pybind11/ops/hardswish.hpp | 24 ++ src/infinicore/pybind11/ops/hardtanh.hpp | 28 +++ src/infiniop/ops/avg_pool1d/avg_pool1d.h | 103 ++++++++ .../ops/avg_pool1d/cpu/avg_pool1d_cpu.cc | 96 ++++++++ .../ops/avg_pool1d/cpu/avg_pool1d_cpu.h | 8 + src/infiniop/ops/avg_pool1d/cuda/kernel.cuh | 58 +++++ .../ops/avg_pool1d/metax/avg_pool1d_metax.h | 8 + .../avg_pool1d/metax/avg_pool1d_metax.maca | 170 +++++++++++++ .../ops/avg_pool1d/moore/avg_pool1d_kernel.h | 72 ++++++ .../ops/avg_pool1d/moore/avg_pool1d_moore.h | 8 + .../ops/avg_pool1d/moore/avg_pool1d_moore.mu | 135 +++++++++++ .../avg_pool1d/nvidia/avg_pool1d_nvidia.cu | 126 ++++++++++ .../avg_pool1d/nvidia/avg_pool1d_nvidia.cuh | 8 + src/infiniop/ops/avg_pool1d/operator.cc | 225 ++++++++++++++++++ .../cross_entropy/cpu/cross_entropy_cpu.cc | 99 ++++++++ .../ops/cross_entropy/cpu/cross_entropy_cpu.h | 8 + .../ops/cross_entropy/cross_entropy.h | 42 ++++ .../ops/cross_entropy/cuda/kernel.cuh | 80 +++++++ src/infiniop/ops/cross_entropy/info.h | 17 ++ .../cross_entropy/metax/cross_entropy_metax.h | 8 + .../metax/cross_entropy_metax.maca | 188 +++++++++++++++ .../moore/cross_entropy_kernel.h | 53 +++++ .../cross_entropy/moore/cross_entropy_moore.h | 8 + .../moore/cross_entropy_moore.mu | 129 ++++++++++ .../nvidia/cross_entropy_nvidia.cu | 107 +++++++++ .../nvidia/cross_entropy_nvidia.cuh | 8 + src/infiniop/ops/cross_entropy/operator.cc | 174 ++++++++++++++ src/infiniop/ops/equal/cpu/equal_cpu.cc | 68 ++++++ src/infiniop/ops/equal/cpu/equal_cpu.h | 28 +++ src/infiniop/ops/equal/cuda/kernel.cuh | 37 +++ src/infiniop/ops/equal/metax/equal_metax.h | 8 + src/infiniop/ops/equal/metax/equal_metax.maca | 69 ++++++ src/infiniop/ops/equal/moore/equal_moore.h | 8 + src/infiniop/ops/equal/moore/equal_moore.mu | 140 +++++++++++ .../ops/equal/moore/equal_moore_kernel.h | 30 +++ src/infiniop/ops/equal/nvidia/equal_nvidia.cu | 137 +++++++++++ .../ops/equal/nvidia/equal_nvidia.cuh | 8 + src/infiniop/ops/equal/operator.cc | 201 ++++++++++++++++ .../ops/hardswish/cpu/hardswish_cpu.cc | 91 +++++++ .../ops/hardswish/cpu/hardswish_cpu.h | 50 ++++ src/infiniop/ops/hardswish/cuda/kernel.cuh | 86 +++++++ .../ops/hardswish/metax/hardswish_metax.h | 8 + .../ops/hardswish/metax/hardswish_metax.maca | 58 +++++ .../ops/hardswish/moore/hardswish_moore.h | 8 + .../ops/hardswish/moore/hardswish_moore.mu | 118 +++++++++ .../hardswish/moore/hardswish_moore_kernel.h | 39 +++ .../ops/hardswish/nvidia/hardswish_nvidia.cu | 115 +++++++++ .../ops/hardswish/nvidia/hardswish_nvidia.cuh | 8 + src/infiniop/ops/hardswish/operator.cc | 157 ++++++++++++ src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc | 124 ++++++++++ src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h | 63 +++++ src/infiniop/ops/hardtanh/cuda/kernel.cuh | 51 ++++ .../ops/hardtanh/metax/hardtanh_metax.h | 48 ++++ .../ops/hardtanh/metax/hardtanh_metax.maca | 95 ++++++++ .../ops/hardtanh/moore/hardtanh_moore.h | 51 ++++ .../ops/hardtanh/moore/hardtanh_moore.mu | 158 ++++++++++++ .../hardtanh/moore/hardtanh_moore_kernel.h | 34 +++ .../ops/hardtanh/nvidia/hardtanh_nvidia.cu | 150 ++++++++++++ .../ops/hardtanh/nvidia/hardtanh_nvidia.cuh | 51 ++++ src/infiniop/ops/hardtanh/operator.cc | 161 +++++++++++++ src/utils/custom_types.h | 16 ++ test/infinicore/ops/avg_pool1d.py | 5 +- test/infinicore/ops/cross_entropy.py | 19 +- test/infinicore/ops/equal.py | 15 +- test/infinicore/ops/hardswish.py | 5 +- test/infinicore/ops/hardtanh.py | 9 +- test/infiniop/avg_pool1d.py | 183 ++++++++++++++ test/infiniop/cross_entropy.py | 106 +++++++++ test/infiniop/equal.py | 181 ++++++++++++++ test/infiniop/hardswish.py | 171 +++++++++++++ test/infiniop/hardtanh.py | 169 +++++++++++++ test/infiniop/libinfiniop/op_register.py | 188 +++++++++++++++ test/infiniop/libinfiniop/utils.py | 8 +- 108 files changed, 6572 insertions(+), 29 deletions(-) create mode 100644 include/infinicore/ops/avg_pool1d.hpp create mode 100644 include/infinicore/ops/cross_entropy.hpp create mode 100644 include/infinicore/ops/equal.hpp create mode 100644 include/infinicore/ops/hardswish.hpp create mode 100644 include/infinicore/ops/hardtanh.hpp create mode 100644 include/infiniop/ops/avg_pool1d.h create mode 100644 include/infiniop/ops/cross_entropy.h create mode 100644 include/infiniop/ops/equal.h create mode 100644 include/infiniop/ops/hardswish.h create mode 100644 include/infiniop/ops/hardtanh.h create mode 100644 python/infinicore/nn/functional/avg_pool1d.py create mode 100644 python/infinicore/nn/functional/hardswish.py create mode 100644 python/infinicore/nn/functional/hardtanh.py create mode 100644 python/infinicore/ops/cross_entropy.py create mode 100644 python/infinicore/ops/equal.py create mode 100644 src/infinicore/ops/avg_pool1d/avg_pool1d.cc create mode 100644 src/infinicore/ops/avg_pool1d/avg_pool1d_infiniop.cc create mode 100644 src/infinicore/ops/cross_entropy/cross_entropy.cc create mode 100644 src/infinicore/ops/cross_entropy/cross_entropy_infiniop.cc create mode 100644 src/infinicore/ops/equal/equal.cc create mode 100644 src/infinicore/ops/equal/equal_infiniop.cc create mode 100644 src/infinicore/ops/hardswish/hardswish.cc create mode 100644 src/infinicore/ops/hardswish/hardswish_infiniop.cc create mode 100644 src/infinicore/ops/hardtanh/hardtanh.cc create mode 100644 src/infinicore/ops/hardtanh/hardtanh_infiniop.cc create mode 100644 src/infinicore/pybind11/ops/avg_pool1d.hpp create mode 100644 src/infinicore/pybind11/ops/cross_entropy.hpp create mode 100644 src/infinicore/pybind11/ops/equal.hpp create mode 100644 src/infinicore/pybind11/ops/hardswish.hpp create mode 100644 src/infinicore/pybind11/ops/hardtanh.hpp create mode 100644 src/infiniop/ops/avg_pool1d/avg_pool1d.h create mode 100644 src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.cc create mode 100644 src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h create mode 100644 src/infiniop/ops/avg_pool1d/cuda/kernel.cuh create mode 100644 src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h create mode 100644 src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca create mode 100644 src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h create mode 100644 src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h create mode 100644 src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu create mode 100644 src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu create mode 100644 src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh create mode 100644 src/infiniop/ops/avg_pool1d/operator.cc create mode 100644 src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc create mode 100644 src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h create mode 100644 src/infiniop/ops/cross_entropy/cross_entropy.h create mode 100644 src/infiniop/ops/cross_entropy/cuda/kernel.cuh create mode 100644 src/infiniop/ops/cross_entropy/info.h create mode 100644 src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h create mode 100644 src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca create mode 100644 src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h create mode 100644 src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h create mode 100644 src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu create mode 100644 src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu create mode 100644 src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh create mode 100644 src/infiniop/ops/cross_entropy/operator.cc create mode 100644 src/infiniop/ops/equal/cpu/equal_cpu.cc create mode 100644 src/infiniop/ops/equal/cpu/equal_cpu.h create mode 100644 src/infiniop/ops/equal/cuda/kernel.cuh create mode 100644 src/infiniop/ops/equal/metax/equal_metax.h create mode 100644 src/infiniop/ops/equal/metax/equal_metax.maca create mode 100644 src/infiniop/ops/equal/moore/equal_moore.h create mode 100644 src/infiniop/ops/equal/moore/equal_moore.mu create mode 100644 src/infiniop/ops/equal/moore/equal_moore_kernel.h create mode 100644 src/infiniop/ops/equal/nvidia/equal_nvidia.cu create mode 100644 src/infiniop/ops/equal/nvidia/equal_nvidia.cuh create mode 100644 src/infiniop/ops/equal/operator.cc create mode 100644 src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc create mode 100644 src/infiniop/ops/hardswish/cpu/hardswish_cpu.h create mode 100644 src/infiniop/ops/hardswish/cuda/kernel.cuh create mode 100644 src/infiniop/ops/hardswish/metax/hardswish_metax.h create mode 100644 src/infiniop/ops/hardswish/metax/hardswish_metax.maca create mode 100644 src/infiniop/ops/hardswish/moore/hardswish_moore.h create mode 100644 src/infiniop/ops/hardswish/moore/hardswish_moore.mu create mode 100644 src/infiniop/ops/hardswish/moore/hardswish_moore_kernel.h create mode 100644 src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu create mode 100644 src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh create mode 100644 src/infiniop/ops/hardswish/operator.cc create mode 100644 src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc create mode 100644 src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h create mode 100644 src/infiniop/ops/hardtanh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/hardtanh/metax/hardtanh_metax.h create mode 100644 src/infiniop/ops/hardtanh/metax/hardtanh_metax.maca create mode 100644 src/infiniop/ops/hardtanh/moore/hardtanh_moore.h create mode 100644 src/infiniop/ops/hardtanh/moore/hardtanh_moore.mu create mode 100644 src/infiniop/ops/hardtanh/moore/hardtanh_moore_kernel.h create mode 100644 src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cu create mode 100644 src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cuh create mode 100644 src/infiniop/ops/hardtanh/operator.cc create mode 100644 test/infiniop/avg_pool1d.py create mode 100644 test/infiniop/cross_entropy.py create mode 100644 test/infiniop/equal.py create mode 100644 test/infiniop/hardswish.py create mode 100644 test/infiniop/hardtanh.py diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp index 5274dde48..e3ca88797 100644 --- a/include/infinicore/ops.hpp +++ b/include/infinicore/ops.hpp @@ -5,12 +5,16 @@ #include "ops/add_rms_norm.hpp" #include "ops/asinh.hpp" #include "ops/attention.hpp" +#include "ops/avg_pool1d.hpp" #include "ops/baddbmm.hpp" #include "ops/bilinear.hpp" #include "ops/causal_softmax.hpp" +#include "ops/cross_entropy.hpp" #include "ops/embedding.hpp" #include "ops/flash_attention.hpp" #include "ops/fmod.hpp" +#include "ops/hardswish.hpp" +#include "ops/hardtanh.hpp" #include "ops/kv_caching.hpp" #include "ops/matmul.hpp" #include "ops/ones.hpp" diff --git a/include/infinicore/ops/avg_pool1d.hpp b/include/infinicore/ops/avg_pool1d.hpp new file mode 100644 index 000000000..4bf69bc2a --- /dev/null +++ b/include/infinicore/ops/avg_pool1d.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class AvgPool1d { +public: + using schema = void (*)(Tensor, Tensor, size_t, size_t, size_t); + static void execute(Tensor output, Tensor input, size_t kernel_size, size_t stride, size_t padding); + static common::OpDispatcher &dispatcher(); +}; + +Tensor avg_pool1d(Tensor input, size_t kernel_size, size_t stride = 0, size_t padding = 0); +void avg_pool1d_(Tensor output, Tensor input, size_t kernel_size, size_t stride = 0, size_t padding = 0); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/cross_entropy.hpp b/include/infinicore/ops/cross_entropy.hpp new file mode 100644 index 000000000..958ee1089 --- /dev/null +++ b/include/infinicore/ops/cross_entropy.hpp @@ -0,0 +1,35 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class CrossEntropy { +public: + // Schema 定义:函数指针类型 + // CrossEntropy 需要接收三个 Tensor: Output (Loss), Input (Logits), Target (Labels) + using schema = void (*)(Tensor, Tensor, Tensor); + + // 执行入口 + static void execute(Tensor output, Tensor input, Tensor target); + + // 分发器访问接口 + static common::OpDispatcher &dispatcher(); +}; + +// ================================================================== +// 对外 Functional API +// ================================================================== + +// 1. Out-of-place 接口: +// 输入 Logits 和 Target,内部自动创建 Output Tensor 并返回 +Tensor cross_entropy(Tensor input, Tensor target); + +// 2. Explicit Output 接口 (类似于 In-place 风格): +// 用户显式提供 Output Tensor 用于存储结果 +// 注意:虽然命名带有下划线 _,但通常 CrossEntropy 无法真正原地修改 input, +// 所以这里只是表示“写入指定的 output 内存” +void cross_entropy_(Tensor output, Tensor input, Tensor target); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/equal.hpp b/include/infinicore/ops/equal.hpp new file mode 100644 index 000000000..1a158bf1e --- /dev/null +++ b/include/infinicore/ops/equal.hpp @@ -0,0 +1,19 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Equal { +public: + using schema = void (*)(Tensor, Tensor, Tensor); + + static void execute(Tensor out, Tensor a, Tensor b); + static common::OpDispatcher &dispatcher(); +}; + +Tensor equal(Tensor a, Tensor b); +void equal_(Tensor out, Tensor a, Tensor b); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/hardswish.hpp b/include/infinicore/ops/hardswish.hpp new file mode 100644 index 000000000..15313f461 --- /dev/null +++ b/include/infinicore/ops/hardswish.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Hardswish { +public: + using schema = void (*)(Tensor, Tensor); + static void execute(Tensor output, Tensor input); + static common::OpDispatcher &dispatcher(); +}; + +Tensor hardswish(Tensor input); +void hardswish_(Tensor output, Tensor input); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/hardtanh.hpp b/include/infinicore/ops/hardtanh.hpp new file mode 100644 index 000000000..511408fee --- /dev/null +++ b/include/infinicore/ops/hardtanh.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class HardTanh { +public: + using schema = void (*)(Tensor, Tensor, float, float); + static void execute(Tensor output, Tensor input, float min_val, float max_val); + static common::OpDispatcher &dispatcher(); +}; + +Tensor hardtanh(Tensor input, float min_val = -1.0f, float max_val = 1.0f); +void hardtanh_(Tensor output, Tensor input, float min_val = -1.0f, float max_val = 1.0f); + +} // namespace infinicore::op diff --git a/include/infiniop.h b/include/infiniop.h index 4217183f7..a73bd20a0 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -45,4 +45,10 @@ #include "infiniop/ops/zeros.h" #include "infiniop/tensor_descriptor.h" +#include "infiniop/ops/cross_entropy.h" +#include "infiniop/ops/hardswish.h" +#include "infiniop/ops/avg_pool1d.h" +#include "infiniop/ops/equal.h" +#include "infiniop/ops/hardtanh.h" + #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/ops/avg_pool1d.h b/include/infiniop/ops/avg_pool1d.h new file mode 100644 index 000000000..5c0bdf6ea --- /dev/null +++ b/include/infiniop/ops/avg_pool1d.h @@ -0,0 +1,32 @@ +#ifndef __INFINIOP_AVG_POOL1D_API_H__ +#define __INFINIOP_AVG_POOL1D_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAvgPool1dDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateAvgPool1dDescriptor( + infiniopHandle_t handle, + infiniopAvgPool1dDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + size_t kernel_size, + size_t stride, + size_t padding); + +__INFINI_C __export infiniStatus_t infiniopGetAvgPool1dWorkspaceSize( + infiniopAvgPool1dDescriptor_t desc, + size_t *size); + +__INFINI_C __export infiniStatus_t infiniopAvgPool1d( + infiniopAvgPool1dDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyAvgPool1dDescriptor( + infiniopAvgPool1dDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/cross_entropy.h b/include/infiniop/ops/cross_entropy.h new file mode 100644 index 000000000..6c9c2a773 --- /dev/null +++ b/include/infiniop/ops/cross_entropy.h @@ -0,0 +1,31 @@ +#ifndef __INFINIOP_CROSS_ENTROPY_API_H__ +#define __INFINIOP_CROSS_ENTROPY_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCrossEntropyDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateCrossEntropyDescriptor( + infiniopHandle_t handle, + infiniopCrossEntropyDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t target_desc); + +__INFINI_C __export infiniStatus_t infiniopGetCrossEntropyWorkspaceSize( + infiniopCrossEntropyDescriptor_t desc, + size_t *size); + +__INFINI_C __export infiniStatus_t infiniopCrossEntropy( + infiniopCrossEntropyDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *target, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyCrossEntropyDescriptor( + infiniopCrossEntropyDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/equal.h b/include/infiniop/ops/equal.h new file mode 100644 index 000000000..5476f754c --- /dev/null +++ b/include/infiniop/ops/equal.h @@ -0,0 +1,31 @@ +#ifndef __INFINIOP_EQUAL_API_H__ +#define __INFINIOP_EQUAL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopEqualDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateEqualDescriptor( + infiniopHandle_t handle, + infiniopEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__INFINI_C __export infiniStatus_t infiniopGetEqualWorkspaceSize( + infiniopEqualDescriptor_t desc, + size_t *size); + +__INFINI_C __export infiniStatus_t infiniopEqual( + infiniopEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyEqualDescriptor( + infiniopEqualDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h new file mode 100644 index 000000000..ba5b43b77 --- /dev/null +++ b/include/infiniop/ops/hardswish.h @@ -0,0 +1,29 @@ +#ifndef __INFINIOP_HARDSWISH_API_H__ +#define __INFINIOP_HARDSWISH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopHardSwishDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateHardSwishDescriptor( + infiniopHandle_t handle, + infiniopHardSwishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__INFINI_C __export infiniStatus_t infiniopGetHardSwishWorkspaceSize( + infiniopHardSwishDescriptor_t desc, + size_t *size); + +__INFINI_C __export infiniStatus_t infiniopHardSwish( + infiniopHardSwishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyHardSwishDescriptor( + infiniopHardSwishDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/hardtanh.h b/include/infiniop/ops/hardtanh.h new file mode 100644 index 000000000..62f6435aa --- /dev/null +++ b/include/infiniop/ops/hardtanh.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_HARDTANH_API_H__ +#define __INFINIOP_HARDTANH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopHardTanhDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateHardTanhDescriptor(infiniopHandle_t handle, + infiniopHardTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + float min_val, + float max_val); + +__INFINI_C __export infiniStatus_t infiniopGetHardTanhWorkspaceSize(infiniopHardTanhDescriptor_t desc, + size_t *size); + +__INFINI_C __export infiniStatus_t infiniopHardTanh(infiniopHardTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyHardTanhDescriptor(infiniopHardTanhDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index 46249178d..6fc3e5c00 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -53,6 +53,8 @@ from infinicore.ops.attention import attention from infinicore.ops.baddbmm import baddbmm from infinicore.ops.bilinear import bilinear +from infinicore.ops.cross_entropy import cross_entropy +from infinicore.ops.equal import equal from infinicore.ops.fmod import fmod from infinicore.ops.kv_caching import kv_caching from infinicore.ops.matmul import matmul @@ -132,11 +134,13 @@ "bilinear", "fmod", "matmul", + "equal", "mul", "narrow", "squeeze", "unsqueeze", "rearrange", + "cross_entropy", "empty", "empty_like", "from_blob", diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index f81b61262..a8da2dfca 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -1,7 +1,10 @@ from .adaptive_max_pool1d import adaptive_max_pool1d +from .avg_pool1d import avg_pool1d from .causal_softmax import causal_softmax from .embedding import embedding from .flash_attention import flash_attention +from .hardswish import hardswish +from .hardtanh import hardtanh from .linear import linear from .linear_w8a8i8 import linear_w8a8i8 from .random_sample import random_sample @@ -22,6 +25,9 @@ "RopeAlgo", "rope", "silu", + "hardswish", + "hardtanh", + "avg_pool1d", "swiglu", "linear_w8a8i8", "silu_and_mul", diff --git a/python/infinicore/nn/functional/avg_pool1d.py b/python/infinicore/nn/functional/avg_pool1d.py new file mode 100644 index 000000000..0cf4759ad --- /dev/null +++ b/python/infinicore/nn/functional/avg_pool1d.py @@ -0,0 +1,24 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def avg_pool1d( + input: Tensor, + kernel_size: int, + stride: int | None = None, + padding: int = 0, + *, + out=None, +) -> Tensor: + if stride is None: + stride = 0 + + if out is None: + return Tensor( + _infinicore.avg_pool1d(input._underlying, kernel_size, stride, padding) + ) + + _infinicore.avg_pool1d_( + out._underlying, input._underlying, kernel_size, stride, padding + ) + return out diff --git a/python/infinicore/nn/functional/hardswish.py b/python/infinicore/nn/functional/hardswish.py new file mode 100644 index 000000000..b054b8978 --- /dev/null +++ b/python/infinicore/nn/functional/hardswish.py @@ -0,0 +1,28 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def hardswish(input: Tensor, inplace: bool = False, *, out=None) -> Tensor: + r"""Apply the Hardswish activation function element-wise.""" + + if ( + infinicore.use_ntops + and input.device.type in ("cuda", "musa") + and out is None + and hasattr(infinicore.ntops.torch, "hardswish") + ): + try: + return infinicore.ntops.torch.hardswish(input, inplace=inplace) + except AttributeError: + pass + + if inplace: + _infinicore.hardswish_(input._underlying, input._underlying) + return input + + if out is None: + return Tensor(_infinicore.hardswish(input._underlying)) + + _infinicore.hardswish_(out._underlying, input._underlying) + return out diff --git a/python/infinicore/nn/functional/hardtanh.py b/python/infinicore/nn/functional/hardtanh.py new file mode 100644 index 000000000..925de33d6 --- /dev/null +++ b/python/infinicore/nn/functional/hardtanh.py @@ -0,0 +1,46 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def hardtanh( + input: Tensor, + min_val: float = -1.0, + max_val: float = 1.0, + inplace: bool = False, + *, + out=None, +) -> Tensor: + """Clamp the input tensor to the range [min_val, max_val].""" + + if min_val > max_val: + raise ValueError("min_val must be less than or equal to max_val") + + if ( + infinicore.use_ntops + and input.device.type in ("cuda", "musa") + and out is None + and hasattr(infinicore.ntops.torch, "hardtanh") + ): + try: + return infinicore.ntops.torch.hardtanh( + input, min_val=min_val, max_val=max_val, inplace=inplace + ) + except AttributeError: + pass + + if inplace: + _infinicore.hardtanh_( + input._underlying, input._underlying, float(min_val), float(max_val) + ) + return input + + if out is None: + return Tensor( + _infinicore.hardtanh(input._underlying, float(min_val), float(max_val)) + ) + + _infinicore.hardtanh_( + out._underlying, input._underlying, float(min_val), float(max_val) + ) + return out diff --git a/python/infinicore/ops/cross_entropy.py b/python/infinicore/ops/cross_entropy.py new file mode 100644 index 000000000..5b47697b5 --- /dev/null +++ b/python/infinicore/ops/cross_entropy.py @@ -0,0 +1,33 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def cross_entropy( + logits, + target, + weight=None, + *, + ignore_index=None, + reduction="none", + out=None, +): + """ + Token-wise cross entropy without reduction. The output tensor has the same + shape as target and uses the logits dtype. + """ + if weight is not None: + raise NotImplementedError("class weights are not supported yet.") + if ignore_index is not None: + raise NotImplementedError("ignore_index is not supported yet.") + if reduction not in (None, "none"): + raise NotImplementedError("Only reduction='none' is implemented.") + + if out is None: + return Tensor(_infinicore.cross_entropy(logits._underlying, target._underlying)) + + _infinicore.cross_entropy_( + out._underlying, + logits._underlying, + target._underlying, + ) + return out diff --git a/python/infinicore/ops/equal.py b/python/infinicore/ops/equal.py new file mode 100644 index 000000000..5a656ab30 --- /dev/null +++ b/python/infinicore/ops/equal.py @@ -0,0 +1,10 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def equal(input, other, *, out=None): + if out is None: + return Tensor(_infinicore.equal(input._underlying, other._underlying)) + + _infinicore.equal_(out._underlying, input._underlying, other._underlying) + return out diff --git a/python/infinicore/utils.py b/python/infinicore/utils.py index 094b2230e..e0019dc89 100644 --- a/python/infinicore/utils.py +++ b/python/infinicore/utils.py @@ -1,9 +1,13 @@ -import ml_dtypes import numpy as np import torch import infinicore +try: + import ml_dtypes +except ModuleNotFoundError: + ml_dtypes = None + def to_torch_dtype(infini_dtype): """Convert infinicore data type to PyTorch data type""" @@ -57,7 +61,9 @@ def numpy_to_infinicore_dtype(numpy_dtype): return infinicore.float64 elif numpy_dtype == np.float16: return infinicore.float16 - elif numpy_dtype == ml_dtypes.bfloat16: + elif hasattr(np, "bfloat16") and numpy_dtype == np.bfloat16: + return infinicore.bfloat16 + elif ml_dtypes is not None and numpy_dtype == ml_dtypes.bfloat16: return infinicore.bfloat16 elif numpy_dtype == np.int8: return infinicore.int8 @@ -86,6 +92,13 @@ def infinicore_to_numpy_dtype(infini_dtype): elif infini_dtype == infinicore.int16: return np.int16 elif infini_dtype == infinicore.bfloat16: + if hasattr(np, "bfloat16"): + return np.bfloat16 + if ml_dtypes is None: + raise ModuleNotFoundError( + "ml_dtypes is required for bfloat16 numpy conversion. " + "Please install ml_dtypes." + ) return ml_dtypes.bfloat16 elif infini_dtype == infinicore.int32: return np.int32 diff --git a/scripts/python_test.py b/scripts/python_test.py index 0bd8bc26d..13b69a013 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -17,12 +17,12 @@ def run_tests(args): "causal_softmax.py", "clip.py", "conv.py", - #"dequantize_awq.py", + # "dequantize_awq.py", "gelu.py", "gemm.py", - #"layer_norm.py", + # "layer_norm.py", "logsoftmax.py", - #"lp_norm.py", + # "lp_norm.py", "mul.py", "ones.py", "random_sample.py", @@ -31,7 +31,7 @@ def run_tests(args): "rms_norm.py", "rope.py", "sigmoid.py", - #"softmax.py", + # "softmax.py", "softplus.py", "sub.py", "swiglu.py", @@ -42,6 +42,7 @@ def run_tests(args): # "paged_attention.py", # "paged_caching.py", # "paged_attention_prefill.py" + "cross_entropy.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infinicore/ops/avg_pool1d/avg_pool1d.cc b/src/infinicore/ops/avg_pool1d/avg_pool1d.cc new file mode 100644 index 000000000..907b25b00 --- /dev/null +++ b/src/infinicore/ops/avg_pool1d/avg_pool1d.cc @@ -0,0 +1,68 @@ +#include "infinicore/ops/avg_pool1d.hpp" + +#include "../../utils.hpp" + +#include + +namespace infinicore::op { + +common::OpDispatcher &AvgPool1d::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void AvgPool1d::execute( + Tensor output, + Tensor input, + size_t kernel_size, + size_t stride, + size_t padding) { + + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); + if (stride == 0) { + stride = kernel_size; + } + + infinicore::context::setDevice(output->device()); + auto device_type = output->device().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error( + "No AvgPool1d implementation for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input, kernel_size, stride, padding); +} + +Tensor avg_pool1d(Tensor input, size_t kernel_size, size_t stride, size_t padding) { + if (stride == 0) { + stride = kernel_size; + } + + const auto &shape = input->shape(); + if (shape.size() != 3) { + throw std::runtime_error("AvgPool1d expects tensors with shape [N, C, L]"); + } + + const size_t n = shape[0]; + const size_t c = shape[1]; + const size_t l_in = shape[2]; + + if (l_in + 2 * padding < kernel_size) { + throw std::runtime_error("AvgPool1d kernel_size is larger than padded length"); + } + + const size_t out_width = (l_in + 2 * padding - kernel_size) / stride + 1; + + Shape out_shape = {n, c, out_width}; + auto output = Tensor::empty(out_shape, input->dtype(), input->device()); + avg_pool1d_(output, input, kernel_size, stride, padding); + return output; +} + +void avg_pool1d_(Tensor output, Tensor input, size_t kernel_size, size_t stride, size_t padding) { + AvgPool1d::execute(output, input, kernel_size, stride, padding); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/avg_pool1d/avg_pool1d_infiniop.cc b/src/infinicore/ops/avg_pool1d/avg_pool1d_infiniop.cc new file mode 100644 index 000000000..df7ebda8d --- /dev/null +++ b/src/infinicore/ops/avg_pool1d/avg_pool1d_infiniop.cc @@ -0,0 +1,69 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/avg_pool1d.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::avg_pool1d_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopAvgPool1dDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyAvgPool1dDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate( + Tensor output, + Tensor input, + size_t kernel_size, + size_t stride, + size_t padding) { + + if (stride == 0) { + stride = kernel_size; + } + + size_t seed = hash_combine(output, input, kernel_size, stride, padding); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + auto desc_opt = cache.get(seed); + infiniopAvgPool1dDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateAvgPool1dDescriptor( + context::getInfiniopHandle(device), + &desc, + output->desc(), + input->desc(), + kernel_size, + stride, + padding)); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetAvgPool1dWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopAvgPool1d( + desc, + workspace->data(), + workspace_size, + output->data(), + input->data(), + context::getStream())); +} + +static bool registered = []() { + AvgPool1d::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::avg_pool1d_impl::infiniop diff --git a/src/infinicore/ops/cross_entropy/cross_entropy.cc b/src/infinicore/ops/cross_entropy/cross_entropy.cc new file mode 100644 index 000000000..9804d5377 --- /dev/null +++ b/src/infinicore/ops/cross_entropy/cross_entropy.cc @@ -0,0 +1,45 @@ +#include "infinicore/ops/cross_entropy.hpp" + +#include "../../utils.hpp" + +#include + +namespace infinicore::op { + +common::OpDispatcher &CrossEntropy::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void CrossEntropy::execute(Tensor output, Tensor input, Tensor target) { + + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(input, target); + + infinicore::context::setDevice(output->device()); + auto device_type = output->device().getType(); + + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No CrossEntropy implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input, target); +} + +Tensor cross_entropy(Tensor input, Tensor target) { + + Shape shape = target->shape(); + + auto output = Tensor::empty(shape, input->dtype(), input->device()); + + cross_entropy_(output, input, target); + return output; +} + +void cross_entropy_(Tensor output, Tensor input, Tensor target) { + CrossEntropy::execute(output, input, target); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/cross_entropy/cross_entropy_infiniop.cc b/src/infinicore/ops/cross_entropy/cross_entropy_infiniop.cc new file mode 100644 index 000000000..d02f16da6 --- /dev/null +++ b/src/infinicore/ops/cross_entropy/cross_entropy_infiniop.cc @@ -0,0 +1,64 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" + +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/cross_entropy.hpp" + +#include + +namespace infinicore::op::cross_entropy_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopCrossEntropyDescriptor_t &desc) { + if (desc != nullptr) { + + INFINICORE_CHECK_ERROR(infiniopDestroyCrossEntropyDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input, Tensor target) { + + size_t seed = hash_combine(output, input, target); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + auto desc_opt = cache.get(seed); + infiniopCrossEntropyDescriptor_t desc = nullptr; + + if (!desc_opt) { + + INFINICORE_CHECK_ERROR(infiniopCreateCrossEntropyDescriptor( + context::getInfiniopHandle(device), + &desc, + output->desc(), + input->desc(), + target->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetCrossEntropyWorkspaceSize(desc, &workspace_size)); + + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopCrossEntropy( + desc, + workspace->data(), + workspace_size, + output->data(), + input->data(), + target->data(), + context::getStream())); +} + +static bool registered = []() { + CrossEntropy::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::cross_entropy_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/equal/equal.cc b/src/infinicore/ops/equal/equal.cc new file mode 100644 index 000000000..b6acc4d25 --- /dev/null +++ b/src/infinicore/ops/equal/equal.cc @@ -0,0 +1,31 @@ +#include "infinicore/ops/equal.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Equal::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Equal::execute(Tensor out, Tensor a, Tensor b) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, a, b); + infinicore::context::setDevice(out->device()); + dispatcher().lookup(out->device().getType())(out, a, b); +} + +Tensor equal(Tensor a, Tensor b) { + auto out = Tensor::empty(a->shape(), DataType::BOOL, a->device()); + equal_(out, a, b); + return out; +} + +void equal_(Tensor out, Tensor a, Tensor b) { + if (out->dtype() != DataType::BOOL) { + throw std::runtime_error("Equal expects bool output tensor."); + } + Equal::execute(out, a, b); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/equal/equal_infiniop.cc b/src/infinicore/ops/equal/equal_infiniop.cc new file mode 100644 index 000000000..1b4e4cffa --- /dev/null +++ b/src/infinicore/ops/equal/equal_infiniop.cc @@ -0,0 +1,57 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/equal.hpp" +#include + +namespace infinicore::op::equal_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopEqualDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyEqualDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor out, Tensor a, Tensor b) { + size_t seed = hash_combine(out, a, b); + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + infiniopEqualDescriptor_t desc = nullptr; + if (auto cached = cache.get(seed)) { + desc = *cached; + } else { + INFINICORE_CHECK_ERROR(infiniopCreateEqualDescriptor( + context::getInfiniopHandle(device), &desc, + out->desc(), a->desc(), b->desc())); + cache.put(seed, desc); + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetEqualWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace; + void *workspace_ptr = nullptr; + if (workspace_size != 0) { + workspace = context::allocateMemory(workspace_size); + workspace_ptr = workspace->data(); + } + + INFINICORE_CHECK_ERROR(infiniopEqual( + desc, + workspace_ptr, + workspace_size, + out->data(), + a->data(), + b->data(), + context::getStream())); +} + +static bool registered = []() { + Equal::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::equal_impl::infiniop diff --git a/src/infinicore/ops/hardswish/hardswish.cc b/src/infinicore/ops/hardswish/hardswish.cc new file mode 100644 index 000000000..ec8db75ff --- /dev/null +++ b/src/infinicore/ops/hardswish/hardswish.cc @@ -0,0 +1,38 @@ +#include "infinicore/ops/hardswish.hpp" + +#include "../../utils.hpp" + +#include + +namespace infinicore::op { + +common::OpDispatcher &Hardswish::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void Hardswish::execute(Tensor output, Tensor input) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); + infinicore::context::setDevice(output->device()); + auto device_type = output->device().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error( + "No Hardswish implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor hardswish(Tensor input) { + auto output = Tensor::empty(input->shape(), input->dtype(), input->device()); + hardswish_(output, input); + return output; +} + +void hardswish_(Tensor output, Tensor input) { + Hardswish::execute(output, input); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/hardswish/hardswish_infiniop.cc b/src/infinicore/ops/hardswish/hardswish_infiniop.cc new file mode 100644 index 000000000..44d4054e8 --- /dev/null +++ b/src/infinicore/ops/hardswish/hardswish_infiniop.cc @@ -0,0 +1,61 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/hardswish.hpp" +#include + +namespace infinicore::op::hardswish_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopHardSwishDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyHardSwishDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + auto desc_opt = cache.get(seed); + infiniopHardSwishDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateHardSwishDescriptor( + context::getInfiniopHandle(device), + &desc, + output->desc(), + input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetHardSwishWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace; + void *workspace_ptr = nullptr; + if (workspace_size != 0) { + workspace = context::allocateMemory(workspace_size); + workspace_ptr = workspace->data(); + } + + INFINICORE_CHECK_ERROR(infiniopHardSwish( + desc, + workspace_ptr, + workspace_size, + output->data(), + input->data(), + context::getStream())); +} + +static bool registered = []() { + Hardswish::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::hardswish_impl::infiniop diff --git a/src/infinicore/ops/hardtanh/hardtanh.cc b/src/infinicore/ops/hardtanh/hardtanh.cc new file mode 100644 index 000000000..5a4df2142 --- /dev/null +++ b/src/infinicore/ops/hardtanh/hardtanh.cc @@ -0,0 +1,38 @@ +#include "infinicore/ops/hardtanh.hpp" + +#include "../../utils.hpp" + +#include + +namespace infinicore::op { + +common::OpDispatcher &HardTanh::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void HardTanh::execute(Tensor output, Tensor input, float min_val, float max_val) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); + infinicore::context::setDevice(output->device()); + + auto device_type = output->device().getType(); + auto func = dispatcher().lookup(device_type); + if (func == nullptr) { + throw std::runtime_error( + "No HardTanh implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input, min_val, max_val); +} + +Tensor hardtanh(Tensor input, float min_val, float max_val) { + auto output = Tensor::empty(input->shape(), input->dtype(), input->device()); + hardtanh_(output, input, min_val, max_val); + return output; +} + +void hardtanh_(Tensor output, Tensor input, float min_val, float max_val) { + HardTanh::execute(output, input, min_val, max_val); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/hardtanh/hardtanh_infiniop.cc b/src/infinicore/ops/hardtanh/hardtanh_infiniop.cc new file mode 100644 index 000000000..d8af439d8 --- /dev/null +++ b/src/infinicore/ops/hardtanh/hardtanh_infiniop.cc @@ -0,0 +1,63 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/hardtanh.hpp" +#include + +namespace infinicore::op::hardtanh_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopHardTanhDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyHardTanhDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input, float min_val, float max_val) { + size_t seed = hash_combine(output, input, min_val, max_val); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + auto desc_opt = cache.get(seed); + infiniopHardTanhDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateHardTanhDescriptor( + context::getInfiniopHandle(device), + &desc, + output->desc(), + input->desc(), + min_val, + max_val)); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetHardTanhWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace; + void *workspace_ptr = nullptr; + if (workspace_size != 0) { + workspace = context::allocateMemory(workspace_size); + workspace_ptr = workspace->data(); + } + + INFINICORE_CHECK_ERROR(infiniopHardTanh( + desc, + workspace_ptr, + workspace_size, + output->data(), + input->data(), + context::getStream())); +} + +static bool registered = []() { + HardTanh::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::hardtanh_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 1d0ace555..8a83c02f8 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -7,12 +7,17 @@ #include "ops/add_rms_norm.hpp" #include "ops/asinh.hpp" #include "ops/attention.hpp" +#include "ops/avg_pool1d.hpp" #include "ops/baddbmm.hpp" #include "ops/bilinear.hpp" #include "ops/causal_softmax.hpp" +#include "ops/cross_entropy.hpp" #include "ops/embedding.hpp" +#include "ops/equal.hpp" #include "ops/flash_attention.hpp" #include "ops/fmod.hpp" +#include "ops/hardswish.hpp" +#include "ops/hardtanh.hpp" #include "ops/kv_caching.hpp" #include "ops/linear.hpp" #include "ops/linear_w8a8i8.hpp" @@ -51,18 +56,23 @@ inline void bind(py::module &m) { bind_matmul(m); bind_mul(m); bind_mha_varlen(m); + bind_hardswish(m); + bind_hardtanh(m); bind_paged_attention(m); bind_paged_attention_prefill(m); bind_paged_caching(m); bind_random_sample(m); + bind_cross_entropy(m); bind_rearrange(m); bind_rms_norm(m); + bind_avg_pool1d(m); bind_silu(m); bind_swiglu(m); bind_rope(m); bind_embedding(m); bind_linear_w8a8i8(m); bind_silu_and_mul(m); + bind_equal(m); } } // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/avg_pool1d.hpp b/src/infinicore/pybind11/ops/avg_pool1d.hpp new file mode 100644 index 000000000..32394552a --- /dev/null +++ b/src/infinicore/pybind11/ops/avg_pool1d.hpp @@ -0,0 +1,37 @@ +#pragma once + +#include +#include + +#include "infinicore/ops/avg_pool1d.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_avg_pool1d(py::module &m) { + m.def( + "avg_pool1d", + [](::infinicore::Tensor input, size_t kernel_size, std::optional stride, size_t padding) { + return op::avg_pool1d(input, kernel_size, stride.value_or(0), padding); + }, + py::arg("input"), + py::arg("kernel_size"), + py::arg("stride") = py::none(), + py::arg("padding") = 0, + R"doc(AvgPool1d out-of-place.)doc"); + + m.def( + "avg_pool1d_", + [](::infinicore::Tensor output, ::infinicore::Tensor input, size_t kernel_size, std::optional stride, size_t padding) { + op::avg_pool1d_(output, input, kernel_size, stride.value_or(0), padding); + }, + py::arg("output"), + py::arg("input"), + py::arg("kernel_size"), + py::arg("stride") = py::none(), + py::arg("padding") = 0, + R"doc(AvgPool1d in-place variant writing to provided output tensor.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/cross_entropy.hpp b/src/infinicore/pybind11/ops/cross_entropy.hpp new file mode 100644 index 000000000..8105642a6 --- /dev/null +++ b/src/infinicore/pybind11/ops/cross_entropy.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include + +#include "infinicore/ops/cross_entropy.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_cross_entropy(py::module &m) { + m.def("cross_entropy", + &op::cross_entropy, + py::arg("logits"), + py::arg("target"), + R"doc(Token-wise cross entropy loss without reduction.)doc"); + + m.def("cross_entropy_", + &op::cross_entropy_, + py::arg("loss"), + py::arg("logits"), + py::arg("target"), + R"doc(Write cross entropy loss into a provided tensor.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/equal.hpp b/src/infinicore/pybind11/ops/equal.hpp new file mode 100644 index 000000000..d14a6b61d --- /dev/null +++ b/src/infinicore/pybind11/ops/equal.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include + +#include "infinicore/ops/equal.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_equal(py::module &m) { + m.def("equal", + &op::equal, + py::arg("a"), + py::arg("b"), + R"doc(Elementwise equality returning a bool tensor.)doc"); + + m.def("equal_", + &op::equal_, + py::arg("out"), + py::arg("a"), + py::arg("b"), + R"doc(In-place elementwise equality writing into `out`.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/hardswish.hpp b/src/infinicore/pybind11/ops/hardswish.hpp new file mode 100644 index 000000000..daaccec62 --- /dev/null +++ b/src/infinicore/pybind11/ops/hardswish.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/hardswish.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_hardswish(py::module &m) { + m.def("hardswish", + &op::hardswish, + py::arg("input"), + R"doc(Out-of-place Hardswish activation.)doc"); + + m.def("hardswish_", + &op::hardswish_, + py::arg("output"), + py::arg("input"), + R"doc(In-place Hardswish activation.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/hardtanh.hpp b/src/infinicore/pybind11/ops/hardtanh.hpp new file mode 100644 index 000000000..ff9abb872 --- /dev/null +++ b/src/infinicore/pybind11/ops/hardtanh.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include + +#include "infinicore/ops/hardtanh.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_hardtanh(py::module &m) { + m.def("hardtanh", + &op::hardtanh, + py::arg("input"), + py::arg("min_val") = -1.0f, + py::arg("max_val") = 1.0f, + R"doc(Apply the HardTanh activation.)doc"); + + m.def("hardtanh_", + &op::hardtanh_, + py::arg("output"), + py::arg("input"), + py::arg("min_val") = -1.0f, + py::arg("max_val") = 1.0f, + R"doc(In-place HardTanh activation.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infiniop/ops/avg_pool1d/avg_pool1d.h b/src/infiniop/ops/avg_pool1d/avg_pool1d.h new file mode 100644 index 000000000..fae5f445b --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/avg_pool1d.h @@ -0,0 +1,103 @@ +#ifndef __AVG_POOL1D_H__ +#define __AVG_POOL1D_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "infiniop/ops/avg_pool1d.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::avg_pool1d::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + AvgPool1dInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + AvgPool1dInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + size_t kernel_size, \ + size_t stride, \ + size_t padding); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *y, \ + const void *x, \ + void *stream) const; \ + }; \ + } + +class AvgPool1dInfo { +private: + AvgPool1dInfo() = default; + +public: + infiniDtype_t dtype; + size_t batch, channels, in_width, out_width; + size_t kernel_size, stride, padding; + + ptrdiff_t y_stride_batch, y_stride_channel, y_stride_width; + ptrdiff_t x_stride_batch, x_stride_channel, x_stride_width; + + static utils::Result createAvgPool1dInfo( + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t kernel_size, + size_t stride, + size_t padding) { + + CHECK_OR_RETURN(y_desc != nullptr && x_desc != nullptr, INFINI_STATUS_NULL_POINTER); + + const infiniDtype_t dtype = y_desc->dtype(); + CHECK_OR_RETURN(dtype == x_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_OR_RETURN(y_desc->ndim() == 3 && x_desc->ndim() == 3, INFINI_STATUS_BAD_TENSOR_SHAPE); + + size_t batch = x_desc->dim(0); + size_t channels = x_desc->dim(1); + size_t in_width = x_desc->dim(2); + + CHECK_OR_RETURN(y_desc->dim(0) == batch, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(y_desc->dim(1) == channels, INFINI_STATUS_BAD_TENSOR_SHAPE); + + size_t padded_len = in_width + 2 * padding; + + CHECK_OR_RETURN(padded_len >= kernel_size, INFINI_STATUS_BAD_TENSOR_SHAPE); + + size_t expected_out_width = (padded_len - kernel_size) / stride + 1; + CHECK_OR_RETURN(y_desc->dim(2) == expected_out_width, INFINI_STATUS_BAD_TENSOR_SHAPE); + + size_t out_width = expected_out_width; + + return utils::Result(AvgPool1dInfo{ + dtype, + batch, channels, in_width, out_width, + kernel_size, stride, padding, + y_desc->stride(0), y_desc->stride(1), y_desc->stride(2), + x_desc->stride(0), x_desc->stride(1), x_desc->stride(2)}); + } +}; + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.cc b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.cc new file mode 100644 index 000000000..67e5b6623 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.cc @@ -0,0 +1,96 @@ +#include "avg_pool1d_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include + +namespace op::avg_pool1d::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t kernel_size, + size_t stride, + size_t padding) { + + auto handle = reinterpret_cast(handle_); + + auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + info.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateAvgPool1d(const AvgPool1dInfo &info, + T *y, + const T *x) { + const float inv_kernel = 1.0f / static_cast(info.kernel_size); + +#pragma omp parallel for + for (ptrdiff_t bc = 0; bc < ptrdiff_t(info.batch * info.channels); ++bc) { + + ptrdiff_t b = bc / info.channels; + ptrdiff_t c = bc % info.channels; + + size_t y_base = b * info.y_stride_batch + c * info.y_stride_channel; + size_t x_base = b * info.x_stride_batch + c * info.x_stride_channel; + + for (size_t ow = 0; ow < info.out_width; ++ow) { + size_t y_offset = y_base + ow * info.y_stride_width; + + long long start_w = static_cast(ow * info.stride) - info.padding; + long long end_w = start_w + info.kernel_size; + + long long valid_start = std::max(0LL, start_w); + long long valid_end = std::min(static_cast(info.in_width), end_w); + + float sum = 0.0f; + for (long long iw = valid_start; iw < valid_end; ++iw) { + size_t x_offset = x_base + iw * info.x_stride_width; + sum += utils::cast(x[x_offset]); + } + + const float avg = sum * inv_kernel; + y[y_offset] = utils::cast(avg); + } + } + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE(TDATA) calculateAvgPool1d(_info, (TDATA *)y, (const TDATA *)x) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + switch (_info.dtype) { + case INFINI_DTYPE_F16: + return CALCULATE(fp16_t); + case INFINI_DTYPE_BF16: + return CALCULATE(bf16_t); + case INFINI_DTYPE_F32: + return CALCULATE(float); + case INFINI_DTYPE_F64: + return CALCULATE(double); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE + +} // namespace op::avg_pool1d::cpu diff --git a/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h new file mode 100644 index 000000000..2335733db --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h @@ -0,0 +1,8 @@ +#ifndef __INFINIOP_AVG_POOL1D_CPU_H__ +#define __INFINIOP_AVG_POOL1D_CPU_H__ + +#include "../avg_pool1d.h" + +DESCRIPTOR(cpu) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh b/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh new file mode 100644 index 000000000..36a11acfc --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/cuda/kernel.cuh @@ -0,0 +1,58 @@ +#ifndef __INFINIOP_AVG_POOL1D_CUDA_KERNEL_CUH__ +#define __INFINIOP_AVG_POOL1D_CUDA_KERNEL_CUH__ + +template +__device__ void avgPool1dKernel( + T *y, + const T *x, + size_t batch, + size_t channels, + size_t in_width, + size_t out_width, + size_t kernel_size, + size_t stride, + size_t padding, + + ptrdiff_t y_stride_batch, + ptrdiff_t y_stride_channel, + ptrdiff_t y_stride_width, + ptrdiff_t x_stride_batch, + ptrdiff_t x_stride_channel, + ptrdiff_t x_stride_width) { + + size_t total_elements = batch * channels * out_width; + + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < total_elements; + idx += gridDim.x * blockDim.x) { + + size_t ow = idx % out_width; + size_t temp = idx / out_width; + size_t c = temp % channels; + size_t b = temp / channels; + + size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width; + + long long start_w = static_cast(ow * stride) - padding; + + T sum = 0; + + for (size_t k = 0; k < kernel_size; ++k) { + long long iw = start_w + k; + + if (iw >= 0 && iw < static_cast(in_width)) { + size_t x_offset = b * x_stride_batch + c * x_stride_channel + iw * x_stride_width; + sum += x[x_offset]; + } + } + +#if defined(ENABLE_ILUVATAR_API) + // Iluvatar __half doesn't accept size_t directly. + y[y_offset] = sum / static_cast(static_cast(kernel_size)); +#else + y[y_offset] = sum / static_cast(kernel_size); +#endif + } +} + +#endif diff --git a/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h b/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h new file mode 100644 index 000000000..576da66de --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h @@ -0,0 +1,8 @@ +#ifndef __INFINIOP_AVG_POOL1D_METAX_H__ +#define __INFINIOP_AVG_POOL1D_METAX_H__ + +#include "../avg_pool1d.h" + +DESCRIPTOR(metax) + +#endif // __INFINIOP_AVG_POOL1D_METAX_H__ diff --git a/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca b/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca new file mode 100644 index 000000000..9b3f15b9a --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca @@ -0,0 +1,170 @@ +#include "../../../devices/metax/metax_common.h" +#include "avg_pool1d_metax.h" +#include "../../../devices/metax/metax_kernel_common.h" + +#include + +namespace op::avg_pool1d::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t kernel_size, + size_t stride, + size_t padding) { + + auto handle = reinterpret_cast(handle_); + + auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + info.take(), + 0, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +__device__ __forceinline__ Tdata castToOutput(Tcompute val) { + if constexpr (std::is_same_v) { + return __float2half(static_cast(val)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16(static_cast(val)); + } else { + return static_cast(val); + } +} + +template +INFINIOP_METAX_KERNEL avgPool1dGlobalKernel( + Tdata *y, + const Tdata *x, + size_t batch, + size_t channels, + size_t in_width, + size_t out_width, + size_t kernel_size, + size_t stride, + size_t padding, + ptrdiff_t y_stride_batch, + ptrdiff_t y_stride_channel, + ptrdiff_t y_stride_width, + ptrdiff_t x_stride_batch, + ptrdiff_t x_stride_channel, + ptrdiff_t x_stride_width) { + + size_t total_elements = batch * channels * out_width; + Tcompute inv_kernel = Tcompute(1) / static_cast(kernel_size); + + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < total_elements; + idx += gridDim.x * blockDim.x) { + + size_t ow = idx % out_width; + size_t temp = idx / out_width; + size_t c = temp % channels; + size_t b = temp / channels; + + size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width; + size_t x_base = b * x_stride_batch + c * x_stride_channel; + + long long start_w = static_cast(ow * stride) - static_cast(padding); + long long end_w = start_w + static_cast(kernel_size); + long long iw_start = start_w < 0 ? 0 : start_w; + long long iw_end = end_w > static_cast(in_width) ? static_cast(in_width) : end_w; + + Tcompute sum = Tcompute(0); + if (iw_start < iw_end) { + size_t x_offset = x_base + static_cast(iw_start) * x_stride_width; + for (long long iw = iw_start; iw < iw_end; ++iw) { + sum += static_cast(x[x_offset]); + x_offset += x_stride_width; + } + } + + y[y_offset] = castToOutput(sum * inv_kernel); + } +} + +template +infiniStatus_t calculateAvgPool1d( + const AvgPool1dInfo &info, + int max_threads_per_block, + Tdata *y, + const Tdata *x, + hcStream_t stream) { + + size_t total_elements = info.batch * info.channels * info.out_width; + + int block_size = 256; + if (max_threads_per_block > 0 && max_threads_per_block < block_size) { + block_size = max_threads_per_block; + } + + size_t grid_size = (total_elements + block_size - 1) / block_size; + if (grid_size > 65535) { + grid_size = 65535; + } + + avgPool1dGlobalKernel<<>>( + y, x, + info.batch, info.channels, info.in_width, info.out_width, + info.kernel_size, info.stride, info.padding, + info.y_stride_batch, info.y_stride_channel, info.y_stride_width, + info.x_stride_batch, info.x_stride_channel, info.x_stride_width); + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE(TDATA, TCOMPUTE) \ + calculateAvgPool1d( \ + _info, \ + _opaque->internal->maxThreadsPerBlock(), \ + (TDATA *)y, \ + (const TDATA *)x, \ + (hcStream_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + (void)workspace; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_info.dtype) { + case INFINI_DTYPE_F16: + return CALCULATE(half, float); + case INFINI_DTYPE_BF16: + return CALCULATE(cuda_bfloat16, float); + case INFINI_DTYPE_F32: + return CALCULATE(float, float); + case INFINI_DTYPE_F64: + return CALCULATE(double, double); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE + +} // namespace op::avg_pool1d::metax diff --git a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h new file mode 100644 index 000000000..9034d7358 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h @@ -0,0 +1,72 @@ +#ifndef __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__ +#define __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__ + +#include + +namespace op::avg_pool1d::moore { + +template +__device__ __forceinline__ Tdata castToOutput(Tcompute val) { + if constexpr (std::is_same_v) { + return __float2half(static_cast(val)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(static_cast(val)); + } else { + return static_cast(val); + } +} + +template +__device__ void avgPool1dKernel( + Tdata *y, + const Tdata *x, + size_t batch, + size_t channels, + size_t in_width, + size_t out_width, + size_t kernel_size, + size_t stride, + size_t padding, + ptrdiff_t y_stride_batch, + ptrdiff_t y_stride_channel, + ptrdiff_t y_stride_width, + ptrdiff_t x_stride_batch, + ptrdiff_t x_stride_channel, + ptrdiff_t x_stride_width) { + + size_t total_elements = batch * channels * out_width; + Tcompute inv_kernel = Tcompute(1) / static_cast(kernel_size); + + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < total_elements; + idx += gridDim.x * blockDim.x) { + + size_t ow = idx % out_width; + size_t temp = idx / out_width; + size_t c = temp % channels; + size_t b = temp / channels; + + size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width; + size_t x_base = b * x_stride_batch + c * x_stride_channel; + + long long start_w = static_cast(ow * stride) - static_cast(padding); + long long end_w = start_w + static_cast(kernel_size); + long long iw_start = start_w < 0 ? 0 : start_w; + long long iw_end = end_w > static_cast(in_width) ? static_cast(in_width) : end_w; + + Tcompute sum = Tcompute(0); + if (iw_start < iw_end) { + size_t x_offset = x_base + static_cast(iw_start) * x_stride_width; + for (long long iw = iw_start; iw < iw_end; ++iw) { + sum += static_cast(x[x_offset]); + x_offset += x_stride_width; + } + } + + y[y_offset] = castToOutput(sum * inv_kernel); + } +} + +} // namespace op::avg_pool1d::moore + +#endif // __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h new file mode 100644 index 000000000..604d06012 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h @@ -0,0 +1,8 @@ +#ifndef __INFINIOP_AVG_POOL1D_MOORE_H__ +#define __INFINIOP_AVG_POOL1D_MOORE_H__ + +#include "../avg_pool1d.h" + +DESCRIPTOR(moore) + +#endif // __INFINIOP_AVG_POOL1D_MOORE_H__ diff --git a/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu new file mode 100644 index 000000000..518d249b9 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu @@ -0,0 +1,135 @@ +#include "../../../devices/moore/moore_common.h" +#include "avg_pool1d_moore.h" + +#include "../../../devices/moore/moore_kernel_common.h" + +#include "avg_pool1d_kernel.h" + +namespace op::avg_pool1d::moore { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t kernel_size, + size_t stride, + size_t padding) { + + auto handle = reinterpret_cast(handle_); + + auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + info.take(), + 0, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +INFINIOP_MOORE_KERNEL avgPool1dGlobalKernel( + Tdata *y, + const Tdata *x, + size_t batch, + size_t channels, + size_t in_width, + size_t out_width, + size_t kernel_size, + size_t stride, + size_t padding, + ptrdiff_t y_stride_batch, + ptrdiff_t y_stride_channel, + ptrdiff_t y_stride_width, + ptrdiff_t x_stride_batch, + ptrdiff_t x_stride_channel, + ptrdiff_t x_stride_width) { + + avgPool1dKernel( + y, x, + batch, channels, in_width, out_width, + kernel_size, stride, padding, + y_stride_batch, y_stride_channel, y_stride_width, + x_stride_batch, x_stride_channel, x_stride_width); +} + +template +infiniStatus_t calculateAvgPool1d( + const AvgPool1dInfo &info, + int max_threads_per_block, + Tdata *y, + const Tdata *x, + musaStream_t stream) { + + size_t total_elements = info.batch * info.channels * info.out_width; + + int block_size = 256; + if (max_threads_per_block > 0 && max_threads_per_block < block_size) { + block_size = max_threads_per_block; + } + + size_t grid_size = (total_elements + block_size - 1) / block_size; + if (grid_size > 65535) { + grid_size = 65535; + } + + avgPool1dGlobalKernel<<>>( + y, x, + info.batch, info.channels, info.in_width, info.out_width, + info.kernel_size, info.stride, info.padding, + info.y_stride_batch, info.y_stride_channel, info.y_stride_width, + info.x_stride_batch, info.x_stride_channel, info.x_stride_width); + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE(TDATA, TCOMPUTE) \ + calculateAvgPool1d(\ + _info,\ + _opaque->internal->maxThreadsPerBlock(),\ + (TDATA *)y,\ + (const TDATA *)x,\ + (musaStream_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + (void)workspace; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_info.dtype) { + case INFINI_DTYPE_F16: + return CALCULATE(half, float); + case INFINI_DTYPE_BF16: + return CALCULATE(cuda_bfloat16, float); + case INFINI_DTYPE_F32: + return CALCULATE(float, float); + case INFINI_DTYPE_F64: + return CALCULATE(double, double); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE + +} // namespace op::avg_pool1d::moore diff --git a/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu b/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu new file mode 100644 index 000000000..202d4b8e9 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu @@ -0,0 +1,126 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "avg_pool1d_nvidia.cuh" + +template +__global__ void avgPool1dGlobalKernel( + T *y, + const T *x, + size_t batch, + size_t channels, + size_t in_width, + size_t out_width, + size_t kernel_size, + size_t stride, + size_t padding, + ptrdiff_t y_stride_batch, + ptrdiff_t y_stride_channel, + ptrdiff_t y_stride_width, + ptrdiff_t x_stride_batch, + ptrdiff_t x_stride_channel, + ptrdiff_t x_stride_width) { + + avgPool1dKernel( + y, x, + batch, channels, in_width, out_width, + kernel_size, stride, padding, + y_stride_batch, y_stride_channel, y_stride_width, + x_stride_batch, x_stride_channel, x_stride_width); +} + +namespace op::avg_pool1d::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t kernel_size, + size_t stride, + size_t padding) { + + auto handle = reinterpret_cast(handle_); + + auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + info.take(), + 0, + new Opaque{reinterpret_cast(handle)->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateAvgPool1d( + const AvgPool1dInfo &info, + int max_threads_per_block, + T *y, + const T *x, + cudaStream_t stream) { + + size_t total_elements = info.batch * info.channels * info.out_width; + + int block_size = 256; + if (max_threads_per_block > 0 && max_threads_per_block < 256) { + block_size = max_threads_per_block; + } + + size_t grid_size = (total_elements + block_size - 1) / block_size; + if (grid_size > 65535) { + grid_size = 65535; + } + + avgPool1dGlobalKernel<<>>( + y, x, + info.batch, info.channels, info.in_width, info.out_width, + info.kernel_size, info.stride, info.padding, + info.y_stride_batch, info.y_stride_channel, info.y_stride_width, + info.x_stride_batch, info.x_stride_channel, info.x_stride_width); + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE(TDATA) \ + calculateAvgPool1d(_info, \ + _opaque->internal->maxThreadsPerBlock(), \ + (TDATA *)y, \ + (const TDATA *)x, \ + (cudaStream_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + switch (_info.dtype) { + case INFINI_DTYPE_F16: + return CALCULATE(half); + case INFINI_DTYPE_BF16: + return CALCULATE(cuda_bfloat16); + case INFINI_DTYPE_F32: + return CALCULATE(float); + case INFINI_DTYPE_F64: + return CALCULATE(double); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE + +} // namespace op::avg_pool1d::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh b/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh new file mode 100644 index 000000000..629e745d7 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __INFINIOP_AVG_POOL1D_CUDA_H__ +#define __INFINIOP_AVG_POOL1D_CUDA_H__ + +#include "../avg_pool1d.h" + +DESCRIPTOR(nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/avg_pool1d/operator.cc b/src/infiniop/ops/avg_pool1d/operator.cc new file mode 100644 index 000000000..c3696daa1 --- /dev/null +++ b/src/infiniop/ops/avg_pool1d/operator.cc @@ -0,0 +1,225 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/avg_pool1d.h" + +#ifdef ENABLE_CPU_API +#include "cpu/avg_pool1d_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/avg_pool1d_nvidia.cuh" +#endif +#ifdef ENABLE_ASCEND_API +#include "ascend/avg_pool1d_ascend.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/avg_pool1d_bang.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/avg_pool1d_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/avg_pool1d_kunlun.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/avg_pool1d_moore.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateAvgPool1dDescriptor( + infiniopHandle_t handle, + infiniopAvgPool1dDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + size_t kernel_size, + size_t stride, + size_t padding) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::avg_pool1d::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y, \ + x, \ + kernel_size, \ + stride, \ + padding) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ASCEND_API + CREATE(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetAvgPool1dWorkspaceSize(infiniopAvgPool1dDescriptor_t desc, + size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + GET(INFINI_DEVICE_ASCEND, ascend); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__INFINI_C infiniStatus_t infiniopAvgPool1d( + infiniopAvgPool1dDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + CALCULATE(INFINI_DEVICE_ASCEND, ascend); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t +infiniopDestroyAvgPool1dDescriptor(infiniopAvgPool1dDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + DELETE(INFINI_DEVICE_ASCEND, ascend); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc b/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc new file mode 100644 index 000000000..af0ebc623 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc @@ -0,0 +1,99 @@ +#include "cross_entropy_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include +#include + +namespace op::cross_entropy::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t target_desc) { + + auto x_dtype = x_desc->dtype(); + auto t_dtype = target_desc->dtype(); + + CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64); + + CrossEntropyInfo info{}; + info.dtype = x_dtype; + info.target_dtype = t_dtype; + + info.outer_size = target_desc->numel(); + + info.vocab_size = x_desc->shape().back(); + + info.x_stride = static_cast(info.vocab_size); + + *desc_ptr = new Descriptor(nullptr, info, 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t cross_entropy_kernel(const CrossEntropyInfo *info, + T *y, const T *x, const void *target) { + const Tidx *label = reinterpret_cast(target); + +#pragma omp parallel for + for (ptrdiff_t i = 0; i < ptrdiff_t(info->outer_size); ++i) { + const T *row = x + i * info->x_stride; + Tidx idx = label[i]; + + if (idx < 0 || static_cast(idx) >= info->vocab_size) { + y[i] = utils::cast(0.f); + continue; + } + + float max_val = op::common_cpu::reduce_op::max(row, info->vocab_size, 1); + + float sum_exp = 0.f; + for (size_t j = 0; j < info->vocab_size; ++j) { + sum_exp += std::exp(utils::cast(row[j]) - max_val); + } + + float log_term = std::log(sum_exp) + max_val; + float target_logit = utils::cast(row[idx]); + y[i] = utils::cast(log_term - target_logit); + } + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t dispatch_target_type(const CrossEntropyInfo *info, + T *y, const T *x, const void *target) { + + if (info->target_dtype == INFINI_DTYPE_I32) { + return cross_entropy_kernel(info, y, x, target); + } else if (info->target_dtype == INFINI_DTYPE_I64) { + return cross_entropy_kernel(info, y, x, target); + } + return INFINI_STATUS_BAD_TENSOR_DTYPE; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *target, + void *stream) const { + + switch (_info.dtype) { + case INFINI_DTYPE_F16: + return dispatch_target_type(&_info, (fp16_t *)y, (const fp16_t *)x, target); + case INFINI_DTYPE_BF16: + return dispatch_target_type(&_info, (bf16_t *)y, (const bf16_t *)x, target); + case INFINI_DTYPE_F32: + return dispatch_target_type(&_info, (float *)y, (const float *)x, target); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::cross_entropy::cpu \ No newline at end of file diff --git a/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h b/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h new file mode 100644 index 000000000..7417d1d81 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_CPU_H__ +#define __CROSS_ENTROPY_CPU_H__ + +#include "../cross_entropy.h" + +DESCRIPTOR(cpu) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/cross_entropy/cross_entropy.h b/src/infiniop/ops/cross_entropy/cross_entropy.h new file mode 100644 index 000000000..b502823db --- /dev/null +++ b/src/infiniop/ops/cross_entropy/cross_entropy.h @@ -0,0 +1,42 @@ +#ifndef CROSS_ENTROPY_H +#define CROSS_ENTROPY_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::cross_entropy::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + CrossEntropyInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor(Opaque *opaque, \ + CrossEntropyInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create(infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t target_desc); \ + infiniStatus_t calculate(void *workspace, \ + size_t workspace_size, \ + void *y, \ + const void *x, \ + const void *target, \ + void *stream) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/cross_entropy/cuda/kernel.cuh b/src/infiniop/ops/cross_entropy/cuda/kernel.cuh new file mode 100644 index 000000000..c048c1233 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/cuda/kernel.cuh @@ -0,0 +1,80 @@ +#ifndef __CROSS_ENTROPY_KERNEL_CUH__ +#define __CROSS_ENTROPY_KERNEL_CUH__ + +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../reduce/cuda/reduce.cuh" + +template +__device__ void crossEntropyKernel( + Tdata *y_, + const Tdata *x_, + const void *target_, + size_t outer_size, + size_t vocab_size, + ptrdiff_t x_stride) { + + size_t row_idx = blockIdx.x; + if (row_idx >= outer_size) { + return; + } + + const Tdata *x = x_ + row_idx * x_stride; + const Tidx *target = reinterpret_cast(target_); + + Tidx label = target[row_idx]; + + Tdata max_val_raw = op::common_cuda::reduce_op::max(x, vocab_size); + __shared__ Tcompute max_val_shared; + if (threadIdx.x == 0) { + max_val_shared = static_cast(max_val_raw); + } + __syncthreads(); + Tcompute max_val = max_val_shared; + + Tcompute thread_sum = 0.0f; + for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) { + Tcompute val = static_cast(x[col]); + thread_sum += expf(val - max_val); + } + + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + thread_sum += __shfl_down_sync(0xffffffff, thread_sum, offset); + } + + static __shared__ Tcompute shared_sum[32]; + int lane = threadIdx.x % warpSize; + int warp = threadIdx.x / warpSize; + + if (lane == 0) { + shared_sum[warp] = thread_sum; + } + __syncthreads(); + + Tcompute block_sum = 0.0f; + if (warp == 0) { + + if (lane < (BLOCK_SIZE + warpSize - 1) / warpSize) { + block_sum = shared_sum[lane]; + } + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + block_sum += __shfl_down_sync(0xffffffff, block_sum, offset); + } + } + + if (threadIdx.x == 0) { + Tcompute log_term = logf(block_sum) + max_val; + + Tcompute target_logit = 0.0f; + + if (label >= 0 && static_cast(label) < vocab_size) { + target_logit = static_cast(x[label]); + } else { + + log_term = 0.0f; + } + + y_[row_idx] = static_cast(log_term - target_logit); + } +} + +#endif diff --git a/src/infiniop/ops/cross_entropy/info.h b/src/infiniop/ops/cross_entropy/info.h new file mode 100644 index 000000000..a83afebb8 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/info.h @@ -0,0 +1,17 @@ +#ifndef CROSS_ENTROPY_INFO_H +#define CROSS_ENTROPY_INFO_H +#include "../../../utils.h" +#include "../../tensor.h" +#include + +#include + +struct CrossEntropyInfo { + int dtype; + int target_dtype; + size_t outer_size; + size_t vocab_size; + ptrdiff_t x_stride; +}; + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h b/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h new file mode 100644 index 000000000..57bccea91 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_METAX_H__ +#define __CROSS_ENTROPY_METAX_H__ + +#include "../cross_entropy.h" + +DESCRIPTOR(metax) + +#endif // __CROSS_ENTROPY_METAX_H__ diff --git a/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca b/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca new file mode 100644 index 000000000..efd791183 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca @@ -0,0 +1,188 @@ +#include "../../../devices/metax/metax_common.h" +#include "cross_entropy_metax.h" +#include "../../../devices/metax/metax_kernel_common.h" + +#include + +#include "../../../reduce/cuda/reduce.cuh" + +#include + +namespace { + +template +__device__ void crossEntropyKernel( + Tdata *y_, + const Tdata *x_, + const void *target_, + size_t outer_size, + size_t vocab_size, + ptrdiff_t x_stride) { + + size_t row_idx = blockIdx.x; + if (row_idx >= outer_size) { + return; + } + + const Tdata *x = x_ + row_idx * x_stride; + const Tidx *target = reinterpret_cast(target_); + + Tidx label = target[row_idx]; + + Tdata max_val_raw = op::common_cuda::reduce_op::max(x, vocab_size); + __shared__ Tcompute max_val_shared; + if (threadIdx.x == 0) { + max_val_shared = static_cast(max_val_raw); + } + __syncthreads(); + + Tcompute max_val = max_val_shared; + + Tcompute thread_sum = Tcompute(0); + for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) { + Tcompute val = static_cast(x[col]); + thread_sum += expf(val - max_val); + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + Tcompute block_sum = BlockReduce(temp_storage).Sum(thread_sum); + + if (threadIdx.x == 0) { + if (label < 0 || static_cast(label) >= vocab_size) { + y_[row_idx] = static_cast(0.0f); + return; + } + Tcompute log_term = logf(block_sum) + max_val; + Tcompute target_logit = static_cast(x[label]); + y_[row_idx] = static_cast(log_term - target_logit); + } +} + +template +INFINIOP_METAX_KERNEL crossEntropy( + Tdata *y, const Tdata *x, const void *target, + size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) { + crossEntropyKernel( + y, x, target, outer_size, vocab_size, x_stride); +} + +} // namespace + +namespace op::cross_entropy::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t target_desc) { + + (void)y_desc; + + auto x_dtype = x_desc->dtype(); + auto t_dtype = target_desc->dtype(); + + CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32); + CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64); + + CrossEntropyInfo info{}; + info.dtype = x_dtype; + info.target_dtype = t_dtype; + info.vocab_size = x_desc->shape().back(); + info.outer_size = target_desc->numel(); + info.x_stride = static_cast(info.vocab_size); + + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, const void *target, + const CrossEntropyInfo &info, hcStream_t stream) { + dim3 grid(static_cast(info.outer_size), 1, 1); + + if (info.target_dtype == INFINI_DTYPE_I64) { + if (info.dtype == INFINI_DTYPE_F16) { + crossEntropy + <<>>( + (half *)y, (const half *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_BF16) { + crossEntropy + <<>>( + (cuda_bfloat16 *)y, (const cuda_bfloat16 *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_F32) { + crossEntropy + <<>>( + (float *)y, (const float *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } else if (info.target_dtype == INFINI_DTYPE_I32) { + if (info.dtype == INFINI_DTYPE_F16) { + crossEntropy + <<>>( + (half *)y, (const half *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_BF16) { + crossEntropy + <<>>( + (cuda_bfloat16 *)y, (const cuda_bfloat16 *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_F32) { + crossEntropy + <<>>( + (float *)y, (const float *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *target, + void *stream_) const { + + (void)workspace; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + auto stream = reinterpret_cast(stream_); + int max_threads = _opaque->internal->maxThreadsPerBlock(); + + if (max_threads >= METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel(y, x, target, _info, stream)); + } else if (max_threads >= METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel(y, x, target, _info, stream)); + } else { + CHECK_STATUS(launchKernel<256>(y, x, target, _info, stream)); + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::cross_entropy::metax diff --git a/src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h b/src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h new file mode 100644 index 000000000..6648b0e32 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h @@ -0,0 +1,53 @@ +#ifndef __CROSS_ENTROPY_KERNEL_CUH__ +#define __CROSS_ENTROPY_KERNEL_CUH__ + +template +__device__ void crossEntropyKernel( + Tdata *y_, + const Tdata *x_, + const void *target_, + size_t outer_size, + size_t vocab_size, + ptrdiff_t x_stride) { + + size_t row_idx = blockIdx.x; + if (row_idx >= outer_size) { + return; + } + + const Tdata *x = x_ + row_idx * x_stride; + const Tidx *target = reinterpret_cast(target_); + + Tidx label = target[row_idx]; + + Tdata max_val_raw = op::common_cuda::reduce_op::max(x, vocab_size); + __shared__ Tcompute max_val_shared; + if (threadIdx.x == 0) { + max_val_shared = static_cast(max_val_raw); + } + __syncthreads(); + + Tcompute max_val = max_val_shared; + + Tcompute thread_sum = Tcompute(0); + for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) { + Tcompute val = static_cast(x[col]); + thread_sum += expf(val - max_val); + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + Tcompute block_sum = BlockReduce(temp_storage).Sum(thread_sum); + + if (threadIdx.x == 0) { + if (label < 0 || static_cast(label) >= vocab_size) { + y_[row_idx] = static_cast(0.0f); + return; + } + Tcompute log_term = logf(block_sum) + max_val; + Tcompute target_logit = static_cast(x[label]); + y_[row_idx] = static_cast(log_term - target_logit); + } +} + +#endif diff --git a/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h b/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h new file mode 100644 index 000000000..454b14617 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_MOORE_H__ +#define __CROSS_ENTROPY_MOORE_H__ + +#include "../cross_entropy.h" + +DESCRIPTOR(moore) + +#endif diff --git a/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu b/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu new file mode 100644 index 000000000..2535679dd --- /dev/null +++ b/src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu @@ -0,0 +1,129 @@ +#include "../../../devices/moore/moore_common.h" +#include "cross_entropy_moore.h" + +#include +#include "../../../devices/moore/moore_kernel_common.h" + +#include "../../../reduce/cuda/reduce.cuh" + +#include "cross_entropy_kernel.h" + +template +INFINIOP_MOORE_KERNEL crossEntropy( + Tdata *y, const Tdata *x, const void *target, + size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) { + crossEntropyKernel( + y, x, target, outer_size, vocab_size, x_stride); +} + +namespace op::cross_entropy::moore { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t target_desc) { + + (void)y_desc; + + auto x_dtype = x_desc->dtype(); + auto t_dtype = target_desc->dtype(); + + CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32); + CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64); + + CrossEntropyInfo info{}; + info.dtype = x_dtype; + info.target_dtype = t_dtype; + info.vocab_size = x_desc->shape().back(); + info.outer_size = target_desc->numel(); + info.x_stride = static_cast(info.vocab_size); + + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, const void *target, + const CrossEntropyInfo &info, musaStream_t stream) { + dim3 grid(static_cast(info.outer_size), 1, 1); + + if (info.target_dtype == INFINI_DTYPE_I64) { + if (info.dtype == INFINI_DTYPE_F16) { + crossEntropy + <<>>( + (half *)y, (const half *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_BF16) { + crossEntropy + <<>>( + (__mt_bfloat16 *)y, (const __mt_bfloat16 *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_F32) { + crossEntropy + <<>>( + (float *)y, (const float *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } else if (info.target_dtype == INFINI_DTYPE_I32) { + if (info.dtype == INFINI_DTYPE_F16) { + crossEntropy + <<>>( + (half *)y, (const half *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_BF16) { + crossEntropy + <<>>( + (__mt_bfloat16 *)y, (const __mt_bfloat16 *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_F32) { + crossEntropy + <<>>( + (float *)y, (const float *)x, target, + info.outer_size, info.vocab_size, info.x_stride); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + const void *target, + void *stream_) const { + musaStream_t stream = (musaStream_t)stream_; + (void)workspace; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel(y, x, target, _info, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel(y, x, target, _info, stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::cross_entropy::moore diff --git a/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu b/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu new file mode 100644 index 000000000..0ce3f4984 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu @@ -0,0 +1,107 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "cross_entropy_nvidia.cuh" + +template +INFINIOP_CUDA_KERNEL crossEntropy( + Tdata *y, const Tdata *x, const void *target, + size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) { + + crossEntropyKernel( + y, x, target, outer_size, vocab_size, x_stride); +} + +namespace op::cross_entropy::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t target_desc) { + + auto x_dtype = x_desc->dtype(); + auto t_dtype = target_desc->dtype(); + + CrossEntropyInfo info; + info.dtype = x_dtype; + info.target_dtype = t_dtype; + + info.vocab_size = x_desc->shape().back(); + info.outer_size = target_desc->numel(); + info.x_stride = static_cast(info.vocab_size); + + auto internal = reinterpret_cast(handle)->internal(); + + *desc_ptr = new Descriptor( + new Opaque{internal}, + info, 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, const void *target, + const CrossEntropyInfo &info, cudaStream_t stream) { + + dim3 grid(static_cast(info.outer_size), 1, 1); + + if (info.target_dtype == INFINI_DTYPE_I64) { + if (info.dtype == INFINI_DTYPE_F16) { + crossEntropy + <<>>((half *)y, (const half *)x, target, info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_BF16) { + crossEntropy + <<>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, target, info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_F32) { + crossEntropy + <<>>((float *)y, (const float *)x, target, info.outer_size, info.vocab_size, info.x_stride); + } + } else if (info.target_dtype == INFINI_DTYPE_I32) { + + if (info.dtype == INFINI_DTYPE_F16) { + crossEntropy + <<>>((half *)y, (const half *)x, target, info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_BF16) { + crossEntropy + <<>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, target, info.outer_size, info.vocab_size, info.x_stride); + } else if (info.dtype == INFINI_DTYPE_F32) { + crossEntropy + <<>>((float *)y, (const float *)x, target, info.outer_size, info.vocab_size, info.x_stride); + } + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + const void *target, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; + + int max_threads = _opaque->internal->maxThreadsPerBlock(); + + if (max_threads >= 1024) { + CHECK_STATUS(launchKernel<1024>(y, x, target, _info, stream)); + } else if (max_threads >= 512) { + CHECK_STATUS(launchKernel<512>(y, x, target, _info, stream)); + } else { + CHECK_STATUS(launchKernel<256>(y, x, target, _info, stream)); + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::cross_entropy::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh b/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh new file mode 100644 index 000000000..441e5b8d8 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_NVIDIA_H__ +#define __CROSS_ENTROPY_NVIDIA_H__ + +#include "../cross_entropy.h" + +DESCRIPTOR(nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/cross_entropy/operator.cc b/src/infiniop/ops/cross_entropy/operator.cc new file mode 100644 index 000000000..75f35fcb7 --- /dev/null +++ b/src/infiniop/ops/cross_entropy/operator.cc @@ -0,0 +1,174 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cross_entropy.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cross_entropy_cpu.h" +#endif + +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/cross_entropy_nvidia.cuh" +#endif + +#ifdef ENABLE_MOORE_API +#include "moore/cross_entropy_moore.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cross_entropy_metax.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateCrossEntropyDescriptor( + infiniopHandle_t handle, + infiniopCrossEntropyDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t target_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cross_entropy::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, x_desc, target_desc); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia) +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore) +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetCrossEntropyWorkspaceSize( + infiniopCrossEntropyDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__INFINI_C infiniStatus_t infiniopCrossEntropy( + infiniopCrossEntropyDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *target, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, target, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia) +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore) +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyCrossEntropyDescriptor( + infiniopCrossEntropyDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + DESTROY(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_HYGON_API + DESTROY(INFINI_DEVICE_HYGON, nvidia) +#endif +#ifdef ENABLE_MOORE_API + DESTROY(INFINI_DEVICE_MOORE, moore) +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DESTROY +} diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.cc b/src/infiniop/ops/equal/cpu/equal_cpu.cc new file mode 100644 index 000000000..ff8ebe395 --- /dev/null +++ b/src/infiniop/ops/equal/cpu/equal_cpu.cc @@ -0,0 +1,68 @@ +#include +#include + +#include "equal_cpu.h" + +namespace op::equal::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + auto compute_dtype = a_desc->dtype(); + auto out_dtype = out_desc->dtype(); + + if (compute_dtype != b_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL); + + CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, + INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::equal::cpu diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.h b/src/infiniop/ops/equal/cpu/equal_cpu.h new file mode 100644 index 000000000..fd811f4b0 --- /dev/null +++ b/src/infiniop/ops/equal/cpu/equal_cpu.h @@ -0,0 +1,28 @@ +#ifndef __EQUAL_CPU_H__ +#define __EQUAL_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(equal, cpu) + +namespace op::equal::cpu { + +typedef struct EqualOp { +public: + static constexpr size_t num_inputs = 2; + + template + bool operator()(const Tin0 &a, const Tin1 &b) { + if constexpr (std::is_same_v) { + return a == b; + } else { + return false; + } + } +} EqualOp; + +} // namespace op::equal::cpu + +#endif diff --git a/src/infiniop/ops/equal/cuda/kernel.cuh b/src/infiniop/ops/equal/cuda/kernel.cuh new file mode 100644 index 000000000..11ad5981e --- /dev/null +++ b/src/infiniop/ops/equal/cuda/kernel.cuh @@ -0,0 +1,37 @@ +#ifndef __EQUAL_CUDA_H__ +#define __EQUAL_CUDA_H__ + +#if defined(__MACACC__) +#include +#include +#else +#include +#include +#endif +#include + +namespace op::equal::cuda { + +typedef struct EqualOp { +public: + static constexpr size_t num_inputs = 2; + + template + __device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const { + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + static_assert(!std::is_same_v, "half2 is not supported for mixed output dtype"); + } else if constexpr (std::is_same_v) { + return static_cast(__heq(a, b)); + } else { + return static_cast(a == b); + } + } else { + return false; + } + } +} EqualOp; + +} // namespace op::equal::cuda + +#endif diff --git a/src/infiniop/ops/equal/metax/equal_metax.h b/src/infiniop/ops/equal/metax/equal_metax.h new file mode 100644 index 000000000..6e4cd64b9 --- /dev/null +++ b/src/infiniop/ops/equal/metax/equal_metax.h @@ -0,0 +1,8 @@ +#ifndef __EQUAL_METAX_API_H__ +#define __EQUAL_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(equal, metax) + +#endif // __EQUAL_METAX_API_H__ diff --git a/src/infiniop/ops/equal/metax/equal_metax.maca b/src/infiniop/ops/equal/metax/equal_metax.maca new file mode 100644 index 000000000..265e5b5a6 --- /dev/null +++ b/src/infiniop/ops/equal/metax/equal_metax.maca @@ -0,0 +1,69 @@ +#include "equal_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::equal::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + auto compute_dtype = a_desc->dtype(); + auto out_dtype = out_desc->dtype(); + + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, + INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64); + + CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::equal::metax diff --git a/src/infiniop/ops/equal/moore/equal_moore.h b/src/infiniop/ops/equal/moore/equal_moore.h new file mode 100644 index 000000000..2fed1bb40 --- /dev/null +++ b/src/infiniop/ops/equal/moore/equal_moore.h @@ -0,0 +1,8 @@ +#ifndef __EQUAL_MOORE_API_H__ +#define __EQUAL_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(equal, moore) + +#endif // __EQUAL_MOORE_API_H__ diff --git a/src/infiniop/ops/equal/moore/equal_moore.mu b/src/infiniop/ops/equal/moore/equal_moore.mu new file mode 100644 index 000000000..d0eb8395d --- /dev/null +++ b/src/infiniop/ops/equal/moore/equal_moore.mu @@ -0,0 +1,140 @@ +#include "equal_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "equal_moore_kernel.h" + +namespace op::equal::moore { +namespace { + +inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { + if (!info.isOutputContiguous()) { + return false; + } + const bool *input_contiguous = info.getInputContiguous(); + const bool *input_broadcasted = info.getInputBroadcasted(); + for (size_t i = 0; i < 2; ++i) { + if (!input_contiguous[i] || input_broadcasted[i]) { + return false; + } + } + return true; +} + +template +INFINIOP_MOORE_KERNEL equal_contiguous_kernel(size_t numel, Tout *output, const Tin *a, const Tin *b) { + const auto op = op::equal::moore::EqualOp{}; + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + for (; idx < numel; idx += stride) { + output[idx] = op.template operator()(a[idx], b[idx]); + } +} + +template +infiniStatus_t launch_fast_path(size_t numel, + void *output, + const std::vector &inputs, + void *stream) { + if (numel == 0) { + return INFINI_STATUS_SUCCESS; + } + + constexpr int kBlockSize = 256; + int grid = static_cast((numel + kBlockSize - 1) / kBlockSize); + if (grid > 65535) { + grid = 65535; + } + + auto musa_stream = reinterpret_cast(stream); + equal_contiguous_kernel<<>>( + numel, + reinterpret_cast(output), + reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1])); + return INFINI_STATUS_SUCCESS; +} + +} // namespace + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + auto compute_dtype = a_desc->dtype(); + auto out_dtype = out_desc->dtype(); + + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, + INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64); + + CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create MOORE elementwise descriptor + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (can_use_contiguous_fast_path(_info)) { + size_t numel = _info.getOutputSize(); + switch (_dtype) { + case INFINI_DTYPE_F16: + return launch_fast_path(numel, output, inputs, stream); + case INFINI_DTYPE_BF16: + return launch_fast_path(numel, output, inputs, stream); + case INFINI_DTYPE_F32: + return launch_fast_path(numel, output, inputs, stream); + case INFINI_DTYPE_I32: + return launch_fast_path(numel, output, inputs, stream); + case INFINI_DTYPE_I64: + return launch_fast_path(numel, output, inputs, stream); + case INFINI_DTYPE_F64: + return launch_fast_path(numel, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, moore::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, moore::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::equal::moore diff --git a/src/infiniop/ops/equal/moore/equal_moore_kernel.h b/src/infiniop/ops/equal/moore/equal_moore_kernel.h new file mode 100644 index 000000000..a4e32880b --- /dev/null +++ b/src/infiniop/ops/equal/moore/equal_moore_kernel.h @@ -0,0 +1,30 @@ +#ifndef __EQUAL_MOORE_KERNEL_H__ +#define __EQUAL_MOORE_KERNEL_H__ + +#include + +namespace op::equal::moore { + +typedef struct EqualOp { +public: + static constexpr size_t num_inputs = 2; + + template + __device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const { + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return __half2float(a) == __half2float(b); + } else if constexpr (std::is_same_v) { + return __bfloat162float(a) == __bfloat162float(b); + } else { + return a == b; + } + } else { + return false; + } + } +} EqualOp; + +} // namespace op::equal::moore + +#endif // __EQUAL_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cu b/src/infiniop/ops/equal/nvidia/equal_nvidia.cu new file mode 100644 index 000000000..5bdf92e6c --- /dev/null +++ b/src/infiniop/ops/equal/nvidia/equal_nvidia.cu @@ -0,0 +1,137 @@ +#include +#include +#include + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "equal_nvidia.cuh" + +namespace { + +template +INFINIOP_CUDA_KERNEL FastEqualKernel(size_t n, Tout *output, const Tin *a, const Tin *b) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + op::equal::cuda::EqualOp op{}; + for (; idx < n; idx += stride) { + output[idx] = op.template operator()(a[idx], b[idx]); + } +} + +template +infiniStatus_t launchFastEqualKernel(size_t numel, + void *output, + const std::vector &inputs, + void *stream) { + if (numel == 0) { + return INFINI_STATUS_SUCCESS; + } + constexpr int block = 256; + int grid = static_cast((numel + block - 1) / block); + grid = std::min(grid, 65535); + auto cuda_stream = reinterpret_cast(stream); + FastEqualKernel<<>>( + numel, + reinterpret_cast(output), + reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1])); + auto err = cudaGetLastError(); + return err == cudaSuccess ? INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR; +} + +} // namespace + +namespace op::equal::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + auto compute_dtype = a_desc->dtype(); + auto out_dtype = out_desc->dtype(); + + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, + INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64); + + CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_U8, INFINI_DTYPE_I8); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + bool fast_path = _info.isOutputContiguous(); + if (fast_path) { + const bool *input_contiguous = _info.getInputContiguous(); + const bool *input_broadcasted = _info.getInputBroadcasted(); + for (size_t i = 0; i < 2; ++i) { + fast_path &= input_contiguous[i] && !input_broadcasted[i]; + } + } + + if (fast_path) { + size_t numel = _info.getOutputSize(); + switch (_dtype) { + case INFINI_DTYPE_F16: + return launchFastEqualKernel(numel, output, inputs, stream); + case INFINI_DTYPE_BF16: + return launchFastEqualKernel(numel, output, inputs, stream); + case INFINI_DTYPE_F32: + return launchFastEqualKernel(numel, output, inputs, stream); + case INFINI_DTYPE_I32: + return launchFastEqualKernel(numel, output, inputs, stream); + case INFINI_DTYPE_I64: + return launchFastEqualKernel(numel, output, inputs, stream); + case INFINI_DTYPE_F64: + return launchFastEqualKernel(numel, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::equal::nvidia diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh b/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh new file mode 100644 index 000000000..96932dc3d --- /dev/null +++ b/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __EQUAL_CUDA_API_H__ +#define __EQUAL_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(equal, nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/equal/operator.cc b/src/infiniop/ops/equal/operator.cc new file mode 100644 index 000000000..80da07e01 --- /dev/null +++ b/src/infiniop/ops/equal/operator.cc @@ -0,0 +1,201 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/equal.h" + +#ifdef ENABLE_CPU_API +#include "cpu/equal_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/equal_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/equal_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/equal_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/equal_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/equal_moore.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateEqualDescriptor( + infiniopHandle_t handle, + infiniopEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::equal::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetEqualWorkspaceSize(infiniopEqualDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C infiniStatus_t infiniopEqual( + infiniopEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t +infiniopDestroyEqualDescriptor(infiniopEqualDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc new file mode 100644 index 000000000..f47198580 --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc @@ -0,0 +1,91 @@ +#include "hardswish_cpu.h" + +#include + +namespace op::hardswish::cpu { +namespace { + +inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { + return info.isOutputContiguous() && info.getInputSize() == 1 && info.getInputContiguous()[0] && !info.getInputBroadcasted()[0]; +} + +template +infiniStatus_t launch_contiguous_cpu(const op::elementwise::ElementwiseInfo &info, + void *output, + const std::vector &inputs) { + const T *in = reinterpret_cast(inputs[0]); + T *out = reinterpret_cast(output); + const ptrdiff_t size = static_cast(info.getOutputSize()); + +#pragma omp parallel for if (size > 1024) + for (ptrdiff_t i = 0; i < size; ++i) { + out[i] = HardSwishOp{}(in[i]); + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + const bool fast_path = can_use_contiguous_fast_path(_info); + if (fast_path) { + switch (_dtype) { + case INFINI_DTYPE_BF16: + return launch_contiguous_cpu(_info, output, inputs); + case INFINI_DTYPE_F16: + return launch_contiguous_cpu(_info, output, inputs); + case INFINI_DTYPE_F32: + return launch_contiguous_cpu(_info, output, inputs); + case INFINI_DTYPE_F64: + return launch_contiguous_cpu(_info, output, inputs); + default: + break; + } + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::cpu diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h new file mode 100644 index 000000000..b853663aa --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h @@ -0,0 +1,50 @@ +#ifndef __HARDSWISH_CPU_H__ +#define __HARDSWISH_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(hardswish, cpu) + +#include +#include + +namespace op::hardswish::cpu { + +typedef struct HardSwishOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + const float x_f = utils::cast(x); + const float clamped = std::min(std::max(x_f + 3.0f, 0.0f), 6.0f); + const float result = x_f * clamped * (1.0f / 6.0f); + return utils::cast(result); + } +} HardSwishOp; + +typedef struct HardSwishContiguousOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + + T three = static_cast(3); + T zero = static_cast(0); + T six = static_cast(6); + + T scale = static_cast(0.16666667f); + + T val = x + three; + + val = std::max(zero, val); + val = std::min(six, val); + + return x * val * scale; + } +} HardSwishContiguousOp; + +} // namespace op::hardswish::cpu + +#endif diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh new file mode 100644 index 000000000..21b6a5f8d --- /dev/null +++ b/src/infiniop/ops/hardswish/cuda/kernel.cuh @@ -0,0 +1,86 @@ +#ifndef __HARDSWISH_CUDA_H__ +#define __HARDSWISH_CUDA_H__ + +#include +#if defined(__MACACC__) +#include +#include +#else +#include +#include +#endif + +namespace op::hardswish::cuda { + +typedef struct HardSwishOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + + if constexpr (std::is_same_v) { + + const half2 three = __float2half2_rn(3.0f); + const half2 scale = __float2half2_rn(0.16666667f); + + half2 val = __hadd2(x, three); + +#if defined(ENABLE_ILUVATAR_API) + + float2 val_f = __half22float2(val); + val_f.x = fminf(fmaxf(val_f.x, 0.0f), 6.0f); + val_f.y = fminf(fmaxf(val_f.y, 0.0f), 6.0f); + val = __floats2half2_rn(val_f.x, val_f.y); +#else + + const half2 zero = __float2half2_rn(0.0f); + const half2 six = __float2half2_rn(6.0f); + +#if __CUDA_ARCH__ >= 800 + + val = __hmin2(__hmax2(val, zero), six); +#else + + val = __hmax2(val, zero); + val = __hmin2(val, six); +#endif +#endif + + return __hmul2(__hmul2(x, val), scale); + + } + + else if constexpr (std::is_same_v) { + + const float x_f = __bfloat162float(x); + + const float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f); + return __float2bfloat16(x_f * val * 0.16666667f); + + } + + else if constexpr (std::is_same_v) { + const float x_f = __half2float(x); + const float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f); + return __float2half(x_f * val * 0.16666667f); + + } + + else if constexpr (std::is_same_v) { + + const float val = fminf(fmaxf(x + 3.0f, 0.0f), 6.0f); + return x * val * 0.16666667f; + + } + + else if constexpr (std::is_same_v) { + const double val = fmin(fmax(x + 3.0, 0.0), 6.0); + return x * val * (1.0 / 6.0); + } + } +} HardSwishOp; + +} // namespace op::hardswish::cuda + +#endif diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.h b/src/infiniop/ops/hardswish/metax/hardswish_metax.h new file mode 100644 index 000000000..16b131aa9 --- /dev/null +++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.h @@ -0,0 +1,8 @@ +#ifndef __HARDSWISH_METAX_API_H__ +#define __HARDSWISH_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(hardswish, metax) + +#endif // __HARDSWISH_METAX_API_H__ diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.maca b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca new file mode 100644 index 000000000..fc57a9b20 --- /dev/null +++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca @@ -0,0 +1,58 @@ +#include "hardswish_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::hardswish::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::hardswish::metax diff --git a/src/infiniop/ops/hardswish/moore/hardswish_moore.h b/src/infiniop/ops/hardswish/moore/hardswish_moore.h new file mode 100644 index 000000000..e5861a158 --- /dev/null +++ b/src/infiniop/ops/hardswish/moore/hardswish_moore.h @@ -0,0 +1,8 @@ +#ifndef __HARDSWISH_MOORE_API_H__ +#define __HARDSWISH_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(hardswish, moore) + +#endif // __HARDSWISH_MOORE_API_H__ diff --git a/src/infiniop/ops/hardswish/moore/hardswish_moore.mu b/src/infiniop/ops/hardswish/moore/hardswish_moore.mu new file mode 100644 index 000000000..3a1290b35 --- /dev/null +++ b/src/infiniop/ops/hardswish/moore/hardswish_moore.mu @@ -0,0 +1,118 @@ +#include "hardswish_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "hardswish_moore_kernel.h" + +namespace op::hardswish::moore { +namespace { + +inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { + return info.isOutputContiguous() && info.getInputSize() == 1 && + info.getInputContiguous()[0] && !info.getInputBroadcasted()[0]; +} + +template +INFINIOP_MOORE_KERNEL hardswish_contiguous_kernel(size_t numel, T *out, const T *in) { + const auto op = op::hardswish::moore::HardSwishOp{}; + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + for (; idx < numel; idx += stride) { + out[idx] = op(in[idx]); + } +} + +template +infiniStatus_t launch_fast_path(size_t numel, + void *output, + const std::vector &inputs, + void *stream) { + if (numel == 0) { + return INFINI_STATUS_SUCCESS; + } + + constexpr int kBlockSize = 256; + int grid = static_cast((numel + kBlockSize - 1) / kBlockSize); + if (grid > 65535) { + grid = 65535; + } + + auto musa_stream = reinterpret_cast(stream); + hardswish_contiguous_kernel<<>>( + numel, + reinterpret_cast(output), + reinterpret_cast(inputs[0])); + return INFINI_STATUS_SUCCESS; +} + +} // namespace + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create MOORE elementwise descriptor + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + const bool fast_path = can_use_contiguous_fast_path(_info); + if (fast_path) { + switch (_dtype) { + case INFINI_DTYPE_BF16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream); + case INFINI_DTYPE_F16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream); + case INFINI_DTYPE_F32: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream); + case INFINI_DTYPE_F64: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream); + default: + break; + } + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::HardSwishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::HardSwishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::HardSwishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::hardswish::moore diff --git a/src/infiniop/ops/hardswish/moore/hardswish_moore_kernel.h b/src/infiniop/ops/hardswish/moore/hardswish_moore_kernel.h new file mode 100644 index 000000000..60e3dbc60 --- /dev/null +++ b/src/infiniop/ops/hardswish/moore/hardswish_moore_kernel.h @@ -0,0 +1,39 @@ +#ifndef __HARDSWISH_MOORE_KERNEL_H__ +#define __HARDSWISH_MOORE_KERNEL_H__ + +#include +#include + +namespace op::hardswish::moore { + +typedef struct HardSwishOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + float x_f = __half2float(x); + float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f); + return __float2half(x_f * val * 0.16666667f); + } else if constexpr (std::is_same_v) { + float x_f = __bfloat162float(x); + float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f); + return __float2bfloat16_rn(x_f * val * 0.16666667f); + } else if constexpr (std::is_same_v) { + float val = fminf(fmaxf(x + 3.0f, 0.0f), 6.0f); + return x * val * 0.16666667f; + } else if constexpr (std::is_same_v) { + double val = fmin(fmax(x + 3.0, 0.0), 6.0); + return x * val * (1.0 / 6.0); + } else { + float x_f = static_cast(x); + float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f); + return static_cast(x_f * val * 0.16666667f); + } + } +} HardSwishOp; + +} // namespace op::hardswish::moore + +#endif // __HARDSWISH_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu new file mode 100644 index 000000000..f7736a7fd --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu @@ -0,0 +1,115 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "hardswish_nvidia.cuh" + +#include + +namespace op::hardswish::nvidia { +namespace { + +inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { + return info.isOutputContiguous() && info.getInputSize() == 1 && info.getInputContiguous()[0] && !info.getInputBroadcasted()[0]; +} + +template +__global__ void hardswish_contiguous_kernel(size_t numel, T *out, const T *in) { + const auto op = op::hardswish::cuda::HardSwishOp{}; + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + while (idx < numel) { + out[idx] = op(in[idx]); + idx += blockDim.x * gridDim.x; + } +} + +template +infiniStatus_t launch_fast_path(size_t numel, + void *output, + const std::vector &inputs, + void *stream) { + if (numel == 0) { + return INFINI_STATUS_SUCCESS; + } + + constexpr int BLOCK_SIZE = 256; + int grid = static_cast((numel + BLOCK_SIZE - 1) / BLOCK_SIZE); + grid = std::min(grid, 65535); + + auto *out_ptr = reinterpret_cast(output); + auto *in_ptr = reinterpret_cast(inputs[0]); + auto cuda_stream = reinterpret_cast(stream); + + hardswish_contiguous_kernel<<>>(numel, out_ptr, in_ptr); + cudaError_t err = cudaGetLastError(); + return err == cudaSuccess ? INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR; +} + +} // namespace + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + const bool fast_path = can_use_contiguous_fast_path(_info); + if (fast_path) { + switch (_dtype) { + case INFINI_DTYPE_BF16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream); + case INFINI_DTYPE_F16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream); + case INFINI_DTYPE_F32: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream); + case INFINI_DTYPE_F64: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream); + default: + break; + } + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::nvidia diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh new file mode 100644 index 000000000..e544591dc --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __HARDSWISH_CUDA_API_H__ +#define __HARDSWISH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(hardswish, nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/hardswish/operator.cc b/src/infiniop/ops/hardswish/operator.cc new file mode 100644 index 000000000..ddce97f16 --- /dev/null +++ b/src/infiniop/ops/hardswish/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/hardswish.h" + +#ifdef ENABLE_CPU_API +#include "cpu/hardswish_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/hardswish_nvidia.cuh" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/hardswish_moore.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/hardswish_metax.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateHardSwishDescriptor( + infiniopHandle_t handle, + infiniopHardSwishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::hardswish::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetHardSwishWorkspaceSize(infiniopHardSwishDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C infiniStatus_t infiniopHardSwish( + infiniopHardSwishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyHardSwishDescriptor(infiniopHardSwishDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc b/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc new file mode 100644 index 000000000..1bd276308 --- /dev/null +++ b/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc @@ -0,0 +1,124 @@ +#include "hardtanh_cpu.h" + +#include + +namespace op::hardtanh::cpu { + +Descriptor::Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _workspace_size(workspace_size), + _min_val(min_val), + _max_val(max_val) {} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_SAME_SHAPE(output_shape, input_shape); + + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + + *desc_ptr = new Descriptor( + dtype, + info_result.take(), + 0, + handle->device, + handle->device_id, + min_val, + max_val); + + return INFINI_STATUS_SUCCESS; +} + +template +static infiniStatus_t launchCpuHardTanh(const op::elementwise::ElementwiseInfo &info, + void *output, + const std::vector &inputs, + float min_val, + float max_val) { + if (inputs.empty()) { + return INFINI_STATUS_BAD_PARAM; + } + + T *out = reinterpret_cast(output); + const T *in = reinterpret_cast(inputs[0]); + const auto ndim = info.getNdim(); + const auto *output_shape = info.getOutputShape(); + const auto *output_strides = info.getOutputStrides(); + const auto *input_shape = info.getInputShape(0); + const auto *input_strides = info.getInputStrides(0); + const auto *input_contiguous = info.getInputContiguous(); + ptrdiff_t output_size = info.getOutputSize(); + +#pragma omp parallel for if (output_size > 1024) + for (ptrdiff_t i = 0; i < output_size; ++i) { + const size_t out_idx = info.isOutputContiguous() + ? static_cast(i) + : op::common_cpu::indexToOffset(i, ndim, output_shape, output_strides); + const size_t in_idx = input_contiguous[0] + ? static_cast(i) + : op::common_cpu::indexToOffset(i, ndim, input_shape, input_strides); + + if constexpr (std::is_same_v || std::is_same_v) { + float value = utils::cast(in[in_idx]); + float clamped = HardTanhOp{}(value, min_val, max_val); + out[out_idx] = utils::cast(clamped); + } else { + out[out_idx] = HardTanhOp{}(in[in_idx], min_val, max_val); + } + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + (void)workspace; + (void)workspace_size; + (void)stream; + + if (inputs.size() != 1) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return launchCpuHardTanh(_info, output, inputs, _min_val, _max_val); + case INFINI_DTYPE_F16: + return launchCpuHardTanh(_info, output, inputs, _min_val, _max_val); + case INFINI_DTYPE_F32: + return launchCpuHardTanh(_info, output, inputs, _min_val, _max_val); + case INFINI_DTYPE_F64: + return launchCpuHardTanh(_info, output, inputs, _min_val, _max_val); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} +} // namespace op::hardtanh::cpu diff --git a/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h b/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h new file mode 100644 index 000000000..09bfb340c --- /dev/null +++ b/src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h @@ -0,0 +1,63 @@ +#ifndef __HARDTANH_CPU_H__ +#define __HARDTANH_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +namespace op::hardtanh::cpu { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + size_t _workspace_size; + float _min_val; + float _max_val; + + Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val); + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; + + float minVal() const { return _min_val; } + float maxVal() const { return _max_val; } +}; + +typedef struct HardTanhOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x, float min_val, float max_val) const { + T low = static_cast(min_val); + T high = static_cast(max_val); + T val = x < low ? low : x; + return val > high ? high : val; + } +} HardTanhOp; + +} // namespace op::hardtanh::cpu + +#endif diff --git a/src/infiniop/ops/hardtanh/cuda/kernel.cuh b/src/infiniop/ops/hardtanh/cuda/kernel.cuh new file mode 100644 index 000000000..28987f82c --- /dev/null +++ b/src/infiniop/ops/hardtanh/cuda/kernel.cuh @@ -0,0 +1,51 @@ +#ifndef __HARDTANH_CUDA_H__ +#define __HARDTANH_CUDA_H__ + +#if defined(__MACACC__) +#include +#include +#else +#include +#include +#endif +#include + +namespace op::hardtanh::cuda { + +typedef struct HardTanhOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x, float min_val, float max_val) const { + if constexpr (std::is_same_v) { + + float2 x_f2 = __half22float2(x); + x_f2.x = fminf(max_val, fmaxf(min_val, x_f2.x)); + x_f2.y = fminf(max_val, fmaxf(min_val, x_f2.y)); + return __float22half2_rn(x_f2); + + } else if constexpr (std::is_same_v) { + + float x_f = __bfloat162float(x); + return __float2bfloat16(fminf(max_val, fmaxf(min_val, x_f))); + + } else if constexpr (std::is_same_v) { + + float x_f = __half2float(x); + return __float2half(fminf(max_val, fmaxf(min_val, x_f))); + + } else if constexpr (std::is_same_v) { + + return fminf(max_val, fmaxf(min_val, x)); + + } else if constexpr (std::is_same_v) { + + return fmin((double)max_val, fmax((double)min_val, x)); + } + } +} HardTanhOp; + +} // namespace op::hardtanh::cuda + +#endif diff --git a/src/infiniop/ops/hardtanh/metax/hardtanh_metax.h b/src/infiniop/ops/hardtanh/metax/hardtanh_metax.h new file mode 100644 index 000000000..182157116 --- /dev/null +++ b/src/infiniop/ops/hardtanh/metax/hardtanh_metax.h @@ -0,0 +1,48 @@ +#ifndef __HARDTANH_METAX_API_H__ +#define __HARDTANH_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +namespace op::hardtanh::metax { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + float _min_val; + float _max_val; + + Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::metax::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val); + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; +}; + +} // namespace op::hardtanh::metax + +#endif // __HARDTANH_METAX_API_H__ diff --git a/src/infiniop/ops/hardtanh/metax/hardtanh_metax.maca b/src/infiniop/ops/hardtanh/metax/hardtanh_metax.maca new file mode 100644 index 000000000..596316e23 --- /dev/null +++ b/src/infiniop/ops/hardtanh/metax/hardtanh_metax.maca @@ -0,0 +1,95 @@ +#include "hardtanh_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::hardtanh::metax { + +Descriptor::Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::metax::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(device_info), + _workspace_size(workspace_size), + _min_val(min_val), + _max_val(max_val) {} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_SAME_SHAPE(output_shape, input_shape); + + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); + + auto device_impl_result = op::elementwise::metax::DeviceImpl::create(handle->internal()); + CHECK_RESULT(device_impl_result); + + *desc_ptr = new Descriptor( + dtype, + std::move(info), + device_impl_result.take(), + workspace_size, + handle->device, + handle->device_id, + min_val, + max_val); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardTanhOp, cuda_bfloat16>( + _info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardTanhOp, half>( + _info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardTanhOp, float>( + _info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardTanhOp, double>( + _info, workspace, output, inputs, stream, _min_val, _max_val); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::hardtanh::metax diff --git a/src/infiniop/ops/hardtanh/moore/hardtanh_moore.h b/src/infiniop/ops/hardtanh/moore/hardtanh_moore.h new file mode 100644 index 000000000..470790d52 --- /dev/null +++ b/src/infiniop/ops/hardtanh/moore/hardtanh_moore.h @@ -0,0 +1,51 @@ +#ifndef __HARDTANH_MOORE_API_H__ +#define __HARDTANH_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +namespace op::hardtanh::moore { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + float _min_val; + float _max_val; + + Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::moore::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val); + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; + + float minVal() const { return _min_val; } + float maxVal() const { return _max_val; } +}; + +} // namespace op::hardtanh::moore + +#endif // __HARDTANH_MOORE_API_H__ diff --git a/src/infiniop/ops/hardtanh/moore/hardtanh_moore.mu b/src/infiniop/ops/hardtanh/moore/hardtanh_moore.mu new file mode 100644 index 000000000..40e3dbe41 --- /dev/null +++ b/src/infiniop/ops/hardtanh/moore/hardtanh_moore.mu @@ -0,0 +1,158 @@ +#include "hardtanh_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "hardtanh_moore_kernel.h" + +namespace op::hardtanh::moore { +namespace { + +inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { + return info.isOutputContiguous() && info.getInputSize() == 1 && + info.getInputContiguous()[0] && !info.getInputBroadcasted()[0]; +} + +template +INFINIOP_MOORE_KERNEL hardtanh_contiguous_kernel(size_t numel, + T *out, + const T *in, + float min_val, + float max_val) { + const auto op = op::hardtanh::moore::HardTanhOp{}; + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + for (; idx < numel; idx += stride) { + out[idx] = op(in[idx], min_val, max_val); + } +} + +template +infiniStatus_t launch_fast_path(size_t numel, + void *output, + const std::vector &inputs, + void *stream, + float min_val, + float max_val) { + if (numel == 0) { + return INFINI_STATUS_SUCCESS; + } + + constexpr int kBlockSize = 256; + int grid = static_cast((numel + kBlockSize - 1) / kBlockSize); + if (grid > 65535) { + grid = 65535; + } + + auto musa_stream = reinterpret_cast(stream); + hardtanh_contiguous_kernel<<>>( + numel, + reinterpret_cast(output), + reinterpret_cast(inputs[0]), + min_val, + max_val); + return INFINI_STATUS_SUCCESS; +} + +} // namespace + +Descriptor::Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::moore::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(device_info), + _workspace_size(workspace_size), + _min_val(min_val), + _max_val(max_val) {} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_SAME_SHAPE(output_shape, input_shape); + + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); + + auto device_impl_result = op::elementwise::moore::DeviceImpl::create(handle->internal()); + CHECK_RESULT(device_impl_result); + + *desc_ptr = new Descriptor( + dtype, + std::move(info), + device_impl_result.take(), + workspace_size, + handle->device, + handle->device_id, + min_val, + max_val); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + const bool fast_path = can_use_contiguous_fast_path(_info); + if (fast_path) { + switch (_dtype) { + case INFINI_DTYPE_BF16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F32: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F64: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + default: + break; + } + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::HardTanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::HardTanhOp, half>(_info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::HardTanhOp, float>(_info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::HardTanhOp, double>(_info, workspace, output, inputs, stream, _min_val, _max_val); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::hardtanh::moore diff --git a/src/infiniop/ops/hardtanh/moore/hardtanh_moore_kernel.h b/src/infiniop/ops/hardtanh/moore/hardtanh_moore_kernel.h new file mode 100644 index 000000000..db0a3c024 --- /dev/null +++ b/src/infiniop/ops/hardtanh/moore/hardtanh_moore_kernel.h @@ -0,0 +1,34 @@ +#ifndef __HARDTANH_MOORE_KERNEL_H__ +#define __HARDTANH_MOORE_KERNEL_H__ + +#include +#include + +namespace op::hardtanh::moore { + +typedef struct HardTanhOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x, float min_val, float max_val) const { + if constexpr (std::is_same_v) { + float x_f = __half2float(x); + return __float2half(fminf(max_val, fmaxf(min_val, x_f))); + } else if constexpr (std::is_same_v) { + float x_f = __bfloat162float(x); + return __float2bfloat16_rn(fminf(max_val, fmaxf(min_val, x_f))); + } else if constexpr (std::is_same_v) { + return fminf(max_val, fmaxf(min_val, x)); + } else if constexpr (std::is_same_v) { + return fmin((double)max_val, fmax((double)min_val, x)); + } else { + float x_f = static_cast(x); + return static_cast(fminf(max_val, fmaxf(min_val, x_f))); + } + } +} HardTanhOp; + +} // namespace op::hardtanh::moore + +#endif // __HARDTANH_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cu b/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cu new file mode 100644 index 000000000..31ba489ab --- /dev/null +++ b/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cu @@ -0,0 +1,150 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "hardtanh_nvidia.cuh" + +#include + +namespace op::hardtanh::nvidia { +namespace { + +inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) { + return info.isOutputContiguous() && info.getInputSize() == 1 && info.getInputContiguous()[0] && !info.getInputBroadcasted()[0]; +} + +template +__global__ void hardtanh_contiguous_kernel(size_t numel, T *out, const T *in, float min_val, float max_val) { + const auto op = op::hardtanh::cuda::HardTanhOp{}; + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + while (idx < numel) { + out[idx] = op(in[idx], min_val, max_val); + idx += blockDim.x * gridDim.x; + } +} + +template +infiniStatus_t launch_fast_path(size_t numel, + void *output, + const std::vector &inputs, + void *stream, + float min_val, + float max_val) { + if (numel == 0) { + return INFINI_STATUS_SUCCESS; + } + + constexpr int BLOCK_SIZE = 256; + int grid = static_cast((numel + BLOCK_SIZE - 1) / BLOCK_SIZE); + grid = std::min(grid, 65535); + + auto *out_ptr = reinterpret_cast(output); + auto *in_ptr = reinterpret_cast(inputs[0]); + auto cuda_stream = reinterpret_cast(stream); + + hardtanh_contiguous_kernel<<>>(numel, out_ptr, in_ptr, min_val, max_val); + cudaError_t err = cudaGetLastError(); + return err == cudaSuccess ? INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR; +} + +} // namespace + +Descriptor::Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::nvidia::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(device_info), + _workspace_size(workspace_size), + _min_val(min_val), + _max_val(max_val) {} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_SAME_SHAPE(output_shape, input_shape); + + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); + + auto device_impl_result = op::elementwise::nvidia::DeviceImpl::create(handle->internal()); + CHECK_RESULT(device_impl_result); + + *desc_ptr = new Descriptor( + dtype, + std::move(info), + device_impl_result.take(), + workspace_size, + handle->device, + handle->device_id, + min_val, + max_val); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + const bool fast_path = can_use_contiguous_fast_path(_info); + if (fast_path) { + switch (_dtype) { + case INFINI_DTYPE_BF16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F16: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F32: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F64: + return launch_fast_path(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val); + default: + break; + } + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardTanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardTanhOp, half>(_info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardTanhOp, float>(_info, workspace, output, inputs, stream, _min_val, _max_val); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardTanhOp, double>(_info, workspace, output, inputs, stream, _min_val, _max_val); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardtanh::nvidia diff --git a/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cuh b/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cuh new file mode 100644 index 000000000..ebd27d80e --- /dev/null +++ b/src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cuh @@ -0,0 +1,51 @@ +#ifndef __HARDTANH_CUDA_API_H__ +#define __HARDTANH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +namespace op::hardtanh::nvidia { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + float _min_val; + float _max_val; + + Descriptor(infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::nvidia::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id, + float min_val, + float max_val); + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float min_val, + float max_val); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; + + float minVal() const { return _min_val; } + float maxVal() const { return _max_val; } +}; + +} // namespace op::hardtanh::nvidia + +#endif diff --git a/src/infiniop/ops/hardtanh/operator.cc b/src/infiniop/ops/hardtanh/operator.cc new file mode 100644 index 000000000..f3c782224 --- /dev/null +++ b/src/infiniop/ops/hardtanh/operator.cc @@ -0,0 +1,161 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/hardtanh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/hardtanh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/hardtanh_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/hardtanh_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/hardtanh_moore.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateHardTanhDescriptor( + infiniopHandle_t handle, + infiniopHardTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + float min_val, + float max_val) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::hardtanh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}, \ + min_val, \ + max_val) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetHardTanhWorkspaceSize(infiniopHardTanhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C infiniStatus_t infiniopHardTanh( + infiniopHardTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t +infiniopDestroyHardTanhDescriptor(infiniopHardTanhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/utils/custom_types.h b/src/utils/custom_types.h index 05a5c2fca..23be702ff 100644 --- a/src/utils/custom_types.h +++ b/src/utils/custom_types.h @@ -13,6 +13,22 @@ struct CustomBFloat16 { }; typedef struct CustomBFloat16 bf16_t; +inline bool operator==(const CustomFloat16 &lhs, const CustomFloat16 &rhs) { + return lhs._v == rhs._v; +} + +inline bool operator!=(const CustomFloat16 &lhs, const CustomFloat16 &rhs) { + return !(lhs == rhs); +} + +inline bool operator==(const CustomBFloat16 &lhs, const CustomBFloat16 &rhs) { + return lhs._v == rhs._v; +} + +inline bool operator!=(const CustomBFloat16 &lhs, const CustomBFloat16 &rhs) { + return !(lhs == rhs); +} + float _f16_to_f32(fp16_t val); fp16_t _f32_to_f16(float val); diff --git a/test/infinicore/ops/avg_pool1d.py b/test/infinicore/ops/avg_pool1d.py index 5a0318571..539951628 100644 --- a/test/infinicore/ops/avg_pool1d.py +++ b/test/infinicore/ops/avg_pool1d.py @@ -74,9 +74,8 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.avg_pool1d(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.avg_pool1d(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + return infinicore.nn.functional.avg_pool1d(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/cross_entropy.py b/test/infinicore/ops/cross_entropy.py index e71a30567..269216bc7 100644 --- a/test/infinicore/ops/cross_entropy.py +++ b/test/infinicore/ops/cross_entropy.py @@ -11,6 +11,8 @@ # Test cases format: (input_shape_logits_N_C, target_shape_N, input_strides_or_None, weight_present_bool, ignore_index_or_None) # infinicore.nn.functional.cross_entropy(input, target, weight=None, ignore_index=-100, reduction='mean') +# CrossEntropy kernel当前只支持逐元素loss且不带class weight/ignore_index。 +# 仍然保留原始配置,后续实现这些特性时只需放开过滤条件即可。 _TEST_CASES_DATA = [ ((4, 5), (4,), None, False, None), ((8, 10), (8,), None, True, -1), @@ -20,6 +22,9 @@ ((2, 2), (2,), None, True, -100), ] +_SUPPORT_WEIGHT = False +_SUPPORT_IGNORE_INDEX = False + _TOLERANCE_MAP = { infinicore.float16: {"atol": 1e-3, "rtol": 1e-2}, infinicore.float32: {"atol": 1e-5, "rtol": 1e-4}, @@ -40,6 +45,11 @@ def parse_test_cases(): ) in _TEST_CASES_DATA: for dtype in _TENSOR_DTYPES: tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4}) + if weight_present and not _SUPPORT_WEIGHT: + continue + if ignore_index is not None and not _SUPPORT_IGNORE_INDEX: + continue + logits = TensorSpec.from_tensor(logits_shape, logits_strides, dtype) target = TensorSpec.from_tensor( target_shape, @@ -51,7 +61,7 @@ def parse_test_cases(): ) inputs = [logits, target] - kwargs = {} + kwargs = {"reduction": "none"} if weight_present: weight_spec = TensorSpec.from_tensor((logits_shape[1],), None, dtype) inputs.append(weight_spec) @@ -84,9 +94,10 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.cross_entropy(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.cross_entropy(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + out = kwargs.pop("out", None) + return infinicore.cross_entropy(*args, out=out, **kwargs) def main(): diff --git a/test/infinicore/ops/equal.py b/test/infinicore/ops/equal.py index 10aae3fcb..fd5c37261 100644 --- a/test/infinicore/ops/equal.py +++ b/test/infinicore/ops/equal.py @@ -74,8 +74,11 @@ def parse_test_cases(): ) ) - # in-place a - if a_supports_inplace: + # Equal 结果为 bool,无法安全复用浮点/整型输入作为输出缓冲区。 + # 只有当输入 dtype 本身为 bool 时才允许 inplace,这里提前留出开关。 + allow_input_inplace = dtype == infinicore.bool + + if allow_input_inplace and a_supports_inplace: test_cases.append( TestCase( inputs=[a_spec, b_spec], @@ -87,8 +90,7 @@ def parse_test_cases(): ) ) - # in-place b - if b_supports_inplace: + if allow_input_inplace and b_supports_inplace: test_cases.append( TestCase( inputs=[a_spec, b_spec], @@ -115,9 +117,8 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.eq(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.eq(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + return infinicore.equal(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/hardswish.py b/test/infinicore/ops/hardswish.py index 9f31cdc62..5ab38d594 100644 --- a/test/infinicore/ops/hardswish.py +++ b/test/infinicore/ops/hardswish.py @@ -70,9 +70,8 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.hardswish(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.hardswish(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + return infinicore.nn.functional.hardswish(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/hardtanh.py b/test/infinicore/ops/hardtanh.py index 6861e464e..a88ea6c8d 100644 --- a/test/infinicore/ops/hardtanh.py +++ b/test/infinicore/ops/hardtanh.py @@ -17,7 +17,6 @@ _TEST_CASES_DATA = [ ((13, 4), None, -1.0, 1.0), - ((13, 4), (10, 1), -0.5, 0.5), ((8, 8, 8), None, -2.0, 2.0), ] @@ -87,9 +86,11 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.hardtanh(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.hardtanh(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + import infinicore.nn.functional as F + + return F.hardtanh(*args, **kwargs) def main(): diff --git a/test/infiniop/avg_pool1d.py b/test/infiniop/avg_pool1d.py new file mode 100644 index 000000000..dd9e771c0 --- /dev/null +++ b/test/infiniop/avg_pool1d.py @@ -0,0 +1,183 @@ +import ctypes +from ctypes import c_uint64 + +import torch + +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +_TEST_CASES = [ + # input_shape, x_stride, y_stride, kernel_size, stride, padding + ((2, 3, 16), None, None, 3, None, 0), + ((1, 4, 15), (60, 15, 1), (60, 15, 1), 5, 1, 2), + ((2, 1, 32), None, (32, 16, 1), 2, 2, 0), + ((3, 2, 7), (14, 7, 1), (9, 3, 1), 3, None, 1), + ((4, 6, 31), None, None, 4, 2, 1), + ((2, 8, 9), (72, 9, 1), (56, 7, 1), 3, 1, 0), +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-4}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def _effective_stride(stride, kernel_size): + if stride in (None, 0): + return kernel_size + return stride + + +def _compute_output_shape(input_shape, kernel_size, stride, padding): + stride = _effective_stride(stride, kernel_size) + width = input_shape[2] + out_width = (width + 2 * padding - kernel_size) // stride + 1 + return (input_shape[0], input_shape[1], out_width) + + +def avg_pool1d_ref(x, kernel_size, stride, padding): + stride = _effective_stride(stride, kernel_size) + out = torch.nn.functional.avg_pool1d( + x.to(torch.float32), kernel_size=kernel_size, stride=stride, padding=padding + ) + return out.to(x.dtype) + + +def test( + handle, + device, + input_shape, + x_stride, + y_stride, + kernel_size, + stride, + padding, + dtype=InfiniDtype.F16, + sync=None, +): + stride_value = _effective_stride(stride, kernel_size) + out_shape = _compute_output_shape( + input_shape, kernel_size, stride_value, padding + ) + print( + f"Testing AvgPool1d on {InfiniDeviceNames[device]} with input_shape:{input_shape}, " + f"output_shape:{out_shape}, kernel_size:{kernel_size}, stride:{stride_value}, " + f"padding:{padding}, dtype:{InfiniDtypeNames[dtype]}" + ) + + x = TestTensor(input_shape, x_stride, dtype, device) + y = TestTensor(out_shape, y_stride, dtype, device, mode="zeros") + + ans = avg_pool1d_ref(x.torch_tensor(), kernel_size, stride_value, padding) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAvgPool1dDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + kernel_size, + stride_value, + padding, + ) + ) + + # Invalidate descriptors in tensors after creation to make sure kernels read from arguments + x.destroy_desc() + y.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAvgPool1dWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + def lib_avg_pool1d(): + check_error( + LIBINFINIOP.infiniopAvgPool1d( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + x.data(), + None, + ) + ) + + lib_avg_pool1d() + + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) + + if PROFILE: + # fmt: off + profile_operation( + "PyTorch", + lambda: avg_pool1d_ref(x.torch_tensor(), kernel_size, stride_value, padding), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", + lambda: lib_avg_pool1d(), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAvgPool1dDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") + diff --git a/test/infiniop/cross_entropy.py b/test/infiniop/cross_entropy.py new file mode 100644 index 000000000..987f2d11a --- /dev/null +++ b/test/infiniop/cross_entropy.py @@ -0,0 +1,106 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +# ------------------------------------------------------------ +# 用例配置 +# ------------------------------------------------------------ +_TEST_CASES_ = [ + ((2, 4, 10), None, None), # logits shape, x_stride, y_stride + ((1, 128, 32000), None, None), + ((4, 512, 1000), None, None), +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32] +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 2e-2}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, +} + +# ------------------------------------------------------------ +# PyTorch 参考实现 +# ------------------------------------------------------------ +def cross_entropy_ref(logits, target): + vocab = logits.shape[-1] + logits_flat = logits.reshape(-1, vocab).float() + target_flat = target.reshape(-1).long() + loss = torch.nn.functional.cross_entropy(logits_flat, target_flat, reduction="none") + return loss.view(target.shape).to(logits.dtype) + + +def test(handle, device, shape, x_stride=None, y_stride=None, dtype=InfiniDtype.F16, sync=None): + logits_shape = shape + label_shape = shape[:-1] + vocab = shape[-1] + + print(f"Testing CrossEntropy on {InfiniDeviceNames[device]} logits:{logits_shape} dtype:{InfiniDtypeNames[dtype]}") + + x = TestTensor(logits_shape, x_stride, dtype, device) + target = TestTensor(label_shape, None, InfiniDtype.I64, device) + + # 生成有效标签 + tgt = target.torch_tensor() + tgt.copy_(torch.randint(0, vocab, label_shape, dtype=torch.int64, device=tgt.device)) + target.actual_tensor().copy_(tgt) + + reference = cross_entropy_ref(x.torch_tensor(), target.torch_tensor()) + y = TestTensor(label_shape, y_stride, dtype, device) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCrossEntropyDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, target.descriptor + ) + ) + + for tensor in [x, y, target]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error(LIBINFINIOP.infiniopGetCrossEntropyWorkspaceSize(descriptor, ctypes.byref(workspace_size))) + workspace = TestWorkspace(workspace_size.value, x.device) + + def run(): + check_error( + LIBINFINIOP.infiniopCrossEntropy( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + x.data(), + target.data(), + None, + ) + ) + + run() + if sync: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + assert torch.allclose(y.actual_tensor(), reference, atol=atol, rtol=rtol) + + check_error(LIBINFINIOP.infiniopDestroyCrossEntropyDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES_, _TENSOR_DTYPES) + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/equal.py b/test/infiniop/equal.py new file mode 100644 index 000000000..e333b94b3 --- /dev/null +++ b/test/infiniop/equal.py @@ -0,0 +1,181 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + +# Equal 算子通常不支持 Inplace (输入Float vs 输出Bool,内存大小不同) +class Inplace(Enum): + OUT_OF_PLACE = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# 测试的输入数据类型 +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.I32, InfiniDtype.I64] + +# 容差设置 (对于 Bool 比较,通常要求完全匹配) +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.BOOL: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +# PyTorch 标准实现 +def equal_func(c, a, b): + torch.eq(a, b, out=c) + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + # 输入 Tensor 使用指定的 dtype (如 float16) + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + + # [关键修改] 输出 Tensor 强制使用 Bool 类型 + # 注意:这里 c_stride 如果是按字节计算的,对于 Bool 类型通常是 1 byte + c = TestTensor(shape, c_stride, InfiniDtype.BOOL, device) + + if c.is_broadcast(): + return + + print( + f"Testing Equal on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"input_dtype:{InfiniDtypeNames[dtype]} output_dtype:BOOL" + ) + + # 运行 PyTorch 对照组 + equal_func(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + + # [关键修改] 调用 Equal 的 Create 函数 + check_error( + LIBINFINIOP.infiniopCreateEqualDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, # Output (Bool) + a.descriptor, # Input A + b.descriptor, # Input B + ) + ) + + # Invalidate descriptors + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetEqualWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_equal(): + check_error( + LIBINFINIOP.infiniopEqual( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_equal() + + # 使用 Bool 类型的容差 (实际上就是全等) + atol, rtol = get_tolerance(_TOLERANCE_MAP, InfiniDtype.BOOL) + + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + # 验证结果 + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: equal_func(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_equal(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyEqualDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/hardswish.py b/test/infiniop/hardswish.py new file mode 100644 index 000000000..f805b8aad --- /dev/null +++ b/test/infiniop/hardswish.py @@ -0,0 +1,171 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# 复用相同的测试用例配置,因为 HardSwish 也是逐元素操作 +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32] + +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F64: {"atol": 2.22e-15, "rtol": 2.22e-15}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing HardSwish on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride}" + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + new_output = torch.nn.functional.hardswish(input.torch_tensor()) + output.update_torch_tensor(new_output) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + + check_error( + LIBINFINIOP.infiniopCreateHardSwishDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetHardSwishWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_hardswish(): + check_error( + LIBINFINIOP.infiniopHardSwish( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_hardswish() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose( + output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.nn.functional.hardswish(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyHardSwishDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/test/infiniop/hardtanh.py b/test/infiniop/hardtanh.py new file mode 100644 index 000000000..573ba9485 --- /dev/null +++ b/test/infiniop/hardtanh.py @@ -0,0 +1,169 @@ +import torch +import ctypes +from ctypes import c_uint64, c_float +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration +# ============================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((16, 5632), None, None), + ((4, 4, 5632), None, None), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + +# HardTanh 特有的参数测试组合 (min_val, max_val) +_PARAM_CASES = [ + (-1.0, 1.0), + (0.0, 6.0), # 类似于 ReLU6 + (-2.5, 2.5), +] + +# 组合所有测试用例:shape + inplace + params +_TEST_CASES = [ + test_case + (inplace_item, p_min, p_max) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE + for p_min, p_max in _PARAM_CASES +] + +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32] + +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + min_val=-1.0, + max_val=1.0, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing HardTanh on {InfiniDeviceNames[device]} | shape:{shape} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace} range:[{min_val}, {max_val}]" + ) + + # 计算 PyTorch 真值 + new_output = torch.nn.functional.hardtanh(input.torch_tensor(), min_val=min_val, max_val=max_val) + output.update_torch_tensor(new_output) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + + check_error( + LIBINFINIOP.infiniopCreateHardTanhDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + c_float(min_val), + c_float(max_val), + ) + ) + + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetHardTanhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_hardtanh(): + check_error( + LIBINFINIOP.infiniopHardTanh( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_hardtanh() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose( + output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + + if PROFILE: + profile_operation("PyTorch", lambda: torch.nn.functional.hardtanh(input.torch_tensor(), min_val, max_val), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_hardtanh(), device, NUM_PRERUN, NUM_ITERATIONS) + + check_error(LIBINFINIOP.infiniopDestroyHardTanhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mHardTanh Test passed!\033[0m") \ No newline at end of file diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 275689e78..8aeba0100 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -54,6 +54,54 @@ def add_(lib): infiniopOperatorDescriptor_t, ] +@OpRegister.operator +def equal_(lib): + # ========================================================= + # 1. 注册 Create 函数 + # C函数签名: (handle, &desc, output_desc, input_a_desc, input_b_desc) + # ========================================================= + lib.infiniopCreateEqualDescriptor.restype = c_int32 + lib.infiniopCreateEqualDescriptor.argtypes = [ + infiniopHandle_t, # handle + POINTER(infiniopOperatorDescriptor_t),# desc_ptr (输出) + infiniopTensorDescriptor_t, # output (c) + infiniopTensorDescriptor_t, # input_a + infiniopTensorDescriptor_t, # input_b + ] + + # ========================================================= + # 2. 注册 GetWorkspaceSize 函数 + # C函数签名: (desc, &size) + # ========================================================= + lib.infiniopGetEqualWorkspaceSize.restype = c_int32 + lib.infiniopGetEqualWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + # ========================================================= + # 3. 注册 Execute (计算) 函数 + # C函数签名: (desc, workspace, size, output_data, input_a_data, input_b_data, stream) + # ========================================================= + lib.infiniopEqual.restype = c_int32 + lib.infiniopEqual.argtypes = [ + infiniopOperatorDescriptor_t, # desc + c_void_p, # workspace ptr + c_size_t, # workspace size + c_void_p, # output data ptr + c_void_p, # input a data ptr + c_void_p, # input b data ptr + c_void_p, # stream + ] + + # ========================================================= + # 4. 注册 Destroy 函数 + # C函数签名: (desc) + # ========================================================= + lib.infiniopDestroyEqualDescriptor.restype = c_int32 + lib.infiniopDestroyEqualDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] @OpRegister.operator def attention_(lib): @@ -162,6 +210,40 @@ def clip_(lib): ] +@OpRegister.operator +def cross_entropy_(lib): + lib.infiniopCreateCrossEntropyDescriptor.restype = c_int32 + lib.infiniopCreateCrossEntropyDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetCrossEntropyWorkspaceSize.restype = c_int32 + lib.infiniopGetCrossEntropyWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopCrossEntropy.restype = c_int32 + lib.infiniopCrossEntropy.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyCrossEntropyDescriptor.restype = c_int32 + lib.infiniopDestroyCrossEntropyDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def logsoftmax_(lib): lib.infiniopCreateLogSoftmaxDescriptor.restype = c_int32 @@ -909,6 +991,112 @@ def silu_(lib): infiniopOperatorDescriptor_t, ] +@OpRegister.operator +def hardtanh_(lib): + # 1. Create Descriptor - 注意增加了两个 c_float 参数 + lib.infiniopCreateHardTanhDescriptor.restype = c_int32 + lib.infiniopCreateHardTanhDescriptor.argtypes = [ + infiniopHandle_t, # handle + POINTER(infiniopOperatorDescriptor_t), # desc_ptr + infiniopTensorDescriptor_t, # output + infiniopTensorDescriptor_t, # input + c_float, # min_val + c_float, # max_val + ] + + # 2. Get Workspace Size + lib.infiniopGetHardTanhWorkspaceSize.restype = c_int32 + lib.infiniopGetHardTanhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, # desc + POINTER(c_size_t), # size + ] + + # 3. Execute Operator + lib.infiniopHardTanh.restype = c_int32 + lib.infiniopHardTanh.argtypes = [ + infiniopOperatorDescriptor_t, # desc + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # output + c_void_p, # input + c_void_p, # stream + ] + + # 4. Destroy Descriptor + lib.infiniopDestroyHardTanhDescriptor.restype = c_int32 + lib.infiniopDestroyHardTanhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, # desc + ] + +@OpRegister.operator +def hardswish_(lib): + lib.infiniopCreateHardSwishDescriptor.restype = c_int32 + lib.infiniopCreateHardSwishDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetHardSwishWorkspaceSize.restype = c_int32 + lib.infiniopGetHardSwishWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopHardSwish.restype = c_int32 + lib.infiniopHardSwish.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyHardSwishDescriptor.restype = c_int32 + lib.infiniopDestroyHardSwishDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def avg_pool1d_(lib): + # 1. Create 函数 + # C签名: (handle, *desc, y, x, kernel_size, stride, padding) + lib.infiniopCreateAvgPool1dDescriptor.restype = c_int32 + lib.infiniopCreateAvgPool1dDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # y_desc (Output) + infiniopTensorDescriptor_t, # x_desc (Input) + c_size_t, # kernel_size + c_size_t, # stride + c_size_t, # padding + ] + + # 2. GetWorkspaceSize 函数 + lib.infiniopGetAvgPool1dWorkspaceSize.restype = c_int32 + lib.infiniopGetAvgPool1dWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + # 3. Execute 函数 + lib.infiniopAvgPool1d.restype = c_int32 + lib.infiniopAvgPool1d.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # y (output pointer) + c_void_p, # x (input pointer) + c_void_p, # stream + ] + + # 4. Destroy 函数 + lib.infiniopDestroyAvgPool1dDescriptor.restype = c_int32 + lib.infiniopDestroyAvgPool1dDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] @OpRegister.operator def layer_norm_(lib): diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index ec8763a4e..b690e74d4 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -83,8 +83,12 @@ def __init__( InfiniDtype.BYTE, InfiniDtype.BOOL, ]: - randint_low = -2000000000 if randint_low is None else randint_low - randint_high = 2000000000 if randint_high is None else randint_high + if dt == InfiniDtype.BOOL: + randint_low = 0 if randint_low is None else randint_low + randint_high = 2 if randint_high is None else randint_high + else: + randint_low = -2000000000 if randint_low is None else randint_low + randint_high = 2000000000 if randint_high is None else randint_high self._torch_tensor = torch.randint( randint_low, randint_high, From 0391d018231cd636f3599937697bc03746700219 Mon Sep 17 00:00:00 2001 From: PanZezhong Date: Mon, 9 Mar 2026 07:10:18 +0000 Subject: [PATCH 6/6] issue/1031 fix trailing whitespace --- include/infinicore/ops/adaptive_max_pool1d.hpp | 2 +- include/infinicore/ops/baddbmm.hpp | 2 +- include/infinicore/ops/bilinear.hpp | 2 +- include/infinicore/ops/cross_entropy.hpp | 2 +- include/infiniop/ops/adaptive_max_pool1d.h | 2 +- include/infiniop/ops/asinh.h | 2 +- include/infiniop/ops/avg_pool1d.h | 2 +- include/infiniop/ops/cross_entropy.h | 2 +- include/infiniop/ops/equal.h | 2 +- include/infiniop/ops/fmod.h | 2 +- include/infiniop/ops/hardswish.h | 2 +- include/infiniop/ops/hardtanh.h | 2 +- src/infinicore/ops/baddbmm/baddbmm.cc | 2 +- src/infinicore/ops/bilinear/bilinear.cc | 2 +- src/infinicore/ops/cross_entropy/cross_entropy.cc | 2 +- src/infinicore/ops/cross_entropy/cross_entropy_infiniop.cc | 2 +- src/infinicore/ops/fmod/fmod_infiniop.cc | 2 +- src/infinicore/pybind11/ops/adaptive_max_pool1d.hpp | 2 +- src/infinicore/pybind11/ops/baddbmm.hpp | 2 +- src/infinicore/pybind11/ops/bilinear.hpp | 2 +- src/infiniop/ops/adaptive_max_pool1d/adaptive_max_pool1d.h | 2 +- .../ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.cc | 2 +- .../ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.h | 2 +- src/infiniop/ops/adaptive_max_pool1d/cuda/kernel.cuh | 2 +- src/infiniop/ops/adaptive_max_pool1d/info.h | 2 +- .../ops/adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.cuh | 2 +- .../adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.maca | 2 +- .../ops/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.mu | 2 +- .../adaptive_max_pool1d/nvidia/adaptive_max_pool1d_nvidia.cu | 2 +- .../adaptive_max_pool1d/nvidia/adaptive_max_pool1d_nvidia.cuh | 2 +- src/infiniop/ops/adaptive_max_pool1d/operator.cc | 2 +- src/infiniop/ops/asinh/cpu/asinh_cpu.cc | 2 +- src/infiniop/ops/asinh/cpu/asinh_cpu.h | 2 +- src/infiniop/ops/asinh/cuda/kernel.cuh | 2 +- src/infiniop/ops/asinh/metax/asinh_metax.h | 2 +- src/infiniop/ops/asinh/moore/asinh_moore.h | 2 +- src/infiniop/ops/asinh/moore/asinh_moore.mu | 2 +- src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu | 2 +- src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh | 2 +- src/infiniop/ops/asinh/operator.cc | 2 +- src/infiniop/ops/avg_pool1d/avg_pool1d.h | 2 +- src/infiniop/ops/avg_pool1d/cpu/avg_pool1d_cpu.h | 2 +- src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu | 2 +- src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh | 2 +- src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc | 2 +- src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h | 2 +- src/infiniop/ops/cross_entropy/cross_entropy.h | 2 +- src/infiniop/ops/cross_entropy/info.h | 2 +- src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu | 2 +- src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh | 2 +- src/infiniop/ops/equal/nvidia/equal_nvidia.cuh | 2 +- src/infiniop/ops/fmod/cpu/fmod_cpu.h | 2 +- src/infiniop/ops/fmod/cuda/kernel.cuh | 2 +- src/infiniop/ops/fmod/moore/fmod_moore.h | 2 +- src/infiniop/ops/fmod/moore/fmod_moore.mu | 2 +- src/infiniop/ops/fmod/nvidia/fmod_nvidia.cu | 2 +- src/infiniop/ops/fmod/nvidia/fmod_nvidia.cuh | 2 +- src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh | 2 +- test/infiniop/hardswish.py | 2 +- test/infiniop/hardtanh.py | 2 +- 60 files changed, 60 insertions(+), 60 deletions(-) diff --git a/include/infinicore/ops/adaptive_max_pool1d.hpp b/include/infinicore/ops/adaptive_max_pool1d.hpp index 05e49b490..51832cae0 100644 --- a/include/infinicore/ops/adaptive_max_pool1d.hpp +++ b/include/infinicore/ops/adaptive_max_pool1d.hpp @@ -13,4 +13,4 @@ class AdaptiveMaxPool1d { Tensor adaptive_max_pool1d(Tensor x, size_t output_size); void adaptive_max_pool1d_(Tensor y, Tensor x, size_t output_size); -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infinicore/ops/baddbmm.hpp b/include/infinicore/ops/baddbmm.hpp index 3c08b98d9..9988000bd 100644 --- a/include/infinicore/ops/baddbmm.hpp +++ b/include/infinicore/ops/baddbmm.hpp @@ -12,4 +12,4 @@ Tensor baddbmm(Tensor input, Tensor batch1, Tensor batch2, void baddbmm_(Tensor out, Tensor input, Tensor batch1, Tensor batch2, float beta = 1.0f, float alpha = 1.0f); -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infinicore/ops/bilinear.hpp b/include/infinicore/ops/bilinear.hpp index 3f5f44aac..805fa1efc 100644 --- a/include/infinicore/ops/bilinear.hpp +++ b/include/infinicore/ops/bilinear.hpp @@ -9,4 +9,4 @@ namespace infinicore::op { Tensor bilinear(Tensor x1, Tensor x2, Tensor weight, std::optional bias); void bilinear_(Tensor out, Tensor x1, Tensor x2, Tensor weight, std::optional bias); -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infinicore/ops/cross_entropy.hpp b/include/infinicore/ops/cross_entropy.hpp index 958ee1089..9a6d446d2 100644 --- a/include/infinicore/ops/cross_entropy.hpp +++ b/include/infinicore/ops/cross_entropy.hpp @@ -32,4 +32,4 @@ Tensor cross_entropy(Tensor input, Tensor target); // 所以这里只是表示“写入指定的 output 内存” void cross_entropy_(Tensor output, Tensor input, Tensor target); -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infiniop/ops/adaptive_max_pool1d.h b/include/infiniop/ops/adaptive_max_pool1d.h index 67876d488..0b42844d1 100644 --- a/include/infiniop/ops/adaptive_max_pool1d.h +++ b/include/infiniop/ops/adaptive_max_pool1d.h @@ -19,4 +19,4 @@ __INFINI_C __export infiniStatus_t infiniopAdaptiveMaxPool1d(infiniopAdaptiveMax __INFINI_C __export infiniStatus_t infiniopDestroyAdaptiveMaxPool1dDescriptor(infiniopAdaptiveMaxPool1dDescriptor_t desc); -#endif \ No newline at end of file +#endif diff --git a/include/infiniop/ops/asinh.h b/include/infiniop/ops/asinh.h index 99bf0363e..95c86bf79 100644 --- a/include/infiniop/ops/asinh.h +++ b/include/infiniop/ops/asinh.h @@ -21,4 +21,4 @@ __INFINI_C __export infiniStatus_t infiniopAsinh(infiniopAsinhDescriptor_t desc, __INFINI_C __export infiniStatus_t infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc); -#endif \ No newline at end of file +#endif diff --git a/include/infiniop/ops/avg_pool1d.h b/include/infiniop/ops/avg_pool1d.h index 5c0bdf6ea..81c489dd7 100644 --- a/include/infiniop/ops/avg_pool1d.h +++ b/include/infiniop/ops/avg_pool1d.h @@ -29,4 +29,4 @@ __INFINI_C __export infiniStatus_t infiniopAvgPool1d( __INFINI_C __export infiniStatus_t infiniopDestroyAvgPool1dDescriptor( infiniopAvgPool1dDescriptor_t desc); -#endif \ No newline at end of file +#endif diff --git a/include/infiniop/ops/cross_entropy.h b/include/infiniop/ops/cross_entropy.h index 6c9c2a773..2ebd4b168 100644 --- a/include/infiniop/ops/cross_entropy.h +++ b/include/infiniop/ops/cross_entropy.h @@ -28,4 +28,4 @@ __INFINI_C __export infiniStatus_t infiniopCrossEntropy( __INFINI_C __export infiniStatus_t infiniopDestroyCrossEntropyDescriptor( infiniopCrossEntropyDescriptor_t desc); -#endif \ No newline at end of file +#endif diff --git a/include/infiniop/ops/equal.h b/include/infiniop/ops/equal.h index 5476f754c..90c4f3386 100644 --- a/include/infiniop/ops/equal.h +++ b/include/infiniop/ops/equal.h @@ -28,4 +28,4 @@ __INFINI_C __export infiniStatus_t infiniopEqual( __INFINI_C __export infiniStatus_t infiniopDestroyEqualDescriptor( infiniopEqualDescriptor_t desc); -#endif \ No newline at end of file +#endif diff --git a/include/infiniop/ops/fmod.h b/include/infiniop/ops/fmod.h index ec989e38e..e51f1b3c8 100644 --- a/include/infiniop/ops/fmod.h +++ b/include/infiniop/ops/fmod.h @@ -23,4 +23,4 @@ __INFINI_C __export infiniStatus_t infiniopFmod(infiniopFmodDescriptor_t desc, __INFINI_C __export infiniStatus_t infiniopDestroyFmodDescriptor(infiniopFmodDescriptor_t desc); -#endif \ No newline at end of file +#endif diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h index ba5b43b77..1cdeecf67 100644 --- a/include/infiniop/ops/hardswish.h +++ b/include/infiniop/ops/hardswish.h @@ -26,4 +26,4 @@ __INFINI_C __export infiniStatus_t infiniopHardSwish( __INFINI_C __export infiniStatus_t infiniopDestroyHardSwishDescriptor( infiniopHardSwishDescriptor_t desc); -#endif \ No newline at end of file +#endif diff --git a/include/infiniop/ops/hardtanh.h b/include/infiniop/ops/hardtanh.h index 62f6435aa..d2f98cedd 100644 --- a/include/infiniop/ops/hardtanh.h +++ b/include/infiniop/ops/hardtanh.h @@ -24,4 +24,4 @@ __INFINI_C __export infiniStatus_t infiniopHardTanh(infiniopHardTanhDescriptor_t __INFINI_C __export infiniStatus_t infiniopDestroyHardTanhDescriptor(infiniopHardTanhDescriptor_t desc); -#endif \ No newline at end of file +#endif diff --git a/src/infinicore/ops/baddbmm/baddbmm.cc b/src/infinicore/ops/baddbmm/baddbmm.cc index 3a8ee1518..c4f8efaf5 100644 --- a/src/infinicore/ops/baddbmm/baddbmm.cc +++ b/src/infinicore/ops/baddbmm/baddbmm.cc @@ -97,4 +97,4 @@ void baddbmm_(Tensor out, Tensor input, Tensor batch1, Tensor batch2, rearrange_(out, result); } } -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/src/infinicore/ops/bilinear/bilinear.cc b/src/infinicore/ops/bilinear/bilinear.cc index ab88a28f9..abb9832f8 100644 --- a/src/infinicore/ops/bilinear/bilinear.cc +++ b/src/infinicore/ops/bilinear/bilinear.cc @@ -116,4 +116,4 @@ void bilinear_(Tensor out, Tensor x1, Tensor x2, Tensor weight, std::optional