Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions include/infiniop/ops/dequant/per_tensor_dequant_int8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#ifndef __INFINIOP_PER_TENSOR_DEQUANT_INT8_API_H__
#define __INFINIOP_PER_TENSOR_DEQUANT_INT8_API_H__

#include "../../operator_descriptor.h"

// Opaque handle for a per-tensor INT8 dequantization operator descriptor.
typedef InfiniopDescriptor *infiniopPerTensorDequantI8Descriptor_t;

// Creates a descriptor for per-tensor INT8 dequantization.
//   x_desc        - output tensor (dequantized values)
//   x_packed_desc - input tensor of quantized INT8 values
//   x_scale_desc  - per-tensor scale
//   x_zero_desc   - per-tensor zero point
// NOTE(review): the NVIDIA backend currently implements only the symmetric
// path (null x_zero at call time) -- confirm intended zero-point support.
__INFINI_C __export infiniStatus_t infiniopCreatePerTensorDequantI8Descriptor(infiniopHandle_t handle,
infiniopPerTensorDequantI8Descriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc);

// Returns the scratch-memory size required by infiniopPerTensorDequantI8.
__INFINI_C __export infiniStatus_t infiniopGetPerTensorDequantI8WorkspaceSize(infiniopPerTensorDequantI8Descriptor_t desc, size_t *size);

// Dequantizes x_packed into x using the per-tensor scale (and zero point
// when provided). stream is the backend compute stream.
__INFINI_C __export infiniStatus_t infiniopPerTensorDequantI8(infiniopPerTensorDequantI8Descriptor_t desc,
void *workspace,
size_t workspace_size,
void *x,
const void *x_packed,
const void *x_scale,
const void *x_zero,
void *stream);

// Releases the descriptor and its backend-private state.
__INFINI_C __export infiniStatus_t infiniopDestroyPerTensorDequantI8Descriptor(infiniopPerTensorDequantI8Descriptor_t desc);

#endif
29 changes: 29 additions & 0 deletions include/infiniop/ops/quant/per_tensor_quant_int8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#ifndef __INFINIOP_PER_TENSOR_QUANT_INT8_API_H__
#define __INFINIOP_PER_TENSOR_QUANT_INT8_API_H__

#include "../../operator_descriptor.h"

// Opaque handle for a per-tensor INT8 quantization operator descriptor.
typedef InfiniopDescriptor *infiniopPerTensorQuantI8Descriptor_t;

// Creates a descriptor for per-tensor INT8 quantization.
//   x_packed_desc - output tensor of quantized INT8 values
//   x_scale_desc  - output per-tensor scale
//   x_zero_desc   - output per-tensor zero point
//   x_desc        - input tensor to quantize
__INFINI_C __export infiniStatus_t infiniopCreatePerTensorQuantI8Descriptor(infiniopHandle_t handle,
infiniopPerTensorQuantI8Descriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc,
infiniopTensorDescriptor_t x_desc);

// Returns the scratch-memory size required by infiniopPerTensorQuantI8.
__INFINI_C __export infiniStatus_t infiniopGetPerTensorQuantI8WorkspaceSize(infiniopPerTensorQuantI8Descriptor_t desc, size_t *size);

// Quantizes x into x_packed, producing x_scale / x_zero.
//   is_static - NOTE(review): presumably selects precomputed (static)
//               scale/zero rather than computing them from x; the backend
//               implementation is not visible here -- confirm semantics.
__INFINI_C __export infiniStatus_t infiniopPerTensorQuantI8(infiniopPerTensorQuantI8Descriptor_t desc,
void *workspace,
size_t workspace_size,
void *x_packed,
void *x_scale,
void *x_zero,
const void *x,
const bool is_static,
void *stream);

// Releases the descriptor and its backend-private state.
__INFINI_C __export infiniStatus_t infiniopDestroyPerTensorQuantI8Descriptor(infiniopPerTensorQuantI8Descriptor_t desc);

#endif
18 changes: 18 additions & 0 deletions src/infiniop/ops/dequant/per_tensor_dequant_int8/cuda/kernel.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#ifndef __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__
#define __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__

// Symmetric per-tensor INT8 dequantization (no zero point):
//   x[i] = static_cast<Tout>((float)x_packed[i] * x_scale[0])
// Grid-stride loop: correct for any launch configuration, including grids
// with fewer total threads than num_elements.
//   x        - output, num_elements values of Tout
//   x_packed - input, num_elements quantized values (Tin, typically int8_t)
//   x_scale  - single-element per-tensor scale in device memory
template <typename Tin, typename Tout>
__device__ void perTensorDequantI8SymKernel(
    Tout *x, const Tin *x_packed, const float *x_scale,
    int num_elements) {

    // Fix: use int consistently for all index arithmetic. The original mixed
    // an unsigned gid with signed i/grid_size/num_elements, inviting implicit
    // signed/unsigned conversions in the loop condition.
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const int grid_size = blockDim.x * gridDim.x;
    // Per-tensor scale: load once, reused for every element this thread handles.
    const float x_scale_val = x_scale[0];
    for (int i = gid; i < num_elements; i += grid_size) {
        float val = static_cast<float>(x_packed[i]) * x_scale_val;
        x[i] = static_cast<Tout>(val);
    }
}

#endif // __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__
57 changes: 57 additions & 0 deletions src/infiniop/ops/dequant/per_tensor_dequant_int8/info.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#ifndef __PER_TENSOR_DEQUANT_INT8_INFO_H__
#define __PER_TENSOR_DEQUANT_INT8_INFO_H__

#include "../../../../utils.h"
#include "../../../operator.h"
#include "../../../tensor.h"

namespace op::per_tensor_dequant_int8 {

// Validated metadata for a per-tensor INT8 dequantization op.
class PerTensorDequantI8Info {
private:
    PerTensorDequantI8Info() = default;

public:
    infiniDtype_t dtype, packed_type; // output dtype / quantized input dtype
    int num_elements;                 // total element count of x (== x_packed)

    // Validates the tensor descriptors and builds the op info.
    // Checks:
    //   - all required descriptors are non-null
    //   - x (output) is F16/BF16/F32; x_packed is I8; x_scale is F32
    //   - x and x_packed are 2-D with identical shapes
    // x_zero_desc is accepted but not validated here (only the symmetric
    // path exists so far) -- NOTE(review): validate it once asymmetric
    // dequantization is implemented.
    static utils::Result<PerTensorDequantI8Info> createPerTensorDequantI8Info(
        infiniopTensorDescriptor_t x_desc,
        infiniopTensorDescriptor_t x_packed_desc,
        infiniopTensorDescriptor_t x_scale_desc,
        infiniopTensorDescriptor_t x_zero_desc) {

        CHECK_OR_RETURN(
            x_packed_desc != nullptr && x_scale_desc != nullptr && x_desc != nullptr,
            INFINI_STATUS_NULL_POINTER);

        const infiniDtype_t dtype = x_desc->dtype();
        const infiniDtype_t packed_type = x_packed_desc->dtype();

        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
        CHECK_DTYPE(packed_type, INFINI_DTYPE_I8);
        // Fix: the backends cast x_scale to const float* unconditionally;
        // reject non-F32 scales instead of silently misreading them.
        CHECK_DTYPE(x_scale_desc->dtype(), INFINI_DTYPE_F32);

        CHECK_OR_RETURN(x_desc->ndim() == 2
                            && x_packed_desc->ndim() == 2,
                        INFINI_STATUS_BAD_TENSOR_SHAPE);

        auto shape = x_desc->shape();
        CHECK_SAME_SHAPE(shape, x_packed_desc->shape());

        auto ndim = x_desc->ndim();

        // NOTE(review): int can overflow for tensors with > 2^31-1 elements;
        // consider size_t if such shapes are possible.
        int num_elements = 1;
        for (int i = 0; i < (int)ndim; i++) {
            num_elements *= static_cast<int>(shape[i]);
        }

        return utils::Result<PerTensorDequantI8Info>(PerTensorDequantI8Info{
            dtype,
            packed_type,
            num_elements});
    }
};

} // namespace op::per_tensor_dequant_int8

#endif // __PER_TENSOR_DEQUANT_INT8_INFO_H__
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#include "../../../../devices/nvidia/nvidia_common.cuh"
#include "per_tensor_dequant_int8_nvidia.cuh"

#include "../../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../../../../reduce/cuda/reduce.cuh"
#include <cub/block/block_reduce.cuh>

#include "../cuda/kernel.cuh"

// Thin __global__ entry point: forwards to the shared device-side symmetric
// dequantization routine in cuda/kernel.cuh.
template <typename Tsrc, typename Tdst>
INFINIOP_CUDA_KERNEL perTensorDequantI8Sym(
    Tdst *x, const Tsrc *x_packed, const float *x_scale, int num_elements) {
    perTensorDequantI8SymKernel<Tsrc, Tdst>(x, x_packed, x_scale, num_elements);
}

namespace op::per_tensor_dequant_int8::nvidia {

// Backend-private state: keeps the NVIDIA handle internals alive for the
// descriptor's lifetime.
struct Descriptor::Opaque {
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
// Opaque is owned exclusively by this descriptor.
delete _opaque;
}

// Validates the tensor descriptors (via PerTensorDequantI8Info) and
// allocates the NVIDIA descriptor. Workspace size is 0: dequantization
// needs no scratch memory.
infiniStatus_t Descriptor::create(
infiniopHandle_t handle, Descriptor **desc_ptr,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc) {
auto info = PerTensorDequantI8Info::createPerTensorDequantI8Info(x_desc, x_packed_desc, x_scale_desc, x_zero_desc);
CHECK_RESULT(info);

*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
info.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}

// Launches the symmetric dequantization kernel on `stream`.
// x_zero == nullptr selects the symmetric path; the asymmetric (zero-point)
// path is not implemented and is rejected with INFINI_STATUS_BAD_PARAM.
template <unsigned int BLOCK_SIZE, typename Tdata>
infiniStatus_t per_tensor_dequant_int8Kernel(const PerTensorDequantI8Info &info, Tdata *x, const int8_t *x_packed, const float *x_scale, const float *x_zero, cudaStream_t stream) {
    int num_elements = (int)info.num_elements;

    // Fix: an empty tensor would yield num_blocks == 0, and a <<<0, ...>>>
    // launch is a CUDA invalid-configuration error. Nothing to do -> succeed.
    if (num_elements <= 0) {
        return INFINI_STATUS_SUCCESS;
    }

    int num_blocks = (num_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;

    if (x_zero == nullptr) {
        // Symmetric: x[i] = (float)x_packed[i] * x_scale[0].
        perTensorDequantI8Sym<int8_t, Tdata>
            <<<num_blocks, BLOCK_SIZE, 0, stream>>>(x, x_packed, x_scale, num_elements);
    } else {
        // Asymmetric dequantization is not supported yet.
        return INFINI_STATUS_BAD_PARAM;
    }
    return INFINI_STATUS_SUCCESS;
}

// Dispatches the dequantization kernel by output dtype and by the device's
// max threads-per-block. workspace/workspace_size are unused (no scratch
// memory needed). x_zero may be null (symmetric quantization).
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
void *x,
const void *x_packed,
const void *x_scale,
const void *x_zero,
void *stream_) const {
cudaStream_t stream = (cudaStream_t)stream_;
// Launches the templated kernel for one (block size, dtype) combination.
#define DEQUANT(BLOCK_SIZE, TDATA) \
per_tensor_dequant_int8Kernel<BLOCK_SIZE, TDATA>(_info, (TDATA *)x, (const int8_t *)x_packed, (const float *)x_scale, (const float *)x_zero, stream)
// Selects the dtype branch (F16 / F32 / BF16) for a fixed block size.
#define DEQUANT_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_F16) \
return DEQUANT(BLOCK_SIZE, half); \
else if (_info.dtype == INFINI_DTYPE_F32) \
return DEQUANT(BLOCK_SIZE, float); \
else if (_info.dtype == INFINI_DTYPE_BF16) \
return DEQUANT(BLOCK_SIZE, __nv_bfloat16); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
// Pick the block size matching the device's thread-per-block capability.
// NOTE(review): CUDA caps threads per block at 1024 on current GPUs --
// confirm the CUDA_BLOCK_SIZE_4096 branch is valid for this backend.
if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
DEQUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
DEQUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
DEQUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}

} // namespace op::per_tensor_dequant_int8::nvidia
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#ifndef __PER_TENSOR_DEQUANT_INT8_NVIDIA_API_H__
#define __PER_TENSOR_DEQUANT_INT8_NVIDIA_API_H__
#include "../per_tensor_dequant_int8.h"

// Declares the Descriptor class in op::per_tensor_dequant_int8::nvidia;
// definitions live in per_tensor_dequant_int8_nvidia.cu.
DESCRIPTOR(nvidia)

#endif // __PER_TENSOR_DEQUANT_INT8_NVIDIA_API_H__
102 changes: 102 additions & 0 deletions src/infiniop/ops/dequant/per_tensor_dequant_int8/operator.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#include "../../../operator.h"
#include "../../../handle.h"
#include "infiniop/ops/dequant/per_tensor_dequant_int8.h"

#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
#include "nvidia/per_tensor_dequant_int8_nvidia.cuh"
#endif

// Creates a per-tensor INT8 dequantization descriptor for the device bound
// to `handle`. Returns INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED for devices
// without an implementation.
__INFINI_C infiniStatus_t infiniopCreatePerTensorDequantI8Descriptor(infiniopHandle_t handle,
infiniopPerTensorDequantI8Descriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc) {
// Forwards to the backend Descriptor::create for one device type.
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::per_tensor_dequant_int8::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::per_tensor_dequant_int8::NAMESPACE::Descriptor **>(desc_ptr), \
x_desc, \
x_packed_desc, \
x_scale_desc, \
x_zero_desc);
switch (handle->device) {
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia)
#endif

default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}

// Reports the scratch-memory requirement of the backend descriptor.
__INFINI_C infiniStatus_t infiniopGetPerTensorDequantI8WorkspaceSize(infiniopPerTensorDequantI8Descriptor_t desc, size_t *size) {
    switch (desc->device_type) {
#ifdef ENABLE_NVIDIA_API
    case INFINI_DEVICE_NVIDIA:
        *size = reinterpret_cast<op::per_tensor_dequant_int8::nvidia::Descriptor *>(desc)->minWorkspaceSize();
        return INFINI_STATUS_SUCCESS;
#endif
#ifdef ENABLE_QY_API
    // The QY device reuses the NVIDIA (CUDA) implementation.
    case INFINI_DEVICE_QY:
        *size = reinterpret_cast<op::per_tensor_dequant_int8::nvidia::Descriptor *>(desc)->minWorkspaceSize();
        return INFINI_STATUS_SUCCESS;
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
}

// Runs the dequantization: writes dequantized values of x_packed into x,
// using the per-tensor x_scale (and x_zero when non-null). `stream` is the
// backend compute stream (cudaStream_t for the CUDA backends).
__INFINI_C infiniStatus_t infiniopPerTensorDequantI8(infiniopPerTensorDequantI8Descriptor_t desc,
void *workspace,
size_t workspace_size,
void *x,
const void *x_packed,
const void *x_scale,
const void *x_zero,
void *stream) {
// Forwards to the backend Descriptor::calculate for one device type.
#define DEQUANT(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<op::per_tensor_dequant_int8::NAMESPACE::Descriptor *>(desc)->calculate( \
workspace, workspace_size, x, x_packed, x_scale, x_zero, stream);

switch (desc->device_type) {
#ifdef ENABLE_NVIDIA_API
DEQUANT(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_QY_API
DEQUANT(INFINI_DEVICE_QY, nvidia)
#endif

default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DEQUANT
}

// Frees a descriptor created by infiniopCreatePerTensorDequantI8Descriptor.
__INFINI_C infiniStatus_t infiniopDestroyPerTensorDequantI8Descriptor(infiniopPerTensorDequantI8Descriptor_t desc) {
    switch (desc->device_type) {
#ifdef ENABLE_NVIDIA_API
    case INFINI_DEVICE_NVIDIA:
        delete reinterpret_cast<op::per_tensor_dequant_int8::nvidia::Descriptor *>(desc);
        return INFINI_STATUS_SUCCESS;
#endif
#ifdef ENABLE_QY_API
    // The QY device reuses the NVIDIA (CUDA) implementation.
    case INFINI_DEVICE_QY:
        delete reinterpret_cast<op::per_tensor_dequant_int8::nvidia::Descriptor *>(desc);
        return INFINI_STATUS_SUCCESS;
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#ifndef __PER_TENSOR_DEQUANT_I8_H__
#define __PER_TENSOR_DEQUANT_I8_H__

#include "../../../operator.h"
#include "info.h"

// Declares the per-tensor INT8 dequantization Descriptor class for one
// backend namespace (op::per_tensor_dequant_int8::<NAMESPACE>). Each backend
// defines the Opaque state, the destructor, create() and calculate() out of
// line. (No comments inside the macro body: a `//` comment would swallow the
// line-continuation backslash.)
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::per_tensor_dequant_int8::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
PerTensorDequantI8Info _info; \
size_t _workspace_size; \
\
Descriptor(Opaque *opaque, PerTensorDequantI8Info info, \
size_t workspace_size, \
infiniDevice_t device_type, int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), _info(info), _workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t minWorkspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, Descriptor **desc_ptr, \
infiniopTensorDescriptor_t x_desc, \
infiniopTensorDescriptor_t x_packed_desc, \
infiniopTensorDescriptor_t x_scale_desc, \
infiniopTensorDescriptor_t x_zero_desc); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *x, const void *x_packed, const void *x_scale, const void *x_zero, void *stream) const; \
}; \
}

#endif // __PER_TENSOR_DEQUANT_I8_H__
Loading