diff --git a/include/infiniop/ops/dequant/per_tensor_dequant_int8.h b/include/infiniop/ops/dequant/per_tensor_dequant_int8.h new file mode 100644 index 000000000..9614b4303 --- /dev/null +++ b/include/infiniop/ops/dequant/per_tensor_dequant_int8.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_PER_TENSOR_DEQUANT_INT8_API_H__ +#define __INFINIOP_PER_TENSOR_DEQUANT_INT8_API_H__ + +#include "../../operator_descriptor.h" + +typedef InfiniopDescriptor *infiniopPerTensorDequantI8Descriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreatePerTensorDequantI8Descriptor(infiniopHandle_t handle, + infiniopPerTensorDequantI8Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc); + +__INFINI_C __export infiniStatus_t infiniopGetPerTensorDequantI8WorkspaceSize(infiniopPerTensorDequantI8Descriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopPerTensorDequantI8(infiniopPerTensorDequantI8Descriptor_t desc, + void *workspace, + size_t workspace_size, + void *x, + const void *x_packed, + const void *x_scale, + const void *x_zero, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyPerTensorDequantI8Descriptor(infiniopPerTensorDequantI8Descriptor_t desc); + +#endif diff --git a/include/infiniop/ops/quant/per_tensor_quant_int8.h b/include/infiniop/ops/quant/per_tensor_quant_int8.h new file mode 100644 index 000000000..16e1c2bc6 --- /dev/null +++ b/include/infiniop/ops/quant/per_tensor_quant_int8.h @@ -0,0 +1,29 @@ +#ifndef __INFINIOP_PER_TENSOR_QUANT_INT8_API_H__ +#define __INFINIOP_PER_TENSOR_QUANT_INT8_API_H__ + +#include "../../operator_descriptor.h" + +typedef InfiniopDescriptor *infiniopPerTensorQuantI8Descriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreatePerTensorQuantI8Descriptor(infiniopHandle_t handle, + infiniopPerTensorQuantI8Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_packed_desc, 
+ infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc, + infiniopTensorDescriptor_t x_desc); + +__INFINI_C __export infiniStatus_t infiniopGetPerTensorQuantI8WorkspaceSize(infiniopPerTensorQuantI8Descriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopPerTensorQuantI8(infiniopPerTensorQuantI8Descriptor_t desc, + void *workspace, + size_t workspace_size, + void *x_packed, + void *x_scale, + void *x_zero, + const void *x, + const bool is_static, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyPerTensorQuantI8Descriptor(infiniopPerTensorQuantI8Descriptor_t desc); + +#endif diff --git a/src/infiniop/ops/dequant/per_tensor_dequant_int8/cuda/kernel.cuh b/src/infiniop/ops/dequant/per_tensor_dequant_int8/cuda/kernel.cuh new file mode 100644 index 000000000..d7a8d3b44 --- /dev/null +++ b/src/infiniop/ops/dequant/per_tensor_dequant_int8/cuda/kernel.cuh @@ -0,0 +1,18 @@ +#ifndef __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__ +#define __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__ + +template +__device__ void perTensorDequantI8SymKernel( + Tout *x, const Tin *x_packed, const float *x_scale, + int num_elements) { + + unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x; + const int grid_size = blockDim.x * gridDim.x; + float x_scale_val = x_scale[0]; + for (int i = gid; i < num_elements; i += grid_size) { + float val = static_cast(x_packed[i]) * x_scale_val; + x[i] = static_cast(val); + } +} + +#endif // __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__ diff --git a/src/infiniop/ops/dequant/per_tensor_dequant_int8/info.h b/src/infiniop/ops/dequant/per_tensor_dequant_int8/info.h new file mode 100644 index 000000000..719369f03 --- /dev/null +++ b/src/infiniop/ops/dequant/per_tensor_dequant_int8/info.h @@ -0,0 +1,57 @@ +#ifndef __PER_TENSOR_DEQUANT_INT8_INFO_H__ +#define __PER_TENSOR_DEQUANT_INT8_INFO_H__ + +#include "../../../../utils.h" +#include "../../../operator.h" +#include "../../../tensor.h" + +namespace 
op::per_tensor_dequant_int8 { + +class PerTensorDequantI8Info { +private: + PerTensorDequantI8Info() = default; + +public: + infiniDtype_t dtype, packed_type; + int num_elements; + + static utils::Result createPerTensorDequantI8Info( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc) { + + CHECK_OR_RETURN( + x_packed_desc != nullptr && x_scale_desc != nullptr && x_desc != nullptr, + INFINI_STATUS_NULL_POINTER); + + const infiniDtype_t dtype = x_desc->dtype(); + const infiniDtype_t packed_type = x_packed_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32); + CHECK_DTYPE(packed_type, INFINI_DTYPE_I8); + + CHECK_OR_RETURN(x_desc->ndim() == 2 + && x_packed_desc->ndim() == 2, + INFINI_STATUS_BAD_TENSOR_SHAPE); + + auto shape = x_desc->shape(); + CHECK_SAME_SHAPE(shape, x_packed_desc->shape()); + + auto ndim = x_desc->ndim(); + + int num_elements = 1; + for (int i = 0; i < (int)ndim; i++) { + num_elements *= static_cast(shape[i]); + } + + return utils::Result(PerTensorDequantI8Info{ + dtype, + packed_type, + num_elements}); + } +}; + +} // namespace op::per_tensor_dequant_int8 + +#endif // __PER_TENSOR_DEQUANT_INT8_INFO_H__ diff --git a/src/infiniop/ops/dequant/per_tensor_dequant_int8/nvidia/per_tensor_dequant_int8_nvidia.cu b/src/infiniop/ops/dequant/per_tensor_dequant_int8/nvidia/per_tensor_dequant_int8_nvidia.cu new file mode 100644 index 000000000..a128991c6 --- /dev/null +++ b/src/infiniop/ops/dequant/per_tensor_dequant_int8/nvidia/per_tensor_dequant_int8_nvidia.cu @@ -0,0 +1,88 @@ +#include "../../../../devices/nvidia/nvidia_common.cuh" +#include "per_tensor_dequant_int8_nvidia.cuh" + +#include "../../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../../../../reduce/cuda/reduce.cuh" +#include + +#include "../cuda/kernel.cuh" + +template +INFINIOP_CUDA_KERNEL perTensorDequantI8Sym( + Tout *x, const Tin 
*x_packed, const float *x_scale, int num_elements) { + perTensorDequantI8SymKernel(x, x_packed, x_scale, num_elements); +} + +namespace op::per_tensor_dequant_int8::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc) { + auto info = PerTensorDequantI8Info::createPerTensorDequantI8Info(x_desc, x_packed_desc, x_scale_desc, x_zero_desc); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t per_tensor_dequant_int8Kernel(const PerTensorDequantI8Info &info, Tdata *x, const int8_t *x_packed, const float *x_scale, const float *x_zero, cudaStream_t stream) { + int num_elements = (int)info.num_elements; + + int num_blocks = (num_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + if (x_zero == nullptr) { + perTensorDequantI8Sym + <<>>(x, x_packed, x_scale, num_elements); + } else { + return INFINI_STATUS_BAD_PARAM; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *x, + const void *x_packed, + const void *x_scale, + const void *x_zero, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; +#define DEQUANT(BLOCK_SIZE, TDATA) \ + per_tensor_dequant_int8Kernel(_info, (TDATA *)x, (const int8_t *)x_packed, (const float *)x_scale, (const float *)x_zero, stream) +#define DEQUANT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return DEQUANT(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return DEQUANT(BLOCK_SIZE, float); \ + else if (_info.dtype == 
INFINI_DTYPE_BF16) \ + return DEQUANT(BLOCK_SIZE, __nv_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + DEQUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + DEQUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + DEQUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::per_tensor_dequant_int8::nvidia diff --git a/src/infiniop/ops/dequant/per_tensor_dequant_int8/nvidia/per_tensor_dequant_int8_nvidia.cuh b/src/infiniop/ops/dequant/per_tensor_dequant_int8/nvidia/per_tensor_dequant_int8_nvidia.cuh new file mode 100644 index 000000000..66a7e5d03 --- /dev/null +++ b/src/infiniop/ops/dequant/per_tensor_dequant_int8/nvidia/per_tensor_dequant_int8_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __PER_TENSOR_DEQUANT_INT8_NVIDIA_API_H__ +#define __PER_TENSOR_DEQUANT_INT8_NVIDIA_API_H__ +#include "../per_tensor_dequant_int8.h" + +DESCRIPTOR(nvidia) + +#endif // __PER_TENSOR_DEQUANT_INT8_NVIDIA_API_H__ diff --git a/src/infiniop/ops/dequant/per_tensor_dequant_int8/operator.cc b/src/infiniop/ops/dequant/per_tensor_dequant_int8/operator.cc new file mode 100644 index 000000000..48d416847 --- /dev/null +++ b/src/infiniop/ops/dequant/per_tensor_dequant_int8/operator.cc @@ -0,0 +1,102 @@ +#include "../../../operator.h" +#include "../../../handle.h" +#include "infiniop/ops/dequant/per_tensor_dequant_int8.h" + +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/per_tensor_dequant_int8_nvidia.cuh" +#endif + +__INFINI_C infiniStatus_t infiniopCreatePerTensorDequantI8Descriptor(infiniopHandle_t handle, + infiniopPerTensorDequantI8Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t 
x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::per_tensor_dequant_int8::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + x_desc, \ + x_packed_desc, \ + x_scale_desc, \ + x_zero_desc); + switch (handle->device) { +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetPerTensorDequantI8WorkspaceSize(infiniopPerTensorDequantI8Descriptor_t desc, size_t *size) { + switch (desc->device_type) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->minWorkspaceSize(); \ + return INFINI_STATUS_SUCCESS; +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__INFINI_C infiniStatus_t infiniopPerTensorDequantI8(infiniopPerTensorDequantI8Descriptor_t desc, + void *workspace, + size_t workspace_size, + void *x, + const void *x_packed, + const void *x_scale, + const void *x_zero, + void *stream) { +#define DEQUANT(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, x, x_packed, x_scale, x_zero, stream); + + switch (desc->device_type) { +#ifdef ENABLE_NVIDIA_API + DEQUANT(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + DEQUANT(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DEQUANT +} + +__INFINI_C infiniStatus_t infiniopDestroyPerTensorDequantI8Descriptor(infiniopPerTensorDequantI8Descriptor_t desc) { +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return 
INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + DESTROY(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DESTROY +} diff --git a/src/infiniop/ops/dequant/per_tensor_dequant_int8/per_tensor_dequant_int8.h b/src/infiniop/ops/dequant/per_tensor_dequant_int8/per_tensor_dequant_int8.h new file mode 100644 index 000000000..1ed54a8e5 --- /dev/null +++ b/src/infiniop/ops/dequant/per_tensor_dequant_int8/per_tensor_dequant_int8.h @@ -0,0 +1,40 @@ +#ifndef __PER_TENSOR_DEQUANT_I8_H__ +#define __PER_TENSOR_DEQUANT_I8_H__ + +#include "../../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::per_tensor_dequant_int8::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + PerTensorDequantI8Info _info; \ + size_t _workspace_size; \ + \ + Descriptor(Opaque *opaque, PerTensorDequantI8Info info, \ + size_t workspace_size, \ + infiniDevice_t device_type, int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), _info(info), _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t minWorkspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t x_packed_desc, \ + infiniopTensorDescriptor_t x_scale_desc, \ + infiniopTensorDescriptor_t x_zero_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *x, const void *x_packed, const void *x_scale, const void *x_zero, void *stream) const; \ + }; \ + } + +#endif // __PER_TENSOR_DEQUANT_I8_H__ diff --git a/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh b/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh index 
3c014de9b..91f36e4f9 100644 --- a/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh +++ b/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh @@ -1,5 +1,5 @@ -#ifndef __PERCHANNEL_QUANTINT8_KERNEL_CUH__ -#define __PERCHANNEL_QUANTINT8_KERNEL_CUH__ +#ifndef __PER_CHANNEL_QUANT_INT8_KERNEL_CUH__ +#define __PER_CHANNEL_QUANT_INT8_KERNEL_CUH__ #include __device__ inline int round_half_away_from_zero(float x) { @@ -55,8 +55,8 @@ __device__ void blockPerChannelQuantI8Kernel( float inv_scale = 1.0f / scale; float zero = -global_min * inv_scale - 128.0f; - x_scale[row] = (Tdata)scale; - x_zero[row] = (Tdata)zero; + x_scale[row] = scale; + x_zero[row] = zero; for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) { @@ -111,7 +111,7 @@ __device__ void blockPerChannelQuantI8SymKernel( float inv_scale = 1.0f / scale; - x_scale[row] = (Tdata)scale; + x_scale[row] = scale; for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) { @@ -270,4 +270,4 @@ __device__ void warpPerChannelQuantI8SymKernel( } } -#endif // __PERCHANNEL_QUANTINT8_KERNEL_CUH__ +#endif // __PER_CHANNEL_QUANT_INT8_KERNEL_CUH__ diff --git a/src/infiniop/ops/quant/per_channel_quant_int8/per_channel_quant_int8.h b/src/infiniop/ops/quant/per_channel_quant_int8/per_channel_quant_int8.h index 4d1675c8c..4f1d3be2c 100644 --- a/src/infiniop/ops/quant/per_channel_quant_int8/per_channel_quant_int8.h +++ b/src/infiniop/ops/quant/per_channel_quant_int8/per_channel_quant_int8.h @@ -1,5 +1,5 @@ -#ifndef __QUANT_H__ -#define __QUANT_H__ +#ifndef __PER_CHANNEL_QUANT_INT8_H__ +#define __PER_CHANNEL_QUANT_INT8_H__ #include "../../../operator.h" #include "info.h" @@ -37,4 +37,4 @@ }; \ } -#endif // __QUANT_H__ +#endif // __PER_CHANNEL_QUANT_INT8_H__ diff --git a/src/infiniop/ops/quant/per_tensor_quant_int8/cuda/kernel.cuh b/src/infiniop/ops/quant/per_tensor_quant_int8/cuda/kernel.cuh new file mode 100644 index 000000000..a581dabf8 --- /dev/null +++ 
b/src/infiniop/ops/quant/per_tensor_quant_int8/cuda/kernel.cuh @@ -0,0 +1,70 @@ +#ifndef __PER_TENSOR_QUANT_INT8_KERNEL_CUH__ +#define __PER_TENSOR_QUANT_INT8_KERNEL_CUH__ + +#include + +__device__ inline int round_half_away_from_zero(float x) { + float ax = fabsf(x); + float r = floorf(ax + 0.5f); + return (x >= 0.0f) ? (int)r : -(int)r; +} + +template +__device__ void perTensorAbsmaxSymKernel(float *x_scale, const Tdata *x, int num_elements) { + unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x; + const int grid_size = blockDim.x * gridDim.x; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + float thread_max = -__FLT_MAX__; + for (int ind = threadIdx.x; ind < num_elements; ind += BLOCK_SIZE) { + thread_max = fmaxf(thread_max, fabs((float)x[ind])); + } + +#if CUDART_VERSION >= 12090 + float local_max = BlockReduce(temp_storage).Reduce(thread_max, ::cuda::maximum()); +#else + float local_max = BlockReduce(temp_storage).Reduce(thread_max, cub::Max()); +#endif + __shared__ float global_max; + if (threadIdx.x == 0) { + global_max = local_max; + } + __syncthreads(); + float scale = global_max / 127.0f; + if (scale < 1e-8f) { + scale = 1e-8f; + } + if (gid == 0) { + x_scale[0] = scale; + } +} + +template +__device__ void perTensorQuantI8SymKernel( + int8_t *x_packed, float *x_scale, const Tdata *x, + int num_elements) { + + unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x; + const int grid_size = blockDim.x * gridDim.x; + + float scale_val = 1.0f / x_scale[0]; + + for (int tid = gid; tid < num_elements; tid += grid_size) { + + float qf = (float)x[tid] * scale_val; + int q = round_half_away_from_zero(qf); + + if (q > 127) { + q = 127; + } + if (q < -127) { + q = -127; + } + + x_packed[tid] = (int8_t)q; + } +} + +#endif // __PER_TENSOR_QUANT_INT8_KERNEL_CUH__ diff --git a/src/infiniop/ops/quant/per_tensor_quant_int8/info.h b/src/infiniop/ops/quant/per_tensor_quant_int8/info.h new file mode 100644 
index 000000000..cdcf205e0 --- /dev/null +++ b/src/infiniop/ops/quant/per_tensor_quant_int8/info.h @@ -0,0 +1,58 @@ +#ifndef __PER_TENSOR_QUANT_INT8_INFO_H__ +#define __PER_TENSOR_QUANT_INT8_INFO_H__ + +#include "../../../../utils.h" +#include "../../../operator.h" +#include "../../../tensor.h" + +namespace op::per_tensor_quant_int8 { + +class PerTensorQuantI8Info { +private: + PerTensorQuantI8Info() = default; + +public: + infiniDtype_t dtype, packed_type; + int num_elements; + bool is_static; + + static utils::Result createPerTensorQuantI8Info( + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc, + infiniopTensorDescriptor_t x_desc) { + + CHECK_OR_RETURN( + x_packed_desc != nullptr && x_scale_desc != nullptr && x_desc != nullptr, + INFINI_STATUS_NULL_POINTER); + + const infiniDtype_t dtype = x_desc->dtype(); + const infiniDtype_t packed_type = x_packed_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32); + CHECK_DTYPE(packed_type, INFINI_DTYPE_I8); + + CHECK_OR_RETURN(x_desc->ndim() == 2 + && x_packed_desc->ndim() == 2, + INFINI_STATUS_BAD_TENSOR_SHAPE); + + auto shape = x_desc->shape(); + CHECK_SAME_SHAPE(shape, x_packed_desc->shape()); + + auto ndim = x_desc->ndim(); + + int num_elements = 1; + for (int i = 0; i < (int)ndim; i++) { + num_elements *= static_cast(shape[i]); + } + + return utils::Result(PerTensorQuantI8Info{ + dtype, + packed_type, + num_elements}); + } +}; + +} // namespace op::per_tensor_quant_int8 + +#endif // __PER_TENSOR_QUANT_INT8_INFO_H__ diff --git a/src/infiniop/ops/quant/per_tensor_quant_int8/nvidia/per_tensor_quant_int8_nvidia.cu b/src/infiniop/ops/quant/per_tensor_quant_int8/nvidia/per_tensor_quant_int8_nvidia.cu new file mode 100644 index 000000000..cfc211143 --- /dev/null +++ b/src/infiniop/ops/quant/per_tensor_quant_int8/nvidia/per_tensor_quant_int8_nvidia.cu @@ -0,0 +1,94 @@ +#include 
"../../../../devices/nvidia/nvidia_common.cuh" +#include "per_tensor_quant_int8_nvidia.cuh" + +#include "../../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../../../../reduce/cuda/reduce.cuh" +#include + +#include "../cuda/kernel.cuh" + +template +INFINIOP_CUDA_KERNEL perTensorAbsmaxSym( + float *x_scale, const Tdata *x, int num_elements) { + perTensorAbsmaxSymKernel(x_scale, x, num_elements); +} + +template +INFINIOP_CUDA_KERNEL perTensorQuantI8Sym( + int8_t *x_packed, float *x_scale, const Tdata *x, int num_elements) { + perTensorQuantI8SymKernel(x_packed, x_scale, x, num_elements); +} + +namespace op::per_tensor_quant_int8::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc, + infiniopTensorDescriptor_t x_desc) { + auto info = PerTensorQuantI8Info::createPerTensorQuantI8Info(x_packed_desc, x_scale_desc, x_zero_desc, x_desc); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t per_tensor_quant_int8Kernel(const PerTensorQuantI8Info &info, int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x, const bool is_static, cudaStream_t stream) { + int num_elements = (int)info.num_elements; + int num_blocks = (num_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + if (x_zero == nullptr) { + if (is_static == false) { + perTensorAbsmaxSym + <<>>(x_scale, x, num_elements); + } + perTensorQuantI8Sym + <<>>(x_packed, x_scale, x, num_elements); + } else { + return INFINI_STATUS_BAD_PARAM; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void 
*x_packed, void *x_scale, void *x_zero, const void *x, const bool is_static, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; +#define QUANT(BLOCK_SIZE, TDATA) \ + per_tensor_quant_int8Kernel(_info, (int8_t *)x_packed, (float *)x_scale, (float *)x_zero, (const TDATA *)x, is_static, stream) +#define QUANT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return QUANT(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return QUANT(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return QUANT(BLOCK_SIZE, __nv_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + QUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + QUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + QUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::per_tensor_quant_int8::nvidia diff --git a/src/infiniop/ops/quant/per_tensor_quant_int8/nvidia/per_tensor_quant_int8_nvidia.cuh b/src/infiniop/ops/quant/per_tensor_quant_int8/nvidia/per_tensor_quant_int8_nvidia.cuh new file mode 100644 index 000000000..4137c2d47 --- /dev/null +++ b/src/infiniop/ops/quant/per_tensor_quant_int8/nvidia/per_tensor_quant_int8_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __PER_TENSOR_QUANT_INT8_NVIDIA_API_H__ +#define __PER_TENSOR_QUANT_INT8_NVIDIA_API_H__ +#include "../per_tensor_quant_int8.h" + +DESCRIPTOR(nvidia) + +#endif // __PER_TENSOR_QUANT_INT8_NVIDIA_API_H__ diff --git a/src/infiniop/ops/quant/per_tensor_quant_int8/operator.cc b/src/infiniop/ops/quant/per_tensor_quant_int8/operator.cc new file mode 100644 index 000000000..364fbe44b --- /dev/null +++ 
b/src/infiniop/ops/quant/per_tensor_quant_int8/operator.cc @@ -0,0 +1,103 @@ +#include "../../../operator.h" +#include "../../../handle.h" +#include "infiniop/ops/quant/per_tensor_quant_int8.h" + +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/per_tensor_quant_int8_nvidia.cuh" +#endif + +__INFINI_C infiniStatus_t infiniopCreatePerTensorQuantI8Descriptor(infiniopHandle_t handle, + infiniopPerTensorQuantI8Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc, + infiniopTensorDescriptor_t x_desc) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::per_tensor_quant_int8::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + x_packed_desc, \ + x_scale_desc, \ + x_zero_desc, \ + x_desc); + switch (handle->device) { +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetPerTensorQuantI8WorkspaceSize(infiniopPerTensorQuantI8Descriptor_t desc, size_t *size) { + switch (desc->device_type) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->minWorkspaceSize(); \ + return INFINI_STATUS_SUCCESS; +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__INFINI_C infiniStatus_t infiniopPerTensorQuantI8(infiniopPerTensorQuantI8Descriptor_t desc, + void *workspace, + size_t workspace_size, + void *x_packed, + void *x_scale, + void *x_zero, + const void *x, + const bool is_static, + void *stream) { +#define QUANT(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, x_packed, 
x_scale, x_zero, x, is_static, stream); + + switch (desc->device_type) { +#ifdef ENABLE_NVIDIA_API + QUANT(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + QUANT(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef QUANT +} + +__INFINI_C infiniStatus_t infiniopDestroyPerTensorQuantI8Descriptor(infiniopPerTensorQuantI8Descriptor_t desc) { +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + DESTROY(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DESTROY +} diff --git a/src/infiniop/ops/quant/per_tensor_quant_int8/per_tensor_quant_int8.h b/src/infiniop/ops/quant/per_tensor_quant_int8/per_tensor_quant_int8.h new file mode 100644 index 000000000..f75b91173 --- /dev/null +++ b/src/infiniop/ops/quant/per_tensor_quant_int8/per_tensor_quant_int8.h @@ -0,0 +1,40 @@ +#ifndef __PER_TENSOR_QUANT_I8_H__ +#define __PER_TENSOR_QUANT_I8_H__ + +#include "../../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::per_tensor_quant_int8::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + PerTensorQuantI8Info _info; \ + size_t _workspace_size; \ + \ + Descriptor(Opaque *opaque, PerTensorQuantI8Info info, \ + size_t workspace_size, \ + infiniDevice_t device_type, int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), _info(info), _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t minWorkspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t x_packed_desc, \ + infiniopTensorDescriptor_t 
x_scale_desc, \ + infiniopTensorDescriptor_t x_zero_desc, \ + infiniopTensorDescriptor_t x_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *x_packed, void *x_scale, void *x_zero, const void *x, const bool is_static, void *stream) const; \ + }; \ + } + +#endif // __PER_TENSOR_QUANT_I8_H__ diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 275689e78..015daeca4 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -4,7 +4,7 @@ infiniopOperatorDescriptor_t, ) -from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float +from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float, c_bool class OpRegister: @@ -760,6 +760,79 @@ def per_channel_quant_int8_(lib): infiniopOperatorDescriptor_t, ] + +@OpRegister.operator +def per_tensor_quant_int8_(lib): + lib.infiniopCreatePerTensorQuantI8Descriptor.restype = c_int32 + lib.infiniopCreatePerTensorQuantI8Descriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetPerTensorQuantI8WorkspaceSize.restype = c_int32 + lib.infiniopGetPerTensorQuantI8WorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopPerTensorQuantI8.restype = c_int32 + lib.infiniopPerTensorQuantI8.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_bool, + c_void_p, + ] + + lib.infiniopDestroyPerTensorQuantI8Descriptor.restype = c_int32 + lib.infiniopDestroyPerTensorQuantI8Descriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def per_tensor_dequant_int8_(lib): + lib.infiniopCreatePerTensorDequantI8Descriptor.restype = c_int32 + lib.infiniopCreatePerTensorDequantI8Descriptor.argtypes = [ + 
infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetPerTensorDequantI8WorkspaceSize.restype = c_int32 + lib.infiniopGetPerTensorDequantI8WorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopPerTensorDequantI8.restype = c_int32 + lib.infiniopPerTensorDequantI8.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyPerTensorDequantI8Descriptor.restype = c_int32 + lib.infiniopDestroyPerTensorDequantI8Descriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + @OpRegister.operator def softplus_(lib): lib.infiniopCreateSoftplusDescriptor.restype = c_int32 diff --git a/test/infiniop/per_channel_quant_int8.py b/test/infiniop/per_channel_quant_int8.py index dcbf9d1f8..6c0f26fa7 100644 --- a/test/infiniop/per_channel_quant_int8.py +++ b/test/infiniop/per_channel_quant_int8.py @@ -24,7 +24,7 @@ # ============================================================================== # These are not meant to be imported from other modules _TEST_CASES = [ - # x_shape, w_shape, symmetric, bias_exit, y_shape + # x_shape, symmetric ((8, 8), True), ((128, 512), True), ((128, 128), True), @@ -151,16 +151,16 @@ def lib_per_channel_quant_int8(): atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) if DEBUG: - debug(x_packed.actual_tensor(), x_p, atol=atol, rtol=rtol) + debug(x_packed.actual_tensor(), x_p, atol=1, rtol=0) debug(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol) if symmetric == False: debug(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol) - + if symmetric: - assert (torch.allclose(x_packed.actual_tensor(), x_p, atol=2, rtol=2) and + assert (torch.allclose(x_packed.actual_tensor(), x_p, atol=1, rtol=0) and torch.allclose(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol)) else: - 
assert (torch.allclose(x_packed.actual_tensor(), x_p, atol=2, rtol=2) and
+        assert (torch.allclose(x_packed.actual_tensor(), x_p, atol=1, rtol=0) and
                torch.allclose(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol) and
                torch.allclose(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol))
diff --git a/test/infiniop/per_tensor_dequant_int8.py b/test/infiniop/per_tensor_dequant_int8.py
new file mode 100644
index 000000000..634e02911
--- /dev/null
+++ b/test/infiniop/per_tensor_dequant_int8.py
@@ -0,0 +1,160 @@
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
    LIBINFINIOP,
    TestTensor,
    get_test_devices,
    check_error,
    test_operator,
    get_args,
    debug,
    get_tolerance,
    profile_operation,
    TestWorkspace,
    InfiniDtype,
    InfiniDtypeNames,
    InfiniDeviceNames,
    infiniopOperatorDescriptor_t,
)
from enum import Enum, auto

# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES = [
    # x_shape, symmetric
    ((8, 8), True),
    ((128, 512), True),
    ((128, 128), True),
    ((256, 1024), True),
    ((256, 2048), True),
    ((1024, 2048), True),
]


# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32]

# Tolerance map for different data types
_TOLERANCE_MAP = {
    InfiniDtype.F16: {"atol": 1e-3, "rtol": 5e-2},
    InfiniDtype.BF16: {"atol": 1e-3, "rtol": 5e-2},
    InfiniDtype.F32: {"atol": 3e-5, "rtol": 5e-3},
}

DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000


def per_tensor_dequant_int8_torch(x_packed, x_scale, dtype):
    """Reference symmetric per-tensor int8 dequantization.

    Casts the int8 weights to `dtype` and multiplies by the (scalar) scale:
    x = int8_weight * scale.  No zero-point is applied (symmetric only).
    """
    fake_qweight = x_packed.to(dtype)
    return fake_qweight * x_scale


def test(
    handle,
    device,
    x_shape,
    symmetric,
    dtype=InfiniDtype.F16,
    sync=None,
):
    # Only the symmetric path is implemented; silently skip asymmetric cases
    # (none are present in _TEST_CASES anyway).
    if not symmetric:
        return

    print(
        f"Testing Per Tensor Dequant Int8 on {InfiniDeviceNames[device]} with x_shape:{x_shape}, symmetric:{symmetric} , dtype:{InfiniDtypeNames[dtype]}"
    )

    # x is the dequantized output; x_packed holds symmetric int8 weights in
    # [-127, 127]; x_scale is a scalar fp32 scale.
    x = TestTensor(x_shape, None, dtype, device)
    x_packed = TestTensor(x_shape, None, InfiniDtype.I8, device, randint_low=-127, randint_high=127)
    x_scale = TestTensor((1,), None, InfiniDtype.F32, device)
    if symmetric:
        x_zero = None
    else:
        x_zero = TestTensor((1,), None, InfiniDtype.F32, device)

    ans = per_tensor_dequant_int8_torch(x_packed.torch_tensor(), x_scale.torch_tensor(), x.torch_tensor().dtype)
    if sync is not None:
        sync()

    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreatePerTensorDequantI8Descriptor(
            handle,
            ctypes.byref(descriptor),
            x.descriptor,
            x_packed.descriptor,
            x_scale.descriptor,
            None if symmetric else x_zero.descriptor,
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from
    # being directly used by the kernel.
    # NOTE(review): the output descriptor `x` is intentionally(?) left alive —
    # confirm whether x.destroy_desc() should be called here as well.
    x_packed.destroy_desc()
    x_scale.destroy_desc()
    if not symmetric:
        x_zero.destroy_desc()

    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetPerTensorDequantI8WorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, x.device)

    def lib_per_tensor_dequant_int8():
        check_error(
            LIBINFINIOP.infiniopPerTensorDequantI8(
                descriptor,
                workspace.data(),
                workspace_size.value,
                x.data(),
                x_packed.data(),
                x_scale.data(),
                None if symmetric else x_zero.data(),
                None,
            )
        )

    lib_per_tensor_dequant_int8()

    if sync is not None:
        sync()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(x.actual_tensor().float(), ans.float(), atol=atol, rtol=rtol)

    assert torch.allclose(x.actual_tensor().float(), ans.float(), atol=atol, rtol=rtol)

    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: per_tensor_dequant_int8_torch(x_packed.torch_tensor(), x_scale.torch_tensor(), x.torch_tensor().dtype), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation("    lib", lambda: lib_per_tensor_dequant_int8(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on

    check_error(LIBINFINIOP.infiniopDestroyPerTensorDequantI8Descriptor(descriptor))


if __name__ == "__main__":
    args = get_args()

    # Configure testing options
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/per_tensor_quant_int8.py b/test/infiniop/per_tensor_quant_int8.py
new file mode 100644
index 000000000..8b78b0e62
--- /dev/null
+++ b/test/infiniop/per_tensor_quant_int8.py
@@ -0,0 +1,187 @@
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
    LIBINFINIOP,
    TestTensor,
    get_test_devices,
    check_error,
    test_operator,
    get_args,
    debug,
    get_tolerance,
    profile_operation,
    TestWorkspace,
    InfiniDtype,
    InfiniDtypeNames,
    InfiniDeviceNames,
    infiniopOperatorDescriptor_t,
)
from enum import Enum, auto

# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES = [
    # x_shape, symmetric, is_static
    ((8, 8), True, True),
    ((8, 128), True, False),
    ((256, 1024), True, True),
    ((1024, 2048), True, False),
    ((2048, 2048), True, True),
    ((4096, 2048), True, False),
]


# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32]

# Tolerance map for different data types
_TOLERANCE_MAP = {
    InfiniDtype.F16: {"atol": 1e-3, "rtol": 5e-2},
    InfiniDtype.BF16: {"atol": 1e-3, "rtol": 5e-2},
    InfiniDtype.F32: {"atol": 3e-5, "rtol": 5e-3},
}

DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000


def per_tensor_quant_int8_torch(x, x_scale, symmetric, is_static):
    """Reference symmetric per-tensor int8 quantization.

    Returns (x_q, scale, zero_point); zero_point is always None because only
    the symmetric path is implemented.  Static mode uses the provided
    `x_scale`; dynamic mode derives scale = absmax / 127 from the input.
    """
    if not symmetric:
        # The original code fell through with a bare `return` here, so the
        # caller's 3-way tuple unpack would raise an opaque TypeError.
        # Fail loudly with the real reason instead.
        raise NotImplementedError("asymmetric per-tensor int8 quantization is not supported")

    x = x.float()
    if is_static:
        # NOTE(review): no clamp to the int8 range here — this assumes
        # |x / x_scale| <= 127 for the generated inputs; confirm this matches
        # the kernel's saturation behavior.
        x_q = torch.round(x.mul(1 / x_scale)).to(torch.int8)
        return x_q, x_scale, None

    absmax = x.flatten().abs().max()
    if absmax == 0:
        # All-zero input: pick scale 1.0 so dequantization is well-defined.
        scale = torch.tensor(1.0, device=x.device, dtype=torch.float32)
        q = torch.zeros_like(x, dtype=torch.int8)
        return q, scale, None
    scale = absmax / 127
    x_q = torch.round(x.mul(127 / absmax)).to(torch.int8)
    return x_q, scale, None


def test(
    handle,
    device,
    x_shape,
    symmetric,
    is_static,
    dtype=InfiniDtype.F16,
    sync=None,
):
    print(
        f"Testing Per Tensor Quant Int8 on {InfiniDeviceNames[device]} with x_shape:{x_shape}, symmetric:{symmetric}, is_static:{is_static} dtype:{InfiniDtypeNames[dtype]}"
    )

    # x is the fp input; x_packed receives the int8 output.  In dynamic mode
    # x_scale is an output too (zero-initialized); in static mode it is a
    # random input scale.
    x = TestTensor(x_shape, None, dtype, device)
    x_packed = TestTensor(x_shape, None, InfiniDtype.I8, device, mode="zeros")
    if not is_static:
        x_scale = TestTensor((1,), None, InfiniDtype.F32, device, mode="zeros")
    else:
        x_scale = TestTensor((1,), None, InfiniDtype.F32, device)
    if symmetric:
        x_zero = None
    else:
        x_zero = TestTensor((1,), None, InfiniDtype.F32, device)
    if sync is not None:
        sync()

    x_p, x_s, x_z = per_tensor_quant_int8_torch(x.torch_tensor(), x_scale.torch_tensor(), symmetric, is_static)
    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreatePerTensorQuantI8Descriptor(
            handle,
            ctypes.byref(descriptor),
            x_packed.descriptor,
            x_scale.descriptor,
            None if symmetric else x_zero.descriptor,
            x.descriptor,
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from
    # being directly used by the kernel.
    # NOTE(review): the input descriptor `x` is intentionally(?) left alive —
    # confirm whether x.destroy_desc() should be called here as well.
    x_packed.destroy_desc()
    x_scale.destroy_desc()
    if not symmetric:
        x_zero.destroy_desc()

    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetPerTensorQuantI8WorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, x.device)

    def lib_per_tensor_quant_int8():
        check_error(
            LIBINFINIOP.infiniopPerTensorQuantI8(
                descriptor,
                workspace.data(),
                workspace_size.value,
                x_packed.data(),
                x_scale.data(),
                None if symmetric else x_zero.data(),
                x.data(),
                is_static,
                None,
            )
        )

    lib_per_tensor_quant_int8()

    if sync is not None:
        sync()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        # Quantized values must match within one ULP of int8 rounding;
        # scale/zero use the fp tolerance for this dtype.
        debug(x_packed.actual_tensor(), x_p, atol=1, rtol=0)
        debug(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol)
        if not symmetric:
            debug(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol)

    if symmetric:
        assert (torch.allclose(x_packed.actual_tensor(), x_p, atol=1, rtol=0) and
                torch.allclose(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol))
    else:
        assert (torch.allclose(x_packed.actual_tensor(), x_p, atol=1, rtol=0) and
                torch.allclose(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol) and
                torch.allclose(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol))

    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: per_tensor_quant_int8_torch(x.torch_tensor(), x_scale.torch_tensor(), symmetric, is_static), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation("    lib", lambda: lib_per_tensor_quant_int8(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on

    check_error(LIBINFINIOP.infiniopDestroyPerTensorQuantI8Descriptor(descriptor))


if __name__ == "__main__":
    args = get_args()

    # Configure testing options
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mTest passed!\033[0m")