Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions include/infiniop/ops/dequant/per_tensor_dequant_int8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#ifndef __INFINIOP_PER_TENSOR_DEQUANT_INT8_API_H__
#define __INFINIOP_PER_TENSOR_DEQUANT_INT8_API_H__

#include "../../operator_descriptor.h"

// Opaque handle for a per-tensor INT8 dequantization operator descriptor.
typedef InfiniopDescriptor *infiniopPerTensorDequantI8Descriptor_t;

// Creates a descriptor for per-tensor INT8 dequantization.
//   x_desc        - output tensor (dequantized values)
//   x_packed_desc - input tensor of quantized INT8 values
//   x_scale_desc  - per-tensor scale
//   x_zero_desc   - per-tensor zero point
// NOTE(review): the NVIDIA backend currently implements only the symmetric
// path (null x_zero at call time) -- confirm intended zero-point support.
__INFINI_C __export infiniStatus_t infiniopCreatePerTensorDequantI8Descriptor(infiniopHandle_t handle,
infiniopPerTensorDequantI8Descriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc);

// Returns the scratch-memory size required by infiniopPerTensorDequantI8.
__INFINI_C __export infiniStatus_t infiniopGetPerTensorDequantI8WorkspaceSize(infiniopPerTensorDequantI8Descriptor_t desc, size_t *size);

// Dequantizes x_packed into x using the per-tensor scale (and zero point
// when provided). stream is the backend compute stream.
__INFINI_C __export infiniStatus_t infiniopPerTensorDequantI8(infiniopPerTensorDequantI8Descriptor_t desc,
void *workspace,
size_t workspace_size,
void *x,
const void *x_packed,
const void *x_scale,
const void *x_zero,
void *stream);

// Releases the descriptor and its backend-private state.
__INFINI_C __export infiniStatus_t infiniopDestroyPerTensorDequantI8Descriptor(infiniopPerTensorDequantI8Descriptor_t desc);

#endif
29 changes: 29 additions & 0 deletions include/infiniop/ops/quant/per_tensor_quant_int8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#ifndef __INFINIOP_PER_TENSOR_QUANT_INT8_API_H__
#define __INFINIOP_PER_TENSOR_QUANT_INT8_API_H__

#include "../../operator_descriptor.h"

// Opaque handle for a per-tensor INT8 quantization operator descriptor.
typedef InfiniopDescriptor *infiniopPerTensorQuantI8Descriptor_t;

// Creates a descriptor for per-tensor INT8 quantization.
//   x_packed_desc - output tensor of quantized INT8 values
//   x_scale_desc  - output per-tensor scale
//   x_zero_desc   - output per-tensor zero point
//   x_desc        - input tensor to quantize
__INFINI_C __export infiniStatus_t infiniopCreatePerTensorQuantI8Descriptor(infiniopHandle_t handle,
infiniopPerTensorQuantI8Descriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc,
infiniopTensorDescriptor_t x_desc);

// Returns the scratch-memory size required by infiniopPerTensorQuantI8.
__INFINI_C __export infiniStatus_t infiniopGetPerTensorQuantI8WorkspaceSize(infiniopPerTensorQuantI8Descriptor_t desc, size_t *size);

// Quantizes x into x_packed, producing x_scale / x_zero.
//   is_static - NOTE(review): presumably selects precomputed (static)
//               scale/zero rather than computing them from x; the backend
//               implementation is not visible here -- confirm semantics.
__INFINI_C __export infiniStatus_t infiniopPerTensorQuantI8(infiniopPerTensorQuantI8Descriptor_t desc,
void *workspace,
size_t workspace_size,
void *x_packed,
void *x_scale,
void *x_zero,
const void *x,
const bool is_static,
void *stream);

// Releases the descriptor and its backend-private state.
__INFINI_C __export infiniStatus_t infiniopDestroyPerTensorQuantI8Descriptor(infiniopPerTensorQuantI8Descriptor_t desc);

#endif
18 changes: 18 additions & 0 deletions src/infiniop/ops/dequant/per_tensor_dequant_int8/cuda/kernel.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#ifndef __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__
#define __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__

// Symmetric per-tensor INT8 dequantization (no zero point):
//   x[i] = static_cast<Tout>((float)x_packed[i] * x_scale[0])
// Grid-stride loop: correct for any launch configuration, including grids
// with fewer total threads than num_elements.
//   x        - output, num_elements values of Tout
//   x_packed - input, num_elements quantized values (Tin, typically int8_t)
//   x_scale  - single-element per-tensor scale in device memory
template <typename Tin, typename Tout>
__device__ void perTensorDequantI8SymKernel(
    Tout *x, const Tin *x_packed, const float *x_scale,
    int num_elements) {

    // Fix: use int consistently for all index arithmetic. The original mixed
    // an unsigned gid with signed i/grid_size/num_elements, inviting implicit
    // signed/unsigned conversions in the loop condition.
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const int grid_size = blockDim.x * gridDim.x;
    // Per-tensor scale: load once, reused for every element this thread handles.
    const float x_scale_val = x_scale[0];
    for (int i = gid; i < num_elements; i += grid_size) {
        float val = static_cast<float>(x_packed[i]) * x_scale_val;
        x[i] = static_cast<Tout>(val);
    }
}

#endif // __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__
57 changes: 57 additions & 0 deletions src/infiniop/ops/dequant/per_tensor_dequant_int8/info.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#ifndef __PER_TENSOR_DEQUANT_INT8_INFO_H__
#define __PER_TENSOR_DEQUANT_INT8_INFO_H__

#include "../../../../utils.h"
#include "../../../operator.h"
#include "../../../tensor.h"

namespace op::per_tensor_dequant_int8 {

// Validated metadata for a per-tensor INT8 dequantization op.
class PerTensorDequantI8Info {
private:
    PerTensorDequantI8Info() = default;

public:
    infiniDtype_t dtype, packed_type; // output dtype / quantized input dtype
    int num_elements;                 // total element count of x (== x_packed)

    // Validates the tensor descriptors and builds the op info.
    // Checks:
    //   - all required descriptors are non-null
    //   - x (output) is F16/BF16/F32; x_packed is I8; x_scale is F32
    //   - x and x_packed are 2-D with identical shapes
    // x_zero_desc is accepted but not validated here (only the symmetric
    // path exists so far) -- NOTE(review): validate it once asymmetric
    // dequantization is implemented.
    static utils::Result<PerTensorDequantI8Info> createPerTensorDequantI8Info(
        infiniopTensorDescriptor_t x_desc,
        infiniopTensorDescriptor_t x_packed_desc,
        infiniopTensorDescriptor_t x_scale_desc,
        infiniopTensorDescriptor_t x_zero_desc) {

        CHECK_OR_RETURN(
            x_packed_desc != nullptr && x_scale_desc != nullptr && x_desc != nullptr,
            INFINI_STATUS_NULL_POINTER);

        const infiniDtype_t dtype = x_desc->dtype();
        const infiniDtype_t packed_type = x_packed_desc->dtype();

        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
        CHECK_DTYPE(packed_type, INFINI_DTYPE_I8);
        // Fix: the backends cast x_scale to const float* unconditionally;
        // reject non-F32 scales instead of silently misreading them.
        CHECK_DTYPE(x_scale_desc->dtype(), INFINI_DTYPE_F32);

        CHECK_OR_RETURN(x_desc->ndim() == 2
                            && x_packed_desc->ndim() == 2,
                        INFINI_STATUS_BAD_TENSOR_SHAPE);

        auto shape = x_desc->shape();
        CHECK_SAME_SHAPE(shape, x_packed_desc->shape());

        auto ndim = x_desc->ndim();

        // NOTE(review): int can overflow for tensors with > 2^31-1 elements;
        // consider size_t if such shapes are possible.
        int num_elements = 1;
        for (int i = 0; i < (int)ndim; i++) {
            num_elements *= static_cast<int>(shape[i]);
        }

        return utils::Result<PerTensorDequantI8Info>(PerTensorDequantI8Info{
            dtype,
            packed_type,
            num_elements});
    }
};

} // namespace op::per_tensor_dequant_int8

#endif // __PER_TENSOR_DEQUANT_INT8_INFO_H__
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#include "../../../../devices/nvidia/nvidia_common.cuh"
#include "per_tensor_dequant_int8_nvidia.cuh"

#include "../../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../../../../reduce/cuda/reduce.cuh"
#include <cub/block/block_reduce.cuh>

#include "../cuda/kernel.cuh"

// Thin __global__ entry point: forwards to the shared device-side symmetric
// dequantization routine in cuda/kernel.cuh.
template <typename Tsrc, typename Tdst>
INFINIOP_CUDA_KERNEL perTensorDequantI8Sym(
    Tdst *x, const Tsrc *x_packed, const float *x_scale, int num_elements) {
    perTensorDequantI8SymKernel<Tsrc, Tdst>(x, x_packed, x_scale, num_elements);
}

namespace op::per_tensor_dequant_int8::nvidia {

// Backend-private state: keeps the NVIDIA handle internals alive for the
// descriptor's lifetime.
struct Descriptor::Opaque {
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
// Opaque is owned exclusively by this descriptor.
delete _opaque;
}

// Validates the tensor descriptors (via PerTensorDequantI8Info) and
// allocates the NVIDIA descriptor. Workspace size is 0: dequantization
// needs no scratch memory.
infiniStatus_t Descriptor::create(
infiniopHandle_t handle, Descriptor **desc_ptr,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc) {
auto info = PerTensorDequantI8Info::createPerTensorDequantI8Info(x_desc, x_packed_desc, x_scale_desc, x_zero_desc);
CHECK_RESULT(info);

*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
info.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}

// Launches the symmetric dequantization kernel on `stream`.
// x_zero == nullptr selects the symmetric path; the asymmetric (zero-point)
// path is not implemented and is rejected with INFINI_STATUS_BAD_PARAM.
template <unsigned int BLOCK_SIZE, typename Tdata>
infiniStatus_t per_tensor_dequant_int8Kernel(const PerTensorDequantI8Info &info, Tdata *x, const int8_t *x_packed, const float *x_scale, const float *x_zero, cudaStream_t stream) {
    int num_elements = (int)info.num_elements;

    // Fix: an empty tensor would yield num_blocks == 0, and a <<<0, ...>>>
    // launch is a CUDA invalid-configuration error. Nothing to do -> succeed.
    if (num_elements <= 0) {
        return INFINI_STATUS_SUCCESS;
    }

    int num_blocks = (num_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;

    if (x_zero == nullptr) {
        // Symmetric: x[i] = (float)x_packed[i] * x_scale[0].
        perTensorDequantI8Sym<int8_t, Tdata>
            <<<num_blocks, BLOCK_SIZE, 0, stream>>>(x, x_packed, x_scale, num_elements);
    } else {
        // Asymmetric dequantization is not supported yet.
        return INFINI_STATUS_BAD_PARAM;
    }
    return INFINI_STATUS_SUCCESS;
}

// Dispatches the dequantization kernel by output dtype and by the device's
// max threads-per-block. workspace/workspace_size are unused (no scratch
// memory needed). x_zero may be null (symmetric quantization).
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
void *x,
const void *x_packed,
const void *x_scale,
const void *x_zero,
void *stream_) const {
cudaStream_t stream = (cudaStream_t)stream_;
// Launches the templated kernel for one (block size, dtype) combination.
#define DEQUANT(BLOCK_SIZE, TDATA) \
per_tensor_dequant_int8Kernel<BLOCK_SIZE, TDATA>(_info, (TDATA *)x, (const int8_t *)x_packed, (const float *)x_scale, (const float *)x_zero, stream)
// Selects the dtype branch (F16 / F32 / BF16) for a fixed block size.
#define DEQUANT_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_F16) \
return DEQUANT(BLOCK_SIZE, half); \
else if (_info.dtype == INFINI_DTYPE_F32) \
return DEQUANT(BLOCK_SIZE, float); \
else if (_info.dtype == INFINI_DTYPE_BF16) \
return DEQUANT(BLOCK_SIZE, __nv_bfloat16); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
// Pick the block size matching the device's thread-per-block capability.
// NOTE(review): CUDA caps threads per block at 1024 on current GPUs --
// confirm the CUDA_BLOCK_SIZE_4096 branch is valid for this backend.
if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
DEQUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
DEQUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
DEQUANT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}

} // namespace op::per_tensor_dequant_int8::nvidia
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#ifndef __PER_TENSOR_DEQUANT_INT8_NVIDIA_API_H__
#define __PER_TENSOR_DEQUANT_INT8_NVIDIA_API_H__
#include "../per_tensor_dequant_int8.h"

// Declares the Descriptor class in op::per_tensor_dequant_int8::nvidia;
// definitions live in per_tensor_dequant_int8_nvidia.cu.
DESCRIPTOR(nvidia)

#endif // __PER_TENSOR_DEQUANT_INT8_NVIDIA_API_H__
102 changes: 102 additions & 0 deletions src/infiniop/ops/dequant/per_tensor_dequant_int8/operator.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#include "../../../operator.h"
#include "../../../handle.h"
#include "infiniop/ops/dequant/per_tensor_dequant_int8.h"

#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
#include "nvidia/per_tensor_dequant_int8_nvidia.cuh"
#endif

// Creates a per-tensor INT8 dequantization descriptor for the device bound
// to `handle`. Returns INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED for devices
// without an implementation.
__INFINI_C infiniStatus_t infiniopCreatePerTensorDequantI8Descriptor(infiniopHandle_t handle,
infiniopPerTensorDequantI8Descriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc) {
// Forwards to the backend Descriptor::create for one device type.
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::per_tensor_dequant_int8::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::per_tensor_dequant_int8::NAMESPACE::Descriptor **>(desc_ptr), \
x_desc, \
x_packed_desc, \
x_scale_desc, \
x_zero_desc);
switch (handle->device) {
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia)
#endif

default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}

// Reports the scratch-memory requirement of the backend descriptor.
__INFINI_C infiniStatus_t infiniopGetPerTensorDequantI8WorkspaceSize(infiniopPerTensorDequantI8Descriptor_t desc, size_t *size) {
    switch (desc->device_type) {
#ifdef ENABLE_NVIDIA_API
    case INFINI_DEVICE_NVIDIA:
        *size = reinterpret_cast<op::per_tensor_dequant_int8::nvidia::Descriptor *>(desc)->minWorkspaceSize();
        return INFINI_STATUS_SUCCESS;
#endif
#ifdef ENABLE_QY_API
    // The QY device reuses the NVIDIA (CUDA) implementation.
    case INFINI_DEVICE_QY:
        *size = reinterpret_cast<op::per_tensor_dequant_int8::nvidia::Descriptor *>(desc)->minWorkspaceSize();
        return INFINI_STATUS_SUCCESS;
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
}

// Runs the dequantization: writes dequantized values of x_packed into x,
// using the per-tensor x_scale (and x_zero when non-null). `stream` is the
// backend compute stream (cudaStream_t for the CUDA backends).
__INFINI_C infiniStatus_t infiniopPerTensorDequantI8(infiniopPerTensorDequantI8Descriptor_t desc,
void *workspace,
size_t workspace_size,
void *x,
const void *x_packed,
const void *x_scale,
const void *x_zero,
void *stream) {
// Forwards to the backend Descriptor::calculate for one device type.
#define DEQUANT(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<op::per_tensor_dequant_int8::NAMESPACE::Descriptor *>(desc)->calculate( \
workspace, workspace_size, x, x_packed, x_scale, x_zero, stream);

switch (desc->device_type) {
#ifdef ENABLE_NVIDIA_API
DEQUANT(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_QY_API
DEQUANT(INFINI_DEVICE_QY, nvidia)
#endif

default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DEQUANT
}

// Frees a descriptor created by infiniopCreatePerTensorDequantI8Descriptor.
__INFINI_C infiniStatus_t infiniopDestroyPerTensorDequantI8Descriptor(infiniopPerTensorDequantI8Descriptor_t desc) {
    switch (desc->device_type) {
#ifdef ENABLE_NVIDIA_API
    case INFINI_DEVICE_NVIDIA:
        delete reinterpret_cast<op::per_tensor_dequant_int8::nvidia::Descriptor *>(desc);
        return INFINI_STATUS_SUCCESS;
#endif
#ifdef ENABLE_QY_API
    // The QY device reuses the NVIDIA (CUDA) implementation.
    case INFINI_DEVICE_QY:
        delete reinterpret_cast<op::per_tensor_dequant_int8::nvidia::Descriptor *>(desc);
        return INFINI_STATUS_SUCCESS;
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#ifndef __PER_TENSOR_DEQUANT_I8_H__
#define __PER_TENSOR_DEQUANT_I8_H__

#include "../../../operator.h"
#include "info.h"

// Declares the per-tensor INT8 dequantization Descriptor class for one
// backend namespace (op::per_tensor_dequant_int8::<NAMESPACE>). Each backend
// defines the Opaque state, the destructor, create() and calculate() out of
// line. (No comments inside the macro body: a `//` comment would swallow the
// line-continuation backslash.)
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::per_tensor_dequant_int8::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
PerTensorDequantI8Info _info; \
size_t _workspace_size; \
\
Descriptor(Opaque *opaque, PerTensorDequantI8Info info, \
size_t workspace_size, \
infiniDevice_t device_type, int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), _info(info), _workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t minWorkspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, Descriptor **desc_ptr, \
infiniopTensorDescriptor_t x_desc, \
infiniopTensorDescriptor_t x_packed_desc, \
infiniopTensorDescriptor_t x_scale_desc, \
infiniopTensorDescriptor_t x_zero_desc); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *x, const void *x_packed, const void *x_scale, const void *x_zero, void *stream) const; \
}; \
}

#endif // __PER_TENSOR_DEQUANT_I8_H__
Loading