From b2c9376f9f23e19f00ca583dd9593f67f79b0637 Mon Sep 17 00:00:00 2001
From: ssjia
Date: Wed, 11 Feb 2026 12:15:57 -0800
Subject: [PATCH] [ET-VK] Layout-flexible impl of quantized binary

This refactors the quantized binary add operator to support all PackedInt8
memory layouts (4W, 4C, 4W4C, 4H4W, 4C1W) instead of being hardcoded to
4W4C. The shader is rewritten to use the block indexing framework
(BlockConfig, block_int8x4_load/store) and BufferMetadata for
layout-agnostic tensor access, replacing the previous linear dispatch that
assumed 4W4C ordering.

Key changes:
- Renames the shader from binary_q8ta_q8ta_q8to to q8ta_binary, and the op
  from add_q8ta_q8ta_q8to to q8ta_add (see the example call below)
- The shader now uses contiguous_block_idx_to_tensor4d_idx_with_block_config
  for dispatch, plus generated load/store functions for layout-flexible
  int8x4 access
- The C++ dispatch now uses pick_linear_global_wg_with_block_config and
  passes BufferMetadata UBOs for the output and both inputs, plus
  hashed_layout specialization constants
- Moves the test operator into a separate TestQ8taBinary.cpp file that
  parameterizes on GPUMemoryLayout, testing all 5 layouts
- Updates op_registry to accept PACKED_INT8_BUFFER (all layouts) instead of
  just PACKED_INT8_4W4C_BUFFER
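
For reference, a sketch of calling the renamed op at the Python level; the
argument order matches the lib.define below, and the scale/zero-point
values are illustrative (taken from the test cases in this diff):

    out = torch.ops.et_vk.q8ta_add(
        input_a, input_b,
        0.007843, 3,    # input_a scale / zero point
        0.009412, -2,   # input_b scale / zero point
        0.015686, 1,    # output scale / zero point
        1.0,            # alpha
    )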

This diff was authored with Claude.

Differential Revision: [D93000170](https://our.internmc.facebook.com/intern/diff/D93000170/)

[ghstack-poisoned]
---
 backends/vulkan/custom_ops_lib.py             |  10 +-
 backends/vulkan/op_registry.py                |   8 +-
 backends/vulkan/patterns/quantized_binary.py  |   2 +-
 .../graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl |  76 ----
 .../runtime/graph/ops/glsl/q8ta_binary.glsl   |  91 +++++
 ...y_q8ta_q8ta_q8to.yaml => q8ta_binary.yaml} |  11 +-
 .../{QuantizedBinary.cpp => Q8taBinary.cpp}   | 124 ++----
 .../runtime/graph/ops/impl/Q8taBinary.h       |  33 ++
 .../vulkan/test/custom_ops/CMakeLists.txt     |   2 +-
 .../test/custom_ops/impl/TestQ8taBinary.cpp   |  88 +++++
 .../test/custom_ops/q8ta_q8ta_q8to_add.cpp    | 258 ------------
 backends/vulkan/test/custom_ops/targets.bzl   |   2 +-
 .../test/custom_ops/test_q8ta_binary.cpp      | 373 ++++++++++++++++++
 13 files changed, 623 insertions(+), 455 deletions(-)
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.glsl
 rename backends/vulkan/runtime/graph/ops/glsl/{binary_q8ta_q8ta_q8to.yaml => q8ta_binary.yaml} (61%)
 rename backends/vulkan/runtime/graph/ops/impl/{QuantizedBinary.cpp => Q8taBinary.cpp} (53%)
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/Q8taBinary.h
 create mode 100644 backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
 delete mode 100644 backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp
 create mode 100644 backends/vulkan/test/custom_ops/test_q8ta_binary.cpp

diff --git a/backends/vulkan/custom_ops_lib.py b/backends/vulkan/custom_ops_lib.py
index ad65ae47a40..92bd9574dfc 100644
--- a/backends/vulkan/custom_ops_lib.py
+++ b/backends/vulkan/custom_ops_lib.py
@@ -564,11 +564,11 @@ def apply_rotary_emb_impl(
 apply_rotary_emb_op = getattr(getattr(torch.ops, namespace), name)
 
 ########################
-## add_q8ta_q8ta_q8to ##
+## q8ta_add ##
 ########################
 
 
-def add_q8ta_q8ta_q8to_impl(
+def q8ta_add_impl(
     input_a: torch.Tensor,
     input_b: torch.Tensor,
     input_a_scale: float,
@@ -598,12 +598,12 @@
     return quantized_result
 
 
-name = "add_q8ta_q8ta_q8to"
+name = "q8ta_add"
 lib.define(
     f"{name}(Tensor input_a, Tensor input_b, float input_a_scale, int input_a_zero_point, float input_b_scale, int input_b_zero_point, float output_scale, int output_zero_point, float alpha) -> Tensor"
 )
-lib.impl(name, add_q8ta_q8ta_q8to_impl, "CompositeExplicitAutograd")
-add_q8ta_q8ta_q8to_op = getattr(getattr(torch.ops, namespace), name)
+lib.impl(name, q8ta_add_impl, "CompositeExplicitAutograd")
+q8ta_add_op = getattr(getattr(torch.ops, namespace), name)
 
 #############################
 ## select_as_symint ##
diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index 28df2e7001d..0725a39f547 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -495,14 +495,14 @@ def register_torchao_choose_qparams_affine():
 
 
 # =============================================================================
-# QuantizedBinary.cpp
+# Q8taBinary.cpp
 # =============================================================================
 
 
-@update_features(exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default)
-def register_add_q8ta_q8ta_q8to():
+@update_features(exir_ops.edge.et_vk.q8ta_add.default)
+def register_q8ta_add():
     return OpFeatures(
-        inputs_storage=utils.PACKED_INT8_4W4C_BUFFER,
+        inputs_storage=utils.PACKED_INT8_BUFFER,
         supports_resize=False,
         supports_prepacking=True,
     )
diff --git a/backends/vulkan/patterns/quantized_binary.py b/backends/vulkan/patterns/quantized_binary.py
index da4985b931d..9a18f148736 100644
--- a/backends/vulkan/patterns/quantized_binary.py
+++ b/backends/vulkan/patterns/quantized_binary.py
@@ -133,7 +133,7 @@ def make_add_q8ta_q8ta_q8to_custom_op(
         exir_ops.edge.aten.add.Tensor,
         exir_ops.edge.aten.add_.Tensor,
     }:
-        op_target = exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default
+        op_target = exir_ops.edge.et_vk.q8ta_add.default
     else:
         # For future binary operations, add more mappings here
         raise NotImplementedError(
diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl
deleted file mode 100644
index c5dac0d6571..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-${define_required_extensions("buffer", DTYPE)}
-
-#define PRECISION ${PRECISION}
-
-#define NAME ${VARIANT_NAME}
-
-#define VEC4_T ${texel_load_type(DTYPE, "buffer")}
-#define T ${texel_load_component_type(DTYPE, "buffer")}
-
-$if IO_STORAGE == "buffer":
-  #define PACKED_INT8_OUTPUT_BUFFER
-  #define PACKED_INT8_INPUT_BUFFER
-
-#define op(X, Y) ${OPERATOR}
-
-layout(std430) buffer;
-
-#include "indexing.glslh"
-#include "common.glslh"
-
-${layout_declare_tensor(B, "w", "t_packed_int8_out", "int", IO_STORAGE, is_scalar_array=False)}
-${layout_declare_tensor(B, "r", "t_packed_int8_in_a", "int", IO_STORAGE, is_scalar_array=False)}
-${layout_declare_tensor(B, "r", "t_packed_int8_in_b", "int", IO_STORAGE, is_scalar_array=False)}
-
-${layout_declare_ubo(B, "ivec4", "out_sizes")}
-
-layout(push_constant) uniform restrict Block {
-  float input_a_scale;
-  int input_a_zp;
-  float input_b_scale;
-  int input_b_zp;
-  float output_inv_scale;
-  int output_zp;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-void main() {
-  const int tid = int(gl_GlobalInvocationID.x);
-
-  const int W4 = div_up_4(out_sizes.x);
-  const int H = out_sizes.y;
-  const int C4 = div_up_4(out_sizes.z);
-  const int N = out_sizes.w;
-
-  if (tid >= W4 * H * C4 * N) {
-    return;
-  }
-
-  const ivec4 in_block_1 = t_packed_int8_in_a[tid];
-  const ivec4 in_block_2 = t_packed_int8_in_b[tid];
-
-  ivec4 out_block = ivec4(pack_into_int32(ivec4(output_zp)));
-
-  for (int row = 0; row < 4; row++) {
-    vec4 in_texel_1 = unpack_and_dequantize(
-        in_block_1[row], input_a_scale, input_a_zp);
-    vec4 in_texel_2 = unpack_and_dequantize(
-        in_block_2[row], input_b_scale, input_b_zp);
-
-    vec4 out_texel = op(in_texel_1, in_texel_2);
-    out_block[row] = quantize_and_pack(out_texel, output_inv_scale, output_zp);
-  }
-
-  t_packed_int8_out[tid] = out_block;
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.glsl b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.glsl
new file mode 100644
index 00000000000..60f437fbdce
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.glsl
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+${define_active_storage_type("buffer")}
+
+#define op(X, Y) ${OPERATOR}
+
+layout(std430) buffer;
+
+#include "indexing.glslh"
+#include "common.glslh"
+#include "block_indexing.glslh"
+#include "block_int8x4_load.glslh"
+#include "block_int8x4_store.glslh"
+
+// Output buffer: packed int8x4 values
+${layout_declare_tensor(B, "w", "t_out", "int", "buffer")}
+// Input buffers: packed int8x4 values
+${layout_declare_tensor(B, "r", "t_in_a", "int", "buffer")}
+${layout_declare_tensor(B, "r", "t_in_b", "int", "buffer")}
+
+// Metadata for output and input tensors
+${layout_declare_ubo(B, "BufferMetadata", "out_meta")}
+${layout_declare_ubo(B, "BufferMetadata", "in_a_meta")}
+${layout_declare_ubo(B, "BufferMetadata", "in_b_meta")}
+
+layout(push_constant) uniform restrict Block {
+  float input_a_scale;
+  int input_a_zp;
+  float input_b_scale;
+  int input_b_zp;
+  float output_inv_scale;
+  int output_zp;
+};
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")}
+${layout_declare_spec_const(C, "int", "in_layout", "CONTIG_LAYOUT_INT")}
+${layout_declare_spec_const(C, "int", "block_config", "0")}
+
+// Generate loading functions for input buffers
+define_load_int8x4_buffer_fns(t_in_a)
+define_load_int8x4_buffer_fns(t_in_b)
+
+// Generate storing functions for output buffer
+define_store_int8x4_buffer_fns(t_out)
+
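+// Each invocation handles one int8x4 block (four packed ints, i.e. a 4x4
+// tile of int8 values). The linear dispatch index is mapped to the block's
+// base tensor index via block_config, and the generated load/store helpers
+// consult the in_layout / out_layout spec constants to locate each block,
+// which is what makes the same kernel work for every PackedInt8 layout.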
+void main() {
+  // Buffer storage: use linear dispatch
+  const uint contig_block_idx = gl_GlobalInvocationID.x;
+  TensorIndex4D tidx = contiguous_block_idx_to_tensor4d_idx_with_block_config(
+      out_meta, contig_block_idx, block_config);
+
+  if (out_of_bounds(tidx, out_meta)) {
+    return;
+  }
+
+  const int block_outer_dim = get_block_outer_dim(block_config);
+
+  // Load int8x4 blocks from both inputs
+  ivec4 in_block_a = load_int8x4_block_from_t_in_a(
+      in_a_meta, tidx, in_layout, block_outer_dim);
+  ivec4 in_block_b = load_int8x4_block_from_t_in_b(
+      in_b_meta, tidx, in_layout, block_outer_dim);
+
+  ivec4 out_block;
+
+  for (int row = 0; row < 4; row++) {
+    vec4 in_texel_a = unpack_and_dequantize(
+        in_block_a[row], input_a_scale, input_a_zp);
+    vec4 in_texel_b = unpack_and_dequantize(
+        in_block_b[row], input_b_scale, input_b_zp);
+
+    vec4 out_texel = op(in_texel_a, in_texel_b);
+    out_block[row] = quantize_and_pack(out_texel, output_inv_scale, output_zp);
+  }
+
+  // Store to output buffer
+  store_int8x4_block_to_t_out(
+      out_meta, tidx, out_layout, block_outer_dim, out_block);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.yaml
similarity index 61%
rename from backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml
rename to backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.yaml
index e19ed8839eb..2060f7e42ba 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.yaml
@@ -4,16 +4,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-binary_q8ta_q8ta_q8to:
+q8ta_binary:
   parameter_names_with_default_values:
     OPERATOR: X + Y
-    NDIM: 3
-    DTYPE: float
-    PACKING: C_packed
-    IO_STORAGE: buffer
-  generate_variant_forall:
-    IO_STORAGE:
-      - VALUE: buffer
   shader_variants:
-    - NAME: add_q8ta_q8ta_q8to
+    - NAME: q8ta_add_buffer
      OPERATOR: X + Y
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedBinary.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
similarity index 53%
rename from backends/vulkan/runtime/graph/ops/impl/QuantizedBinary.cpp
rename to backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
index 99b5880c2eb..c7030b64ee4 100644
--- a/backends/vulkan/runtime/graph/ops/impl/QuantizedBinary.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
@@ -9,38 +9,15 @@
 #include
 #include
-#include
-#include
 #include
 
 namespace vkcompute {
 
-//
-// Shader dispatch utilities
-//
-
-utils::uvec3 pick_q8ta_q8ta_q8to_binary_global_wg_size(
-    ComputeGraph* graph,
-    const vkapi::ShaderInfo& shader,
-    const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& resize_args) {
-  const ValueRef packed_int8_output = args.at(0).refs.at(0);
-
-  const uint32_t W = graph->size_at<uint32_t>(-1, packed_int8_output);
-  const uint32_t H = graph->size_at<uint32_t>(-2, packed_int8_output);
-  const uint32_t C = graph->size_at<uint32_t>(-3, packed_int8_output);
-
-  const uint32_t W4 = utils::div_up_4(W);
-  const uint32_t C4 = utils::div_up_4(C);
-
-  return {W4 * H * C4, 1, 1};
-}
-
 //
 // Dispatch nodes
 //
 
-void add_q8ta_q8ta_q8to_binary_node(
+void add_q8ta_binary_node(
     ComputeGraph& graph,
     const ValueRef packed_int8_input_a,
     const ValueRef packed_int8_input_b,
@@ -68,11 +45,15 @@
     alpha_val = graph.extract_scalar<float>(alpha);
   }
 
-  std::string kernel_name = op_name + "_q8ta_q8ta_q8to";
+  std::string kernel_name = "q8ta_" + op_name;
   add_storage_type_suffix(
       kernel_name, graph.storage_type_of(packed_int8_output));
 
-  vkapi::ParamsBindList param_buffers = {graph.sizes_ubo(packed_int8_output)};
+  // Pass metadata for output and input tensors
+  vkapi::ParamsBindList param_buffers;
+  param_buffers.append(graph.buffer_meta_ubo(packed_int8_output));
+  param_buffers.append(graph.buffer_meta_ubo(packed_int8_input_a));
+  param_buffers.append(graph.buffer_meta_ubo(packed_int8_input_b));
 
   std::vector<PushConstantDataInfo> push_constants = {
       PushConstantDataInfo(&input_a_scale_val, sizeof(input_a_scale_val)),
@@ -84,11 +65,19 @@
       PushConstantDataInfo(&alpha_val, sizeof(alpha_val)),
   };
 
+  // Create block config for output tensor: inner_dim = output's packed_dim
+  const BlockConfig block_config =
+      create_block_config_for_tensor(graph, packed_int8_output);
+
+  // Cast block config to ValueRef for pick_linear_global_wg_with_block_config
+  const ValueRef block_config_ref =
+      static_cast<ValueRef>(block_config.as_packed_int());
+
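+  // A single shader variant serves all layouts: the hashed layouts of the
+  // output and input tensors, plus the packed block config, are supplied to
+  // the shader as specialization constants below.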
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      pick_q8ta_q8ta_q8to_binary_global_wg_size,
-      default_pick_local_wg_size,
+      pick_linear_global_wg_with_block_config,
+      pick_square_local_wg_with_block_config,
       // Inputs and Outputs
       {{packed_int8_output, vkapi::kWrite},
        {{packed_int8_input_a, packed_int8_input_b}, vkapi::kRead}},
@@ -97,9 +86,11 @@
       // Parameter buffers
       param_buffers,
       // Push Constants
       push_constants,
       // Specialization Constants
-      {},
+      {graph.hashed_layout_of(packed_int8_output),
+       graph.hashed_layout_of(packed_int8_input_a),
+       block_config.as_packed_int()},
       // Resize args
-      {},
+      {block_config_ref},
       // Resizing Logic
       nullptr));
 }
 
@@ -108,9 +99,7 @@ void add_q8ta_q8ta_q8to_binary_node(
 // High level operator impl
 //
 
-void add_q8ta_q8ta_q8to(
-    ComputeGraph& graph,
-    const std::vector<ValueRef>& args) {
+void q8ta_add(ComputeGraph& graph, const std::vector<ValueRef>& args) {
   int32_t idx = 0;
   const ValueRef packed_int8_input_a = args.at(idx++);
   const ValueRef packed_int8_input_b = args.at(idx++);
@@ -123,7 +112,7 @@
   const ValueRef alpha = args.at(idx++);
   const ValueRef packed_int8_output = args.at(idx++);
 
-  add_q8ta_q8ta_q8to_binary_node(
+  add_q8ta_binary_node(
       graph,
       packed_int8_input_a,
      packed_int8_input_b,
@@ -138,73 +127,8 @@
       "add");
 }
 
-//
-// Test operators
-//
-
-void add_q8ta_q8ta_q8to_test(
-    ComputeGraph& graph,
-    const std::vector<ValueRef>& args) {
-  int32_t idx = 0;
-  const ValueRef fp_input_a = args.at(idx++);
-  const ValueRef fp_input_b = args.at(idx++);
-  const ValueRef input_a_scale = args.at(idx++);
-  const ValueRef input_a_zp = args.at(idx++);
-  const ValueRef input_b_scale = args.at(idx++);
-  const ValueRef input_b_zp = args.at(idx++);
-  const ValueRef output_scale = args.at(idx++);
-  const ValueRef output_zp = args.at(idx++);
-  const ValueRef alpha = args.at(idx++);
-  const ValueRef fp_output = args.at(idx++);
-
-  TmpTensor packed_int8_input_a(
-      &graph,
-      graph.sizes_of(fp_input_a),
-      vkapi::kInt8x4,
-      utils::kBuffer,
-      utils::kPackedInt8_4W4C);
-
-  TmpTensor packed_int8_input_b(
-      &graph,
-      graph.sizes_of(fp_input_b),
-      vkapi::kInt8x4,
-      utils::kBuffer,
-      utils::kPackedInt8_4W4C);
-
-  TmpTensor packed_int8_output(
-      &graph,
-      graph.sizes_of(fp_output),
-      vkapi::kInt8x4,
-      utils::kBuffer,
-      utils::kPackedInt8_4W4C);
-
-  add_quantize_and_pack_4w4c_node(
-      graph, fp_input_a, input_a_scale, input_a_zp, packed_int8_input_a);
-
-  add_quantize_and_pack_4w4c_node(
-      graph, fp_input_b, input_b_scale, input_b_zp, packed_int8_input_b);
-
-  std::vector<ValueRef> add_args = {
-      packed_int8_input_a,
-      packed_int8_input_b,
-      input_a_scale,
-      input_a_zp,
-      input_b_scale,
-      input_b_zp,
-      output_scale,
-      output_zp,
-      alpha,
-      packed_int8_output};
-
-  add_q8ta_q8ta_q8to(graph, add_args);
-
-  add_unpack_4w4c_and_dequantize_node(
-      graph, packed_int8_output, output_scale, output_zp, fp_output);
-}
-
 REGISTER_OPERATORS {
-  VK_REGISTER_OP(et_vk.add_q8ta_q8ta_q8to.default, add_q8ta_q8ta_q8to);
-  VK_REGISTER_OP(et_vk.add_q8ta_q8ta_q8to.test, add_q8ta_q8ta_q8to_test);
+  VK_REGISTER_OP(et_vk.q8ta_add.default, q8ta_add);
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.h b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.h
new file mode 100644
index 00000000000..512849762cb
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+
+namespace vkcompute {
+
+//
+// Binary operations for int8x4 tensors
+//
+
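+// Appends a dispatch node that computes a binary op over two packed int8x4
+// input tensors and writes a packed int8x4 output. All three tensors are
+// buffer-backed and any PackedInt8 memory layout is supported.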
+void add_q8ta_binary_node(
+    ComputeGraph& graph,
+    const ValueRef packed_int8_input_a,
+    const ValueRef packed_int8_input_b,
+    const ValueRef input_a_scale,
+    const ValueRef input_a_zp,
+    const ValueRef input_b_scale,
+    const ValueRef input_b_zp,
+    const ValueRef output_scale,
+    const ValueRef output_zp,
+    const ValueRef alpha,
+    const ValueRef packed_int8_output,
+    const std::string& op_name);
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/CMakeLists.txt b/backends/vulkan/test/custom_ops/CMakeLists.txt
index 0121c84bb5b..0777ad76a8d 100644
--- a/backends/vulkan/test/custom_ops/CMakeLists.txt
+++ b/backends/vulkan/test/custom_ops/CMakeLists.txt
@@ -99,8 +99,8 @@ if(TARGET vulkan_backend)
   add_operator_prototype(choose_qparams_per_row)
   add_operator_prototype(test_q8ta_qdq)
   add_operator_prototype(test_q8ta_clone)
+  add_operator_prototype(test_q8ta_binary)
   add_operator_prototype(test_q8ta_conv2d)
   add_operator_prototype(test_q8ta_conv2d_pw)
   add_operator_prototype(test_q8ta_conv2d_dw)
-  add_operator_prototype(q8ta_q8ta_q8to_add)
 endif()
diff --git a/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp b/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
new file mode 100644
index 00000000000..53f8859b581
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include
+
+#include
+#include
+
+namespace vkcompute {
+
+void q8ta_add_test(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  int32_t idx = 0;
+  const ValueRef fp_input_a = args.at(idx++);
+  const ValueRef fp_input_b = args.at(idx++);
+  const ValueRef input_a_scale = args.at(idx++);
+  const ValueRef input_a_zp = args.at(idx++);
+  const ValueRef input_b_scale = args.at(idx++);
+  const ValueRef input_b_zp = args.at(idx++);
+  const ValueRef output_scale = args.at(idx++);
+  const ValueRef output_zp = args.at(idx++);
+  const ValueRef alpha = args.at(idx++);
+  const ValueRef quant_layout_int = args.at(idx++);
+  const ValueRef fp_output = args.at(idx++);
+
+  // Extract the layout parameter and cast to GPUMemoryLayout
+  int32_t layout_value = graph.extract_scalar<int32_t>(quant_layout_int);
+  utils::GPUMemoryLayout quant_layout =
+      static_cast<utils::GPUMemoryLayout>(layout_value);
+
+  // Create temporary tensors for quantized data with the specified layout
+  TmpTensor packed_int8_input_a(
+      &graph,
+      graph.sizes_of(fp_input_a),
+      vkapi::kInt8x4,
+      utils::kBuffer,
+      quant_layout);
+
+  TmpTensor packed_int8_input_b(
+      &graph,
+      graph.sizes_of(fp_input_b),
+      vkapi::kInt8x4,
+      utils::kBuffer,
+      quant_layout);
+
+  TmpTensor packed_int8_output(
+      &graph,
+      graph.sizes_of(fp_output),
+      vkapi::kInt8x4,
+      utils::kBuffer,
+      quant_layout);
+
+  // Quantize: FP -> int8x4 with specified layout
+  add_q8ta_quantize_node(
+      graph, fp_input_a, input_a_scale, input_a_zp, packed_int8_input_a);
+
+  add_q8ta_quantize_node(
+      graph, fp_input_b, input_b_scale, input_b_zp, packed_int8_input_b);
+
+  // Binary add: int8x4 -> int8x4 (same layout for all tensors)
+  add_q8ta_binary_node(
+      graph,
+      packed_int8_input_a,
+      packed_int8_input_b,
+      input_a_scale,
+      input_a_zp,
+      input_b_scale,
+      input_b_zp,
+      output_scale,
+      output_zp,
+      alpha,
+      packed_int8_output,
+      "add");
+
+  // Dequantize: int8x4 -> FP
+  add_q8ta_dequantize_node(
+      graph, packed_int8_output, output_scale, output_zp, fp_output);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(et_vk.q8ta_add.test, q8ta_add_test);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp b/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp
deleted file mode 100644
index eb8e6908060..00000000000
--- a/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp
+++ /dev/null
@@ -1,258 +0,0 @@
-// Copyright (c) Meta Platforms, Inc. and affiliates.
-// All rights reserved.
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include
-#include
-#include
-#include
-#include "utils.h"
-
-using namespace executorch::vulkan::prototyping;
-
-// Utility function to create a test case for quantized add operation
-TestCase create_quantized_add_test_case(
-    const std::vector<int64_t>& sizes,
-    utils::StorageType storage_type,
-    vkapi::ScalarType input_dtype) {
-  TestCase test_case;
-
-  // Create a descriptive name for the test case
-  std::string size_str = "";
-  for (size_t i = 0; i < sizes.size(); ++i) {
-    size_str += std::to_string(sizes[i]);
-    if (i < sizes.size() - 1)
-      size_str += "x";
-  }
-
-  std::string storage_str =
-      (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer";
-  std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half";
-
-  std::string test_name =
-      "QuantizedAdd_" + size_str + "_" + storage_str + "_" + dtype_str;
-  test_case.set_name(test_name);
-
-  // Set the operator name for the test case
-  test_case.set_operator_name("et_vk.add_q8ta_q8ta_q8to.test");
-
-  utils::GPUMemoryLayout io_memory_layout = storage_type == utils::kBuffer
-      ? utils::kWidthPacked
-      : utils::kChannelsPacked;
-
-  // Input tensor A (float/half)
-  ValueSpec input_a(
-      sizes, input_dtype, storage_type, io_memory_layout, DataGenType::RANDOM);
-
-  // Input tensor B (float/half)
-  ValueSpec input_b(
-      sizes, input_dtype, storage_type, io_memory_layout, DataGenType::RANDOM);
-
-  // Quantization parameters for input A
-  float input_a_scale_val = 0.007843; // 2/255 approximately
-  ValueSpec input_a_scale(input_a_scale_val);
-
-  int32_t input_a_zero_point_val = 3;
-  ValueSpec input_a_zero_point(input_a_zero_point_val);
-
-  // Quantization parameters for input B
-  float input_b_scale_val = 0.009412; // 2.4/255 approximately
-  ValueSpec input_b_scale(input_b_scale_val);
-
-  int32_t input_b_zero_point_val = -2;
-  ValueSpec input_b_zero_point(input_b_zero_point_val);
-
-  // Output quantization parameters
-  float output_scale_val = 0.015686; // 4/255 approximately
-  ValueSpec output_scale(output_scale_val);
-
-  int32_t output_zero_point_val = 1;
-  ValueSpec output_zero_point(output_zero_point_val);
-
-  // Alpha parameter
-  float alpha_val = 1.0f;
-  ValueSpec alpha(alpha_val);
-
-  // Output tensor (float/half)
-  ValueSpec output(
-      sizes, input_dtype, storage_type, io_memory_layout, DataGenType::ZEROS);
-
-  // Add all specs to test case for q8ta_q8ta_q8to add operation
-  test_case.add_input_spec(input_a);
-  test_case.add_input_spec(input_b);
-  test_case.add_input_spec(input_a_scale);
-  test_case.add_input_spec(input_a_zero_point);
-  test_case.add_input_spec(input_b_scale);
-  test_case.add_input_spec(input_b_zero_point);
-  test_case.add_input_spec(output_scale);
-  test_case.add_input_spec(output_zero_point);
-  test_case.add_input_spec(alpha);
-
-  test_case.add_output_spec(output);
-
-  test_case.set_abs_tolerance(output_scale_val + 1e-4f);
-
-  return test_case;
-}
-
-// Generate test cases for quantized add operation
-std::vector<TestCase> generate_quantized_add_test_cases() {
-  std::vector<TestCase> test_cases;
-
-  // Define different input size configurations
-  std::vector<std::vector<int64_t>> size_configs = {
-      {3, 32, 32}, // Small square
-      {8, 64, 64}, // Medium square
-      {16, 16, 16}, // 3D cube
-      {8, 32, 16}, // 3D rectangular
-      {7, 7, 13}, // Irregular sizes
-  };
-
-  // Storage types to test
-  std::vector<utils::StorageType> storage_types = {
-      utils::kTexture3D, utils::kBuffer};
-
-  // Data types to test
-  std::vector<vkapi::ScalarType> data_types = {vkapi::kFloat};
-
-  // Generate test cases for each combination
-  for (const auto& sizes : size_configs) {
-    for (const auto& storage_type : storage_types) {
-      for (const auto& data_type : data_types) {
-        test_cases.push_back(
-            create_quantized_add_test_case(sizes, storage_type, data_type));
-      }
-    }
-  }
-
-  return test_cases;
-}
-
-// Reference implementation for quantized add operation
-void add_q8ta_q8ta_q8to_reference_impl(TestCase& test_case) {
-  // Extract input specifications
-  int32_t idx = 0;
-  const ValueSpec& input_a_spec = test_case.inputs()[idx++];
-  const ValueSpec& input_b_spec = test_case.inputs()[idx++];
-  const ValueSpec& input_a_scale_spec = test_case.inputs()[idx++];
-  const ValueSpec& input_a_zero_point_spec = test_case.inputs()[idx++];
-  const ValueSpec& input_b_scale_spec = test_case.inputs()[idx++];
-  const ValueSpec& input_b_zero_point_spec = test_case.inputs()[idx++];
-  const ValueSpec& output_scale_spec = test_case.inputs()[idx++];
-  const ValueSpec& output_zero_point_spec = test_case.inputs()[idx++];
-  const ValueSpec& alpha_spec = test_case.inputs()[idx++];
-
-  // Extract output specification (mutable reference)
-  ValueSpec& output_spec = test_case.outputs()[0];
-
-  // Get tensor dimensions
-  auto input_sizes = input_a_spec.get_tensor_sizes();
-  int64_t num_elements = input_a_spec.numel();
-
-  if (input_a_spec.dtype != vkapi::kFloat) {
-    throw std::invalid_argument("Unsupported dtype");
-  }
-
-  // Get raw data pointers
-  auto& input_a_data = input_a_spec.get_float_data();
-  auto& input_b_data = input_b_spec.get_float_data();
-
-  const float input_a_scale = input_a_scale_spec.get_float_value();
-  const int32_t input_a_zero_point = input_a_zero_point_spec.get_int_value();
-  const float input_b_scale = input_b_scale_spec.get_float_value();
-  const int32_t input_b_zero_point = input_b_zero_point_spec.get_int_value();
-  const float output_scale = output_scale_spec.get_float_value();
-  const int32_t output_zero_point = output_zero_point_spec.get_int_value();
-  const float alpha = alpha_spec.get_float_value();
-
-  auto& ref_data = output_spec.get_ref_float_data();
-  ref_data.resize(num_elements);
-
-  // Perform quantized add operation
-  for (int64_t i = 0; i < num_elements; ++i) {
-    // Quantize input A to int8
-    float quant_a_f =
-        std::round(input_a_data[i] / input_a_scale) + input_a_zero_point;
-    quant_a_f = std::min(std::max(quant_a_f, -128.0f), 127.0f);
-    int8_t quantized_a = static_cast<int8_t>(quant_a_f);
-
-    // Quantize input B to int8
-    float quant_b_f =
-        std::round(input_b_data[i] / input_b_scale) + input_b_zero_point;
-    quant_b_f = std::min(std::max(quant_b_f, -128.0f), 127.0f);
-    int8_t quantized_b = static_cast<int8_t>(quant_b_f);
-
-    // Dequantize both inputs to a common scale for addition
-    float dequant_a =
-        (static_cast<float>(quantized_a) - input_a_zero_point) * input_a_scale;
-    float dequant_b =
-        (static_cast<float>(quantized_b) - input_b_zero_point) * input_b_scale;
-
-    // Perform addition in float space with alpha
-    float float_result = dequant_a + alpha * dequant_b;
-
-    // Quantize the result to int8
-    float quant_output_f =
-        std::round(float_result / output_scale) + output_zero_point;
-    quant_output_f = std::min(std::max(quant_output_f, -128.0f), 127.0f);
-    int8_t quantized_output = static_cast<int8_t>(quant_output_f);
-
-    // Dequantize back to float for comparison
-    float dequant_output =
-        (static_cast<float>(quantized_output) - output_zero_point) *
-        output_scale;
-
-    ref_data[i] = dequant_output;
-  }
-}
-
-void reference_impl(TestCase& test_case) {
-  add_q8ta_q8ta_q8to_reference_impl(test_case);
-}
-
-// Custom FLOP calculator for quantized add operation
-int64_t quantized_add_flop_calculator(const TestCase& test_case) {
-  // Calculate total elements from the first input tensor
-  int64_t total_elements = 1;
-  if (!test_case.empty() && test_case.num_inputs() > 0 &&
-      test_case.inputs()[0].is_tensor()) {
-    const auto& sizes = test_case.inputs()[0].get_tensor_sizes();
-    for (int64_t size : sizes) {
-      total_elements *= size;
-    }
-  }
-
-  // Quantized add operation includes:
-  // - 2 quantizations (float to int8)
-  // - 2 dequantizations (int8 to float)
-  // - 1 addition
-  // For simplicity, we count this as 1 FLOP per element (the addition)
-  return total_elements;
-}
-
-int main(int argc, char* argv[]) {
-  set_debugging(false);
-  set_print_output(false);
-  set_print_latencies(false);
-  set_use_gpu_timestamps(true);
-
-  print_performance_header();
-  std::cout << "Quantized Add Operation (q8ta_q8ta_q8to) Prototyping Framework"
-            << std::endl;
-  print_separator();
-
-  ReferenceComputeFunc ref_fn = reference_impl;
-
-  // Execute test cases using the new framework with custom FLOP calculator
-  auto results = execute_test_cases(
-      generate_quantized_add_test_cases,
-      quantized_add_flop_calculator,
-      "QuantizedAddQ8taQ8taQ8to",
-      0,
-      1,
-      ref_fn);
-
-  return 0;
-}
diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl
index 63423ed410f..73b1e343bbe 100644
--- a/backends/vulkan/test/custom_ops/targets.bzl
+++ b/backends/vulkan/test/custom_ops/targets.bzl
@@ -93,7 +93,7 @@ def define_common_targets(is_fbcode = False):
     define_custom_op_test_binary("q4gsw_linear")
     define_custom_op_test_binary("test_q8ta_qdq")
     define_custom_op_test_binary("test_q8ta_clone")
+    define_custom_op_test_binary("test_q8ta_binary")
     define_custom_op_test_binary("test_q8ta_conv2d")
     define_custom_op_test_binary("test_q8ta_conv2d_pw")
     define_custom_op_test_binary("test_q8ta_conv2d_dw")
-    define_custom_op_test_binary("q8ta_q8ta_q8to_add")
diff --git a/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp b/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp
new file mode 100644
index 00000000000..1cb364c6f8d
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp
@@ -0,0 +1,373 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "utils.h"
+
+using namespace executorch::vulkan::prototyping;
+
+static constexpr int64_t kRefDimSizeLimit = 512;
+
+// Configuration struct for q8ta binary testing
+struct Q8taBinaryConfig {
+  std::vector<int64_t> shape; // Tensor shape (can be any dimensionality)
+  std::string test_case_name = "placeholder";
+  std::string op_name = "q8ta_add";
+};
+
+// Utility function to create a test case from a Q8taBinaryConfig
+TestCase create_test_case_from_config(
+    const Q8taBinaryConfig& config,
+    utils::StorageType storage_type,
+    vkapi::ScalarType input_dtype,
+    utils::GPUMemoryLayout fp_memory_layout,
+    utils::GPUMemoryLayout quant_layout) {
+  TestCase test_case;
+
+  // Create a descriptive name for the test case
+  std::string shape_str = shape_string(config.shape);
+  std::string test_name = config.test_case_name + " I=" + shape_str + " " +
+      repr_str(storage_type, fp_memory_layout) + "->" +
+      repr_str(utils::kBuffer, quant_layout);
+  test_case.set_name(test_name);
+
+  // Set the operator name for the test case
+  std::string operator_name = "et_vk." + config.op_name + ".test";
+  test_case.set_operator_name(operator_name);
+
+  // Input tensor A (float/half)
+  ValueSpec input_a(
+      config.shape,
+      input_dtype,
+      storage_type,
+      fp_memory_layout,
+      DataGenType::RANDOM);
+
+  // Input tensor B (float/half)
+  ValueSpec input_b(
+      config.shape,
+      input_dtype,
+      storage_type,
+      fp_memory_layout,
+      DataGenType::RANDOM);
+
+  // Quantization parameters for input A
+  float input_a_scale_val = 0.007843; // 2/255 approximately
+  ValueSpec input_a_scale(input_a_scale_val);
+
+  int32_t input_a_zero_point_val = 3;
+  ValueSpec input_a_zero_point(input_a_zero_point_val);
+
+  // Quantization parameters for input B
+  float input_b_scale_val = 0.009412; // 2.4/255 approximately
+  ValueSpec input_b_scale(input_b_scale_val);
+
+  int32_t input_b_zero_point_val = -2;
+  ValueSpec input_b_zero_point(input_b_zero_point_val);
+
+  // Output quantization parameters
+  float output_scale_val = 0.015686; // 4/255 approximately
+  ValueSpec output_scale(output_scale_val);
+
+  int32_t output_zero_point_val = 1;
+  ValueSpec output_zero_point(output_zero_point_val);
+
+  // Alpha parameter
+  float alpha_val = 1.0f;
+  ValueSpec alpha(alpha_val);
+
+  // Quantized layout as integer
+  int32_t quant_layout_int = static_cast<int32_t>(quant_layout);
+  ValueSpec quant_layout_spec(quant_layout_int);
+
+  // Output tensor (float/half)
+  ValueSpec output(
+      config.shape,
+      input_dtype,
+      storage_type,
+      fp_memory_layout,
+      DataGenType::ZEROS);
+
+  // Add all specs to test case for q8ta add operation
+  test_case.add_input_spec(input_a);
+  test_case.add_input_spec(input_b);
+  test_case.add_input_spec(input_a_scale);
+  test_case.add_input_spec(input_a_zero_point);
+  test_case.add_input_spec(input_b_scale);
+  test_case.add_input_spec(input_b_zero_point);
+  test_case.add_input_spec(output_scale);
+  test_case.add_input_spec(output_zero_point);
+  test_case.add_input_spec(alpha);
+  test_case.add_input_spec(quant_layout_spec);
+
+  test_case.add_output_spec(output);
+
+  test_case.set_abs_tolerance(output_scale_val + 1e-4f);
+
+  // Use layout-only filter to focus on the binary operation
+  test_case.set_shader_filter({
+      "nchw_to",
+      "to_nchw",
+      "q8ta_quantize",
+      "q8ta_dequantize",
+  });
+
+  return test_case;
+}
+
+// Generate easy test cases for q8ta_add operation (for debugging)
+std::vector<TestCase> generate_q8ta_add_easy_cases() {
+  std::vector<TestCase> test_cases;
+
+  // Single simple configuration for debugging
+  Q8taBinaryConfig config = {
+      {1, 16, 16, 16}, // shape: [N, C, H, W]
+      "ACCU", // test_case_name
+  };
+
+  // FP memory layouts to test
+  std::vector<utils::GPUMemoryLayout> fp_layouts = {
+      utils::kWidthPacked,
+      utils::kChannelsPacked,
+  };
+
+  // Quantized memory layouts to test
+  std::vector<utils::GPUMemoryLayout> quant_layouts = {
+      utils::kPackedInt8_4W,
+      utils::kPackedInt8_4C,
+      utils::kPackedInt8_4W4C,
+      utils::kPackedInt8_4H4W,
+      utils::kPackedInt8_4C1W,
+  };
+
+  std::vector<utils::StorageType> storage_types = {utils::kBuffer};
+  std::vector<vkapi::ScalarType> float_types = {vkapi::kFloat};
+
+  // Generate test cases for each combination
+  for (const auto& fp_layout : fp_layouts) {
+    for (const auto& quant_layout : quant_layouts) {
+      for (const auto& storage_type : storage_types) {
+        for (const auto& input_dtype : float_types) {
+          test_cases.push_back(create_test_case_from_config(
+              config, storage_type, input_dtype, fp_layout, quant_layout));
+        }
+      }
+    }
+  }
+
+  return test_cases;
+}
+
+// Generate test cases for q8ta_add operation
+std::vector<TestCase> generate_q8ta_add_test_cases() {
+  std::vector<TestCase> test_cases;
+
+  // Shapes to test
+  std::vector<std::vector<int64_t>> shapes = {
+      // Small test cases for correctness
+      {1, 3, 16, 16},
+      {1, 8, 32, 32},
+      {1, 16, 24, 24},
+      {1, 32, 12, 12},
+      {1, 1, 64, 64},
+      {1, 3, 64, 64},
+      {1, 4, 16, 16},
+
+      // Different tensor sizes
+      {1, 8, 20, 20},
+      {1, 16, 14, 14},
+      {1, 8, 28, 28},
+
+      // Odd tensor sizes
+      {1, 3, 15, 15},
+      {1, 13, 31, 31},
+      {1, 17, 23, 23},
+
+      // Performance test cases (larger tensors)
+      {1, 64, 128, 128},
+      {1, 32, 64, 64},
+      {1, 128, 56, 56},
+      {1, 128, 128, 128},
+  };
+
+  // FP memory layouts to test
+  std::vector<utils::GPUMemoryLayout> fp_layouts = {
+      utils::kWidthPacked,
+      utils::kChannelsPacked,
+  };
+
+  // Quantized memory layouts to test
+  std::vector<utils::GPUMemoryLayout> quant_layouts = {
+      utils::kPackedInt8_4W,
+      utils::kPackedInt8_4C,
+      utils::kPackedInt8_4W4C,
+      utils::kPackedInt8_4H4W,
+      utils::kPackedInt8_4C1W,
+  };
+
+  // Test with buffer storage only
+  std::vector<utils::StorageType> storage_types = {utils::kBuffer};
+
+  // Generate all combinations
+  for (const auto& shape : shapes) {
+    // Generate test case name prefix from shape dimensions
+    std::string prefix = "ACCU";
+    for (const auto& dim : shape) {
+      if (dim > kRefDimSizeLimit) {
+        prefix = "PERF";
+        break;
+      }
+    }
+
+    for (const auto& fp_layout : fp_layouts) {
+      for (const auto& quant_layout : quant_layouts) {
+        for (const auto& storage_type : storage_types) {
+          Q8taBinaryConfig config;
+          config.shape = shape;
+          config.test_case_name = prefix;
+
+          test_cases.push_back(create_test_case_from_config(
+              config, storage_type, vkapi::kFloat, fp_layout, quant_layout));
+        }
+      }
+    }
+  }
+
+  return test_cases;
+}
+
+// Reference implementation for quantized add operation
+void q8ta_add_reference_impl(TestCase& test_case) {
+  int32_t idx = 0;
+  const ValueSpec& input_a_spec = test_case.inputs()[idx++];
+  const ValueSpec& input_b_spec = test_case.inputs()[idx++];
+  const ValueSpec& input_a_scale_spec = test_case.inputs()[idx++];
+  const ValueSpec& input_a_zero_point_spec = test_case.inputs()[idx++];
+  const ValueSpec& input_b_scale_spec = test_case.inputs()[idx++];
+  const ValueSpec& input_b_zero_point_spec = test_case.inputs()[idx++];
+  const ValueSpec& output_scale_spec = test_case.inputs()[idx++];
+  const ValueSpec& output_zero_point_spec = test_case.inputs()[idx++];
+  const ValueSpec& alpha_spec = test_case.inputs()[idx++];
+  const ValueSpec& quant_layout_spec = test_case.inputs()[idx++];
+  (void)quant_layout_spec; // Not used in reference implementation
+
+  // Extract output specification
+  ValueSpec& output_spec = test_case.outputs()[0];
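+
+  // Per element the reference mirrors the GPU math:
+  //   q  = clamp(round(x / scale) + zp, -128, 127)
+  //   x' = (q - zp) * scale
+  // i.e. quantize/dequantize both inputs, add with alpha, then requantize
+  // and dequantize with the output scale and zero point.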
+
+  // Get tensor dimensions
+  auto input_sizes = input_a_spec.get_tensor_sizes();
+
+  // Calculate total number of elements
+  int64_t num_elements = 1;
+  for (const auto& dim : input_sizes) {
+    num_elements *= dim;
+  }
+
+  // Skip for large tensors since computation time will be extremely slow
+  for (const auto& dim : input_sizes) {
+    if (dim > kRefDimSizeLimit) {
+      throw std::invalid_argument(
+          "One or more dimensions exceed the allowed limit for reference "
+          "implementation.");
+    }
+  }
+
+  if (input_a_spec.dtype != vkapi::kFloat) {
+    throw std::invalid_argument("Unsupported dtype");
+  }
+
+  // Get raw data pointers
+  auto& input_a_data = input_a_spec.get_float_data();
+  auto& input_b_data = input_b_spec.get_float_data();
+
+  const float input_a_scale = input_a_scale_spec.get_float_value();
+  const int32_t input_a_zero_point = input_a_zero_point_spec.get_int_value();
+  const float input_b_scale = input_b_scale_spec.get_float_value();
+  const int32_t input_b_zero_point = input_b_zero_point_spec.get_int_value();
+  const float output_scale = output_scale_spec.get_float_value();
+  const int32_t output_zero_point = output_zero_point_spec.get_int_value();
+  const float alpha = alpha_spec.get_float_value();
+
+  auto& ref_data = output_spec.get_ref_float_data();
+  ref_data.resize(num_elements);
+
+  // Perform quantized add operation
+  for (int64_t i = 0; i < num_elements; ++i) {
+    // Quantize input A to int8
+    float quant_a_f =
+        std::round(input_a_data[i] / input_a_scale) + input_a_zero_point;
+    quant_a_f = std::min(std::max(quant_a_f, -128.0f), 127.0f);
+    int8_t quantized_a = static_cast<int8_t>(quant_a_f);
+
+    // Quantize input B to int8
+    float quant_b_f =
+        std::round(input_b_data[i] / input_b_scale) + input_b_zero_point;
+    quant_b_f = std::min(std::max(quant_b_f, -128.0f), 127.0f);
+    int8_t quantized_b = static_cast<int8_t>(quant_b_f);
+
+    // Dequantize both inputs to a common scale for addition
+    float dequant_a =
+        (static_cast<float>(quantized_a) - input_a_zero_point) * input_a_scale;
+    float dequant_b =
+        (static_cast<float>(quantized_b) - input_b_zero_point) * input_b_scale;
+
+    // Perform addition in float space with alpha
+    float float_result = dequant_a + alpha * dequant_b;
+
+    // Quantize the result to int8
+    float quant_output_f =
+        std::round(float_result / output_scale) + output_zero_point;
+    quant_output_f = std::min(std::max(quant_output_f, -128.0f), 127.0f);
+    int8_t quantized_output = static_cast<int8_t>(quant_output_f);
+
+    // Dequantize back to float for comparison
+    float dequant_output =
+        (static_cast<float>(quantized_output) - output_zero_point) *
+        output_scale;
+
+    ref_data[i] = dequant_output;
+  }
+}
+
+int main(int argc, char* argv[]) {
+  set_debugging(false);
+  set_print_output(false);
+  set_print_latencies(false);
+  set_use_gpu_timestamps(true);
+
+  print_performance_header();
+  std::cout << "Q8TA Binary Add Operation Prototyping Framework" << std::endl;
+  print_separator();
+
+  ReferenceComputeFunc ref_fn = q8ta_add_reference_impl;
+
+  auto results = execute_test_cases(
+#ifdef DEBUG_MODE
+      generate_q8ta_add_easy_cases,
+#else
+      generate_q8ta_add_test_cases,
+#endif
+      "Q8taBinaryAdd",
+#ifdef DEBUG_MODE
+      0,
+      1,
+#else
+      3,
+      10,
+#endif
+      ref_fn);
+
+  return 0;
+}