diff --git a/backends/vulkan/custom_ops_lib.py b/backends/vulkan/custom_ops_lib.py
index ad65ae47a40..92bd9574dfc 100644
--- a/backends/vulkan/custom_ops_lib.py
+++ b/backends/vulkan/custom_ops_lib.py
@@ -564,11 +564,11 @@ def apply_rotary_emb_impl(
 apply_rotary_emb_op = getattr(getattr(torch.ops, namespace), name)
 
 ########################
-## add_q8ta_q8ta_q8to ##
+## q8ta_add ##
 ########################
 
 
-def add_q8ta_q8ta_q8to_impl(
+def q8ta_add_impl(
     input_a: torch.Tensor,
     input_b: torch.Tensor,
     input_a_scale: float,
@@ -598,12 +598,12 @@ def add_q8ta_q8ta_q8to_impl(
     return quantized_result
 
 
-name = "add_q8ta_q8ta_q8to"
+name = "q8ta_add"
 lib.define(
     f"{name}(Tensor input_a, Tensor input_b, float input_a_scale, int input_a_zero_point, float input_b_scale, int input_b_zero_point, float output_scale, int output_zero_point, float alpha) -> Tensor"
 )
-lib.impl(name, add_q8ta_q8ta_q8to_impl, "CompositeExplicitAutograd")
-add_q8ta_q8ta_q8to_op = getattr(getattr(torch.ops, namespace), name)
+lib.impl(name, q8ta_add_impl, "CompositeExplicitAutograd")
+q8ta_add_op = getattr(getattr(torch.ops, namespace), name)
 
 #############################
 ## select_as_symint ##
diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index 3a8f7624bbb..0ad9c980ed6 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -501,14 +501,14 @@ def register_torchao_choose_qparams_affine():
 
 
 # =============================================================================
-# QuantizedBinary.cpp
+# Q8taBinary.cpp
 # =============================================================================
 
 
-@update_features(exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default)
-def register_add_q8ta_q8ta_q8to():
+@update_features(exir_ops.edge.et_vk.q8ta_add.default)
+def register_q8ta_add():
     return OpFeatures(
-        inputs_storage=utils.PACKED_INT8_4W4C_BUFFER,
+        inputs_storage=utils.PACKED_INT8_BUFFER,
         supports_resize=False,
         supports_prepacking=True,
     )
diff --git a/backends/vulkan/patterns/quantized_binary.py b/backends/vulkan/patterns/quantized_binary.py
index da4985b931d..9a18f148736 100644
--- a/backends/vulkan/patterns/quantized_binary.py
+++ b/backends/vulkan/patterns/quantized_binary.py
@@ -133,7 +133,7 @@ def make_add_q8ta_q8ta_q8to_custom_op(
         exir_ops.edge.aten.add.Tensor,
         exir_ops.edge.aten.add_.Tensor,
     }:
-        op_target = exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default
+        op_target = exir_ops.edge.et_vk.q8ta_add.default
     else:
         # For future binary operations, add more mappings here
         raise NotImplementedError(
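For reference, the arithmetic behind the renamed op is unchanged: quantize both float inputs with their own affine parameters, add (with `alpha`) in the dequantized domain, then requantize with the output parameters. Below is a minimal sketch of that contract, assuming `backends.vulkan.custom_ops_lib` has been imported so `torch.ops.et_vk.q8ta_add` is registered; the reference helper is illustrative, not part of this patch.

```python
import torch

def q8ta_add_reference(a, b, a_scale, a_zp, b_scale, b_zp, out_scale, out_zp, alpha=1.0):
    # Quantize each input to int8 with its own affine parameters.
    qa = torch.clamp(torch.round(a / a_scale) + a_zp, -128, 127)
    qb = torch.clamp(torch.round(b / b_scale) + b_zp, -128, 127)
    # Add in the dequantized domain, scaled by alpha.
    fp = (qa - a_zp) * a_scale + alpha * (qb - b_zp) * b_scale
    # Requantize with the output parameters; the `return quantized_result` in
    # q8ta_add_impl suggests the op yields this requantized tensor.
    return torch.clamp(torch.round(fp / out_scale) + out_zp, -128, 127).to(torch.int8)

x, y = torch.randn(1, 8, 16, 16), torch.randn(1, 8, 16, 16)
expected = q8ta_add_reference(x, y, 0.007843, 3, 0.009412, -2, 0.015686, 1)
# With custom_ops_lib imported, the registered op should agree:
# actual = torch.ops.et_vk.q8ta_add(x, y, 0.007843, 3, 0.009412, -2, 0.015686, 1, 1.0)
```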
- */ - -#version 450 core - -${define_required_extensions("buffer", DTYPE)} - -#define PRECISION ${PRECISION} - -#define NAME ${VARIANT_NAME} - -#define VEC4_T ${texel_load_type(DTYPE, "buffer")} -#define T ${texel_load_component_type(DTYPE, "buffer")} - -$if IO_STORAGE == "buffer": - #define PACKED_INT8_OUTPUT_BUFFER - #define PACKED_INT8_INPUT_BUFFER - -#define op(X, Y) ${OPERATOR} - -layout(std430) buffer; - -#include "indexing.glslh" -#include "common.glslh" - -${layout_declare_tensor(B, "w", "t_packed_int8_out", "int", IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int8_in_a", "int", IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int8_in_b", "int", IO_STORAGE, is_scalar_array=False)} - -${layout_declare_ubo(B, "ivec4", "out_sizes")} - -layout(push_constant) uniform restrict Block { - float input_a_scale; - int input_a_zp; - float input_b_scale; - int input_b_zp; - float output_inv_scale; - int output_zp; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const int tid = int(gl_GlobalInvocationID.x); - - const int W4 = div_up_4(out_sizes.x); - const int H = out_sizes.y; - const int C4 = div_up_4(out_sizes.z); - const int N = out_sizes.w; - - if (tid >= W4 * H * C4 * N) { - return; - } - - const ivec4 in_block_1 = t_packed_int8_in_a[tid]; - const ivec4 in_block_2 = t_packed_int8_in_b[tid]; - - ivec4 out_block = ivec4(pack_into_int32(ivec4(output_zp))); - - for (int row = 0; row < 4; row++) { - vec4 in_texel_1 = unpack_and_dequantize( - in_block_1[row], input_a_scale, input_a_zp); - vec4 in_texel_2 = unpack_and_dequantize( - in_block_2[row], input_b_scale, input_b_zp); - - vec4 out_texel = op(in_texel_1, in_texel_2); - out_block[row] = quantize_and_pack(out_texel, output_inv_scale, output_zp); - } - - t_packed_int8_out[tid] = out_block; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.glsl b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.glsl new file mode 100644 index 00000000000..60f437fbdce --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.glsl @@ -0,0 +1,91 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +${define_active_storage_type("buffer")} + +#define op(X, Y) ${OPERATOR} + +layout(std430) buffer; + +#include "indexing.glslh" +#include "common.glslh" +#include "block_indexing.glslh" +#include "block_int8x4_load.glslh" +#include "block_int8x4_store.glslh" + +// Output buffer: packed int8x4 values +${layout_declare_tensor(B, "w", "t_out", "int", "buffer")} +// Input buffers: packed int8x4 values +${layout_declare_tensor(B, "r", "t_in_a", "int", "buffer")} +${layout_declare_tensor(B, "r", "t_in_b", "int", "buffer")} + +// Metadata for output and input tensors +${layout_declare_ubo(B, "BufferMetadata", "out_meta")} +${layout_declare_ubo(B, "BufferMetadata", "in_a_meta")} +${layout_declare_ubo(B, "BufferMetadata", "in_b_meta")} + +layout(push_constant) uniform restrict Block { + float input_a_scale; + int input_a_zp; + float input_b_scale; + int input_b_zp; + float output_inv_scale; + int output_zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")} +${layout_declare_spec_const(C, "int", "in_layout", "CONTIG_LAYOUT_INT")} +${layout_declare_spec_const(C, "int", "block_config", "0")} + +// Generate loading functions for input buffers +define_load_int8x4_buffer_fns(t_in_a) +define_load_int8x4_buffer_fns(t_in_b) + +// Generate storing functions for output buffer +define_store_int8x4_buffer_fns(t_out) + +void main() { + // Buffer storage: use linear dispatch + const uint contig_block_idx = gl_GlobalInvocationID.x; + TensorIndex4D tidx = contiguous_block_idx_to_tensor4d_idx_with_block_config( + out_meta, contig_block_idx, block_config); + + if (out_of_bounds(tidx, out_meta)) { + return; + } + + const int block_outer_dim = get_block_outer_dim(block_config); + + // Load int8x4 blocks from both inputs + ivec4 in_block_a = load_int8x4_block_from_t_in_a( + in_a_meta, tidx, in_layout, block_outer_dim); + ivec4 in_block_b = load_int8x4_block_from_t_in_b( + in_b_meta, tidx, in_layout, block_outer_dim); + + ivec4 out_block; + + for (int row = 0; row < 4; row++) { + vec4 in_texel_a = unpack_and_dequantize( + in_block_a[row], input_a_scale, input_a_zp); + vec4 in_texel_b = unpack_and_dequantize( + in_block_b[row], input_b_scale, input_b_zp); + + vec4 out_texel = op(in_texel_a, in_texel_b); + out_block[row] = quantize_and_pack(out_texel, output_inv_scale, output_zp); + } + + // Store to output buffer + store_int8x4_block_to_t_out( + out_meta, tidx, out_layout, block_outer_dim, out_block); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.yaml similarity index 61% rename from backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml rename to backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.yaml index e19ed8839eb..2060f7e42ba 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.yaml @@ -4,16 +4,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.yaml
similarity index 61%
rename from backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml
rename to backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.yaml
index e19ed8839eb..2060f7e42ba 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.yaml
@@ -4,16 +4,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-binary_q8ta_q8ta_q8to:
+q8ta_binary:
   parameter_names_with_default_values:
     OPERATOR: X + Y
-    NDIM: 3
-    DTYPE: float
-    PACKING: C_packed
-    IO_STORAGE: buffer
-  generate_variant_forall:
-    IO_STORAGE:
-      - VALUE: buffer
   shader_variants:
-    - NAME: add_q8ta_q8ta_q8to
+    - NAME: q8ta_add_buffer
       OPERATOR: X + Y
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedBinary.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
similarity index 53%
rename from backends/vulkan/runtime/graph/ops/impl/QuantizedBinary.cpp
rename to backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
index 99b5880c2eb..c7030b64ee4 100644
--- a/backends/vulkan/runtime/graph/ops/impl/QuantizedBinary.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
@@ -9,38 +9,15 @@
 #include
 #include
-#include
-#include
 #include
 
 namespace vkcompute {
 
-//
-// Shader dispatch utilities
-//
-
-utils::uvec3 pick_q8ta_q8ta_q8to_binary_global_wg_size(
-    ComputeGraph* graph,
-    const vkapi::ShaderInfo& shader,
-    const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& resize_args) {
-  const ValueRef packed_int8_output = args.at(0).refs.at(0);
-
-  const uint32_t W = graph->size_at<uint32_t>(-1, packed_int8_output);
-  const uint32_t H = graph->size_at<uint32_t>(-2, packed_int8_output);
-  const uint32_t C = graph->size_at<uint32_t>(-3, packed_int8_output);
-
-  const uint32_t W4 = utils::div_up_4(W);
-  const uint32_t C4 = utils::div_up_4(C);
-
-  return {W4 * H * C4, 1, 1};
-}
-
 //
 // Dispatch nodes
 //
 
-void add_q8ta_q8ta_q8to_binary_node(
+void add_q8ta_binary_node(
     ComputeGraph& graph,
     const ValueRef packed_int8_input_a,
    const ValueRef packed_int8_input_b,
@@ -68,11 +45,15 @@
     alpha_val = graph.extract_scalar<float>(alpha);
   }
 
-  std::string kernel_name = op_name + "_q8ta_q8ta_q8to";
+  std::string kernel_name = "q8ta_" + op_name;
   add_storage_type_suffix(
       kernel_name, graph.storage_type_of(packed_int8_output));
 
-  vkapi::ParamsBindList param_buffers = {graph.sizes_ubo(packed_int8_output)};
+  // Pass metadata for output and input tensors
+  vkapi::ParamsBindList param_buffers;
+  param_buffers.append(graph.buffer_meta_ubo(packed_int8_output));
+  param_buffers.append(graph.buffer_meta_ubo(packed_int8_input_a));
+  param_buffers.append(graph.buffer_meta_ubo(packed_int8_input_b));
 
   std::vector<PushConstantDataInfo> push_constants = {
       PushConstantDataInfo(&input_a_scale_val, sizeof(input_a_scale_val)),
@@ -84,11 +65,19 @@
       PushConstantDataInfo(&alpha_val, sizeof(alpha_val)),
   };
 
+  // Create block config for output tensor: inner_dim = output's packed_dim
+  const BlockConfig block_config =
+      create_block_config_for_tensor(graph, packed_int8_output);
+
+  // Cast block config to ValueRef for pick_linear_global_wg_with_block_config
+  const ValueRef block_config_ref =
+      static_cast<ValueRef>(block_config.as_packed_int());
+
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      pick_q8ta_q8ta_q8to_binary_global_wg_size,
-      default_pick_local_wg_size,
+      pick_linear_global_wg_with_block_config,
+      pick_square_local_wg_with_block_config,
       // Inputs and Outputs
       {{packed_int8_output, vkapi::kWrite},
        {{packed_int8_input_a, packed_int8_input_b}, vkapi::kRead}},
@@ -97,9 +86,11 @@
       // Push Constants
       push_constants,
       // Specialization Constants
-      {},
+      {graph.hashed_layout_of(packed_int8_output),
+       graph.hashed_layout_of(packed_int8_input_a),
+       block_config.as_packed_int()},
       // Resize args
-      {},
+      {block_config_ref},
       // Resizing Logic
       nullptr));
 }
@@ -108,9 +99,7 @@ void add_q8ta_q8ta_q8to_binary_node(
 //
 // High level operator impl
 //
 
-void add_q8ta_q8ta_q8to(
-    ComputeGraph& graph,
-    const std::vector<ValueRef>& args) {
+void q8ta_add(ComputeGraph& graph, const std::vector<ValueRef>& args) {
   int32_t idx = 0;
   const ValueRef packed_int8_input_a = args.at(idx++);
   const ValueRef packed_int8_input_b = args.at(idx++);
@@ -123,7 +112,7 @@ void add_q8ta_q8ta_q8to(
   const ValueRef alpha = args.at(idx++);
   const ValueRef packed_int8_output = args.at(idx++);
 
-  add_q8ta_q8ta_q8to_binary_node(
+  add_q8ta_binary_node(
       graph,
       packed_int8_input_a,
       packed_int8_input_b,
@@ -138,73 +127,8 @@ void add_q8ta_q8ta_q8to(
       "add");
 }
 
-//
-// Test operators
-//
-
-void add_q8ta_q8ta_q8to_test(
-    ComputeGraph& graph,
-    const std::vector<ValueRef>& args) {
-  int32_t idx = 0;
-  const ValueRef fp_input_a = args.at(idx++);
-  const ValueRef fp_input_b = args.at(idx++);
-  const ValueRef input_a_scale = args.at(idx++);
-  const ValueRef input_a_zp = args.at(idx++);
-  const ValueRef input_b_scale = args.at(idx++);
-  const ValueRef input_b_zp = args.at(idx++);
-  const ValueRef output_scale = args.at(idx++);
-  const ValueRef output_zp = args.at(idx++);
-  const ValueRef alpha = args.at(idx++);
-  const ValueRef fp_output = args.at(idx++);
-
-  TmpTensor packed_int8_input_a(
-      &graph,
-      graph.sizes_of(fp_input_a),
-      vkapi::kInt8x4,
-      utils::kBuffer,
-      utils::kPackedInt8_4W4C);
-
-  TmpTensor packed_int8_input_b(
-      &graph,
-      graph.sizes_of(fp_input_b),
-      vkapi::kInt8x4,
-      utils::kBuffer,
-      utils::kPackedInt8_4W4C);
-
-  TmpTensor packed_int8_output(
-      &graph,
-      graph.sizes_of(fp_output),
-      vkapi::kInt8x4,
-      utils::kBuffer,
-      utils::kPackedInt8_4W4C);
-
-  add_quantize_and_pack_4w4c_node(
-      graph, fp_input_a, input_a_scale, input_a_zp, packed_int8_input_a);
-
-  add_quantize_and_pack_4w4c_node(
-      graph, fp_input_b, input_b_scale, input_b_zp, packed_int8_input_b);
-
-  std::vector<ValueRef> add_args = {
-      packed_int8_input_a,
-      packed_int8_input_b,
-      input_a_scale,
-      input_a_zp,
-      input_b_scale,
-      input_b_zp,
-      output_scale,
-      output_zp,
-      alpha,
-      packed_int8_output};
-
-  add_q8ta_q8ta_q8to(graph, add_args);
-
-  add_unpack_4w4c_and_dequantize_node(
-      graph, packed_int8_output, output_scale, output_zp, fp_output);
-}
-
 REGISTER_OPERATORS {
-  VK_REGISTER_OP(et_vk.add_q8ta_q8ta_q8to.default, add_q8ta_q8ta_q8to);
-  VK_REGISTER_OP(et_vk.add_q8ta_q8ta_q8to.test, add_q8ta_q8ta_q8to_test);
+  VK_REGISTER_OP(et_vk.q8ta_add.default, q8ta_add);
 }
 
 } // namespace vkcompute
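On the host side, the scales and zero points travel to the shader as push constants. Below is a byte-layout sketch of that blob, assuming the 4-byte scalar members are packed back-to-back in declaration order (as they are for 4-byte members under std430-style rules); note the C++ list also appends `alpha`, which the GLSL block above does not declare.

```python
import struct

def pack_push_constants(a_scale, a_zp, b_scale, b_zp, out_inv_scale, out_zp, alpha):
    # One 4-byte scalar per field, in the order the node pushes them.
    return struct.pack(
        "<fififif", a_scale, a_zp, b_scale, b_zp, out_inv_scale, out_zp, alpha)

blob = pack_push_constants(0.007843, 3, 0.009412, -2, 1.0 / 0.015686, 1, 1.0)
assert len(blob) == 28  # six fields in the GLSL block + the appended alpha
```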
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.h b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.h
new file mode 100644
index 00000000000..512849762cb
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+
+namespace vkcompute {
+
+//
+// Binary operations for int8x4 tensors
+//
+
+void add_q8ta_binary_node(
+    ComputeGraph& graph,
+    const ValueRef packed_int8_input_a,
+    const ValueRef packed_int8_input_b,
+    const ValueRef input_a_scale,
+    const ValueRef input_a_zp,
+    const ValueRef input_b_scale,
+    const ValueRef input_b_zp,
+    const ValueRef output_scale,
+    const ValueRef output_zp,
+    const ValueRef alpha,
+    const ValueRef packed_int8_output,
+    const std::string& op_name);
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/CMakeLists.txt b/backends/vulkan/test/custom_ops/CMakeLists.txt
index 0121c84bb5b..0777ad76a8d 100644
--- a/backends/vulkan/test/custom_ops/CMakeLists.txt
+++ b/backends/vulkan/test/custom_ops/CMakeLists.txt
@@ -99,8 +99,8 @@ if(TARGET vulkan_backend)
   add_operator_prototype(choose_qparams_per_row)
   add_operator_prototype(test_q8ta_qdq)
   add_operator_prototype(test_q8ta_clone)
+  add_operator_prototype(test_q8ta_binary)
   add_operator_prototype(test_q8ta_conv2d)
   add_operator_prototype(test_q8ta_conv2d_pw)
   add_operator_prototype(test_q8ta_conv2d_dw)
-  add_operator_prototype(q8ta_q8ta_q8to_add)
 endif()
diff --git a/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp b/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
new file mode 100644
index 00000000000..53f8859b581
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include
+
+#include
+#include
+
+namespace vkcompute {
+
+void q8ta_add_test(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  int32_t idx = 0;
+  const ValueRef fp_input_a = args.at(idx++);
+  const ValueRef fp_input_b = args.at(idx++);
+  const ValueRef input_a_scale = args.at(idx++);
+  const ValueRef input_a_zp = args.at(idx++);
+  const ValueRef input_b_scale = args.at(idx++);
+  const ValueRef input_b_zp = args.at(idx++);
+  const ValueRef output_scale = args.at(idx++);
+  const ValueRef output_zp = args.at(idx++);
+  const ValueRef alpha = args.at(idx++);
+  const ValueRef quant_layout_int = args.at(idx++);
+  const ValueRef fp_output = args.at(idx++);
+
+  // Extract the layout parameter and cast to GPUMemoryLayout
+  int32_t layout_value = graph.extract_scalar<int32_t>(quant_layout_int);
+  utils::GPUMemoryLayout quant_layout =
+      static_cast<utils::GPUMemoryLayout>(layout_value);
+
+  // Create temporary tensors for quantized data with the specified layout
+  TmpTensor packed_int8_input_a(
+      &graph,
+      graph.sizes_of(fp_input_a),
+      vkapi::kInt8x4,
+      utils::kBuffer,
+      quant_layout);
+
+  TmpTensor packed_int8_input_b(
+      &graph,
+      graph.sizes_of(fp_input_b),
+      vkapi::kInt8x4,
+      utils::kBuffer,
+      quant_layout);
+
+  TmpTensor packed_int8_output(
+      &graph,
+      graph.sizes_of(fp_output),
+      vkapi::kInt8x4,
+      utils::kBuffer,
+      quant_layout);
+
+  // Quantize: FP -> int8x4 with specified layout
+  add_q8ta_quantize_node(
+      graph, fp_input_a, input_a_scale, input_a_zp, packed_int8_input_a);
+
+  add_q8ta_quantize_node(
+      graph, fp_input_b, input_b_scale, input_b_zp, packed_int8_input_b);
+
+  // Binary add: int8x4 -> int8x4 (same layout for all tensors)
+  add_q8ta_binary_node(
+      graph,
+      packed_int8_input_a,
+      packed_int8_input_b,
+      input_a_scale,
+      input_a_zp,
+      input_b_scale,
+      input_b_zp,
+      output_scale,
+      output_zp,
+      alpha,
+      packed_int8_output,
+      "add");
+
+  // Dequantize: int8x4 -> FP
+  add_q8ta_dequantize_node(
+      graph, packed_int8_output, output_scale, output_zp, fp_output);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(et_vk.q8ta_add.test, q8ta_add_test);
+}
+
+} // namespace vkcompute
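The `.test` wrapper follows a quantize → kernel → dequantize sandwich so the int8x4 kernel can be driven and checked with ordinary float tensors, with the quantized layout swept as a plain int32 argument. Here is a condensed Python analogue of the same data flow (the helper names are illustrative, not the ExecuTorch API):

```python
import torch

def quantize(t, scale, zp):
    return torch.clamp(torch.round(t / scale) + zp, -128, 127).to(torch.int8)

def dequantize(q, scale, zp):
    return (q.to(torch.float32) - zp) * scale

def q8ta_add_test(a, b, a_s, a_zp, b_s, b_zp, o_s, o_zp, alpha=1.0):
    qa, qb = quantize(a, a_s, a_zp), quantize(b, b_s, b_zp)   # fp -> packed int8
    fp_sum = dequantize(qa, a_s, a_zp) + alpha * dequantize(qb, b_s, b_zp)
    qo = quantize(fp_sum, o_s, o_zp)                          # the binary kernel
    return dequantize(qo, o_s, o_zp)                          # packed int8 -> fp

# The GPU result should be layout-independent: every quant_layout in the sweep
# (4W, 4C, 4W4C, 4H4W, 4C1W) must reproduce this same reference output.
```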
diff --git a/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp b/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp
deleted file mode 100644
index eb8e6908060..00000000000
--- a/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp
+++ /dev/null
@@ -1,258 +0,0 @@
-// Copyright (c) Meta Platforms, Inc. and affiliates.
-// All rights reserved.
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include
-#include
-#include
-#include
-#include "utils.h"
-
-using namespace executorch::vulkan::prototyping;
-
-// Utility function to create a test case for quantized add operation
-TestCase create_quantized_add_test_case(
-    const std::vector<int64_t>& sizes,
-    utils::StorageType storage_type,
-    vkapi::ScalarType input_dtype) {
-  TestCase test_case;
-
-  // Create a descriptive name for the test case
-  std::string size_str = "";
-  for (size_t i = 0; i < sizes.size(); ++i) {
-    size_str += std::to_string(sizes[i]);
-    if (i < sizes.size() - 1)
-      size_str += "x";
-  }
-
-  std::string storage_str =
-      (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer";
-  std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half";
-
-  std::string test_name =
-      "QuantizedAdd_" + size_str + "_" + storage_str + "_" + dtype_str;
-  test_case.set_name(test_name);
-
-  // Set the operator name for the test case
-  test_case.set_operator_name("et_vk.add_q8ta_q8ta_q8to.test");
-
-  utils::GPUMemoryLayout io_memory_layout = storage_type == utils::kBuffer
-      ? utils::kWidthPacked
-      : utils::kChannelsPacked;
-
-  // Input tensor A (float/half)
-  ValueSpec input_a(
-      sizes, input_dtype, storage_type, io_memory_layout, DataGenType::RANDOM);
-
-  // Input tensor B (float/half)
-  ValueSpec input_b(
-      sizes, input_dtype, storage_type, io_memory_layout, DataGenType::RANDOM);
-
-  // Quantization parameters for input A
-  float input_a_scale_val = 0.007843; // 2/255 approximately
-  ValueSpec input_a_scale(input_a_scale_val);
-
-  int32_t input_a_zero_point_val = 3;
-  ValueSpec input_a_zero_point(input_a_zero_point_val);
-
-  // Quantization parameters for input B
-  float input_b_scale_val = 0.009412; // 2.4/255 approximately
-  ValueSpec input_b_scale(input_b_scale_val);
-
-  int32_t input_b_zero_point_val = -2;
-  ValueSpec input_b_zero_point(input_b_zero_point_val);
-
-  // Output quantization parameters
-  float output_scale_val = 0.015686; // 4/255 approximately
-  ValueSpec output_scale(output_scale_val);
-
-  int32_t output_zero_point_val = 1;
-  ValueSpec output_zero_point(output_zero_point_val);
-
-  // Alpha parameter
-  float alpha_val = 1.0f;
-  ValueSpec alpha(alpha_val);
-
-  // Output tensor (float/half)
-  ValueSpec output(
-      sizes, input_dtype, storage_type, io_memory_layout, DataGenType::ZEROS);
-
-  // Add all specs to test case for q8ta_q8ta_q8to add operation
-  test_case.add_input_spec(input_a);
-  test_case.add_input_spec(input_b);
-  test_case.add_input_spec(input_a_scale);
-  test_case.add_input_spec(input_a_zero_point);
-  test_case.add_input_spec(input_b_scale);
-  test_case.add_input_spec(input_b_zero_point);
-  test_case.add_input_spec(output_scale);
-  test_case.add_input_spec(output_zero_point);
-  test_case.add_input_spec(alpha);
-
-  test_case.add_output_spec(output);
-
-  test_case.set_abs_tolerance(output_scale_val + 1e-4f);
-
-  return test_case;
-}
-
-// Generate test cases for quantized add operation
-std::vector<TestCase> generate_quantized_add_test_cases() {
-  std::vector<TestCase> test_cases;
-
-  // Define different input size configurations
-  std::vector<std::vector<int64_t>> size_configs = {
-      {3, 32, 32}, // Small square
-      {8, 64, 64}, // Medium square
-      {16, 16, 16}, // 3D cube
-      {8, 32, 16}, // 3D rectangular
-      {7, 7, 13}, // Irregular sizes
-  };
-
-  // Storage types to test
-  std::vector<utils::StorageType> storage_types = {
-      utils::kTexture3D, utils::kBuffer};
-
-  // Data types to test
-  std::vector<vkapi::ScalarType> data_types = {vkapi::kFloat};
-
-  // Generate test cases for each combination
-  for (const auto& sizes : size_configs) {
-    for (const auto& storage_type : storage_types) {
-      for (const auto& data_type : data_types) {
-        test_cases.push_back(
-            create_quantized_add_test_case(sizes, storage_type, data_type));
-      }
-    }
-  }
-
-  return test_cases;
-}
-
-// Reference implementation for quantized add operation
-void add_q8ta_q8ta_q8to_reference_impl(TestCase& test_case) {
-  // Extract input specifications
-  int32_t idx = 0;
-  const ValueSpec& input_a_spec = test_case.inputs()[idx++];
-  const ValueSpec& input_b_spec = test_case.inputs()[idx++];
-  const ValueSpec& input_a_scale_spec = test_case.inputs()[idx++];
-  const ValueSpec& input_a_zero_point_spec = test_case.inputs()[idx++];
-  const ValueSpec& input_b_scale_spec = test_case.inputs()[idx++];
-  const ValueSpec& input_b_zero_point_spec = test_case.inputs()[idx++];
-  const ValueSpec& output_scale_spec = test_case.inputs()[idx++];
-  const ValueSpec& output_zero_point_spec = test_case.inputs()[idx++];
-  const ValueSpec& alpha_spec = test_case.inputs()[idx++];
-
-  // Extract output specification (mutable reference)
-  ValueSpec& output_spec = test_case.outputs()[0];
-
-  // Get tensor dimensions
-  auto input_sizes = input_a_spec.get_tensor_sizes();
-  int64_t num_elements = input_a_spec.numel();
-
-  if (input_a_spec.dtype != vkapi::kFloat) {
-    throw std::invalid_argument("Unsupported dtype");
-  }
-
-  // Get raw data pointers
-  auto& input_a_data = input_a_spec.get_float_data();
-  auto& input_b_data = input_b_spec.get_float_data();
-
-  const float input_a_scale = input_a_scale_spec.get_float_value();
-  const int32_t input_a_zero_point = input_a_zero_point_spec.get_int_value();
-  const float input_b_scale = input_b_scale_spec.get_float_value();
-  const int32_t input_b_zero_point = input_b_zero_point_spec.get_int_value();
-  const float output_scale = output_scale_spec.get_float_value();
-  const int32_t output_zero_point = output_zero_point_spec.get_int_value();
-  const float alpha = alpha_spec.get_float_value();
-
-  auto& ref_data = output_spec.get_ref_float_data();
-  ref_data.resize(num_elements);
-
-  // Perform quantized add operation
-  for (int64_t i = 0; i < num_elements; ++i) {
-    // Quantize input A to int8
-    float quant_a_f =
-        std::round(input_a_data[i] / input_a_scale) + input_a_zero_point;
-    quant_a_f = std::min(std::max(quant_a_f, -128.0f), 127.0f);
-    int8_t quantized_a = static_cast<int8_t>(quant_a_f);
-
-    // Quantize input B to int8
-    float quant_b_f =
-        std::round(input_b_data[i] / input_b_scale) + input_b_zero_point;
-    quant_b_f = std::min(std::max(quant_b_f, -128.0f), 127.0f);
-    int8_t quantized_b = static_cast<int8_t>(quant_b_f);
-
-    // Dequantize both inputs to a common scale for addition
-    float dequant_a =
-        (static_cast<float>(quantized_a) - input_a_zero_point) * input_a_scale;
-    float dequant_b =
-        (static_cast<float>(quantized_b) - input_b_zero_point) * input_b_scale;
-
-    // Perform addition in float space with alpha
-    float float_result = dequant_a + alpha * dequant_b;
-
-    // Quantize the result to int8
-    float quant_output_f =
-        std::round(float_result / output_scale) + output_zero_point;
-    quant_output_f = std::min(std::max(quant_output_f, -128.0f), 127.0f);
-    int8_t quantized_output = static_cast<int8_t>(quant_output_f);
-
-    // Dequantize back to float for comparison
-    float dequant_output =
-        (static_cast<float>(quantized_output) - output_zero_point) *
-        output_scale;
-
-    ref_data[i] = dequant_output;
-  }
-}
-
-void reference_impl(TestCase& test_case) {
-  add_q8ta_q8ta_q8to_reference_impl(test_case);
-}
-
-// Custom FLOP calculator for quantized add operation
-int64_t quantized_add_flop_calculator(const TestCase& test_case) {
-  // Calculate total elements from the first input tensor
-  int64_t total_elements = 1;
-  if (!test_case.empty() && test_case.num_inputs() > 0 &&
-      test_case.inputs()[0].is_tensor()) {
-    const auto& sizes = test_case.inputs()[0].get_tensor_sizes();
-    for (int64_t size : sizes) {
-      total_elements *= size;
-    }
-  }
-
-  // Quantized add operation includes:
-  // - 2 quantizations (float to int8)
-  // - 2 dequantizations (int8 to float)
-  // - 1 addition
-  // For simplicity, we count this as 1 FLOP per element (the addition)
-  return total_elements;
-}
-
-int main(int argc, char* argv[]) {
-  set_debugging(false);
-  set_print_output(false);
-  set_print_latencies(false);
-  set_use_gpu_timestamps(true);
-
-  print_performance_header();
-  std::cout << "Quantized Add Operation (q8ta_q8ta_q8to) Prototyping Framework"
-            << std::endl;
-  print_separator();
-
-  ReferenceComputeFunc ref_fn = reference_impl;
-
-  // Execute test cases using the new framework with custom FLOP calculator
-  auto results = execute_test_cases(
-      generate_quantized_add_test_cases,
-      quantized_add_flop_calculator,
-      "QuantizedAddQ8taQ8taQ8to",
-      0,
-      1,
-      ref_fn);
-
-  return 0;
-}
diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl
index 63423ed410f..73b1e343bbe 100644
--- a/backends/vulkan/test/custom_ops/targets.bzl
+++ b/backends/vulkan/test/custom_ops/targets.bzl
@@ -93,7 +93,7 @@ def define_common_targets(is_fbcode = False):
     define_custom_op_test_binary("q4gsw_linear")
     define_custom_op_test_binary("test_q8ta_qdq")
     define_custom_op_test_binary("test_q8ta_clone")
+    define_custom_op_test_binary("test_q8ta_binary")
     define_custom_op_test_binary("test_q8ta_conv2d")
     define_custom_op_test_binary("test_q8ta_conv2d_pw")
     define_custom_op_test_binary("test_q8ta_conv2d_dw")
-    define_custom_op_test_binary("q8ta_q8ta_q8to_add")
diff --git a/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp b/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp
new file mode 100644
index 00000000000..1cb364c6f8d
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp
@@ -0,0 +1,373 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include
+#include
+#include
+#include
+#include
+#include "utils.h"
+
+using namespace executorch::vulkan::prototyping;
+
+static constexpr int64_t kRefDimSizeLimit = 512;
+
+// Configuration struct for q8ta binary testing
+struct Q8taBinaryConfig {
+  std::vector<int64_t> shape; // Tensor shape (can be any dimensionality)
+  std::string test_case_name = "placeholder";
+  std::string op_name = "q8ta_add";
+};
+
+// Utility function to create a test case from a Q8taBinaryConfig
+TestCase create_test_case_from_config(
+    const Q8taBinaryConfig& config,
+    utils::StorageType storage_type,
+    vkapi::ScalarType input_dtype,
+    utils::GPUMemoryLayout fp_memory_layout,
+    utils::GPUMemoryLayout quant_layout) {
+  TestCase test_case;
+
+  // Create a descriptive name for the test case
+  std::string shape_str = shape_string(config.shape);
+  std::string test_name = config.test_case_name + " I=" + shape_str + " " +
+      repr_str(storage_type, fp_memory_layout) + "->" +
+      repr_str(utils::kBuffer, quant_layout);
+  test_case.set_name(test_name);
+
+  // Set the operator name for the test case
+  std::string operator_name = "et_vk." + config.op_name + ".test";
+  test_case.set_operator_name(operator_name);
+
+  // Input tensor A (float/half)
+  ValueSpec input_a(
+      config.shape,
+      input_dtype,
+      storage_type,
+      fp_memory_layout,
+      DataGenType::RANDOM);
+
+  // Input tensor B (float/half)
+  ValueSpec input_b(
+      config.shape,
+      input_dtype,
+      storage_type,
+      fp_memory_layout,
+      DataGenType::RANDOM);
+
+  // Quantization parameters for input A
+  float input_a_scale_val = 0.007843; // 2/255 approximately
+  ValueSpec input_a_scale(input_a_scale_val);
+
+  int32_t input_a_zero_point_val = 3;
+  ValueSpec input_a_zero_point(input_a_zero_point_val);
+
+  // Quantization parameters for input B
+  float input_b_scale_val = 0.009412; // 2.4/255 approximately
+  ValueSpec input_b_scale(input_b_scale_val);
+
+  int32_t input_b_zero_point_val = -2;
+  ValueSpec input_b_zero_point(input_b_zero_point_val);
+
+  // Output quantization parameters
+  float output_scale_val = 0.015686; // 4/255 approximately
+  ValueSpec output_scale(output_scale_val);
+
+  int32_t output_zero_point_val = 1;
+  ValueSpec output_zero_point(output_zero_point_val);
+
+  // Alpha parameter
+  float alpha_val = 1.0f;
+  ValueSpec alpha(alpha_val);
+
+  // Quantized layout as integer
+  int32_t quant_layout_int = static_cast<int32_t>(quant_layout);
+  ValueSpec quant_layout_spec(quant_layout_int);
+
+  // Output tensor (float/half)
+  ValueSpec output(
+      config.shape,
+      input_dtype,
+      storage_type,
+      fp_memory_layout,
+      DataGenType::ZEROS);
+
+  // Add all specs to test case for q8ta add operation
+  test_case.add_input_spec(input_a);
+  test_case.add_input_spec(input_b);
+  test_case.add_input_spec(input_a_scale);
+  test_case.add_input_spec(input_a_zero_point);
+  test_case.add_input_spec(input_b_scale);
+  test_case.add_input_spec(input_b_zero_point);
+  test_case.add_input_spec(output_scale);
+  test_case.add_input_spec(output_zero_point);
+  test_case.add_input_spec(alpha);
+  test_case.add_input_spec(quant_layout_spec);
+
+  test_case.add_output_spec(output);
+
+  test_case.set_abs_tolerance(output_scale_val + 1e-4f);
+
+  // Use layout-only filter to focus on the binary operation
+  test_case.set_shader_filter({
+      "nchw_to",
+      "to_nchw",
+      "q8ta_quantize",
+      "q8ta_dequantize",
+  });
+
+  return test_case;
+}
+
+// Generate easy test cases for q8ta_add operation (for debugging)
+std::vector<TestCase> generate_q8ta_add_easy_cases() {
+  std::vector<TestCase> test_cases;
+
+  // Single simple configuration for debugging
+  Q8taBinaryConfig config = {
+      {1, 16, 16, 16}, // shape: [N, C, H, W]
+      "ACCU", // test_case_name
+  };
+
+  // FP memory layouts to test
+  std::vector<utils::GPUMemoryLayout> fp_layouts = {
+      utils::kWidthPacked,
+      utils::kChannelsPacked,
+  };
+
+  // Quantized memory layouts to test
+  std::vector<utils::GPUMemoryLayout> quant_layouts = {
+      utils::kPackedInt8_4W,
+      utils::kPackedInt8_4C,
+      utils::kPackedInt8_4W4C,
+      utils::kPackedInt8_4H4W,
+      utils::kPackedInt8_4C1W,
+  };
+
+  std::vector<utils::StorageType> storage_types = {utils::kBuffer};
+  std::vector<vkapi::ScalarType> float_types = {vkapi::kFloat};
+
+  // Generate test cases for each combination
+  for (const auto& fp_layout : fp_layouts) {
+    for (const auto& quant_layout : quant_layouts) {
+      for (const auto& storage_type : storage_types) {
+        for (const auto& input_dtype : float_types) {
+          test_cases.push_back(create_test_case_from_config(
+              config, storage_type, input_dtype, fp_layout, quant_layout));
+        }
+      }
+    }
+  }
+
+  return test_cases;
+}
+
+// Generate test cases for q8ta_add operation
+std::vector<TestCase> generate_q8ta_add_test_cases() {
+  std::vector<TestCase> test_cases;
+
+  // Shapes to test
+  std::vector<std::vector<int64_t>> shapes = {
+      // Small test cases for correctness
+      {1, 3, 16, 16},
+      {1, 8, 32, 32},
+      {1, 16, 24, 24},
+      {1, 32, 12, 12},
+      {1, 1, 64, 64},
+      {1, 3, 64, 64},
+      {1, 4, 16, 16},
+
+      // Different tensor sizes
+      {1, 8, 20, 20},
+      {1, 16, 14, 14},
+      {1, 8, 28, 28},
+
+      // Odd tensor sizes
+      {1, 3, 15, 15},
+      {1, 13, 31, 31},
+      {1, 17, 23, 23},
+
+      // Performance test cases (larger tensors)
+      {1, 64, 128, 128},
+      {1, 32, 64, 64},
+      {1, 128, 56, 56},
+      {1, 128, 128, 128},
+  };
+
+  // FP memory layouts to test
+  std::vector<utils::GPUMemoryLayout> fp_layouts = {
+      utils::kWidthPacked,
+      utils::kChannelsPacked,
+  };
+
+  // Quantized memory layouts to test
+  std::vector<utils::GPUMemoryLayout> quant_layouts = {
+      utils::kPackedInt8_4W,
+      utils::kPackedInt8_4C,
+      utils::kPackedInt8_4W4C,
+      utils::kPackedInt8_4H4W,
+      utils::kPackedInt8_4C1W,
+  };
+
+  // Test with buffer storage only
+  std::vector<utils::StorageType> storage_types = {utils::kBuffer};
+
+  // Generate all combinations
+  for (const auto& shape : shapes) {
+    // Generate test case name prefix from shape dimensions
+    std::string prefix = "ACCU";
+    for (const auto& dim : shape) {
+      if (dim > kRefDimSizeLimit) {
+        prefix = "PERF";
+        break;
+      }
+    }
+
+    for (const auto& fp_layout : fp_layouts) {
+      for (const auto& quant_layout : quant_layouts) {
+        for (const auto& storage_type : storage_types) {
+          Q8taBinaryConfig config;
+          config.shape = shape;
+          config.test_case_name = prefix;
+
+          test_cases.push_back(create_test_case_from_config(
+              config, storage_type, vkapi::kFloat, fp_layout, quant_layout));
+        }
+      }
+    }
+  }
+
+  return test_cases;
+}
+
+// Reference implementation for quantized add operation
+void q8ta_add_reference_impl(TestCase& test_case) {
+  int32_t idx = 0;
+  const ValueSpec& input_a_spec = test_case.inputs()[idx++];
+  const ValueSpec& input_b_spec = test_case.inputs()[idx++];
+  const ValueSpec& input_a_scale_spec = test_case.inputs()[idx++];
+  const ValueSpec& input_a_zero_point_spec = test_case.inputs()[idx++];
+  const ValueSpec& input_b_scale_spec = test_case.inputs()[idx++];
+  const ValueSpec& input_b_zero_point_spec = test_case.inputs()[idx++];
+  const ValueSpec& output_scale_spec = test_case.inputs()[idx++];
+  const ValueSpec& output_zero_point_spec = test_case.inputs()[idx++];
+  const ValueSpec& alpha_spec = test_case.inputs()[idx++];
+  const ValueSpec& quant_layout_spec = test_case.inputs()[idx++];
+  (void)quant_layout_spec; // Not used in reference implementation
+
+  // Extract output specification
+  ValueSpec& output_spec = test_case.outputs()[0];
+
+  // Get tensor dimensions
+  auto input_sizes = input_a_spec.get_tensor_sizes();
+
+  // Calculate total number of elements
+  int64_t num_elements = 1;
+  for (const auto& dim : input_sizes) {
+    num_elements *= dim;
+  }
+
+  // Skip for large tensors since computation time will be extremely slow
+  for (const auto& dim : input_sizes) {
+    if (dim > kRefDimSizeLimit) {
+      throw std::invalid_argument(
+          "One or more dimensions exceed the allowed limit for reference "
+          "implementation.");
+    }
+  }
+
+  if (input_a_spec.dtype != vkapi::kFloat) {
+    throw std::invalid_argument("Unsupported dtype");
+  }
+
+  // Get raw data pointers
+  auto& input_a_data = input_a_spec.get_float_data();
+  auto& input_b_data = input_b_spec.get_float_data();
+
+  const float input_a_scale = input_a_scale_spec.get_float_value();
+  const int32_t input_a_zero_point = input_a_zero_point_spec.get_int_value();
+  const float input_b_scale = input_b_scale_spec.get_float_value();
+  const int32_t input_b_zero_point = input_b_zero_point_spec.get_int_value();
+  const float output_scale = output_scale_spec.get_float_value();
+  const int32_t output_zero_point = output_zero_point_spec.get_int_value();
+  const float alpha = alpha_spec.get_float_value();
+
+  auto& ref_data = output_spec.get_ref_float_data();
+  ref_data.resize(num_elements);
+
+  // Perform quantized add operation
+  for (int64_t i = 0; i < num_elements; ++i) {
+    // Quantize input A to int8
+    float quant_a_f =
+        std::round(input_a_data[i] / input_a_scale) + input_a_zero_point;
+    quant_a_f = std::min(std::max(quant_a_f, -128.0f), 127.0f);
+    int8_t quantized_a = static_cast<int8_t>(quant_a_f);
+
+    // Quantize input B to int8
+    float quant_b_f =
+        std::round(input_b_data[i] / input_b_scale) + input_b_zero_point;
+    quant_b_f = std::min(std::max(quant_b_f, -128.0f), 127.0f);
+    int8_t quantized_b = static_cast<int8_t>(quant_b_f);
+
+    // Dequantize both inputs to a common scale for addition
+    float dequant_a =
+        (static_cast<float>(quantized_a) - input_a_zero_point) * input_a_scale;
+    float dequant_b =
+        (static_cast<float>(quantized_b) - input_b_zero_point) * input_b_scale;
+
+    // Perform addition in float space with alpha
+    float float_result = dequant_a + alpha * dequant_b;
+
+    // Quantize the result to int8
+    float quant_output_f =
+        std::round(float_result / output_scale) + output_zero_point;
+    quant_output_f = std::min(std::max(quant_output_f, -128.0f), 127.0f);
+    int8_t quantized_output = static_cast<int8_t>(quant_output_f);
+
+    // Dequantize back to float for comparison
+    float dequant_output =
+        (static_cast<float>(quantized_output) - output_zero_point) *
+        output_scale;
+
+    ref_data[i] = dequant_output;
+  }
+}
+
+int main(int argc, char* argv[]) {
+  set_debugging(false);
+  set_print_output(false);
+  set_print_latencies(false);
+  set_use_gpu_timestamps(true);
+
+  print_performance_header();
+  std::cout << "Q8TA Binary Add Operation Prototyping Framework" << std::endl;
+  print_separator();
+
+  ReferenceComputeFunc ref_fn = q8ta_add_reference_impl;
+
+  auto results = execute_test_cases(
+#ifdef DEBUG_MODE
+      generate_q8ta_add_easy_cases,
+#else
+      generate_q8ta_add_test_cases,
+#endif
+      "Q8taBinaryAdd",
+#ifdef DEBUG_MODE
+      0,
+      1,
+#else
+      3,
+      10,
+#endif
+      ref_fn);
+
+  return 0;
+}
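A note on `set_abs_tolerance(output_scale_val + 1e-4f)`: the GPU path and this reference both land on the same output quantization grid, so any disagreement is at most a one-step rounding difference near a rounding boundary, and adjacent grid points are exactly `output_scale` apart. A quick numeric check of that bound:

```python
output_scale, output_zp = 0.015686, 1
# Dequantized outputs live on the grid (q - zp) * scale for q in [-128, 127].
grid = [(q - output_zp) * output_scale for q in range(-128, 128)]
gaps = [b - a for a, b in zip(grid, grid[1:])]
# A one-step rounding disagreement stays within the configured tolerance.
assert max(gaps) <= output_scale + 1e-4
```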