diff --git a/backends/vulkan/custom_ops_lib.py b/backends/vulkan/custom_ops_lib.py
index ad65ae47a40..92bd9574dfc 100644
--- a/backends/vulkan/custom_ops_lib.py
+++ b/backends/vulkan/custom_ops_lib.py
@@ -564,11 +564,11 @@ def apply_rotary_emb_impl(
 apply_rotary_emb_op = getattr(getattr(torch.ops, namespace), name)
 
 ########################
-## add_q8ta_q8ta_q8to ##
+## q8ta_add ##
 ########################
 
 
-def add_q8ta_q8ta_q8to_impl(
+def q8ta_add_impl(
     input_a: torch.Tensor,
     input_b: torch.Tensor,
     input_a_scale: float,
@@ -598,12 +598,12 @@ def add_q8ta_q8ta_q8to_impl(
     return quantized_result
 
 
-name = "add_q8ta_q8ta_q8to"
+name = "q8ta_add"
 lib.define(
     f"{name}(Tensor input_a, Tensor input_b, float input_a_scale, int input_a_zero_point, float input_b_scale, int input_b_zero_point, float output_scale, int output_zero_point, float alpha) -> Tensor"
 )
-lib.impl(name, add_q8ta_q8ta_q8to_impl, "CompositeExplicitAutograd")
-add_q8ta_q8ta_q8to_op = getattr(getattr(torch.ops, namespace), name)
+lib.impl(name, q8ta_add_impl, "CompositeExplicitAutograd")
+q8ta_add_op = getattr(getattr(torch.ops, namespace), name)
 
 #############################
 ## select_as_symint ##
diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index 3a8f7624bbb..0ad9c980ed6 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -501,14 +501,14 @@ def register_torchao_choose_qparams_affine():
 
 
 # =============================================================================
-# QuantizedBinary.cpp
+# Q8taBinary.cpp
 # =============================================================================
 
 
-@update_features(exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default)
-def register_add_q8ta_q8ta_q8to():
+@update_features(exir_ops.edge.et_vk.q8ta_add.default)
+def register_q8ta_add():
     return OpFeatures(
-        inputs_storage=utils.PACKED_INT8_4W4C_BUFFER,
+        inputs_storage=utils.PACKED_INT8_BUFFER,
         supports_resize=False,
         supports_prepacking=True,
     )
diff --git a/backends/vulkan/patterns/quantized_binary.py b/backends/vulkan/patterns/quantized_binary.py
index da4985b931d..9a18f148736 100644
--- a/backends/vulkan/patterns/quantized_binary.py
+++ b/backends/vulkan/patterns/quantized_binary.py
@@ -133,7 +133,7 @@ def make_add_q8ta_q8ta_q8to_custom_op(
         exir_ops.edge.aten.add.Tensor,
         exir_ops.edge.aten.add_.Tensor,
     }:
-        op_target = exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default
+        op_target = exir_ops.edge.et_vk.q8ta_add.default
     else:
         # For future binary operations, add more mappings here
         raise NotImplementedError(
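For reference, the arithmetic behind the renamed op is unchanged: quantize both float inputs with their own affine parameters, add (with `alpha`) in the dequantized domain, then requantize with the output parameters. Below is a minimal sketch of that contract, assuming `backends.vulkan.custom_ops_lib` has been imported so `torch.ops.et_vk.q8ta_add` is registered; the reference helper is illustrative, not part of this patch.

```python
import torch

def q8ta_add_reference(a, b, a_scale, a_zp, b_scale, b_zp, out_scale, out_zp, alpha=1.0):
    # Quantize each input to int8 with its own affine parameters.
    qa = torch.clamp(torch.round(a / a_scale) + a_zp, -128, 127)
    qb = torch.clamp(torch.round(b / b_scale) + b_zp, -128, 127)
    # Add in the dequantized domain, scaled by alpha.
    fp = (qa - a_zp) * a_scale + alpha * (qb - b_zp) * b_scale
    # Requantize with the output parameters; the `return quantized_result` in
    # q8ta_add_impl suggests the op yields this requantized tensor.
    return torch.clamp(torch.round(fp / out_scale) + out_zp, -128, 127).to(torch.int8)

x, y = torch.randn(1, 8, 16, 16), torch.randn(1, 8, 16, 16)
expected = q8ta_add_reference(x, y, 0.007843, 3, 0.009412, -2, 0.015686, 1)
# With custom_ops_lib imported, the registered op should agree:
# actual = torch.ops.et_vk.q8ta_add(x, y, 0.007843, 3, 0.009412, -2, 0.015686, 1, 1.0)
```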
- */ - -#version 450 core - -${define_required_extensions("buffer", DTYPE)} - -#define PRECISION ${PRECISION} - -#define NAME ${VARIANT_NAME} - -#define VEC4_T ${texel_load_type(DTYPE, "buffer")} -#define T ${texel_load_component_type(DTYPE, "buffer")} - -$if IO_STORAGE == "buffer": - #define PACKED_INT8_OUTPUT_BUFFER - #define PACKED_INT8_INPUT_BUFFER - -#define op(X, Y) ${OPERATOR} - -layout(std430) buffer; - -#include "indexing.glslh" -#include "common.glslh" - -${layout_declare_tensor(B, "w", "t_packed_int8_out", "int", IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int8_in_a", "int", IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int8_in_b", "int", IO_STORAGE, is_scalar_array=False)} - -${layout_declare_ubo(B, "ivec4", "out_sizes")} - -layout(push_constant) uniform restrict Block { - float input_a_scale; - int input_a_zp; - float input_b_scale; - int input_b_zp; - float output_inv_scale; - int output_zp; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const int tid = int(gl_GlobalInvocationID.x); - - const int W4 = div_up_4(out_sizes.x); - const int H = out_sizes.y; - const int C4 = div_up_4(out_sizes.z); - const int N = out_sizes.w; - - if (tid >= W4 * H * C4 * N) { - return; - } - - const ivec4 in_block_1 = t_packed_int8_in_a[tid]; - const ivec4 in_block_2 = t_packed_int8_in_b[tid]; - - ivec4 out_block = ivec4(pack_into_int32(ivec4(output_zp))); - - for (int row = 0; row < 4; row++) { - vec4 in_texel_1 = unpack_and_dequantize( - in_block_1[row], input_a_scale, input_a_zp); - vec4 in_texel_2 = unpack_and_dequantize( - in_block_2[row], input_b_scale, input_b_zp); - - vec4 out_texel = op(in_texel_1, in_texel_2); - out_block[row] = quantize_and_pack(out_texel, output_inv_scale, output_zp); - } - - t_packed_int8_out[tid] = out_block; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.glsl b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.glsl new file mode 100644 index 00000000000..60f437fbdce --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.glsl @@ -0,0 +1,91 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +${define_active_storage_type("buffer")} + +#define op(X, Y) ${OPERATOR} + +layout(std430) buffer; + +#include "indexing.glslh" +#include "common.glslh" +#include "block_indexing.glslh" +#include "block_int8x4_load.glslh" +#include "block_int8x4_store.glslh" + +// Output buffer: packed int8x4 values +${layout_declare_tensor(B, "w", "t_out", "int", "buffer")} +// Input buffers: packed int8x4 values +${layout_declare_tensor(B, "r", "t_in_a", "int", "buffer")} +${layout_declare_tensor(B, "r", "t_in_b", "int", "buffer")} + +// Metadata for output and input tensors +${layout_declare_ubo(B, "BufferMetadata", "out_meta")} +${layout_declare_ubo(B, "BufferMetadata", "in_a_meta")} +${layout_declare_ubo(B, "BufferMetadata", "in_b_meta")} + +layout(push_constant) uniform restrict Block { + float input_a_scale; + int input_a_zp; + float input_b_scale; + int input_b_zp; + float output_inv_scale; + int output_zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")} +${layout_declare_spec_const(C, "int", "in_layout", "CONTIG_LAYOUT_INT")} +${layout_declare_spec_const(C, "int", "block_config", "0")} + +// Generate loading functions for input buffers +define_load_int8x4_buffer_fns(t_in_a) +define_load_int8x4_buffer_fns(t_in_b) + +// Generate storing functions for output buffer +define_store_int8x4_buffer_fns(t_out) + +void main() { + // Buffer storage: use linear dispatch + const uint contig_block_idx = gl_GlobalInvocationID.x; + TensorIndex4D tidx = contiguous_block_idx_to_tensor4d_idx_with_block_config( + out_meta, contig_block_idx, block_config); + + if (out_of_bounds(tidx, out_meta)) { + return; + } + + const int block_outer_dim = get_block_outer_dim(block_config); + + // Load int8x4 blocks from both inputs + ivec4 in_block_a = load_int8x4_block_from_t_in_a( + in_a_meta, tidx, in_layout, block_outer_dim); + ivec4 in_block_b = load_int8x4_block_from_t_in_b( + in_b_meta, tidx, in_layout, block_outer_dim); + + ivec4 out_block; + + for (int row = 0; row < 4; row++) { + vec4 in_texel_a = unpack_and_dequantize( + in_block_a[row], input_a_scale, input_a_zp); + vec4 in_texel_b = unpack_and_dequantize( + in_block_b[row], input_b_scale, input_b_zp); + + vec4 out_texel = op(in_texel_a, in_texel_b); + out_block[row] = quantize_and_pack(out_texel, output_inv_scale, output_zp); + } + + // Store to output buffer + store_int8x4_block_to_t_out( + out_meta, tidx, out_layout, block_outer_dim, out_block); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.yaml similarity index 61% rename from backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml rename to backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.yaml index e19ed8839eb..2060f7e42ba 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.yaml @@ -4,16 +4,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.yaml
similarity index 61%
rename from backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml
rename to backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.yaml
index e19ed8839eb..2060f7e42ba 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_binary.yaml
@@ -4,16 +4,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-binary_q8ta_q8ta_q8to:
+q8ta_binary:
   parameter_names_with_default_values:
     OPERATOR: X + Y
-    NDIM: 3
-    DTYPE: float
-    PACKING: C_packed
-    IO_STORAGE: buffer
-  generate_variant_forall:
-    IO_STORAGE:
-      - VALUE: buffer
   shader_variants:
-    - NAME: add_q8ta_q8ta_q8to
+    - NAME: q8ta_add_buffer
       OPERATOR: X + Y
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedBinary.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
similarity index 53%
rename from backends/vulkan/runtime/graph/ops/impl/QuantizedBinary.cpp
rename to backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
index 99b5880c2eb..c7030b64ee4 100644
--- a/backends/vulkan/runtime/graph/ops/impl/QuantizedBinary.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
@@ -9,38 +9,15 @@
 #include
 #include
-#include
-#include
 #include
 
 namespace vkcompute {
 
-//
-// Shader dispatch utilities
-//
-
-utils::uvec3 pick_q8ta_q8ta_q8to_binary_global_wg_size(
-    ComputeGraph* graph,
-    const vkapi::ShaderInfo& shader,
-    const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& resize_args) {
-  const ValueRef packed_int8_output = args.at(0).refs.at(0);
-
-  const uint32_t W = graph->size_at<uint32_t>(-1, packed_int8_output);
-  const uint32_t H = graph->size_at<uint32_t>(-2, packed_int8_output);
-  const uint32_t C = graph->size_at<uint32_t>(-3, packed_int8_output);
-
-  const uint32_t W4 = utils::div_up_4(W);
-  const uint32_t C4 = utils::div_up_4(C);
-
-  return {W4 * H * C4, 1, 1};
-}
-
 //
 // Dispatch nodes
 //
 
-void add_q8ta_q8ta_q8to_binary_node(
+void add_q8ta_binary_node(
     ComputeGraph& graph,
     const ValueRef packed_int8_input_a,
    const ValueRef packed_int8_input_b,
@@ -68,11 +45,15 @@
     alpha_val = graph.extract_scalar<float>(alpha);
   }
 
-  std::string kernel_name = op_name + "_q8ta_q8ta_q8to";
+  std::string kernel_name = "q8ta_" + op_name;
   add_storage_type_suffix(
       kernel_name, graph.storage_type_of(packed_int8_output));
 
-  vkapi::ParamsBindList param_buffers = {graph.sizes_ubo(packed_int8_output)};
+  // Pass metadata for output and input tensors
+  vkapi::ParamsBindList param_buffers;
+  param_buffers.append(graph.buffer_meta_ubo(packed_int8_output));
+  param_buffers.append(graph.buffer_meta_ubo(packed_int8_input_a));
+  param_buffers.append(graph.buffer_meta_ubo(packed_int8_input_b));
 
   std::vector<PushConstantDataInfo> push_constants = {
       PushConstantDataInfo(&input_a_scale_val, sizeof(input_a_scale_val)),
@@ -84,11 +65,19 @@
       PushConstantDataInfo(&alpha_val, sizeof(alpha_val)),
   };
 
+  // Create block config for output tensor: inner_dim = output's packed_dim
+  const BlockConfig block_config =
+      create_block_config_for_tensor(graph, packed_int8_output);
+
+  // Cast block config to ValueRef for pick_linear_global_wg_with_block_config
+  const ValueRef block_config_ref =
+      static_cast<ValueRef>(block_config.as_packed_int());
+
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      pick_q8ta_q8ta_q8to_binary_global_wg_size,
-      default_pick_local_wg_size,
+      pick_linear_global_wg_with_block_config,
+      pick_square_local_wg_with_block_config,
       // Inputs and Outputs
       {{packed_int8_output, vkapi::kWrite},
        {{packed_int8_input_a, packed_int8_input_b}, vkapi::kRead}},
@@ -97,9 +86,11 @@
       // Push Constants
       push_constants,
       // Specialization Constants
-      {},
+      {graph.hashed_layout_of(packed_int8_output),
+       graph.hashed_layout_of(packed_int8_input_a),
+       block_config.as_packed_int()},
       // Resize args
-      {},
+      {block_config_ref},
       // Resizing Logic
       nullptr));
 }
@@ -108,9 +99,7 @@ void add_q8ta_q8ta_q8to_binary_node(
 //
 // High level operator impl
 //
 
-void add_q8ta_q8ta_q8to(
-    ComputeGraph& graph,
-    const std::vector<ValueRef>& args) {
+void q8ta_add(ComputeGraph& graph, const std::vector<ValueRef>& args) {
   int32_t idx = 0;
   const ValueRef packed_int8_input_a = args.at(idx++);
   const ValueRef packed_int8_input_b = args.at(idx++);
@@ -123,7 +112,7 @@ void add_q8ta_q8ta_q8to(
   const ValueRef alpha = args.at(idx++);
   const ValueRef packed_int8_output = args.at(idx++);
 
-  add_q8ta_q8ta_q8to_binary_node(
+  add_q8ta_binary_node(
       graph,
       packed_int8_input_a,
       packed_int8_input_b,
@@ -138,73 +127,8 @@ void add_q8ta_q8ta_q8to(
       "add");
 }
 
-//
-// Test operators
-//
-
-void add_q8ta_q8ta_q8to_test(
-    ComputeGraph& graph,
-    const std::vector<ValueRef>& args) {
-  int32_t idx = 0;
-  const ValueRef fp_input_a = args.at(idx++);
-  const ValueRef fp_input_b = args.at(idx++);
-  const ValueRef input_a_scale = args.at(idx++);
-  const ValueRef input_a_zp = args.at(idx++);
-  const ValueRef input_b_scale = args.at(idx++);
-  const ValueRef input_b_zp = args.at(idx++);
-  const ValueRef output_scale = args.at(idx++);
-  const ValueRef output_zp = args.at(idx++);
-  const ValueRef alpha = args.at(idx++);
-  const ValueRef fp_output = args.at(idx++);
-
-  TmpTensor packed_int8_input_a(
-      &graph,
-      graph.sizes_of(fp_input_a),
-      vkapi::kInt8x4,
-      utils::kBuffer,
-      utils::kPackedInt8_4W4C);
-
-  TmpTensor packed_int8_input_b(
-      &graph,
-      graph.sizes_of(fp_input_b),
-      vkapi::kInt8x4,
-      utils::kBuffer,
-      utils::kPackedInt8_4W4C);
-
-  TmpTensor packed_int8_output(
-      &graph,
-      graph.sizes_of(fp_output),
-      vkapi::kInt8x4,
-      utils::kBuffer,
-      utils::kPackedInt8_4W4C);
-
-  add_quantize_and_pack_4w4c_node(
-      graph, fp_input_a, input_a_scale, input_a_zp, packed_int8_input_a);
-
-  add_quantize_and_pack_4w4c_node(
-      graph, fp_input_b, input_b_scale, input_b_zp, packed_int8_input_b);
-
-  std::vector<ValueRef> add_args = {
-      packed_int8_input_a,
-      packed_int8_input_b,
-      input_a_scale,
-      input_a_zp,
-      input_b_scale,
-      input_b_zp,
-      output_scale,
-      output_zp,
-      alpha,
-      packed_int8_output};
-
-  add_q8ta_q8ta_q8to(graph, add_args);
-
-  add_unpack_4w4c_and_dequantize_node(
-      graph, packed_int8_output, output_scale, output_zp, fp_output);
-}
-
 REGISTER_OPERATORS {
-  VK_REGISTER_OP(et_vk.add_q8ta_q8ta_q8to.default, add_q8ta_q8ta_q8to);
-  VK_REGISTER_OP(et_vk.add_q8ta_q8ta_q8to.test, add_q8ta_q8ta_q8to_test);
+  VK_REGISTER_OP(et_vk.q8ta_add.default, q8ta_add);
 }
 
 } // namespace vkcompute
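On the host side, the scales and zero points travel to the shader as push constants. Below is a byte-layout sketch of that blob, assuming the 4-byte scalar members are packed back-to-back in declaration order (as they are for 4-byte members under std430-style rules); note the C++ list also appends `alpha`, which the GLSL block above does not declare.

```python
import struct

def pack_push_constants(a_scale, a_zp, b_scale, b_zp, out_inv_scale, out_zp, alpha):
    # One 4-byte scalar per field, in the order the node pushes them.
    return struct.pack(
        "<fififif", a_scale, a_zp, b_scale, b_zp, out_inv_scale, out_zp, alpha)

blob = pack_push_constants(0.007843, 3, 0.009412, -2, 1.0 / 0.015686, 1, 1.0)
assert len(blob) == 28  # six fields in the GLSL block + the appended alpha
```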
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.h b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.h
new file mode 100644
index 00000000000..512849762cb
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+
+namespace vkcompute {
+
+//
+// Binary operations for int8x4 tensors
+//
+
+void add_q8ta_binary_node(
+    ComputeGraph& graph,
+    const ValueRef packed_int8_input_a,
+    const ValueRef packed_int8_input_b,
+    const ValueRef input_a_scale,
+    const ValueRef input_a_zp,
+    const ValueRef input_b_scale,
+    const ValueRef input_b_zp,
+    const ValueRef output_scale,
+    const ValueRef output_zp,
+    const ValueRef alpha,
+    const ValueRef packed_int8_output,
+    const std::string& op_name);
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/CMakeLists.txt b/backends/vulkan/test/custom_ops/CMakeLists.txt
index 0121c84bb5b..0777ad76a8d 100644
--- a/backends/vulkan/test/custom_ops/CMakeLists.txt
+++ b/backends/vulkan/test/custom_ops/CMakeLists.txt
@@ -99,8 +99,8 @@ if(TARGET vulkan_backend)
   add_operator_prototype(choose_qparams_per_row)
   add_operator_prototype(test_q8ta_qdq)
   add_operator_prototype(test_q8ta_clone)
+  add_operator_prototype(test_q8ta_binary)
   add_operator_prototype(test_q8ta_conv2d)
   add_operator_prototype(test_q8ta_conv2d_pw)
   add_operator_prototype(test_q8ta_conv2d_dw)
-  add_operator_prototype(q8ta_q8ta_q8to_add)
 endif()
diff --git a/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp b/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
new file mode 100644
index 00000000000..53f8859b581
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include
+
+#include
+#include
+
+namespace vkcompute {
+
+void q8ta_add_test(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  int32_t idx = 0;
+  const ValueRef fp_input_a = args.at(idx++);
+  const ValueRef fp_input_b = args.at(idx++);
+  const ValueRef input_a_scale = args.at(idx++);
+  const ValueRef input_a_zp = args.at(idx++);
+  const ValueRef input_b_scale = args.at(idx++);
+  const ValueRef input_b_zp = args.at(idx++);
+  const ValueRef output_scale = args.at(idx++);
+  const ValueRef output_zp = args.at(idx++);
+  const ValueRef alpha = args.at(idx++);
+  const ValueRef quant_layout_int = args.at(idx++);
+  const ValueRef fp_output = args.at(idx++);
+
+  // Extract the layout parameter and cast to GPUMemoryLayout
+  int32_t layout_value = graph.extract_scalar<int32_t>(quant_layout_int);
+  utils::GPUMemoryLayout quant_layout =
+      static_cast<utils::GPUMemoryLayout>(layout_value);
+
+  // Create temporary tensors for quantized data with the specified layout
+  TmpTensor packed_int8_input_a(
+      &graph,
+      graph.sizes_of(fp_input_a),
+      vkapi::kInt8x4,
+      utils::kBuffer,
+      quant_layout);
+
+  TmpTensor packed_int8_input_b(
+      &graph,
+      graph.sizes_of(fp_input_b),
+      vkapi::kInt8x4,
+      utils::kBuffer,
+      quant_layout);
+
+  TmpTensor packed_int8_output(
+      &graph,
+      graph.sizes_of(fp_output),
+      vkapi::kInt8x4,
+      utils::kBuffer,
+      quant_layout);
+
+  // Quantize: FP -> int8x4 with specified layout
+  add_q8ta_quantize_node(
+      graph, fp_input_a, input_a_scale, input_a_zp, packed_int8_input_a);
+
+  add_q8ta_quantize_node(
+      graph, fp_input_b, input_b_scale, input_b_zp, packed_int8_input_b);
+
+  // Binary add: int8x4 -> int8x4 (same layout for all tensors)
+  add_q8ta_binary_node(
+      graph,
+      packed_int8_input_a,
+      packed_int8_input_b,
+      input_a_scale,
+      input_a_zp,
+      input_b_scale,
+      input_b_zp,
+      output_scale,
+      output_zp,
+      alpha,
+      packed_int8_output,
+      "add");
+
+  // Dequantize: int8x4 -> FP
+  add_q8ta_dequantize_node(
+      graph, packed_int8_output, output_scale, output_zp, fp_output);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(et_vk.q8ta_add.test, q8ta_add_test);
+}
+
+} // namespace vkcompute
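The `.test` wrapper follows a quantize → kernel → dequantize sandwich so the int8x4 kernel can be driven and checked with ordinary float tensors, with the quantized layout swept as a plain int32 argument. Here is a condensed Python analogue of the same data flow (the helper names are illustrative, not the ExecuTorch API):

```python
import torch

def quantize(t, scale, zp):
    return torch.clamp(torch.round(t / scale) + zp, -128, 127).to(torch.int8)

def dequantize(q, scale, zp):
    return (q.to(torch.float32) - zp) * scale

def q8ta_add_test(a, b, a_s, a_zp, b_s, b_zp, o_s, o_zp, alpha=1.0):
    qa, qb = quantize(a, a_s, a_zp), quantize(b, b_s, b_zp)   # fp -> packed int8
    fp_sum = dequantize(qa, a_s, a_zp) + alpha * dequantize(qb, b_s, b_zp)
    qo = quantize(fp_sum, o_s, o_zp)                          # the binary kernel
    return dequantize(qo, o_s, o_zp)                          # packed int8 -> fp

# The GPU result should be layout-independent: every quant_layout in the sweep
# (4W, 4C, 4W4C, 4H4W, 4C1W) must reproduce this same reference output.
```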
diff --git a/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp b/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp
deleted file mode 100644
index eb8e6908060..00000000000
--- a/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp
+++ /dev/null
@@ -1,258 +0,0 @@
-// Copyright (c) Meta Platforms, Inc. and affiliates.
-// All rights reserved.
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include
-#include
-#include
-#include
-#include "utils.h"
-
-using namespace executorch::vulkan::prototyping;
-
-// Utility function to create a test case for quantized add operation
-TestCase create_quantized_add_test_case(
-    const std::vector<int64_t>& sizes,
-    utils::StorageType storage_type,
-    vkapi::ScalarType input_dtype) {
-  TestCase test_case;
-
-  // Create a descriptive name for the test case
-  std::string size_str = "";
-  for (size_t i = 0; i < sizes.size(); ++i) {
-    size_str += std::to_string(sizes[i]);
-    if (i < sizes.size() - 1)
-      size_str += "x";
-  }
-
-  std::string storage_str =
-      (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer";
-  std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half";
-
-  std::string test_name =
-      "QuantizedAdd_" + size_str + "_" + storage_str + "_" + dtype_str;
-  test_case.set_name(test_name);
-
-  // Set the operator name for the test case
-  test_case.set_operator_name("et_vk.add_q8ta_q8ta_q8to.test");
-
-  utils::GPUMemoryLayout io_memory_layout = storage_type == utils::kBuffer
-      ? utils::kWidthPacked
-      : utils::kChannelsPacked;
-
-  // Input tensor A (float/half)
-  ValueSpec input_a(
-      sizes, input_dtype, storage_type, io_memory_layout, DataGenType::RANDOM);
-
-  // Input tensor B (float/half)
-  ValueSpec input_b(
-      sizes, input_dtype, storage_type, io_memory_layout, DataGenType::RANDOM);
-
-  // Quantization parameters for input A
-  float input_a_scale_val = 0.007843; // 2/255 approximately
-  ValueSpec input_a_scale(input_a_scale_val);
-
-  int32_t input_a_zero_point_val = 3;
-  ValueSpec input_a_zero_point(input_a_zero_point_val);
-
-  // Quantization parameters for input B
-  float input_b_scale_val = 0.009412; // 2.4/255 approximately
-  ValueSpec input_b_scale(input_b_scale_val);
-
-  int32_t input_b_zero_point_val = -2;
-  ValueSpec input_b_zero_point(input_b_zero_point_val);
-
-  // Output quantization parameters
-  float output_scale_val = 0.015686; // 4/255 approximately
-  ValueSpec output_scale(output_scale_val);
-
-  int32_t output_zero_point_val = 1;
-  ValueSpec output_zero_point(output_zero_point_val);
-
-  // Alpha parameter
-  float alpha_val = 1.0f;
-  ValueSpec alpha(alpha_val);
-
-  // Output tensor (float/half)
-  ValueSpec output(
-      sizes, input_dtype, storage_type, io_memory_layout, DataGenType::ZEROS);
-
-  // Add all specs to test case for q8ta_q8ta_q8to add operation
-  test_case.add_input_spec(input_a);
-  test_case.add_input_spec(input_b);
-  test_case.add_input_spec(input_a_scale);
-  test_case.add_input_spec(input_a_zero_point);
-  test_case.add_input_spec(input_b_scale);
-  test_case.add_input_spec(input_b_zero_point);
-  test_case.add_input_spec(output_scale);
-  test_case.add_input_spec(output_zero_point);
-  test_case.add_input_spec(alpha);
-
-  test_case.add_output_spec(output);
-
-  test_case.set_abs_tolerance(output_scale_val + 1e-4f);
-
-  return test_case;
-}
-
-// Generate test cases for quantized add operation
-std::vector<TestCase> generate_quantized_add_test_cases() {
-  std::vector<TestCase> test_cases;
-
-  // Define different input size configurations
-  std::vector<std::vector<int64_t>> size_configs = {
-      {3, 32, 32}, // Small square
-      {8, 64, 64}, // Medium square
-      {16, 16, 16}, // 3D cube
-      {8, 32, 16}, // 3D rectangular
-      {7, 7, 13}, // Irregular sizes
-  };
-
-  // Storage types to test
-  std::vector<utils::StorageType> storage_types = {
-      utils::kTexture3D, utils::kBuffer};
-
-  // Data types to test
-  std::vector<vkapi::ScalarType> data_types = {vkapi::kFloat};
-
-  // Generate test cases for each combination
-  for (const auto& sizes : size_configs) {
-    for (const auto& storage_type : storage_types) {
-      for (const auto& data_type : data_types) {
-        test_cases.push_back(
-            create_quantized_add_test_case(sizes, storage_type, data_type));
-      }
-    }
-  }
-
-  return test_cases;
-}
-
-// Reference implementation for quantized add operation
-void add_q8ta_q8ta_q8to_reference_impl(TestCase& test_case) {
-  // Extract input specifications
-  int32_t idx = 0;
-  const ValueSpec& input_a_spec = test_case.inputs()[idx++];
-  const ValueSpec& input_b_spec = test_case.inputs()[idx++];
-  const ValueSpec& input_a_scale_spec = test_case.inputs()[idx++];
-  const ValueSpec& input_a_zero_point_spec = test_case.inputs()[idx++];
-  const ValueSpec& input_b_scale_spec = test_case.inputs()[idx++];
-  const ValueSpec& input_b_zero_point_spec = test_case.inputs()[idx++];
-  const ValueSpec& output_scale_spec = test_case.inputs()[idx++];
-  const ValueSpec& output_zero_point_spec = test_case.inputs()[idx++];
-  const ValueSpec& alpha_spec = test_case.inputs()[idx++];
-
-  // Extract output specification (mutable reference)
-  ValueSpec& output_spec = test_case.outputs()[0];
-
-  // Get tensor dimensions
-  auto input_sizes = input_a_spec.get_tensor_sizes();
-  int64_t num_elements = input_a_spec.numel();
-
-  if (input_a_spec.dtype != vkapi::kFloat) {
-    throw std::invalid_argument("Unsupported dtype");
-  }
-
-  // Get raw data pointers
-  auto& input_a_data = input_a_spec.get_float_data();
-  auto& input_b_data = input_b_spec.get_float_data();
-
-  const float input_a_scale = input_a_scale_spec.get_float_value();
-  const int32_t input_a_zero_point = input_a_zero_point_spec.get_int_value();
-  const float input_b_scale = input_b_scale_spec.get_float_value();
-  const int32_t input_b_zero_point = input_b_zero_point_spec.get_int_value();
-  const float output_scale = output_scale_spec.get_float_value();
-  const int32_t output_zero_point = output_zero_point_spec.get_int_value();
-  const float alpha = alpha_spec.get_float_value();
-
-  auto& ref_data = output_spec.get_ref_float_data();
-  ref_data.resize(num_elements);
-
-  // Perform quantized add operation
-  for (int64_t i = 0; i < num_elements; ++i) {
-    // Quantize input A to int8
-    float quant_a_f =
-        std::round(input_a_data[i] / input_a_scale) + input_a_zero_point;
-    quant_a_f = std::min(std::max(quant_a_f, -128.0f), 127.0f);
-    int8_t quantized_a = static_cast<int8_t>(quant_a_f);
-
-    // Quantize input B to int8
-    float quant_b_f =
-        std::round(input_b_data[i] / input_b_scale) + input_b_zero_point;
-    quant_b_f = std::min(std::max(quant_b_f, -128.0f), 127.0f);
-    int8_t quantized_b = static_cast<int8_t>(quant_b_f);
-
-    // Dequantize both inputs to a common scale for addition
-    float dequant_a =
-        (static_cast<float>(quantized_a) - input_a_zero_point) * input_a_scale;
-    float dequant_b =
-        (static_cast<float>(quantized_b) - input_b_zero_point) * input_b_scale;
-
-    // Perform addition in float space with alpha
-    float float_result = dequant_a + alpha * dequant_b;
-
-    // Quantize the result to int8
-    float quant_output_f =
-        std::round(float_result / output_scale) + output_zero_point;
-    quant_output_f = std::min(std::max(quant_output_f, -128.0f), 127.0f);
-    int8_t quantized_output = static_cast<int8_t>(quant_output_f);
-
-    // Dequantize back to float for comparison
-    float dequant_output =
-        (static_cast<float>(quantized_output) - output_zero_point) *
-        output_scale;
-
-    ref_data[i] = dequant_output;
-  }
-}
-
-void reference_impl(TestCase& test_case) {
-  add_q8ta_q8ta_q8to_reference_impl(test_case);
-}
-
-// Custom FLOP calculator for quantized add operation
-int64_t quantized_add_flop_calculator(const TestCase& test_case) {
-  // Calculate total elements from the first input tensor
-  int64_t total_elements = 1;
-  if (!test_case.empty() && test_case.num_inputs() > 0 &&
-      test_case.inputs()[0].is_tensor()) {
-    const auto& sizes = test_case.inputs()[0].get_tensor_sizes();
-    for (int64_t size : sizes) {
-      total_elements *= size;
-    }
-  }
-
-  // Quantized add operation includes:
-  // - 2 quantizations (float to int8)
-  // - 2 dequantizations (int8 to float)
-  // - 1 addition
-  // For simplicity, we count this as 1 FLOP per element (the addition)
-  return total_elements;
-}
-
-int main(int argc, char* argv[]) {
-  set_debugging(false);
-  set_print_output(false);
-  set_print_latencies(false);
-  set_use_gpu_timestamps(true);
-
-  print_performance_header();
-  std::cout << "Quantized Add Operation (q8ta_q8ta_q8to) Prototyping Framework"
-            << std::endl;
-  print_separator();
-
-  ReferenceComputeFunc ref_fn = reference_impl;
-
-  // Execute test cases using the new framework with custom FLOP calculator
-  auto results = execute_test_cases(
-      generate_quantized_add_test_cases,
-      quantized_add_flop_calculator,
-      "QuantizedAddQ8taQ8taQ8to",
-      0,
-      1,
-      ref_fn);
-
-  return 0;
-}
diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl
index 63423ed410f..73b1e343bbe 100644
--- a/backends/vulkan/test/custom_ops/targets.bzl
+++ b/backends/vulkan/test/custom_ops/targets.bzl
@@ -93,7 +93,7 @@ def define_common_targets(is_fbcode = False):
     define_custom_op_test_binary("q4gsw_linear")
     define_custom_op_test_binary("test_q8ta_qdq")
     define_custom_op_test_binary("test_q8ta_clone")
+    define_custom_op_test_binary("test_q8ta_binary")
     define_custom_op_test_binary("test_q8ta_conv2d")
     define_custom_op_test_binary("test_q8ta_conv2d_pw")
     define_custom_op_test_binary("test_q8ta_conv2d_dw")
-    define_custom_op_test_binary("q8ta_q8ta_q8to_add")
diff --git a/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp b/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp
new file mode 100644
index 00000000000..1cb364c6f8d
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/test_q8ta_binary.cpp
@@ -0,0 +1,373 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include
+#include
+#include
+#include
+#include
+#include "utils.h"
+
+using namespace executorch::vulkan::prototyping;
+
+static constexpr int64_t kRefDimSizeLimit = 512;
+
+// Configuration struct for q8ta binary testing
+struct Q8taBinaryConfig {
+  std::vector<int64_t> shape; // Tensor shape (can be any dimensionality)
+  std::string test_case_name = "placeholder";
+  std::string op_name = "q8ta_add";
+};
+
+// Utility function to create a test case from a Q8taBinaryConfig
+TestCase create_test_case_from_config(
+    const Q8taBinaryConfig& config,
+    utils::StorageType storage_type,
+    vkapi::ScalarType input_dtype,
+    utils::GPUMemoryLayout fp_memory_layout,
+    utils::GPUMemoryLayout quant_layout) {
+  TestCase test_case;
+
+  // Create a descriptive name for the test case
+  std::string shape_str = shape_string(config.shape);
+  std::string test_name = config.test_case_name + " I=" + shape_str + " " +
+      repr_str(storage_type, fp_memory_layout) + "->" +
+      repr_str(utils::kBuffer, quant_layout);
+  test_case.set_name(test_name);
+
+  // Set the operator name for the test case
+  std::string operator_name = "et_vk." + config.op_name + ".test";
+  test_case.set_operator_name(operator_name);
+
+  // Input tensor A (float/half)
+  ValueSpec input_a(
+      config.shape,
+      input_dtype,
+      storage_type,
+      fp_memory_layout,
+      DataGenType::RANDOM);
+
+  // Input tensor B (float/half)
+  ValueSpec input_b(
+      config.shape,
+      input_dtype,
+      storage_type,
+      fp_memory_layout,
+      DataGenType::RANDOM);
+
+  // Quantization parameters for input A
+  float input_a_scale_val = 0.007843; // 2/255 approximately
+  ValueSpec input_a_scale(input_a_scale_val);
+
+  int32_t input_a_zero_point_val = 3;
+  ValueSpec input_a_zero_point(input_a_zero_point_val);
+
+  // Quantization parameters for input B
+  float input_b_scale_val = 0.009412; // 2.4/255 approximately
+  ValueSpec input_b_scale(input_b_scale_val);
+
+  int32_t input_b_zero_point_val = -2;
+  ValueSpec input_b_zero_point(input_b_zero_point_val);
+
+  // Output quantization parameters
+  float output_scale_val = 0.015686; // 4/255 approximately
+  ValueSpec output_scale(output_scale_val);
+
+  int32_t output_zero_point_val = 1;
+  ValueSpec output_zero_point(output_zero_point_val);
+
+  // Alpha parameter
+  float alpha_val = 1.0f;
+  ValueSpec alpha(alpha_val);
+
+  // Quantized layout as integer
+  int32_t quant_layout_int = static_cast<int32_t>(quant_layout);
+  ValueSpec quant_layout_spec(quant_layout_int);
+
+  // Output tensor (float/half)
+  ValueSpec output(
+      config.shape,
+      input_dtype,
+      storage_type,
+      fp_memory_layout,
+      DataGenType::ZEROS);
+
+  // Add all specs to test case for q8ta add operation
+  test_case.add_input_spec(input_a);
+  test_case.add_input_spec(input_b);
+  test_case.add_input_spec(input_a_scale);
+  test_case.add_input_spec(input_a_zero_point);
+  test_case.add_input_spec(input_b_scale);
+  test_case.add_input_spec(input_b_zero_point);
+  test_case.add_input_spec(output_scale);
+  test_case.add_input_spec(output_zero_point);
+  test_case.add_input_spec(alpha);
+  test_case.add_input_spec(quant_layout_spec);
+
+  test_case.add_output_spec(output);
+
+  test_case.set_abs_tolerance(output_scale_val + 1e-4f);
+
+  // Use layout-only filter to focus on the binary operation
+  test_case.set_shader_filter({
+      "nchw_to",
+      "to_nchw",
+      "q8ta_quantize",
+      "q8ta_dequantize",
+  });
+
+  return test_case;
+}
+
+// Generate easy test cases for q8ta_add operation (for debugging)
+std::vector<TestCase> generate_q8ta_add_easy_cases() {
+  std::vector<TestCase> test_cases;
+
+  // Single simple configuration for debugging
+  Q8taBinaryConfig config = {
+      {1, 16, 16, 16}, // shape: [N, C, H, W]
+      "ACCU", // test_case_name
+  };
+
+  // FP memory layouts to test
+  std::vector<utils::GPUMemoryLayout> fp_layouts = {
+      utils::kWidthPacked,
+      utils::kChannelsPacked,
+  };
+
+  // Quantized memory layouts to test
+  std::vector<utils::GPUMemoryLayout> quant_layouts = {
+      utils::kPackedInt8_4W,
+      utils::kPackedInt8_4C,
+      utils::kPackedInt8_4W4C,
+      utils::kPackedInt8_4H4W,
+      utils::kPackedInt8_4C1W,
+  };
+
+  std::vector<utils::StorageType> storage_types = {utils::kBuffer};
+  std::vector<vkapi::ScalarType> float_types = {vkapi::kFloat};
+
+  // Generate test cases for each combination
+  for (const auto& fp_layout : fp_layouts) {
+    for (const auto& quant_layout : quant_layouts) {
+      for (const auto& storage_type : storage_types) {
+        for (const auto& input_dtype : float_types) {
+          test_cases.push_back(create_test_case_from_config(
+              config, storage_type, input_dtype, fp_layout, quant_layout));
+        }
+      }
+    }
+  }
+
+  return test_cases;
+}
+
+// Generate test cases for q8ta_add operation
+std::vector<TestCase> generate_q8ta_add_test_cases() {
+  std::vector<TestCase> test_cases;
+
+  // Shapes to test
+  std::vector<std::vector<int64_t>> shapes = {
+      // Small test cases for correctness
+      {1, 3, 16, 16},
+      {1, 8, 32, 32},
+      {1, 16, 24, 24},
+      {1, 32, 12, 12},
+      {1, 1, 64, 64},
+      {1, 3, 64, 64},
+      {1, 4, 16, 16},
+
+      // Different tensor sizes
+      {1, 8, 20, 20},
+      {1, 16, 14, 14},
+      {1, 8, 28, 28},
+
+      // Odd tensor sizes
+      {1, 3, 15, 15},
+      {1, 13, 31, 31},
+      {1, 17, 23, 23},
+
+      // Performance test cases (larger tensors)
+      {1, 64, 128, 128},
+      {1, 32, 64, 64},
+      {1, 128, 56, 56},
+      {1, 128, 128, 128},
+  };
+
+  // FP memory layouts to test
+  std::vector<utils::GPUMemoryLayout> fp_layouts = {
+      utils::kWidthPacked,
+      utils::kChannelsPacked,
+  };
+
+  // Quantized memory layouts to test
+  std::vector<utils::GPUMemoryLayout> quant_layouts = {
+      utils::kPackedInt8_4W,
+      utils::kPackedInt8_4C,
+      utils::kPackedInt8_4W4C,
+      utils::kPackedInt8_4H4W,
+      utils::kPackedInt8_4C1W,
+  };
+
+  // Test with buffer storage only
+  std::vector<utils::StorageType> storage_types = {utils::kBuffer};
+
+  // Generate all combinations
+  for (const auto& shape : shapes) {
+    // Generate test case name prefix from shape dimensions
+    std::string prefix = "ACCU";
+    for (const auto& dim : shape) {
+      if (dim > kRefDimSizeLimit) {
+        prefix = "PERF";
+        break;
+      }
+    }
+
+    for (const auto& fp_layout : fp_layouts) {
+      for (const auto& quant_layout : quant_layouts) {
+        for (const auto& storage_type : storage_types) {
+          Q8taBinaryConfig config;
+          config.shape = shape;
+          config.test_case_name = prefix;
+
+          test_cases.push_back(create_test_case_from_config(
+              config, storage_type, vkapi::kFloat, fp_layout, quant_layout));
+        }
+      }
+    }
+  }
+
+  return test_cases;
+}
+
+// Reference implementation for quantized add operation
+void q8ta_add_reference_impl(TestCase& test_case) {
+  int32_t idx = 0;
+  const ValueSpec& input_a_spec = test_case.inputs()[idx++];
+  const ValueSpec& input_b_spec = test_case.inputs()[idx++];
+  const ValueSpec& input_a_scale_spec = test_case.inputs()[idx++];
+  const ValueSpec& input_a_zero_point_spec = test_case.inputs()[idx++];
+  const ValueSpec& input_b_scale_spec = test_case.inputs()[idx++];
+  const ValueSpec& input_b_zero_point_spec = test_case.inputs()[idx++];
+  const ValueSpec& output_scale_spec = test_case.inputs()[idx++];
+  const ValueSpec& output_zero_point_spec = test_case.inputs()[idx++];
+  const ValueSpec& alpha_spec = test_case.inputs()[idx++];
+  const ValueSpec& quant_layout_spec = test_case.inputs()[idx++];
+  (void)quant_layout_spec; // Not used in reference implementation
+
+  // Extract output specification
+  ValueSpec& output_spec = test_case.outputs()[0];
+
+  // Get tensor dimensions
+  auto input_sizes = input_a_spec.get_tensor_sizes();
+
+  // Calculate total number of elements
+  int64_t num_elements = 1;
+  for (const auto& dim : input_sizes) {
+    num_elements *= dim;
+  }
+
+  // Skip for large tensors since computation time will be extremely slow
+  for (const auto& dim : input_sizes) {
+    if (dim > kRefDimSizeLimit) {
+      throw std::invalid_argument(
+          "One or more dimensions exceed the allowed limit for reference "
+          "implementation.");
+    }
+  }
+
+  if (input_a_spec.dtype != vkapi::kFloat) {
+    throw std::invalid_argument("Unsupported dtype");
+  }
+
+  // Get raw data pointers
+  auto& input_a_data = input_a_spec.get_float_data();
+  auto& input_b_data = input_b_spec.get_float_data();
+
+  const float input_a_scale = input_a_scale_spec.get_float_value();
+  const int32_t input_a_zero_point = input_a_zero_point_spec.get_int_value();
+  const float input_b_scale = input_b_scale_spec.get_float_value();
+  const int32_t input_b_zero_point = input_b_zero_point_spec.get_int_value();
+  const float output_scale = output_scale_spec.get_float_value();
+  const int32_t output_zero_point = output_zero_point_spec.get_int_value();
+  const float alpha = alpha_spec.get_float_value();
+
+  auto& ref_data = output_spec.get_ref_float_data();
+  ref_data.resize(num_elements);
+
+  // Perform quantized add operation
+  for (int64_t i = 0; i < num_elements; ++i) {
+    // Quantize input A to int8
+    float quant_a_f =
+        std::round(input_a_data[i] / input_a_scale) + input_a_zero_point;
+    quant_a_f = std::min(std::max(quant_a_f, -128.0f), 127.0f);
+    int8_t quantized_a = static_cast<int8_t>(quant_a_f);
+
+    // Quantize input B to int8
+    float quant_b_f =
+        std::round(input_b_data[i] / input_b_scale) + input_b_zero_point;
+    quant_b_f = std::min(std::max(quant_b_f, -128.0f), 127.0f);
+    int8_t quantized_b = static_cast<int8_t>(quant_b_f);
+
+    // Dequantize both inputs to a common scale for addition
+    float dequant_a =
+        (static_cast<float>(quantized_a) - input_a_zero_point) * input_a_scale;
+    float dequant_b =
+        (static_cast<float>(quantized_b) - input_b_zero_point) * input_b_scale;
+
+    // Perform addition in float space with alpha
+    float float_result = dequant_a + alpha * dequant_b;
+
+    // Quantize the result to int8
+    float quant_output_f =
+        std::round(float_result / output_scale) + output_zero_point;
+    quant_output_f = std::min(std::max(quant_output_f, -128.0f), 127.0f);
+    int8_t quantized_output = static_cast<int8_t>(quant_output_f);
+
+    // Dequantize back to float for comparison
+    float dequant_output =
+        (static_cast<float>(quantized_output) - output_zero_point) *
+        output_scale;
+
+    ref_data[i] = dequant_output;
+  }
+}
+
+int main(int argc, char* argv[]) {
+  set_debugging(false);
+  set_print_output(false);
+  set_print_latencies(false);
+  set_use_gpu_timestamps(true);
+
+  print_performance_header();
+  std::cout << "Q8TA Binary Add Operation Prototyping Framework" << std::endl;
+  print_separator();
+
+  ReferenceComputeFunc ref_fn = q8ta_add_reference_impl;
+
+  auto results = execute_test_cases(
+#ifdef DEBUG_MODE
+      generate_q8ta_add_easy_cases,
+#else
+      generate_q8ta_add_test_cases,
+#endif
+      "Q8taBinaryAdd",
+#ifdef DEBUG_MODE
+      0,
+      1,
+#else
+      3,
+      10,
+#endif
+      ref_fn);
+
+  return 0;
+}
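A note on `set_abs_tolerance(output_scale_val + 1e-4f)`: the GPU path and this reference both land on the same output quantization grid, so any disagreement is at most a one-step rounding difference near a rounding boundary, and adjacent grid points are exactly `output_scale` apart. A quick numeric check of that bound:

```python
output_scale, output_zp = 0.015686, 1
# Dequantized outputs live on the grid (q - zp) * scale for q in [-128, 127].
grid = [(q - output_zp) * output_scale for q in range(-128, 128)]
gaps = [b - a for a, b in zip(grid, grid[1:])]
# A one-step rounding disagreement stays within the configured tolerance.
assert max(gaps) <= output_scale + 1e-4
```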