From f21bfb9fe3f734557b3a222c251158b24db98f18 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devvm26340.ftw0.facebook.com>
Date: Wed, 11 Feb 2026 12:15:52 -0800
Subject: [PATCH] [ET-VK][ez] Make q8ta_conv2d use 4C1W layout

This commit changes the input layout of the q8ta_conv2d and q8ta_conv2d_dw operators from PackedInt8_4W4C to PackedInt8_4C1W in the op registry. The 4C1W layout matches the natural output format of channel-packed convolutions, which avoids unnecessary layout conversions between consecutive conv layers.

It also adds explicit `outputs_storage` declarations (PACKED_INT8_CHANNELS_PACKED_BUFFER) to both the PW and general q8ta_conv2d op registrations, so that the layout propagation pass can correctly determine the output layouts of these ops.

Differential Revision: [D93000165](https://our.internmc.facebook.com/intern/diff/D93000165/)

[ghstack-poisoned]
---
 backends/vulkan/op_registry.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index bc5b83b1b55..28df2e7001d 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -770,6 +770,9 @@ def register_q8ta_conv_pw_op():
             utils.NO_STORAGE,  # groups (non tensor)
             utils.NO_STORAGE,  # original OC count (non tensor)
         ],
+        outputs_storage=[
+            utils.PACKED_INT8_CHANNELS_PACKED_BUFFER,
+        ],
         supports_resize=False,
         supports_prepacking=True,
     )
@@ -784,7 +787,7 @@ def register_q8ta_conv_pw_op():
 def register_q8ta_conv2d_ops():
     return OpFeatures(
         inputs_storage=[
-            utils.PACKED_INT8_4W4C_BUFFER,  # input
+            utils.PACKED_INT8_4C1W_BUFFER,  # input
             utils.NO_STORAGE,  # input_scale (non tensor)
             utils.NO_STORAGE,  # input_zero_point (non tensor)
             utils.NO_STORAGE,  # weight (prepacked)
@@ -800,6 +803,9 @@ def register_q8ta_conv2d_ops():
             utils.NO_STORAGE,  # groups (non tensor)
             utils.NO_STORAGE,  # original OC count (non tensor)
         ],
+        outputs_storage=[
+            utils.PACKED_INT8_CHANNELS_PACKED_BUFFER,
+        ],
         supports_resize=False,
         supports_prepacking=True,
     )