From 8786ba69ce78fe8ea9e7626ffff59fb1565559bf Mon Sep 17 00:00:00 2001 From: ssjia Date: Fri, 27 Mar 2026 11:22:01 -0700 Subject: [PATCH] [ET-VK][conv1d] Route conv1d to height-packed implementations in export pipeline Pull Request resolved: https://github.com/pytorch/executorch/pull/18334 Integrate the new height-packed conv1d_pw and conv1d_dw operators into the aten.convolution.default dispatch path so they are automatically used during model export. In op_registry.py, add a pick_conv_storage function that inspects the convolution node at partition time. For 1D convolutions where the op is pointwise (kernel_size=1) or depthwise (groups=C_in) and channels are 4-aligned, it selects HEIGHT_PACKED_TEXTURE for input/output instead of the default CHANNELS_PACKED_TEXTURE. All other cases (conv2d, grouped conv1d with K>1, unaligned channels) retain channels-packed behavior. In Convolution.cpp, add a height-packed routing block at the top of the conv1d path. When the input tensor is height-packed, it dispatches to et_vk.conv1d_pw.default or et_vk.conv1d_dw.default via VK_GET_OP_FN. Falls through to the existing channels-packed add_conv1d_node path otherwise. 
ghstack-source-id: 358903217 @exported-using-ghexport Differential Revision: [D97344090](https://our.internmc.facebook.com/intern/diff/D97344090/) --- backends/vulkan/op_registry.py | 42 ++++++++++++++++ .../runtime/graph/ops/impl/Convolution.cpp | 50 +++++++++++++++++++ 2 files changed, 92 insertions(+) diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py index ddb843e2335..38215c2d827 100644 --- a/backends/vulkan/op_registry.py +++ b/backends/vulkan/op_registry.py @@ -802,6 +802,47 @@ def check_conv_node(node: torch.fx.Node) -> bool: return True + def pick_conv_storage( + node: torch.fx.Node, + ) -> Tuple[List[utils.TensorRepSet], utils.TensorRepSet]: + x = node.args[0] + assert isinstance(x, torch.fx.Node) + x_shape = x.meta["val"].size() + + # Default: channels-packed texture (conv2d and fallback conv1d) + input_storage = utils.CHANNELS_PACKED_TEXTURE + output_storage = utils.CHANNELS_PACKED_TEXTURE + + if len(x_shape) == 3: + # Conv1d: check if we can use height-packed + weight = node.args[1] + assert isinstance(weight, torch.fx.Node) + w_shape = weight.meta["val"].size() + groups = node.args[8] + + c_in = x_shape[1] + c_out = w_shape[0] + kernel_size = w_shape[2] + + is_pointwise = kernel_size == 1 + is_depthwise = ( + isinstance(groups, int) + and groups == c_in + and c_out == c_in + and w_shape[1] == 1 + ) + if is_pointwise or is_depthwise: + input_storage = utils.HEIGHT_PACKED_TEXTURE + output_storage = utils.HEIGHT_PACKED_TEXTURE + + # Build per-input storage list. 
The convolution op has variable args: + # aten.convolution.default: input, weight, bias, stride, padding, + # dilation, transposed, output_padding, groups + # et_vk.conv_with_clamp.default: + output_min, output_max + # All args after input are NO_STORAGE (prepacked or non-tensor) + inputs = [input_storage] + [utils.NO_STORAGE] * 10 + return inputs, output_storage + return OpFeatures( inputs_storage=[ utils.CHANNELS_PACKED_TEXTURE, # input @@ -820,6 +861,7 @@ def check_conv_node(node: torch.fx.Node) -> bool: supports_resize=True, supports_prepacking=True, are_node_inputs_supported_fn=check_conv_node, + pick_io_storage_fn=pick_conv_storage, ) diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 2da98926fad..9c518678502 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -686,6 +686,56 @@ void conv(ComputeGraph& graph, const std::vector<ValueRef>& args) { true); } } else { + // Conv1d path + if (graph.packed_dim_of(args[0]) == WHCN::kHeightDim) { + // Height-packed: route to optimized conv1d implementations + const auto weight_sizes = graph.sizes_of(args[1]); + const int64_t groups_val = graph.get_int(args[8]); + const bool is_pointwise = weight_sizes.at(2) == 1; + const bool is_depthwise = + groups_val == weight_sizes.at(0) && weight_sizes.at(1) == 1; + + // Build unified 10-arg vector: + // in, weight, bias, stride, padding, dilation, groups, + // output_min, output_max, out + // For non-clamp (args.size() == 10): output_min/max = kDummyValueRef + // For clamp (args.size() == 12): output_min/max from args[9]/args[10] + ValueRef output_min = kDummyValueRef; + ValueRef output_max = kDummyValueRef; + ValueRef out; + if (args.size() == 10) { + out = args[9]; + } else { + output_min = args[9]; + output_max = args[10]; + out = args[11]; + } + + std::vector<ValueRef> conv1d_args = { + args[0], + args[1], + args[2], + args[3], + args[4],
+ args[5], + args[8], + output_min, + output_max, + out}; + + if (is_pointwise) { + VK_GET_OP_FN("et_vk.conv1d_pw.default")(graph, conv1d_args); + } else if (is_depthwise) { + VK_GET_OP_FN("et_vk.conv1d_dw.default")(graph, conv1d_args); + } else { + VK_THROW( + "Height-packed conv1d only supports pointwise (K=1) or " + "depthwise (groups=C)"); + } + return; + } + + // Existing channels-packed fallback if (args.size() == 10) { // ordinary conv1d return add_conv1d_node(