From febb5806fc5a876f11224ecb53926ee666244f4d Mon Sep 17 00:00:00 2001
From: ssjia
Date: Fri, 27 Mar 2026 11:21:59 -0700
Subject: [PATCH 1/2] [ET-VK][conv1d] Implement height-packed depthwise conv1d
 operator

Pull Request resolved: https://github.com/pytorch/executorch/pull/18333

Implement a depthwise conv1d operator using height-packed layout where
channels are the packed dimension (WHCN dim 1). Depthwise conv applies a
separate filter to each channel independently (groups=C), so 4 channels can
be processed in parallel using element-wise vec4 FMA over kernel positions.

Thread mapping: X=C/4, Y=L_out, Z=N. Each thread computes one output texel
(4 channels at one spatial position). Inner loop iterates over kernel
positions K with bounds-checked input access for padding.

Weight [C,1,K] is prepacked as channels-packed so each vec4 load gives 4
channels' weights at one kernel position.

Supports both buffer and texture3d storage, fp32/fp16, optional bias, and
arbitrary stride/padding/dilation.

Registered as et_vk.conv1d_dw.default (standalone custom op).

Performance on Adreno 750 (S24):
- [1,128,4096] K=31 buffer f16: 231 GFLOP/s
- [1,128,4096] K=31 buffer f32: 155 GFLOP/s
- [1,512,2048] K=5 buffer f32: 66 GFLOP/s

ghstack-source-id: 358903219
@exported-using-ghexport

Differential Revision: [D97344091](https://our.internmc.facebook.com/intern/diff/D97344091/)
---
 .../runtime/graph/ops/glsl/conv1d_dw.glsl     | 127 +++++++++
 .../runtime/graph/ops/glsl/conv1d_dw.yaml     |  22 ++
 .../runtime/graph/ops/impl/Conv1dDW.cpp       | 188 ++++++++++++
 .../test/custom_ops/impl/TestConv1dDW.cpp     |  46 +++
 backends/vulkan/test/custom_ops/targets.bzl   |   1 +
 .../vulkan/test/custom_ops/test_conv1d_dw.cpp | 267 ++++++++++++++++++
 6 files changed, 651 insertions(+)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.yaml
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/Conv1dDW.cpp
 create mode 100644 backends/vulkan/test/custom_ops/impl/TestConv1dDW.cpp
 create mode 100644 backends/vulkan/test/custom_ops/test_conv1d_dw.cpp

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.glsl
new file mode 100644
index 00000000000..7ea068af93c
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.glsl
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+#define VEC4_T ${texel_load_type(DTYPE, STORAGE)}
+#define T ${texel_load_component_type(DTYPE, STORAGE)}
+
+$if STORAGE == "buffer":
+  #define BUFFER
+  #define SCALAR_BUFFER
+$if HAS_BIAS:
+  #define HAS_BIAS
+
+${define_required_extensions(STORAGE, DTYPE)}
+
+layout(std430) buffer;
+
+#include "common.glslh"
+
+$if STORAGE == "buffer":
+  ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE, is_scalar_array=True)}
+  ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE, is_scalar_array=True)}
+  ${layout_declare_tensor(B, "r", "t_weight", DTYPE, STORAGE, is_scalar_array=True)}
+$else:
+  ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE, is_scalar_array=False)}
+  ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE, is_scalar_array=False)}
+  ${layout_declare_tensor(B, "r", "t_weight", DTYPE, STORAGE, is_scalar_array=False)}
+$if HAS_BIAS:
+  $if STORAGE == "buffer":
+    ${layout_declare_tensor(B, "r", "t_bias", DTYPE, STORAGE, is_scalar_array=True)}
+  $else:
+    ${layout_declare_tensor(B, "r", "t_bias", DTYPE, STORAGE, is_scalar_array=False)}
+
+// in_sizes: {L_in, C, N, 1} in WHCN order
+${layout_declare_ubo(B, "ivec4", "in_sizes")}
+// out_sizes: {L_out, C, N, 1} in WHCN order
+${layout_declare_ubo(B, "ivec4", "out_sizes")}
+
+layout(push_constant) uniform restrict Block {
+  int kernel_size;
+  int stride;
+  int padding;
+  int dilation;
+  float output_min;
+  float output_max;
+};
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Thread mapping: X = C/4, Y = L_out, Z = N
+// Each thread computes 4 output channels at one spatial position.
+// Depthwise: each channel has its own filter, so 4 channels can be computed
+// independently with element-wise vec4 FMA.
+
+void main() {
+  const int c4 = int(gl_GlobalInvocationID.x);
+  const int l_out = int(gl_GlobalInvocationID.y);
+  const int n = int(gl_GlobalInvocationID.z);
+
+  const int L_in = in_sizes.x;
+  const int C = in_sizes.y;
+  const int C4 = div_up_4(C);
+  const int L_out = out_sizes.x;
+
+  if (c4 >= C4 || l_out >= L_out) {
+    return;
+  }
+
+  VEC4_T sum = VEC4_T(0);
+
+  for (int k = 0; k < kernel_size; k++) {
+    const int l_in = l_out * stride - padding + k * dilation;
+    if (l_in >= 0 && l_in < L_in) {
+#ifdef BUFFER
+      const int in_base = (n * L_in + l_in) * C + c4 * 4;
+      T in_s0 = t_in[in_base];
+      T in_s1 = (c4 * 4 + 1 < C) ? t_in[in_base + 1] : T(0);
+      T in_s2 = (c4 * 4 + 2 < C) ? t_in[in_base + 2] : T(0);
+      T in_s3 = (c4 * 4 + 3 < C) ? t_in[in_base + 3] : T(0);
+      const VEC4_T in_val = VEC4_T(in_s0, in_s1, in_s2, in_s3);
+
+      const int w_base = k * C + c4 * 4;
+      T w_s0 = t_weight[w_base];
+      T w_s1 = (c4 * 4 + 1 < C) ? t_weight[w_base + 1] : T(0);
+      T w_s2 = (c4 * 4 + 2 < C) ? t_weight[w_base + 2] : T(0);
+      T w_s3 = (c4 * 4 + 3 < C) ? t_weight[w_base + 3] : T(0);
+      const VEC4_T w_val = VEC4_T(w_s0, w_s1, w_s2, w_s3);
+#else
+      const VEC4_T in_val = texelFetch(t_in, ivec3(l_in, c4, n), 0);
+      const VEC4_T w_val = texelFetch(t_weight, ivec3(k, 0, c4), 0);
+#endif
+      sum = fma(w_val, in_val, sum);
+    }
+  }
+
+#ifdef HAS_BIAS
+#ifdef BUFFER
+  const int bias_base = c4 * 4;
+  T b0 = t_bias[bias_base];
+  T b1 = (bias_base + 1 < C) ? t_bias[bias_base + 1] : T(0);
+  T b2 = (bias_base + 2 < C) ? t_bias[bias_base + 2] : T(0);
+  T b3 = (bias_base + 3 < C) ? t_bias[bias_base + 3] : T(0);
+  sum += VEC4_T(b0, b1, b2, b3);
+#else
+  sum += texelFetch(t_bias, ivec3(c4, 0, 0), 0);
+#endif
+#endif
+
+  sum = clamp(sum, VEC4_T(output_min), VEC4_T(output_max));
+
+#ifdef BUFFER
+  const int out_base = (n * L_out + l_out) * C + c4 * 4;
+  t_out[out_base] = sum.x;
+  if (c4 * 4 + 1 < C) t_out[out_base + 1] = sum.y;
+  if (c4 * 4 + 2 < C) t_out[out_base + 2] = sum.z;
+  if (c4 * 4 + 3 < C) t_out[out_base + 3] = sum.w;
+#else
+  imageStore(t_out, ivec3(l_out, c4, n), sum);
+#endif
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.yaml
new file mode 100644
index 00000000000..883ad8899ea
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.yaml
@@ -0,0 +1,22 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+conv1d_dw:
+  parameter_names_with_default_values:
+    DTYPE: float
+    STORAGE: texture3d
+    HAS_BIAS: false
+  generate_variant_forall:
+    STORAGE:
+      - VALUE: texture3d
+      - VALUE: buffer
+    DTYPE:
+      - VALUE: float
+      - VALUE: half
+  shader_variants:
+    - NAME: conv1d_dw
+    - NAME: conv1d_dw_bias
+      HAS_BIAS: true
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv1dDW.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv1dDW.cpp
new file mode 100644
index 00000000000..88d421e6994
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Conv1dDW.cpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ + +#include + +#include +#include + +#include +#include + +#include + +#include + +namespace vkcompute { + +void resize_conv1d_dw_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + const ValueRef out = args.at(0).refs.at(0); + const ValueRef self = args.at(1).refs.at(0); + + TensorRefPtr weight_ref = graph->get_tref(extra_args.at(0)); + + const int64_t stride = graph->get_int_list(extra_args.at(1))->at(0); + const int64_t padding = graph->get_int_list(extra_args.at(2))->at(0); + const int64_t dilation = graph->get_int_list(extra_args.at(3))->at(0); + + const std::vector in_sizes = graph->sizes_of(self); + const int64_t kernel_size = weight_ref->sizes.at(2); + const int64_t L_in = in_sizes.at(2); + + const int64_t L_out = + calc_out_size(L_in, kernel_size, stride, padding, dilation, false); + + graph->virtual_resize(out, {in_sizes.at(0), in_sizes.at(1), L_out}); +} + +struct Conv1dDWParams final { + int32_t kernel_size; + int32_t stride; + int32_t padding; + int32_t dilation; +}; + +struct Conv1dDWClampParams final { + float output_min; + float output_max; +}; + +utils::uvec3 pick_conv1d_dw_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + (void)shader; + (void)resize_args; + const ValueRef out = args.at(0).refs.at(0); + + // out is [N, C, L_out]; in WHCN: {L_out, C, N, 1} + const uint32_t C = graph->size_at(-2, out); + const uint32_t L_out = graph->size_at(-1, out); + const uint32_t N = + graph->dim_of(out) >= 3 ? 
graph->size_at(-3, out) : 1; + + return {utils::div_up_4(C), L_out, N}; +} + +void add_conv1d_dw_node( + ComputeGraph& graph, + const ValueRef in, + const ValueRef weight_data, + const ValueRef bias, + const ValueRef stride_ref, + const ValueRef padding_ref, + const ValueRef dilation_ref, + const ValueRef out, + const float output_min = std::numeric_limits::lowest(), + const float output_max = std::numeric_limits::max()) { + VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kHeightDim); + VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kHeightDim); + + const utils::StorageType storage_type = graph.storage_type_of(out); + + // Weight [C, 1, K] prepacked as channels-packed so each vec4 load gives + // 4 channels at one kernel position. + ValueRef packed_weight = prepack_standard( + graph, weight_data, storage_type, utils::kChannelsPacked); + + bool has_bias = graph.val_is_not_none(bias); + ValueRef packed_bias = kDummyValueRef; + if (has_bias) { + packed_bias = + prepack_standard(graph, bias, storage_type, utils::kWidthPacked); + } + + const auto stride_val = graph.get_int_list(stride_ref)->at(0); + const auto padding_val = graph.get_int_list(padding_ref)->at(0); + const auto dilation_val = graph.get_int_list(dilation_ref)->at(0); + + Conv1dDWParams params{ + utils::safe_downcast(graph.get_tref(weight_data)->sizes.at(2)), + utils::safe_downcast(stride_val), + utils::safe_downcast(padding_val), + utils::safe_downcast(dilation_val), + }; + + Conv1dDWClampParams clamp_params{ + output_min, + output_max, + }; + + std::string kernel_name = has_bias ? 
"conv1d_dw_bias" : "conv1d_dw"; + kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, storage_type); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + std::vector read_inputs = {in, packed_weight}; + if (has_bias) { + read_inputs.push_back(packed_bias); + } + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + pick_conv1d_dw_global_wg_size, + default_pick_local_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {read_inputs, vkapi::kRead}}, + // Shader params buffers + {graph.sizes_ubo(in), graph.sizes_ubo(out)}, + // Push Constants + {PushConstantDataInfo(¶ms, sizeof(Conv1dDWParams)), + PushConstantDataInfo(&clamp_params, sizeof(Conv1dDWClampParams))}, + // Specialization Constants + {}, + // Resize Args + {weight_data, stride_ref, padding_ref, dilation_ref}, + // Resizing Logic + resize_conv1d_dw_node)); +} + +// Args: in, weight, bias, stride, padding, dilation, groups, +// output_min, output_max, out +// output_min and output_max may be kDummyValueRef (no clamp). 
+void conv1d_dw(ComputeGraph& graph, const std::vector& args) { + ValueRef in = args[0]; + ValueRef weight = args[1]; + ValueRef bias = args[2]; + ValueRef stride = args[3]; + ValueRef padding = args[4]; + ValueRef dilation = args[5]; + ValueRef out = args[9]; + + float output_min = std::numeric_limits::lowest(); + float output_max = std::numeric_limits::max(); + if (is_valid(args[7])) { + output_min = graph.extract_scalar(args[7]); + } + if (is_valid(args[8])) { + output_max = graph.extract_scalar(args[8]); + } + + add_conv1d_dw_node( + graph, + in, + weight, + bias, + stride, + padding, + dilation, + out, + output_min, + output_max); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(et_vk.conv1d_dw.default, conv1d_dw); +} + +} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/impl/TestConv1dDW.cpp b/backends/vulkan/test/custom_ops/impl/TestConv1dDW.cpp new file mode 100644 index 00000000000..15923462a20 --- /dev/null +++ b/backends/vulkan/test/custom_ops/impl/TestConv1dDW.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +namespace vkcompute { + +void test_conv1d_dw(ComputeGraph& graph, const std::vector& args) { + // args: in, weight, bias, stride, padding, dilation, groups, out + const ValueRef input = args.at(0); + const ValueRef weight = args.at(1); + const ValueRef bias = args.at(2); + const ValueRef stride = args.at(3); + const ValueRef padding = args.at(4); + const ValueRef dilation = args.at(5); + const ValueRef groups = args.at(6); + const ValueRef out = args.at(7); + + // conv1d_dw expects: in, weight, bias, stride, padding, dilation, groups, + // output_min, output_max, out + VK_GET_OP_FN("et_vk.conv1d_dw.default") + (graph, + {input, + weight, + bias, + stride, + padding, + dilation, + groups, + kDummyValueRef, + kDummyValueRef, + out}); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(test_etvk.test_conv1d_dw.default, test_conv1d_dw); +} + +} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl index d535ca2661c..5fb0f7f4cbf 100644 --- a/backends/vulkan/test/custom_ops/targets.bzl +++ b/backends/vulkan/test/custom_ops/targets.bzl @@ -104,3 +104,4 @@ def define_common_targets(is_fbcode = False): define_custom_op_test_binary("test_conv2d_dw") define_custom_op_test_binary("test_embedding_q4gsw") define_custom_op_test_binary("test_conv1d_pw") + define_custom_op_test_binary("test_conv1d_dw") diff --git a/backends/vulkan/test/custom_ops/test_conv1d_dw.cpp b/backends/vulkan/test/custom_ops/test_conv1d_dw.cpp new file mode 100644 index 00000000000..2438847036e --- /dev/null +++ b/backends/vulkan/test/custom_ops/test_conv1d_dw.cpp @@ -0,0 +1,267 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include + +#include +#include + +#include "utils.h" + +using namespace executorch::vulkan::prototyping; +using namespace vkcompute; + +static constexpr int64_t kRefDimSizeLimit = 256; + +struct Conv1dDWConfig { + int64_t N; + int64_t C; + int64_t L; + int64_t K; + int64_t stride; + int64_t padding; + int64_t dilation; + bool has_bias; +}; + +static TestCase create_conv1d_dw_test_case( + const Conv1dDWConfig& config, + vkapi::ScalarType dtype, + utils::StorageType storage_type) { + TestCase test_case; + + bool is_perf = config.C > kRefDimSizeLimit || config.L > kRefDimSizeLimit; + + std::string prefix = is_perf ? "PERF" : "ACCU"; + std::string storage_str = storage_type_abbrev(storage_type); + std::string dtype_str = (dtype == vkapi::kHalf) ? "f16" : "f32"; + std::string bias_str = config.has_bias ? "+bias" : ""; + + int64_t L_out = + (config.L + 2 * config.padding - config.dilation * (config.K - 1) - 1) / + config.stride + + 1; + + std::string name = prefix + " conv1d_dw" + bias_str + " [" + + std::to_string(config.N) + "," + std::to_string(config.C) + "," + + std::to_string(config.L) + "] K=" + std::to_string(config.K) + + " s=" + std::to_string(config.stride) + + " p=" + std::to_string(config.padding) + + " d=" + std::to_string(config.dilation) + " " + storage_str + "(HP) " + + dtype_str; + + test_case.set_name(name); + test_case.set_operator_name("test_etvk.test_conv1d_dw.default"); + + // Input: [N, C, L] height-packed + ValueSpec input( + {config.N, config.C, config.L}, + dtype, + storage_type, + utils::kHeightPacked, + DataGenType::RANDOM); + test_case.add_input_spec(input); + + // Weight: [C, 1, K] height-packed, constant + ValueSpec weight( + {config.C, 1, config.K}, + dtype, + storage_type, + utils::kHeightPacked, + DataGenType::RANDOM); + weight.set_constant(true); + test_case.add_input_spec(weight); + + // Bias: [C] or None + if (config.has_bias) { + ValueSpec bias( + {config.C}, + dtype, + storage_type, + utils::kWidthPacked, + 
DataGenType::RANDOM); + bias.set_constant(true); + test_case.add_input_spec(bias); + } else { + ValueSpec none_bias(static_cast(0)); + none_bias.set_none(true); + test_case.add_input_spec(none_bias); + } + + // stride + test_case.add_input_spec( + ValueSpec(std::vector{static_cast(config.stride)})); + // padding + test_case.add_input_spec( + ValueSpec(std::vector{static_cast(config.padding)})); + // dilation + test_case.add_input_spec( + ValueSpec(std::vector{static_cast(config.dilation)})); + // groups = C (depthwise) + test_case.add_input_spec(ValueSpec(static_cast(config.C))); + + // Output: [N, C, L_out] height-packed + ValueSpec output( + {config.N, config.C, L_out}, + dtype, + storage_type, + utils::kHeightPacked, + DataGenType::ZEROS); + test_case.add_output_spec(output); + + if (dtype == vkapi::kHalf) { + test_case.set_abs_tolerance(1e-1f); + test_case.set_rel_tolerance(1e-2f); + } else { + test_case.set_abs_tolerance(1e-3f); + test_case.set_rel_tolerance(1e-3f); + } + + test_case.set_shader_filter({"nchw_to", "to_nchw", "view_copy"}); + + return test_case; +} + +static void conv1d_dw_reference_impl(TestCase& test_case) { + const auto& input_spec = test_case.inputs()[0]; + const auto& weight_spec = test_case.inputs()[1]; + const auto& bias_spec = test_case.inputs()[2]; + const auto& stride_spec = test_case.inputs()[3]; + const auto& padding_spec = test_case.inputs()[4]; + const auto& dilation_spec = test_case.inputs()[5]; + ValueSpec& output = test_case.outputs()[0]; + + if (input_spec.dtype != vkapi::kFloat) { + throw std::invalid_argument("Reference only supports float"); + } + + auto in_sizes = input_spec.get_tensor_sizes(); + auto w_sizes = weight_spec.get_tensor_sizes(); + auto out_sizes = output.get_tensor_sizes(); + + const int64_t N = in_sizes[0]; + const int64_t C = in_sizes[1]; + const int64_t L_in = in_sizes[2]; + const int64_t K = w_sizes[2]; + const int64_t L_out = out_sizes[2]; + + const int64_t stride = stride_spec.get_int_list()[0]; + const 
int64_t padding = padding_spec.get_int_list()[0]; + const int64_t dilation = dilation_spec.get_int_list()[0]; + + const auto& in_data = input_spec.get_float_data(); + const auto& w_data = weight_spec.get_float_data(); + auto& ref_data = output.get_ref_float_data(); + ref_data.resize(N * C * L_out, 0.0f); + + for (int64_t n = 0; n < N; ++n) { + for (int64_t c = 0; c < C; ++c) { + for (int64_t l = 0; l < L_out; ++l) { + float sum = 0.0f; + for (int64_t k = 0; k < K; ++k) { + const int64_t l_in = l * stride - padding + k * dilation; + if (l_in >= 0 && l_in < L_in) { + sum += in_data[n * C * L_in + c * L_in + l_in] * w_data[c * K + k]; + } + } + ref_data[n * C * L_out + c * L_out + l] = sum; + } + } + } + + if (!bias_spec.is_none()) { + const auto& bias_data = bias_spec.get_float_data(); + for (int64_t n = 0; n < N; ++n) { + for (int64_t c = 0; c < C; ++c) { + for (int64_t l = 0; l < L_out; ++l) { + ref_data[n * C * L_out + c * L_out + l] += bias_data[c]; + } + } + } + } +} + +static std::vector generate_conv1d_dw_test_cases() { + std::vector test_cases; + + std::vector storage_types = { + utils::kTexture3D, utils::kBuffer}; + + // Accuracy shapes + std::vector accu_configs = { + // {N, C, L, K, stride, padding, dilation, has_bias} + {1, 16, 64, 3, 1, 1, 1, false}, + {1, 32, 128, 5, 1, 2, 1, true}, + {1, 64, 32, 3, 2, 1, 1, false}, + {2, 16, 64, 3, 1, 1, 1, true}, + {1, 16, 64, 7, 1, 3, 2, false}, + // Non-aligned channel counts (not a multiple of 4) + {1, 5, 64, 3, 1, 1, 1, false}, + {1, 5, 64, 3, 1, 1, 1, true}, + {1, 7, 32, 5, 1, 2, 1, false}, + {1, 13, 48, 3, 2, 1, 1, true}, + {2, 7, 64, 3, 1, 1, 1, false}, + }; + + for (const auto& cfg : accu_configs) { + for (auto st : storage_types) { + test_cases.push_back(create_conv1d_dw_test_case(cfg, vkapi::kFloat, st)); + } + } + + // Performance shapes (half + float) + std::vector perf_configs = { + {1, 256, 1024, 3, 1, 1, 1, false}, + {1, 512, 2048, 5, 1, 2, 1, true}, + {1, 128, 4096, 31, 1, 15, 1, false}, + }; + + for 
(const auto& cfg : perf_configs) { + for (auto st : storage_types) { + test_cases.push_back(create_conv1d_dw_test_case(cfg, vkapi::kFloat, st)); + test_cases.push_back(create_conv1d_dw_test_case(cfg, vkapi::kHalf, st)); + } + } + + return test_cases; +} + +static int64_t conv1d_dw_flop_calculator(const TestCase& test_case) { + auto in_sizes = test_case.inputs()[0].get_tensor_sizes(); + auto w_sizes = test_case.inputs()[1].get_tensor_sizes(); + auto out_sizes = test_case.outputs()[0].get_tensor_sizes(); + + const int64_t N = in_sizes[0]; + const int64_t C = in_sizes[1]; + const int64_t K = w_sizes[2]; + const int64_t L_out = out_sizes[2]; + + return 2 * N * C * L_out * K; +} + +int main(int argc, char* argv[]) { + set_debugging(false); + set_print_output(false); + set_print_latencies(false); + set_use_gpu_timestamps(true); + + print_performance_header(); + std::cout << "Conv1d Depthwise (Height-Packed) Benchmark" << std::endl; + print_separator(); + + ReferenceComputeFunc ref_fn = conv1d_dw_reference_impl; + + auto results = execute_test_cases( + generate_conv1d_dw_test_cases, + conv1d_dw_flop_calculator, + "Conv1dDW", + 3, + 10, + ref_fn); + + return 0; +} From e6b19ebd9ad139aa953d58920ec6da786aff7c80 Mon Sep 17 00:00:00 2001 From: ssjia Date: Fri, 27 Mar 2026 11:22:01 -0700 Subject: [PATCH 2/2] [ET-VK][conv1d] Route conv1d to height-packed implementations in export pipeline Pull Request resolved: https://github.com/pytorch/executorch/pull/18334 Integrate the new height-packed conv1d_pw and conv1d_dw operators into the aten.convolution.default dispatch path so they are automatically used during model export. In op_registry.py, add a pick_conv_storage function that inspects the convolution node at partition time. For 1D convolutions where the op is pointwise (kernel_size=1) or depthwise (groups=C_in) and channels are 4-aligned, it selects HEIGHT_PACKED_TEXTURE for input/output instead of the default CHANNELS_PACKED_TEXTURE. 
All other cases (conv2d, grouped conv1d with K>1, unaligned channels) retain
channels-packed behavior.

In Convolution.cpp, add a height-packed routing block at the top of the
conv1d path. When the input tensor is height-packed, it dispatches to
et_vk.conv1d_pw.default or et_vk.conv1d_dw.default via VK_GET_OP_FN. Falls
through to the existing channels-packed add_conv1d_node path otherwise.

ghstack-source-id: 358903217
@exported-using-ghexport

Differential Revision: [D97344090](https://our.internmc.facebook.com/intern/diff/D97344090/)
---
 backends/vulkan/op_registry.py                | 42 ++++++++++++++++
 .../runtime/graph/ops/impl/Convolution.cpp    | 50 +++++++++++++++++++
 2 files changed, 92 insertions(+)

diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index ddb843e2335..38215c2d827 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -802,6 +802,47 @@ def check_conv_node(node: torch.fx.Node) -> bool:
 
         return True
 
+    def pick_conv_storage(
+        node: torch.fx.Node,
+    ) -> Tuple[List[utils.TensorRepSet], utils.TensorRepSet]:
+        x = node.args[0]
+        assert isinstance(x, torch.fx.Node)
+        x_shape = x.meta["val"].size()
+
+        # Default: channels-packed texture (conv2d and fallback conv1d)
+        input_storage = utils.CHANNELS_PACKED_TEXTURE
+        output_storage = utils.CHANNELS_PACKED_TEXTURE
+
+        if len(x_shape) == 3:
+            # Conv1d: check if we can use height-packed
+            weight = node.args[1]
+            assert isinstance(weight, torch.fx.Node)
+            w_shape = weight.meta["val"].size()
+            groups = node.args[8]
+
+            c_in = x_shape[1]
+            c_out = w_shape[0]
+            kernel_size = w_shape[2]
+
+            is_pointwise = kernel_size == 1
+            is_depthwise = (
+                isinstance(groups, int)
+                and groups == c_in
+                and c_out == c_in
+                and w_shape[1] == 1
+            )
+            if is_pointwise or is_depthwise:
+                input_storage = utils.HEIGHT_PACKED_TEXTURE
+                output_storage = utils.HEIGHT_PACKED_TEXTURE
+
+        # Build per-input storage list. The convolution op has variable args:
+        # aten.convolution.default: input, weight, bias, stride, padding,
+        #   dilation, transposed, output_padding, groups
+        # et_vk.conv_with_clamp.default: + output_min, output_max
+        # All args after input are NO_STORAGE (prepacked or non-tensor)
+        inputs = [input_storage] + [utils.NO_STORAGE] * 10
+        return inputs, output_storage
+
     return OpFeatures(
         inputs_storage=[
             utils.CHANNELS_PACKED_TEXTURE,  # input
@@ -820,6 +861,7 @@ def check_conv_node(node: torch.fx.Node) -> bool:
         supports_resize=True,
         supports_prepacking=True,
         are_node_inputs_supported_fn=check_conv_node,
+        pick_io_storage_fn=pick_conv_storage,
     )
 
 
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index 2da98926fad..9c518678502 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -686,6 +686,56 @@ void conv(ComputeGraph& graph, const std::vector<ValueRef>& args) {
         true);
   }
 } else {
+    // Conv1d path
+    if (graph.packed_dim_of(args[0]) == WHCN::kHeightDim) {
+      // Height-packed: route to optimized conv1d implementations
+      const auto weight_sizes = graph.sizes_of(args[1]);
+      const int64_t groups_val = graph.get_int(args[8]);
+      const bool is_pointwise = weight_sizes.at(2) == 1;
+      const bool is_depthwise =
+          groups_val == weight_sizes.at(0) && weight_sizes.at(1) == 1;
+
+      // Build unified 10-arg vector:
+      // in, weight, bias, stride, padding, dilation, groups,
+      // output_min, output_max, out
+      // For non-clamp (args.size() == 10): output_min/max = kDummyValueRef
+      // For clamp (args.size() == 12): output_min/max from args[9]/args[10]
+      ValueRef output_min = kDummyValueRef;
+      ValueRef output_max = kDummyValueRef;
+      ValueRef out;
+      if (args.size() == 10) {
+        out = args[9];
+      } else {
+        output_min = args[9];
+        output_max = args[10];
+        out = args[11];
+      }
+
+      std::vector<ValueRef> conv1d_args = {
+          args[0],
+          args[1],
+          args[2],
+          args[3],
+          args[4],
+          args[5],
+          args[8],
+          output_min,
+          output_max,
+          out};
+
+      if (is_pointwise) {
+        VK_GET_OP_FN("et_vk.conv1d_pw.default")(graph, conv1d_args);
+      } else if (is_depthwise) {
+        VK_GET_OP_FN("et_vk.conv1d_dw.default")(graph, conv1d_args);
+      } else {
+        VK_THROW(
+            "Height-packed conv1d only supports pointwise (K=1) or "
+            "depthwise (groups=C)");
+      }
+      return;
+    }
+
+    // Existing channels-packed fallback
     if (args.size() == 10) {
       // ordinary conv1d
       return add_conv1d_node(