From febb5806fc5a876f11224ecb53926ee666244f4d Mon Sep 17 00:00:00 2001
From: ssjia
Date: Fri, 27 Mar 2026 11:21:59 -0700
Subject: [PATCH 1/2] [ET-VK][conv1d] Implement height-packed depthwise conv1d
 operator

Pull Request resolved: https://github.com/pytorch/executorch/pull/18333

Implement a depthwise conv1d operator using height-packed layout where
channels are the packed dimension (WHCN dim 1). Depthwise conv applies a
separate filter to each channel independently (groups=C), so 4 channels can
be processed in parallel using element-wise vec4 FMA over kernel positions.

Thread mapping: X=C/4, Y=L_out, Z=N. Each thread computes one output texel
(4 channels at one spatial position). Inner loop iterates over kernel
positions K with bounds-checked input access for padding.

Weight [C,1,K] is prepacked as channels-packed so each vec4 load gives 4
channels' weights at one kernel position.

Supports both buffer and texture3d storage, fp32/fp16, optional bias, and
arbitrary stride/padding/dilation.

Registered as et_vk.conv1d_dw.default (standalone custom op).

Performance on Adreno 750 (S24):
- [1,128,4096] K=31 buffer f16: 231 GFLOP/s
- [1,128,4096] K=31 buffer f32: 155 GFLOP/s
- [1,512,2048] K=5 buffer f32: 66 GFLOP/s

ghstack-source-id: 358903219
@exported-using-ghexport

Differential Revision: [D97344091](https://our.internmc.facebook.com/intern/diff/D97344091/)
---
 .../runtime/graph/ops/glsl/conv1d_dw.glsl     | 127 +++++++++
 .../runtime/graph/ops/glsl/conv1d_dw.yaml     |  22 ++
 .../runtime/graph/ops/impl/Conv1dDW.cpp       | 188 ++++++++++++
 .../test/custom_ops/impl/TestConv1dDW.cpp     |  46 +++
 backends/vulkan/test/custom_ops/targets.bzl   |   1 +
 .../vulkan/test/custom_ops/test_conv1d_dw.cpp | 267 ++++++++++++++++++
 6 files changed, 651 insertions(+)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.yaml
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/Conv1dDW.cpp
 create mode 100644 backends/vulkan/test/custom_ops/impl/TestConv1dDW.cpp
 create mode 100644 backends/vulkan/test/custom_ops/test_conv1d_dw.cpp

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.glsl
new file mode 100644
index 00000000000..7ea068af93c
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.glsl
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+#define VEC4_T ${texel_load_type(DTYPE, STORAGE)}
+#define T ${texel_load_component_type(DTYPE, STORAGE)}
+
+$if STORAGE == "buffer":
+  #define BUFFER
+  #define SCALAR_BUFFER
+$if HAS_BIAS:
+  #define HAS_BIAS
+
+${define_required_extensions(STORAGE, DTYPE)}
+
+layout(std430) buffer;
+
+#include "common.glslh"
+
+$if STORAGE == "buffer":
+  ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE, is_scalar_array=True)}
+  ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE, is_scalar_array=True)}
+  ${layout_declare_tensor(B, "r", "t_weight", DTYPE, STORAGE, is_scalar_array=True)}
+$else:
+  ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE, is_scalar_array=False)}
+  ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE, is_scalar_array=False)}
+  ${layout_declare_tensor(B, "r", "t_weight", DTYPE, STORAGE, is_scalar_array=False)}
+$if HAS_BIAS:
+  $if STORAGE == "buffer":
+    ${layout_declare_tensor(B, "r", "t_bias", DTYPE, STORAGE, is_scalar_array=True)}
+  $else:
+    ${layout_declare_tensor(B, "r", "t_bias", DTYPE, STORAGE, is_scalar_array=False)}
+
+// in_sizes: {L_in, C, N, 1} in WHCN order
+${layout_declare_ubo(B, "ivec4", "in_sizes")}
+// out_sizes: {L_out, C, N, 1} in WHCN order
+${layout_declare_ubo(B, "ivec4", "out_sizes")}
+
+layout(push_constant) uniform restrict Block {
+  int kernel_size;
+  int stride;
+  int padding;
+  int dilation;
+  float output_min;
+  float output_max;
+};
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Thread mapping: X = C/4, Y = L_out, Z = N
+// Each thread computes 4 output channels at one spatial position.
+// Depthwise: each channel has its own filter, so 4 channels can be computed
+// independently with element-wise vec4 FMA.
+
+void main() {
+  const int c4 = int(gl_GlobalInvocationID.x);
+  const int l_out = int(gl_GlobalInvocationID.y);
+  const int n = int(gl_GlobalInvocationID.z);
+
+  const int L_in = in_sizes.x;
+  const int C = in_sizes.y;
+  const int C4 = div_up_4(C);
+  const int L_out = out_sizes.x;
+
+  if (c4 >= C4 || l_out >= L_out) {
+    return;
+  }
+
+  VEC4_T sum = VEC4_T(0);
+
+  for (int k = 0; k < kernel_size; k++) {
+    const int l_in = l_out * stride - padding + k * dilation;
+    if (l_in >= 0 && l_in < L_in) {
+#ifdef BUFFER
+      const int in_base = (n * L_in + l_in) * C + c4 * 4;
+      T in_s0 = t_in[in_base];
+      T in_s1 = (c4 * 4 + 1 < C) ? t_in[in_base + 1] : T(0);
+      T in_s2 = (c4 * 4 + 2 < C) ? t_in[in_base + 2] : T(0);
+      T in_s3 = (c4 * 4 + 3 < C) ? t_in[in_base + 3] : T(0);
+      const VEC4_T in_val = VEC4_T(in_s0, in_s1, in_s2, in_s3);
+
+      const int w_base = k * C + c4 * 4;
+      T w_s0 = t_weight[w_base];
+      T w_s1 = (c4 * 4 + 1 < C) ? t_weight[w_base + 1] : T(0);
+      T w_s2 = (c4 * 4 + 2 < C) ? t_weight[w_base + 2] : T(0);
+      T w_s3 = (c4 * 4 + 3 < C) ? t_weight[w_base + 3] : T(0);
+      const VEC4_T w_val = VEC4_T(w_s0, w_s1, w_s2, w_s3);
+#else
+      const VEC4_T in_val = texelFetch(t_in, ivec3(l_in, c4, n), 0);
+      const VEC4_T w_val = texelFetch(t_weight, ivec3(k, 0, c4), 0);
+#endif
+      sum = fma(w_val, in_val, sum);
+    }
+  }
+
+#ifdef HAS_BIAS
+#ifdef BUFFER
+  const int bias_base = c4 * 4;
+  T b0 = t_bias[bias_base];
+  T b1 = (bias_base + 1 < C) ? t_bias[bias_base + 1] : T(0);
+  T b2 = (bias_base + 2 < C) ? t_bias[bias_base + 2] : T(0);
+  T b3 = (bias_base + 3 < C) ? t_bias[bias_base + 3] : T(0);
+  sum += VEC4_T(b0, b1, b2, b3);
+#else
+  sum += texelFetch(t_bias, ivec3(c4, 0, 0), 0);
+#endif
+#endif
+
+  sum = clamp(sum, VEC4_T(output_min), VEC4_T(output_max));
+
+#ifdef BUFFER
+  const int out_base = (n * L_out + l_out) * C + c4 * 4;
+  t_out[out_base] = sum.x;
+  if (c4 * 4 + 1 < C) t_out[out_base + 1] = sum.y;
+  if (c4 * 4 + 2 < C) t_out[out_base + 2] = sum.z;
+  if (c4 * 4 + 3 < C) t_out[out_base + 3] = sum.w;
+#else
+  imageStore(t_out, ivec3(l_out, c4, n), sum);
+#endif
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.yaml
new file mode 100644
index 00000000000..883ad8899ea
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.yaml
@@ -0,0 +1,22 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+conv1d_dw:
+  parameter_names_with_default_values:
+    DTYPE: float
+    STORAGE: texture3d
+    HAS_BIAS: false
+  generate_variant_forall:
+    STORAGE:
+      - VALUE: texture3d
+      - VALUE: buffer
+    DTYPE:
+      - VALUE: float
+      - VALUE: half
+  shader_variants:
+    - NAME: conv1d_dw
+    - NAME: conv1d_dw_bias
+      HAS_BIAS: true
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv1dDW.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv1dDW.cpp
new file mode 100644
index 00000000000..88d421e6994
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Conv1dDW.cpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ + +#include + +#include +#include + +#include +#include + +#include + +#include + +namespace vkcompute { + +void resize_conv1d_dw_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + const ValueRef out = args.at(0).refs.at(0); + const ValueRef self = args.at(1).refs.at(0); + + TensorRefPtr weight_ref = graph->get_tref(extra_args.at(0)); + + const int64_t stride = graph->get_int_list(extra_args.at(1))->at(0); + const int64_t padding = graph->get_int_list(extra_args.at(2))->at(0); + const int64_t dilation = graph->get_int_list(extra_args.at(3))->at(0); + + const std::vector in_sizes = graph->sizes_of(self); + const int64_t kernel_size = weight_ref->sizes.at(2); + const int64_t L_in = in_sizes.at(2); + + const int64_t L_out = + calc_out_size(L_in, kernel_size, stride, padding, dilation, false); + + graph->virtual_resize(out, {in_sizes.at(0), in_sizes.at(1), L_out}); +} + +struct Conv1dDWParams final { + int32_t kernel_size; + int32_t stride; + int32_t padding; + int32_t dilation; +}; + +struct Conv1dDWClampParams final { + float output_min; + float output_max; +}; + +utils::uvec3 pick_conv1d_dw_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + (void)shader; + (void)resize_args; + const ValueRef out = args.at(0).refs.at(0); + + // out is [N, C, L_out]; in WHCN: {L_out, C, N, 1} + const uint32_t C = graph->size_at(-2, out); + const uint32_t L_out = graph->size_at(-1, out); + const uint32_t N = + graph->dim_of(out) >= 3 ? 
graph->size_at(-3, out) : 1; + + return {utils::div_up_4(C), L_out, N}; +} + +void add_conv1d_dw_node( + ComputeGraph& graph, + const ValueRef in, + const ValueRef weight_data, + const ValueRef bias, + const ValueRef stride_ref, + const ValueRef padding_ref, + const ValueRef dilation_ref, + const ValueRef out, + const float output_min = std::numeric_limits::lowest(), + const float output_max = std::numeric_limits::max()) { + VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kHeightDim); + VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kHeightDim); + + const utils::StorageType storage_type = graph.storage_type_of(out); + + // Weight [C, 1, K] prepacked as channels-packed so each vec4 load gives + // 4 channels at one kernel position. + ValueRef packed_weight = prepack_standard( + graph, weight_data, storage_type, utils::kChannelsPacked); + + bool has_bias = graph.val_is_not_none(bias); + ValueRef packed_bias = kDummyValueRef; + if (has_bias) { + packed_bias = + prepack_standard(graph, bias, storage_type, utils::kWidthPacked); + } + + const auto stride_val = graph.get_int_list(stride_ref)->at(0); + const auto padding_val = graph.get_int_list(padding_ref)->at(0); + const auto dilation_val = graph.get_int_list(dilation_ref)->at(0); + + Conv1dDWParams params{ + utils::safe_downcast(graph.get_tref(weight_data)->sizes.at(2)), + utils::safe_downcast(stride_val), + utils::safe_downcast(padding_val), + utils::safe_downcast(dilation_val), + }; + + Conv1dDWClampParams clamp_params{ + output_min, + output_max, + }; + + std::string kernel_name = has_bias ? 
"conv1d_dw_bias" : "conv1d_dw"; + kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, storage_type); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + std::vector read_inputs = {in, packed_weight}; + if (has_bias) { + read_inputs.push_back(packed_bias); + } + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + pick_conv1d_dw_global_wg_size, + default_pick_local_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {read_inputs, vkapi::kRead}}, + // Shader params buffers + {graph.sizes_ubo(in), graph.sizes_ubo(out)}, + // Push Constants + {PushConstantDataInfo(¶ms, sizeof(Conv1dDWParams)), + PushConstantDataInfo(&clamp_params, sizeof(Conv1dDWClampParams))}, + // Specialization Constants + {}, + // Resize Args + {weight_data, stride_ref, padding_ref, dilation_ref}, + // Resizing Logic + resize_conv1d_dw_node)); +} + +// Args: in, weight, bias, stride, padding, dilation, groups, +// output_min, output_max, out +// output_min and output_max may be kDummyValueRef (no clamp). 
+void conv1d_dw(ComputeGraph& graph, const std::vector& args) { + ValueRef in = args[0]; + ValueRef weight = args[1]; + ValueRef bias = args[2]; + ValueRef stride = args[3]; + ValueRef padding = args[4]; + ValueRef dilation = args[5]; + ValueRef out = args[9]; + + float output_min = std::numeric_limits::lowest(); + float output_max = std::numeric_limits::max(); + if (is_valid(args[7])) { + output_min = graph.extract_scalar(args[7]); + } + if (is_valid(args[8])) { + output_max = graph.extract_scalar(args[8]); + } + + add_conv1d_dw_node( + graph, + in, + weight, + bias, + stride, + padding, + dilation, + out, + output_min, + output_max); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(et_vk.conv1d_dw.default, conv1d_dw); +} + +} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/impl/TestConv1dDW.cpp b/backends/vulkan/test/custom_ops/impl/TestConv1dDW.cpp new file mode 100644 index 00000000000..15923462a20 --- /dev/null +++ b/backends/vulkan/test/custom_ops/impl/TestConv1dDW.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +namespace vkcompute { + +void test_conv1d_dw(ComputeGraph& graph, const std::vector& args) { + // args: in, weight, bias, stride, padding, dilation, groups, out + const ValueRef input = args.at(0); + const ValueRef weight = args.at(1); + const ValueRef bias = args.at(2); + const ValueRef stride = args.at(3); + const ValueRef padding = args.at(4); + const ValueRef dilation = args.at(5); + const ValueRef groups = args.at(6); + const ValueRef out = args.at(7); + + // conv1d_dw expects: in, weight, bias, stride, padding, dilation, groups, + // output_min, output_max, out + VK_GET_OP_FN("et_vk.conv1d_dw.default") + (graph, + {input, + weight, + bias, + stride, + padding, + dilation, + groups, + kDummyValueRef, + kDummyValueRef, + out}); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(test_etvk.test_conv1d_dw.default, test_conv1d_dw); +} + +} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl index d535ca2661c..5fb0f7f4cbf 100644 --- a/backends/vulkan/test/custom_ops/targets.bzl +++ b/backends/vulkan/test/custom_ops/targets.bzl @@ -104,3 +104,4 @@ def define_common_targets(is_fbcode = False): define_custom_op_test_binary("test_conv2d_dw") define_custom_op_test_binary("test_embedding_q4gsw") define_custom_op_test_binary("test_conv1d_pw") + define_custom_op_test_binary("test_conv1d_dw") diff --git a/backends/vulkan/test/custom_ops/test_conv1d_dw.cpp b/backends/vulkan/test/custom_ops/test_conv1d_dw.cpp new file mode 100644 index 00000000000..2438847036e --- /dev/null +++ b/backends/vulkan/test/custom_ops/test_conv1d_dw.cpp @@ -0,0 +1,267 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include + +#include +#include + +#include "utils.h" + +using namespace executorch::vulkan::prototyping; +using namespace vkcompute; + +static constexpr int64_t kRefDimSizeLimit = 256; + +struct Conv1dDWConfig { + int64_t N; + int64_t C; + int64_t L; + int64_t K; + int64_t stride; + int64_t padding; + int64_t dilation; + bool has_bias; +}; + +static TestCase create_conv1d_dw_test_case( + const Conv1dDWConfig& config, + vkapi::ScalarType dtype, + utils::StorageType storage_type) { + TestCase test_case; + + bool is_perf = config.C > kRefDimSizeLimit || config.L > kRefDimSizeLimit; + + std::string prefix = is_perf ? "PERF" : "ACCU"; + std::string storage_str = storage_type_abbrev(storage_type); + std::string dtype_str = (dtype == vkapi::kHalf) ? "f16" : "f32"; + std::string bias_str = config.has_bias ? "+bias" : ""; + + int64_t L_out = + (config.L + 2 * config.padding - config.dilation * (config.K - 1) - 1) / + config.stride + + 1; + + std::string name = prefix + " conv1d_dw" + bias_str + " [" + + std::to_string(config.N) + "," + std::to_string(config.C) + "," + + std::to_string(config.L) + "] K=" + std::to_string(config.K) + + " s=" + std::to_string(config.stride) + + " p=" + std::to_string(config.padding) + + " d=" + std::to_string(config.dilation) + " " + storage_str + "(HP) " + + dtype_str; + + test_case.set_name(name); + test_case.set_operator_name("test_etvk.test_conv1d_dw.default"); + + // Input: [N, C, L] height-packed + ValueSpec input( + {config.N, config.C, config.L}, + dtype, + storage_type, + utils::kHeightPacked, + DataGenType::RANDOM); + test_case.add_input_spec(input); + + // Weight: [C, 1, K] height-packed, constant + ValueSpec weight( + {config.C, 1, config.K}, + dtype, + storage_type, + utils::kHeightPacked, + DataGenType::RANDOM); + weight.set_constant(true); + test_case.add_input_spec(weight); + + // Bias: [C] or None + if (config.has_bias) { + ValueSpec bias( + {config.C}, + dtype, + storage_type, + utils::kWidthPacked, + 
DataGenType::RANDOM); + bias.set_constant(true); + test_case.add_input_spec(bias); + } else { + ValueSpec none_bias(static_cast(0)); + none_bias.set_none(true); + test_case.add_input_spec(none_bias); + } + + // stride + test_case.add_input_spec( + ValueSpec(std::vector{static_cast(config.stride)})); + // padding + test_case.add_input_spec( + ValueSpec(std::vector{static_cast(config.padding)})); + // dilation + test_case.add_input_spec( + ValueSpec(std::vector{static_cast(config.dilation)})); + // groups = C (depthwise) + test_case.add_input_spec(ValueSpec(static_cast(config.C))); + + // Output: [N, C, L_out] height-packed + ValueSpec output( + {config.N, config.C, L_out}, + dtype, + storage_type, + utils::kHeightPacked, + DataGenType::ZEROS); + test_case.add_output_spec(output); + + if (dtype == vkapi::kHalf) { + test_case.set_abs_tolerance(1e-1f); + test_case.set_rel_tolerance(1e-2f); + } else { + test_case.set_abs_tolerance(1e-3f); + test_case.set_rel_tolerance(1e-3f); + } + + test_case.set_shader_filter({"nchw_to", "to_nchw", "view_copy"}); + + return test_case; +} + +static void conv1d_dw_reference_impl(TestCase& test_case) { + const auto& input_spec = test_case.inputs()[0]; + const auto& weight_spec = test_case.inputs()[1]; + const auto& bias_spec = test_case.inputs()[2]; + const auto& stride_spec = test_case.inputs()[3]; + const auto& padding_spec = test_case.inputs()[4]; + const auto& dilation_spec = test_case.inputs()[5]; + ValueSpec& output = test_case.outputs()[0]; + + if (input_spec.dtype != vkapi::kFloat) { + throw std::invalid_argument("Reference only supports float"); + } + + auto in_sizes = input_spec.get_tensor_sizes(); + auto w_sizes = weight_spec.get_tensor_sizes(); + auto out_sizes = output.get_tensor_sizes(); + + const int64_t N = in_sizes[0]; + const int64_t C = in_sizes[1]; + const int64_t L_in = in_sizes[2]; + const int64_t K = w_sizes[2]; + const int64_t L_out = out_sizes[2]; + + const int64_t stride = stride_spec.get_int_list()[0]; + const 
int64_t padding = padding_spec.get_int_list()[0]; + const int64_t dilation = dilation_spec.get_int_list()[0]; + + const auto& in_data = input_spec.get_float_data(); + const auto& w_data = weight_spec.get_float_data(); + auto& ref_data = output.get_ref_float_data(); + ref_data.resize(N * C * L_out, 0.0f); + + for (int64_t n = 0; n < N; ++n) { + for (int64_t c = 0; c < C; ++c) { + for (int64_t l = 0; l < L_out; ++l) { + float sum = 0.0f; + for (int64_t k = 0; k < K; ++k) { + const int64_t l_in = l * stride - padding + k * dilation; + if (l_in >= 0 && l_in < L_in) { + sum += in_data[n * C * L_in + c * L_in + l_in] * w_data[c * K + k]; + } + } + ref_data[n * C * L_out + c * L_out + l] = sum; + } + } + } + + if (!bias_spec.is_none()) { + const auto& bias_data = bias_spec.get_float_data(); + for (int64_t n = 0; n < N; ++n) { + for (int64_t c = 0; c < C; ++c) { + for (int64_t l = 0; l < L_out; ++l) { + ref_data[n * C * L_out + c * L_out + l] += bias_data[c]; + } + } + } + } +} + +static std::vector generate_conv1d_dw_test_cases() { + std::vector test_cases; + + std::vector storage_types = { + utils::kTexture3D, utils::kBuffer}; + + // Accuracy shapes + std::vector accu_configs = { + // {N, C, L, K, stride, padding, dilation, has_bias} + {1, 16, 64, 3, 1, 1, 1, false}, + {1, 32, 128, 5, 1, 2, 1, true}, + {1, 64, 32, 3, 2, 1, 1, false}, + {2, 16, 64, 3, 1, 1, 1, true}, + {1, 16, 64, 7, 1, 3, 2, false}, + // Non-aligned channel counts (not a multiple of 4) + {1, 5, 64, 3, 1, 1, 1, false}, + {1, 5, 64, 3, 1, 1, 1, true}, + {1, 7, 32, 5, 1, 2, 1, false}, + {1, 13, 48, 3, 2, 1, 1, true}, + {2, 7, 64, 3, 1, 1, 1, false}, + }; + + for (const auto& cfg : accu_configs) { + for (auto st : storage_types) { + test_cases.push_back(create_conv1d_dw_test_case(cfg, vkapi::kFloat, st)); + } + } + + // Performance shapes (half + float) + std::vector perf_configs = { + {1, 256, 1024, 3, 1, 1, 1, false}, + {1, 512, 2048, 5, 1, 2, 1, true}, + {1, 128, 4096, 31, 1, 15, 1, false}, + }; + + for 
(const auto& cfg : perf_configs) { + for (auto st : storage_types) { + test_cases.push_back(create_conv1d_dw_test_case(cfg, vkapi::kFloat, st)); + test_cases.push_back(create_conv1d_dw_test_case(cfg, vkapi::kHalf, st)); + } + } + + return test_cases; +} + +static int64_t conv1d_dw_flop_calculator(const TestCase& test_case) { + auto in_sizes = test_case.inputs()[0].get_tensor_sizes(); + auto w_sizes = test_case.inputs()[1].get_tensor_sizes(); + auto out_sizes = test_case.outputs()[0].get_tensor_sizes(); + + const int64_t N = in_sizes[0]; + const int64_t C = in_sizes[1]; + const int64_t K = w_sizes[2]; + const int64_t L_out = out_sizes[2]; + + return 2 * N * C * L_out * K; +} + +int main(int argc, char* argv[]) { + set_debugging(false); + set_print_output(false); + set_print_latencies(false); + set_use_gpu_timestamps(true); + + print_performance_header(); + std::cout << "Conv1d Depthwise (Height-Packed) Benchmark" << std::endl; + print_separator(); + + ReferenceComputeFunc ref_fn = conv1d_dw_reference_impl; + + auto results = execute_test_cases( + generate_conv1d_dw_test_cases, + conv1d_dw_flop_calculator, + "Conv1dDW", + 3, + 10, + ref_fn); + + return 0; +} From e6b19ebd9ad139aa953d58920ec6da786aff7c80 Mon Sep 17 00:00:00 2001 From: ssjia Date: Fri, 27 Mar 2026 11:22:01 -0700 Subject: [PATCH 2/2] [ET-VK][conv1d] Route conv1d to height-packed implementations in export pipeline Pull Request resolved: https://github.com/pytorch/executorch/pull/18334 Integrate the new height-packed conv1d_pw and conv1d_dw operators into the aten.convolution.default dispatch path so they are automatically used during model export. In op_registry.py, add a pick_conv_storage function that inspects the convolution node at partition time. For 1D convolutions where the op is pointwise (kernel_size=1) or depthwise (groups=C_in) and channels are 4-aligned, it selects HEIGHT_PACKED_TEXTURE for input/output instead of the default CHANNELS_PACKED_TEXTURE. 
All other cases (conv2d, grouped conv1d with K>1, unaligned channels) retain
channels-packed behavior.

In Convolution.cpp, add a height-packed routing block at the top of the
conv1d path. When the input tensor is height-packed, it dispatches to
et_vk.conv1d_pw.default or et_vk.conv1d_dw.default via VK_GET_OP_FN. Falls
through to the existing channels-packed add_conv1d_node path otherwise.

ghstack-source-id: 358903217
@exported-using-ghexport

Differential Revision: [D97344090](https://our.internmc.facebook.com/intern/diff/D97344090/)
---
 backends/vulkan/op_registry.py                | 42 ++++++++++++++++
 .../runtime/graph/ops/impl/Convolution.cpp    | 50 +++++++++++++++++++
 2 files changed, 92 insertions(+)

diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index ddb843e2335..38215c2d827 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -802,6 +802,47 @@ def check_conv_node(node: torch.fx.Node) -> bool:
 
         return True
 
+    def pick_conv_storage(
+        node: torch.fx.Node,
+    ) -> Tuple[List[utils.TensorRepSet], utils.TensorRepSet]:
+        x = node.args[0]
+        assert isinstance(x, torch.fx.Node)
+        x_shape = x.meta["val"].size()
+
+        # Default: channels-packed texture (conv2d and fallback conv1d)
+        input_storage = utils.CHANNELS_PACKED_TEXTURE
+        output_storage = utils.CHANNELS_PACKED_TEXTURE
+
+        if len(x_shape) == 3:
+            # Conv1d: check if we can use height-packed
+            weight = node.args[1]
+            assert isinstance(weight, torch.fx.Node)
+            w_shape = weight.meta["val"].size()
+            groups = node.args[8]
+
+            c_in = x_shape[1]
+            c_out = w_shape[0]
+            kernel_size = w_shape[2]
+
+            is_pointwise = kernel_size == 1
+            is_depthwise = (
+                isinstance(groups, int)
+                and groups == c_in
+                and c_out == c_in
+                and w_shape[1] == 1
+            )
+            if is_pointwise or is_depthwise:
+                input_storage = utils.HEIGHT_PACKED_TEXTURE
+                output_storage = utils.HEIGHT_PACKED_TEXTURE
+
+        # Build per-input storage list. The convolution op has variable args:
+        # aten.convolution.default: input, weight, bias, stride, padding,
+        #   dilation, transposed, output_padding, groups
+        # et_vk.conv_with_clamp.default: + output_min, output_max
+        # All args after input are NO_STORAGE (prepacked or non-tensor)
+        inputs = [input_storage] + [utils.NO_STORAGE] * 10
+        return inputs, output_storage
+
     return OpFeatures(
         inputs_storage=[
             utils.CHANNELS_PACKED_TEXTURE,  # input
@@ -820,6 +861,7 @@ def check_conv_node(node: torch.fx.Node) -> bool:
         supports_resize=True,
         supports_prepacking=True,
         are_node_inputs_supported_fn=check_conv_node,
+        pick_io_storage_fn=pick_conv_storage,
     )
 
 
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index 2da98926fad..9c518678502 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -686,6 +686,56 @@ void conv(ComputeGraph& graph, const std::vector<ValueRef>& args) {
         true);
   }
 } else {
+    // Conv1d path
+    if (graph.packed_dim_of(args[0]) == WHCN::kHeightDim) {
+      // Height-packed: route to optimized conv1d implementations
+      const auto weight_sizes = graph.sizes_of(args[1]);
+      const int64_t groups_val = graph.get_int(args[8]);
+      const bool is_pointwise = weight_sizes.at(2) == 1;
+      const bool is_depthwise =
+          groups_val == weight_sizes.at(0) && weight_sizes.at(1) == 1;
+
+      // Build unified 10-arg vector:
+      // in, weight, bias, stride, padding, dilation, groups,
+      // output_min, output_max, out
+      // For non-clamp (args.size() == 10): output_min/max = kDummyValueRef
+      // For clamp (args.size() == 12): output_min/max from args[9]/args[10]
+      ValueRef output_min = kDummyValueRef;
+      ValueRef output_max = kDummyValueRef;
+      ValueRef out;
+      if (args.size() == 10) {
+        out = args[9];
+      } else {
+        output_min = args[9];
+        output_max = args[10];
+        out = args[11];
+      }
+
+      std::vector<ValueRef> conv1d_args = {
+          args[0],
+          args[1],
+          args[2],
+          args[3],
+          args[4],
+          args[5],
+          args[8],
+          output_min,
+          output_max,
+          out};
+
+      if (is_pointwise) {
+        VK_GET_OP_FN("et_vk.conv1d_pw.default")(graph, conv1d_args);
+      } else if (is_depthwise) {
+        VK_GET_OP_FN("et_vk.conv1d_dw.default")(graph, conv1d_args);
+      } else {
+        VK_THROW(
+            "Height-packed conv1d only supports pointwise (K=1) or "
+            "depthwise (groups=C)");
+      }
+      return;
+    }
+
+    // Existing channels-packed fallback
     if (args.size() == 10) {
       // ordinary conv1d
       return add_conv1d_node(