From 0ad10b5d73f28212abdd70236c9043f0a8a36fec Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Sat, 4 Apr 2026 23:42:44 +0200 Subject: [PATCH 1/2] Optimize naive Conv4D with flat contiguous buffers --- include/layers/ConvLayer.hpp | 182 +++++++++++++++++------------------ 1 file changed, 90 insertions(+), 92 deletions(-) diff --git a/include/layers/ConvLayer.hpp b/include/layers/ConvLayer.hpp index db51dda70..64c5e338d 100644 --- a/include/layers/ConvLayer.hpp +++ b/include/layers/ConvLayer.hpp @@ -1,5 +1,6 @@ #pragma once #include +#include #include #include @@ -34,7 +35,7 @@ class ConvolutionalLayer : public Layer { dilations_ = 0; } ConvolutionalLayer(size_t step, size_t pads, size_t dilations, - const Tensor& kernel, const Tensor& bias = Tensor(), + const Tensor &kernel, const Tensor &bias = Tensor(), size_t group = 1, bool useLegacyImpl = false) : Layer(kConvolution), kernel_(std::make_shared(kernel)), @@ -73,10 +74,10 @@ class ConvolutionalLayer : public Layer { return useLegacyImpl_; } - void run(const std::vector& input, - std::vector& output) override; - void run(const std::vector& input, std::vector& output, - const RuntimeOptions& options) override; + void run(const std::vector &input, + std::vector &output) override; + void run(const std::vector &input, std::vector &output, + const RuntimeOptions &options) override; #ifdef ENABLE_STATISTIC_WEIGHTS Tensor get_weights() override { return *kernel_; @@ -100,7 +101,7 @@ class ConvImpl : public LayerImpl { ConvImpl() = delete; ConvImpl(size_t stride, size_t pads, size_t dilations, int input_width, int input_height, int input_flow, size_t input_size, - const std::vector& bias) + const std::vector &bias) : input_width_(input_width), input_height_(input_height), input_flow_(input_flow), @@ -110,10 +111,10 @@ class ConvImpl : public LayerImpl { input_size_(input_size), bias_(bias) {} - ConvImpl(const ConvImpl& c) = default; + ConvImpl(const ConvImpl &c) = default; [[nodiscard]] std::vector run( - const std::vector& input) const override { + const std::vector &input) const override { return input; } @@ -184,8 +185,8 @@ class ConvImpl : public LayerImpl { // NCHW -> NCHW only template -void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_, - Tensor& output, size_t stride_, size_t pads_, size_t group_, +void Conv4D(const Tensor &input, const Tensor &kernel_, const Tensor &bias_, + Tensor &output, size_t stride_, size_t pads_, size_t group_, size_t dilations_, ParBackend backend = ParBackend::kSeq) { size_t batch_size = input.get_shape()[0]; size_t in_channels = input.get_shape()[1]; @@ -212,106 +213,103 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_, size_t out_width = ComputeConvOutputDim(in_width, kernel_width, stride_, pads_, dilations_); - std::vector>>> padded_input( - batch_size, - std::vector>>( - in_height + 2 * pads_, - std::vector>( - in_width + 2 * pads_, std::vector(in_channels, 0)))); - parallel::Options options; options.backend = backend; - parallel::parallel_for(batch_size, [&](size_t b) { - for (size_t h = 0; h < in_height; ++h) { - for (size_t w = 0; w < in_width; ++w) { - for (size_t c = 0; c < in_channels; ++c) { - padded_input[b][h + pads_][w + pads_][c] = - input.get({b, c, h, w}); - } - } - } - }, options); - - size_t dilated_kernel_height = (kernel_height - 1) * dilations_ + 1; - size_t dilated_kernel_width = (kernel_width - 1) * dilations_ + 1; - - std::vector>>> dil_kernel( - out_channels, std::vector>>( - kernel_in_channels, - std::vector>( - dilated_kernel_height, - std::vector(dilated_kernel_width, 0)))); - - parallel::parallel_for(out_channels, [&](size_t oc) { - for (size_t ic = 0; ic < kernel_in_channels; ++ic) { - for (size_t kh = 0; kh < kernel_height; ++kh) { - for (size_t kw = 0; kw < kernel_width; ++kw) { - dil_kernel[oc][ic][kh * dilations_][kw * dilations_] = - kernel_.get({oc, ic, kh, kw}); - } - } - } - }, options); + const auto &input_data = *input.as(); + const auto &kernel_data = *kernel_.as(); + const std::vector *bias_data = nullptr; + if (!bias_.empty()) { + bias_data = bias_.as(); + } - std::vector>>> output_tensor( - batch_size, - std::vector>>( - out_channels, std::vector>( - out_height, std::vector(out_width, 0)))); + const size_t input_channel_stride = in_height * in_width; + const size_t input_batch_stride = in_channels * input_channel_stride; + const size_t kernel_channel_stride = kernel_height * kernel_width; + const size_t kernel_output_stride = + kernel_in_channels * kernel_channel_stride; + const size_t output_channel_stride = out_height * out_width; + const size_t output_batch_stride = out_channels * output_channel_stride; + const size_t in_channels_per_group = in_channels / group_; + const size_t out_channels_per_group = out_channels / group_; + const bool collapsed_kernel = dilations_ == 0; + Shape output_shape({batch_size, out_channels, out_height, out_width}); + std::vector flat_output(output_shape.count(), 0); size_t total_work = batch_size * out_channels; parallel::parallel_for(total_work, [&](size_t idx) { size_t b = idx / out_channels; size_t oc = idx % out_channels; + size_t input_batch_base = b * input_batch_stride; + size_t output_base = b * output_batch_stride + oc * output_channel_stride; + size_t group = (group_ > 1) ? oc / out_channels_per_group : 0; + size_t group_start_channel = group * in_channels_per_group; + size_t group_end_channel = group_start_channel + in_channels_per_group; + size_t kernel_oc_base = oc * kernel_output_stride; + ValueType bias_value = ValueType{}; + if (bias_data != nullptr && oc < bias_data->size()) { + bias_value = (*bias_data)[oc]; + } for (size_t oh = 0; oh < out_height; ++oh) { + std::ptrdiff_t input_h_base = static_cast(oh * stride_) - + static_cast(pads_); for (size_t ow = 0; ow < out_width; ++ow) { - ValueType value = 0; - size_t h_start = oh * stride_; - size_t w_start = ow * stride_; - - size_t group = (group_ > 1) ? oc / (out_channels / group_) : 0; - size_t group_start_channel = group * (in_channels / group_); - size_t group_end_channel = (group + 1) * (in_channels / group_); + ValueType value = bias_value; + std::ptrdiff_t input_w_base = + static_cast(ow * stride_) - + static_cast(pads_); + size_t output_idx = output_base + oh * out_width + ow; for (size_t ic = group_start_channel; ic < group_end_channel; ++ic) { size_t kernel_ic = ic - group_start_channel; - - for (size_t kh = 0; kh < dilated_kernel_height; ++kh) { - for (size_t kw = 0; kw < dilated_kernel_width; ++kw) { - size_t h_index = h_start + kh; - size_t w_index = w_start + kw; - - if (h_index < padded_input[b].size() && - w_index < padded_input[b][h_index].size()) { - value += padded_input[b][h_index][w_index][ic] * - dil_kernel[oc][kernel_ic][kh][kw]; - } + size_t input_channel_base = + input_batch_base + ic * input_channel_stride; + size_t kernel_ic_base = + kernel_oc_base + kernel_ic * kernel_channel_stride; + + if (collapsed_kernel) { + if (input_h_base >= 0 && + input_h_base < static_cast(in_height) && + input_w_base >= 0 && + input_w_base < static_cast(in_width)) { + size_t input_idx = input_channel_base + + static_cast(input_h_base) * in_width + + static_cast(input_w_base); + size_t kernel_idx = kernel_ic_base + kernel_channel_stride - 1; + value += input_data[input_idx] * kernel_data[kernel_idx]; } + continue; } - } - if (!bias_.empty() && oc < bias_.get_shape()[0]) { - value += bias_.get({oc}); - } + for (size_t kh = 0; kh < kernel_height; ++kh) { + std::ptrdiff_t input_h = + input_h_base + static_cast(kh * dilations_); + if (input_h < 0 || + input_h >= static_cast(in_height)) { + continue; + } - output_tensor[b][oc][oh][ow] = value; - } - } - }, options); + size_t input_row_base = + input_channel_base + static_cast(input_h) * in_width; + size_t kernel_row_base = kernel_ic_base + kh * kernel_width; - Shape output_shape({batch_size, out_channels, out_height, out_width}); - std::vector flat_output(batch_size * out_channels * out_height * - out_width); + for (size_t kw = 0; kw < kernel_width; ++kw) { + std::ptrdiff_t input_w = + input_w_base + static_cast(kw * dilations_); + if (input_w < 0 || + input_w >= static_cast(in_width)) { + continue; + } - parallel::parallel_for(batch_size, [&](size_t b) { - size_t base_idx = b * out_channels * out_height * out_width; - for (size_t oc = 0; oc < out_channels; ++oc) { - for (size_t h = 0; h < out_height; ++h) { - for (size_t w = 0; w < out_width; ++w) { - flat_output[base_idx++] = output_tensor[b][oc][h][w]; + value += + input_data[input_row_base + static_cast(input_w)] * + kernel_data[kernel_row_base + kw]; + } + } } + + flat_output[output_idx] = value; } } }, options); @@ -320,8 +318,8 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_, } template -void DepthwiseConv4D(const Tensor& input, const Tensor& kernel_, - const Tensor& bias_, Tensor& output, size_t stride_, +void DepthwiseConv4D(const Tensor &input, const Tensor &kernel_, + const Tensor &bias_, Tensor &output, size_t stride_, size_t pads_, size_t dilations_, ParBackend backend = ParBackend::kSeq) { size_t batch_size = input.get_shape()[0]; @@ -388,8 +386,8 @@ void DepthwiseConv4D(const Tensor& input, const Tensor& kernel_, // NCHW -> NCHW only (Legacy version) template -void Conv4D_Legacy(const Tensor& input, const Tensor& kernel_, - const Tensor& bias_, Tensor& output, size_t stride_, +void Conv4D_Legacy(const Tensor &input, const Tensor &kernel_, + const Tensor &bias_, Tensor &output, size_t stride_, size_t pads_, size_t dilations_, ParBackend backend = ParBackend::kSeq) { size_t batch_size = input.get_shape()[0]; From ea25dd94e4b31bdef85b6d0c1fe7fb8f2bb89898 Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Sun, 5 Apr 2026 01:08:37 +0200 Subject: [PATCH 2/2] tidy --- include/layers/ConvLayer.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/layers/ConvLayer.hpp b/include/layers/ConvLayer.hpp index 64c5e338d..6a7c29a70 100644 --- a/include/layers/ConvLayer.hpp +++ b/include/layers/ConvLayer.hpp @@ -246,7 +246,7 @@ void Conv4D(const Tensor &input, const Tensor &kernel_, const Tensor &bias_, size_t group_start_channel = group * in_channels_per_group; size_t group_end_channel = group_start_channel + in_channels_per_group; size_t kernel_oc_base = oc * kernel_output_stride; - ValueType bias_value = ValueType{}; + auto bias_value = ValueType{}; if (bias_data != nullptr && oc < bias_data->size()) { bias_value = (*bias_data)[oc]; }