From 0ad10b5d73f28212abdd70236c9043f0a8a36fec Mon Sep 17 00:00:00 2001
From: Arseniy Obolenskiy <gooddoog@student.su>
Date: Sat, 4 Apr 2026 23:42:44 +0200
Subject: [PATCH 1/2] Optimize naive Conv4D with flat contiguous buffers

---
 include/layers/ConvLayer.hpp | 182 +++++++++++++++++------------------
 1 file changed, 90 insertions(+), 92 deletions(-)
diff --git a/include/layers/ConvLayer.hpp b/include/layers/ConvLayer.hpp
index db51dda70..64c5e338d 100644
--- a/include/layers/ConvLayer.hpp
+++ b/include/layers/ConvLayer.hpp
@@ -1,5 +1,6 @@
 #pragma once
 #include <cmath>
+#include <cstddef>
 #include <stdexcept>
 #include <vector>
 
@@ -34,7 +35,7 @@ class ConvolutionalLayer : public Layer {
     dilations_ = 0;
   }
   ConvolutionalLayer(size_t step, size_t pads, size_t dilations,
-                     const Tensor& kernel, const Tensor& bias = Tensor(),
+                     const Tensor &kernel, const Tensor &bias = Tensor(),
                      size_t group = 1, bool useLegacyImpl = false)
       : Layer(kConvolution),
         kernel_(std::make_shared<Tensor>(kernel)),
@@ -73,10 +74,10 @@ class ConvolutionalLayer : public Layer {
     return useLegacyImpl_;
   }
 
-  void run(const std::vector<Tensor>& input,
-           std::vector<Tensor>& output) override;
-  void run(const std::vector<Tensor>& input, std::vector<Tensor>& output,
-           const RuntimeOptions& options) override;
+  void run(const std::vector<Tensor> &input,
+           std::vector<Tensor> &output) override;
+  void run(const std::vector<Tensor> &input, std::vector<Tensor> &output,
+           const RuntimeOptions &options) override;
 #ifdef ENABLE_STATISTIC_WEIGHTS
   Tensor get_weights() override {
     return *kernel_;
@@ -100,7 +101,7 @@ class ConvImpl : public LayerImpl<ValueType> {
   ConvImpl() = delete;
   ConvImpl(size_t stride, size_t pads, size_t dilations, int input_width,
            int input_height, int input_flow, size_t input_size,
-           const std::vector<ValueType>& bias)
+           const std::vector<ValueType> &bias)
       : input_width_(input_width),
         input_height_(input_height),
         input_flow_(input_flow),
@@ -110,10 +111,10 @@ class ConvImpl : public LayerImpl<ValueType> {
         input_size_(input_size),
         bias_(bias) {}
 
-  ConvImpl(const ConvImpl& c) = default;
+  ConvImpl(const ConvImpl &c) = default;
 
   [[nodiscard]] std::vector<ValueType> run(
-      const std::vector<ValueType>& input) const override {
+      const std::vector<ValueType> &input) const override {
     return input;
   }
 
@@ -184,8 +185,8 @@ class ConvImpl : public LayerImpl<ValueType> {
 
 // NCHW -> NCHW only
 template <typename ValueType>
-void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
-            Tensor& output, size_t stride_, size_t pads_, size_t group_,
+void Conv4D(const Tensor &input, const Tensor &kernel_, const Tensor &bias_,
+            Tensor &output, size_t stride_, size_t pads_, size_t group_,
             size_t dilations_, ParBackend backend = ParBackend::kSeq) {
   size_t batch_size = input.get_shape()[0];
   size_t in_channels = input.get_shape()[1];
@@ -212,106 +213,103 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
   size_t out_width =
       ComputeConvOutputDim(in_width, kernel_width, stride_, pads_, dilations_);
 
-  std::vector<std::vector<std::vector<std::vector<ValueType>>>> padded_input(
-      batch_size,
-      std::vector<std::vector<std::vector<ValueType>>>(
-          in_height + 2 * pads_,
-          std::vector<std::vector<ValueType>>(
-              in_width + 2 * pads_, std::vector<ValueType>(in_channels, 0))));
-
   parallel::Options options;
   options.backend = backend;
 
-  parallel::parallel_for(batch_size, [&](size_t b) {
-    for (size_t h = 0; h < in_height; ++h) {
-      for (size_t w = 0; w < in_width; ++w) {
-        for (size_t c = 0; c < in_channels; ++c) {
-          padded_input[b][h + pads_][w + pads_][c] =
-              input.get<ValueType>({b, c, h, w});
-        }
-      }
-    }
-  }, options);
-
-  size_t dilated_kernel_height = (kernel_height - 1) * dilations_ + 1;
-  size_t dilated_kernel_width = (kernel_width - 1) * dilations_ + 1;
-
-  std::vector<std::vector<std::vector<std::vector<ValueType>>>> dil_kernel(
-      out_channels, std::vector<std::vector<std::vector<ValueType>>>(
-                        kernel_in_channels,
-                        std::vector<std::vector<ValueType>>(
-                            dilated_kernel_height,
-                            std::vector<ValueType>(dilated_kernel_width, 0))));
-
-  parallel::parallel_for(out_channels, [&](size_t oc) {
-    for (size_t ic = 0; ic < kernel_in_channels; ++ic) {
-      for (size_t kh = 0; kh < kernel_height; ++kh) {
-        for (size_t kw = 0; kw < kernel_width; ++kw) {
-          dil_kernel[oc][ic][kh * dilations_][kw * dilations_] =
-              kernel_.get<ValueType>({oc, ic, kh, kw});
-        }
-      }
-    }
-  }, options);
+  const auto &input_data = *input.as<ValueType>();
+  const auto &kernel_data = *kernel_.as<ValueType>();
+  const std::vector<ValueType> *bias_data = nullptr;
+  if (!bias_.empty()) {
+    bias_data = bias_.as<ValueType>();
+  }
 
-  std::vector<std::vector<std::vector<std::vector<ValueType>>>> output_tensor(
-      batch_size,
-      std::vector<std::vector<std::vector<ValueType>>>(
-          out_channels, std::vector<std::vector<ValueType>>(
-                            out_height, std::vector<ValueType>(out_width, 0))));
+  const size_t input_channel_stride = in_height * in_width;
+  const size_t input_batch_stride = in_channels * input_channel_stride;
+  const size_t kernel_channel_stride = kernel_height * kernel_width;
+  const size_t kernel_output_stride =
+      kernel_in_channels * kernel_channel_stride;
+  const size_t output_channel_stride = out_height * out_width;
+  const size_t output_batch_stride = out_channels * output_channel_stride;
+  const size_t in_channels_per_group = in_channels / group_;
+  const size_t out_channels_per_group = out_channels / group_;
+  const bool collapsed_kernel = dilations_ == 0;
 
+  Shape output_shape({batch_size, out_channels, out_height, out_width});
+  std::vector<ValueType> flat_output(output_shape.count(), 0);
   size_t total_work = batch_size * out_channels;
   parallel::parallel_for(total_work, [&](size_t idx) {
     size_t b = idx / out_channels;
     size_t oc = idx % out_channels;
+    size_t input_batch_base = b * input_batch_stride;
+    size_t output_base = b * output_batch_stride + oc * output_channel_stride;
+    size_t group = (group_ > 1) ? oc / out_channels_per_group : 0;
+    size_t group_start_channel = group * in_channels_per_group;
+    size_t group_end_channel = group_start_channel + in_channels_per_group;
+    size_t kernel_oc_base = oc * kernel_output_stride;
+    ValueType bias_value = ValueType{};
+    if (bias_data != nullptr && oc < bias_data->size()) {
+      bias_value = (*bias_data)[oc];
+    }
 
     for (size_t oh = 0; oh < out_height; ++oh) {
+      std::ptrdiff_t input_h_base = static_cast<std::ptrdiff_t>(oh * stride_) -
+                                    static_cast<std::ptrdiff_t>(pads_);
       for (size_t ow = 0; ow < out_width; ++ow) {
-        ValueType value = 0;
-        size_t h_start = oh * stride_;
-        size_t w_start = ow * stride_;
-
-        size_t group = (group_ > 1) ? oc / (out_channels / group_) : 0;
-        size_t group_start_channel = group * (in_channels / group_);
-        size_t group_end_channel = (group + 1) * (in_channels / group_);
+        ValueType value = bias_value;
+        std::ptrdiff_t input_w_base =
+            static_cast<std::ptrdiff_t>(ow * stride_) -
+            static_cast<std::ptrdiff_t>(pads_);
+        size_t output_idx = output_base + oh * out_width + ow;
 
         for (size_t ic = group_start_channel; ic < group_end_channel; ++ic) {
           size_t kernel_ic = ic - group_start_channel;
-
-          for (size_t kh = 0; kh < dilated_kernel_height; ++kh) {
-            for (size_t kw = 0; kw < dilated_kernel_width; ++kw) {
-              size_t h_index = h_start + kh;
-              size_t w_index = w_start + kw;
-
-              if (h_index < padded_input[b].size() &&
-                  w_index < padded_input[b][h_index].size()) {
-                value += padded_input[b][h_index][w_index][ic] *
-                         dil_kernel[oc][kernel_ic][kh][kw];
-              }
+          size_t input_channel_base =
+              input_batch_base + ic * input_channel_stride;
+          size_t kernel_ic_base =
+              kernel_oc_base + kernel_ic * kernel_channel_stride;
+
+          if (collapsed_kernel) {
+            if (input_h_base >= 0 &&
+                input_h_base < static_cast<std::ptrdiff_t>(in_height) &&
+                input_w_base >= 0 &&
+                input_w_base < static_cast<std::ptrdiff_t>(in_width)) {
+              size_t input_idx = input_channel_base +
+                                 static_cast<size_t>(input_h_base) * in_width +
+                                 static_cast<size_t>(input_w_base);
+              size_t kernel_idx = kernel_ic_base + kernel_channel_stride - 1;
+              value += input_data[input_idx] * kernel_data[kernel_idx];
             }
+            continue;
           }
-        }
 
-        if (!bias_.empty() && oc < bias_.get_shape()[0]) {
-          value += bias_.get<ValueType>({oc});
-        }
+          for (size_t kh = 0; kh < kernel_height; ++kh) {
+            std::ptrdiff_t input_h =
+                input_h_base + static_cast<std::ptrdiff_t>(kh * dilations_);
+            if (input_h < 0 ||
+                input_h >= static_cast<std::ptrdiff_t>(in_height)) {
+              continue;
+            }
 
-        output_tensor[b][oc][oh][ow] = value;
-      }
-    }
-  }, options);
+            size_t input_row_base =
+                input_channel_base + static_cast<size_t>(input_h) * in_width;
+            size_t kernel_row_base = kernel_ic_base + kh * kernel_width;
 
-  Shape output_shape({batch_size, out_channels, out_height, out_width});
-  std::vector<ValueType> flat_output(batch_size * out_channels * out_height *
-                                     out_width);
+            for (size_t kw = 0; kw < kernel_width; ++kw) {
+              std::ptrdiff_t input_w =
+                  input_w_base + static_cast<std::ptrdiff_t>(kw * dilations_);
+              if (input_w < 0 ||
+                  input_w >= static_cast<std::ptrdiff_t>(in_width)) {
+                continue;
+              }
 
-  parallel::parallel_for(batch_size, [&](size_t b) {
-    size_t base_idx = b * out_channels * out_height * out_width;
-    for (size_t oc = 0; oc < out_channels; ++oc) {
-      for (size_t h = 0; h < out_height; ++h) {
-        for (size_t w = 0; w < out_width; ++w) {
-          flat_output[base_idx++] = output_tensor[b][oc][h][w];
+              value +=
+                  input_data[input_row_base + static_cast<size_t>(input_w)] *
+                  kernel_data[kernel_row_base + kw];
+            }
+          }
         }
+
+        flat_output[output_idx] = value;
       }
     }
   }, options);
@@ -320,8 +318,8 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
 }
 
 template <typename ValueType>
-void DepthwiseConv4D(const Tensor& input, const Tensor& kernel_,
-                     const Tensor& bias_, Tensor& output, size_t stride_,
+void DepthwiseConv4D(const Tensor &input, const Tensor &kernel_,
+                     const Tensor &bias_, Tensor &output, size_t stride_,
                      size_t pads_, size_t dilations_,
                      ParBackend backend = ParBackend::kSeq) {
   size_t batch_size = input.get_shape()[0];
@@ -388,8 +386,8 @@ void DepthwiseConv4D(const Tensor& input, const Tensor& kernel_,
 
 // NCHW -> NCHW only (Legacy version)
 template <typename ValueType>
-void Conv4D_Legacy(const Tensor& input, const Tensor& kernel_,
-                   const Tensor& bias_, Tensor& output, size_t stride_,
+void Conv4D_Legacy(const Tensor &input, const Tensor &kernel_,
+                   const Tensor &bias_, Tensor &output, size_t stride_,
                    size_t pads_, size_t dilations_,
                    ParBackend backend = ParBackend::kSeq) {
   size_t batch_size = input.get_shape()[0];

From ea25dd94e4b31bdef85b6d0c1fe7fb8f2bb89898 Mon Sep 17 00:00:00 2001
From: Arseniy Obolenskiy <gooddoog@student.su>
Date: Sun, 5 Apr 2026 01:08:37 +0200
Subject: [PATCH 2/2] Tidy: declare bias_value with auto in Conv4D

---
 include/layers/ConvLayer.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/layers/ConvLayer.hpp b/include/layers/ConvLayer.hpp
index 64c5e338d..6a7c29a70 100644
--- a/include/layers/ConvLayer.hpp
+++ b/include/layers/ConvLayer.hpp
@@ -246,7 +246,7 @@ void Conv4D(const Tensor &input, const Tensor &kernel_, const Tensor &bias_,
     size_t group_start_channel = group * in_channels_per_group;
     size_t group_end_channel = group_start_channel + in_channels_per_group;
     size_t kernel_oc_base = oc * kernel_output_stride;
-    ValueType bias_value = ValueType{};
+    auto bias_value = ValueType{};
     if (bias_data != nullptr && oc < bias_data->size()) {
       bias_value = (*bias_data)[oc];
     }