From b8e52e3542cb565fe320efb5b5fd7a8dd576a69d Mon Sep 17 00:00:00 2001
From: AndreySorokin7
Date: Wed, 21 Jan 2026 14:05:03 +0300
Subject: [PATCH] first commit

---
 include/layers/ConvLayer.hpp          | 491 +++++++-----------
 src/layers/ConvLayer.cpp              |  57 +-
 test/single_layer/test_convlayer.cpp  |  13 +-
 .../test_convlayer_parall.cpp         | 277 ++++++++++
 4 files changed, 485 insertions(+), 353 deletions(-)
 create mode 100644 test/single_layer_parall_version/test_convlayer_parall.cpp

diff --git a/include/layers/ConvLayer.hpp b/include/layers/ConvLayer.hpp
index 720f68f3..57c521cc 100644
--- a/include/layers/ConvLayer.hpp
+++ b/include/layers/ConvLayer.hpp
@@ -1,7 +1,6 @@
 #pragma once
 #include
 #include
-#include <thread>
 #include

 #include "layers/Layer.hpp"
@@ -26,7 +25,6 @@ class ConvolutionalLayer : public Layer {
   Tensor kernel_;
   Tensor bias_;
   size_t group_;
-  ImplType implType_;
   bool useLegacyImpl_;

  public:
@@ -34,7 +32,6 @@ class ConvolutionalLayer : public Layer {
     stride_ = 0;
     pads_ = 0;
     dilations_ = 0;
-    implType_ = kDefault;
   }
   ConvolutionalLayer(size_t step, size_t pads, size_t dilations,
                      const Tensor& kernel, const Tensor& bias = Tensor(),
@@ -159,7 +156,7 @@ class ConvImpl : public LayerImpl<ValueType> {
 template <typename ValueType>
 void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
             Tensor& output, size_t stride_, size_t pads_, size_t group_,
-            size_t dilations_) {
+            size_t dilations_, ParBackend backend = ParBackend::kSeq) {
   size_t batch_size = input.get_shape()[0];
   size_t in_channels = input.get_shape()[1];
   size_t in_height = input.get_shape()[2];
   size_t in_width = input.get_shape()[3];
@@ -192,16 +189,22 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
           std::vector<std::vector<ValueType>>(
               in_width + 2 * pads_,
               std::vector<ValueType>(in_channels, 0))));

-  for (size_t b = 0; b < batch_size; ++b) {
-    for (size_t h = 0; h < in_height; ++h) {
-      for (size_t w = 0; w < in_width; ++w) {
-        for (size_t c = 0; c < in_channels; ++c) {
-          padded_input[b][h + pads_][w + pads_][c] =
-              input.get<ValueType>({b, c, h, w});
+  parallel::Options options;
+  options.backend = backend;
+
+  parallel::parallel_for(
+      batch_size,
+      [&](size_t b) {
+        for (size_t h = 0; h < in_height; ++h) {
+          for (size_t w = 0; w < in_width; ++w) {
+            for (size_t c = 0; c < in_channels; ++c) {
+              padded_input[b][h + pads_][w + pads_][c] =
+                  input.get<ValueType>({b, c, h, w});
+            }
+          }
         }
-      }
-    }
-  }
+      },
+      options);

   size_t dilated_kernel_height = (kernel_height - 1) * dilations_ + 1;
   size_t dilated_kernel_width = (kernel_width - 1) * dilations_ + 1;
@@ -213,16 +216,19 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
               dilated_kernel_height,
               std::vector<ValueType>(dilated_kernel_width, 0))));

-  for (size_t oc = 0; oc < out_channels; ++oc) {
-    for (size_t ic = 0; ic < kernel_in_channels; ++ic) {
-      for (size_t kh = 0; kh < kernel_height; ++kh) {
-        for (size_t kw = 0; kw < kernel_width; ++kw) {
-          dil_kernel[oc][ic][kh * dilations_][kw * dilations_] =
-              kernel_.get<ValueType>({oc, ic, kh, kw});
+  parallel::parallel_for(
+      out_channels,
+      [&](size_t oc) {
+        for (size_t ic = 0; ic < kernel_in_channels; ++ic) {
+          for (size_t kh = 0; kh < kernel_height; ++kh) {
+            for (size_t kw = 0; kw < kernel_width; ++kw) {
+              dil_kernel[oc][ic][kh * dilations_][kw * dilations_] =
+                  kernel_.get<ValueType>({oc, ic, kh, kw});
+            }
+          }
         }
-      }
-    }
-  }
+      },
+      options);

   std::vector<std::vector<std::vector<std::vector<ValueType>>>> output_tensor(
       batch_size,
       std::vector<std::vector<std::vector<ValueType>>>(
           out_channels,
           std::vector<std::vector<ValueType>>(
               out_height, std::vector<ValueType>(out_width, 0))));
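Note on the API used above: the diff calls `parallel::parallel_for(count, body, options)` but does not include the helper itself. As a rough mental model only (the repository's actual `parallel` namespace, with its TBB/OpenMP/Kokkos backends, is defined elsewhere and may differ), a minimal sequential-plus-std::thread sketch could look like this:

    // Hypothetical sketch, not the real it_lab_ai implementation.
    #include <cstddef>
    #include <thread>
    #include <vector>

    namespace parallel_sketch {

    enum class Backend { kSeq, kThreads };
    struct Options {
      Backend backend = Backend::kSeq;
    };

    template <typename Body>
    void parallel_for(std::size_t count, Body body, const Options& opts) {
      if (opts.backend == Backend::kSeq || count < 2) {
        for (std::size_t i = 0; i < count; ++i) body(i);
        return;
      }
      // Threads backend: split [0, count) into contiguous, near-equal ranges.
      std::size_t n = std::thread::hardware_concurrency();
      if (n == 0) n = 1;
      if (n > count) n = count;
      std::vector<std::thread> workers;
      for (std::size_t t = 0; t < n; ++t) {
        std::size_t begin = count * t / n;
        std::size_t end = count * (t + 1) / n;
        workers.emplace_back([&body, begin, end] {
          for (std::size_t i = begin; i < end; ++i) body(i);
        });
      }
      for (auto& w : workers) w.join();
    }

    }  // namespace parallel_sketch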
-  for (size_t b = 0; b < batch_size; ++b) {
-    for (size_t oc = 0; oc < out_channels; ++oc) {
-      for (size_t oh = 0; oh < out_height; ++oh) {
-        for (size_t ow = 0; ow < out_width; ++ow) {
-          ValueType value = 0;
-          size_t h_start = oh * stride_;
-          size_t w_start = ow * stride_;
-
-          size_t group = (group_ > 1) ? oc / (out_channels / group_) : 0;
-          size_t group_start_channel = group * (in_channels / group_);
-          size_t group_end_channel = (group + 1) * (in_channels / group_);
-
-          for (size_t ic = group_start_channel; ic < group_end_channel; ++ic) {
-            size_t kernel_ic = ic - group_start_channel;
-
-            for (size_t kh = 0; kh < dilated_kernel_height; ++kh) {
-              for (size_t kw = 0; kw < dilated_kernel_width; ++kw) {
-                size_t h_index = h_start + kh;
-                size_t w_index = w_start + kw;
-
-                if (h_index < padded_input[b].size() &&
-                    w_index < padded_input[b][h_index].size()) {
-                  value += padded_input[b][h_index][w_index][ic] *
-                           dil_kernel[oc][kernel_ic][kh][kw];
-                }
-              }
-            }
-          }
-
-          if (!bias_.empty() && oc < bias_.get_shape()[0]) {
-            value += bias_.get<ValueType>({oc});
-          }
-
-          output_tensor[b][oc][oh][ow] = value;
-        }
-      }
-    }
-  }
-
-  Shape output_shape({batch_size, out_channels, out_height, out_width});
-  std::vector<ValueType> flat_output(batch_size * out_channels * out_height *
-                                     out_width);
-
-  size_t index = 0;
-  for (size_t b = 0; b < batch_size; ++b) {
-    for (size_t oc = 0; oc < out_channels; ++oc) {
-      for (size_t h = 0; h < out_height; ++h) {
-        for (size_t w = 0; w < out_width; ++w) {
-          flat_output[index++] = output_tensor[b][oc][h][w];
-        }
-      }
-    }
-  }
-
-  output = make_tensor<ValueType>(flat_output, output_shape);
-}
-
-// NCHW -> NCHW only
-template <typename ValueType>
-void Conv4DSTL(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
-               Tensor& output, size_t stride_, size_t pads_, size_t group_,
-               size_t dilations_) {
-  size_t batch_size = input.get_shape()[0];
-  size_t in_channels = input.get_shape()[1];
-  size_t in_height = input.get_shape()[2];
-  size_t in_width = input.get_shape()[3];
-
-  size_t kernel_out_channels = kernel_.get_shape()[0];
-  size_t kernel_in_channels = kernel_.get_shape()[1];
-  size_t kernel_height = kernel_.get_shape()[2];
-  size_t kernel_width = kernel_.get_shape()[3];
-
-  unsigned num_threads = std::thread::hardware_concurrency();
-  std::vector<std::thread> threads;
-  size_t chunk_size = batch_size / num_threads;
-
-  std::vector<std::vector<std::vector<std::vector<ValueType>>>> padded_input =
-      std::vector<std::vector<std::vector<std::vector<ValueType>>>>(
-          batch_size,
-          std::vector<std::vector<std::vector<ValueType>>>(
-              in_height + 2 * pads_,
-              std::vector<std::vector<ValueType>>(
-                  in_width + 2 * pads_,
-                  std::vector<ValueType>(in_channels, 0))));
-  auto pad_input = [&](size_t start_b, size_t end_b) {
-    for (size_t b = start_b; b < end_b; ++b) {
-      for (size_t h = 0; h < in_height; ++h) {
-        for (size_t w = 0; w < in_width; ++w) {
-          for (size_t c = 0; c < in_channels; ++c) {
-            padded_input[b][h + pads_][w + pads_][c] =
-                input.get<ValueType>({b, c, h, w});
-          }
-        }
-      }
-    }
-  };
-
-  for (unsigned i = 0; i < num_threads; ++i) {
-    size_t start = i * chunk_size;
-    size_t end = (i == num_threads - 1) ? batch_size : start + chunk_size;
-    threads.emplace_back(pad_input, start, end);
-  }
-  for (auto& t : threads) t.join();
-  threads.clear();
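Incidentally, the hand-rolled splitting being removed here has two classic pitfalls: `std::thread::hardware_concurrency()` may return 0 (making `batch_size / num_threads` a division by zero), and whenever the trip count is smaller than `num_threads`, `chunk_size` is 0 and the final thread receives the entire range. A proportional split sidesteps both; `worker_range` below is a hypothetical helper, not part of this repository:

    #include <cstddef>

    // Hypothetical helper: balanced half-open range [begin, end) for worker t
    // of n. Unlike chunk = count / n, the remainder is spread one item at a
    // time instead of piling onto the last worker.
    inline void worker_range(std::size_t count, std::size_t n, std::size_t t,
                             std::size_t& begin, std::size_t& end) {
      begin = count * t / n;
      end = count * (t + 1) / n;
    }
    // count = 3, n = 8: the old scheme hands worker 7 all three items; here
    // three workers get one item each and the rest get empty ranges.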
-  std::vector<std::vector<std::vector<std::vector<ValueType>>>> dil_kernel =
-      std::vector<std::vector<std::vector<std::vector<ValueType>>>>(
-          kernel_height * dilations_ + 1 - dilations_,
-          std::vector<std::vector<std::vector<ValueType>>>(
-              kernel_width * dilations_ + 1 - dilations_,
-              std::vector<std::vector<ValueType>>(
-                  kernel_in_channels,
-                  std::vector<ValueType>(kernel_out_channels, 0))));
-
-  auto dilate_kernel = [&](size_t start_oc, size_t end_oc) {
-    for (size_t oc = start_oc; oc < end_oc; ++oc) {
-      for (size_t h = 0; h < kernel_height; ++h) {
-        for (size_t w = 0; w < kernel_width; ++w) {
-          for (size_t ic = 0; ic < kernel_in_channels; ++ic) {
-            dil_kernel[h * dilations_][w * dilations_][ic][oc] =
-                kernel_.get<ValueType>({oc, ic, h, w});
-          }
-        }
-      }
-    }
-  };
-
-  chunk_size = kernel_out_channels / num_threads;
-  for (unsigned i = 0; i < num_threads; ++i) {
-    size_t start = i * chunk_size;
-    size_t end =
-        (i == num_threads - 1) ? kernel_out_channels : start + chunk_size;
-    threads.emplace_back(dilate_kernel, start, end);
-  }
-  for (auto& t : threads) t.join();
-  threads.clear();
-
-  size_t out_height = ComputeConvOutputDim(in_height, kernel_height, stride_,
-                                           pads_, dilations_);
-  size_t out_width =
-      ComputeConvOutputDim(in_width, kernel_width, stride_, pads_, dilations_);
+  size_t total_work = batch_size * out_channels;
+  parallel::parallel_for(
+      total_work,
+      [&](size_t idx) {
+        size_t b = idx / out_channels;
+        size_t oc = idx % out_channels;

-  std::vector<std::vector<std::vector<std::vector<ValueType>>>> output_tensor(
-      batch_size, std::vector<std::vector<std::vector<ValueType>>>(
-                      kernel_out_channels,
-                      std::vector<std::vector<ValueType>>(
-                          out_height, std::vector<ValueType>(out_width, 0))));
-
-  auto compute_conv = [&](size_t start_oc, size_t end_oc) {
-    size_t dilated_kernel_height = kernel_height * dilations_ + 1 - dilations_;
-    size_t dilated_kernel_width = kernel_width * dilations_ + 1 - dilations_;
-
-    for (size_t b = 0; b < batch_size; ++b) {
-      for (size_t oc = start_oc; oc < end_oc; ++oc) {
-        for (size_t oh = 0; oh < out_height; oh++) {
-          for (size_t ow = 0; ow < out_width; ow++) {
+        for (size_t oh = 0; oh < out_height; ++oh) {
+          for (size_t ow = 0; ow < out_width; ++ow) {
             ValueType value = 0;
+            size_t h_start = oh * stride_;
+            size_t w_start = ow * stride_;

-            size_t group =
-                (group_ > 1) ? oc / (kernel_out_channels / group_) : 0;
+            size_t group = (group_ > 1) ? oc / (out_channels / group_) : 0;
             size_t group_start_channel = group * (in_channels / group_);
             size_t group_end_channel = (group + 1) * (in_channels / group_);
@@ -397,72 +259,54 @@ void Conv4DSTL(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
             for (size_t ic = group_start_channel; ic < group_end_channel;
                  ++ic) {
               size_t kernel_ic = ic - group_start_channel;

               for (size_t kh = 0; kh < dilated_kernel_height; ++kh) {
                 for (size_t kw = 0; kw < dilated_kernel_width; ++kw) {
-                  size_t h_index = oh * stride_ + kh;
-                  size_t w_index = ow * stride_ + kw;
+                  size_t h_index = h_start + kh;
+                  size_t w_index = w_start + kw;

                   if (h_index < padded_input[b].size() &&
                       w_index < padded_input[b][h_index].size()) {
                     value += padded_input[b][h_index][w_index][ic] *
-                             dil_kernel[kh][kw][kernel_ic][oc];
+                             dil_kernel[oc][kernel_ic][kh][kw];
                   }
                 }
               }
             }

-            if (!bias_.empty()) {
-              output_tensor[b][oc][oh][ow] =
-                  value + (*bias_.as<ValueType>())[oc];
-            } else {
-              output_tensor[b][oc][oh][ow] = value;
+            if (!bias_.empty() && oc < bias_.get_shape()[0]) {
+              value += bias_.get<ValueType>({oc});
             }
+
+            output_tensor[b][oc][oh][ow] = value;
           }
         }
-      }
-    }
-  };
-
-  chunk_size = kernel_out_channels / num_threads;
-  for (unsigned i = 0; i < num_threads; ++i) {
-    size_t start = i * chunk_size;
-    size_t end =
-        (i == num_threads - 1) ? kernel_out_channels : start + chunk_size;
-    threads.emplace_back(compute_conv, start, end);
-  }
-  for (auto& t : threads) t.join();
-  threads.clear();
+      },
+      options);
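The group arithmetic in the merged loop above is easy to misread: output channel `oc` belongs to group `oc / (out_channels / group_)`, which selects a contiguous slice of input channels, and the kernel's input-channel index is taken relative to that slice. A small self-contained check with illustrative numbers (not taken from the test suite):

    #include <cassert>
    #include <cstddef>

    int main() {
      // Illustrative values: 8 output channels, 4 input channels, 2 groups.
      std::size_t out_channels = 8, in_channels = 4, groups = 2;
      std::size_t oc = 5;  // an output channel inside the second group
      std::size_t g = oc / (out_channels / groups);           // == 1
      std::size_t ic_begin = g * (in_channels / groups);      // == 2
      std::size_t ic_end = (g + 1) * (in_channels / groups);  // == 4
      assert(g == 1 && ic_begin == 2 && ic_end == 4);
      // The kernel's input index is relative to the group: ic - ic_begin.
    }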
-  Shape sh({batch_size, kernel_out_channels, out_height, out_width});
-  std::vector<ValueType> one_d_vector(batch_size * out_height * out_width *
-                                      kernel_out_channels);
+  Shape output_shape({batch_size, out_channels, out_height, out_width});
+  std::vector<ValueType> flat_output(batch_size * out_channels * out_height *
+                                     out_width);

-  auto flatten_output = [&](size_t start_b, size_t end_b) {
-    size_t index_1d = start_b * kernel_out_channels * out_height * out_width;
-    for (size_t i = start_b; i < end_b; ++i) {
-      for (size_t l = 0; l < kernel_out_channels; ++l) {
-        for (size_t j = 0; j < out_height; ++j) {
-          for (size_t k = 0; k < out_width; ++k) {
-            one_d_vector[index_1d++] = output_tensor[i][l][j][k];
+  parallel::parallel_for(
+      batch_size,
+      [&](size_t b) {
+        size_t base_idx = b * out_channels * out_height * out_width;
+        for (size_t oc = 0; oc < out_channels; ++oc) {
+          for (size_t h = 0; h < out_height; ++h) {
+            for (size_t w = 0; w < out_width; ++w) {
+              flat_output[base_idx++] = output_tensor[b][oc][h][w];
+            }
           }
         }
-      }
-    }
-  };
-
-  chunk_size = batch_size / num_threads;
-  for (unsigned i = 0; i < num_threads; ++i) {
-    size_t start = i * chunk_size;
-    size_t end = (i == num_threads - 1) ? batch_size : start + chunk_size;
-    threads.emplace_back(flatten_output, start, end);
-  }
-  for (auto& t : threads) t.join();
+      },
+      options);

-  output = make_tensor(one_d_vector, sh);
+  output = make_tensor<ValueType>(flat_output, output_shape);
 }
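The per-batch `base_idx` works because NCHW flattening is the usual mixed-radix layout: each `b` owns a disjoint slab of `out_channels * out_height * out_width` elements, so the parallel writes into `flat_output` never alias. For reference, the general offset formula:

    #include <cstddef>

    // Flat offset of element (b, c, h, w) in an NCHW tensor of shape {B, C, H, W}.
    inline std::size_t nchw_index(std::size_t C, std::size_t H, std::size_t W,
                                  std::size_t b, std::size_t c, std::size_t h,
                                  std::size_t w) {
      return ((b * C + c) * H + h) * W + w;
    }
    // base_idx in the loop above is nchw_index(C, H, W, b, 0, 0, 0) = b * C * H * W.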
 template <typename ValueType>
 void DepthwiseConv4D(const Tensor& input, const Tensor& kernel_,
                      const Tensor& bias_, Tensor& output, size_t stride_,
-                     size_t pads_, size_t dilations_) {
+                     size_t pads_, size_t dilations_,
+                     ParBackend backend = ParBackend::kSeq) {
   size_t batch_size = input.get_shape()[0];
   size_t channels = input.get_shape()[1];
   size_t in_height = input.get_shape()[2];
   size_t in_width = input.get_shape()[3];
@@ -485,44 +329,55 @@ void DepthwiseConv4D(const Tensor& input, const Tensor& kernel_,
   Tensor output_tensor(Shape({batch_size, channels, out_height, out_width}),
                        input.get_type());

-  for (size_t b = 0; b < batch_size; ++b) {
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t oh = 0; oh < out_height; ++oh) {
-        for (size_t ow = 0; ow < out_width; ++ow) {
-          ValueType sum = 0;
+  parallel::Options options;
+  options.backend = backend;

-          for (size_t kh = 0; kh < kernel_height; ++kh) {
-            for (size_t kw = 0; kw < kernel_width; ++kw) {
-              size_t ih = oh * stride_ + kh * dilations_ - pads_;
-              size_t iw = ow * stride_ + kw * dilations_ - pads_;
+  size_t total_work = batch_size * channels;

-              if (ih < in_height && iw < in_width) {
-                auto input_val = input.get<ValueType>({b, c, ih, iw});
-                auto kernel_val = kernel_.get<ValueType>({c, 0, kh, kw});
+  parallel::parallel_for(
+      total_work,
+      [&](size_t idx) {
+        size_t b = idx / channels;
+        size_t c = idx % channels;

-                sum += input_val * kernel_val;
+        for (size_t oh = 0; oh < out_height; ++oh) {
+          for (size_t ow = 0; ow < out_width; ++ow) {
+            ValueType sum = 0;
+
+            for (size_t kh = 0; kh < kernel_height; ++kh) {
+              for (size_t kw = 0; kw < kernel_width; ++kw) {
+                size_t ih = oh * stride_ + kh * dilations_;
+                size_t iw = ow * stride_ + kw * dilations_;
+
+                if (ih >= pads_ && iw >= pads_ && (ih - pads_) < in_height &&
+                    (iw - pads_) < in_width) {
+                  auto input_val =
+                      input.get<ValueType>({b, c, ih - pads_, iw - pads_});
+                  auto kernel_val = kernel_.get<ValueType>({c, 0, kh, kw});
+                  sum += input_val * kernel_val;
+                }
               }
             }
-          }
-          if (!bias_.empty() && c < bias_.get_shape()[0]) {
-            sum += bias_.get<ValueType>({c});
-          }
+            if (!bias_.empty() && c < bias_.get_shape()[0]) {
+              sum += bias_.get<ValueType>({c});
+            }

-          output_tensor.set({b, c, oh, ow}, sum);
+            output_tensor.set({b, c, oh, ow}, sum);
+          }
         }
-      }
-    }
-  }
+      },
+      options);

   output = output_tensor;
 }
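The reordered bounds test in the depthwise kernel is a correctness point, not a style one: with `size_t`, the old `oh * stride_ + kh * dilations_ - pads_` wraps around to a huge value whenever the sum is smaller than `pads_`, and the subsequent `ih < in_height` check rejected it only by accident. Checking `ih >= pads_` before subtracting makes the intent explicit. A minimal illustration of the wraparound:

    #include <cstddef>
    #include <iostream>

    int main() {
      std::size_t oh = 0, stride = 1, kh = 0, dilations = 1, pads = 1;
      std::size_t ih = oh * stride + kh * dilations - pads;  // wraps around
      std::cout << ih << "\n";  // 18446744073709551615 on 64-bit targets
    }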
-// NCHW -> NCHW only
+// NCHW -> NCHW only (Legacy version)
 template <typename ValueType>
 void Conv4D_Legacy(const Tensor& input, const Tensor& kernel_,
                    const Tensor& bias_, Tensor& output, size_t stride_,
-                   size_t pads_, size_t dilations_) {
+                   size_t pads_, size_t dilations_,
+                   ParBackend backend = ParBackend::kSeq) {
   size_t batch_size = input.get_shape()[0];
   size_t in_height = input.get_shape()[2];
   size_t in_width = input.get_shape()[3];
@@ -533,41 +388,54 @@ void Conv4D_Legacy(const Tensor& input, const Tensor& kernel_,
   size_t kernel_in_channels = kernel_.get_shape()[2];
   size_t kernel_out_channels = kernel_.get_shape()[3];

-  std::vector<std::vector<std::vector<std::vector<ValueType>>>> padded_input =
-      std::vector<std::vector<std::vector<std::vector<ValueType>>>>(
-          batch_size,
-          std::vector<std::vector<std::vector<ValueType>>>(
-              in_height + 2 * pads_,
-              std::vector<std::vector<ValueType>>(
-                  in_width + 2 * pads_,
-                  std::vector<ValueType>(in_channels, 0))));
-  for (size_t b = 0; b < batch_size; ++b) {
-    for (size_t h = 0; h < in_height; ++h) {
-      for (size_t w = 0; w < in_width; ++w) {
-        for (size_t c = 0; c < in_channels; ++c) {
-          padded_input[b][h + pads_][w + pads_][c] =
-              input.get<ValueType>({b, c, h, w});
+  parallel::Options options;
+  options.backend = backend;
+
+  std::vector<std::vector<std::vector<std::vector<ValueType>>>> padded_input(
+      batch_size,
+      std::vector<std::vector<std::vector<ValueType>>>(
+          in_height + 2 * pads_,
+          std::vector<std::vector<ValueType>>(
+              in_width + 2 * pads_, std::vector<ValueType>(in_channels, 0))));
+
+  parallel::parallel_for(
+      batch_size,
+      [&](size_t b) {
+        for (size_t h = 0; h < in_height; ++h) {
+          for (size_t w = 0; w < in_width; ++w) {
+            for (size_t c = 0; c < in_channels; ++c) {
+              padded_input[b][h + pads_][w + pads_][c] =
+                  input.get<ValueType>({b, c, h, w});
+            }
+          }
         }
-      }
-    }
-  }
-  std::vector<std::vector<std::vector<std::vector<ValueType>>>> dil_kernel =
-      std::vector<std::vector<std::vector<std::vector<ValueType>>>>(
-          kernel_height * dilations_ + 1 - dilations_,
-          std::vector<std::vector<std::vector<ValueType>>>(
-              kernel_width * dilations_ + 1 - dilations_,
-              std::vector<std::vector<ValueType>>(
-                  kernel_in_channels,
-                  std::vector<ValueType>(kernel_out_channels, 0))));
-  for (size_t b = 0; b < kernel_out_channels; ++b) {
-    for (size_t h = 0; h < kernel_height; ++h) {
-      for (size_t w = 0; w < kernel_width; ++w) {
-        for (size_t c = 0; c < kernel_in_channels; ++c) {
-          dil_kernel[h * dilations_][w * dilations_][c][b] =
-              kernel_.get<ValueType>({h, w, c, b});
+      },
+      options);
+
+  size_t dilated_kernel_height = kernel_height * dilations_ + 1 - dilations_;
+  size_t dilated_kernel_width = kernel_width * dilations_ + 1 - dilations_;
+
+  std::vector<std::vector<std::vector<std::vector<ValueType>>>> dil_kernel(
+      dilated_kernel_height,
+      std::vector<std::vector<std::vector<ValueType>>>(
+          dilated_kernel_width,
+          std::vector<std::vector<ValueType>>(
+              kernel_in_channels,
+              std::vector<ValueType>(kernel_out_channels, 0))));
+
+  parallel::parallel_for(
+      kernel_out_channels,
+      [&](size_t b) {
+        for (size_t h = 0; h < kernel_height; ++h) {
+          for (size_t w = 0; w < kernel_width; ++w) {
+            for (size_t c = 0; c < kernel_in_channels; ++c) {
+              dil_kernel[h * dilations_][w * dilations_][c][b] =
+                  kernel_.get<ValueType>({h, w, c, b});
+            }
+          }
         }
-      }
-    }
-  }
+      },
+      options);

   size_t out_height = ComputeConvOutputDim(in_height, kernel_height, stride_,
                                            pads_, dilations_);
@@ -579,44 +447,55 @@ void Conv4D_Legacy(const Tensor& input, const Tensor& kernel_,
                       kernel_out_channels,
                       std::vector<std::vector<ValueType>>(
                           out_height, std::vector<ValueType>(out_width, 0))));
-  for (size_t b = 0; b < batch_size; ++b) {
-    for (size_t c = 0; c < kernel_out_channels; ++c) {
-      for (size_t i = 0; i < out_height; i += stride_) {
-        for (size_t j = 0; j < out_width; j += stride_) {
-          ValueType value = 0;
-          for (size_t ic = 0; ic < in_channels; ++ic) {
-            for (size_t h = 0; h < kernel_height * dilations_ + 1 - dilations_;
-                 ++h) {
-              for (size_t w = 0; w < kernel_width * dilations_ + 1 - dilations_;
-                   ++w) {
-                value +=
-                    padded_input[b][i + h][j + w][ic] * dil_kernel[h][w][ic][c];
+
+  size_t total_work = batch_size * kernel_out_channels;
+
+  parallel::parallel_for(
+      total_work,
+      [&](size_t idx) {
+        size_t b = idx / kernel_out_channels;
+        size_t c = idx % kernel_out_channels;
+
+        for (size_t i = 0; i < out_height; i += stride_) {
+          for (size_t j = 0; j < out_width; j += stride_) {
+            ValueType value = 0;
+            for (size_t ic = 0; ic < in_channels; ++ic) {
+              for (size_t h = 0; h < dilated_kernel_height; ++h) {
+                for (size_t w = 0; w < dilated_kernel_width; ++w) {
+                  value += padded_input[b][i + h][j + w][ic] *
+                           dil_kernel[h][w][ic][c];
+                }
               }
             }
-          }
-          if (!bias_.empty()) {
-            output_tensor[b][c][i][j] = value + (*bias_.as<ValueType>())[c];
-          } else {
-            output_tensor[b][c][i][j] = value;
+            if (!bias_.empty()) {
+              output_tensor[b][c][i][j] = value + (*bias_.as<ValueType>())[c];
+            } else {
+              output_tensor[b][c][i][j] = value;
+            }
           }
         }
-      }
-    }
-  }
+      },
+      options);

   Shape sh({batch_size, kernel_out_channels, out_height, out_width});
   std::vector<ValueType> one_d_vector(batch_size * out_height * out_width *
                                       kernel_out_channels);
-  size_t index_1d = 0;
-  for (size_t i = 0; i < batch_size; ++i) {
-    for (size_t l = 0; l < kernel_out_channels; ++l) {
-      for (size_t j = 0; j < out_height; ++j) {
-        for (size_t k = 0; k < out_width; ++k) {
-          one_d_vector[index_1d++] = output_tensor[i][l][j][k];
+
+  parallel::parallel_for(
+      batch_size,
+      [&](size_t i) {
+        size_t base_idx = i * kernel_out_channels * out_height * out_width;
+        for (size_t l = 0; l < kernel_out_channels; ++l) {
+          for (size_t j = 0; j < out_height; ++j) {
+            for (size_t k = 0; k < out_width; ++k) {
+              one_d_vector[base_idx++] = output_tensor[i][l][j][k];
+            }
+          }
         }
-      }
-    }
-  }
+      },
+      options);
+
   output = make_tensor<ValueType>(one_d_vector, sh);
 }

 }  // namespace it_lab_ai
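For reading the output-size arithmetic in the tests further below: `ComputeConvOutputDim` presumably evaluates the standard formula out = (in + 2*pads - dilations*(kernel - 1) - 1) / stride + 1, which the tests inline by hand. A sketch under that assumption (the repository's own definition is not part of this diff):

    #include <cstddef>

    // Assumed behaviour of ComputeConvOutputDim (standard ONNX-style sizing).
    inline std::size_t conv_out_dim(std::size_t in, std::size_t kernel,
                                    std::size_t stride, std::size_t pads,
                                    std::size_t dilations) {
      return (in + 2 * pads - dilations * (kernel - 1) - 1) / stride + 1;
    }
    // conv_out_dim(224, 3, 1, 1, 1) == 224 and conv_out_dim(112, 3, 2, 1, 1) == 56,
    // matching the hand-inlined expressions in the tests.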
diff --git a/src/layers/ConvLayer.cpp b/src/layers/ConvLayer.cpp
index 60bce718..63d7ebad 100644
--- a/src/layers/ConvLayer.cpp
+++ b/src/layers/ConvLayer.cpp
@@ -17,16 +19,19 @@ void ConvolutionalLayer::run(const std::vector<Tensor>& input,
   if (input[0].get_shape().dims() != 4) {
     throw std::out_of_range("input must be 4-dimensional");
   }
+
+  ParBackend backend = options.getEffectiveParBackend();
+
   if (group_ > 1) {
     if (group_ == input[0].get_shape()[1] && group_ == kernel_.get_shape()[0]) {
       switch (input[0].get_type()) {
         case Type::kFloat:
           DepthwiseConv4D<float>(input[0], kernel_, bias_, output[0], stride_,
-                                 pads_, dilations_);
+                                 pads_, dilations_, backend);
           break;
         case Type::kInt:
           DepthwiseConv4D<int>(input[0], kernel_, bias_, output[0], stride_,
-                               pads_, dilations_);
+                               pads_, dilations_, backend);
           break;
         default:
           throw std::runtime_error(
@@ -35,17 +38,7 @@ void ConvolutionalLayer::run(const std::vector<Tensor>& input,
       return;
     }
   }
-  if (options.parallel) {
-    switch (options.par_backend) {
-      case ParBackend::kThreads:
-        implType_ = kSTL;
-        break;
-      case ParBackend::kSeq:
-      default:
-        implType_ = kDefault;
-        break;
-    }
-  }
+
   switch (input[0].get_type()) {
     case Type::kInt: {
       if (kernel_.get_shape().dims() == 2) {
@@ -96,18 +89,8 @@ void ConvolutionalLayer::run(const std::vector<Tensor>& input,
                 2)),
             sh);
       } else {
-        switch (implType_) {
-          case kSTL: {
-            Conv4DSTL<int>(input[0], kernel_, bias_, output[0], stride_, pads_,
-                           group_, dilations_);
-            break;
-          }
-          default: {
-            Conv4D<int>(input[0], kernel_, bias_, output[0], stride_, pads_,
-                        group_, dilations_);
-            break;
-          }
-        }
+        Conv4D<int>(input[0], kernel_, bias_, output[0], stride_, pads_,
+                    group_, dilations_, backend);
       }
       break;
     }
@@ -162,26 +145,16 @@ void ConvolutionalLayer::run(const std::vector<Tensor>& input,
       } else {
         if (useLegacyImpl_) {
           Conv4D_Legacy<float>(input[0], kernel_, bias_, output[0], stride_,
-                               pads_, dilations_);
+                               pads_, dilations_, backend);
         } else {
-          switch (implType_) {
-            case kSTL: {
-              Conv4DSTL<float>(input[0], kernel_, bias_, output[0], stride_,
-                               pads_, group_, dilations_);
-              break;
-            }
-            default: {
-              Conv4D<float>(input[0], kernel_, bias_, output[0], stride_,
-                            pads_, group_, dilations_);
-              break;
-            }
-          }
+          Conv4D<float>(input[0], kernel_, bias_, output[0], stride_, pads_,
+                        group_, dilations_, backend);
         }
-        break;
-      }
-      default: {
-        throw std::runtime_error("Unsupported tensor type");
       }
+      break;
+    }
+    default: {
+      throw std::runtime_error("Unsupported tensor type");
     }
   }
 }
diff --git a/test/single_layer/test_convlayer.cpp b/test/single_layer/test_convlayer.cpp
index 664ac313..0ffbd6fa 100644
--- a/test/single_layer/test_convlayer.cpp
+++ b/test/single_layer/test_convlayer.cpp
@@ -388,7 +388,8 @@ TEST(ConvolutionalLayerTest, Conv4DSTLFloatWithGroups) {
   std::vector<float> output_vec(16, 0.0f);
   Tensor output = make_tensor<float>(output_vec, output_shape);

-  Conv4DSTL<float>(input, kernel, Tensor(), output, 1, 0, 2, 1);
+  Conv4D<float>(input, kernel, Tensor(), output, 1, 0, 2, 1,
+                ParBackend::kThreads);

   std::vector<float> result = *output.as<float>();
@@ -423,7 +424,7 @@ TEST(ConvolutionalLayerTest, Conv4DSTLFloatComplex) {
   std::vector<float> output_vec(8, 0.0f);
   Tensor output = make_tensor<float>(output_vec, output_shape);

-  Conv4DSTL<float>(input, kernel, bias, output, 1, 0, 1, 1);
+  Conv4D<float>(input, kernel, bias, output, 1, 0, 1, 1, ParBackend::kThreads);

   std::vector<float> result = *output.as<float>();
@@ -495,7 +496,7 @@ TEST(ConvolutionalLayerTest, Conv4DSTLFloatBasic) {
   std::vector<float> output_vec(8, 0.0f);
   Tensor output = make_tensor<float>(output_vec, output_shape);

-  Conv4DSTL<float>(input, kernel, bias, output, 1, 0, 1, 1);
+  Conv4D<float>(input, kernel, bias, output, 1, 0, 1, 1, ParBackend::kThreads);

   std::vector<float> result = *output.as<float>();
@@ -523,7 +524,8 @@ TEST(ConvolutionalLayerTest, Conv4DSTLFloatWithPaddingAndStride) {
       0.0f);
   Tensor output = make_tensor<float>(output_vec, output_shape);

-  Conv4DSTL<float>(input, kernel, Tensor(), output, 2, 1, 1, 1);
+  Conv4D<float>(input, kernel, Tensor(), output, 2, 1, 1, 1,
+                ParBackend::kThreads);

   std::vector<float> result = *output.as<float>();
@@ -547,7 +549,8 @@ TEST(ConvolutionalLayerTest, Conv4DSTLFloatCompareWithConv4D) {
   Shape output_shape2({1, 1, 1, 1});
   std::vector<float> output_vec2(1, 0.0f);
   Tensor output2 = make_tensor<float>(output_vec2, output_shape2);
-  Conv4DSTL<float>(input, kernel, Tensor(), output2, 1, 0, 1, 1);
+  Conv4D<float>(input, kernel, Tensor(), output2, 1, 0, 1, 1,
+                ParBackend::kThreads);

   float result1 = (*output1.as<float>())[0];
   float result2 = (*output2.as<float>())[0];
diff --git a/test/single_layer_parall_version/test_convlayer_parall.cpp b/test/single_layer_parall_version/test_convlayer_parall.cpp
new file mode 100644
index 00000000..c2420eae
--- /dev/null
+++ b/test/single_layer_parall_version/test_convlayer_parall.cpp
@@ -0,0 +1,277 @@
+#include <chrono>
+#include <iostream>
+#include <vector>
+#include
+#include
+
+#include "gtest/gtest.h"
+#include "layers/ConvLayer.hpp"
+
+#define ENABLE_TIMING_OUTPUT 1
+
+#if ENABLE_TIMING_OUTPUT
+#define PRINT_TIMING(msg) std::cout << msg << std::endl
+#else
+#define PRINT_TIMING(msg) ((void)0)
+#endif
+
+using namespace it_lab_ai;
+
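The tests below time each backend with `high_resolution_clock`. For wall-clock benchmarking, `std::chrono::steady_clock` is usually the safer choice: it is guaranteed monotonic, whereas `high_resolution_clock` is permitted to alias `system_clock` and jump with clock adjustments. A drop-in sketch of the same timing pattern:

    #include <chrono>

    // Hypothetical monotonic variant of the timing pattern used in the tests.
    template <typename F>
    long long time_ms(F&& f) {
      auto start = std::chrono::steady_clock::now();
      f();
      auto end = std::chrono::steady_clock::now();
      return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
          .count();
    }
    // Usage: auto ms = time_ms([&] { layer.run(in, out, options); });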
+TEST(convlayer_parall, parallel_conv_basic) {
+  size_t batch_size = 32;
+  std::vector<float> image(batch_size * 3 * 224 * 224, 1.0f);
+  Shape input_shape({batch_size, 3, 224, 224});
+  Tensor input = make_tensor<float>(image, input_shape);
+
+  std::vector<float> kernelvec(64 * 3 * 3 * 3, 1.0f);
+  Shape kernel_shape({64, 3, 3, 3});
+  Tensor kernel = make_tensor<float>(kernelvec, kernel_shape);
+
+  size_t out_height = (224 + 2 * 1 - 1 * (3 - 1) - 1) / 1 + 1;
+  size_t out_width = (224 + 2 * 1 - 1 * (3 - 1) - 1) / 1 + 1;
+  Shape output_shape({batch_size, 64, out_height, out_width});
+  std::vector<float> output_vec(batch_size * 64 * out_height * out_width,
+                                0.0f);
+  Tensor output = make_tensor<float>(output_vec, output_shape);
+
+  ConvolutionalLayer layer(1, 1, 1, kernel);
+  std::vector<Tensor> in{input};
+  std::vector<Tensor> out{output};
+
+  std::vector<ParBackend> backends = {ParBackend::kSeq, ParBackend::kThreads,
+                                      ParBackend::kTbb, ParBackend::kOmp,
+                                      ParBackend::kKokkos};
+
+  for (auto backend : backends) {
+    RuntimeOptions options;
+    options.setParallelBackend(backend);
+
+    auto start = std::chrono::high_resolution_clock::now();
+    layer.run(in, out, options);
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration =
+        std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+    PRINT_TIMING("Backend " << static_cast<int>(backend)
+                            << " time: " << duration.count()
+                            << " ms (batch=" << batch_size << ")");
+
+    EXPECT_EQ(out[0].get_shape()[0], batch_size);
+    EXPECT_EQ(out[0].get_shape()[1], 64);
+  }
+}
+
+TEST(convlayer_parall, parallel_conv_stride2) {
+  size_t batch_size = 64;
+  std::vector<float> image(batch_size * 16 * 112 * 112, 1.0f);
+  Shape input_shape({batch_size, 16, 112, 112});
+  Tensor input = make_tensor<float>(image, input_shape);
+
+  std::vector<float> kernelvec(32 * 16 * 3 * 3, 1.0f);
+  Shape kernel_shape({32, 16, 3, 3});
+  Tensor kernel = make_tensor<float>(kernelvec, kernel_shape);
+
+  size_t out_height = (112 + 2 * 1 - 1 * (3 - 1) - 1) / 2 + 1;
+  size_t out_width = (112 + 2 * 1 - 1 * (3 - 1) - 1) / 2 + 1;
+  Shape output_shape({batch_size, 32, out_height, out_width});
+  std::vector<float> output_vec(batch_size * 32 * out_height * out_width,
+                                0.0f);
+  Tensor output = make_tensor<float>(output_vec, output_shape);
+
+  ConvolutionalLayer layer(2, 1, 1, kernel);
+  std::vector<Tensor> in{input};
+  std::vector<Tensor> out{output};
+
+  std::vector<ParBackend> backends = {ParBackend::kSeq, ParBackend::kThreads,
+                                      ParBackend::kTbb, ParBackend::kOmp,
+                                      ParBackend::kKokkos};
+
+  for (auto backend : backends) {
+    RuntimeOptions options;
+    options.setParallelBackend(backend);
+
+    auto start = std::chrono::high_resolution_clock::now();
+    layer.run(in, out, options);
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration =
+        std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+    PRINT_TIMING("Backend " << static_cast<int>(backend)
+                            << " time: " << duration.count()
+                            << " ms (batch=" << batch_size << ")");
+
+    EXPECT_EQ(out[0].get_shape()[0], batch_size);
+    EXPECT_EQ(out[0].get_shape()[2], out_height);
+    EXPECT_EQ(out[0].get_shape()[3], out_width);
+  }
+}
+
+TEST(convlayer_parall, parallel_depthwise_conv) {
+  size_t batch_size = 128;
+  std::vector<float> image(batch_size * 32 * 56 * 56, 1.0f);
+  Shape input_shape({batch_size, 32, 56, 56});
+  Tensor input = make_tensor<float>(image, input_shape);
+
+  std::vector<float> kernelvec(32 * 1 * 3 * 3, 1.0f);
+  Shape kernel_shape({32, 1, 3, 3});
+  Tensor kernel = make_tensor<float>(kernelvec, kernel_shape);
+
+  size_t out_height = (56 + 2 * 1 - 1 * (3 - 1) - 1) / 1 + 1;
+  size_t out_width = (56 + 2 * 1 - 1 * (3 - 1) - 1) / 1 + 1;
+  Shape output_shape({batch_size, 32, out_height, out_width});
+  std::vector<float> output_vec(batch_size * 32 * out_height * out_width,
+                                0.0f);
+  Tensor output = make_tensor<float>(output_vec, output_shape);
+
+  ConvolutionalLayer layer(1, 1, 1, kernel, Tensor(), 32);
+  std::vector<Tensor> in{input};
+  std::vector<Tensor> out{output};
+
+  std::vector<ParBackend> backends = {ParBackend::kSeq, ParBackend::kThreads,
+                                      ParBackend::kTbb, ParBackend::kOmp,
+                                      ParBackend::kKokkos};
+
+  for (auto backend : backends) {
+    RuntimeOptions options;
+    options.setParallelBackend(backend);
+
+    auto start = std::chrono::high_resolution_clock::now();
+    layer.run(in, out, options);
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration =
+        std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+    PRINT_TIMING("Depthwise Backend " << static_cast<int>(backend)
+                                      << " time: " << duration.count()
+                                      << " ms (batch=" << batch_size << ")");
+
+    EXPECT_EQ(out[0].get_shape()[0], batch_size);
+    EXPECT_EQ(out[0].get_shape()[1], 32);
+  }
+}
+
+TEST(convlayer_parall, parallel_conv_with_bias) {
+  size_t batch_size = 16;
+  std::vector<int> image(batch_size * 16 * 28 * 28, 1);
+  Shape input_shape({batch_size, 16, 28, 28});
+  Tensor input = make_tensor<int>(image, input_shape);
+
+  std::vector<int> kernelvec(36 * 16 * 5 * 5, 1);
+  Shape kernel_shape({36, 16, 5, 5});
+  Tensor kernel = make_tensor<int>(kernelvec, kernel_shape);
+
+  std::vector<int> biasvec(36, 10);
+  Tensor bias = make_tensor<int>(biasvec, Shape({36}));
+
+  size_t pads = (kernel.get_shape()[2] - 1) / 2;
+  size_t out_height = (28 + 2 * pads - 1 * (5 - 1) - 1) / 1 + 1;
+  size_t out_width = (28 + 2 * pads - 1 * (5 - 1) - 1) / 1 + 1;
+  Shape output_shape({batch_size, 36, out_height, out_width});
+  std::vector<int> output_vec(batch_size * 36 * out_height * out_width, 0);
+  Tensor output = make_tensor<int>(output_vec, output_shape);
+
+  ConvolutionalLayer layer(1, pads, 1, kernel, bias);
+  std::vector<Tensor> in{input};
+  std::vector<Tensor> out{output};
+
+  std::vector<ParBackend> backends = {ParBackend::kSeq, ParBackend::kThreads,
+                                      ParBackend::kTbb, ParBackend::kOmp,
+                                      ParBackend::kKokkos};
+
+  for (auto backend : backends) {
+    RuntimeOptions options;
+    options.setParallelBackend(backend);
+
+    auto start = std::chrono::high_resolution_clock::now();
+    layer.run(in, out, options);
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration =
+        std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+    PRINT_TIMING("Backend " << static_cast<int>(backend)
+                            << " time: " << duration.count()
+                            << " ms (batch=" << batch_size << ")");
+
+    EXPECT_EQ(out[0].get_shape()[0], batch_size);
+    std::vector<int> result = *out[0].as<int>();
+    EXPECT_GT(result[0], 0);
+  }
+}
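The `pads = (kernel_size - 1) / 2` choice in the bias test above is the usual "same" padding for odd kernels at stride 1: plugging it into the output-size formula returns the input size unchanged. Using the `conv_out_dim` sketch from earlier as a constexpr check:

    #include <cstddef>

    constexpr std::size_t conv_out_dim_c(std::size_t in, std::size_t k,
                                         std::size_t s, std::size_t p,
                                         std::size_t d) {
      return (in + 2 * p - d * (k - 1) - 1) / s + 1;
    }
    // The bias test: in = 28, k = 5, pads = (5 - 1) / 2 = 2 -> out = 28.
    static_assert(conv_out_dim_c(28, 5, 1, (5 - 1) / 2, 1) == 28,
                  "odd kernel + (k-1)/2 padding preserves spatial size");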
Backend " << static_cast(backend) + << " time: " << duration.count() + << " ms (batch=" << batch_size << ")"); + + EXPECT_EQ(out[0].get_shape()[0], batch_size); + EXPECT_EQ(out[0].get_shape()[1], 16); + } +} + +TEST(convlayer_parall, parallel_conv_single_image) { + size_t batch_size = 1; + std::vector image(batch_size * 3 * 224 * 224, 1.0f); + Shape input_shape({batch_size, 3, 224, 224}); + Tensor input = make_tensor(image, input_shape); + + std::vector kernelvec(64 * 3 * 3 * 3, 1.0f); + Shape kernel_shape({64, 3, 3, 3}); + Tensor kernel = make_tensor(kernelvec, kernel_shape); + + size_t out_height = (224 + 2 * 1 - 1 * (3 - 1) - 1) / 1 + 1; + size_t out_width = (224 + 2 * 1 - 1 * (3 - 1) - 1) / 1 + 1; + Shape output_shape({batch_size, 64, out_height, out_width}); + std::vector output_vec(batch_size * 64 * out_height * out_width, 0.0f); + Tensor output = make_tensor(output_vec, output_shape); + + ConvolutionalLayer layer(1, 1, 1, kernel); + std::vector in{input}; + std::vector out{output}; + + std::vector backends = {ParBackend::kSeq, ParBackend::kThreads, + ParBackend::kTbb, ParBackend::kOmp, + ParBackend::kKokkos}; + + for (auto backend : backends) { + RuntimeOptions options; + options.setParallelBackend(backend); + + auto start = std::chrono::high_resolution_clock::now(); + layer.run(in, out, options); + auto end = std::chrono::high_resolution_clock::now(); + auto duration = + std::chrono::duration_cast(end - start); + PRINT_TIMING("Single Image Backend " << static_cast(backend) + << " time: " << duration.count() + << " ms (batch=" << batch_size << ")"); + + EXPECT_EQ(out[0].get_shape()[0], batch_size); + EXPECT_EQ(out[0].get_shape()[1], 64); + } +} \ No newline at end of file