From b8e52e3542cb565fe320efb5b5fd7a8dd576a69d Mon Sep 17 00:00:00 2001
From: AndreySorokin7
Date: Wed, 21 Jan 2026 14:05:03 +0300
Subject: [PATCH] first commit

---
 include/layers/ConvLayer.hpp          | 491 +++++++-----------
 src/layers/ConvLayer.cpp              |  57 +-
 test/single_layer/test_convlayer.cpp  |  13 +-
 .../test_convlayer_parall.cpp         | 277 ++++++++++
 4 files changed, 485 insertions(+), 353 deletions(-)
 create mode 100644 test/single_layer_parall_version/test_convlayer_parall.cpp

diff --git a/include/layers/ConvLayer.hpp b/include/layers/ConvLayer.hpp
index 720f68f3..57c521cc 100644
--- a/include/layers/ConvLayer.hpp
+++ b/include/layers/ConvLayer.hpp
@@ -1,7 +1,6 @@
 #pragma once
 #include
 #include
-#include <thread>
 #include

 #include "layers/Layer.hpp"
@@ -26,7 +25,6 @@ class ConvolutionalLayer : public Layer {
   Tensor kernel_;
   Tensor bias_;
   size_t group_;
-  ImplType implType_;
   bool useLegacyImpl_;

  public:
@@ -34,7 +32,6 @@ class ConvolutionalLayer : public Layer {
     stride_ = 0;
     pads_ = 0;
     dilations_ = 0;
-    implType_ = kDefault;
   }
   ConvolutionalLayer(size_t step, size_t pads, size_t dilations,
                      const Tensor& kernel, const Tensor& bias = Tensor(),
@@ -159,7 +156,7 @@ class ConvImpl : public LayerImpl<ValueType> {
 template <typename ValueType>
 void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
             Tensor& output, size_t stride_, size_t pads_, size_t group_,
-            size_t dilations_) {
+            size_t dilations_, ParBackend backend = ParBackend::kSeq) {
   size_t batch_size = input.get_shape()[0];
   size_t in_channels = input.get_shape()[1];
   size_t in_height = input.get_shape()[2];
   size_t in_width = input.get_shape()[3];
@@ -192,16 +189,22 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
           std::vector<std::vector<ValueType>>(
               in_width + 2 * pads_,
               std::vector<ValueType>(in_channels, 0))));

-  for (size_t b = 0; b < batch_size; ++b) {
-    for (size_t h = 0; h < in_height; ++h) {
-      for (size_t w = 0; w < in_width; ++w) {
-        for (size_t c = 0; c < in_channels; ++c) {
-          padded_input[b][h + pads_][w + pads_][c] =
-              input.get<ValueType>({b, c, h, w});
+  parallel::Options options;
+  options.backend = backend;
+
+  parallel::parallel_for(
+      batch_size,
+      [&](size_t b) {
+        for (size_t h = 0; h < in_height; ++h) {
+          for (size_t w = 0; w < in_width; ++w) {
+            for (size_t c = 0; c < in_channels; ++c) {
+              padded_input[b][h + pads_][w + pads_][c] =
+                  input.get<ValueType>({b, c, h, w});
+            }
+          }
         }
-      }
-    }
-  }
+      },
+      options);

   size_t dilated_kernel_height = (kernel_height - 1) * dilations_ + 1;
   size_t dilated_kernel_width = (kernel_width - 1) * dilations_ + 1;
@@ -213,16 +216,19 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
               dilated_kernel_height,
               std::vector<ValueType>(dilated_kernel_width, 0))));

-  for (size_t oc = 0; oc < out_channels; ++oc) {
-    for (size_t ic = 0; ic < kernel_in_channels; ++ic) {
-      for (size_t kh = 0; kh < kernel_height; ++kh) {
-        for (size_t kw = 0; kw < kernel_width; ++kw) {
-          dil_kernel[oc][ic][kh * dilations_][kw * dilations_] =
-              kernel_.get<ValueType>({oc, ic, kh, kw});
+  parallel::parallel_for(
+      out_channels,
+      [&](size_t oc) {
+        for (size_t ic = 0; ic < kernel_in_channels; ++ic) {
+          for (size_t kh = 0; kh < kernel_height; ++kh) {
+            for (size_t kw = 0; kw < kernel_width; ++kw) {
+              dil_kernel[oc][ic][kh * dilations_][kw * dilations_] =
+                  kernel_.get<ValueType>({oc, ic, kh, kw});
+            }
+          }
         }
-      }
-    }
-  }
+      },
+      options);

   std::vector<std::vector<std::vector<std::vector<ValueType>>>> output_tensor(
       batch_size,
       std::vector<std::vector<std::vector<ValueType>>>(
           out_channels,
           std::vector<std::vector<ValueType>>(
               out_height, std::vector<ValueType>(out_width, 0))));
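Note on the API used above: the diff calls `parallel::parallel_for(count, body, options)` but does not include the helper itself. As a rough mental model only (the repository's actual `parallel` namespace, with its TBB/OpenMP/Kokkos backends, is defined elsewhere and may differ), a minimal sequential-plus-std::thread sketch could look like this:

    // Hypothetical sketch, not the real it_lab_ai implementation.
    #include <cstddef>
    #include <thread>
    #include <vector>

    namespace parallel_sketch {

    enum class Backend { kSeq, kThreads };
    struct Options {
      Backend backend = Backend::kSeq;
    };

    template <typename Body>
    void parallel_for(std::size_t count, Body body, const Options& opts) {
      if (opts.backend == Backend::kSeq || count < 2) {
        for (std::size_t i = 0; i < count; ++i) body(i);
        return;
      }
      // Threads backend: split [0, count) into contiguous, near-equal ranges.
      std::size_t n = std::thread::hardware_concurrency();
      if (n == 0) n = 1;
      if (n > count) n = count;
      std::vector<std::thread> workers;
      for (std::size_t t = 0; t < n; ++t) {
        std::size_t begin = count * t / n;
        std::size_t end = count * (t + 1) / n;
        workers.emplace_back([&body, begin, end] {
          for (std::size_t i = begin; i < end; ++i) body(i);
        });
      }
      for (auto& w : workers) w.join();
    }

    }  // namespace parallel_sketch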
-  for (size_t b = 0; b < batch_size; ++b) {
-    for (size_t oc = 0; oc < out_channels; ++oc) {
-      for (size_t oh = 0; oh < out_height; ++oh) {
-        for (size_t ow = 0; ow < out_width; ++ow) {
-          ValueType value = 0;
-          size_t h_start = oh * stride_;
-          size_t w_start = ow * stride_;
-
-          size_t group = (group_ > 1) ? oc / (out_channels / group_) : 0;
-          size_t group_start_channel = group * (in_channels / group_);
-          size_t group_end_channel = (group + 1) * (in_channels / group_);
-
-          for (size_t ic = group_start_channel; ic < group_end_channel; ++ic) {
-            size_t kernel_ic = ic - group_start_channel;
-
-            for (size_t kh = 0; kh < dilated_kernel_height; ++kh) {
-              for (size_t kw = 0; kw < dilated_kernel_width; ++kw) {
-                size_t h_index = h_start + kh;
-                size_t w_index = w_start + kw;
-
-                if (h_index < padded_input[b].size() &&
-                    w_index < padded_input[b][h_index].size()) {
-                  value += padded_input[b][h_index][w_index][ic] *
-                           dil_kernel[oc][kernel_ic][kh][kw];
-                }
-              }
-            }
-          }
-
-          if (!bias_.empty() && oc < bias_.get_shape()[0]) {
-            value += bias_.get<ValueType>({oc});
-          }
-
-          output_tensor[b][oc][oh][ow] = value;
-        }
-      }
-    }
-  }
-
-  Shape output_shape({batch_size, out_channels, out_height, out_width});
-  std::vector<ValueType> flat_output(batch_size * out_channels * out_height *
-                                     out_width);
-
-  size_t index = 0;
-  for (size_t b = 0; b < batch_size; ++b) {
-    for (size_t oc = 0; oc < out_channels; ++oc) {
-      for (size_t h = 0; h < out_height; ++h) {
-        for (size_t w = 0; w < out_width; ++w) {
-          flat_output[index++] = output_tensor[b][oc][h][w];
-        }
-      }
-    }
-  }
-
-  output = make_tensor<ValueType>(flat_output, output_shape);
-}
-
-// NCHW -> NCHW only
-template <typename ValueType>
-void Conv4DSTL(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
-               Tensor& output, size_t stride_, size_t pads_, size_t group_,
-               size_t dilations_) {
-  size_t batch_size = input.get_shape()[0];
-  size_t in_channels = input.get_shape()[1];
-  size_t in_height = input.get_shape()[2];
-  size_t in_width = input.get_shape()[3];
-
-  size_t kernel_out_channels = kernel_.get_shape()[0];
-  size_t kernel_in_channels = kernel_.get_shape()[1];
-  size_t kernel_height = kernel_.get_shape()[2];
-  size_t kernel_width = kernel_.get_shape()[3];
-
-  unsigned num_threads = std::thread::hardware_concurrency();
-  std::vector<std::thread> threads;
-  size_t chunk_size = batch_size / num_threads;
-
-  std::vector<std::vector<std::vector<std::vector<ValueType>>>> padded_input =
-      std::vector<std::vector<std::vector<std::vector<ValueType>>>>(
-          batch_size,
-          std::vector<std::vector<std::vector<ValueType>>>(
-              in_height + 2 * pads_,
-              std::vector<std::vector<ValueType>>(
-                  in_width + 2 * pads_,
-                  std::vector<ValueType>(in_channels, 0))));
-  auto pad_input = [&](size_t start_b, size_t end_b) {
-    for (size_t b = start_b; b < end_b; ++b) {
-      for (size_t h = 0; h < in_height; ++h) {
-        for (size_t w = 0; w < in_width; ++w) {
-          for (size_t c = 0; c < in_channels; ++c) {
-            padded_input[b][h + pads_][w + pads_][c] =
-                input.get<ValueType>({b, c, h, w});
-          }
-        }
-      }
-    }
-  };
-
-  for (unsigned i = 0; i < num_threads; ++i) {
-    size_t start = i * chunk_size;
-    size_t end = (i == num_threads - 1) ? batch_size : start + chunk_size;
-    threads.emplace_back(pad_input, start, end);
-  }
-  for (auto& t : threads) t.join();
-  threads.clear();
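Incidentally, the hand-rolled splitting being removed here has two classic pitfalls: `std::thread::hardware_concurrency()` may return 0 (making `batch_size / num_threads` a division by zero), and whenever the trip count is smaller than `num_threads`, `chunk_size` is 0 and the final thread receives the entire range. A proportional split sidesteps both; `worker_range` below is a hypothetical helper, not part of this repository:

    #include <cstddef>

    // Hypothetical helper: balanced half-open range [begin, end) for worker t
    // of n. Unlike chunk = count / n, the remainder is spread one item at a
    // time instead of piling onto the last worker.
    inline void worker_range(std::size_t count, std::size_t n, std::size_t t,
                             std::size_t& begin, std::size_t& end) {
      begin = count * t / n;
      end = count * (t + 1) / n;
    }
    // count = 3, n = 8: the old scheme hands worker 7 all three items; here
    // three workers get one item each and the rest get empty ranges.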
-  std::vector<std::vector<std::vector<std::vector<ValueType>>>> dil_kernel =
-      std::vector<std::vector<std::vector<std::vector<ValueType>>>>(
-          kernel_height * dilations_ + 1 - dilations_,
-          std::vector<std::vector<std::vector<ValueType>>>(
-              kernel_width * dilations_ + 1 - dilations_,
-              std::vector<std::vector<ValueType>>(
-                  kernel_in_channels,
-                  std::vector<ValueType>(kernel_out_channels, 0))));
-
-  auto dilate_kernel = [&](size_t start_oc, size_t end_oc) {
-    for (size_t oc = start_oc; oc < end_oc; ++oc) {
-      for (size_t h = 0; h < kernel_height; ++h) {
-        for (size_t w = 0; w < kernel_width; ++w) {
-          for (size_t ic = 0; ic < kernel_in_channels; ++ic) {
-            dil_kernel[h * dilations_][w * dilations_][ic][oc] =
-                kernel_.get<ValueType>({oc, ic, h, w});
-          }
-        }
-      }
-    }
-  };
-
-  chunk_size = kernel_out_channels / num_threads;
-  for (unsigned i = 0; i < num_threads; ++i) {
-    size_t start = i * chunk_size;
-    size_t end =
-        (i == num_threads - 1) ? kernel_out_channels : start + chunk_size;
-    threads.emplace_back(dilate_kernel, start, end);
-  }
-  for (auto& t : threads) t.join();
-  threads.clear();
-
-  size_t out_height = ComputeConvOutputDim(in_height, kernel_height, stride_,
-                                           pads_, dilations_);
-  size_t out_width =
-      ComputeConvOutputDim(in_width, kernel_width, stride_, pads_, dilations_);
+  size_t total_work = batch_size * out_channels;
+  parallel::parallel_for(
+      total_work,
+      [&](size_t idx) {
+        size_t b = idx / out_channels;
+        size_t oc = idx % out_channels;

-  std::vector<std::vector<std::vector<std::vector<ValueType>>>> output_tensor(
-      batch_size, std::vector<std::vector<std::vector<ValueType>>>(
-                      kernel_out_channels,
-                      std::vector<std::vector<ValueType>>(
-                          out_height, std::vector<ValueType>(out_width, 0))));
-
-  auto compute_conv = [&](size_t start_oc, size_t end_oc) {
-    size_t dilated_kernel_height = kernel_height * dilations_ + 1 - dilations_;
-    size_t dilated_kernel_width = kernel_width * dilations_ + 1 - dilations_;
-
-    for (size_t b = 0; b < batch_size; ++b) {
-      for (size_t oc = start_oc; oc < end_oc; ++oc) {
-        for (size_t oh = 0; oh < out_height; oh++) {
-          for (size_t ow = 0; ow < out_width; ow++) {
+        for (size_t oh = 0; oh < out_height; ++oh) {
+          for (size_t ow = 0; ow < out_width; ++ow) {
             ValueType value = 0;
+            size_t h_start = oh * stride_;
+            size_t w_start = ow * stride_;

-            size_t group =
-                (group_ > 1) ? oc / (kernel_out_channels / group_) : 0;
+            size_t group = (group_ > 1) ? oc / (out_channels / group_) : 0;
             size_t group_start_channel = group * (in_channels / group_);
             size_t group_end_channel = (group + 1) * (in_channels / group_);
@@ -397,72 +259,54 @@ void Conv4DSTL(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
             for (size_t ic = group_start_channel; ic < group_end_channel;
                  ++ic) {
               size_t kernel_ic = ic - group_start_channel;

               for (size_t kh = 0; kh < dilated_kernel_height; ++kh) {
                 for (size_t kw = 0; kw < dilated_kernel_width; ++kw) {
-                  size_t h_index = oh * stride_ + kh;
-                  size_t w_index = ow * stride_ + kw;
+                  size_t h_index = h_start + kh;
+                  size_t w_index = w_start + kw;

                   if (h_index < padded_input[b].size() &&
                       w_index < padded_input[b][h_index].size()) {
                     value += padded_input[b][h_index][w_index][ic] *
-                             dil_kernel[kh][kw][kernel_ic][oc];
+                             dil_kernel[oc][kernel_ic][kh][kw];
                   }
                 }
               }
             }

-            if (!bias_.empty()) {
-              output_tensor[b][oc][oh][ow] =
-                  value + (*bias_.as<ValueType>())[oc];
-            } else {
-              output_tensor[b][oc][oh][ow] = value;
+            if (!bias_.empty() && oc < bias_.get_shape()[0]) {
+              value += bias_.get<ValueType>({oc});
             }
+
+            output_tensor[b][oc][oh][ow] = value;
           }
         }
-      }
-    }
-  };
-
-  chunk_size = kernel_out_channels / num_threads;
-  for (unsigned i = 0; i < num_threads; ++i) {
-    size_t start = i * chunk_size;
-    size_t end =
-        (i == num_threads - 1) ? kernel_out_channels : start + chunk_size;
-    threads.emplace_back(compute_conv, start, end);
-  }
-  for (auto& t : threads) t.join();
-  threads.clear();
+      },
+      options);
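The group arithmetic in the merged loop above is easy to misread: output channel `oc` belongs to group `oc / (out_channels / group_)`, which selects a contiguous slice of input channels, and the kernel's input-channel index is taken relative to that slice. A small self-contained check with illustrative numbers (not taken from the test suite):

    #include <cassert>
    #include <cstddef>

    int main() {
      // Illustrative values: 8 output channels, 4 input channels, 2 groups.
      std::size_t out_channels = 8, in_channels = 4, groups = 2;
      std::size_t oc = 5;  // an output channel inside the second group
      std::size_t g = oc / (out_channels / groups);           // == 1
      std::size_t ic_begin = g * (in_channels / groups);      // == 2
      std::size_t ic_end = (g + 1) * (in_channels / groups);  // == 4
      assert(g == 1 && ic_begin == 2 && ic_end == 4);
      // The kernel's input index is relative to the group: ic - ic_begin.
    }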
-  Shape sh({batch_size, kernel_out_channels, out_height, out_width});
-  std::vector<ValueType> one_d_vector(batch_size * out_height * out_width *
-                                      kernel_out_channels);
+  Shape output_shape({batch_size, out_channels, out_height, out_width});
+  std::vector<ValueType> flat_output(batch_size * out_channels * out_height *
+                                     out_width);

-  auto flatten_output = [&](size_t start_b, size_t end_b) {
-    size_t index_1d = start_b * kernel_out_channels * out_height * out_width;
-    for (size_t i = start_b; i < end_b; ++i) {
-      for (size_t l = 0; l < kernel_out_channels; ++l) {
-        for (size_t j = 0; j < out_height; ++j) {
-          for (size_t k = 0; k < out_width; ++k) {
-            one_d_vector[index_1d++] = output_tensor[i][l][j][k];
+  parallel::parallel_for(
+      batch_size,
+      [&](size_t b) {
+        size_t base_idx = b * out_channels * out_height * out_width;
+        for (size_t oc = 0; oc < out_channels; ++oc) {
+          for (size_t h = 0; h < out_height; ++h) {
+            for (size_t w = 0; w < out_width; ++w) {
+              flat_output[base_idx++] = output_tensor[b][oc][h][w];
+            }
           }
         }
-      }
-    }
-  };
-
-  chunk_size = batch_size / num_threads;
-  for (unsigned i = 0; i < num_threads; ++i) {
-    size_t start = i * chunk_size;
-    size_t end = (i == num_threads - 1) ? batch_size : start + chunk_size;
-    threads.emplace_back(flatten_output, start, end);
-  }
-  for (auto& t : threads) t.join();
+      },
+      options);

-  output = make_tensor(one_d_vector, sh);
+  output = make_tensor<ValueType>(flat_output, output_shape);
 }
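The per-batch `base_idx` works because NCHW flattening is the usual mixed-radix layout: each `b` owns a disjoint slab of `out_channels * out_height * out_width` elements, so the parallel writes into `flat_output` never alias. For reference, the general offset formula:

    #include <cstddef>

    // Flat offset of element (b, c, h, w) in an NCHW tensor of shape {B, C, H, W}.
    inline std::size_t nchw_index(std::size_t C, std::size_t H, std::size_t W,
                                  std::size_t b, std::size_t c, std::size_t h,
                                  std::size_t w) {
      return ((b * C + c) * H + h) * W + w;
    }
    // base_idx in the loop above is nchw_index(C, H, W, b, 0, 0, 0) = b * C * H * W.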
 template <typename ValueType>
 void DepthwiseConv4D(const Tensor& input, const Tensor& kernel_,
                      const Tensor& bias_, Tensor& output, size_t stride_,
-                     size_t pads_, size_t dilations_) {
+                     size_t pads_, size_t dilations_,
+                     ParBackend backend = ParBackend::kSeq) {
   size_t batch_size = input.get_shape()[0];
   size_t channels = input.get_shape()[1];
   size_t in_height = input.get_shape()[2];
   size_t in_width = input.get_shape()[3];
@@ -485,44 +329,55 @@ void DepthwiseConv4D(const Tensor& input, const Tensor& kernel_,
   Tensor output_tensor(Shape({batch_size, channels, out_height, out_width}),
                        input.get_type());

-  for (size_t b = 0; b < batch_size; ++b) {
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t oh = 0; oh < out_height; ++oh) {
-        for (size_t ow = 0; ow < out_width; ++ow) {
-          ValueType sum = 0;
+  parallel::Options options;
+  options.backend = backend;

-          for (size_t kh = 0; kh < kernel_height; ++kh) {
-            for (size_t kw = 0; kw < kernel_width; ++kw) {
-              size_t ih = oh * stride_ + kh * dilations_ - pads_;
-              size_t iw = ow * stride_ + kw * dilations_ - pads_;
+  size_t total_work = batch_size * channels;

-              if (ih < in_height && iw < in_width) {
-                auto input_val = input.get<ValueType>({b, c, ih, iw});
-                auto kernel_val = kernel_.get<ValueType>({c, 0, kh, kw});
+  parallel::parallel_for(
+      total_work,
+      [&](size_t idx) {
+        size_t b = idx / channels;
+        size_t c = idx % channels;

-                sum += input_val * kernel_val;
+        for (size_t oh = 0; oh < out_height; ++oh) {
+          for (size_t ow = 0; ow < out_width; ++ow) {
+            ValueType sum = 0;
+
+            for (size_t kh = 0; kh < kernel_height; ++kh) {
+              for (size_t kw = 0; kw < kernel_width; ++kw) {
+                size_t ih = oh * stride_ + kh * dilations_;
+                size_t iw = ow * stride_ + kw * dilations_;
+
+                if (ih >= pads_ && iw >= pads_ && (ih - pads_) < in_height &&
+                    (iw - pads_) < in_width) {
+                  auto input_val =
+                      input.get<ValueType>({b, c, ih - pads_, iw - pads_});
+                  auto kernel_val = kernel_.get<ValueType>({c, 0, kh, kw});
+                  sum += input_val * kernel_val;
+                }
               }
             }
-          }
-          if (!bias_.empty() && c < bias_.get_shape()[0]) {
-            sum += bias_.get<ValueType>({c});
-          }
+            if (!bias_.empty() && c < bias_.get_shape()[0]) {
+              sum += bias_.get<ValueType>({c});
+            }

-          output_tensor.set({b, c, oh, ow}, sum);
+            output_tensor.set({b, c, oh, ow}, sum);
+          }
         }
-      }
-    }
-  }
+      },
+      options);

   output = output_tensor;
 }
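The reordered bounds test in the depthwise kernel is a correctness point, not a style one: with `size_t`, the old `oh * stride_ + kh * dilations_ - pads_` wraps around to a huge value whenever the sum is smaller than `pads_`, and the subsequent `ih < in_height` check rejected it only by accident. Checking `ih >= pads_` before subtracting makes the intent explicit. A minimal illustration of the wraparound:

    #include <cstddef>
    #include <iostream>

    int main() {
      std::size_t oh = 0, stride = 1, kh = 0, dilations = 1, pads = 1;
      std::size_t ih = oh * stride + kh * dilations - pads;  // wraps around
      std::cout << ih << "\n";  // 18446744073709551615 on 64-bit targets
    }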
-// NCHW -> NCHW only
+// NCHW -> NCHW only (Legacy version)
 template <typename ValueType>
 void Conv4D_Legacy(const Tensor& input, const Tensor& kernel_,
                    const Tensor& bias_, Tensor& output, size_t stride_,
-                   size_t pads_, size_t dilations_) {
+                   size_t pads_, size_t dilations_,
+                   ParBackend backend = ParBackend::kSeq) {
   size_t batch_size = input.get_shape()[0];
   size_t in_height = input.get_shape()[2];
   size_t in_width = input.get_shape()[3];
@@ -533,41 +388,54 @@ void Conv4D_Legacy(const Tensor& input, const Tensor& kernel_,
   size_t kernel_in_channels = kernel_.get_shape()[2];
   size_t kernel_out_channels = kernel_.get_shape()[3];

-  std::vector<std::vector<std::vector<std::vector<ValueType>>>> padded_input =
-      std::vector<std::vector<std::vector<std::vector<ValueType>>>>(
-          batch_size,
-          std::vector<std::vector<std::vector<ValueType>>>(
-              in_height + 2 * pads_,
-              std::vector<std::vector<ValueType>>(
-                  in_width + 2 * pads_,
-                  std::vector<ValueType>(in_channels, 0))));
-  for (size_t b = 0; b < batch_size; ++b) {
-    for (size_t h = 0; h < in_height; ++h) {
-      for (size_t w = 0; w < in_width; ++w) {
-        for (size_t c = 0; c < in_channels; ++c) {
-          padded_input[b][h + pads_][w + pads_][c] =
-              input.get<ValueType>({b, c, h, w});
+  parallel::Options options;
+  options.backend = backend;
+
+  std::vector<std::vector<std::vector<std::vector<ValueType>>>> padded_input(
+      batch_size,
+      std::vector<std::vector<std::vector<ValueType>>>(
+          in_height + 2 * pads_,
+          std::vector<std::vector<ValueType>>(
+              in_width + 2 * pads_, std::vector<ValueType>(in_channels, 0))));
+
+  parallel::parallel_for(
+      batch_size,
+      [&](size_t b) {
+        for (size_t h = 0; h < in_height; ++h) {
+          for (size_t w = 0; w < in_width; ++w) {
+            for (size_t c = 0; c < in_channels; ++c) {
+              padded_input[b][h + pads_][w + pads_][c] =
+                  input.get<ValueType>({b, c, h, w});
+            }
+          }
         }
-      }
-    }
-  }
-  std::vector<std::vector<std::vector<std::vector<ValueType>>>> dil_kernel =
-      std::vector<std::vector<std::vector<std::vector<ValueType>>>>(
-          kernel_height * dilations_ + 1 - dilations_,
-          std::vector<std::vector<std::vector<ValueType>>>(
-              kernel_width * dilations_ + 1 - dilations_,
-              std::vector<std::vector<ValueType>>(
-                  kernel_in_channels,
-                  std::vector<ValueType>(kernel_out_channels, 0))));
-  for (size_t b = 0; b < kernel_out_channels; ++b) {
-    for (size_t h = 0; h < kernel_height; ++h) {
-      for (size_t w = 0; w < kernel_width; ++w) {
-        for (size_t c = 0; c < kernel_in_channels; ++c) {
-          dil_kernel[h * dilations_][w * dilations_][c][b] =
-              kernel_.get<ValueType>({h, w, c, b});
+      },
+      options);
+
+  size_t dilated_kernel_height = kernel_height * dilations_ + 1 - dilations_;
+  size_t dilated_kernel_width = kernel_width * dilations_ + 1 - dilations_;
+
+  std::vector<std::vector<std::vector<std::vector<ValueType>>>> dil_kernel(
+      dilated_kernel_height,
+      std::vector<std::vector<std::vector<ValueType>>>(
+          dilated_kernel_width,
+          std::vector<std::vector<ValueType>>(
+              kernel_in_channels,
+              std::vector<ValueType>(kernel_out_channels, 0))));
+
+  parallel::parallel_for(
+      kernel_out_channels,
+      [&](size_t b) {
+        for (size_t h = 0; h < kernel_height; ++h) {
+          for (size_t w = 0; w < kernel_width; ++w) {
+            for (size_t c = 0; c < kernel_in_channels; ++c) {
+              dil_kernel[h * dilations_][w * dilations_][c][b] =
+                  kernel_.get<ValueType>({h, w, c, b});
+            }
+          }
         }
-      }
-    }
-  }
+      },
+      options);

   size_t out_height = ComputeConvOutputDim(in_height, kernel_height, stride_,
                                            pads_, dilations_);
@@ -579,44 +447,55 @@ void Conv4D_Legacy(const Tensor& input, const Tensor& kernel_,
                       kernel_out_channels,
                       std::vector<std::vector<ValueType>>(
                           out_height, std::vector<ValueType>(out_width, 0))));
-  for (size_t b = 0; b < batch_size; ++b) {
-    for (size_t c = 0; c < kernel_out_channels; ++c) {
-      for (size_t i = 0; i < out_height; i += stride_) {
-        for (size_t j = 0; j < out_width; j += stride_) {
-          ValueType value = 0;
-          for (size_t ic = 0; ic < in_channels; ++ic) {
-            for (size_t h = 0; h < kernel_height * dilations_ + 1 - dilations_;
-                 ++h) {
-              for (size_t w = 0; w < kernel_width * dilations_ + 1 - dilations_;
-                   ++w) {
-                value +=
-                    padded_input[b][i + h][j + w][ic] * dil_kernel[h][w][ic][c];
+
+  size_t total_work = batch_size * kernel_out_channels;
+
+  parallel::parallel_for(
+      total_work,
+      [&](size_t idx) {
+        size_t b = idx / kernel_out_channels;
+        size_t c = idx % kernel_out_channels;
+
+        for (size_t i = 0; i < out_height; i += stride_) {
+          for (size_t j = 0; j < out_width; j += stride_) {
+            ValueType value = 0;
+            for (size_t ic = 0; ic < in_channels; ++ic) {
+              for (size_t h = 0; h < dilated_kernel_height; ++h) {
+                for (size_t w = 0; w < dilated_kernel_width; ++w) {
+                  value += padded_input[b][i + h][j + w][ic] *
+                           dil_kernel[h][w][ic][c];
+                }
               }
             }
-          }
-          if (!bias_.empty()) {
-            output_tensor[b][c][i][j] = value + (*bias_.as<ValueType>())[c];
-          } else {
-            output_tensor[b][c][i][j] = value;
+            if (!bias_.empty()) {
+              output_tensor[b][c][i][j] = value + (*bias_.as<ValueType>())[c];
+            } else {
+              output_tensor[b][c][i][j] = value;
+            }
           }
         }
-      }
-    }
-  }
+      },
+      options);

   Shape sh({batch_size, kernel_out_channels, out_height, out_width});
   std::vector<ValueType> one_d_vector(batch_size * out_height * out_width *
                                       kernel_out_channels);
-  size_t index_1d = 0;
-  for (size_t i = 0; i < batch_size; ++i) {
-    for (size_t l = 0; l < kernel_out_channels; ++l) {
-      for (size_t j = 0; j < out_height; ++j) {
-        for (size_t k = 0; k < out_width; ++k) {
-          one_d_vector[index_1d++] = output_tensor[i][l][j][k];
+
+  parallel::parallel_for(
+      batch_size,
+      [&](size_t i) {
+        size_t base_idx = i * kernel_out_channels * out_height * out_width;
+        for (size_t l = 0; l < kernel_out_channels; ++l) {
+          for (size_t j = 0; j < out_height; ++j) {
+            for (size_t k = 0; k < out_width; ++k) {
+              one_d_vector[base_idx++] = output_tensor[i][l][j][k];
+            }
+          }
         }
-      }
-    }
-  }
+      },
+      options);
+
   output = make_tensor<ValueType>(one_d_vector, sh);
 }

 }  // namespace it_lab_ai
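For reading the output-size arithmetic in the tests further below: `ComputeConvOutputDim` presumably evaluates the standard formula out = (in + 2*pads - dilations*(kernel - 1) - 1) / stride + 1, which the tests inline by hand. A sketch under that assumption (the repository's own definition is not part of this diff):

    #include <cstddef>

    // Assumed behaviour of ComputeConvOutputDim (standard ONNX-style sizing).
    inline std::size_t conv_out_dim(std::size_t in, std::size_t kernel,
                                    std::size_t stride, std::size_t pads,
                                    std::size_t dilations) {
      return (in + 2 * pads - dilations * (kernel - 1) - 1) / stride + 1;
    }
    // conv_out_dim(224, 3, 1, 1, 1) == 224 and conv_out_dim(112, 3, 2, 1, 1) == 56,
    // matching the hand-inlined expressions in the tests.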
diff --git a/src/layers/ConvLayer.cpp b/src/layers/ConvLayer.cpp
index 60bce718..63d7ebad 100644
--- a/src/layers/ConvLayer.cpp
+++ b/src/layers/ConvLayer.cpp
@@ -17,16 +19,19 @@ void ConvolutionalLayer::run(const std::vector<Tensor>& input,
   if (input[0].get_shape().dims() != 4) {
     throw std::out_of_range("input must be 4-dimensional");
   }
+
+  ParBackend backend = options.getEffectiveParBackend();
+
   if (group_ > 1) {
     if (group_ == input[0].get_shape()[1] && group_ == kernel_.get_shape()[0]) {
       switch (input[0].get_type()) {
         case Type::kFloat:
           DepthwiseConv4D<float>(input[0], kernel_, bias_, output[0], stride_,
-                                 pads_, dilations_);
+                                 pads_, dilations_, backend);
           break;
         case Type::kInt:
           DepthwiseConv4D<int>(input[0], kernel_, bias_, output[0], stride_,
-                               pads_, dilations_);
+                               pads_, dilations_, backend);
           break;
         default:
           throw std::runtime_error(
@@ -35,17 +38,7 @@ void ConvolutionalLayer::run(const std::vector<Tensor>& input,
       return;
     }
   }
-  if (options.parallel) {
-    switch (options.par_backend) {
-      case ParBackend::kThreads:
-        implType_ = kSTL;
-        break;
-      case ParBackend::kSeq:
-      default:
-        implType_ = kDefault;
-        break;
-    }
-  }
+
   switch (input[0].get_type()) {
     case Type::kInt: {
       if (kernel_.get_shape().dims() == 2) {
@@ -96,18 +89,8 @@ void ConvolutionalLayer::run(const std::vector<Tensor>& input,
                 2)),
             sh);
       } else {
-        switch (implType_) {
-          case kSTL: {
-            Conv4DSTL<int>(input[0], kernel_, bias_, output[0], stride_, pads_,
-                           group_, dilations_);
-            break;
-          }
-          default: {
-            Conv4D<int>(input[0], kernel_, bias_, output[0], stride_, pads_,
-                        group_, dilations_);
-            break;
-          }
-        }
+        Conv4D<int>(input[0], kernel_, bias_, output[0], stride_, pads_,
+                    group_, dilations_, backend);
       }
       break;
     }
@@ -162,26 +145,16 @@ void ConvolutionalLayer::run(const std::vector<Tensor>& input,
       } else {
         if (useLegacyImpl_) {
           Conv4D_Legacy<float>(input[0], kernel_, bias_, output[0], stride_,
-                               pads_, dilations_);
+                               pads_, dilations_, backend);
         } else {
-          switch (implType_) {
-            case kSTL: {
-              Conv4DSTL<float>(input[0], kernel_, bias_, output[0], stride_,
-                               pads_, group_, dilations_);
-              break;
-            }
-            default: {
-              Conv4D<float>(input[0], kernel_, bias_, output[0], stride_,
-                            pads_, group_, dilations_);
-              break;
-            }
-          }
+          Conv4D<float>(input[0], kernel_, bias_, output[0], stride_, pads_,
+                        group_, dilations_, backend);
         }
-        break;
-      }
-      default: {
-        throw std::runtime_error("Unsupported tensor type");
       }
+      break;
+    }
+    default: {
+      throw std::runtime_error("Unsupported tensor type");
     }
   }
 }
diff --git a/test/single_layer/test_convlayer.cpp b/test/single_layer/test_convlayer.cpp
index 664ac313..0ffbd6fa 100644
--- a/test/single_layer/test_convlayer.cpp
+++ b/test/single_layer/test_convlayer.cpp
@@ -388,7 +388,8 @@ TEST(ConvolutionalLayerTest, Conv4DSTLFloatWithGroups) {
   std::vector<float> output_vec(16, 0.0f);
   Tensor output = make_tensor<float>(output_vec, output_shape);

-  Conv4DSTL<float>(input, kernel, Tensor(), output, 1, 0, 2, 1);
+  Conv4D<float>(input, kernel, Tensor(), output, 1, 0, 2, 1,
+                ParBackend::kThreads);

   std::vector<float> result = *output.as<float>();
@@ -423,7 +424,7 @@ TEST(ConvolutionalLayerTest, Conv4DSTLFloatComplex) {
   std::vector<float> output_vec(8, 0.0f);
   Tensor output = make_tensor<float>(output_vec, output_shape);

-  Conv4DSTL<float>(input, kernel, bias, output, 1, 0, 1, 1);
+  Conv4D<float>(input, kernel, bias, output, 1, 0, 1, 1, ParBackend::kThreads);

   std::vector<float> result = *output.as<float>();
@@ -495,7 +496,7 @@ TEST(ConvolutionalLayerTest, Conv4DSTLFloatBasic) {
   std::vector<float> output_vec(8, 0.0f);
   Tensor output = make_tensor<float>(output_vec, output_shape);

-  Conv4DSTL<float>(input, kernel, bias, output, 1, 0, 1, 1);
+  Conv4D<float>(input, kernel, bias, output, 1, 0, 1, 1, ParBackend::kThreads);

   std::vector<float> result = *output.as<float>();
@@ -523,7 +524,8 @@ TEST(ConvolutionalLayerTest, Conv4DSTLFloatWithPaddingAndStride) {
       0.0f);
   Tensor output = make_tensor<float>(output_vec, output_shape);

-  Conv4DSTL<float>(input, kernel, Tensor(), output, 2, 1, 1, 1);
+  Conv4D<float>(input, kernel, Tensor(), output, 2, 1, 1, 1,
+                ParBackend::kThreads);

   std::vector<float> result = *output.as<float>();
@@ -547,7 +549,8 @@ TEST(ConvolutionalLayerTest, Conv4DSTLFloatCompareWithConv4D) {
   Shape output_shape2({1, 1, 1, 1});
   std::vector<float> output_vec2(1, 0.0f);
   Tensor output2 = make_tensor<float>(output_vec2, output_shape2);
-  Conv4DSTL<float>(input, kernel, Tensor(), output2, 1, 0, 1, 1);
+  Conv4D<float>(input, kernel, Tensor(), output2, 1, 0, 1, 1,
+                ParBackend::kThreads);

   float result1 = (*output1.as<float>())[0];
   float result2 = (*output2.as<float>())[0];
diff --git a/test/single_layer_parall_version/test_convlayer_parall.cpp b/test/single_layer_parall_version/test_convlayer_parall.cpp
new file mode 100644
index 00000000..c2420eae
--- /dev/null
+++ b/test/single_layer_parall_version/test_convlayer_parall.cpp
@@ -0,0 +1,277 @@
+#include <chrono>
+#include <iostream>
+#include <vector>
+#include
+#include
+
+#include "gtest/gtest.h"
+#include "layers/ConvLayer.hpp"
+
+#define ENABLE_TIMING_OUTPUT 1
+
+#if ENABLE_TIMING_OUTPUT
+#define PRINT_TIMING(msg) std::cout << msg << std::endl
+#else
+#define PRINT_TIMING(msg) ((void)0)
+#endif
+
+using namespace it_lab_ai;
+
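The tests below time each backend with `high_resolution_clock`. For wall-clock benchmarking, `std::chrono::steady_clock` is usually the safer choice: it is guaranteed monotonic, whereas `high_resolution_clock` is permitted to alias `system_clock` and jump with clock adjustments. A drop-in sketch of the same timing pattern:

    #include <chrono>

    // Hypothetical monotonic variant of the timing pattern used in the tests.
    template <typename F>
    long long time_ms(F&& f) {
      auto start = std::chrono::steady_clock::now();
      f();
      auto end = std::chrono::steady_clock::now();
      return std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
          .count();
    }
    // Usage: auto ms = time_ms([&] { layer.run(in, out, options); });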
+TEST(convlayer_parall, parallel_conv_basic) {
+  size_t batch_size = 32;
+  std::vector<float> image(batch_size * 3 * 224 * 224, 1.0f);
+  Shape input_shape({batch_size, 3, 224, 224});
+  Tensor input = make_tensor<float>(image, input_shape);
+
+  std::vector<float> kernelvec(64 * 3 * 3 * 3, 1.0f);
+  Shape kernel_shape({64, 3, 3, 3});
+  Tensor kernel = make_tensor<float>(kernelvec, kernel_shape);
+
+  size_t out_height = (224 + 2 * 1 - 1 * (3 - 1) - 1) / 1 + 1;
+  size_t out_width = (224 + 2 * 1 - 1 * (3 - 1) - 1) / 1 + 1;
+  Shape output_shape({batch_size, 64, out_height, out_width});
+  std::vector<float> output_vec(batch_size * 64 * out_height * out_width,
+                                0.0f);
+  Tensor output = make_tensor<float>(output_vec, output_shape);
+
+  ConvolutionalLayer layer(1, 1, 1, kernel);
+  std::vector<Tensor> in{input};
+  std::vector<Tensor> out{output};
+
+  std::vector<ParBackend> backends = {ParBackend::kSeq, ParBackend::kThreads,
+                                      ParBackend::kTbb, ParBackend::kOmp,
+                                      ParBackend::kKokkos};
+
+  for (auto backend : backends) {
+    RuntimeOptions options;
+    options.setParallelBackend(backend);
+
+    auto start = std::chrono::high_resolution_clock::now();
+    layer.run(in, out, options);
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration =
+        std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+    PRINT_TIMING("Backend " << static_cast<int>(backend)
+                            << " time: " << duration.count()
+                            << " ms (batch=" << batch_size << ")");
+
+    EXPECT_EQ(out[0].get_shape()[0], batch_size);
+    EXPECT_EQ(out[0].get_shape()[1], 64);
+  }
+}
+
+TEST(convlayer_parall, parallel_conv_stride2) {
+  size_t batch_size = 64;
+  std::vector<float> image(batch_size * 16 * 112 * 112, 1.0f);
+  Shape input_shape({batch_size, 16, 112, 112});
+  Tensor input = make_tensor<float>(image, input_shape);
+
+  std::vector<float> kernelvec(32 * 16 * 3 * 3, 1.0f);
+  Shape kernel_shape({32, 16, 3, 3});
+  Tensor kernel = make_tensor<float>(kernelvec, kernel_shape);
+
+  size_t out_height = (112 + 2 * 1 - 1 * (3 - 1) - 1) / 2 + 1;
+  size_t out_width = (112 + 2 * 1 - 1 * (3 - 1) - 1) / 2 + 1;
+  Shape output_shape({batch_size, 32, out_height, out_width});
+  std::vector<float> output_vec(batch_size * 32 * out_height * out_width,
+                                0.0f);
+  Tensor output = make_tensor<float>(output_vec, output_shape);
+
+  ConvolutionalLayer layer(2, 1, 1, kernel);
+  std::vector<Tensor> in{input};
+  std::vector<Tensor> out{output};
+
+  std::vector<ParBackend> backends = {ParBackend::kSeq, ParBackend::kThreads,
+                                      ParBackend::kTbb, ParBackend::kOmp,
+                                      ParBackend::kKokkos};
+
+  for (auto backend : backends) {
+    RuntimeOptions options;
+    options.setParallelBackend(backend);
+
+    auto start = std::chrono::high_resolution_clock::now();
+    layer.run(in, out, options);
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration =
+        std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+    PRINT_TIMING("Backend " << static_cast<int>(backend)
+                            << " time: " << duration.count()
+                            << " ms (batch=" << batch_size << ")");
+
+    EXPECT_EQ(out[0].get_shape()[0], batch_size);
+    EXPECT_EQ(out[0].get_shape()[2], out_height);
+    EXPECT_EQ(out[0].get_shape()[3], out_width);
+  }
+}
+
+TEST(convlayer_parall, parallel_depthwise_conv) {
+  size_t batch_size = 128;
+  std::vector<float> image(batch_size * 32 * 56 * 56, 1.0f);
+  Shape input_shape({batch_size, 32, 56, 56});
+  Tensor input = make_tensor<float>(image, input_shape);
+
+  std::vector<float> kernelvec(32 * 1 * 3 * 3, 1.0f);
+  Shape kernel_shape({32, 1, 3, 3});
+  Tensor kernel = make_tensor<float>(kernelvec, kernel_shape);
+
+  size_t out_height = (56 + 2 * 1 - 1 * (3 - 1) - 1) / 1 + 1;
+  size_t out_width = (56 + 2 * 1 - 1 * (3 - 1) - 1) / 1 + 1;
+  Shape output_shape({batch_size, 32, out_height, out_width});
+  std::vector<float> output_vec(batch_size * 32 * out_height * out_width,
+                                0.0f);
+  Tensor output = make_tensor<float>(output_vec, output_shape);
+
+  ConvolutionalLayer layer(1, 1, 1, kernel, Tensor(), 32);
+  std::vector<Tensor> in{input};
+  std::vector<Tensor> out{output};
+
+  std::vector<ParBackend> backends = {ParBackend::kSeq, ParBackend::kThreads,
+                                      ParBackend::kTbb, ParBackend::kOmp,
+                                      ParBackend::kKokkos};
+
+  for (auto backend : backends) {
+    RuntimeOptions options;
+    options.setParallelBackend(backend);
+
+    auto start = std::chrono::high_resolution_clock::now();
+    layer.run(in, out, options);
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration =
+        std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+    PRINT_TIMING("Depthwise Backend " << static_cast<int>(backend)
+                                      << " time: " << duration.count()
+                                      << " ms (batch=" << batch_size << ")");
+
+    EXPECT_EQ(out[0].get_shape()[0], batch_size);
+    EXPECT_EQ(out[0].get_shape()[1], 32);
+  }
+}
+
+TEST(convlayer_parall, parallel_conv_with_bias) {
+  size_t batch_size = 16;
+  std::vector<int> image(batch_size * 16 * 28 * 28, 1);
+  Shape input_shape({batch_size, 16, 28, 28});
+  Tensor input = make_tensor<int>(image, input_shape);
+
+  std::vector<int> kernelvec(36 * 16 * 5 * 5, 1);
+  Shape kernel_shape({36, 16, 5, 5});
+  Tensor kernel = make_tensor<int>(kernelvec, kernel_shape);
+
+  std::vector<int> biasvec(36, 10);
+  Tensor bias = make_tensor<int>(biasvec, Shape({36}));
+
+  size_t pads = (kernel.get_shape()[2] - 1) / 2;
+  size_t out_height = (28 + 2 * pads - 1 * (5 - 1) - 1) / 1 + 1;
+  size_t out_width = (28 + 2 * pads - 1 * (5 - 1) - 1) / 1 + 1;
+  Shape output_shape({batch_size, 36, out_height, out_width});
+  std::vector<int> output_vec(batch_size * 36 * out_height * out_width, 0);
+  Tensor output = make_tensor<int>(output_vec, output_shape);
+
+  ConvolutionalLayer layer(1, pads, 1, kernel, bias);
+  std::vector<Tensor> in{input};
+  std::vector<Tensor> out{output};
+
+  std::vector<ParBackend> backends = {ParBackend::kSeq, ParBackend::kThreads,
+                                      ParBackend::kTbb, ParBackend::kOmp,
+                                      ParBackend::kKokkos};
+
+  for (auto backend : backends) {
+    RuntimeOptions options;
+    options.setParallelBackend(backend);
+
+    auto start = std::chrono::high_resolution_clock::now();
+    layer.run(in, out, options);
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration =
+        std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+    PRINT_TIMING("Backend " << static_cast<int>(backend)
+                            << " time: " << duration.count()
+                            << " ms (batch=" << batch_size << ")");
+
+    EXPECT_EQ(out[0].get_shape()[0], batch_size);
+    std::vector<int> result = *out[0].as<int>();
+    EXPECT_GT(result[0], 0);
+  }
+}
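The `pads = (kernel_size - 1) / 2` choice in the bias test above is the usual "same" padding for odd kernels at stride 1: plugging it into the output-size formula returns the input size unchanged. Using the `conv_out_dim` sketch from earlier as a constexpr check:

    #include <cstddef>

    constexpr std::size_t conv_out_dim_c(std::size_t in, std::size_t k,
                                         std::size_t s, std::size_t p,
                                         std::size_t d) {
      return (in + 2 * p - d * (k - 1) - 1) / s + 1;
    }
    // The bias test: in = 28, k = 5, pads = (5 - 1) / 2 = 2 -> out = 28.
    static_assert(conv_out_dim_c(28, 5, 1, (5 - 1) / 2, 1) == 28,
                  "odd kernel + (k-1)/2 padding preserves spatial size");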
Backend " << static_cast(backend) + << " time: " << duration.count() + << " ms (batch=" << batch_size << ")"); + + EXPECT_EQ(out[0].get_shape()[0], batch_size); + EXPECT_EQ(out[0].get_shape()[1], 16); + } +} + +TEST(convlayer_parall, parallel_conv_single_image) { + size_t batch_size = 1; + std::vector image(batch_size * 3 * 224 * 224, 1.0f); + Shape input_shape({batch_size, 3, 224, 224}); + Tensor input = make_tensor(image, input_shape); + + std::vector kernelvec(64 * 3 * 3 * 3, 1.0f); + Shape kernel_shape({64, 3, 3, 3}); + Tensor kernel = make_tensor(kernelvec, kernel_shape); + + size_t out_height = (224 + 2 * 1 - 1 * (3 - 1) - 1) / 1 + 1; + size_t out_width = (224 + 2 * 1 - 1 * (3 - 1) - 1) / 1 + 1; + Shape output_shape({batch_size, 64, out_height, out_width}); + std::vector output_vec(batch_size * 64 * out_height * out_width, 0.0f); + Tensor output = make_tensor(output_vec, output_shape); + + ConvolutionalLayer layer(1, 1, 1, kernel); + std::vector in{input}; + std::vector out{output}; + + std::vector backends = {ParBackend::kSeq, ParBackend::kThreads, + ParBackend::kTbb, ParBackend::kOmp, + ParBackend::kKokkos}; + + for (auto backend : backends) { + RuntimeOptions options; + options.setParallelBackend(backend); + + auto start = std::chrono::high_resolution_clock::now(); + layer.run(in, out, options); + auto end = std::chrono::high_resolution_clock::now(); + auto duration = + std::chrono::duration_cast(end - start); + PRINT_TIMING("Single Image Backend " << static_cast(backend) + << " time: " << duration.count() + << " ms (batch=" << batch_size << ")"); + + EXPECT_EQ(out[0].get_shape()[0], batch_size); + EXPECT_EQ(out[0].get_shape()[1], 64); + } +} \ No newline at end of file