Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 90 additions & 92 deletions include/layers/ConvLayer.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once
#include <cmath>
#include <cstddef>
#include <stdexcept>
#include <vector>

Expand Down Expand Up @@ -34,7 +35,7 @@ class ConvolutionalLayer : public Layer {
dilations_ = 0;
}
ConvolutionalLayer(size_t step, size_t pads, size_t dilations,
const Tensor& kernel, const Tensor& bias = Tensor(),
const Tensor &kernel, const Tensor &bias = Tensor(),
size_t group = 1, bool useLegacyImpl = false)
: Layer(kConvolution),
kernel_(std::make_shared<Tensor>(kernel)),
Expand Down Expand Up @@ -73,10 +74,10 @@ class ConvolutionalLayer : public Layer {
return useLegacyImpl_;
}

void run(const std::vector<Tensor>& input,
std::vector<Tensor>& output) override;
void run(const std::vector<Tensor>& input, std::vector<Tensor>& output,
const RuntimeOptions& options) override;
void run(const std::vector<Tensor> &input,
std::vector<Tensor> &output) override;
void run(const std::vector<Tensor> &input, std::vector<Tensor> &output,
const RuntimeOptions &options) override;
#ifdef ENABLE_STATISTIC_WEIGHTS
Tensor get_weights() override {
return *kernel_;
Expand All @@ -100,7 +101,7 @@ class ConvImpl : public LayerImpl<ValueType> {
ConvImpl() = delete;
ConvImpl(size_t stride, size_t pads, size_t dilations, int input_width,
int input_height, int input_flow, size_t input_size,
const std::vector<ValueType>& bias)
const std::vector<ValueType> &bias)
: input_width_(input_width),
input_height_(input_height),
input_flow_(input_flow),
Expand All @@ -110,10 +111,10 @@ class ConvImpl : public LayerImpl<ValueType> {
input_size_(input_size),
bias_(bias) {}

ConvImpl(const ConvImpl& c) = default;
ConvImpl(const ConvImpl &c) = default;

[[nodiscard]] std::vector<ValueType> run(
const std::vector<ValueType>& input) const override {
const std::vector<ValueType> &input) const override {
return input;
}

Expand Down Expand Up @@ -184,8 +185,8 @@ class ConvImpl : public LayerImpl<ValueType> {

// NCHW -> NCHW only
template <typename ValueType>
void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
Tensor& output, size_t stride_, size_t pads_, size_t group_,
void Conv4D(const Tensor &input, const Tensor &kernel_, const Tensor &bias_,
Tensor &output, size_t stride_, size_t pads_, size_t group_,
size_t dilations_, ParBackend backend = ParBackend::kSeq) {
size_t batch_size = input.get_shape()[0];
size_t in_channels = input.get_shape()[1];
Expand All @@ -212,106 +213,103 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
size_t out_width =
ComputeConvOutputDim(in_width, kernel_width, stride_, pads_, dilations_);

std::vector<std::vector<std::vector<std::vector<ValueType>>>> padded_input(
batch_size,
std::vector<std::vector<std::vector<ValueType>>>(
in_height + 2 * pads_,
std::vector<std::vector<ValueType>>(
in_width + 2 * pads_, std::vector<ValueType>(in_channels, 0))));

parallel::Options options;
options.backend = backend;

parallel::parallel_for(batch_size, [&](size_t b) {
for (size_t h = 0; h < in_height; ++h) {
for (size_t w = 0; w < in_width; ++w) {
for (size_t c = 0; c < in_channels; ++c) {
padded_input[b][h + pads_][w + pads_][c] =
input.get<ValueType>({b, c, h, w});
}
}
}
}, options);

size_t dilated_kernel_height = (kernel_height - 1) * dilations_ + 1;
size_t dilated_kernel_width = (kernel_width - 1) * dilations_ + 1;

std::vector<std::vector<std::vector<std::vector<ValueType>>>> dil_kernel(
out_channels, std::vector<std::vector<std::vector<ValueType>>>(
kernel_in_channels,
std::vector<std::vector<ValueType>>(
dilated_kernel_height,
std::vector<ValueType>(dilated_kernel_width, 0))));

parallel::parallel_for(out_channels, [&](size_t oc) {
for (size_t ic = 0; ic < kernel_in_channels; ++ic) {
for (size_t kh = 0; kh < kernel_height; ++kh) {
for (size_t kw = 0; kw < kernel_width; ++kw) {
dil_kernel[oc][ic][kh * dilations_][kw * dilations_] =
kernel_.get<ValueType>({oc, ic, kh, kw});
}
}
}
}, options);
const auto &input_data = *input.as<ValueType>();
const auto &kernel_data = *kernel_.as<ValueType>();
const std::vector<ValueType> *bias_data = nullptr;
if (!bias_.empty()) {
bias_data = bias_.as<ValueType>();
}

std::vector<std::vector<std::vector<std::vector<ValueType>>>> output_tensor(
batch_size,
std::vector<std::vector<std::vector<ValueType>>>(
out_channels, std::vector<std::vector<ValueType>>(
out_height, std::vector<ValueType>(out_width, 0))));
const size_t input_channel_stride = in_height * in_width;
const size_t input_batch_stride = in_channels * input_channel_stride;
const size_t kernel_channel_stride = kernel_height * kernel_width;
const size_t kernel_output_stride =
kernel_in_channels * kernel_channel_stride;
const size_t output_channel_stride = out_height * out_width;
const size_t output_batch_stride = out_channels * output_channel_stride;
const size_t in_channels_per_group = in_channels / group_;
const size_t out_channels_per_group = out_channels / group_;
const bool collapsed_kernel = dilations_ == 0;

Shape output_shape({batch_size, out_channels, out_height, out_width});
std::vector<ValueType> flat_output(output_shape.count(), 0);
size_t total_work = batch_size * out_channels;
parallel::parallel_for(total_work, [&](size_t idx) {
size_t b = idx / out_channels;
size_t oc = idx % out_channels;
size_t input_batch_base = b * input_batch_stride;
size_t output_base = b * output_batch_stride + oc * output_channel_stride;
size_t group = (group_ > 1) ? oc / out_channels_per_group : 0;
size_t group_start_channel = group * in_channels_per_group;
size_t group_end_channel = group_start_channel + in_channels_per_group;
size_t kernel_oc_base = oc * kernel_output_stride;
auto bias_value = ValueType{};
if (bias_data != nullptr && oc < bias_data->size()) {
bias_value = (*bias_data)[oc];
}

for (size_t oh = 0; oh < out_height; ++oh) {
std::ptrdiff_t input_h_base = static_cast<std::ptrdiff_t>(oh * stride_) -
static_cast<std::ptrdiff_t>(pads_);
for (size_t ow = 0; ow < out_width; ++ow) {
ValueType value = 0;
size_t h_start = oh * stride_;
size_t w_start = ow * stride_;

size_t group = (group_ > 1) ? oc / (out_channels / group_) : 0;
size_t group_start_channel = group * (in_channels / group_);
size_t group_end_channel = (group + 1) * (in_channels / group_);
ValueType value = bias_value;
std::ptrdiff_t input_w_base =
static_cast<std::ptrdiff_t>(ow * stride_) -
static_cast<std::ptrdiff_t>(pads_);
size_t output_idx = output_base + oh * out_width + ow;

for (size_t ic = group_start_channel; ic < group_end_channel; ++ic) {
size_t kernel_ic = ic - group_start_channel;

for (size_t kh = 0; kh < dilated_kernel_height; ++kh) {
for (size_t kw = 0; kw < dilated_kernel_width; ++kw) {
size_t h_index = h_start + kh;
size_t w_index = w_start + kw;

if (h_index < padded_input[b].size() &&
w_index < padded_input[b][h_index].size()) {
value += padded_input[b][h_index][w_index][ic] *
dil_kernel[oc][kernel_ic][kh][kw];
}
size_t input_channel_base =
input_batch_base + ic * input_channel_stride;
size_t kernel_ic_base =
kernel_oc_base + kernel_ic * kernel_channel_stride;

if (collapsed_kernel) {
if (input_h_base >= 0 &&
input_h_base < static_cast<std::ptrdiff_t>(in_height) &&
input_w_base >= 0 &&
input_w_base < static_cast<std::ptrdiff_t>(in_width)) {
size_t input_idx = input_channel_base +
static_cast<size_t>(input_h_base) * in_width +
static_cast<size_t>(input_w_base);
size_t kernel_idx = kernel_ic_base + kernel_channel_stride - 1;
value += input_data[input_idx] * kernel_data[kernel_idx];
}
continue;
}
}

if (!bias_.empty() && oc < bias_.get_shape()[0]) {
value += bias_.get<ValueType>({oc});
}
for (size_t kh = 0; kh < kernel_height; ++kh) {
std::ptrdiff_t input_h =
input_h_base + static_cast<std::ptrdiff_t>(kh * dilations_);
if (input_h < 0 ||
input_h >= static_cast<std::ptrdiff_t>(in_height)) {
continue;
}

output_tensor[b][oc][oh][ow] = value;
}
}
}, options);
size_t input_row_base =
input_channel_base + static_cast<size_t>(input_h) * in_width;
size_t kernel_row_base = kernel_ic_base + kh * kernel_width;

Shape output_shape({batch_size, out_channels, out_height, out_width});
std::vector<ValueType> flat_output(batch_size * out_channels * out_height *
out_width);
for (size_t kw = 0; kw < kernel_width; ++kw) {
std::ptrdiff_t input_w =
input_w_base + static_cast<std::ptrdiff_t>(kw * dilations_);
if (input_w < 0 ||
input_w >= static_cast<std::ptrdiff_t>(in_width)) {
continue;
}

parallel::parallel_for(batch_size, [&](size_t b) {
size_t base_idx = b * out_channels * out_height * out_width;
for (size_t oc = 0; oc < out_channels; ++oc) {
for (size_t h = 0; h < out_height; ++h) {
for (size_t w = 0; w < out_width; ++w) {
flat_output[base_idx++] = output_tensor[b][oc][h][w];
value +=
input_data[input_row_base + static_cast<size_t>(input_w)] *
kernel_data[kernel_row_base + kw];
}
}
}

flat_output[output_idx] = value;
}
}
}, options);
Expand All @@ -320,8 +318,8 @@ void Conv4D(const Tensor& input, const Tensor& kernel_, const Tensor& bias_,
}

template <typename ValueType>
void DepthwiseConv4D(const Tensor& input, const Tensor& kernel_,
const Tensor& bias_, Tensor& output, size_t stride_,
void DepthwiseConv4D(const Tensor &input, const Tensor &kernel_,
const Tensor &bias_, Tensor &output, size_t stride_,
size_t pads_, size_t dilations_,
ParBackend backend = ParBackend::kSeq) {
size_t batch_size = input.get_shape()[0];
Expand Down Expand Up @@ -388,8 +386,8 @@ void DepthwiseConv4D(const Tensor& input, const Tensor& kernel_,

// NCHW -> NCHW only (Legacy version)
template <typename ValueType>
void Conv4D_Legacy(const Tensor& input, const Tensor& kernel_,
const Tensor& bias_, Tensor& output, size_t stride_,
void Conv4D_Legacy(const Tensor &input, const Tensor &kernel_,
const Tensor &bias_, Tensor &output, size_t stride_,
size_t pads_, size_t dilations_,
ParBackend backend = ParBackend::kSeq) {
size_t batch_size = input.get_shape()[0];
Expand Down
Loading