// Copyright 2018 Xiaomi, Inc.  All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_KERNELS_DEPTHWISE_CONV2D_H_
#define MACE_KERNELS_DEPTHWISE_CONV2D_H_

#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include <algorithm>
#include <memory>
#include <vector>

#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/activation.h"
#include "mace/public/mace.h"

namespace mace {
namespace kernels {

// Reference depthwise convolution over an NHWC input with an HWIM filter
// (kernel_h, kernel_w, input_channels, multiplier).  Every tap is
// bounds-checked, so this kernel is safe for output rows/columns whose
// receptive field extends into the padded border.
template <typename T>
void DepthwiseConv2dKernel(const T *input_ptr,
                           const T *filter_ptr,
                           const T *bias_ptr,
                           T *output_ptr,
                           int batch,
                           int height,
                           int width,
                           int channels,
                           int input_height,
                           int input_width,
                           int input_channels,
                           int multiplier,
                           int padded_h_start,
                           int padded_h_stop,
                           int padded_w_start,
                           int padded_w_stop,
                           int kernel_h,
                           int kernel_w,
                           int stride_h,
                           int stride_w,
                           int dilation_h,
                           int dilation_w,
                           int h_start,
                           int h_stop,
                           int w_start,
                           int w_stop) {
#pragma omp parallel for collapse(4)
  for (int n = 0; n < batch; ++n) {
    for (int h = h_start; h < h_stop; ++h) {
      for (int w = w_start; w < w_stop; ++w) {
        for (int c = 0; c < channels; ++c) {
          const index_t inc = c / multiplier;
          const index_t m = c % multiplier;
          T bias_channel = bias_ptr ? bias_ptr[c] : 0;
          index_t offset = n * height * width * channels +
                           h * width * channels + w * channels + c;
          output_ptr[offset] = bias_channel;
          T sum = 0;
          const T *filter_base = filter_ptr + inc * multiplier + m;
          for (int kh = 0; kh < kernel_h; ++kh) {
            for (int kw = 0; kw < kernel_w; ++kw) {
              int inh = padded_h_start + h * stride_h + dilation_h * kh;
              int inw = padded_w_start + w * stride_w + dilation_w * kw;
              if (inh < 0 || inh >= input_height || inw < 0 ||
                  inw >= input_width) {
                MACE_CHECK(inh >= padded_h_start && inh < padded_h_stop &&
                               inw >= padded_w_start && inw < padded_w_stop,
                           "Out of range read from input: ", padded_h_start,
                           " <= ", inh, " < ", padded_h_stop, ", ",
                           padded_w_start, " <= ", inw, " < ", padded_w_stop);
              } else {
                index_t input_offset =
                    n * input_height * input_width * input_channels +
                    inh * input_width * input_channels +
                    inw * input_channels + inc;
                sum += input_ptr[input_offset] * filter_base[0];  // HWIM
              }
              filter_base += input_channels * multiplier;
            }
          }
          output_ptr[offset] += sum;
        }
      }
    }
  }
}
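// Fast-path depthwise convolution for the interior of the output, where
// every filter tap is known to fall inside the real input; no runtime
// bounds checks are performed (MACE_ASSERT is debug-only).  When
// multiplier == 1, the channel loop is tiled by 4 and, on aarch64 NEON
// builds, each tile is accumulated with fused multiply-add intrinsics.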
template <typename T>
void DepthwiseConv2dNoOOBCheckKernel(const T *input_ptr,
                                     const T *filter_ptr,
                                     const T *bias_ptr,
                                     T *output_ptr,
                                     int batch,
                                     int height,
                                     int width,
                                     int channels,
                                     int input_height,
                                     int input_width,
                                     int input_channels,
                                     int multiplier,
                                     int padded_h_start,
                                     int padded_h_stop,
                                     int padded_w_start,
                                     int padded_w_stop,
                                     int kernel_h,
                                     int kernel_w,
                                     int stride_h,
                                     int stride_w,
                                     int dilation_h,
                                     int dilation_w,
                                     int h_start,
                                     int h_stop,
                                     int w_start,
                                     int w_stop) {
  if (multiplier == 1) {
    constexpr int c_tile_size = 4;

#pragma omp parallel for collapse(3)
    for (int n = 0; n < batch; ++n) {
      for (int h = h_start; h < h_stop; ++h) {
        for (int w = w_start; w < w_stop; ++w) {
          int c;
          for (c = 0; c + c_tile_size <= channels; c += c_tile_size) {
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
            static_assert(c_tile_size == 4, "channels tile size must be 4");
            float32x4_t sum = vdupq_n_f32(0);
            if (bias_ptr != nullptr) {
              sum = vld1q_f32(bias_ptr + c);
            }
#else
            T sum[c_tile_size] = {0};
            if (bias_ptr != nullptr) {
              for (int ci = 0; ci < c_tile_size; ++ci) {
                sum[ci] = bias_ptr[c + ci];
              }
            }
#endif
            const T *filter_base = filter_ptr + c;
            for (int kh = 0; kh < kernel_h; ++kh) {
              for (int kw = 0; kw < kernel_w; ++kw) {
                int inh = padded_h_start + h * stride_h + dilation_h * kh;
                int inw = padded_w_start + w * stride_w + dilation_w * kw;
                MACE_ASSERT(inh >= 0 && inh < input_height && inw >= 0 &&
                            inw < input_width);
                index_t input_offset =
                    n * input_height * input_width * input_channels +
                    inh * input_width * input_channels +
                    inw * input_channels + c;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
                float32x4_t in = vld1q_f32(input_ptr + input_offset);
                float32x4_t weights = vld1q_f32(filter_base);
                sum = vfmaq_f32(sum, in, weights);
#else
                for (int ci = 0; ci < c_tile_size; ++ci) {
                  sum[ci] +=
                      input_ptr[input_offset + ci] * filter_base[ci];  // HWIM
                }
#endif
                filter_base += input_channels;
              }
            }
            index_t offset = n * height * width * channels +
                             h * width * channels + w * channels + c;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
            vst1q_f32(output_ptr + offset, sum);
#else
            for (int ci = 0; ci < c_tile_size; ++ci) {
              output_ptr[offset + ci] = sum[ci];
            }
#endif
          }
          // Leftover channels that do not fill a whole tile.
          for (; c < channels; ++c) {
            T bias_channel = bias_ptr ? bias_ptr[c] : 0;
            index_t offset = n * height * width * channels +
                             h * width * channels + w * channels + c;
            output_ptr[offset] = bias_channel;
            T sum = 0;
            const T *filter_base = filter_ptr + c;
            for (int kh = 0; kh < kernel_h; ++kh) {
              for (int kw = 0; kw < kernel_w; ++kw) {
                int inh = padded_h_start + h * stride_h + dilation_h * kh;
                int inw = padded_w_start + w * stride_w + dilation_w * kw;
                MACE_ASSERT(inh >= 0 && inh < input_height && inw >= 0 &&
                            inw < input_width);
                index_t input_offset =
                    n * input_height * input_width * input_channels +
                    inh * input_width * input_channels +
                    inw * input_channels + c;
                sum += input_ptr[input_offset] * filter_base[0];  // HWIM
                filter_base += input_channels * multiplier;
              }
            }
            output_ptr[offset] += sum;
          }
        }
      }
    }
  } else {
#pragma omp parallel for collapse(4)
    for (int n = 0; n < batch; ++n) {
      for (int h = h_start; h < h_stop; ++h) {
        for (int w = w_start; w < w_stop; ++w) {
          for (int c = 0; c < channels; ++c) {
            const index_t inc = c / multiplier;
            const index_t m = c % multiplier;
            T bias_channel = bias_ptr ? bias_ptr[c] : 0;
            index_t offset = n * height * width * channels +
                             h * width * channels + w * channels + c;
            output_ptr[offset] = bias_channel;
            T sum = 0;
            const T *filter_base = filter_ptr + inc * multiplier + m;
            for (int kh = 0; kh < kernel_h; ++kh) {
              for (int kw = 0; kw < kernel_w; ++kw) {
                int inh = padded_h_start + h * stride_h + dilation_h * kh;
                int inw = padded_w_start + w * stride_w + dilation_w * kw;
                MACE_ASSERT(inh >= 0 && inh < input_height && inw >= 0 &&
                            inw < input_width);
                index_t input_offset =
                    n * input_height * input_width * input_channels +
                    inh * input_width * input_channels +
                    inw * input_channels + inc;
                sum += input_ptr[input_offset] * filter_base[0];  // HWIM
                filter_base += input_channels * multiplier;
              }
            }
            output_ptr[offset] += sum;
          }
        }
      }
    }
  }
}
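// For reference, all offsets above follow the flattened NHWC layout:
//   offset(n, h, w, c) = ((n * H + h) * W + w) * C + c
// e.g. with H = W = 3 and C = 8, element (n=0, h=1, w=2, c=5) lives at
// ((0 * 3 + 1) * 3 + 2) * 8 + 5 = 45.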
struct DepthwiseConv2dFunctorBase {
  DepthwiseConv2dFunctorBase(const int *strides,
                             const Padding padding_type,
                             const std::vector<int> &paddings,
                             const int *dilations,
                             const ActivationType activation,
                             const float relux_max_limit)
      : strides_(strides),
        padding_type_(padding_type),
        paddings_(paddings),
        dilations_(dilations),
        activation_(activation),
        relux_max_limit_(relux_max_limit) {}

  const int *strides_;    // [stride_h, stride_w]
  const Padding padding_type_;
  std::vector<int> paddings_;
  const int *dilations_;  // [dilation_h, dilation_w]
  const ActivationType activation_;
  const float relux_max_limit_;
};

template <DeviceType D, typename T>
struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
  DepthwiseConv2dFunctor(const int *strides,
                         const Padding padding_type,
                         const std::vector<int> &paddings,
                         const int *dilations,
                         const ActivationType activation,
                         const float relux_max_limit)
      : DepthwiseConv2dFunctorBase(strides,
                                   padding_type,
                                   paddings,
                                   dilations,
                                   activation,
                                   relux_max_limit) {}

  void operator()(const Tensor *input,   // NHWC
                  const Tensor *filter,  // HWIM
                  const Tensor *bias,    // O
                  Tensor *output,
                  StatsFuture *future) {
    MACE_CHECK_NOTNULL(input);
    MACE_CHECK_NOTNULL(filter);
    MACE_CHECK_NOTNULL(output);

    // Create a fake conv_2d filter to calculate the paddings and output size
    std::vector<index_t> fake_filter_shape(4);
    fake_filter_shape[0] = filter->shape()[0];
    fake_filter_shape[1] = filter->shape()[1];
    fake_filter_shape[2] = filter->shape()[2] * filter->shape()[3];
    fake_filter_shape[3] = 1;

    std::vector<index_t> output_shape(4);
    std::vector<int> paddings(2);
    if (paddings_.empty()) {
      kernels::CalcNHWCPaddingAndOutputSize(
          input->shape().data(), fake_filter_shape.data(), dilations_,
          strides_, padding_type_, output_shape.data(), paddings.data());
    } else {
      paddings = paddings_;
      CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
                     paddings_.data(), dilations_, strides_, RoundType::FLOOR,
                     output_shape.data());
    }
    output->Resize(output_shape);

    index_t batch = output->dim(0);
    index_t height = output->dim(1);
    index_t width = output->dim(2);
    index_t channels = output->dim(3);

    index_t input_batch = input->dim(0);
    index_t input_height = input->dim(1);
    index_t input_width = input->dim(2);
    index_t input_channels = input->dim(3);

    index_t kernel_h = filter->dim(0);
    index_t kernel_w = filter->dim(1);
    index_t multiplier = filter->dim(3);
    MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2),
               "!=", input_channels);
    MACE_CHECK(channels == input_channels * multiplier);

    int stride_h = strides_[0];
    int stride_w = strides_[1];

    int dilation_h = dilations_[0];
    int dilation_w = dilations_[1];

    MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
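    // Output position (h, w) reads input rows
    //   padded_h_start + h * stride_h + kh * dilation_h
    // and the analogous columns, where padded_h_start/padded_w_start are
    // negative by exactly the top/left padding, so interior outputs never
    // index outside the real input.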
    // The upper-left corner offset of the padded input
    int paddings_top = paddings[0] / 2;
    int paddings_bottom = paddings[0] - paddings_top;
    int paddings_left = paddings[1] / 2;
    int paddings_right = paddings[1] - paddings_left;

    int padded_h_start = 0 - paddings_top;
    int padded_w_start = 0 - paddings_left;
    index_t padded_h_stop = input_height + paddings_bottom;
    index_t padded_w_stop = input_width + paddings_right;

    Tensor::MappingGuard input_mapper(input);
    Tensor::MappingGuard filter_mapper(filter);
    Tensor::MappingGuard bias_mapper(bias);
    Tensor::MappingGuard output_mapper(output);
    const T *input_ptr = input->data<T>();
    const T *filter_ptr = filter->data<T>();
    const T *bias_ptr = bias == nullptr ? nullptr : bias->data<T>();
    T *output_ptr = output->mutable_data<T>();

    int valid_h_start =
        paddings_top == 0 ? 0 : (paddings_top - 1) / stride_h + 1;
    int valid_h_stop = paddings_bottom == 0
                           ? height
                           : height - ((paddings_bottom - 1) / stride_h + 1);
    int valid_w_start =
        paddings_left == 0 ? 0 : (paddings_left - 1) / stride_w + 1;
    int valid_w_stop = paddings_right == 0
                           ? width
                           : width - ((paddings_right - 1) / stride_w + 1);

    // Calculate border elements with out-of-boundary checking
    if (valid_h_start > 0) {
      DepthwiseConv2dKernel<T>(
          input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
          channels, input_height, input_width, input_channels, multiplier,
          padded_h_start, padded_h_stop, padded_w_start, padded_w_stop,
          kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w, 0,
          valid_h_start, 0, width);
    }
    if (valid_h_stop < height) {
      DepthwiseConv2dKernel<T>(
          input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
          channels, input_height, input_width, input_channels, multiplier,
          padded_h_start, padded_h_stop, padded_w_start, padded_w_stop,
          kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w,
          std::max(valid_h_start, valid_h_stop), height, 0, width);
    }
    if (valid_w_start > 0) {
      DepthwiseConv2dKernel<T>(
          input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
          channels, input_height, input_width, input_channels, multiplier,
          padded_h_start, padded_h_stop, padded_w_start, padded_w_stop,
          kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w,
          valid_h_start, valid_h_stop, 0, valid_w_start);
    }
    if (valid_w_stop < width) {
      DepthwiseConv2dKernel<T>(
          input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
          channels, input_height, input_width, input_channels, multiplier,
          padded_h_start, padded_h_stop, padded_w_start, padded_w_stop,
          kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w,
          valid_h_start, valid_h_stop, std::max(valid_w_start, valid_w_stop),
          width);
    }

    // Calculate the inner elements without out-of-boundary checking
    DepthwiseConv2dNoOOBCheckKernel<T>(
        input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
        channels, input_height, input_width, input_channels, multiplier,
        padded_h_start, padded_h_stop, padded_w_start, padded_w_stop, kernel_h,
        kernel_w, stride_h, stride_w, dilation_h, dilation_w, valid_h_start,
        valid_h_stop, valid_w_start, valid_w_stop);

    output_ptr = output->mutable_data<T>();
    DoActivation(output_ptr, output_ptr, output->size(), activation_,
                 relux_max_limit_);
  }
};

template <>
struct DepthwiseConv2dFunctor<DeviceType::NEON, float>
    : DepthwiseConv2dFunctorBase {
  DepthwiseConv2dFunctor(const int *strides,
                         const Padding padding_type,
                         const std::vector<int> &paddings,
                         const int *dilations,
                         const ActivationType activation,
                         const float relux_max_limit)
      : DepthwiseConv2dFunctorBase(strides,
                                   padding_type,
                                   paddings,
                                   dilations,
                                   activation,
                                   relux_max_limit) {}

  // Declaration only: the NEON implementation lives in a separate
  // translation unit.
  void operator()(const Tensor *input,
                  const Tensor *filter,
                  const Tensor *bias,
                  Tensor *output,
                  StatsFuture *future);
};
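// Usage sketch for the generic CPU path (illustrative only; the tensor
// setup and the Padding/ActivationType values chosen here are assumptions
// made for the example, not prescribed by this header):
//
//   int strides[2] = {1, 1};
//   int dilations[2] = {1, 1};
//   DepthwiseConv2dFunctor<DeviceType::CPU, float> depthwise_conv(
//       strides, Padding::SAME, {}, dilations, ActivationType::NOOP, 0.0f);
//   depthwise_conv(input, filter, bias, output, future);  // NHWC in/out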
template <typename T>
struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
    : DepthwiseConv2dFunctorBase {
  DepthwiseConv2dFunctor(const int *strides,
                         const Padding padding_type,
                         const std::vector<int> &paddings,
                         const int *dilations,
                         const ActivationType activation,
                         const float relux_max_limit)
      : DepthwiseConv2dFunctorBase(strides,
                                   padding_type,
                                   paddings,
                                   dilations,
                                   activation,
                                   relux_max_limit) {}

  void operator()(const Tensor *input,
                  const Tensor *filter,
                  const Tensor *bias,
                  Tensor *output,
                  StatsFuture *future);

  cl::Kernel kernel_;
  uint32_t kwg_size_;
  std::unique_ptr<BufferBase> kernel_error_;
  std::vector<index_t> input_shape_;
};

}  // namespace kernels
}  // namespace mace

#endif  // MACE_KERNELS_DEPTHWISE_CONV2D_H_