diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h
index c2b383013f220cbf8a309a94248ad9c4d24d4b04..d87999f1a1b7defc0caa6280580a7c5704f19587 100644
--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -5,6 +5,10 @@
 #ifndef MACE_KERNELS_DEPTHWISE_CONV2D_H_
 #define MACE_KERNELS_DEPTHWISE_CONV2D_H_
 
+#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
+#include <arm_neon.h>
+#endif
+
 #include "mace/core/common.h"
 #include "mace/core/future.h"
 #include "mace/core/public/mace.h"
@@ -64,7 +68,9 @@ void DepthwiseConv2dKernel(const T *input_ptr,
         inw >= input_width) {
       MACE_CHECK(inh >= padded_h_start && inh < padded_h_stop &&
                      inw >= padded_w_start && inw < padded_w_stop,
-                 "Out of range read from input: ", inh, ", ", inw);
+                 "Out of range read from input: ", padded_h_start,
+                 " <= ", inh, " < ", padded_h_stop, ", ",
+                 padded_w_start, " <= ", inw, " < ", padded_w_stop);
     } else {
       index_t input_offset =
           n * input_height * input_width * input_channels +
@@ -108,33 +114,120 @@ void DepthwiseConv2dNoOOBCheckKernel(const T *input_ptr,
                                      int h_stop,
                                      int w_start,
                                      int w_stop) {
+  if (multiplier == 1) {
+    constexpr int c_tile_size = 4;
+
+#pragma omp parallel for collapse(3)
+    for (int n = 0; n < batch; ++n) {
+      for (int h = h_start; h < h_stop; ++h) {
+        for (int w = w_start; w < w_stop; ++w) {
+          int c;
+          for (c = 0; c + c_tile_size <= channels; c += c_tile_size) {
+#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
+            static_assert(c_tile_size == 4, "channels tile size must be 4");
+            float32x4_t sum = vdupq_n_f32(0);
+            if (bias_ptr != nullptr) {
+              sum = vld1q_f32(bias_ptr + c);
+            }
+#else
+            T sum[c_tile_size] = {0};
+            if (bias_ptr != nullptr) {
+              for (int ci = 0; ci < c_tile_size; ++ci) {
+                sum[ci] = bias_ptr[c + ci];
+              }
+            }
+#endif
+            const T *filter_base = filter_ptr + c;
+            for (int kh = 0; kh < kernel_h; ++kh) {
+              for (int kw = 0; kw < kernel_w; ++kw) {
+                int inh = padded_h_start + h * stride_h + dilation_h * kh;
+                int inw = padded_w_start
+ w * stride_w + dilation_w * kw; + MACE_ASSERT(inh >= 0 && inh < input_height && inw >= 0 && + inw < input_width); + index_t input_offset = + n * input_height * input_width * input_channels + + inh * input_width * input_channels + inw * input_channels + + c; +#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) + float32x4_t in = vld1q_f32(input_ptr + input_offset); + float32x4_t weights = vld1q_f32(filter_base); + sum = vfmaq_f32(sum, in, weights); +#else + for (int ci = 0; ci < c_tile_size; ++ci) { + sum[ci] += + input_ptr[input_offset + ci] * filter_base[ci]; // HWIM + } +#endif + filter_base += input_channels; + } + } + + index_t offset = n * height * width * channels + + h * width * channels + w * channels + c; +#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) + vst1q_f32(output_ptr + offset, sum); +#else + for (int ci = 0; ci < c_tile_size; ++ci) { + output_ptr[offset + ci] = sum[ci]; + } +#endif + } + for (; c < channels; ++c) { + T bias_channel = bias_ptr ? bias_ptr[c] : 0; + index_t offset = n * height * width * channels + + h * width * channels + w * channels + c; + output_ptr[offset] = bias_channel; + T sum = 0; + const T *filter_base = filter_ptr + c; + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int inh = padded_h_start + h * stride_h + dilation_h * kh; + int inw = padded_w_start + w * stride_w + dilation_w * kw; + MACE_ASSERT(inh >= 0 && inh < input_height && inw >= 0 && + inw < input_width); + index_t input_offset = + n * input_height * input_width * input_channels + + inh * input_width * input_channels + inw * input_channels + + c; + sum += input_ptr[input_offset] * filter_base[0]; // HWIM + filter_base += input_channels * multiplier; + } + } + output_ptr[offset] += sum; + } + } + } + } + } else { #pragma omp parallel for collapse(4) - for (int n = 0; n < batch; ++n) { - for (int h = h_start; h < h_stop; ++h) { - for (int w = w_start; w < w_stop; ++w) { - for (int c = 0; c < channels; ++c) { - const 
index_t inc = c / multiplier; - const index_t m = c % multiplier; - T bias_channel = bias_ptr ? bias_ptr[c] : 0; - index_t offset = n * height * width * channels + - h * width * channels + w * channels + c; - output_ptr[offset] = bias_channel; - T sum = 0; - const T *filter_base = filter_ptr + inc * multiplier + m; - for (int kh = 0; kh < kernel_h; ++kh) { - for (int kw = 0; kw < kernel_w; ++kw) { - int inh = padded_h_start + h * stride_h + dilation_h * kh; - int inw = padded_w_start + w * stride_w + dilation_w * kw; - index_t input_offset = - n * input_height * input_width * input_channels + - inh * input_width * input_channels + inw * input_channels + - inc; - // TODO vectorize this - sum += input_ptr[input_offset] * filter_base[0]; // HWIM - filter_base += input_channels * multiplier; + for (int n = 0; n < batch; ++n) { + for (int h = h_start; h < h_stop; ++h) { + for (int w = w_start; w < w_stop; ++w) { + for (int c = 0; c < channels; ++c) { + const index_t inc = c / multiplier; + const index_t m = c % multiplier; + T bias_channel = bias_ptr ? 
bias_ptr[c] : 0; + index_t offset = n * height * width * channels + + h * width * channels + w * channels + c; + output_ptr[offset] = bias_channel; + T sum = 0; + const T *filter_base = filter_ptr + inc * multiplier + m; + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int inh = padded_h_start + h * stride_h + dilation_h * kh; + int inw = padded_w_start + w * stride_w + dilation_w * kw; + MACE_ASSERT(inh >= 0 && inh < input_height && inw >= 0 && + inw < input_width); + index_t input_offset = + n * input_height * input_width * input_channels + + inh * input_width * input_channels + inw * input_channels + + inc; + sum += input_ptr[input_offset] * filter_base[0]; // HWIM + filter_base += input_channels * multiplier; + } } + output_ptr[offset] += sum; } - output_ptr[offset] += sum; } } } @@ -230,10 +323,15 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); // The left-upper most offset of the padded input - int padded_h_start = 0 - paddings[0] / 2; - int padded_w_start = 0 - paddings[1] / 2; - index_t padded_h_stop = input_height + paddings[0] - paddings[0] / 2; - index_t padded_w_stop = input_width + paddings[1] - paddings[1] / 2; + int paddings_top = paddings[0] / 2; + int paddings_bottom = paddings[0] - paddings_top; + int paddings_left = paddings[1] / 2; + int paddings_right = paddings[1] - paddings_left; + + int padded_h_start = 0 - paddings_top; + int padded_w_start = 0 - paddings_left; + index_t padded_h_stop = input_height + paddings_bottom; + index_t padded_w_stop = input_width + paddings_right; Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard filter_mapper(filter); @@ -244,38 +342,59 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { const T *bias_ptr = bias == nullptr ? nullptr : bias->data(); T *output_ptr = output->mutable_data(); + int valid_h_start = + paddings_top == 0 ? 
0 : (paddings_top - 1) / stride_h + 1; + int valid_h_stop = paddings_bottom == 0 + ? height + : height - ((paddings_bottom - 1) / stride_h + 1); + int valid_w_start = + paddings_left == 0 ? 0 : (paddings_left - 1) / stride_w + 1; + int valid_w_stop = paddings_right == 0 + ? width + : width - ((paddings_right - 1) / stride_w + 1); + // Calculate border elements with out-of-boundary checking - DepthwiseConv2dKernel( - input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width, - channels, input_height, input_width, input_channels, multiplier, - padded_h_start, padded_h_stop, padded_w_start, padded_w_stop, kernel_h, - kernel_w, stride_h, stride_w, dilation_h, dilation_w, 0, 1, 0, width); - DepthwiseConv2dKernel(input_ptr, filter_ptr, bias_ptr, output_ptr, batch, - height, width, channels, input_height, input_width, - input_channels, multiplier, padded_h_start, - padded_h_stop, padded_w_start, padded_w_stop, - kernel_h, kernel_w, stride_h, stride_w, dilation_h, - dilation_w, height - 1, height, 0, width); - DepthwiseConv2dKernel(input_ptr, filter_ptr, bias_ptr, output_ptr, batch, - height, width, channels, input_height, input_width, - input_channels, multiplier, padded_h_start, - padded_h_stop, padded_w_start, padded_w_stop, - kernel_h, kernel_w, stride_h, stride_w, dilation_h, - dilation_w, 1, height - 1, 0, 1); - DepthwiseConv2dKernel(input_ptr, filter_ptr, bias_ptr, output_ptr, batch, - height, width, channels, input_height, input_width, - input_channels, multiplier, padded_h_start, - padded_h_stop, padded_w_start, padded_w_stop, - kernel_h, kernel_w, stride_h, stride_w, dilation_h, - dilation_w, 1, height - 1, width - 1, width); + if (valid_h_start > 0) { + DepthwiseConv2dKernel( + input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width, + channels, input_height, input_width, input_channels, multiplier, + padded_h_start, padded_h_stop, padded_w_start, padded_w_stop, + kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w, 0, + 
valid_h_start, 0, width); + } + if (valid_h_stop < height) { + DepthwiseConv2dKernel( + input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width, + channels, input_height, input_width, input_channels, multiplier, + padded_h_start, padded_h_stop, padded_w_start, padded_w_stop, + kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w, + std::max(valid_h_start, valid_h_stop), height, 0, width); + } + if (valid_w_start > 0) { + DepthwiseConv2dKernel( + input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width, + channels, input_height, input_width, input_channels, multiplier, + padded_h_start, padded_h_stop, padded_w_start, padded_w_stop, + kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w, + valid_h_start, valid_h_stop, 0, valid_w_start); + } + if (valid_w_stop < width) { + DepthwiseConv2dKernel( + input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width, + channels, input_height, input_width, input_channels, multiplier, + padded_h_start, padded_h_stop, padded_w_start, padded_w_stop, + kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w, + valid_h_start, valid_h_stop, std::max(valid_w_start, valid_w_stop), + width); + } // Calculate border elements without out-of-boundary checking DepthwiseConv2dNoOOBCheckKernel( input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width, channels, input_height, input_width, input_channels, multiplier, padded_h_start, padded_h_stop, padded_w_start, padded_w_stop, kernel_h, - kernel_w, stride_h, stride_w, dilation_h, dilation_w, 1, height - 1, 1, - width - 1); + kernel_w, stride_h, stride_w, dilation_h, dilation_w, valid_h_start, + valid_h_stop, valid_w_start, valid_w_stop); output_ptr = output->mutable_data(); DoActivation(output_ptr, output_ptr, output->NumElements(), activation_, diff --git a/mace/ops/depthwise_conv_2d_benchmark.cc b/mace/ops/depthwise_conv2d_benchmark.cc similarity index 82% rename from mace/ops/depthwise_conv_2d_benchmark.cc rename to 
mace/ops/depthwise_conv2d_benchmark.cc
index 2971130c3fa9dc92ede22e1fa41dc480d81bbeb8..2f58343ada3665017e197fa171e323348504a706 100644
--- a/mace/ops/depthwise_conv_2d_benchmark.cc
+++ b/mace/ops/depthwise_conv2d_benchmark.cc
@@ -75,25 +75,27 @@ static void DepthwiseConv2d(int iters,
   }
 }
 
-#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \
-                                   DEVICE)                                  \
-  static void                                                               \
-      BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
-          int iters) {                                                      \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;        \
-    mace::testing::ItemsProcessed(tot);                                     \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                     \
-    DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE,        \
-                                  mace::Padding::P, OC);                    \
-  }                                                                         \
-  BENCHMARK(                                                                \
-      BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
+#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \
+                                   DEVICE)                                  \
+  static void                                                               \
+      BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
+          int iters) {                                                      \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;        \
+    mace::testing::ItemsProcessed(tot);                                     \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                     \
+    DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE,        \
+                                  mace::Padding::P, OC);                    \
+  }                                                                         \
+  BENCHMARK(                                                                \
+      BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
 
 #define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC)                 \
   BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, CPU);    \
   BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, OPENCL); \
   BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, half, OPENCL);
 
+BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
+BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, SAME, 1);
 BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1);
 BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1);
 BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1);
diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc
index 2ec149e6688edbf2d58bdabf4a36f566c1dc5ecf..c5ff2713d73795421e159c2ad9c7f20e9869d8dc 100644
--- a/mace/ops/depthwise_conv2d_test.cc
+++ b/mace/ops/depthwise_conv2d_test.cc
@@ -280,10 +280,10 @@ void TestNxNS12(const index_t height, const index_t width) {
     ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 0.1);
   };
 
-  for (int kernel_size : {3}) {
+  for (int kernel_size : {2, 3, 4}) {
     for (int stride : {1, 2}) {
       func(kernel_size, kernel_size, stride, stride, VALID);
-      //func(kernel_size, kernel_size, stride, stride, SAME);
+      func(kernel_size, kernel_size, stride, stride, SAME);
     }
   }
 }