From 60695ebdef71b92ae39deff2a1823fc5b77c9f71 Mon Sep 17 00:00:00 2001 From: Bin Li Date: Mon, 21 May 2018 17:22:14 +0800 Subject: [PATCH] Optimize conv1x15 conv15x1 --- mace/kernels/arm/conv_2d_neon.h | 12 ++ mace/kernels/arm/conv_2d_neon_15x1.cc | 163 ++++++++++++++++++++++++++ mace/kernels/arm/conv_2d_neon_1x15.cc | 149 +++++++++++++++++++++++ mace/kernels/conv_2d.h | 50 +++++--- mace/ops/conv_2d_benchmark.cc | 6 +- mace/ops/conv_2d_test.cc | 6 + 6 files changed, 371 insertions(+), 15 deletions(-) create mode 100644 mace/kernels/arm/conv_2d_neon_15x1.cc create mode 100644 mace/kernels/arm/conv_2d_neon_1x15.cc diff --git a/mace/kernels/arm/conv_2d_neon.h b/mace/kernels/arm/conv_2d_neon.h index 5d2d5f9a..b35429ba 100644 --- a/mace/kernels/arm/conv_2d_neon.h +++ b/mace/kernels/arm/conv_2d_neon.h @@ -65,6 +65,18 @@ extern void Conv2dNeonK7x7S3(const float *input, const index_t *out_shape, float *output); +extern void Conv2dNeonK1x15S1(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output); + +extern void Conv2dNeonK15x1S1(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output); + } // namespace kernels } // namespace mace diff --git a/mace/kernels/arm/conv_2d_neon_15x1.cc b/mace/kernels/arm/conv_2d_neon_15x1.cc new file mode 100644 index 00000000..80dda314 --- /dev/null +++ b/mace/kernels/arm/conv_2d_neon_15x1.cc @@ -0,0 +1,163 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if defined(MACE_ENABLE_NEON) +#include +#endif + +#include "mace/kernels/arm/conv_2d_neon.h" +#include "mace/utils/utils.h" + +namespace mace { +namespace kernels { + +inline void Conv2dCPUK15x1Calc(const float *in_ptr, + const float *filter_ptr, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t out_width, + const index_t w, + const index_t tile_width, + const index_t out_image_size, + float *out_ptr, + const index_t io, + const int stride) { + for (index_t ih = 0; ih < out_height; ++ih) { + for (index_t iw = 0; iw < tile_width && w + iw < out_width; ++iw) { + for (int i = 0; i < 15; ++i) { + for (int j = 0; j < 1; ++j) { + out_ptr[io * out_image_size + ih * out_width + w + iw] + += in_ptr[(ih * stride + i) * in_width + ((w + iw) * stride + j)] + * filter_ptr[io * in_channels * 15 + i * 1 + j]; + } + } + } + } +} + + +// Ho = 4, Wo = 1, Co = 1 +void Conv2dNeonK15x1S1(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + const index_t tile_width = + out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3]; + +#pragma omp parallel for collapse(3) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { + for (index_t w = 0; w < out_shape[3]; w += tile_width) { + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + float *out_ptr_base = + output + b * out_batch_size + m * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter + m * in_channels * 15 + c * 15; +#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + /* load filter (1 outch x 1 height x 4 width) */ + float32x4_t vf0, vf1, vf2, vf3; + vf0 = vld1q_f32(filter_ptr); + vf1 = vld1q_f32(filter_ptr + 4); + vf2 = vld1q_f32(filter_ptr + 8); + vf3 = vld1q_f32(filter_ptr + 11); + + for (index_t h = 0; h + 3 < out_height; h += 4) { + for (index_t wt = 0; wt < tile_width && w + wt < out_width; ++wt) { + // load output + index_t out_offset = h * out_width + w + wt; + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo = {out_ptr_base[out_offset], + out_ptr_base[out_offset + out_width], + out_ptr_base[out_offset + 2 * out_width], + out_ptr_base[out_offset + 3 * out_width]}; + + // input offset + index_t in_offset = h * in_width + w + wt; + // input (3 slide) + float32x4_t vi0 = {in_ptr_base[in_offset], + in_ptr_base[in_offset + in_width], + in_ptr_base[in_offset + 2 * in_width], + in_ptr_base[in_offset + 3 * in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], + in_ptr_base[in_offset + 5 * in_width], + in_ptr_base[in_offset + 6 * in_width], + in_ptr_base[in_offset + 7 * in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], + in_ptr_base[in_offset + 9 * in_width], + in_ptr_base[in_offset + 10 * in_width], + in_ptr_base[in_offset + 11 * in_width]}; + float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width], + in_ptr_base[in_offset + 13 * in_width], + in_ptr_base[in_offset + 14 * in_width], + in_ptr_base[in_offset + 15 * in_width]}; + float32x4_t vi16 = {in_ptr_base[in_offset + 16 * in_width], + in_ptr_base[in_offset + 17 * in_width]}; + float32x4_t vi1 = vextq_f32(vi0, vi4, 1); + float32x4_t vi2 = vextq_f32(vi0, vi4, 2); + float32x4_t vi3 = vextq_f32(vi0, vi4, 3); + float32x4_t vi5 = vextq_f32(vi4, vi8, 1); + float32x4_t vi6 = vextq_f32(vi4, vi8, 2); + float32x4_t vi7 = vextq_f32(vi4, vi8, 3); + float32x4_t vi9 = vextq_f32(vi8, vi12, 1); + float32x4_t vi10 = vextq_f32(vi8, vi12, 2); + float32x4_t vi11 = vextq_f32(vi8, vi12, 3); + float32x4_t vi13 = vextq_f32(vi12, vi16, 1); + float32x4_t vi14 = vextq_f32(vi12, vi16, 2); + + vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); + vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); + vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); + + out_ptr_base[out_offset] = vo[0]; + out_ptr_base[out_offset + out_width] = vo[1]; + out_ptr_base[out_offset + 2 * out_width] = vo[2]; + out_ptr_base[out_offset + 3 * out_width] = vo[3]; + } // wt + } // h +#else + Conv2dCPUK15x1Calc(in_ptr_base, filter_ptr, in_width, in_channels, + out_height, out_width, w, tile_width, + out_image_size, out_ptr_base, 0, 1); +#endif + } // c + } // w + } // m + } // b +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/arm/conv_2d_neon_1x15.cc b/mace/kernels/arm/conv_2d_neon_1x15.cc new file mode 100644 index 00000000..0dd39fba --- /dev/null +++ b/mace/kernels/arm/conv_2d_neon_1x15.cc @@ -0,0 +1,149 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if defined(MACE_ENABLE_NEON) +#include +#endif + +#include "mace/kernels/arm/conv_2d_neon.h" +#include "mace/utils/utils.h" +#include "mace/utils/logging.h" + +namespace mace { +namespace kernels { + +inline void Conv2dCPUK1x15Calc(const float *in_ptr, + const float *filter_ptr, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t h, + const index_t tile_height, + const index_t out_width, + const index_t out_image_size, + float *out_ptr, + const index_t io, + const int stride) { + for (index_t ih = 0; ih < tile_height && h + ih < out_height; ++ih) { + for (index_t iw = 0; iw < out_width; ++iw) { + for (int i = 0; i < 1; ++i) { + for (int j = 0; j < 15; ++j) { + out_ptr[io * out_image_size + (h + ih) * out_width + iw] + += in_ptr[((h + ih) * stride + i) * in_width + (iw * stride + j)] + * filter_ptr[io * in_channels * 15 + i * 15 + j]; + } + } + } + } +} + + +// Ho = 1, Wo = 4, Co = 1 +void Conv2dNeonK1x15S1(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + const index_t tile_height = + out_shape[1] < 4 ? RoundUpDiv4(out_shape[2]) : out_shape[2]; + +#pragma omp parallel for collapse(3) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { + for (index_t h = 0; h < out_shape[2]; h += tile_height) { + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + float *out_ptr_base = + output + b * out_batch_size + m * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter + m * in_channels * 15 + c * 15; +#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf0, vf1, vf2, vf3; + vf0 = vld1q_f32(filter_ptr); + vf1 = vld1q_f32(filter_ptr + 4); + vf2 = vld1q_f32(filter_ptr + 8); + vf3 = vld1q_f32(filter_ptr + 11); + + for (index_t ht = 0; ht < tile_height && h + ht < out_height; ++ht) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo; + // load output + index_t out_offset = (h + ht) * out_width + w; + vo = vld1q_f32(out_ptr_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9, + vi10, vi11, vi12, vi13, vi14, vi16; + // input offset + index_t in_offset = (h + ht) * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi12 = vld1q_f32(in_ptr_base + in_offset + 12); + vi16 = vld1q_f32(in_ptr_base + in_offset + 16); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + vi7 = vextq_f32(vi4, vi8, 3); + vi9 = vextq_f32(vi8, vi12, 1); + vi10 = vextq_f32(vi8, vi12, 2); + vi11 = vextq_f32(vi8, vi12, 3); + vi13 = vextq_f32(vi12, vi16, 1); + vi14 = vextq_f32(vi12, vi16, 2); + + vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); + vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); + vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); + + vst1q_f32(out_ptr_base + out_offset, vo); + } // w + } // ht +#else + Conv2dCPUK1x15Calc(in_ptr_base, filter_ptr, in_width, in_channels, + out_height, h, tile_height, out_width, + out_image_size, out_ptr_base, 0, 1); +#endif + } // c + } // h + } // m + } // b +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index eb374960..7a0b8328 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -363,6 +363,10 @@ struct Conv2dFunctor : Conv2dFunctorBase { && stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1; bool use_neon_7x7_s3 = filter_h == 7 && filter_w == 7 && stride_h == 3 && stride_w == 3 && dilation_h == 1 && dilation_w == 1; + bool use_neon_1x15_s1 = filter_h == 1 && filter_w == 15 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1; + bool use_neon_15x1_s1 = filter_h == 15 && filter_w == 1 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1; std::vector transformed_input_shape; std::vector transformed_output_shape; @@ -402,24 +406,26 @@ struct Conv2dFunctor : Conv2dFunctorBase { tile_count}); transformed_filter_shape.insert(transformed_filter_shape.end(), {in_tile_area, channels, input_channels}); - } else if (use_neon_3x3_s1) { - extra_output_height = RoundUp(height, 2); - extra_input_height = - std::max(padded_input_height, extra_output_height + 2); - extra_output_width = RoundUp(width, 4); - extra_input_width = std::max(padded_input_width, extra_output_width + 2); - if (extra_input_height != padded_input_height) { - pad_bottom += (extra_input_height - padded_input_height); - } - if (extra_input_width != padded_input_width) { - pad_right += (extra_input_width - padded_input_width); + } else { + index_t tile_h, tile_w; + if (use_neon_1x1_s1) { + tile_h = 1; + tile_w = 1; + } else if (use_neon_3x3_s1) { + tile_h = 2; + tile_w = 4; + } else if (use_neon_15x1_s1) { + tile_h = 4; + tile_w = 1; + } else { + tile_h = 1; + tile_w = 4; } - } else if (!use_neon_1x1_s1) { - extra_output_height = height; + extra_output_height = RoundUp(height, tile_h); extra_input_height = std::max(padded_input_height, (extra_output_height - 1) * stride_h + (filter_h - 1) * dilation_h + 1); - extra_output_width = RoundUp(width, 4); + extra_output_width = RoundUp(width, tile_w); extra_input_width = std::max(padded_input_width, (extra_output_width - 1) * stride_w + (filter_w - 1) * dilation_w + 1); @@ -584,6 +590,22 @@ struct Conv2dFunctor : Conv2dFunctorBase { extra_output_shape, pad_output); }; + } else if (use_neon_1x15_s1) { + conv_func = [=](const float *pad_input, float *pad_output) { + Conv2dNeonK1x15S1(pad_input, + filter_data, + extra_input_shape, + extra_output_shape, + pad_output); + }; + } else if (use_neon_15x1_s1) { + conv_func = [=](const float *pad_input, float *pad_output) { + Conv2dNeonK15x1S1(pad_input, + filter_data, + extra_input_shape, + extra_output_shape, + pad_output); + }; } else { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dGeneral(pad_input, diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index b795e127..d935c503 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -165,10 +165,14 @@ BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 4, VALID, 32); BM_CONV_2D(1, 128, 56, 56, 1, 1, 1, 1, SAME, 128); BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, 1, SAME, 1024); - BM_CONV_2D(64, 32, 34, 34, 3, 3, 1, 1, VALID, 32); BM_CONV_2D(1, 32, 34, 34, 3, 3, 1, 1, VALID, 32); +// bokeh +BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2); +BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2); +BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2); + } // namespace test } // namespace ops } // namespace mace diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index ea50b0c1..543e2ac9 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -779,11 +779,17 @@ TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) { TestHalfComplexConvNxNS12({32, 32}, {15, 1, 256, 2}, {1, 1}); + TestHalfComplexConvNxNS12({64, 64}, {15, 1, 64, 2}, + {1, 1}); + TestHalfComplexConvNxNS12({256, 256}, {15, 1, 32, 2}, + {1, 1}); } TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) { TestHalfComplexConvNxNS12({32, 32}, {1, 15, 256, 2}, {1, 1}); + TestHalfComplexConvNxNS12({256, 256}, {1, 15, 32, 2}, + {1, 1}); } TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) { -- GitLab