From e477fde9d55c8c8e6188997f4f2ea67b08e980d4 Mon Sep 17 00:00:00 2001
From: liyin
Date: Wed, 20 Mar 2019 16:51:03 +0800
Subject: [PATCH] Support padding for 1x1

---
 mace/core/buffer.h                          |   2 +-
 mace/ops/arm/activation_neon.cc             |   2 +-
 mace/ops/arm/activation_neon.h              |   2 +-
 mace/ops/arm/common_neon.h                  |   2 +-
 mace/ops/arm/conv_2d_neon.h                 | 109 ---
 mace/ops/arm/conv_2d_neon_15x1.cc           | 161 ----
 mace/ops/arm/conv_2d_neon_1x15.cc           | 147 ----
 mace/ops/arm/conv_2d_neon_1x7.cc            | 251 ------
 mace/ops/arm/conv_2d_neon_7x1.cc            | 291 -------
 mace/ops/arm/deconv_2d_neon.h               |   2 +-
 mace/ops/arm/deconv_2d_neon_2x2.cc          |   2 +-
 mace/ops/arm/deconv_2d_neon_3x3.cc          |   2 +-
 mace/ops/arm/deconv_2d_neon_4x4.cc          |   2 +-
 mace/ops/arm/depthwise_conv2d_neon.h        |   2 +-
 mace/ops/arm/depthwise_conv2d_neon_3x3.cc   |   2 +-
 mace/ops/arm/depthwise_deconv2d_neon.h      |   2 +-
 mace/ops/arm/depthwise_deconv2d_neon_3x3.cc |   2 +-
 mace/ops/arm/depthwise_deconv2d_neon_4x4.cc |   2 +-
 mace/ops/arm/fp32/conv_2d.cc                | 247 ++++++
 mace/ops/arm/fp32/conv_2d.h                 |  46 +-
 mace/ops/arm/fp32/conv_2d_1x1.cc            |  63 +-
 mace/ops/arm/fp32/conv_2d_1x1.h             |   6 +-
 mace/ops/arm/fp32/conv_2d_1xn.cc            | 821 ++++++++++++++++++
 mace/ops/arm/fp32/conv_2d_1xn.h             |  86 ++
 .../conv_2d_3x3.cc}                         | 172 ++--
 mace/ops/arm/fp32/conv_2d_3x3.h             |  60 ++
 mace/ops/arm/fp32/conv_2d_3x3_winograd.cc   | 111 +--
 mace/ops/arm/fp32/conv_2d_3x3_winograd.h    |  18 +-
 .../conv_2d_5x5.cc}                         |  98 ++-
 mace/ops/arm/fp32/conv_2d_5x5.h             |  48 +
 .../conv_2d_7x7.cc}                         | 267 +++---
 mace/ops/arm/fp32/conv_2d_7x7.h             |  73 ++
 mace/ops/arm/fp32/conv_general.cc           | 232 +++++
 mace/ops/arm/fp32/conv_general.h            |  50 ++
 mace/ops/arm/fp32/gemm.h                    |   2 +-
 mace/ops/arm/fp32/gemv.h                    |   2 +-
 mace/ops/arm/q8/eltwise.h                   |   2 +-
 mace/ops/arm/q8/gemv.h                      |   2 +-
 mace/ops/conv_2d.cc                         | 590 ++-----------
 mace/ops/ref/conv_2d.cc                     |  52 +-
 mace/ops/ref/conv_2d.h                      |  48 +-
 .../tools/converter_tool/transformer.py     |   3 +-
 42 files changed, 2275 insertions(+), 1809 deletions(-)
 delete mode 100644 mace/ops/arm/conv_2d_neon.h
 delete mode 100644 mace/ops/arm/conv_2d_neon_15x1.cc
 delete mode 100644 mace/ops/arm/conv_2d_neon_1x15.cc
 delete mode 100644 mace/ops/arm/conv_2d_neon_1x7.cc
 delete mode 100644 mace/ops/arm/conv_2d_neon_7x1.cc
 create mode 100644 mace/ops/arm/fp32/conv_2d.cc
 create mode 100644 mace/ops/arm/fp32/conv_2d_1xn.cc
 create mode 100644 mace/ops/arm/fp32/conv_2d_1xn.h
 rename mace/ops/arm/{conv_2d_neon_3x3.cc => fp32/conv_2d_3x3.cc} (85%)
 create mode 100644 mace/ops/arm/fp32/conv_2d_3x3.h
 rename mace/ops/arm/{conv_2d_neon_5x5.cc => fp32/conv_2d_5x5.cc} (77%)
 create mode 100644 mace/ops/arm/fp32/conv_2d_5x5.h
 rename mace/ops/arm/{conv_2d_neon_7x7.cc => fp32/conv_2d_7x7.cc} (78%)
 create mode 100644 mace/ops/arm/fp32/conv_2d_7x7.h
 create mode 100644 mace/ops/arm/fp32/conv_general.cc
 create mode 100644 mace/ops/arm/fp32/conv_general.h

diff --git a/mace/core/buffer.h b/mace/core/buffer.h
index 418da27d..d1f5f1a5 100644
--- a/mace/core/buffer.h
+++ b/mace/core/buffer.h
@@ -503,7 +503,7 @@ class ScratchBuffer: public Buffer {
   virtual ~ScratchBuffer() {}
 
   MaceStatus GrowSize(const index_t size) {
-    if (size > size_) {
+    if (offset_ + size > size_) {
       VLOG(1) << "Grow scratch size to: " << size;
       MACE_CHECK(offset_ == 0, "scratch is being used, cannot grow size");
       return Resize(size);
diff --git a/mace/ops/arm/activation_neon.cc b/mace/ops/arm/activation_neon.cc
index 6010d714..09cfd8d4 100644
--- a/mace/ops/arm/activation_neon.cc
+++ b/mace/ops/arm/activation_neon.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/activation_neon.h b/mace/ops/arm/activation_neon.h
index a61b974b..d640e689 100644
--- a/mace/ops/arm/activation_neon.h
+++ b/mace/ops/arm/activation_neon.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/common_neon.h b/mace/ops/arm/common_neon.h
index c3451ea0..8d28f558 100644
--- a/mace/ops/arm/common_neon.h
+++ b/mace/ops/arm/common_neon.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/conv_2d_neon.h b/mace/ops/arm/conv_2d_neon.h
deleted file mode 100644
index b1fbd858..00000000
--- a/mace/ops/arm/conv_2d_neon.h
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef MACE_OPS_ARM_CONV_2D_NEON_H_
-#define MACE_OPS_ARM_CONV_2D_NEON_H_
-
-#include "mace/core/types.h"
-
-namespace mace {
-namespace ops {
-
-void Conv2dNeonK3x3S1(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output);
-
-void Conv2dNeonK3x3S2(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output);
-
-void Conv2dNeonK5x5S1(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output);
-
-void Conv2dNeonK1x7S1(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output);
-
-void Conv2dNeonK7x1S1(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output);
-
-void Conv2dNeonK7x7S1(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output);
-
-void Conv2dNeonK7x7S2(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output);
-
-void Conv2dNeonK7x7S3(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output);
-
-void Conv2dNeonK1x15S1(const float *input,
-                       const float *filter,
-                       const index_t *in_shape,
-                       const index_t *out_shape,
-                       float *output);
-
-void Conv2dNeonK15x1S1(const float *input,
-                       const float *filter,
-                       const index_t *in_shape,
-                       const index_t *out_shape,
-                       float *output);
-
-// calculate one output channel and one input channel
-inline void Conv2dCPUKHxKWCalc(const float *in_ptr,
-                               const float *filter_ptr,
-                               const index_t in_width,
-                               const index_t filter_height,
-                               const index_t filter_width,
-                               const index_t out_height,
-                               const index_t out_width,
-                               float *out_ptr,
-                               const int stride) {
-  for (index_t h = 0; h < out_height; ++h) {
-    for (index_t w = 0; w < out_width; ++w) {
-      for (int i = 0; i < filter_height; ++i) {
-        for (int j = 0; j < filter_width; ++j) {
-          out_ptr[h * out_width + w] +=
-              in_ptr[(h * stride + i) * in_width + (w * stride + j)] *
-              filter_ptr[i * filter_width + j];
-        }
-      }
-    }
-  }
-}
-
-}  // namespace ops
-}  // namespace mace
-
-#endif  // MACE_OPS_ARM_CONV_2D_NEON_H_
diff --git a/mace/ops/arm/conv_2d_neon_15x1.cc b/mace/ops/arm/conv_2d_neon_15x1.cc
deleted file mode 100644
index 5cd58fcc..00000000
--- a/mace/ops/arm/conv_2d_neon_15x1.cc
+++ /dev/null
@@ -1,161 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#if defined(MACE_ENABLE_NEON)
-#include <arm_neon.h>
-#endif
-
-#include "mace/ops/arm/conv_2d_neon.h"
-#include "mace/utils/math.h"
-
-namespace mace {
-namespace ops {
-
-inline void Conv2dCPUK15x1Calc(const float *in_ptr,
-                               const float *filter_ptr,
-                               const index_t in_width,
-                               const index_t in_channels,
-                               const index_t out_height,
-                               const index_t out_width,
-                               const index_t w,
-                               const index_t tile_width,
-                               const index_t out_image_size,
-                               float *out_ptr,
-                               const index_t io,
-                               const int stride) {
-  for (index_t ih = 0; ih < out_height; ++ih) {
-    for (index_t iw = 0; iw < tile_width && w + iw < out_width; ++iw) {
-      for (int i = 0; i < 15; ++i) {
-        for (int j = 0; j < 1; ++j) {
-          out_ptr[io * out_image_size + ih * out_width + w + iw] +=
-              in_ptr[(ih * stride + i) * in_width + ((w + iw) * stride + j)] *
-              filter_ptr[io * in_channels * 15 + i * 1 + j];
-        }
-      }
-    }
-  }
-}
-
-// Ho = 4, Wo = 1, Co = 1
-void Conv2dNeonK15x1S1(const float *input,
-                       const float *filter,
-                       const index_t *in_shape,
-                       const index_t *out_shape,
-                       float *output) {
-  const index_t in_image_size = in_shape[2] * in_shape[3];
-  const index_t out_image_size = out_shape[2] * out_shape[3];
-  const index_t in_batch_size = in_shape[1] * in_image_size;
-  const index_t out_batch_size = out_shape[1] * out_image_size;
-  const index_t tile_width =
-      out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3];
-
-#pragma omp parallel for collapse(3) schedule(runtime)
-  for (index_t b = 0; b < out_shape[0]; ++b) {
-    for (index_t m = 0; m < out_shape[1]; ++m) {
-      for (index_t w = 0; w < out_shape[3]; w += tile_width) {
-        const index_t out_height = out_shape[2];
-        const index_t out_width = out_shape[3];
-        const index_t in_channels = in_shape[1];
-        const index_t in_width = in_shape[3];
-        float *out_ptr_base = output + b * out_batch_size + m * out_image_size;
-        for (index_t c = 0; c < in_channels; ++c) {
-          const float *in_ptr_base =
-              input + b * in_batch_size + c * in_image_size;
-          const float *filter_ptr = filter + m * in_channels * 15 + c * 15;
-#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
-          /* load filter (1 outch x 4 height x 1 width) */
-          float32x4_t vf0, vf1, vf2, vf3;
-          vf0 = vld1q_f32(filter_ptr);
-          vf1 = vld1q_f32(filter_ptr + 4);
-          vf2 = vld1q_f32(filter_ptr + 8);
-          vf3 = vld1q_f32(filter_ptr + 11);
-
-          for (index_t h = 0; h + 3 < out_height; h += 4) {
-            for (index_t wt = 0; wt < tile_width && w + wt < out_width; ++wt) {
-              // load output
-              index_t out_offset = h * out_width + w + wt;
-              // output (1 outch x 4 height x 1 width): vo_outch_height
-              float32x4_t vo = {out_ptr_base[out_offset],
-                                out_ptr_base[out_offset + out_width],
-                                out_ptr_base[out_offset + 2 * out_width],
-                                out_ptr_base[out_offset + 3 * out_width]};
-
-              // input offset
-              index_t in_offset = h * in_width + w + wt;
-              // input (3 slide)
-              float32x4_t vi0 = {in_ptr_base[in_offset],
-                                 in_ptr_base[in_offset + in_width],
-                                 in_ptr_base[in_offset + 2 * in_width],
-                                 in_ptr_base[in_offset + 3 * in_width]};
-              float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width],
-                                 in_ptr_base[in_offset + 5 * in_width],
-                                 in_ptr_base[in_offset + 6 * in_width],
-                                 in_ptr_base[in_offset + 7 * in_width]};
-              float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width],
-                                 in_ptr_base[in_offset + 9 * in_width],
-                                 in_ptr_base[in_offset + 10 * in_width],
-                                 in_ptr_base[in_offset + 11 * in_width]};
-              float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width],
-                                  in_ptr_base[in_offset + 13 * in_width],
-                                  in_ptr_base[in_offset + 14 * in_width],
-                                  in_ptr_base[in_offset + 15 * in_width]};
-              float32x4_t vi16 = {in_ptr_base[in_offset + 16 * in_width],
-                                  in_ptr_base[in_offset + 17 * in_width]};
-              float32x4_t vi1 = vextq_f32(vi0, vi4, 1);
-              float32x4_t vi2 = vextq_f32(vi0, vi4, 2);
-              float32x4_t vi3 = vextq_f32(vi0, vi4, 3);
-              float32x4_t vi5 = vextq_f32(vi4, vi8, 1);
-              float32x4_t vi6 = vextq_f32(vi4, vi8, 2);
-              float32x4_t vi7 = vextq_f32(vi4, vi8, 3);
-              float32x4_t vi9 = vextq_f32(vi8, vi12, 1);
-              float32x4_t vi10 = vextq_f32(vi8, vi12, 2);
-              float32x4_t vi11 = vextq_f32(vi8, vi12, 3);
-              float32x4_t vi13 = vextq_f32(vi12, vi16, 1);
-              float32x4_t vi14 = vextq_f32(vi12, vi16, 2);
-
-              vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0);
-              vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1);
-              vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0);
-              vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1);
-              vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0);
-              vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1);
-              vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0);
-              vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1);
-              vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0);
-              vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1);
-              vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0);
-              vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1);
-              vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1);
-              vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0);
-              vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1);
-
-              out_ptr_base[out_offset] = vo[0];
-              out_ptr_base[out_offset + out_width] = vo[1];
-              out_ptr_base[out_offset + 2 * out_width] = vo[2];
-              out_ptr_base[out_offset + 3 * out_width] = vo[3];
-            }  // wt
-          }  // h
-#else
-          Conv2dCPUK15x1Calc(in_ptr_base, filter_ptr, in_width, in_channels,
-                             out_height, out_width, w, tile_width,
-                             out_image_size, out_ptr_base, 0, 1);
-#endif
-        }  // c
-      }  // w
-    }  // m
-  }  // b
-}
-
-}  // namespace ops
-}  // namespace mace
diff --git a/mace/ops/arm/conv_2d_neon_1x15.cc b/mace/ops/arm/conv_2d_neon_1x15.cc
deleted file mode 100644
index b8837490..00000000
--- a/mace/ops/arm/conv_2d_neon_1x15.cc
+++ /dev/null
@@ -1,147 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#if defined(MACE_ENABLE_NEON)
-#include <arm_neon.h>
-#endif
-
-#include "mace/ops/arm/conv_2d_neon.h"
-#include "mace/utils/logging.h"
-#include "mace/utils/math.h"
-
-namespace mace {
-namespace ops {
-
-inline void Conv2dCPUK1x15Calc(const float *in_ptr,
-                               const float *filter_ptr,
-                               const index_t in_width,
-                               const index_t in_channels,
-                               const index_t out_height,
-                               const index_t h,
-                               const index_t tile_height,
-                               const index_t out_width,
-                               const index_t out_image_size,
-                               float *out_ptr,
-                               const index_t io,
-                               const int stride) {
-  for (index_t ih = 0; ih < tile_height && h + ih < out_height; ++ih) {
-    for (index_t iw = 0; iw < out_width; ++iw) {
-      for (int i = 0; i < 1; ++i) {
-        for (int j = 0; j < 15; ++j) {
-          out_ptr[io * out_image_size + (h + ih) * out_width + iw] +=
-              in_ptr[((h + ih) * stride + i) * in_width + (iw * stride + j)] *
-              filter_ptr[io * in_channels * 15 + i * 15 + j];
-        }
-      }
-    }
-  }
-}
-
-// Ho = 1, Wo = 4, Co = 1
-void Conv2dNeonK1x15S1(const float *input,
-                       const float *filter,
-                       const index_t *in_shape,
-                       const index_t *out_shape,
-                       float *output) {
-  const index_t in_image_size = in_shape[2] * in_shape[3];
-  const index_t out_image_size = out_shape[2] * out_shape[3];
-  const index_t in_batch_size = in_shape[1] * in_image_size;
-  const index_t out_batch_size = out_shape[1] * out_image_size;
-  const index_t tile_height =
-      out_shape[1] < 4 ? RoundUpDiv4(out_shape[2]) : out_shape[2];
-
-#pragma omp parallel for collapse(3) schedule(runtime)
-  for (index_t b = 0; b < out_shape[0]; ++b) {
-    for (index_t m = 0; m < out_shape[1]; ++m) {
-      for (index_t h = 0; h < out_shape[2]; h += tile_height) {
-        const index_t out_height = out_shape[2];
-        const index_t out_width = out_shape[3];
-        const index_t in_channels = in_shape[1];
-        const index_t in_width = in_shape[3];
-        float *out_ptr_base = output + b * out_batch_size + m * out_image_size;
-        for (index_t c = 0; c < in_channels; ++c) {
-          const float *in_ptr_base =
-              input + b * in_batch_size + c * in_image_size;
-          const float *filter_ptr = filter + m * in_channels * 15 + c * 15;
-#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
-          /* load filter (1 outch x 4 height x 1 width) */
-          float32x4_t vf0, vf1, vf2, vf3;
-          vf0 = vld1q_f32(filter_ptr);
-          vf1 = vld1q_f32(filter_ptr + 4);
-          vf2 = vld1q_f32(filter_ptr + 8);
-          vf3 = vld1q_f32(filter_ptr + 11);
-
-          for (index_t ht = 0; ht < tile_height && h + ht < out_height; ++ht) {
-            for (index_t w = 0; w + 3 < out_width; w += 4) {
-              // output (1 outch x 1 height x 4 width): vo_outch_height
-              float32x4_t vo;
-              // load output
-              index_t out_offset = (h + ht) * out_width + w;
-              vo = vld1q_f32(out_ptr_base + out_offset);
-
-              // input (3 slide)
-              float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9,
-                  vi10, vi11, vi12, vi13, vi14, vi16;
-              // input offset
-              index_t in_offset = (h + ht) * in_width + w;
-              // load input
-              vi0 = vld1q_f32(in_ptr_base + in_offset);
-              vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
-              vi8 = vld1q_f32(in_ptr_base + in_offset + 8);
-              vi12 = vld1q_f32(in_ptr_base + in_offset + 12);
-              vi16 = vld1q_f32(in_ptr_base + in_offset + 16);
-              vi1 = vextq_f32(vi0, vi4, 1);
-              vi2 = vextq_f32(vi0, vi4, 2);
-              vi3 = vextq_f32(vi0, vi4, 3);
-              vi5 = vextq_f32(vi4, vi8, 1);
-              vi6 = vextq_f32(vi4, vi8, 2);
-              vi7 = vextq_f32(vi4, vi8, 3);
-              vi9 = vextq_f32(vi8, vi12, 1);
-              vi10 = vextq_f32(vi8, vi12, 2);
-              vi11 = vextq_f32(vi8, vi12, 3);
-              vi13 = vextq_f32(vi12, vi16, 1);
-              vi14 = vextq_f32(vi12, vi16, 2);
-
-              vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0);
-              vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1);
-              vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0);
-              vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1);
-              vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0);
-              vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1);
-              vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0);
-              vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1);
-              vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0);
-              vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1);
-              vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0);
-              vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1);
-              vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1);
-              vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0);
-              vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1);
-
-              vst1q_f32(out_ptr_base + out_offset, vo);
-            }  // w
-          }  // ht
-#else
-          Conv2dCPUK1x15Calc(in_ptr_base, filter_ptr, in_width, in_channels,
-                             out_height, h, tile_height, out_width,
-                             out_image_size, out_ptr_base, 0, 1);
-#endif
-        }  // c
-      }  // h
-    }  // m
-  }  // b
-}
-
-}  // namespace ops
-}  // namespace mace
diff --git a/mace/ops/arm/conv_2d_neon_1x7.cc b/mace/ops/arm/conv_2d_neon_1x7.cc
deleted file mode 100644
index e5e249d3..00000000
--- a/mace/ops/arm/conv_2d_neon_1x7.cc
+++ /dev/null
@@ -1,251 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#if defined(MACE_ENABLE_NEON)
-#include <arm_neon.h>
-#endif
-
-#include "mace/ops/arm/conv_2d_neon.h"
-
-namespace mace {
-namespace ops {
-
-// Ho = 1, Wo = 4, Co = 4
-void Conv2dNeonK1x7S1(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output) {
-  const index_t in_image_size = in_shape[2] * in_shape[3];
-  const index_t out_image_size = out_shape[2] * out_shape[3];
-  const index_t in_batch_size = in_shape[1] * in_image_size;
-  const index_t out_batch_size = out_shape[1] * out_image_size;
-
-#pragma omp parallel for collapse(2) schedule(runtime)
-  for (index_t b = 0; b < out_shape[0]; ++b) {
-    for (index_t m = 0; m < out_shape[1]; m += 4) {
-      const index_t out_channels = out_shape[1];
-      const index_t out_height = out_shape[2];
-      const index_t out_width = out_shape[3];
-      const index_t in_channels = in_shape[1];
-      const index_t in_width = in_shape[3];
-      if (m + 3 < out_channels) {
-        float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
-#if defined(MACE_ENABLE_NEON)
-        float *out_ptr1_base =
-            output + b * out_batch_size + (m + 1) * out_image_size;
-        float *out_ptr2_base =
-            output + b * out_batch_size + (m + 2) * out_image_size;
-        float *out_ptr3_base =
-            output + b * out_batch_size + (m + 3) * out_image_size;
-#endif
-        for (index_t c = 0; c < in_channels; ++c) {
-          const float *in_ptr_base =
-              input + b * in_batch_size + c * in_image_size;
-          const float *filter_ptr0 = filter + m * in_channels * 7 + c * 7;
-#if defined(MACE_ENABLE_NEON)
-          const float *filter_ptr1 = filter + (m + 1) * in_channels * 7 + c * 7;
-          const float *filter_ptr2 = filter + (m + 2) * in_channels * 7 + c * 7;
-          const float *filter_ptr3 = filter + (m + 3) * in_channels * 7 + c * 7;
-          /* load filter (4 outch x 1 height x 4 width) */
-          float32x4_t vf00, vf01;
-          float32x4_t vf10, vf11;
-          float32x4_t vf20, vf21;
-          float32x4_t vf30, vf31;
-          vf00 = vld1q_f32(filter_ptr0);
-          vf01 = vld1q_f32(filter_ptr0 + 3);
-          vf10 = vld1q_f32(filter_ptr1);
-          vf11 = vld1q_f32(filter_ptr1 + 3);
-          vf20 = vld1q_f32(filter_ptr2);
-          vf21 = vld1q_f32(filter_ptr2 + 3);
-          vf30 = vld1q_f32(filter_ptr3);
-          vf31 = vld1q_f32(filter_ptr3 + 3);
-
-          for (index_t h = 0; h < out_height; ++h) {
-            for (index_t w = 0; w + 3 < out_width; w += 4) {
-              // output (4 outch x 1 height x 4 width): vo_outch_height
-              float32x4_t vo0, vo1, vo2, vo3;
-              // load output
-              index_t out_offset = h * out_width + w;
-              vo0 = vld1q_f32(out_ptr0_base + out_offset);
-              vo1 = vld1q_f32(out_ptr1_base + out_offset);
-              vo2 = vld1q_f32(out_ptr2_base + out_offset);
-              vo3 = vld1q_f32(out_ptr3_base + out_offset);
-
-              // input (3 slide)
-              float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8;
-              // input offset
-              index_t in_offset = h * in_width + w;
-              // load input
-              vi0 = vld1q_f32(in_ptr_base + in_offset);
-              vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
-              vi8 = vld1q_f32(in_ptr_base + in_offset + 8);
-              vi1 = vextq_f32(vi0, vi4, 1);
-              vi2 = vextq_f32(vi0, vi4, 2);
-              vi3 = vextq_f32(vi0, vi4, 3);
-              vi5 = vextq_f32(vi4, vi8, 1);
-              vi6 = vextq_f32(vi4, vi8, 2);
-
-#if defined(__aarch64__)
-              /* outch 0 */
-              vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0);
-              vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1);
-              vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2);
-              vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3);
-              vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1);
-              vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2);
-              vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3);
-              /* outch 1 */
-              vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0);
-              vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1);
-              vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2);
-              vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3);
-              vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1);
-              vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2);
-              vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3);
-              /* outch 2 */
-              vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0);
-              vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1);
-              vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2);
-              vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3);
-              vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1);
-              vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2);
-              vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3);
-              /* outch 3 */
-              vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0);
-              vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1);
-              vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2);
-              vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3);
-              vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1);
-              vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2);
-              vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3);
-#else
-              /* outch 0 */
-              vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0);
-              vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1);
-              vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0);
-              vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1);
-              vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1);
-              vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0);
-              vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
-              /* outch 1 */
-              vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0);
-              vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1);
-              vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0);
-              vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1);
-              vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1);
-              vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0);
-              vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1);
-              /* outch 2 */
-              vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0);
-              vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1);
-              vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0);
-              vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1);
-              vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1);
-              vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0);
-              vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1);
-              /* outch 3 */
-              vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0);
-              vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1);
-              vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0);
-              vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1);
-              vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1);
-              vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0);
-              vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1);
-#endif
-
-              vst1q_f32(out_ptr0_base + out_offset, vo0);
-              vst1q_f32(out_ptr1_base + out_offset, vo1);
-              vst1q_f32(out_ptr2_base + out_offset, vo2);
-              vst1q_f32(out_ptr3_base + out_offset, vo3);
-            }  // w
-          }  // h
-#else
-          for (index_t oc = 0; oc < 4; ++oc) {
-            Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 7,
-                               in_width, 1, 7, out_height, out_width,
-                               out_ptr0_base + oc * out_image_size, 1);
-          }
-#endif
-        }  // c
-      } else {
-        for (index_t mm = m; mm < out_channels; ++mm) {
-          float *out_ptr0_base =
-              output + b * out_batch_size + mm * out_image_size;
-          for (index_t c = 0; c < in_channels; ++c) {
-            const float *in_ptr_base =
-                input + b * in_batch_size + c * in_image_size;
-            const float *filter_ptr0 = filter + mm * in_channels * 7 + c * 7;
-#if defined(MACE_ENABLE_NEON)
-            /* load filter (1 outch x 1 height x 4 width) */
-            float32x4_t vf00, vf01;
-            vf00 = vld1q_f32(filter_ptr0);
-            vf01 = vld1q_f32(filter_ptr0 + 3);
-
-            for (index_t h = 0; h < out_height; ++h) {
-              for (index_t w = 0; w + 3 < out_width; w += 4) {
-                // output (1 outch x 1 height x 4 width): vo_outch_height
-                float32x4_t vo0;
-                // load output
-                index_t out_offset = h * out_width + w;
-                vo0 = vld1q_f32(out_ptr0_base + out_offset);
-
-                // input (3 slide)
-                float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8;
-                // input offset
-                index_t in_offset = h * in_width + w;
-                // load input
-                vi0 = vld1q_f32(in_ptr_base + in_offset);
-                vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
-                vi8 = vld1q_f32(in_ptr_base + in_offset + 8);
-                vi1 = vextq_f32(vi0, vi4, 1);
-                vi2 = vextq_f32(vi0, vi4, 2);
-                vi3 = vextq_f32(vi0, vi4, 3);
-                vi5 = vextq_f32(vi4, vi8, 1);
-                vi6 = vextq_f32(vi4, vi8, 2);
-
-#if defined(__aarch64__)
-                vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0);
-                vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1);
-                vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2);
-                vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3);
-                vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1);
-                vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2);
-                vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3);
-#else
-                vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0);
-                vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1);
-                vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0);
-                vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1);
-                vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1);
-                vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0);
-                vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
-#endif
-
-                vst1q_f32(out_ptr0_base + out_offset, vo0);
-              }  // w
-            }  // h
-#else
-            Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 1, 7,
-                               out_height, out_width, out_ptr0_base, 1);
-#endif
-          }  // c
-        }
-      }  // if
-    }  // m
-  }  // b
-}
-
-}  // namespace ops
-}  // namespace mace
diff --git a/mace/ops/arm/conv_2d_neon_7x1.cc b/mace/ops/arm/conv_2d_neon_7x1.cc
deleted file mode 100644
index 7aa9309b..00000000
--- a/mace/ops/arm/conv_2d_neon_7x1.cc
+++ /dev/null
@@ -1,291 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#if defined(MACE_ENABLE_NEON)
-#include <arm_neon.h>
-#endif
-
-#include "mace/ops/arm/conv_2d_neon.h"
-
-namespace mace {
-namespace ops {
-
-// Ho = 4, Wo = 1, Co = 4
-void Conv2dNeonK7x1S1(const float *input,
-                      const float *filter,
-                      const index_t *in_shape,
-                      const index_t *out_shape,
-                      float *output) {
-  const index_t in_image_size = in_shape[2] * in_shape[3];
-  const index_t out_image_size = out_shape[2] * out_shape[3];
-  const index_t in_batch_size = in_shape[1] * in_image_size;
-  const index_t out_batch_size = out_shape[1] * out_image_size;
-
-#pragma omp parallel for collapse(2) schedule(runtime)
-  for (index_t b = 0; b < out_shape[0]; ++b) {
-    for (index_t m = 0; m < out_shape[1]; m += 4) {
-      const index_t out_channels = out_shape[1];
-      const index_t out_height = out_shape[2];
-      const index_t out_width = out_shape[3];
-      const index_t in_channels = in_shape[1];
-      const index_t in_width = in_shape[3];
-      if (m + 3 < out_channels) {
-        float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
-#if defined(MACE_ENABLE_NEON)
-        float *out_ptr1_base =
-            output + b * out_batch_size + (m + 1) * out_image_size;
-        float *out_ptr2_base =
-            output + b * out_batch_size + (m + 2) * out_image_size;
-        float *out_ptr3_base =
-            output + b * out_batch_size + (m + 3) * out_image_size;
-#endif
-        for (index_t c = 0; c < in_channels; ++c) {
-          const float *in_ptr_base =
-              input + b * in_batch_size + c * in_image_size;
-          const float *filter_ptr0 = filter + m * in_channels * 7 + c * 7;
-#if defined(MACE_ENABLE_NEON)
-          const float *filter_ptr1 = filter + (m + 1) * in_channels * 7 + c * 7;
-          const float *filter_ptr2 = filter + (m + 2) * in_channels * 7 + c * 7;
-          const float *filter_ptr3 = filter + (m + 3) * in_channels * 7 + c * 7;
-          /* load filter (4 outch x 4 height x 1 width) */
-          float32x4_t vf00, vf01;
-          float32x4_t vf10, vf11;
-          float32x4_t vf20, vf21;
-          float32x4_t vf30, vf31;
-          vf00 = vld1q_f32(filter_ptr0);
-          vf01 = vld1q_f32(filter_ptr0 + 3);
-          vf10 = vld1q_f32(filter_ptr1);
-          vf11 = vld1q_f32(filter_ptr1 + 3);
-          vf20 = vld1q_f32(filter_ptr2);
-          vf21 = vld1q_f32(filter_ptr2 + 3);
-          vf30 = vld1q_f32(filter_ptr3);
-          vf31 = vld1q_f32(filter_ptr3 + 3);
-
-          for (index_t h = 0; h + 3 < out_height; h += 4) {
-            for (index_t w = 0; w < out_width; ++w) {
-              // load output
-              index_t out_offset = h * out_width + w;
-              // output (4 outch x 4 height x 1 width): vo_outch_height
-              float32x4_t vo0 = {out_ptr0_base[out_offset],
-                                 out_ptr0_base[out_offset + out_width],
-                                 out_ptr0_base[out_offset + 2 * out_width],
-                                 out_ptr0_base[out_offset + 3 * out_width]};
-              float32x4_t vo1 = {out_ptr1_base[out_offset],
-                                 out_ptr1_base[out_offset + out_width],
-                                 out_ptr1_base[out_offset + 2 * out_width],
-                                 out_ptr1_base[out_offset + 3 * out_width]};
-              float32x4_t vo2 = {out_ptr2_base[out_offset],
-                                 out_ptr2_base[out_offset + out_width],
-                                 out_ptr2_base[out_offset + 2 * out_width],
-                                 out_ptr2_base[out_offset + 3 * out_width]};
-              float32x4_t vo3 = {out_ptr3_base[out_offset],
-                                 out_ptr3_base[out_offset + out_width],
-                                 out_ptr3_base[out_offset + 2 * out_width],
-                                 out_ptr3_base[out_offset + 3 * out_width]};
-
-              // input offset
-              index_t in_offset = h * in_width + w;
-              // input (3 slide)
-              float32x4_t vi0 = {in_ptr_base[in_offset],
-                                 in_ptr_base[in_offset + in_width],
-                                 in_ptr_base[in_offset + 2 * in_width],
-                                 in_ptr_base[in_offset + 3 * in_width]};
-              float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width],
-                                 in_ptr_base[in_offset + 5 * in_width],
-                                 in_ptr_base[in_offset + 6 * in_width],
-                                 in_ptr_base[in_offset + 7 * in_width]};
-              float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width],
-                                 in_ptr_base[in_offset + 9 * in_width]};
-              float32x4_t vi1 = vextq_f32(vi0, vi4, 1);
-              float32x4_t vi2 = vextq_f32(vi0, vi4, 2);
-              float32x4_t vi3 = vextq_f32(vi0, vi4, 3);
-              float32x4_t vi5 = vextq_f32(vi4, vi8, 1);
-              float32x4_t vi6 = vextq_f32(vi4, vi8, 2);
-
-#if defined(__aarch64__)
-              /* outch 0 */
-              vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0);
-              vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1);
-              vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2);
-              vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3);
-              vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1);
-              vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2);
-              vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3);
-              /* outch 1 */
-              vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0);
-              vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1);
-              vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2);
-              vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3);
-              vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1);
-              vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2);
-              vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3);
-              /* outch 2 */
-              vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0);
-              vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1);
-              vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2);
-              vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3);
-              vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1);
-              vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2);
-              vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3);
-              /* outch 3 */
-              vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0);
-              vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1);
-              vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2);
-              vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3);
-              vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1);
-              vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2);
-              vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3);
-#else
-              /* outch 0 */
-              vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0);
-              vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1);
-              vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0);
-              vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1);
-              vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1);
-              vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0);
-              vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
-              /* outch 1 */
-              vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0);
-              vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1);
-              vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0);
-              vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1);
-              vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1);
-              vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0);
-              vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1);
-              /* outch 2 */
-              vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0);
-              vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1);
-              vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0);
-              vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1);
-              vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1);
-              vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0);
-              vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1);
-              /* outch 3 */
-              vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0);
-              vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1);
-              vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0);
-              vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1);
-              vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1);
-              vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0);
-              vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1);
-#endif
-
-              out_ptr0_base[out_offset] = vo0[0];
-              out_ptr0_base[out_offset + out_width] = vo0[1];
-              out_ptr0_base[out_offset + 2 * out_width] = vo0[2];
-              out_ptr0_base[out_offset + 3 * out_width] = vo0[3];
-              out_ptr1_base[out_offset] = vo1[0];
-              out_ptr1_base[out_offset + out_width] = vo1[1];
-              out_ptr1_base[out_offset + 2 * out_width] = vo1[2];
-              out_ptr1_base[out_offset + 3 * out_width] = vo1[3];
-              out_ptr2_base[out_offset] = vo2[0];
-              out_ptr2_base[out_offset + out_width] = vo2[1];
-              out_ptr2_base[out_offset + 2 * out_width] = vo2[2];
-              out_ptr2_base[out_offset + 3 * out_width] = vo2[3];
-              out_ptr3_base[out_offset] = vo3[0];
-              out_ptr3_base[out_offset + out_width] = vo3[1];
-              out_ptr3_base[out_offset + 2 * out_width] = vo3[2];
-              out_ptr3_base[out_offset + 3 * out_width] = vo3[3];
-            }  // w
-          }  // h
-#else
-          for (index_t oc = 0; oc < 4; ++oc) {
-            Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 7,
-                               in_width, 7, 1, out_height, out_width,
-                               out_ptr0_base + oc * out_image_size, 1);
-          }
-#endif
-        }  // c
-      } else {
-        for (index_t mm = m; mm < out_channels; ++mm) {
-          float *out_ptr0_base =
-              output + b * out_batch_size + mm * out_image_size;
-          for (index_t c = 0; c < in_channels; ++c) {
-            const float *in_ptr_base =
-                input + b * in_batch_size + c * in_image_size;
-            const float *filter_ptr0 = filter + mm * in_channels * 7 + c * 7;
-#if defined(MACE_ENABLE_NEON)
-            /* load filter (1 outch x 4 height x 1 width) */
-            float32x4_t vf00, vf01;
-            vf00 = vld1q_f32(filter_ptr0);
-            vf01 = vld1q_f32(filter_ptr0 + 3);
-
-            for (index_t h = 0; h + 3 < out_height; h += 4) {
-              for (index_t w = 0; w < out_width; ++w) {
-                // load output
-                index_t out_offset = h * out_width + w;
-                // output (1 outch x 4 height x 1 width): vo_outch_height
-                float32x4_t vo0 = {out_ptr0_base[out_offset],
-                                   out_ptr0_base[out_offset + out_width],
-                                   out_ptr0_base[out_offset + 2 * out_width],
-                                   out_ptr0_base[out_offset + 3 * out_width]};
-
-                // input offset
-                index_t in_offset = h * in_width + w;
-                // input (3 slide)
-                float32x4_t vi0 = {in_ptr_base[in_offset],
-                                   in_ptr_base[in_offset + in_width],
-                                   in_ptr_base[in_offset + 2 * in_width],
-                                   in_ptr_base[in_offset + 3 * in_width]};
-                float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width],
-                                   in_ptr_base[in_offset + 5 * in_width],
-                                   in_ptr_base[in_offset + 6 * in_width],
-                                   in_ptr_base[in_offset + 7 * in_width]};
-                float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width],
-                                   in_ptr_base[in_offset + 9 * in_width],
-                                   in_ptr_base[in_offset + 10 * in_width],
-                                   in_ptr_base[in_offset + 11 * in_width]};
-                float32x4_t vi1 = vextq_f32(vi0, vi4, 1);
-                float32x4_t vi2 = vextq_f32(vi0, vi4, 2);
-                float32x4_t vi3 = vextq_f32(vi0, vi4, 3);
-                float32x4_t vi5 = vextq_f32(vi4, vi8, 1);
-                float32x4_t vi6 = vextq_f32(vi4, vi8, 2);
-
-#if defined(__aarch64__)
-                vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0);
-                vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1);
-                vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2);
-                vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3);
-                vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1);
-                vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2);
-                vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3);
-#else
-                vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0);
-                vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1);
-                vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0);
-                vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1);
-                vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1);
-                vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0);
-                vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
-#endif
-
-                out_ptr0_base[out_offset] = vo0[0];
-                out_ptr0_base[out_offset + out_width] = vo0[1];
-                out_ptr0_base[out_offset + 2 * out_width] = vo0[2];
-                out_ptr0_base[out_offset + 3 * out_width] = vo0[3];
-              }  // w
-            }  // h
-#else
-            Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 1,
-                               out_height, out_width, out_ptr0_base, 1);
-#endif
-          }  // c
-        }
-      }  // if
-    }  // m
-  }  // b
-}
-
-}  // namespace ops
-}  // namespace mace
diff --git a/mace/ops/arm/deconv_2d_neon.h b/mace/ops/arm/deconv_2d_neon.h
index 62e3e919..f45fa923 100644
--- a/mace/ops/arm/deconv_2d_neon.h
+++ b/mace/ops/arm/deconv_2d_neon.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/deconv_2d_neon_2x2.cc b/mace/ops/arm/deconv_2d_neon_2x2.cc
index a1bd1883..674864c8 100644
--- a/mace/ops/arm/deconv_2d_neon_2x2.cc
+++ b/mace/ops/arm/deconv_2d_neon_2x2.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/deconv_2d_neon_3x3.cc b/mace/ops/arm/deconv_2d_neon_3x3.cc
index 2859b7c0..04f62325 100644
--- a/mace/ops/arm/deconv_2d_neon_3x3.cc
+++ b/mace/ops/arm/deconv_2d_neon_3x3.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/deconv_2d_neon_4x4.cc b/mace/ops/arm/deconv_2d_neon_4x4.cc
index 690bea4e..443a188f 100644
--- a/mace/ops/arm/deconv_2d_neon_4x4.cc
+++ b/mace/ops/arm/deconv_2d_neon_4x4.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/depthwise_conv2d_neon.h b/mace/ops/arm/depthwise_conv2d_neon.h
index a4973ed5..b610178c 100644
--- a/mace/ops/arm/depthwise_conv2d_neon.h
+++ b/mace/ops/arm/depthwise_conv2d_neon.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/depthwise_conv2d_neon_3x3.cc b/mace/ops/arm/depthwise_conv2d_neon_3x3.cc
index 9d6ae41e..ced509e0 100644
--- a/mace/ops/arm/depthwise_conv2d_neon_3x3.cc
+++ b/mace/ops/arm/depthwise_conv2d_neon_3x3.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/depthwise_deconv2d_neon.h b/mace/ops/arm/depthwise_deconv2d_neon.h
index 70f2bb40..8df6dba1 100644
--- a/mace/ops/arm/depthwise_deconv2d_neon.h
+++ b/mace/ops/arm/depthwise_deconv2d_neon.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
index a666961b..6bba47c2 100644
--- a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
+++ b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
index a7c10b86..677eb152 100644
--- a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
+++ b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/fp32/conv_2d.cc b/mace/ops/arm/fp32/conv_2d.cc
new file mode 100644
index 00000000..799ee521
--- /dev/null
+++ b/mace/ops/arm/fp32/conv_2d.cc
@@ -0,0 +1,247 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <memory>
+#include <utility>
+
+#include "mace/ops/arm/fp32/conv_2d.h"
+#include "mace/utils/memory.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+void Conv2dBase::CalOutputShapeAndPadSize(const Tensor *input,
+                                          const Tensor *filter,
+                                          const int out_tile_height,
+                                          const int out_tile_width,
+                                          std::vector<index_t> *output_shape,
+                                          std::vector<int> *in_pad_size,
+                                          std::vector<int> *out_pad_size) {
+  in_pad_size->resize(4);
+  out_pad_size->resize(4);
+  output_shape->resize(4);
+
+  const index_t in_height = input->dim(2);
+  const index_t in_width = input->dim(3);
+
+  const index_t stride_h = strides_[0];
+  const index_t stride_w = strides_[1];
+  const index_t dilation_h = dilations_[0];
+  const index_t dilation_w = dilations_[1];
+  const index_t filter_h = filter->dim(2);
+  const index_t filter_w = filter->dim(3);
+
+  std::vector<int> paddings(2);
+  if (paddings_.empty()) {
+    CalcNCHWPaddingAndOutputSize(input->shape().data(),
+                                 filter->shape().data(),
+                                 dilations_.data(),
+                                 strides_.data(),
+                                 padding_type_,
+                                 output_shape->data(),
+                                 paddings.data());
+  } else {
+    paddings = paddings_;
+    CalcNCHWOutputSize(input->shape().data(),
+                       filter->shape().data(),
+                       paddings_.data(),
+                       dilations_.data(),
+                       strides_.data(),
+                       RoundType::FLOOR,
+                       output_shape->data());
+  }
+  const index_t out_height = (*output_shape)[2];
+  const index_t out_width = (*output_shape)[3];
+  const index_t
+      padded_out_height = RoundUp<index_t>(out_height, out_tile_height);
+  const index_t padded_out_width = RoundUp<index_t>(out_width, out_tile_width);
+  const index_t padded_in_height =
+      std::max(in_height + paddings[0], (padded_out_height - 1) * stride_h
+          + (filter_h - 1) * dilation_h + 1);
+  const index_t padded_in_width =
+      std::max(in_width + paddings[1], (padded_out_width - 1) * stride_w
+          + (filter_w - 1) * dilation_w + 1);
+
+  (*in_pad_size)[0] = paddings[0] >> 1;
+  (*in_pad_size)[1] =
+      static_cast<int>(padded_in_height - in_height - (*in_pad_size)[0]);
+  (*in_pad_size)[2] = paddings[1] >> 1;
+  (*in_pad_size)[3] =
+      static_cast<int>(padded_in_width - in_width - (*in_pad_size)[2]);
+
+  (*out_pad_size)[0] = 0;
+  (*out_pad_size)[1] = static_cast<int>(padded_out_height - out_height);
+  (*out_pad_size)[2] = 0;
+  (*out_pad_size)[3] = static_cast<int>(padded_out_width - out_width);
+}
+
+MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
+                                            const Tensor *input,
+                                            const Tensor *filter,
+                                            Tensor *output,
+                                            const int out_tile_height,
+                                            const int out_tile_width,
+                                            std::unique_ptr<const Tensor>
+                                            *padded_input,
+                                            std::unique_ptr<Tensor>
+                                            *padded_output) {
+  std::vector<index_t> output_shape;
+  std::vector<int> in_pad_size;
+  std::vector<int> out_pad_size;
+  CalOutputShapeAndPadSize(input,
+                           filter,
+                           out_tile_height,
+                           out_tile_width,
+                           &output_shape,
+                           &in_pad_size,
+                           &out_pad_size);
+  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+
+  const index_t batch = input->dim(0);
+  const index_t in_channels = input->dim(1);
+  const index_t in_height = input->dim(2);
+  const index_t in_width = input->dim(3);
+  const index_t out_channels = output->dim(1);
+  const index_t out_height = output->dim(2);
+  const index_t out_width = output->dim(3);
+
+  const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1];
+  const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3];
+  const index_t
+      padded_out_height = out_height + out_pad_size[0] + out_pad_size[1];
+  const index_t
+      padded_out_width = out_width + out_pad_size[2] + out_pad_size[3];
+  const bool is_in_padded =
+      padded_in_height != in_height || padded_in_width != in_width;
+  const bool is_out_padded =
+      padded_out_height != out_height || padded_out_width != out_width;
+
+  auto scratch_buffer = context->device()->scratch_buffer();
+  const index_t padded_in_size =
+      MACE_EXTRA_BUFFER_PAD_SIZE + (is_in_padded ? PadAlignSize(
+          sizeof(float) * batch * in_channels * padded_in_height
+              * padded_in_width) : 0);
+  const index_t padded_out_size = is_out_padded ? PadAlignSize(
+      sizeof(float) * batch * out_channels * padded_out_height
+          * padded_out_width) : 0;
+
+  scratch_buffer->Rewind();
+  scratch_buffer->GrowSize(padded_in_size + padded_out_size);
+  if (is_in_padded) {
+    std::unique_ptr<Tensor>
+        padded_in =
+        make_unique<Tensor>(scratch_buffer->Scratch(padded_in_size),
+                            DataType::DT_FLOAT);
+    padded_in->Resize({batch, in_channels, padded_in_height, padded_in_width});
+    PadInput(*input, in_pad_size[0], in_pad_size[2], padded_in.get());
+    *padded_input = std::move(padded_in);
+  }
+  if (is_out_padded) {
+    std::unique_ptr<Tensor>
+        padded_out =
+        make_unique<Tensor>(scratch_buffer->Scratch(padded_out_size),
+                            DataType::DT_FLOAT);
+    padded_out->Resize({batch, out_channels, padded_out_height,
+                        padded_out_width});
+    *padded_output = std::move(padded_out);
+  }
+  return MaceStatus::MACE_SUCCESS;
+}
+
+void Conv2dBase::PadInput(const Tensor &src,
+                          const int pad_top,
+                          const int pad_left,
+                          mace::Tensor *dst) {
+  if (dst == &src) return;
+  const index_t batch = src.dim(0);
+  const index_t channels = src.dim(1);
+  const index_t height = src.dim(2);
+  const index_t width = src.dim(3);
+  const index_t padded_height = dst->dim(2);
+  const index_t padded_width = dst->dim(3);
+  const int pad_bottom = static_cast<int>(padded_height - height - pad_top);
+  const int pad_right = static_cast<int>(padded_width - width - pad_left);
+  auto in_data = src.data<float>();
+  auto padded_in_data = dst->mutable_data<float>();
+
+  const index_t img_size = height * width;
+  const index_t padded_img_size = padded_height * padded_width;
+
+#pragma omp parallel for collapse(2) schedule(runtime)
+  for (index_t b = 0; b < batch; ++b) {
+    for (index_t c = 0; c < channels; ++c) {
+      const index_t bc = b * channels + c;
+      const float *in_base = in_data + bc * img_size;
+      float *padded_in_base = padded_in_data + bc * padded_img_size;
+
+      memset(padded_in_base, 0, sizeof(float) * pad_top * padded_width);
+      padded_in_base += pad_top * padded_width;
+      for (index_t h = 0; h < height; ++h) {
+        memset(padded_in_base,
+               0,
+               sizeof(float) * pad_left);
+        memcpy(padded_in_base + pad_left,
+               in_base,
+               sizeof(float) * width);
+        memset(padded_in_base + pad_left + width,
+               0,
+               sizeof(float) * pad_right);
+        in_base += width;
+        padded_in_base += padded_width;
+      }
+      memset(padded_in_base, 0, sizeof(float) * pad_bottom * padded_width);
+    }
+  }
+}
+
+void Conv2dBase::UnPadOutput(const mace::Tensor &src, mace::Tensor *dst) {
+  if (dst == &src) return;
+  const index_t batch = dst->dim(0);
+  const index_t channels = dst->dim(1);
+  const index_t height = dst->dim(2);
+  const index_t width = dst->dim(3);
+  const index_t padded_height = src.dim(2);
+  const index_t padded_width = src.dim(3);
+
+  auto padded_out_data = src.data<float>();
+  auto out_data = dst->mutable_data<float>();
+
+  const index_t img_size = height * width;
+  const index_t padded_img_size = padded_height * padded_width;
+
+#pragma omp parallel for collapse(2) schedule(runtime)
+  for (index_t b = 0; b < batch; ++b) {
+    for (index_t c = 0; c < channels; ++c) {
+      const index_t bc = (b * channels + c);
+      float *out_base = out_data + bc * img_size;
+      const float *padded_out_base = padded_out_data + bc * padded_img_size;
+
+      for (index_t h = 0; h < height; ++h) {
+        memcpy(out_base,
+               padded_out_base,
+               sizeof(float) * width);
+        out_base += width;
+        padded_out_base += padded_width;
+      }  // h
+    }  // c
+  }  // b
+}
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
diff --git a/mace/ops/arm/fp32/conv_2d.h b/mace/ops/arm/fp32/conv_2d.h
index 7d77cf14..832f6f2f 100644
--- a/mace/ops/arm/fp32/conv_2d.h
+++ b/mace/ops/arm/fp32/conv_2d.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,10 +15,14 @@
 #ifndef MACE_OPS_ARM_FP32_CONV_2D_H_
 #define MACE_OPS_ARM_FP32_CONV_2D_H_
 
+#include <memory>
+#include <vector>
+
 #include "mace/public/mace.h"
 #include "mace/core/tensor.h"
 #include "mace/core/op_context.h"
 #include "mace/ops/arm/fp32/gemm.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
 
 namespace mace {
 namespace ops {
@@ -27,13 +31,51 @@ namespace fp32 {
 
 class Conv2dBase {
  public:
-  Conv2dBase() = default;
+  Conv2dBase(const std::vector<int> strides,
+             const std::vector<int> dilations,
+             const std::vector<int> paddings,
+             const Padding padding_type)
+      : strides_(strides),
+        dilations_(dilations),
+        paddings_(paddings),
+        padding_type_(padding_type) {}
+
   virtual ~Conv2dBase() = default;
+
   virtual MaceStatus Compute(
       const OpContext *context,
       const Tensor *input,
       const Tensor *filter,
       Tensor *output) = 0;
+
+ protected:
+  void CalOutputShapeAndPadSize(const Tensor *input,
+                                const Tensor *filter,
+                                const int out_tile_height,
+                                const int out_tile_width,
+                                std::vector<index_t> *output_shape,
+                                std::vector<int> *in_pad_size,
+                                std::vector<int> *out_pad_size);
+
+  MaceStatus ResizeOutAndPadInOut(const OpContext *context,
+                                  const Tensor *input,
+                                  const Tensor *filter,
+                                  Tensor *output,
+                                  const int out_tile_height,
+                                  const int out_tile_width,
+                                  std::unique_ptr<const Tensor> *padded_input,
+                                  std::unique_ptr<Tensor> *padded_output);
+
+  void PadInput(const Tensor &src,
+                const int pad_top,
+                const int pad_left,
+                Tensor *dst);
+  void UnPadOutput(const Tensor &src, Tensor *dst);
+
+  const std::vector<int> strides_;
+  const std::vector<int> dilations_;
+  const std::vector<int> paddings_;
+  const Padding padding_type_;
 };
 
 }  // namespace fp32
diff --git a/mace/ops/arm/fp32/conv_2d_1x1.cc b/mace/ops/arm/fp32/conv_2d_1x1.cc
index b34e19aa..d5e03652 100644
--- a/mace/ops/arm/fp32/conv_2d_1x1.cc
+++ b/mace/ops/arm/fp32/conv_2d_1x1.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-
 #include "mace/ops/arm/fp32/conv_2d_1x1.h"
 
 namespace mace {
@@ -25,20 +24,68 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context,
                                const Tensor *filter,
                                Tensor *output) {
   index_t batch = input->dim(0);
-  index_t height = input->dim(2);
-  index_t width = input->dim(3);
+  index_t in_height = input->dim(2);
+  index_t in_width = input->dim(3);
   index_t in_channels = input->dim(1);
-  index_t out_channels = filter->dim(0);
-  MACE_RETURN_IF_ERROR(output->Resize({batch, out_channels, height, width}));
-  context->device()->scratch_buffer()->Rewind();
+
+  std::vector<index_t> output_shape;
+  std::vector<int> in_pad_size;
+  std::vector<int> out_pad_size;
+  CalOutputShapeAndPadSize(input,
+                           filter,
+                           1,
+                           1,
+                           &output_shape,
+                           &in_pad_size,
+                           &out_pad_size);
+  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+
+  const index_t out_channels = output_shape[1];
+  const index_t out_height = output_shape[2];
+  const index_t out_width = output_shape[3];
+  const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1];
+  const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3];
+
+  // pad input and transform input
+  const bool is_in_padded =
+      in_height != padded_in_height || in_width != padded_in_width;
+  auto scratch_buffer = context->device()->scratch_buffer();
+  const index_t padded_in_size = is_in_padded ? PadAlignSize(
+      sizeof(float) * batch * in_channels * padded_in_height
+          * padded_in_width) : 0;
+  const index_t pack_filter_size =
+      PadAlignSize(sizeof(float) * out_channels * in_channels);
+  const index_t pack_input_size =
+      PadAlignSize(
+          sizeof(float) * in_channels * padded_in_height * padded_in_width);
+  const index_t pack_output_size =
+      PadAlignSize(
+          sizeof(float) * out_channels * padded_in_height * padded_in_width);
+
+  const index_t gemm_pack_size =
+      pack_filter_size + pack_input_size + pack_output_size;
+
+  scratch_buffer->Rewind();
+  scratch_buffer->GrowSize(padded_in_size + gemm_pack_size);
+
+  const Tensor *padded_in = input;
+  Tensor tmp_padded_in
+      (scratch_buffer->Scratch(padded_in_size), DataType::DT_FLOAT);
+  if (is_in_padded) {
+    tmp_padded_in.Resize({batch, in_channels, padded_in_height,
+                          padded_in_width});
+    PadInput(*input, in_pad_size[0], in_pad_size[2], &tmp_padded_in);
+    padded_in = &tmp_padded_in;
+  }
+
   return gemm_.Compute(context,
                        filter,
-                       input,
+                       padded_in,
                        batch,
                        out_channels,
                        in_channels,
                        in_channels,
-                       height * width,
+                       out_height * out_width,
                        false,
                        false,
                        false,
diff --git a/mace/ops/arm/fp32/conv_2d_1x1.h b/mace/ops/arm/fp32/conv_2d_1x1.h
index fd2077ec..68b792fd 100644
--- a/mace/ops/arm/fp32/conv_2d_1x1.h
+++ b/mace/ops/arm/fp32/conv_2d_1x1.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
 #ifndef MACE_OPS_ARM_FP32_CONV_2D_1X1_H_
 #define MACE_OPS_ARM_FP32_CONV_2D_1X1_H_
 
+#include <vector>
 #include "mace/public/mace.h"
 #include "mace/core/tensor.h"
 #include "mace/core/op_context.h"
@@ -28,7 +29,8 @@ namespace fp32 {
 
 class Conv2dK1x1 : public Conv2dBase {
  public:
-  Conv2dK1x1() : gemm_(true) {}
+  Conv2dK1x1(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
   virtual ~Conv2dK1x1() {}
 
   MaceStatus Compute(
diff --git a/mace/ops/arm/fp32/conv_2d_1xn.cc b/mace/ops/arm/fp32/conv_2d_1xn.cc
new file mode 100644
index 00000000..1ff99d80
--- /dev/null
+++ b/mace/ops/arm/fp32/conv_2d_1xn.cc
@@ -0,0 +1,821 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arm_neon.h>
+#include <memory>
+#include "mace/ops/arm/fp32/conv_2d_1xn.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+MaceStatus Conv2dK1x7S1::Compute(const OpContext *context,
+                                 const Tensor *input,
+                                 const Tensor *filter,
+                                 Tensor *output) {
+  std::unique_ptr<const Tensor> padded_input;
+  std::unique_ptr<Tensor> padded_output;
+
+  ResizeOutAndPadInOut(context,
+                       input,
+                       filter,
+                       output,
+                       1,
+                       4,
+                       &padded_input,
+                       &padded_output);
+  const Tensor *in_tensor = input;
+  if (padded_input.get() != nullptr) {
+    in_tensor = padded_input.get();
+  }
+  Tensor *out_tensor = output;
+  if (padded_output.get() != nullptr) {
+    out_tensor = padded_output.get();
+  }
+  out_tensor->Clear();
+
+  Tensor::MappingGuard in_guard(input);
+  Tensor::MappingGuard filter_guard(filter);
+  Tensor::MappingGuard out_guard(output);
+  auto filter_data = filter->data<float>();
+  auto input_data = in_tensor->data<float>();
+  auto output_data = out_tensor->mutable_data<float>();
+
+  auto in_shape = in_tensor->shape();
+  auto out_shape = out_tensor->shape();
+
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
+
+#pragma omp parallel for collapse(2) schedule(runtime)
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; m += 4) {
+      const index_t out_channels = out_shape[1];
+      const index_t out_height = out_shape[2];
+      const index_t out_width = out_shape[3];
+      const index_t in_channels = in_shape[1];
+      const index_t in_width = in_shape[3];
+      if (m + 3 < out_channels) {
+        float *out_ptr0_base =
+            output_data + b * out_batch_size + m * out_image_size;
+        float *out_ptr1_base =
+            output_data + b * out_batch_size + (m + 1) * out_image_size;
+        float *out_ptr2_base =
+            output_data + b * out_batch_size + (m + 2) * out_image_size;
+        float *out_ptr3_base =
+            output_data + b * out_batch_size + (m + 3) * out_image_size;
+        for (index_t c = 0; c < in_channels; ++c) {
+          const float *in_ptr_base =
+              input_data + b * in_batch_size + c * in_image_size;
+          const float *filter_ptr0 = filter_data + m * in_channels * 7 + c * 7;
+          const float
+              *filter_ptr1
= filter_data + (m + 1) * in_channels * 7 + c * 7; + const float + *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; + const float + *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; + /* load filter (4 outch x 1 height x 4 width) */ + float32x4_t vf00, vf01; + float32x4_t vf10, vf11; + float32x4_t vf20, vf21; + float32x4_t vf30, vf31; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + vf10 = vld1q_f32(filter_ptr1); + vf11 = vld1q_f32(filter_ptr1 + 3); + vf20 = vld1q_f32(filter_ptr2); + vf21 = vld1q_f32(filter_ptr2 + 3); + vf30 = vld1q_f32(filter_ptr3); + vf31 = vld1q_f32(filter_ptr3 + 3); + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (4 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0, vo1, vo2, vo3; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + vo1 = vld1q_f32(out_ptr1_base + out_offset); + vo2 = vld1q_f32(out_ptr2_base + out_offset); + vo3 = vld1q_f32(out_ptr3_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; + // input offset + index_t in_offset = h * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + /* outch 0 */ + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); + /* outch 1 */ + vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); + vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); + vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); + vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); + vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); + vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); + vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); + /* outch 2 */ + vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); + vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); + vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); + vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); + vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); + vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); + vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); + /* outch 3 */ + vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); + vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); + vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); + vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); + vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); + vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); + vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); +#else + /* outch 0 */ + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); + /* outch 1 */ + vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); + vo1 = 
vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); + vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); + vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); + /* outch 2 */ + vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); + vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); + vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); + /* outch 3 */ + vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); + vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); + vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); +#endif + + vst1q_f32(out_ptr0_base + out_offset, vo0); + vst1q_f32(out_ptr1_base + out_offset, vo1); + vst1q_f32(out_ptr2_base + out_offset, vo2); + vst1q_f32(out_ptr3_base + out_offset, vo3); + } // w + } // h + } // c + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; + /* load filter (1 outch x 1 height x 4 width) */ + float32x4_t vf00, vf01; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; + // input offset + index_t in_offset = h * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); +#else + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); +#endif + + vst1q_f32(out_ptr0_base + out_offset, vo0); + } // w + } // h + } // c + } + } // if + } // m + } // b + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + 
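+  // Same structure as Conv2dK1x7S1 with height and width swapped: the
+  // output is tiled 4 rows high by 1 column wide (the 4, 1 passed to
+  // ResizeOutAndPadInOut below), four consecutive output rows are
+  // gathered element-wise into one float32x4_t, and the seven filter
+  // taps then slide vertically across ten input rows via vextq_f32.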
std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 4, + 1, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 4) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + if (m + 3 < out_channels) { + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output_data + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output_data + b * out_batch_size + (m + 3) * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; + const float + *filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; + const float + *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; + const float + *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; + /* load filter (4 outch x 4 height x 1 width) */ + float32x4_t vf00, vf01; + float32x4_t vf10, vf11; + float32x4_t vf20, vf21; + float32x4_t vf30, vf31; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + vf10 = vld1q_f32(filter_ptr1); + vf11 = vld1q_f32(filter_ptr1 + 3); + vf20 = vld1q_f32(filter_ptr2); + vf21 = vld1q_f32(filter_ptr2 + 3); + vf30 = vld1q_f32(filter_ptr3); + vf31 = vld1q_f32(filter_ptr3 + 3); + + for (index_t h = 0; h + 3 < out_height; h += 4) { + for (index_t w = 0; w < out_width; ++w) { + // load output + index_t out_offset = h * out_width + w; + // output (4 outch x 4 height x 1 width): vo_outch_height + float32x4_t vo0 = {out_ptr0_base[out_offset], + out_ptr0_base[out_offset + out_width], + out_ptr0_base[out_offset + 2 * out_width], + out_ptr0_base[out_offset + 3 * out_width]}; + float32x4_t vo1 = {out_ptr1_base[out_offset], + out_ptr1_base[out_offset + out_width], + out_ptr1_base[out_offset + 2 * out_width], + out_ptr1_base[out_offset + 3 * out_width]}; + float32x4_t vo2 = {out_ptr2_base[out_offset], + out_ptr2_base[out_offset + out_width], + out_ptr2_base[out_offset + 2 * out_width], + out_ptr2_base[out_offset + 3 * out_width]}; + float32x4_t vo3 = {out_ptr3_base[out_offset], + out_ptr3_base[out_offset + out_width], + out_ptr3_base[out_offset + 2 * out_width], + out_ptr3_base[out_offset + 3 * out_width]}; + + // input offset + index_t 
in_offset = h * in_width + w; + // input (3 slide) + float32x4_t vi0 = {in_ptr_base[in_offset], + in_ptr_base[in_offset + in_width], + in_ptr_base[in_offset + 2 * in_width], + in_ptr_base[in_offset + 3 * in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], + in_ptr_base[in_offset + 5 * in_width], + in_ptr_base[in_offset + 6 * in_width], + in_ptr_base[in_offset + 7 * in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], + in_ptr_base[in_offset + 9 * in_width]}; + float32x4_t vi1 = vextq_f32(vi0, vi4, 1); + float32x4_t vi2 = vextq_f32(vi0, vi4, 2); + float32x4_t vi3 = vextq_f32(vi0, vi4, 3); + float32x4_t vi5 = vextq_f32(vi4, vi8, 1); + float32x4_t vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + /* outch 0 */ + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); + /* outch 1 */ + vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); + vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); + vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); + vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); + vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); + vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); + vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); + /* outch 2 */ + vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); + vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); + vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); + vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); + vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); + vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); + vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); + /* outch 3 */ + vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); + vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); + vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); + vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); + vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); + vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); + vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); +#else + /* outch 0 */ + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); + /* outch 1 */ + vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); + vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); + vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); + /* outch 2 */ + vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); + vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); + vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); + /* outch 3 */ + vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 
1); + vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); + vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); + vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); +#endif + + out_ptr0_base[out_offset] = vo0[0]; + out_ptr0_base[out_offset + out_width] = vo0[1]; + out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; + out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; + out_ptr1_base[out_offset] = vo1[0]; + out_ptr1_base[out_offset + out_width] = vo1[1]; + out_ptr1_base[out_offset + 2 * out_width] = vo1[2]; + out_ptr1_base[out_offset + 3 * out_width] = vo1[3]; + out_ptr2_base[out_offset] = vo2[0]; + out_ptr2_base[out_offset + out_width] = vo2[1]; + out_ptr2_base[out_offset + 2 * out_width] = vo2[2]; + out_ptr2_base[out_offset + 3 * out_width] = vo2[3]; + out_ptr3_base[out_offset] = vo3[0]; + out_ptr3_base[out_offset + out_width] = vo3[1]; + out_ptr3_base[out_offset + 2 * out_width] = vo3[2]; + out_ptr3_base[out_offset + 3 * out_width] = vo3[3]; + } // w + } // h + } // c + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf00, vf01; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + + for (index_t h = 0; h + 3 < out_height; h += 4) { + for (index_t w = 0; w < out_width; ++w) { + // load output + index_t out_offset = h * out_width + w; + // output (1 outch x 4 height x 1 width): vo_outch_height + float32x4_t vo0 = {out_ptr0_base[out_offset], + out_ptr0_base[out_offset + out_width], + out_ptr0_base[out_offset + 2 * out_width], + out_ptr0_base[out_offset + 3 * out_width]}; + + // input offset + index_t in_offset = h * in_width + w; + // input (3 slide) + float32x4_t vi0 = {in_ptr_base[in_offset], + in_ptr_base[in_offset + in_width], + in_ptr_base[in_offset + 2 * in_width], + in_ptr_base[in_offset + 3 * in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], + in_ptr_base[in_offset + 5 * in_width], + in_ptr_base[in_offset + 6 * in_width], + in_ptr_base[in_offset + 7 * in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], + in_ptr_base[in_offset + 9 * in_width], + in_ptr_base[in_offset + 10 * in_width], + in_ptr_base[in_offset + 11 * in_width]}; + float32x4_t vi1 = vextq_f32(vi0, vi4, 1); + float32x4_t vi2 = vextq_f32(vi0, vi4, 2); + float32x4_t vi3 = vextq_f32(vi0, vi4, 3); + float32x4_t vi5 = vextq_f32(vi4, vi8, 1); + float32x4_t vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); +#else + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); +#endif + + out_ptr0_base[out_offset] = vo0[0]; + out_ptr0_base[out_offset + 
out_width] = vo0[1]; + out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; + out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; + } // w + } // h + } // c + } + } // if + } // m + } // b + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; +} + + +// ==== + +MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + const index_t tile_height = + out_shape[1] < 4 ? RoundUpDiv4(out_shape[2]) : out_shape[2]; + +#pragma omp parallel for collapse(3) schedule(runtime) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { + for (index_t h = 0; h < out_shape[2]; h += tile_height) { + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + float *out_ptr_base = + output_data + b * out_batch_size + m * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter_data + m * in_channels * 15 + c * 15; + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf0, vf1, vf2, vf3; + vf0 = vld1q_f32(filter_ptr); + vf1 = vld1q_f32(filter_ptr + 4); + vf2 = vld1q_f32(filter_ptr + 8); + vf3 = vld1q_f32(filter_ptr + 11); + + for (index_t ht = 0; ht < tile_height && h + ht < out_height; ++ht) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo; + // load output + index_t out_offset = (h + ht) * out_width + w; + vo = vld1q_f32(out_ptr_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9, + vi10, vi11, vi12, vi13, vi14, vi16; + // input offset + index_t in_offset = (h + ht) * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi12 = vld1q_f32(in_ptr_base + in_offset + 12); + vi16 = vld1q_f32(in_ptr_base + in_offset + 16); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + vi7 = vextq_f32(vi4, vi8, 3); + vi9 = vextq_f32(vi8, vi12, 1); + vi10 = vextq_f32(vi8, vi12, 2); + vi11 = vextq_f32(vi8, vi12, 3); + vi13 = vextq_f32(vi12, vi16, 1); + vi14 = vextq_f32(vi12, vi16, 2); + + vo = vmlaq_lane_f32(vo, vi0, 
vget_low_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); + vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); + vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); + + vst1q_f32(out_ptr_base + out_offset, vo); + } // w + } // ht + } // c + } // h + } // m + } // b + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 4, + 1, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + const index_t tile_width = + out_shape[1] < 4 ? 
RoundUpDiv4(out_shape[3]) : out_shape[3]; + +#pragma omp parallel for collapse(3) schedule(runtime) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { + for (index_t w = 0; w < out_shape[3]; w += tile_width) { + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + float *out_ptr_base = + output_data + b * out_batch_size + m * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter_data + m * in_channels * 15 + c * 15; + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf0, vf1, vf2, vf3; + vf0 = vld1q_f32(filter_ptr); + vf1 = vld1q_f32(filter_ptr + 4); + vf2 = vld1q_f32(filter_ptr + 8); + vf3 = vld1q_f32(filter_ptr + 11); + + for (index_t h = 0; h + 3 < out_height; h += 4) { + for (index_t wt = 0; wt < tile_width && w + wt < out_width; ++wt) { + // load output + index_t out_offset = h * out_width + w + wt; + // output (1 outch x 4 height x 1 width): vo_outch_height + float32x4_t vo = {out_ptr_base[out_offset], + out_ptr_base[out_offset + out_width], + out_ptr_base[out_offset + 2 * out_width], + out_ptr_base[out_offset + 3 * out_width]}; + + // input offset + index_t in_offset = h * in_width + w + wt; + // input (3 slide) + float32x4_t vi0 = {in_ptr_base[in_offset], + in_ptr_base[in_offset + in_width], + in_ptr_base[in_offset + 2 * in_width], + in_ptr_base[in_offset + 3 * in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], + in_ptr_base[in_offset + 5 * in_width], + in_ptr_base[in_offset + 6 * in_width], + in_ptr_base[in_offset + 7 * in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], + in_ptr_base[in_offset + 9 * in_width], + in_ptr_base[in_offset + 10 * in_width], + in_ptr_base[in_offset + 11 * in_width]}; + float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width], + in_ptr_base[in_offset + 13 * in_width], + in_ptr_base[in_offset + 14 * in_width], + in_ptr_base[in_offset + 15 * in_width]}; + float32x4_t vi16 = {in_ptr_base[in_offset + 16 * in_width], + in_ptr_base[in_offset + 17 * in_width]}; + float32x4_t vi1 = vextq_f32(vi0, vi4, 1); + float32x4_t vi2 = vextq_f32(vi0, vi4, 2); + float32x4_t vi3 = vextq_f32(vi0, vi4, 3); + float32x4_t vi5 = vextq_f32(vi4, vi8, 1); + float32x4_t vi6 = vextq_f32(vi4, vi8, 2); + float32x4_t vi7 = vextq_f32(vi4, vi8, 3); + float32x4_t vi9 = vextq_f32(vi8, vi12, 1); + float32x4_t vi10 = vextq_f32(vi8, vi12, 2); + float32x4_t vi11 = vextq_f32(vi8, vi12, 3); + float32x4_t vi13 = vextq_f32(vi12, vi16, 1); + float32x4_t vi14 = vextq_f32(vi12, vi16, 2); + + vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); + vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); + vo = vmlaq_lane_f32(vo, 
vi14, vget_high_f32(vf3), 1); + + out_ptr_base[out_offset] = vo[0]; + out_ptr_base[out_offset + out_width] = vo[1]; + out_ptr_base[out_offset + 2 * out_width] = vo[2]; + out_ptr_base[out_offset + 3 * out_width] = vo[3]; + } // wt + } // h + } // c + } // w + } // m + } // b + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_1xn.h b/mace/ops/arm/fp32/conv_2d_1xn.h new file mode 100644 index 00000000..a4a5e899 --- /dev/null +++ b/mace/ops/arm/fp32/conv_2d_1xn.h @@ -0,0 +1,86 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_ARM_FP32_CONV_2D_1XN_H_ +#define MACE_OPS_ARM_FP32_CONV_2D_1XN_H_ + +#include +#include "mace/public/mace.h" +#include "mace/core/tensor.h" +#include "mace/core/op_context.h" +#include "mace/ops/arm/fp32/conv_2d.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +class Conv2dK1x7S1 : public Conv2dBase { + public: + Conv2dK1x7S1(const std::vector paddings, const Padding padding_type) + : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + virtual ~Conv2dK1x7S1() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output); +}; + +class Conv2dK7x1S1 : public Conv2dBase { + public: + Conv2dK7x1S1(const std::vector paddings, const Padding padding_type) + : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + virtual ~Conv2dK7x1S1() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output); +}; + +class Conv2dK1x15S1 : public Conv2dBase { + public: + Conv2dK1x15S1(const std::vector paddings, const Padding padding_type) + : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + virtual ~Conv2dK1x15S1() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output); +}; + +class Conv2dK15x1S1 : public Conv2dBase { + public: + Conv2dK15x1S1(const std::vector paddings, const Padding padding_type) + : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + virtual ~Conv2dK15x1S1() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output); +}; + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_FP32_CONV_2D_1XN_H_ diff --git a/mace/ops/arm/conv_2d_neon_3x3.cc b/mace/ops/arm/fp32/conv_2d_3x3.cc similarity index 85% rename from mace/ops/arm/conv_2d_neon_3x3.cc rename to mace/ops/arm/fp32/conv_2d_3x3.cc index 3555b4a5..a8ce5fa6 100644 --- a/mace/ops/arm/conv_2d_neon_3x3.cc +++ b/mace/ops/arm/fp32/conv_2d_3x3.cc @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. 
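The 1xN kernels above all lean on one sliding-window idiom: three 4-lane loads provide every shifted window a 7-tap horizontal filter needs, synthesized with vextq_f32, so no input lane is reloaded from memory. A stripped-down sketch of one 4-wide output computation; the function name is illustrative, and the real kernels additionally tile four output channels and switch between vmlaq_lane_f32 and vfmaq_laneq_f32 for armv7 versus aarch64:

#include <arm_neon.h>

// Computes 4 adjacent outputs of a 7-tap horizontal filter.
// k7 points at taps k0..k6; the second load overlaps at k3 (lane 0 unused),
// mirroring the vld1q_f32(filter_ptr + 3) pattern in the kernels above.
void Conv1x7FourOutputs(const float *in, const float *k7, float *out) {
  float32x4_t vi0 = vld1q_f32(in);        // x0..x3
  float32x4_t vi4 = vld1q_f32(in + 4);    // x4..x7
  float32x4_t vi8 = vld1q_f32(in + 8);    // x8..x11
  float32x4_t vf0 = vld1q_f32(k7);        // k0..k3
  float32x4_t vf1 = vld1q_f32(k7 + 3);    // k3..k6
  float32x4_t vo = vmulq_lane_f32(vi0, vget_low_f32(vf0), 0);
  vo = vmlaq_lane_f32(vo, vextq_f32(vi0, vi4, 1), vget_low_f32(vf0), 1);
  vo = vmlaq_lane_f32(vo, vextq_f32(vi0, vi4, 2), vget_high_f32(vf0), 0);
  vo = vmlaq_lane_f32(vo, vextq_f32(vi0, vi4, 3), vget_high_f32(vf0), 1);
  vo = vmlaq_lane_f32(vo, vi4,                    vget_low_f32(vf1), 1);
  vo = vmlaq_lane_f32(vo, vextq_f32(vi4, vi8, 1), vget_high_f32(vf1), 0);
  vo = vmlaq_lane_f32(vo, vextq_f32(vi4, vi8, 2), vget_high_f32(vf1), 1);
  vst1q_f32(out, vo);
}

The Kx1 variants apply the same vextq_f32 shifts vertically, after first gathering row-strided elements into vectors lane by lane.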
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,22 +12,49 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(MACE_ENABLE_NEON) #include -#endif - -#include "mace/utils/macros.h" -#include "mace/ops/arm/conv_2d_neon.h" +#include +#include "mace/ops/arm/fp32/conv_2d_3x3.h" namespace mace { namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 2, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); -// Ho = 2, Wo = 4, Co = 2 -void Conv2dNeonK3x3S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -42,26 +69,26 @@ void Conv2dNeonK3x3S1(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 1 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 1) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr0 = input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 9 + c * 9; + const float + *in_ptr0 = input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = filter_data + m * in_channels * 9 + c * 9; -#if defined(MACE_ENABLE_NEON) float *out_ptr1 = out_ptr1_base; const float *in_ptr1 = - input + b * in_batch_size + c * in_image_size + 1 * in_width; + input_data + b * in_batch_size + c * in_image_size + 1 * in_width; const float *in_ptr2 = - input + b * in_batch_size + c * in_image_size + 2 * in_width; + input_data + b * in_batch_size + c * in_image_size + 2 * in_width; const float *in_ptr3 = - input + b * in_batch_size + c * in_image_size + 3 * in_width; - const float *filter_ptr1 = filter + (m + 1) * in_channels * 9 + c * 9; -#endif -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) + input_data + b * in_batch_size + c * in_image_size + 3 * in_width; + const float + *filter_ptr1 = filter_data + (m + 1) * in_channels * 9 + c * 9; + +#if defined(__aarch64__) float *out_ptr0 = out_ptr0_base; // load filter (2 outch x 3 height x 3 width): vf_outch_height @@ -179,7 +206,7 @@ void Conv2dNeonK3x3S1(const float *input, out_ptr0 += out_width; out_ptr1 += out_width; 
} // h -#elif defined(MACE_ENABLE_NEON) // arm v7 +#else // arm v7 float *out_ptr0 = out_ptr0_base; // load filter (2 outch x 3 height x 3 width): vf_outch_height @@ -301,32 +328,28 @@ void Conv2dNeonK3x3S1(const float *input, out_ptr0 += out_width; out_ptr1 += out_width; } // h -#else - for (index_t oc = 0; oc < 2; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0 + oc * in_channels * 9, - in_width, 3, 3, out_height, out_width, - out_ptr0_base + oc * out_image_size, 1); - } #endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr0 = - input + b * in_batch_size + c * in_image_size; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; const float *in_ptr1 = - input + b * in_batch_size + c * in_image_size + 1 * in_width; + input_data + b * in_batch_size + c * in_image_size + + 1 * in_width; const float *in_ptr2 = - input + b * in_batch_size + c * in_image_size + 2 * in_width; + input_data + b * in_batch_size + c * in_image_size + + 2 * in_width; const float *in_ptr3 = - input + b * in_batch_size + c * in_image_size + 3 * in_width; -#endif - const float *filter_ptr0 = filter + mm * in_channels * 9 + c * 9; + input_data + b * in_batch_size + c * in_image_size + + 3 * in_width; + const float + *filter_ptr0 = filter_data + mm * in_channels * 9 + c * 9; -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) +#if defined(__aarch64__) float *out_ptr0 = out_ptr0_base; // load filter (1 outch x 3 height x 3 width): vf_outch_height @@ -409,7 +432,7 @@ void Conv2dNeonK3x3S1(const float *input, out_ptr0 += out_width; } // h -#elif defined(MACE_ENABLE_NEON) // arm v7 +#else // arm v7 float *out_ptr0 = out_ptr0_base; // load filter (1 outch x 3 height x 3 width): vf_outch_height @@ -494,22 +517,52 @@ void Conv2dNeonK3x3S1(const float *input, out_ptr0 += out_width; } // h -#else - Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0, in_width, 3, 3, out_height, - out_width, out_ptr0_base, 1); #endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } -void Conv2dNeonK3x3S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -523,11 +576,12 @@ void Conv2dNeonK3x3S2(const float *input, const index_t in_width = 
in_shape[3]; const index_t out_height = out_shape[2]; const index_t out_width = out_shape[3]; - const float *in_base = input + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter + m * in_channels * 9 + c * 9; - float *out_base = output + b * out_batch_size + m * out_image_size; + const float + *in_base = input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter_data + m * in_channels * 9 + c * 9; + float *out_base = output_data + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) +#if defined(__aarch64__) // load filter (1 outch x 3 height x 3 width): vf_outch_height float32x4_t vf00, vf01, vf02; vf00 = vld1q_f32(filter_ptr); @@ -587,7 +641,7 @@ void Conv2dNeonK3x3S2(const float *input, vst1q_f32(out_base + out_offset, vo); } // w } // h -#elif defined(MACE_ENABLE_NEON) // arm v7 +#else // arm v7 // load filter (1 outch x 3 height x 3 width): vf_outch_height float32x2_t vf01, vf23, vf45, vf67, vf78; vf01 = vld1_f32(filter_ptr); @@ -649,14 +703,16 @@ void Conv2dNeonK3x3S2(const float *input, vst1q_f32(out_base + out_offset, vo); } // w } // h -#else - Conv2dCPUKHxKWCalc(in_base, filter_ptr, in_width, 3, 3, out_height, - out_width, out_base, 2); #endif } // c } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } +} // namespace fp32 +} // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_3x3.h b/mace/ops/arm/fp32/conv_2d_3x3.h new file mode 100644 index 00000000..66d47801 --- /dev/null +++ b/mace/ops/arm/fp32/conv_2d_3x3.h @@ -0,0 +1,60 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
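Why Conv2dK3x3S1 above streams four input row pointers (in_ptr0..in_ptr3): adjacent output rows of a 3x3 stride-1 filter share two of their three input rows, so a two-row output tile touches four input rows instead of six. The same idea in scalar form, illustrative only:

// Output rows 0 and 1 reuse input rows 1 and 2; only four rows are read.
void Conv3x3S1TwoRows(const float *in, int in_w,
                      const float *k,          // 9 taps, row-major
                      float *out0, float *out1, int out_w) {
  for (int w = 0; w < out_w; ++w) {
    float s0 = 0.f, s1 = 0.f;
    for (int kh = 0; kh < 3; ++kh) {
      for (int kw = 0; kw < 3; ++kw) {
        float tap = k[kh * 3 + kw];
        s0 += tap * in[kh * in_w + w + kw];        // rows 0, 1, 2
        s1 += tap * in[(kh + 1) * in_w + w + kw];  // rows 1, 2, 3 (shared)
      }
    }
    out0[w] = s0;
    out1[w] = s1;
  }
}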
+ +#ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_H_ +#define MACE_OPS_ARM_FP32_CONV_2D_3X3_H_ + +#include +#include "mace/public/mace.h" +#include "mace/core/tensor.h" +#include "mace/core/op_context.h" +#include "mace/ops/arm/fp32/conv_2d.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +class Conv2dK3x3S1 : public Conv2dBase { + public: + Conv2dK3x3S1(const std::vector paddings, const Padding padding_type) + : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + virtual ~Conv2dK3x3S1() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output); +}; + +class Conv2dK3x3S2 : public Conv2dBase { + public: + Conv2dK3x3S2(const std::vector paddings, const Padding padding_type) + : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {} + virtual ~Conv2dK3x3S2() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output); +}; + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_FP32_CONV_2D_3X3_H_ diff --git a/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc index 84f12125..b894a60a 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc +++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc @@ -34,13 +34,6 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, const index_t in_width = input->dim(3); const index_t out_channels = filter->dim(0); - index_t padded_in_height = in_height + pad_top_ + pad_bottom_; - index_t padded_in_width = in_width + pad_left_ + pad_right_; - - index_t out_height = padded_in_height - 2; - index_t out_width = padded_in_width - 2; - output->Resize({batch, out_channels, out_height, out_width}); - // When size of input feature map is bigger than 16x16, // set winograd out tile size to 6 to get higher performance. 
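 // For scale: with output tile t, each input tile is (t + 2) x (t + 2)
 // (3x3 filter, stride 1), so the transformed input and output buffers
 // each hold roughly batch * channels * (padded_out_h / t) *
 // (padded_out_w / t) * (t + 2)^2 floats -- the quantity the
 // transformed_in_size / transformed_out_size computations below
 // account for.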
index_t out_tile_size = 2; @@ -48,10 +41,35 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, out_tile_size = 6; } - const index_t padded_out_height = RoundUp(out_height, out_tile_size); - const index_t padded_out_width = RoundUp(out_width, out_tile_size); - padded_in_height = std::max(padded_in_height, padded_out_height + 2); - padded_in_width = std::max(padded_in_width, padded_out_width + 2); + std::vector output_shape; + std::vector in_pad_size; + std::vector out_pad_size; + CalOutputShapeAndPadSize(input, + filter, + out_tile_size, + out_tile_size, + &output_shape, + &in_pad_size, + &out_pad_size); + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard output_guard(output); + + const index_t out_height = output_shape[2]; + const index_t out_width = output_shape[3]; + const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1]; + const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3]; + const index_t + padded_out_height = out_height + out_pad_size[0] + out_pad_size[1]; + const index_t + padded_out_width = out_width + out_pad_size[2] + out_pad_size[3]; + const int pad_top = in_pad_size[0]; + const int pad_left = in_pad_size[2]; + + bool is_in_padded = + padded_in_height != in_height || padded_in_width != in_width; bool is_out_padded = padded_out_height != out_height || padded_out_width != out_width; @@ -63,8 +81,9 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, // pad input and transform input auto scratch_buffer = context->device()->scratch_buffer(); - const index_t padded_in_size = PadAlignSize( - sizeof(float) * batch * in_channels * padded_in_height * padded_in_width); + const index_t padded_in_size = is_in_padded ? PadAlignSize( + sizeof(float) * batch * in_channels * padded_in_height + * padded_in_width) : 0; const index_t padded_out_size = is_out_padded ? 
PadAlignSize( sizeof(float) * batch * out_channels * padded_out_height * padded_out_width) : 0; @@ -81,8 +100,18 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, scratch_buffer->GrowSize( padded_in_size + padded_out_size + transformed_in_size + transformed_out_size + gemm_pack_size); - Tensor padded_in(scratch_buffer->Scratch(padded_in_size), DataType::DT_FLOAT); - padded_in.Resize({batch, in_channels, padded_in_height, padded_in_width}); + + const Tensor *padded_in = input; + Tensor tmp_padded_in + (scratch_buffer->Scratch(padded_in_size), DataType::DT_FLOAT); + if (is_in_padded) { + tmp_padded_in.Resize({batch, in_channels, padded_in_height, + padded_in_width}); + Tensor::MappingGuard guard(&tmp_padded_in); + PadInput(*input, pad_top, pad_left, &tmp_padded_in); + padded_in = &tmp_padded_in; + } + Tensor *padded_out = output; Tensor tmp_padded_out (scratch_buffer->Scratch(padded_out_size), DataType::DT_FLOAT); @@ -94,22 +123,10 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, auto transformed_in = scratch_buffer->Scratch(transformed_in_size); auto transformed_out = scratch_buffer->Scratch(transformed_out_size); - - auto padded_in_data = padded_in.data(); + auto padded_in_data = padded_in->data(); auto padded_out_data = padded_out->mutable_data(); auto transformed_in_data = transformed_in.mutable_data(); auto transformed_out_data = transformed_out.mutable_data(); - - const index_t padded_bottom = padded_in_height - in_height - pad_top_; - const index_t padded_right = padded_in_width - in_width - pad_left_; - ConstructNCHWInputWithSpecificPadding(input, - pad_top_, - padded_bottom, - pad_left_, - padded_right, - &padded_in); - - Tensor::MappingGuard filter_guard(filter); auto filter_data = filter->data(); if (!filter->is_weight() || out_tile_size != out_tile_size_) { @@ -215,47 +232,11 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, default:MACE_NOT_IMPLEMENTED; } - if (is_out_padded) { - UnPackOutput(*padded_out, output); - } + UnPadOutput(*padded_out, output); return MaceStatus::MACE_SUCCESS; } -void Conv2dK3x3Winograd::UnPackOutput(const Tensor &src, Tensor *dst) { - const index_t batch = dst->dim(0); - const index_t channels = dst->dim(1); - const index_t height = dst->dim(2); - const index_t width = dst->dim(3); - const index_t padded_height = src.dim(2); - const index_t padded_width = src.dim(3); - - if (height == padded_height && width == padded_width) { - return; - } - - auto padded_out_data = src.data(); - auto out_data = dst->mutable_data(); - - const index_t img_size = height * width; - const index_t padded_img_size = padded_height * padded_width; - -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - for (index_t h = 0; h < height; ++h) { - memcpy( - out_data + (b * channels + c) * img_size - + h * width, - padded_out_data - + (b * channels + c) * padded_img_size - + h * padded_width, - sizeof(float) * width); - } // h - } // c - } // b -} - // OCHW => TOC void Conv2dK3x3Winograd::TransformFilter4x4(const float *filter, const index_t in_channels, diff --git a/mace/ops/arm/fp32/conv_2d_3x3_winograd.h b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h index bd5d3e0c..3ed8646b 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3_winograd.h +++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h @@ -15,6 +15,7 @@ #ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_ #define MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_ +#include #include #include "mace/public/mace.h" @@ 
-30,12 +31,10 @@ namespace fp32 { class Conv2dK3x3Winograd : public Conv2dBase { public: - Conv2dK3x3Winograd(int pad_top, int pad_bottom, int pad_left, int pad_right) - : gemm_(), - pad_top_(pad_top), - pad_bottom_(pad_bottom), - pad_left_(pad_left), - pad_right_(pad_right), + Conv2dK3x3Winograd(const std::vector paddings, + const Padding padding_type) + : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type), + gemm_(), transformed_filter_(nullptr), out_tile_size_(0) {} @@ -48,9 +47,6 @@ class Conv2dK3x3Winograd : public Conv2dBase { Tensor *output); private: - void UnPackOutput(const Tensor &padded_output, - Tensor *output); - void TransformFilter4x4(const float *filter, const index_t in_channels, const index_t out_channels, @@ -94,10 +90,6 @@ class Conv2dK3x3Winograd : public Conv2dBase { float *output); Gemm gemm_; - int pad_top_; - int pad_bottom_; - int pad_left_; - int pad_right_; std::unique_ptr transformed_filter_; index_t out_tile_size_; }; diff --git a/mace/ops/arm/conv_2d_neon_5x5.cc b/mace/ops/arm/fp32/conv_2d_5x5.cc similarity index 77% rename from mace/ops/arm/conv_2d_neon_5x5.cc rename to mace/ops/arm/fp32/conv_2d_5x5.cc index 81d89297..264e48fa 100644 --- a/mace/ops/arm/conv_2d_neon_5x5.cc +++ b/mace/ops/arm/fp32/conv_2d_5x5.cc @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(MACE_ENABLE_NEON) #include -#endif - -#include "mace/ops/arm/conv_2d_neon.h" +#include +#include "mace/ops/arm/fp32/conv_2d_5x5.h" namespace mace { namespace ops { +namespace arm { +namespace fp32 { #define MACE_Conv2dNeonK5x5SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ @@ -76,12 +76,40 @@ namespace ops { vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \ vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1); -// Ho = 1, Wo = 4, Co = 4 -void Conv2dNeonK5x5S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -96,26 +124,26 @@ void Conv2dNeonK5x5S1(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * 
out_image_size; -#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * out_batch_size + (m + 1) * out_image_size; float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * out_batch_size + (m + 2) * out_image_size; float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 3) * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 25 + c * 25; -#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 25 + c * 25; const float *filter_ptr1 = - filter + (m + 1) * in_channels * 25 + c * 25; + filter_data + (m + 1) * in_channels * 25 + c * 25; const float *filter_ptr2 = - filter + (m + 2) * in_channels * 25 + c * 25; + filter_data + (m + 2) * in_channels * 25 + c * 25; const float *filter_ptr3 = - filter + (m + 3) * in_channels * 25 + c * 25; + filter_data + (m + 3) * in_channels * 25 + c * 25; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -158,23 +186,16 @@ void Conv2dNeonK5x5S1(const float *input, filter_ptr3 -= 25; } // w } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 25, - in_width, 5, 5, out_height, out_width, - out_ptr0_base + oc * out_image_size, 1); - } -#endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 25 + c * 25; -#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 25 + c * 25; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -204,16 +225,17 @@ void Conv2dNeonK5x5S1(const float *input, filter_ptr0 -= 25; } // w } // h -#else - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 5, 5, - out_height, out_width, out_ptr0_base, 1); -#endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } +} // namespace fp32 +} // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_5x5.h b/mace/ops/arm/fp32/conv_2d_5x5.h new file mode 100644 index 00000000..154d74a8 --- /dev/null +++ b/mace/ops/arm/fp32/conv_2d_5x5.h @@ -0,0 +1,48 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_CONV_2D_5X5_H_
+#define MACE_OPS_ARM_FP32_CONV_2D_5X5_H_
+
+#include <vector>
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/conv_2d.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Conv2dK5x5S1 : public Conv2dBase {
+ public:
+  Conv2dK5x5S1(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK5x5S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_CONV_2D_5X5_H_
diff --git a/mace/ops/arm/conv_2d_neon_7x7.cc b/mace/ops/arm/fp32/conv_2d_7x7.cc
similarity index 78%
rename from mace/ops/arm/conv_2d_neon_7x7.cc
rename to mace/ops/arm/fp32/conv_2d_7x7.cc
index 2411aad6..86d3e468 100644
--- a/mace/ops/arm/conv_2d_neon_7x7.cc
+++ b/mace/ops/arm/fp32/conv_2d_7x7.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,14 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
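The 5x5 and 7x7 kernels in this patch all follow one skeleton: resize the output, pad input and output up to the register tile, run the NEON loops, then crop. A minimal sketch of the shared Compute() body, assuming the unique_ptr element type (stripped by the diff formatting) is Tensor:

  // Sketch only -- mirrors the Compute() bodies in this patch, not a
  // verbatim excerpt. ResizeOutAndPadInOut/UnPadOutput are the
  // Conv2dBase helpers invoked exactly like this in conv_2d_5x5.cc.
  std::unique_ptr<Tensor> padded_input;   // element type assumed: Tensor
  std::unique_ptr<Tensor> padded_output;
  // Resize output; pad so the output plane is a multiple of the
  // 1 (height) x 4 (width) tile the NEON loops produce per iteration.
  ResizeOutAndPadInOut(context, input, filter, output,
                       1, 4, &padded_input, &padded_output);
  const Tensor *in_tensor = padded_input ? padded_input.get() : input;
  Tensor *out_tensor = padded_output ? padded_output.get() : output;
  out_tensor->Clear();                    // kernels accumulate into output
  // ... kernel- and stride-specific NEON loops over in_tensor/out_tensor ...
  UnPadOutput(*out_tensor, output);       // copy the valid region back
  return MaceStatus::MACE_SUCCESS;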
-#if defined(MACE_ENABLE_NEON) #include -#endif - -#include "mace/ops/arm/conv_2d_neon.h" +#include +#include "mace/ops/arm/fp32/conv_2d_7x7.h" namespace mace { namespace ops { +namespace arm { +namespace fp32 { #define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ @@ -153,12 +153,40 @@ namespace ops { vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \ vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); -// Ho = 1, Wo = 4, Co = 4 -void Conv2dNeonK7x7S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -173,26 +201,25 @@ void Conv2dNeonK7x7S1(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * out_batch_size + (m + 1) * out_image_size; float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * out_batch_size + (m + 2) * out_image_size; float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; const float *filter_ptr1 = - filter + (m + 1) * in_channels * 49 + c * 49; + filter_data + (m + 1) * in_channels * 49 + c * 49; const float *filter_ptr2 = - filter + (m + 2) * in_channels * 49 + c * 49; + filter_data + (m + 2) * in_channels * 49 + c * 49; const float *filter_ptr3 = - filter + (m + 3) * in_channels * 49 + c * 49; + filter_data + (m + 3) * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -243,23 +270,16 @@ void Conv2dNeonK7x7S1(const float *input, filter_ptr3 -= 49; } // w } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49, - in_width, 7, 
7, out_height, out_width, - out_ptr0_base + oc * out_image_size, 1); - } -#endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -297,23 +317,50 @@ void Conv2dNeonK7x7S1(const float *input, filter_ptr0 -= 49; } // w } // h -#else - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7, - out_height, out_width, out_ptr0_base, 1); -#endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } -// Ho = 1, Wo = 4, Co = 4 -void Conv2dNeonK7x7S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -328,26 +375,25 @@ void Conv2dNeonK7x7S2(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * out_batch_size + (m + 1) * out_image_size; float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * out_batch_size + (m + 2) * out_image_size; float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; const float *filter_ptr1 = - filter + (m + 1) * in_channels * 49 + c * 49; + filter_data + (m + 1) * 
in_channels * 49 + c * 49; const float *filter_ptr2 = - filter + (m + 2) * in_channels * 49 + c * 49; + filter_data + (m + 2) * in_channels * 49 + c * 49; const float *filter_ptr3 = - filter + (m + 3) * in_channels * 49 + c * 49; + filter_data + (m + 3) * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -403,23 +449,16 @@ void Conv2dNeonK7x7S2(const float *input, filter_ptr3 -= 49; } // w } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49, - in_width, 7, 7, out_height, out_width, - out_ptr0_base + oc * out_image_size, 2); - } -#endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -462,23 +501,50 @@ void Conv2dNeonK7x7S2(const float *input, filter_ptr0 -= 49; } // w } // h -#else - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7, - out_height, out_width, out_ptr0_base, 2); -#endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } -// Ho = 1, Wo = 4, Co = 4 -void Conv2dNeonK7x7S3(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -493,26 +559,25 @@ void Conv2dNeonK7x7S3(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * out_batch_size + (m + 1) * out_image_size; float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * out_batch_size + (m + 2) * 
out_image_size; float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; const float *filter_ptr1 = - filter + (m + 1) * in_channels * 49 + c * 49; + filter_data + (m + 1) * in_channels * 49 + c * 49; const float *filter_ptr2 = - filter + (m + 2) * in_channels * 49 + c * 49; + filter_data + (m + 2) * in_channels * 49 + c * 49; const float *filter_ptr3 = - filter + (m + 3) * in_channels * 49 + c * 49; + filter_data + (m + 3) * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -568,23 +633,16 @@ void Conv2dNeonK7x7S3(const float *input, filter_ptr3 -= 49; } // w } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49, - in_width, 7, 7, out_height, out_width, - out_ptr0_base + oc * out_image_size, 3); - } -#endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -627,16 +685,17 @@ void Conv2dNeonK7x7S3(const float *input, filter_ptr0 -= 49; } // w } // h -#else - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7, - out_height, out_width, out_ptr0_base, 3); -#endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } +} // namespace fp32 +} // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_7x7.h b/mace/ops/arm/fp32/conv_2d_7x7.h new file mode 100644 index 00000000..e64780ba --- /dev/null +++ b/mace/ops/arm/fp32/conv_2d_7x7.h @@ -0,0 +1,73 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
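All three 7x7 variants above keep the same register tiling (Ho = 1, Wo = 4, Co = 4); only the stride changes which input columns each of the four output lanes reads. A scalar equivalent of one 4-wide tile for a single output channel, with s the stride (1, 2 or 3) and dilation 1, matching the in_offset + n * stride_w + kw indexing used in the kernels (in, out, flt and the shape variables are assumed from the surrounding code):

  // Illustrative scalar form of one 1x4 output tile, one output channel.
  for (int n = 0; n < 4; ++n) {
    float acc = out[h * out_width + w + n];
    for (int kh = 0; kh < 7; ++kh) {
      for (int kw = 0; kw < 7; ++kw) {
        // consecutive outputs are s input columns apart
        acc += in[(h * s + kh) * in_width + (w + n) * s + kw]
               * flt[kh * 7 + kw];
      }
    }
    out[h * out_width + w + n] = acc;
  }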
+
+#ifndef MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
+#define MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
+
+#include <vector>
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/conv_2d.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Conv2dK7x7S1 : public Conv2dBase {
+ public:
+  Conv2dK7x7S1(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK7x7S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+class Conv2dK7x7S2 : public Conv2dBase {
+ public:
+  Conv2dK7x7S2(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK7x7S2() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+class Conv2dK7x7S3 : public Conv2dBase {
+ public:
+  Conv2dK7x7S3(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({3, 3}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK7x7S3() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
diff --git a/mace/ops/arm/fp32/conv_general.cc b/mace/ops/arm/fp32/conv_general.cc
new file mode 100644
index 00000000..a12c5d53
--- /dev/null
+++ b/mace/ops/arm/fp32/conv_general.cc
@@ -0,0 +1,232 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
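Note that in the header above the stride is baked into the delegator type through the Conv2dBase constructor arguments, so kernel choice happens once at construction. A hypothetical call site, using make_unique the same way mace/ops/conv_2d.cc does later in this patch:

  // Hypothetical usage: pick the stride-2 7x7 kernel once, then reuse it.
  std::unique_ptr<Conv2dBase> conv =
      make_unique<Conv2dK7x7S2>(paddings, padding_type);
  MACE_RETURN_IF_ERROR(conv->Compute(context, input, filter, output));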
+ +#include +#include "mace/ops/arm/fp32/conv_general.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Conv2dGeneral::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + auto filter_shape = filter->shape(); + + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = filter_shape[1] * in_image_size; + const index_t out_batch_size = filter_shape[0] * out_image_size; + const index_t filter_size = filter_shape[2] * filter_shape[3]; + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t b = 0; b < in_shape[0]; b++) { + for (index_t m = 0; m < filter_shape[0]; m += 4) { + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t out_channels = filter_shape[0]; + const index_t in_channels = filter_shape[1]; + + const int stride_h = strides_[0]; + const int stride_w = strides_[1]; + const int dilation_h = dilations_[0]; + const int dilation_w = dilations_[1]; + if (m + 3 < out_channels) { + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = out_ptr0_base + out_image_size; + float *out_ptr2_base = out_ptr1_base + out_image_size; + float *out_ptr3_base = out_ptr2_base + out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = + filter_data + m * in_channels * filter_size + c * filter_size; + const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; + const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; + const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t ih = h * stride_h; + index_t iw = w * stride_w; + index_t in_offset = ih * in_width + iw; + // output (4 outch x 1 height x 4 width): vo_outch_height + float vo0[4], vo1[4], vo2[4], vo3[4]; + // load output + index_t out_offset = h * out_width + w; + for (index_t ow = 0; ow < 4; ++ow) { + vo0[ow] = out_ptr0_base[out_offset + ow]; + vo1[ow] = out_ptr1_base[out_offset + ow]; + vo2[ow] = out_ptr2_base[out_offset + ow]; + vo3[ow] = out_ptr3_base[out_offset + ow]; + } + // calc by row + for (index_t kh = 0; kh < filter_shape[2]; ++kh) { + for (index_t kw = 0; kw < filter_shape[3]; ++kw) { + // outch 0 + vo0[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr0[kw]; + vo0[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[2] += in_ptr_base[in_offset + 2 * 
stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr0[kw]; + // outch 1 + vo1[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr1[kw]; + vo1[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr1[kw]; + vo1[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr1[kw]; + vo1[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr1[kw]; + // outch 2 + vo2[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr2[kw]; + vo2[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr2[kw]; + vo2[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr2[kw]; + vo2[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr2[kw]; + // outch 3 + vo3[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr3[kw]; + vo3[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr3[kw]; + vo3[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr3[kw]; + vo3[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr3[kw]; + } // kw + + in_offset += dilation_h * in_width; + filter_ptr0 += filter_shape[3]; + filter_ptr1 += filter_shape[3]; + filter_ptr2 += filter_shape[3]; + filter_ptr3 += filter_shape[3]; + } // kh + + for (index_t ow = 0; ow < 4; ++ow) { + out_ptr0_base[out_offset + ow] = vo0[ow]; + out_ptr1_base[out_offset + ow] = vo1[ow]; + out_ptr2_base[out_offset + ow] = vo2[ow]; + out_ptr3_base[out_offset + ow] = vo3[ow]; + } + + filter_ptr0 -= filter_size; + filter_ptr1 -= filter_size; + filter_ptr2 -= filter_size; + filter_ptr3 -= filter_size; + } // w + } // h + } // c + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = + filter_data + mm * in_channels * filter_size + c * filter_size; + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t ih = h * stride_h; + index_t iw = w * stride_w; + index_t in_offset = ih * in_width + iw; + // output (1 outch x 1 height x 4 width): vo_outch_height + float vo0[4]; + // load output + index_t out_offset = h * out_width + w; + for (index_t ow = 0; ow < 4; ++ow) { + vo0[ow] = out_ptr0_base[out_offset + ow]; + } + + // calc by row + for (index_t kh = 0; kh < filter_shape[2]; ++kh) { + for (index_t kw = 0; kw < filter_shape[3]; ++kw) { + // outch 0 + vo0[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr0[kw]; + vo0[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr0[kw]; + } // kw + + in_offset += dilation_h * in_width; + filter_ptr0 += filter_shape[3]; + } // kh + + for (index_t ow = 0; ow < 4; ++ow) { + out_ptr0_base[out_offset + ow] = vo0[ow]; + } + filter_ptr0 -= filter_size; + } // w + } // h + } // c + } // mm + } // if + } // m + } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/conv_general.h b/mace/ops/arm/fp32/conv_general.h 
new file mode 100644
index 00000000..01d01954
--- /dev/null
+++ b/mace/ops/arm/fp32/conv_general.h
@@ -0,0 +1,50 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_CONV_GENERAL_H_
+#define MACE_OPS_ARM_FP32_CONV_GENERAL_H_
+
+#include <vector>
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/conv_2d.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Conv2dGeneral : public Conv2dBase {
+ public:
+  Conv2dGeneral(const std::vector<int> strides,
+                const std::vector<int> dilations,
+                const std::vector<int> paddings,
+                const Padding padding_type)
+      : Conv2dBase(strides, dilations, paddings, padding_type) {}
+  virtual ~Conv2dGeneral() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_CONV_GENERAL_H_
diff --git a/mace/ops/arm/fp32/gemm.h b/mace/ops/arm/fp32/gemm.h
index 36eb0378..ce226c1a 100644
--- a/mace/ops/arm/fp32/gemm.h
+++ b/mace/ops/arm/fp32/gemm.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/fp32/gemv.h b/mace/ops/arm/fp32/gemv.h
index 3210def1..1f406426 100644
--- a/mace/ops/arm/fp32/gemv.h
+++ b/mace/ops/arm/fp32/gemv.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/q8/eltwise.h b/mace/ops/arm/q8/eltwise.h
index 5223dc30..200b13cb 100644
--- a/mace/ops/arm/q8/eltwise.h
+++ b/mace/ops/arm/q8/eltwise.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/q8/gemv.h b/mace/ops/arm/q8/gemv.h
index 1734a956..21a27579 100644
--- a/mace/ops/arm/q8/gemv.h
+++ b/mace/ops/arm/q8/gemv.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
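For review purposes, the computation conv_general.cc above performs on the padded NCHW buffers (no bias, no groups) can be stated directly:

  // out[b][m][h][w] =
  //     sum over c, kh, kw of
  //       in[b][c][h * stride_h + kh * dilation_h]
  //               [w * stride_w + kw * dilation_w] * filter[m][c][kh][kw]
  // Padding has already been applied, so the loops carry no bounds
  // checks; m is blocked by 4 output channels and w by 4 output columns,
  // with a scalar remainder loop (mm) for the channel tail.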
diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index 65a723ad..a6421f45 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -27,7 +27,6 @@ #include "mace/core/operator.h" #include "mace/core/tensor.h" #include "mace/ops/activation.h" -#include "mace/ops/arm/conv_2d_neon.h" #include "mace/ops/conv_pool_2d_base.h" #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/utils/memory.h" @@ -36,11 +35,16 @@ #ifdef MACE_ENABLE_NEON #include "mace/ops/arm/fp32/conv_2d.h" #include "mace/ops/arm/fp32/conv_2d_1x1.h" +#include "mace/ops/arm/fp32/conv_2d_3x3.h" #include "mace/ops/arm/fp32/conv_2d_3x3_winograd.h" -#else -#include "mace/ops/ref/conv_2d.h" +#include "mace/ops/arm/fp32/conv_2d_5x5.h" +#include "mace/ops/arm/fp32/conv_2d_7x7.h" +#include "mace/ops/arm/fp32/conv_2d_1xn.h" +#include "mace/ops/arm/fp32/conv_general.h" #endif // MACE_ENABLE_NEON +#include "mace/ops/ref/conv_2d.h" + #ifdef MACE_ENABLE_QUANTIZE #include "mace/ops/common/gemmlowp_util.h" #include "mace/ops/quantization_util.h" @@ -68,8 +72,7 @@ class Conv2dOp : public ConvPool2dOpBase { "NOOP"))), relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)), leakyrelu_coefficient_(Operation::GetOptionalArg( - "leakyrelu_coefficient", 0.0f)), - conv2d_delegator_(nullptr) {} + "leakyrelu_coefficient", 0.0f)) {} MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(INPUT); @@ -77,342 +80,99 @@ class Conv2dOp : public ConvPool2dOpBase { const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr; Tensor *output = this->Output(OUTPUT); - index_t input_batch = input->dim(0); - index_t input_channels = input->dim(1); - std::vector filter_shape(4); - filter_shape = filter->shape(); - - index_t stride_h = strides_[0]; - index_t stride_w = strides_[1]; - - index_t dilation_h = dilations_[0]; - index_t dilation_w = dilations_[1]; - - std::vector output_shape(4); - std::vector paddings(2); - if (paddings_.empty()) { - CalcNCHWPaddingAndOutputSize(input->shape().data(), - filter_shape.data(), - dilations_.data(), - strides_.data(), - padding_type_, - output_shape.data(), - paddings.data()); - } else { - paddings = paddings_; - CalcNCHWOutputSize(input->shape().data(), - filter_shape.data(), - paddings_.data(), - dilations_.data(), - strides_.data(), - RoundType::FLOOR, - output_shape.data()); - } - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - - index_t batch = output->dim(0); - index_t channels = output->dim(1); - index_t height = output->dim(2); - index_t width = output->dim(3); - - MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); - MACE_CHECK(filter_shape[0] == channels, filter_shape[0], " != ", channels); - MACE_CHECK(filter_shape[1] == input_channels, filter_shape[1], " != ", - input_channels); + const index_t channels = filter->dim(0); #ifdef MACE_ENABLE_NEON - index_t input_height = input->dim(2); - index_t input_width = input->dim(3); - index_t filter_h = filter->dim(2); - index_t filter_w = filter->dim(3); - - int pad_top = paddings[0] >> 1; - int pad_bottom = paddings[0] - pad_top; - int pad_left = paddings[1] >> 1; - int pad_right = paddings[1] - pad_left; - - if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1 - && dilation_h == 1 && dilation_w == 1) { - if (conv2d_delegator_.get() == nullptr) { - conv2d_delegator_ = make_unique(); - } - conv2d_delegator_->Compute(context, input, filter, output); - } else if (filter_h == 3 && filter_w == 3 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1 - && 
input_channels >= 8 && channels >= 8) { - if (conv2d_delegator_.get() == nullptr) { - conv2d_delegator_ = make_unique( - pad_top, pad_bottom, pad_left, pad_right); - } - conv2d_delegator_->Compute(context, input, filter, output); - } else { - // TODO(liyin): the code below needs to be refactored. - // delegate to each of kernels instead of ruling them all - index_t padded_input_height = input_height + paddings[0]; - index_t padded_input_width = input_width + paddings[1]; - index_t extra_input_height = padded_input_height; - index_t extra_input_width = padded_input_width; - index_t extra_output_height = height; - index_t extra_output_width = width; - - int pad_top = paddings[0] >> 1; - int pad_bottom = paddings[0] - pad_top; - int pad_left = paddings[1] >> 1; - int pad_right = paddings[1] - pad_left; - - Tensor::MappingGuard input_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard output_guard(output); - - auto filter_data = filter->data(); - auto output_data = output->mutable_data(); - - std::function conv_func; - - bool use_neon_3x3_s1 = filter_h == 3 && filter_w == 3 + // the following params are used to decide which conv delegator to use + const index_t stride_h = strides_[0]; + const index_t stride_w = strides_[1]; + const index_t dilation_h = dilations_[0]; + const index_t dilation_w = dilations_[1]; + const index_t filter_h = filter->dim(2); + const index_t filter_w = filter->dim(3); + const index_t input_channels = input->dim(1); + + // NOTE: delegator is fixed after first round of running, + // although winograd depends on input params. + // We do not support changeable filter for now. + if (conv2d_delegator_.get() == nullptr) { + if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1 + && dilation_h == 1 && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_3x3_s2 = filter_h == 3 && filter_w == 3 + && dilation_w == 1) { + if (input_channels >= 8 && channels >= 8) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } + } else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_5x5_s1 = filter_h == 5 && filter_w == 5 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 5 && filter_w == 5 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_1x7_s1 = filter_h == 1 && filter_w == 7 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 7 && filter_w == 7 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_7x1_s1 = filter_h == 7 && filter_w == 1 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_7x7_s1 = filter_h == 7 && filter_w == 7 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_7x7_s2 = filter_h == 7 && filter_w == 7 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 7 && filter_w == 7 && stride_h == 2 && stride_w == 2 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_7x7_s3 = filter_h == 7 && filter_w == 7 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + 
paddings_, padding_type_); + } else if (filter_h == 7 && filter_w == 7 && stride_h == 3 && stride_w == 3 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_1x15_s1 = filter_h == 1 && filter_w == 15 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 1 && filter_w == 7 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_15x1_s1 = filter_h == 15 && filter_w == 1 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 7 && filter_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - - index_t tile_h, tile_w; - if (use_neon_3x3_s1) { - tile_h = 2; - tile_w = 4; - } else if (use_neon_7x1_s1 || use_neon_15x1_s1) { - tile_h = 4; - tile_w = 1; - } else { - tile_h = 1; - tile_w = 4; - } - extra_output_height = RoundUp(height, tile_h); - extra_input_height = - std::max(padded_input_height, (extra_output_height - 1) * stride_h - + (filter_h - 1) * dilation_h + 1); - extra_output_width = RoundUp(width, tile_w); - extra_input_width = - std::max(padded_input_width, (extra_output_width - 1) * stride_w - + (filter_w - 1) * dilation_w + 1); - if (extra_input_height != padded_input_height) { - pad_bottom += (extra_input_height - padded_input_height); - } - if (extra_input_width != padded_input_width) { - pad_right += (extra_input_width - padded_input_width); - } - - // decide scratch size before allocate it - index_t total_scratch_size = 0; - index_t padded_input_size = 0; - index_t padded_output_size = 0; - - if (extra_input_height != input_height - || extra_input_width != input_width) { - padded_input_size = - PadAlignSize( - batch * input_channels * (input_height + pad_top + pad_bottom) - * (input_width + pad_left + pad_right) * sizeof(float) + - MACE_EXTRA_BUFFER_PAD_SIZE); - total_scratch_size += padded_input_size; - } - if (extra_output_height != height || extra_output_width != width) { - padded_output_size = - PadAlignSize( - batch * channels * extra_output_height * extra_output_width - * sizeof(float) + MACE_EXTRA_BUFFER_PAD_SIZE); - total_scratch_size += padded_output_size; - } - - // Init scratch buffer - ScratchBuffer *scratch = context->device()->scratch_buffer(); - scratch->Rewind(); - scratch->GrowSize(total_scratch_size); - Tensor padded_input(scratch->Scratch(padded_input_size), DT_FLOAT); - Tensor padded_output(scratch->Scratch(padded_output_size), DT_FLOAT); - const index_t extra_input_shape[4] = - {batch, input_channels, extra_input_height, extra_input_width}; - const index_t extra_output_shape[4] = - {batch, channels, extra_output_height, extra_output_width}; - - // make host compiler happy - MACE_UNUSED(extra_input_shape); - MACE_UNUSED(extra_output_shape); - - // decide which convolution function to call - if (use_neon_3x3_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK3x3S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_3x3_s2) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK3x3S2(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_5x5_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK5x5S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_1x7_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - 
Conv2dNeonK1x7S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_7x1_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK7x1S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_7x7_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK7x7S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_7x7_s2) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK7x7S2(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_7x7_s3) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK7x7S3(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_1x15_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK1x15S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_15x1_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK15x1S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dGeneral(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - filter_shape.data(), - strides_.data(), - dilations_.data(), - pad_output); - }; - } - - // pad input and output - const Tensor *pad_input_ptr = input; - if (extra_input_height != input_height - || extra_input_width != input_width) { - MACE_RETURN_IF_ERROR(ConstructNCHWInputWithSpecificPadding( - input, pad_top, pad_bottom, pad_left, pad_right, &padded_input)); - pad_input_ptr = &padded_input; - } - - // TODO(libin): don't need clear after bias is integrated in each conv - Tensor *pad_output_ptr = output; - if (extra_output_height != height || extra_output_width != width) { - padded_output.Reshape({batch, channels, extra_output_height, - extra_output_width}); - padded_output.Clear(); - pad_output_ptr = &padded_output; + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 1 && filter_w == 15 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 15 && filter_w == 1 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); } else { - output->Clear(); - } - - const float *pad_input_data = pad_input_ptr->data(); - float *pad_output_data = pad_output_ptr->mutable_data(); - - conv_func(pad_input_data, pad_output_data); - - // unpack output - if (extra_output_height != height || extra_output_width != width) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - for (index_t h = 0; h < height; ++h) { - memcpy( - output_data + b * channels * height * width - + c * height * width - + h * width, - pad_output_data - + b * channels * extra_output_height * extra_output_width - + c * extra_output_height * extra_output_width - + h * extra_output_width, - sizeof(float) * width); - } - } - } + conv2d_delegator_ = make_unique( + strides_, + dilations_, + paddings_, + padding_type_); } } + + 
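// Dispatch recap, derived from the branches above (delegator class
// names inferred from the fp32 headers added in this patch; the choice
// is made once, on the first Run()):
//   1x1 s1                 -> Conv2dK1x1
//   3x3 s1, in/out C >= 8  -> Conv2dK3x3Winograd, else Conv2dK3x3S1
//   3x3 s2                 -> Conv2dK3x3S2
//   5x5 s1                 -> Conv2dK5x5S1
//   7x7 s1 / s2 / s3       -> Conv2dK7x7S1 / S2 / S3
//   1x7 / 7x1 s1           -> Conv2dK1x7S1 / Conv2dK7x1S1
//   1x15 / 15x1 s1         -> Conv2dK1x15S1 / Conv2dK15x1S1
//   anything else          -> Conv2dGeneral (any stride/dilation)
// All specialized paths require dilation 1.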
conv2d_delegator_->Compute(context, input, filter, output); #else - if (conv2d_delegator_.get() == nullptr) { - conv2d_delegator_ = make_unique>(paddings[0], - paddings[1], - stride_h, - stride_w, - dilation_h, - dilation_w); + if (ref_conv2d_delegator_.get() == nullptr) { + ref_conv2d_delegator_ = make_unique>(strides_, + dilations_, + paddings_, + padding_type_); } - conv2d_delegator_->Compute(context, input, filter, output); + ref_conv2d_delegator_->Compute(context, input, filter, output); #endif Tensor::MappingGuard bias_guard(bias); @@ -420,6 +180,9 @@ class Conv2dOp : public ConvPool2dOpBase { auto bias_data = bias == nullptr ? nullptr : bias->data(); auto output_data = output->mutable_data(); if (bias_data != nullptr) { + const index_t batch = input->dim(0); + const index_t height = output->dim(2); + const index_t width = output->dim(3); const index_t image_size = height * width; #pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { @@ -452,186 +215,13 @@ class Conv2dOp : public ConvPool2dOpBase { } private: - void Conv2dGeneral(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - const index_t *filter_shape, - const int *stride_hw, - const int *dilation_hw, - float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = filter_shape[1] * in_image_size; - const index_t out_batch_size = filter_shape[0] * out_image_size; - const index_t filter_size = filter_shape[2] * filter_shape[3]; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < in_shape[0]; b++) { - for (index_t m = 0; m < filter_shape[0]; m += 4) { - const index_t in_width = in_shape[3]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t out_channels = filter_shape[0]; - const index_t in_channels = filter_shape[1]; - - const int stride_h = stride_hw[0]; - const int stride_w = stride_hw[1]; - const int dilation_h = dilation_hw[0]; - const int dilation_w = dilation_hw[1]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = out_ptr0_base + out_image_size; - float *out_ptr2_base = out_ptr1_base + out_image_size; - float *out_ptr3_base = out_ptr2_base + out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = - filter + m * in_channels * filter_size + c * filter_size; - const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; - const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; - const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t ih = h * stride_h; - index_t iw = w * stride_w; - index_t in_offset = ih * in_width + iw; - // output (4 outch x 1 height x 4 width): vo_outch_height - float vo0[4], vo1[4], vo2[4], vo3[4]; - // load output - index_t out_offset = h * out_width + w; - for (index_t ow = 0; ow < 4; ++ow) { - vo0[ow] = out_ptr0_base[out_offset + ow]; - vo1[ow] = out_ptr1_base[out_offset + ow]; - vo2[ow] = out_ptr2_base[out_offset + ow]; - vo3[ow] = out_ptr3_base[out_offset + ow]; - } - // calc by row - for (index_t kh = 0; kh < filter_shape[2]; ++kh) { - for (index_t kw = 0; kw < filter_shape[3]; 
++kw) { - // outch 0 - vo0[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr0[kw]; - vo0[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - // outch 1 - vo1[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr1[kw]; - vo1[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr1[kw]; - vo1[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr1[kw]; - vo1[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr1[kw]; - // outch 2 - vo2[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr2[kw]; - vo2[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr2[kw]; - vo2[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr2[kw]; - vo2[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr2[kw]; - // outch 3 - vo3[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr3[kw]; - vo3[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr3[kw]; - vo3[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr3[kw]; - vo3[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr3[kw]; - } // kw - - in_offset += dilation_h * in_width; - filter_ptr0 += filter_shape[3]; - filter_ptr1 += filter_shape[3]; - filter_ptr2 += filter_shape[3]; - filter_ptr3 += filter_shape[3]; - } // kh - - for (index_t ow = 0; ow < 4; ++ow) { - out_ptr0_base[out_offset + ow] = vo0[ow]; - out_ptr1_base[out_offset + ow] = vo1[ow]; - out_ptr2_base[out_offset + ow] = vo2[ow]; - out_ptr3_base[out_offset + ow] = vo3[ow]; - } - - filter_ptr0 -= filter_size; - filter_ptr1 -= filter_size; - filter_ptr2 -= filter_size; - filter_ptr3 -= filter_size; - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { - float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = - filter + mm * in_channels * filter_size + c * filter_size; - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t ih = h * stride_h; - index_t iw = w * stride_w; - index_t in_offset = ih * in_width + iw; - // output (1 outch x 1 height x 4 width): vo_outch_height - float vo0[4]; - // load output - index_t out_offset = h * out_width + w; - for (index_t ow = 0; ow < 4; ++ow) { - vo0[ow] = out_ptr0_base[out_offset + ow]; - } - - // calc by row - for (index_t kh = 0; kh < filter_shape[2]; ++kh) { - for (index_t kw = 0; kw < filter_shape[3]; ++kw) { - // outch 0 - vo0[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr0[kw]; - vo0[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - } // kw - - in_offset += dilation_h * in_width; - filter_ptr0 += filter_shape[3]; - } // kh - - for (index_t ow = 0; ow < 4; ++ow) { - out_ptr0_base[out_offset + ow] = vo0[ow]; - } - filter_ptr0 -= filter_size; - } // w - } // h - } // c - } // mm - } // if - } // m - } // b - } const ActivationType 
activation_;
   const float relux_max_limit_;
   const float leakyrelu_coefficient_;
 
 #ifdef MACE_ENABLE_NEON
   std::unique_ptr<arm::fp32::Conv2dBase> conv2d_delegator_;
 #else
-  std::unique_ptr<ref::Conv2d<float>> conv2d_delegator_;
+  std::unique_ptr<ref::Conv2d<float>> ref_conv2d_delegator_;
 #endif  // MACE_ENABLE_NEON
 
  private:
diff --git a/mace/ops/ref/conv_2d.cc b/mace/ops/ref/conv_2d.cc
index 4707d922..e5b7952a 100644
--- a/mace/ops/ref/conv_2d.cc
+++ b/mace/ops/ref/conv_2d.cc
@@ -16,7 +16,6 @@
 #include "mace/ops/ref/conv_2d.h"
 
 #include <vector>
-#include "mace/ops/common/conv_pool_2d_util.h"
 
 namespace mace {
 namespace ops {
@@ -30,31 +29,36 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
 
   const std::vector<index_t> in_shape = input->shape();
   const std::vector<index_t> filter_shape = filter->shape();
-  const std::vector<index_t> out_shape = output->shape();
-  const std::vector<int> stride_hw{stride_h_, stride_w_};
-  const std::vector<int> dilation_hw{dilation_h_, dilation_w_};
-  const std::vector<int> paddings{pad_h_, pad_w_};
-  const index_t pad_top = pad_h_ >> 1;
-  const index_t pad_left = pad_w_ >> 1;
-
-  std::vector<index_t> output_shape(4);
-
-  CalcOutputSize(in_shape.data(),
-                 NCHW,
-                 filter_shape.data(),
-                 OIHW,
-                 paddings.data(),
-                 dilation_hw.data(),
-                 stride_hw.data(),
-                 RoundType::FLOOR,
-                 output_shape.data());
-  output->Resize(output_shape);
-
+  std::vector<index_t> out_shape(4);
+
+  std::vector<int> paddings(2);
+  if (paddings_.empty()) {
+    CalcNCHWPaddingAndOutputSize(input->shape().data(),
+                                 filter->shape().data(),
+                                 dilations_.data(),
+                                 strides_.data(),
+                                 padding_type_,
+                                 out_shape.data(),
+                                 paddings.data());
+  } else {
+    paddings = paddings_;
+    CalcNCHWOutputSize(input->shape().data(),
+                       filter->shape().data(),
+                       paddings_.data(),
+                       dilations_.data(),
+                       strides_.data(),
+                       RoundType::FLOOR,
+                       out_shape.data());
+  }
+  const index_t pad_top = paddings[0] >> 1;
+  const index_t pad_left = paddings[1] >> 1;
+  output->Resize(out_shape);
 
   const index_t in_image_size = in_shape[2] * in_shape[3];
   const index_t out_image_size = out_shape[2] * out_shape[3];
   const index_t in_batch_size = filter_shape[1] * in_image_size;
   const index_t out_batch_size = filter_shape[0] * out_image_size;
   const index_t filter_size = filter_shape[2] * filter_shape[3];
+
   Tensor::MappingGuard input_guard(input);
   Tensor::MappingGuard filter_guard(filter);
   Tensor::MappingGuard output_guard(output);
@@ -86,8 +90,10 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
 
           for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
             for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
-              const index_t ih = -pad_top + h * stride_h_ + kh * dilation_h_;
-              const index_t iw = -pad_left + w * stride_w_ + kw * dilation_w_;
+              const index_t
+                  ih = -pad_top + h * strides_[0] + kh * dilations_[0];
+              const index_t
+                  iw = -pad_left + w * strides_[1] + kw * dilations_[1];
               if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
                 sum += in_ptr_base[ih * in_width + iw] * filter_ptr[kw];
               }
diff --git a/mace/ops/ref/conv_2d.h b/mace/ops/ref/conv_2d.h
index e99af5cf..10baac8c 100644
--- a/mace/ops/ref/conv_2d.h
+++ b/mace/ops/ref/conv_2d.h
@@ -16,9 +16,12 @@
 #ifndef MACE_OPS_REF_CONV_2D_H_
 #define MACE_OPS_REF_CONV_2D_H_
 
+#include <vector>
+
 #include "mace/public/mace.h"
 #include "mace/core/tensor.h"
 #include "mace/core/op_context.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
 
 namespace mace {
 namespace ops {
@@ -27,30 +30,39 @@ namespace ref {
 
 template<typename OUTPUT_TYPE>
 class Conv2d {
  public:
-  Conv2d(int stride_h, int stride_w, int dilation_h, int dilation_w);
+  Conv2d(const std::vector<int> strides,
+         const std::vector<int> dilations,
+         const std::vector<int> paddings,
+         const Padding padding_type)
+      : strides_(strides),
+        dilations_(dilations),
+        paddings_(paddings),
+        padding_type_(padding_type) {}
   ~Conv2d() {}
 
   MaceStatus Compute(
       const OpContext *context,
       const Tensor *input,
       const Tensor *filter,
       Tensor *output);
+
+ private:
+  const std::vector<int> strides_;
+  const std::vector<int> dilations_;
+  const std::vector<int> paddings_;
+  const Padding padding_type_;
 };
 
 template<>
 class Conv2d<float> {
  public:
-  Conv2d(int pad_h,
-         int pad_w,
-         int stride_h,
-         int stride_w,
-         int dilation_h,
-         int dilation_w)
-      : pad_h_(pad_h),
-        pad_w_(pad_w),
-        stride_h_(stride_h),
-        stride_w_(stride_w),
-        dilation_h_(dilation_h),
-        dilation_w_(dilation_w) {}
+  Conv2d(const std::vector<int> strides,
+         const std::vector<int> dilations,
+         const std::vector<int> paddings,
+         const Padding padding_type)
+      : strides_(strides),
+        dilations_(dilations),
+        paddings_(paddings),
+        padding_type_(padding_type) {}
   ~Conv2d() {}
 
   // Always row-major after transpose
   MaceStatus Compute(
@@ -60,12 +72,10 @@ class Conv2d<float> {
       Tensor *output);
 
  private:
-  int pad_h_;
-  int pad_w_;
-  int stride_h_;
-  int stride_w_;
-  int dilation_h_;
-  int dilation_w_;
+  const std::vector<int> strides_;
+  const std::vector<int> dilations_;
+  const std::vector<int> paddings_;
+  const Padding padding_type_;
 };
 
 }  // namespace ref
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index 256a397c..4cee2aab 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -1954,7 +1954,8 @@ class Transformer(base_converter.ConverterInterface):
         else:
             print("Quantize op %s (%s)" % (op.name, op.type))
 
-        non_zero = self._option.device == DeviceType.CPU.value
+        non_zero = self._option.device == DeviceType.CPU.value \
+            and op.type == MaceOp.MatMul.name
 
         for idx, input_tensor in enumerate(op.input):
             quantized_inputs_names.append(input_tensor)
-- 
GitLab
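A closing note on the padding arithmetic this patch moves into the delegators: both CalcNCHWOutputSize call sites reduce to floor((in + pad_total - dilated_kernel_extent) / stride) + 1 per spatial dimension. A hypothetical standalone helper (name and signature ours, not MACE's) that mirrors the rule:

  #include <cstdint>

  // Hypothetical mirror of the output-size rule used above; the real
  // implementation lives in mace/ops/common/conv_pool_2d_util.h.
  int64_t ConvOutSize(int64_t in, int64_t kernel,
                      int64_t stride, int64_t dilation, int64_t pad_total) {
    const int64_t k_eff = (kernel - 1) * dilation + 1;  // dilated extent
    return (in + pad_total - k_eff) / stride + 1;       // FLOOR rounding
  }

  // e.g. in = 224, kernel = 7, stride = 2, dilation = 1, pad_total = 6
  //      gives (224 + 6 - 7) / 2 + 1 = 112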