From afa836d9a98ba1a08525d9468ddde11671df6e31 Mon Sep 17 00:00:00 2001
From: hjchen2
Date: Sun, 9 Dec 2018 19:18:01 +0800
Subject: [PATCH] Refactor pooling implementation

---
 src/common/types.h                            |   5 +
 .../kernel/central-arm-func/pool_arm_func.h   | 102 +-
 src/operators/math/pool_2x2.cpp               | 304 ------
 src/operators/math/pool_2x2.h                 |  37 -
 src/operators/math/pool_3x3.cpp               | 904 ------------------
 src/operators/math/pool_3x3.h                 |  50 -
 src/operators/math/pool_3x3_int8.cpp          | 564 -----------
 src/operators/math/pooling.cpp                | 119 +--
 src/operators/math/pooling.h                  | 174 +++-
 src/operators/math/pooling3x3.cpp             | 819 ++++++++++++++++
 test/operators/test_pool_op.cpp               | 176 +---
 11 files changed, 1043 insertions(+), 2211 deletions(-)
 delete mode 100644 src/operators/math/pool_2x2.cpp
 delete mode 100644 src/operators/math/pool_2x2.h
 delete mode 100644 src/operators/math/pool_3x3.cpp
 delete mode 100644 src/operators/math/pool_3x3.h
 delete mode 100644 src/operators/math/pool_3x3_int8.cpp
 create mode 100644 src/operators/math/pooling3x3.cpp

diff --git a/src/common/types.h b/src/common/types.h
index 6813f0ce74..c607efb9a2 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -102,6 +102,11 @@ enum ActivationType {
   Sigmoid = 6,
 };
 
+enum PoolingType {
+  Max = 0,
+  Avg = 1,
+};
+
 extern const char *G_OP_TYPE_CONV;
 extern const char *G_OP_TYPE_BATCHNORM;
 extern const char *G_OP_TYPE_BOX_CODER;
diff --git a/src/operators/kernel/central-arm-func/pool_arm_func.h b/src/operators/kernel/central-arm-func/pool_arm_func.h
index 15a2ebc65e..529798dd80 100644
--- a/src/operators/kernel/central-arm-func/pool_arm_func.h
+++ b/src/operators/kernel/central-arm-func/pool_arm_func.h
@@ -17,103 +17,53 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+#include "common/types.h"
 #include "operators/math/pooling.h"
 
 namespace paddle_mobile {
 namespace operators {
-using framework::Tensor;
-
-template <typename T, typename S>
-void PoolBasic(std::string pooling_type, std::vector<int> ksize,
-               std::vector<int> strides, std::vector<int> paddings,
-               const Tensor *in_x, Tensor *out) {
-  if (pooling_type == "max") {
-    math::PoolFunctor<math::MaxPool<T>, T> pool2d_forward;
-    math::MaxPool<T> pool_process;
-    pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
-
-  } else if (pooling_type == "avg") {
-    math::PoolFunctor<math::AvgPool<T, S>, T> pool2d_forward;
-    math::AvgPool<T, S> pool_process;
-    pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
-  }
-}
 
 template <typename P>
 void PoolCompute(const PoolParam<CPU> &param) {
-  const Tensor *in_x = param.Input();
-  Tensor *out = param.Output();
-  std::string pooling_type = param.PoolingType();
-
+  const framework::Tensor *input = param.Input();
+  framework::Tensor *output = param.Output();
+  const std::string &pooling_type = param.PoolingType();
   std::vector<int> ksize = param.Ksize();
-
   std::vector<int> strides = param.Strides();
-
   std::vector<int> paddings = param.Paddings();
-  if (ksize.size() != 2) {
-    LOG(paddle_mobile::LogLevel::kLOG_ERROR)
-        << "Pool op only supports 2D and 3D input.";
-  }
   if (param.isGlobalPooling()) {
     for (size_t i = 0; i < ksize.size(); ++i) {
       paddings[i] = 0;
-      ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
+      ksize[i] = static_cast<int>(input->dims()[i + 2]);
     }
   }
-  if (in_x->type() == typeid(int8_t)) {
-    if (pooling_type == "max" && ksize[0] == 3 && ksize[0] == ksize[1]) {
-      if (strides[0] == strides[1] && strides[0] == 1) {
-        math::Pool3x3Maxs1_int8(in_x, out, paddings[0], paddings[1]);
-      } else if (strides[0] == strides[1] && strides[0] == 2) {
-        math::Pool3x3Maxs2_int8(in_x, out, paddings[0], paddings[1]);
+  if (ksize[0] == 3 && ksize[0] == ksize[1]) {
+    if (pooling_type == "max" && strides[0] == strides[1]) {
+      if (strides[0] == 1) {
+        math::Pooling3x3<Max, 1>()(*input, paddings, output);
+      } else if (strides[0] == 2) {
+        math::Pooling3x3<Max, 2>()(*input, paddings, output);
       } else {
-        math::Pool3x3Max_int8(strides, paddings, in_x, out);
+        math::Pooling<Max>()(*input, ksize, strides, paddings, output);
+      }
+    } else if (pooling_type == "avg" && strides[0] == strides[1]) {
+      if (strides[0] == 1) {
+        math::Pooling3x3<Avg, 1>()(*input, paddings, output);
+      } else if (strides[0] == 2) {
+        math::Pooling3x3<Avg, 2>()(*input, paddings, output);
+      } else {
+        math::Pooling<Avg>()(*input, ksize, strides, paddings, output);
+      }
     } else {
-      PoolBasic<int8_t, int32_t>(pooling_type, ksize, strides, paddings, in_x,
-                                 out);
+      // Others
     }
   } else {
-    if (ksize[0] == 3 && ksize[0] == ksize[1]) {
-      if (pooling_type == "max") {
-        if (strides[0] == strides[1] && strides[0] == 1 &&
-            paddings[0] == paddings[1] && paddings[1] == 1) {
-          math::Pool3x3Maxs1p1(in_x, out);
-        } else {
-          math::Pool3x3Max(strides, paddings, in_x, out);
-        }
-      } else if (pooling_type == "avg") {
-        if (strides[0] == strides[1] && strides[0] == 1 &&
-            paddings[0] == paddings[1] && paddings[1] == 1) {
-          math::Pool3x3Avgs1p1(in_x, out);
-        } else {
-          math::Pool3x3Avg(strides, paddings, in_x, out);
-        }
-      }
-
-    } else if (ksize[0] == 2 && ksize[0] == ksize[1] && strides[0] == 2 &&
-               strides[0] == strides[1] && paddings[0] == paddings[1] &&
-               paddings[1] == 0) {
-#if __ARM_NEON
-#if __aarch64__
-      PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
-                              out);
-#else
-      /// todo: fix bug in Pool2x2
-      if (pooling_type == "max") {
-        math::Pool2x2Maxs2p0(strides, paddings, in_x, out);
-      } else if (pooling_type == "avg") {
-        math::Pool2x2Avgs2p0(strides, paddings, in_x, out);
-      }
-#endif
-#else
-      PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
-                              out);
-#endif  // __ARM_NEON
-
+    if (pooling_type == "max") {
+      math::Pooling<Max>()(*input, ksize, strides, paddings, output);
+    } else if (pooling_type == "avg") {
+      math::Pooling<Avg>()(*input, ksize, strides, paddings, output);
     } else {
-      PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
-                              out);
+      // Others
     }
   }
 }
diff --git a/src/operators/math/pool_2x2.cpp b/src/operators/math/pool_2x2.cpp
deleted file mode 100644
index 88bf866b73..0000000000
--- a/src/operators/math/pool_2x2.cpp
+++ /dev/null
@@ -1,304 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#ifdef POOL_OP -#include "operators/math/pool_2x2.h" -#include -#include - -namespace paddle_mobile { -namespace operators { -namespace math { -#define FLT_MAX __FLT_MAX__ - -void Pool2x2Maxs2p0(vector strides, vector paddings, - const Tensor *input, Tensor *output) { - const int batch_size = input->dims()[0]; - const int input_height = input->dims()[2]; - const int input_width = input->dims()[3]; - - const int output_channels = output->dims()[1]; - int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; - const int ksize_height = 2; - const int ksize_width = 2; - const int stride_height = strides[0]; - const int stride_width = strides[1]; - const int padding_height = paddings[0]; - const int padding_width = paddings[1]; - - const int input_channel_stride = input_height * input_width; - const int output_channel_stride = output_height * output_width; - - const int input_batch_stride = output_channels * input_channel_stride; - const int output_batch_stride = output_channels * output_channel_stride; - - const float *input_data = input->data(); - float *output_data = output->mutable_data(); - - int w1 = input_width / 16; - int _w1 = input_width % 16; - int w2 = _w1 / 4; - int _w2 = _w1 % 4; - - for (int i = 0; i < batch_size; ++i) { - for (int c = 0; c < output_channels; ++c) { - for (int ph = 0; ph < input_height; ph += 2) { - const float *in_ptr1 = input_data + i * input_batch_stride + - c * input_channel_stride + ph * input_width; - const float *in_ptr2 = in_ptr1 + input_width; - if (ph != input_height && ph + 1 >= input_height) { - in_ptr2 = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * input_width)); - memset(static_cast(const_cast(in_ptr2)), -FLT_MAX, - sizeof(float) * input_width); - } - float *out_ptr = output_data + i * output_batch_stride + - c * output_channel_stride + ph / 2 * output_width; -#if __ARM_NEON -#if __aarch64__ -#else - asm volatile( - "subs %[w1], %[w1], #1 \n\t" - "blt end_w1_%= \n\t" - "loop_w1_%=: \n\t" - - "pld [%[in_ptr1], #64] \n\t" - "pld [%[in_ptr2], #64] \n\t" - - "vld1.f32 {q0, q1}, [%[in_ptr1]]! \n\t" - "vld1.f32 {q2, q3}, [%[in_ptr2]]! \n\t" - "vld1.f32 {q6, q7}, [%[in_ptr1]]! \n\t" - "vld1.f32 {q8, q9}, [%[in_ptr2]]! \n\t" - - "vmax.f32 q0, q0, q2 \n\t" - "vmax.f32 q1, q1, q3 \n\t" - - "vmax.f32 q6, q6, q8 \n\t" - "vmax.f32 q7, q7, q9 \n\t" - - "vpmax.f32 d8, d0, d1 \n\t" - "vpmax.f32 d9, d2, d3 \n\t" - - "vpmax.f32 d10, d12, d13 \n\t" - "vpmax.f32 d11, d14, d15 \n\t" - - "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" - - "subs %[w1], %[w1], #1 \n\t" - "bge loop_w1_%= \n\t" - "end_w1_%=: \n\t" - - "subs %[w2], %[w2], #1 \n\t" - "blt end_w2_%= \n\t" - "loop_w2_%=: \n\t" - - "vld1.f32 {q0}, [%[in_ptr1]]! \n\t" - "vld1.f32 {q1}, [%[in_ptr2]]! \n\t" - "vmax.f32 q0, q0, q1 \n\t" - "vpmax.f32 d4, d0, d1 \n\t" - "vst1.32 {d4}, [%[out_ptr]]! \n\t" - - "subs %[w2], %[w2], #1 \n\t" - "bge loop_w2_%= \n\t" - "end_w2_%=: \n\t" - : - : [w1] "r"(w1), [w2] "r"(w2), [in_ptr1] "r"(in_ptr1), - [in_ptr2] "r"(in_ptr2), [out_ptr] "r"(out_ptr) - : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", - "q9"); -#endif -#endif - - if (_w2 != 0) { - in_ptr1 = input_data + i * input_batch_stride + - c * input_channel_stride + ph * input_width + 16 * w1 + - 4 * w2; - in_ptr2 = in_ptr1 + input_width; - out_ptr = output_data + i * output_batch_stride + - c * output_channel_stride + ph / 2 * output_width + 8 * w1 + - 2 * w2; - if (_w2 == 1) { - *out_ptr = (*in_ptr1 > *in_ptr2) ? 
*in_ptr1 : *in_ptr2; - } else if (_w2 == 2) { - float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2; - in_ptr1++; - in_ptr2++; - float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2; - *out_ptr = (temp > temp1) ? temp : temp1; - } else if (_w2 == 3) { - float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2; - in_ptr1++; - in_ptr2++; - float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2; - in_ptr1++; - in_ptr2++; - *out_ptr = (temp > temp1) ? temp : temp1; - out_ptr++; - *out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2; - } - } - } - } - } -} - -void Pool2x2Avgs2p0(vector strides, vector paddings, - const Tensor *input, Tensor *output) { - const int batch_size = input->dims()[0]; - const int input_height = input->dims()[2]; - const int input_width = input->dims()[3]; - - const int output_channels = output->dims()[1]; - int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; - const int ksize_height = 2; - const int ksize_width = 2; - const int stride_height = strides[0]; - const int stride_width = strides[1]; - const int padding_height = paddings[0]; - const int padding_width = paddings[1]; - - const int input_channel_stride = input_height * input_width; - const int output_channel_stride = output_height * output_width; - - const int input_batch_stride = output_channels * input_channel_stride; - const int output_batch_stride = output_channels * output_channel_stride; - - const float *input_data = input->data(); - float *output_data = output->mutable_data(); - - int w1 = input_width / 16; - int _w1 = input_width % 16; - int w2 = _w1 / 4; - int _w2 = _w1 % 4; - - float quarter = 0.25; - for (int i = 0; i < batch_size; ++i) { - for (int c = 0; c < output_channels; ++c) { - for (int ph = 0; ph < input_height; ph += 2) { - const float *in_ptr1 = input_data + i * input_batch_stride + - c * input_channel_stride + ph * input_width; - const float *in_ptr2 = in_ptr1 + input_width; - if (ph + 1 >= input_height) { - in_ptr2 = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * input_width)); - memset(static_cast(const_cast(in_ptr2)), 0, - sizeof(float) * input_width); - } - float *out_ptr = output_data + i * output_batch_stride + - c * output_channel_stride + ph / 2 * output_width; -#if __ARM_NEON -#if __aarch64__ -#else - asm volatile( - "subs %[w1], %[w1], #1 \n\t" - "blt end_w1_%= \n\t" - "loop_w1_%=: \n\t" - - "pld [%[in_ptr1], #64] \n\t" - "pld [%[in_ptr2], #64] \n\t" - - "vmov.f32 d0[0], %[quarter] \n\t" - "vld1.f32 {q1, q2}, [%[in_ptr1]]! \n\t" - "vld1.f32 {q3, q4}, [%[in_ptr2]]! \n\t" - "vld1.f32 {q7, q8}, [%[in_ptr1]]! \n\t" - "vld1.f32 {q9, q10}, [%[in_ptr2]]! \n\t" - - "vadd.f32 q1, q1, q3 \n\t" - "vadd.f32 q2, q2, q4 \n\t" - - "vadd.f32 q7, q7, q9 \n\t" - "vadd.f32 q8, q8, q10 \n\t" - - "vpadd.f32 d10, d2, d3 \n\t" - "vpadd.f32 d11, d4, d5 \n\t" - - "vpadd.f32 d12, d14, d15 \n\t" - "vpadd.f32 d13, d16, d17 \n\t" - - "vmul.f32 q5, q5, d0[0] \n\t" - "vmul.f32 q6, q6, d0[0] \n\t" - - "vst1.32 {q5, q6}, [%[out_ptr]]! \n\t" - - "subs %[w1], %[w1], #1 \n\t" - "bge loop_w1_%= \n\t" - "end_w1_%=: \n\t" - - "subs %[w2], %[w2], #1 \n\t" - "blt end_w2_%= \n\t" - "loop_w2_%=: \n\t" - - "vld1.f32 {q1}, [%[in_ptr1]]! \n\t" - "vld1.f32 {q2}, [%[in_ptr2]]! \n\t" - "vadd.f32 q1, q1, q2 \n\t" - "vpadd.f32 d4, d2, d3 \n\t" - "vmul.f32 d4, d4, d0[0] \n\t" - "vst1.32 {d4}, [%[out_ptr]]! 
\n\t" - - "subs %[w2], %[w2], #1 \n\t" - "bge loop_w2_%= \n\t" - "end_w2_%=: \n\t" - : - : [w1] "r"(w1), [w2] "r"(w2), [in_ptr1] "r"(in_ptr1), - [in_ptr2] "r"(in_ptr2), [out_ptr] "r"(out_ptr), - [quarter] "r"(quarter) - : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", - "q9", "q10"); -#endif -#endif - - if (_w2 != 0) { - in_ptr1 = input_data + i * input_batch_stride + - c * input_channel_stride + ph * input_width + 16 * w1 + - 4 * w2; - in_ptr2 = in_ptr1 + input_width; - out_ptr = output_data + i * output_batch_stride + - c * output_channel_stride + ph / 2 * output_width + 8 * w1 + - 2 * w2; - if (_w2 == 1) { - *out_ptr = 0.5 * (*in_ptr1 + *in_ptr2); - } else if (_w2 == 2) { - float temp = 0; - temp += *in_ptr1; - temp += *in_ptr2; - in_ptr1++; - in_ptr2++; - temp += *in_ptr1; - temp += *in_ptr2; - *out_ptr = 0.25 * temp; - } else if (_w2 == 3) { - float temp = 0; - temp += *in_ptr1++; - temp += *in_ptr2++; - temp += *in_ptr1++; - temp += *in_ptr2++; - *out_ptr = 0.25 * temp; - out_ptr++; - *out_ptr = 0.5 * (*in_ptr1 + *in_ptr2); - } - } - } - } - } -} - -//} -} // namespace math - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/math/pool_2x2.h b/src/operators/math/pool_2x2.h deleted file mode 100644 index bd5e484826..0000000000 --- a/src/operators/math/pool_2x2.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef POOL_OP - -#pragma once - -#include "framework/tensor.h" -#ifdef __ARM_NEON -#include -#endif // __ARM_NEON -namespace paddle_mobile { -namespace operators { -namespace math { -using framework::Tensor; -using std::vector; - -void Pool2x2Maxs2p0(vector strides, vector paddings, - const Tensor *input, Tensor *output); - -void Pool2x2Avgs2p0(vector strides, vector paddings, - const Tensor *in_x, Tensor *out); -} // namespace math -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/src/operators/math/pool_3x3.cpp b/src/operators/math/pool_3x3.cpp deleted file mode 100644 index a2b84a5b14..0000000000 --- a/src/operators/math/pool_3x3.cpp +++ /dev/null @@ -1,904 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef POOL_OP -#ifdef _OPENMP -#include -#endif -#include "framework/tensor.h" -#include "operators/math/pool_3x3.h" -#if __ARM_NEON -#include -#endif // __ARM_NEON -#include -namespace paddle_mobile { -namespace operators { -namespace math { -using framework::Tensor; -using std::max; -using std::min; -using std::vector; -void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) { -#if __ARM_NEON - const int batch_size = static_cast(input->dims()[0]); - const int input_channel = static_cast(input->dims()[1]); - - const int input_height = static_cast(input->dims()[2]); - const int input_width = static_cast(input->dims()[3]); - const int output_height = static_cast(output->dims()[2]); - const int output_width = static_cast(output->dims()[3]); - output->mutable_data(); - - const int hxw = input_height * input_width; - - const int l = input_height; - - const float coef = 1.0 / 9.0; - const float coef1 = 1.0 / 6.0; - const float coef2 = 1.0 / 4.0; - - float32x4_t v_coef = vdupq_n_f32(coef); - float32x4_t v_coef1 = vdupq_n_f32(coef1); - - for (int b = 0; b < batch_size; b++) { -#pragma omp parallel for - for (int c = 0; c < input_channel; c++) { - const float *input_data = input->data() + c * hxw; - float *output_data = output->data() + c * hxw; - - for (int i = 1; i < output_height - 1; i++) { - float *output_ptr; - float32x4_t in0, in1, in2, in3, in4, in5, tmp0, tmp1, tmp2, tmp3, tmp4, - tmp5, out0; - for (int m = 1; m < output_width - 4; m += 4) { - output_ptr = output_data + i * output_width + m; - in0 = vld1q_f32(input_data + (i - 1) * input_width + m - 1); - in1 = vld1q_f32(input_data + (i - 1) * input_width + m + 3); - in2 = vld1q_f32(input_data + i * input_width + m - 1); - in3 = vld1q_f32(input_data + i * input_width + m + 3); - in4 = vld1q_f32(input_data + (i + 1) * input_width + m - 1); - in5 = vld1q_f32(input_data + (i + 1) * input_width + m + 3); - - tmp0 = vextq_f32(in0, in1, 1); - tmp1 = vextq_f32(in0, in1, 2); - tmp2 = vextq_f32(in2, in3, 1); - tmp3 = vextq_f32(in2, in3, 2); - tmp4 = vextq_f32(in4, in5, 1); - tmp5 = vextq_f32(in4, in5, 2); - - out0 = in0; - out0 = vaddq_f32(out0, tmp0); - out0 = vaddq_f32(out0, tmp1); - out0 = vaddq_f32(out0, in2); - out0 = vaddq_f32(out0, tmp2); - out0 = vaddq_f32(out0, tmp3); - out0 = vaddq_f32(out0, in4); - out0 = vaddq_f32(out0, tmp4); - out0 = vaddq_f32(out0, tmp5); - - vst1q_f32(output_ptr, vmulq_f32(out0, v_coef)); - } - int m; - for (m = 1; (m + 3) < output_width - 1; m = m + 4) { - } - - for (int j = m; j < output_width - 1; j++) { - output_data[i * output_width + j] = - input_data[(i - 1) * input_width + j - 1] + - input_data[(i - 1) * input_width + j] + - input_data[(i - 1) * input_width + j + 1] + - input_data[(i)*input_width + j - 1] + - input_data[(i)*input_width + j] + - input_data[(i)*input_width + j + 1] + - input_data[(i + 1) * input_width + j - 1] + - input_data[(i + 1) * input_width + j] + - input_data[(i + 1) * input_width + j + 1]; - output_data[i * output_width + j] = - output_data[i * output_width + j] * coef; - } - } - - output_data[0] = - input_data[0] + input_data[1] + input_data[l] + input_data[l + 1]; - output_data[l - 1] = input_data[l - 2] + input_data[l - 1] + - input_data[2 * l - 2] + input_data[2 * l - 1]; - output_data[(l - 1) * l] = - input_data[(l - 2) * l] + input_data[(l - 2) * l + 1] + - input_data[(l - 1) * l] + input_data[(l - 1) * l + 1]; - output_data[l * l - 1] = input_data[(l - 2) * (l + 1)] + - input_data[(l - 2) * (l + 1) + 1] + - input_data[l * l - 2] + input_data[l * l - 1]; - output_data[0] 
= output_data[0] * coef2; - output_data[l - 1] = output_data[l - 1] * coef2; - output_data[(l - 1) * l] = output_data[(l - 1) * l] * coef2; - output_data[l * l - 1] = output_data[l * l - 1] * coef2; - - for (int i = 1; i < l - 1; ++i) { - output_data[i * l] = input_data[i * l - l] + input_data[i * l - l + 1] + - input_data[i * l] + input_data[i * l + 1] + - input_data[i * l + l] + input_data[i * l + l + 1]; - - output_data[i * l + l - 1] = - input_data[i * l + l - 1 - l - 1] + input_data[i * l + l - 1 - l] + - input_data[i * l + l - 1 - 1] + input_data[i * l + l - 1] + - input_data[i * l + l - 1 + l - 1] + input_data[i * l + l - 1 + l]; - output_data[i * l] = output_data[i * l] * coef1; - output_data[i * l + l - 1] = output_data[i * l + l - 1] * coef1; - } - - int m; - for (m = 1; m < output_width - 4; m += 4) { - float *output_ptr = output_data + m; - float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0; - in0 = vld1q_f32(input_data + m - 1); - in1 = vld1q_f32(input_data + m + 3); - in2 = vld1q_f32(input_data + input_width + m - 1); - in3 = vld1q_f32(input_data + input_width + m + 3); - tmp0 = vextq_f32(in0, in1, 1); - tmp1 = vextq_f32(in0, in1, 2); - tmp2 = vextq_f32(in2, in3, 1); - tmp3 = vextq_f32(in2, in3, 2); - out0 = in0; - out0 = vaddq_f32(out0, tmp0); - out0 = vaddq_f32(out0, tmp1); - out0 = vaddq_f32(out0, in2); - out0 = vaddq_f32(out0, tmp2); - out0 = vaddq_f32(out0, tmp3); - - vst1q_f32(output_ptr, vmulq_f32(out0, v_coef1)); - } - - for (m = 1; (m + 3) < output_width - 1; m += 4) { - } - for (int j = m; j < output_width - 1; j++) { - output_data[j] = input_data[j - 1] + input_data[j] + input_data[j + 1] + - input_data[input_width + j - 1] + - input_data[input_width + j] + - input_data[input_width + j + 1]; - output_data[j] = output_data[j] * coef1; - } - - for (m = 1; m < output_width - 4; m += 4) { - float *output_ptr = - output_data + (output_height - 1) * output_width + m; - - float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0; - in0 = vld1q_f32(input_data + (output_height - 2) * input_width + m - 1); - in1 = vld1q_f32(input_data + (output_height - 2) * input_width + m + 3); - in2 = vld1q_f32(input_data + (output_height - 1) * input_width + m - 1); - in3 = vld1q_f32(input_data + (output_height - 1) * input_width + m + 3); - tmp0 = vextq_f32(in0, in1, 1); - tmp1 = vextq_f32(in0, in1, 2); - tmp2 = vextq_f32(in2, in3, 1); - tmp3 = vextq_f32(in2, in3, 2); - out0 = in0; - out0 = vaddq_f32(out0, tmp0); - out0 = vaddq_f32(out0, tmp1); - out0 = vaddq_f32(out0, in2); - out0 = vaddq_f32(out0, tmp2); - out0 = vaddq_f32(out0, tmp3); - - vst1q_f32(output_ptr, vmulq_f32(out0, v_coef1)); - } - for (m = 1; (m + 3) < output_width - 1; m = m + 4) { - } - for (int j = m; j < output_width - 1; j++) { - output_data[(output_height - 1) * input_width + j] = - input_data[(output_height - 2) * input_width + j - 1] + - input_data[(output_height - 2) * input_width + j] + - input_data[(output_height - 2) * input_width + j + 1] + - input_data[(output_height - 1) * input_width + j - 1] + - input_data[(output_height - 1) * input_width + j] + - input_data[(output_height - 1) * input_width + j + 1]; - output_data[(output_height - 1) * output_width + j] = - output_data[(output_height - 1) * output_width + j] * coef1; - } - } - } - -// const int batch_size = input->dims()[0]; -// -// const int h_in = input->dims()[2]; -// -// const int w_in = input->dims()[3]; -// -// const int output_channels = output->dims()[1]; -// -// const int h_out = output->dims()[2]; -// const int w_out = 
output->dims()[3]; -// const int outputdata_channel_stride = h_out * w_out; -// const int inputdata_channel_stride = h_in * w_in; -// const int input_batch_stride = output_channels * inputdata_channel_stride; -// const int output_batch_stride = output_channels * -// outputdata_channel_stride; float *out_data = output->data(); const -// float *input_data = input->data(); -// -// const float coef = 1.0 / 9.0; -// for (int k = 0; k < batch_size; ++k) { -// #pragma omp parallel for -// for (int c = 0; c < output_channels; ++c) { -// const float *input_seg = input_data + c * inputdata_channel_stride; -// float *output_seg = out_data + c * outputdata_channel_stride; -// // four corner point -// output_seg[0] = (input_seg[0] + input_seg[1] + input_seg[w_in] + -// input_seg[w_in + 1]) * -// coef; -// output_seg[w_out - 1] = -// (input_seg[w_in - 2] + input_seg[w_in - 1] + input_seg[w_in * 2 - -// 2] + -// input_seg[2 * w_in - 1]) * -// coef; -// output_seg[(h_out - 1) * w_out] = -// (input_seg[(h_in - 2) * w_in] + input_seg[(h_in - 2) * w_in + 1] + -// input_seg[(h_in - 1) * w_in] + input_seg[(h_in - 1) * w_in + 1]) -// * -// coef; -// output_seg[h_out * w_out - 1] = -// (input_seg[h_in * w_in - 1] + input_seg[h_in * w_in - 2] + -// input_seg[(h_in - 1) * w_in - 1] + -// input_seg[(h_in - 1) * w_in - 2]) * -// coef; -// // left side & right side -// for (int i = 1; i < h_in - 1; ++i) { -// output_seg[i * w_out] = -// (input_seg[i * w_in - w_in] + input_seg[i * w_in - w_in + 1] + -// input_seg[i * w_in] + input_seg[i * w_in + 1] + -// input_seg[i * w_in + w_in] + input_seg[i * w_in + w_in + 1]) * -// coef; -// output_seg[i * w_out + w_out - 1] = -// (input_seg[i * w_in - w_in + w_in - 2] + -// input_seg[i * w_in - w_in + 1 + w_in - 2] + -// input_seg[i * w_in + w_in - 2] + -// input_seg[i * w_in + 1 + w_in - 2] + -// input_seg[i * w_in + w_in + w_in - 2] + -// input_seg[i * w_in + w_in + 1 + w_in - 2]) * -// coef; -// } -// // top 1 row & bottom 1 row -// const float *input_tmp = input_seg; -// -// float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, -// tmp3, tmp4, tmp5, sum, out0; -// float32x4_t v_coef = vdupq_n_f32(coef); -// in0 = vld1q_f32(input_tmp); -// in2 = vld1q_f32(input_tmp + w_in); -// const float *input_tmp_end = input_tmp + (h_in - 2) * w_in; -// in4 = vld1q_f32(input_tmp_end); -// in6 = vld1q_f32(input_tmp_end + w_in); -// int c_mid = w_out - 2; -// auto output_ptr = output_seg + 1; -// for (; c_mid > 3; c_mid -= 4) { -// in1 = vld1q_f32(input_tmp + 4); -// in3 = vld1q_f32(input_tmp + w_in + 4); -// -// tmp0 = vextq_f32(in0, in1, 1); -// tmp1 = vextq_f32(in0, in1, 2); -// -// tmp2 = vextq_f32(in2, in3, 1); -// tmp3 = vextq_f32(in2, in3, 2); -// -// sum = vaddq_f32(in0, tmp0); -// sum = vaddq_f32(sum, tmp1); -// sum = vaddq_f32(sum, in2); -// sum = vaddq_f32(sum, tmp2); -// sum = vaddq_f32(sum, tmp3); -// -// vst1q_f32(output_ptr, vmulq_f32(sum, v_coef)); -// -// in5 = vld1q_f32(input_tmp_end + 4); -// in7 = vld1q_f32(input_tmp_end + w_in + 4); -// -// tmp0 = vextq_f32(in4, in5, 1); -// tmp1 = vextq_f32(in4, in5, 2); -// tmp2 = vextq_f32(in6, in7, 1); -// tmp3 = vextq_f32(in6, in7, 2); -// -// sum = vaddq_f32(in0, tmp0); -// sum = vaddq_f32(sum, tmp1); -// sum = vaddq_f32(sum, in2); -// sum = vaddq_f32(sum, tmp2); -// sum = vaddq_f32(sum, tmp3); -// -// vst1q_f32(output_ptr + (h_out - 1) * w_out, vmulq_f32(sum, v_coef)); -// -// // can optimize to each 8 stride. 
-// input_tmp += 4; -// input_tmp_end += 4; -// output_ptr += 4; -// in0 = in1; -// in2 = in3; -// in4 = in5; -// in6 = in7; -// } -// // top right remain -// float32x4_t pad0 = vdupq_n_f32(input_seg[w_in - 1]); -// float32x4_t pad1 = vdupq_n_f32(input_seg[2 * w_in - 1]); -// -// tmp0 = vextq_f32(in0, pad0, 1); -// tmp1 = vextq_f32(in0, pad0, 2); -// tmp2 = vextq_f32(in2, pad1, 2); -// tmp3 = vextq_f32(in2, pad1, 2); -// -// sum = vaddq_f32(in0, tmp0); -// sum = vaddq_f32(sum, tmp1); -// sum = vaddq_f32(sum, in2); -// sum = vaddq_f32(sum, tmp2); -// sum = vaddq_f32(sum, tmp3); -// out0 = vmulq_f32(sum, v_coef); -// -// for (int i = 0; i < c_mid; ++i) { -// if (i == 0) { -// vst1q_lane_f32(output_ptr + i, out0, 0); -// } -// if (i == 1) { -// vst1q_lane_f32(output_ptr + i, out0, 1); -// } -// if (i == 2) { -// vst1q_lane_f32(output_ptr + i, out0, 2); -// } -// } -// -// // bottom_right remain -// float32x4_t pad2 = vdupq_n_f32(input_seg[(h_in - 1) * w_in - 1]); -// float32x4_t pad3 = vdupq_n_f32(input_seg[h_in * w_in - 1]); -// -// tmp0 = vextq_f32(in4, pad2, 1); -// tmp1 = vextq_f32(in4, pad2, 2); -// tmp2 = vextq_f32(in6, pad3, 2); -// tmp3 = vextq_f32(in6, pad3, 2); -// -// sum = vaddq_f32(in4, tmp0); -// sum = vaddq_f32(sum, tmp1); -// sum = vaddq_f32(sum, in6); -// sum = vaddq_f32(sum, tmp2); -// sum = vaddq_f32(sum, tmp3); -// out0 = vmulq_f32(sum, v_coef); -// -// for (int i = 0; i < c_mid; ++i) { -// if (i == 0) { -// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 0); -// } -// if (i == 1) { -// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 1); -// } -// if (i == 2) { -// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 2); -// } -// } -// // mid -// for (int j = 0; j < h_out - 2; ++j) { -// output_ptr = output_seg + w_out * (j + 1) + 1; -// input_tmp = input_seg + j * w_in; -// -// in0 = vld1q_f32(input_tmp); -// in2 = vld1q_f32(input_tmp + w_in); -// in4 = vld1q_f32(input_tmp + 2 * w_in); -// c_mid = w_out - 2; -// for (; c_mid > 3; c_mid -= 4) { -// in1 = vld1q_f32(input_tmp + 4); -// in3 = vld1q_f32(input_tmp + w_in + 4); -// in5 = vld1q_f32(input_tmp + 2 * w_in + 4); -// -// tmp0 = vextq_f32(in0, in1, 1); -// tmp1 = vextq_f32(in0, in1, 2); -// tmp2 = vextq_f32(in2, in3, 1); -// tmp3 = vextq_f32(in2, in3, 2); -// tmp4 = vextq_f32(in4, in5, 1); -// tmp5 = vextq_f32(in4, in5, 2); -// -// sum = vaddq_f32(in0, tmp0); -// sum = vaddq_f32(sum, tmp1); -// sum = vaddq_f32(sum, in2); -// sum = vaddq_f32(sum, tmp2); -// sum = vaddq_f32(sum, tmp3); -// sum = vaddq_f32(sum, in4); -// sum = vaddq_f32(sum, tmp4); -// sum = vaddq_f32(sum, tmp5); -// -// out0 = vmulq_f32(sum, v_coef); -// vst1q_f32(output_ptr, out0); -// output_ptr += 4; -// input_tmp += 4; -// in0 = in1; -// in2 = in3; -// in4 = in5; -// } -// // mid remain -// float32x4_t pad0 = vdupq_n_f32(input_seg[(j + 1) * w_in - 1]); -// float32x4_t pad1 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]); -// float32x4_t pad2 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]); -// -// tmp0 = vextq_f32(in0, pad0, 1); -// tmp1 = vextq_f32(in0, pad0, 2); -// tmp2 = vextq_f32(in2, pad1, 1); -// tmp3 = vextq_f32(in2, pad1, 2); -// tmp4 = vextq_f32(in4, pad2, 1); -// tmp5 = vextq_f32(in4, pad2, 2); -// -// sum = vaddq_f32(in0, tmp0); -// sum = vaddq_f32(sum, tmp1); -// sum = vaddq_f32(sum, in2); -// sum = vaddq_f32(sum, tmp2); -// sum = vaddq_f32(sum, tmp3); -// sum = vaddq_f32(sum, in4); -// sum = vaddq_f32(sum, tmp4); -// sum = vaddq_f32(sum, tmp5); -// out0 = vmulq_f32(sum, v_coef); -// -// for (int i = 0; i < c_mid; 
++i) { -// if (i == 0) { -// vst1q_lane_f32(output_ptr + i, out0, 0); -// } -// if (i == 1) { -// vst1q_lane_f32(output_ptr + i, out0, 1); -// } -// if (i == 2) { -// vst1q_lane_f32(output_ptr + i, out0, 2); -// } -// } -// } -// // input_data += inputdata_channel_stride; -// // out_data += outputdata_channel_stride; -// } -// input_data += input_batch_stride; -// out_data += output_batch_stride; -// } -#endif -} - -void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) { -#if __ARM_NEON - const int batch_size = input->dims()[0]; - - const int h_in = input->dims()[2]; - - const int w_in = input->dims()[3]; - - const int output_channels = output->dims()[1]; - - const int h_out = output->dims()[2]; - const int w_out = output->dims()[3]; - const int outputdata_channel_stride = h_out * w_out; - const int inputdata_channel_stride = h_in * w_in; - const int input_batch_stride = output_channels * inputdata_channel_stride; - const int output_batch_stride = output_channels * outputdata_channel_stride; - float *out_data = output->mutable_data(); - const float *input_data = input->data(); - for (int k = 0; k < batch_size; ++k) { -#pragma omp parallel for - for (int c = 0; c < output_channels; ++c) { - const float *input_seg = input_data + c * inputdata_channel_stride; - float *output_seg = out_data + c * outputdata_channel_stride; - // four corner point - output_seg[0] = std::max(std::max(input_seg[0], input_seg[1]), - std::max(input_seg[w_in], input_seg[w_in + 1])); - output_seg[w_out - 1] = - std::max(std::max(input_seg[w_in - 2], input_seg[w_in - 1]), - std::max(input_seg[w_in * 2 - 2], input_seg[2 * w_in - 1])); - output_seg[(h_out - 1) * w_out] = - std::max(std::max(input_seg[(h_in - 2) * w_in], - input_seg[(h_in - 2) * w_in + 1]), - std::max(input_seg[(h_in - 1) * w_in], - input_seg[(h_in - 1) * w_in + 1])); - output_seg[h_out * w_out - 1] = std::max( - std::max(input_seg[(h_in - 1) * w_in - 1], - input_seg[(h_in - 1) * w_in - 2]), - std::max(input_seg[h_in * w_in - 1], input_seg[h_in * w_in - 2])); - // left side & right side - for (int i = 1; i < h_in - 1; ++i) { - float max1 = std::max(input_seg[i * w_in - w_in], - input_seg[i * w_in - w_in + 1]); - float max2 = std::max(input_seg[i * w_in], input_seg[i * w_in + 1]); - float max3 = std::max(input_seg[i * w_in + w_in], - input_seg[i * w_in + w_in + 1]); - output_seg[i * w_out] = std::max(std::max(max1, max2), max3); - - max1 = std::max(input_seg[i * w_in - w_in + w_in - 2], - input_seg[i * w_in - w_in + 1 + w_in - 2]); - max2 = std::max(input_seg[i * w_in + w_in - 2], - input_seg[i * w_in + 1 + w_in - 2]); - max3 = std::max(input_seg[i * w_in + w_in + w_in - 2], - input_seg[i * w_in + w_in + 1 + w_in - 2]); - output_seg[i * w_out + w_out - 1] = - std::max(std::max(max1, max2), max3); - } - // top 1 row & bottom 1 row - const float *input_tmp = input_seg; - - float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, - tmp3, tmp4, tmp5, max; - in0 = vld1q_f32(input_tmp); - in2 = vld1q_f32(input_tmp + w_in); - const float *input_tmp_end = input_tmp + (h_in - 2) * w_in; - in4 = vld1q_f32(input_tmp_end); - in6 = vld1q_f32(input_tmp_end + w_in); - int c_mid = w_out - 2; - auto output_ptr = output_seg + 1; - for (; c_mid > 3; c_mid -= 4) { - in1 = vld1q_f32(input_tmp + 4); - in3 = vld1q_f32(input_tmp + w_in + 4); - - tmp0 = vextq_f32(in0, in1, 1); - tmp1 = vextq_f32(in0, in1, 2); - - tmp2 = vextq_f32(in2, in3, 1); - tmp3 = vextq_f32(in2, in3, 2); - - max = vmaxq_f32(in0, tmp0); - max = vmaxq_f32(max, tmp1); - max = vmaxq_f32(max, in2); 
- max = vmaxq_f32(max, tmp2); - max = vmaxq_f32(max, tmp3); - - vst1q_f32(output_ptr, max); - - in5 = vld1q_f32(input_tmp_end + 4); - in7 = vld1q_f32(input_tmp_end + w_in + 4); - - tmp0 = vextq_f32(in4, in5, 1); - tmp1 = vextq_f32(in4, in5, 2); - tmp2 = vextq_f32(in6, in7, 1); - tmp3 = vextq_f32(in6, in7, 2); - - max = vmaxq_f32(in4, tmp0); - max = vmaxq_f32(max, tmp1); - max = vmaxq_f32(max, in6); - max = vmaxq_f32(max, tmp2); - max = vmaxq_f32(max, tmp3); - - vst1q_f32(output_ptr + (h_out - 1) * w_out, max); - - input_tmp += 4; - input_tmp_end += 4; - output_ptr += 4; - in0 = in1; - in2 = in3; - in4 = in5; - in6 = in7; - } - // top right remain - float32x4_t pad0 = vdupq_n_f32(input_seg[w_in - 1]); - float32x4_t pad1 = vdupq_n_f32(input_seg[2 * w_in - 1]); - - tmp0 = vextq_f32(in0, pad0, 1); - tmp1 = vextq_f32(in0, pad0, 2); - tmp2 = vextq_f32(in2, pad1, 1); - tmp3 = vextq_f32(in2, pad1, 2); - - max = vmaxq_f32(in0, tmp0); - max = vmaxq_f32(max, tmp1); - max = vmaxq_f32(max, in2); - max = vmaxq_f32(max, tmp2); - max = vmaxq_f32(max, tmp3); - - for (int i = 0; i < c_mid; ++i) { - if (i == 0) { - vst1q_lane_f32(output_ptr + i, max, 0); - } - if (i == 1) { - vst1q_lane_f32(output_ptr + i, max, 1); - } - if (i == 2) { - vst1q_lane_f32(output_ptr + i, max, 2); - } - } - - // bottom_right remain - float32x4_t pad2 = vdupq_n_f32(input_seg[(h_in - 1) * w_in - 1]); - float32x4_t pad3 = vdupq_n_f32(input_seg[h_in * w_in - 1]); - - tmp0 = vextq_f32(in4, pad2, 1); - tmp1 = vextq_f32(in4, pad2, 2); - tmp2 = vextq_f32(in6, pad3, 1); - tmp3 = vextq_f32(in6, pad3, 2); - - max = vmaxq_f32(in4, tmp0); - max = vmaxq_f32(max, tmp1); - max = vmaxq_f32(max, in6); - max = vmaxq_f32(max, tmp2); - max = vmaxq_f32(max, tmp3); - - for (int i = 0; i < c_mid; ++i) { - if (i == 0) { - vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, max, 0); - } - if (i == 1) { - vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, max, 1); - } - if (i == 2) { - vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, max, 2); - } - } - // mid - for (int j = 0; j < h_out - 2; ++j) { - output_ptr = output_seg + (j + 1) * w_out + 1; - input_tmp = input_seg + j * w_in; - - in0 = vld1q_f32(input_tmp); - in2 = vld1q_f32(input_tmp + w_in); - in4 = vld1q_f32(input_tmp + 2 * w_in); - c_mid = w_out - 2; - for (; c_mid > 3; c_mid -= 4) { - in1 = vld1q_f32(input_tmp + 4); - in3 = vld1q_f32(input_tmp + w_in + 4); - in5 = vld1q_f32(input_tmp + 2 * w_in + 4); - - tmp0 = vextq_f32(in0, in1, 1); - tmp1 = vextq_f32(in0, in1, 2); - tmp2 = vextq_f32(in2, in3, 1); - tmp3 = vextq_f32(in2, in3, 2); - tmp4 = vextq_f32(in4, in5, 1); - tmp5 = vextq_f32(in4, in5, 2); - - max = vmaxq_f32(in0, tmp0); - max = vmaxq_f32(max, tmp1); - max = vmaxq_f32(max, in2); - max = vmaxq_f32(max, tmp2); - max = vmaxq_f32(max, tmp3); - max = vmaxq_f32(max, in4); - max = vmaxq_f32(max, tmp4); - max = vmaxq_f32(max, tmp5); - - vst1q_f32(output_ptr, max); - output_ptr += 4; - input_tmp += 4; - in0 = in1; - in2 = in3; - in4 = in5; - } - // mid remain - float32x4_t pad0 = vdupq_n_f32(input_seg[(j + 1) * w_in - 1]); - float32x4_t pad1 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]); - float32x4_t pad2 = vdupq_n_f32(input_seg[(j + 3) * w_in - 1]); - - tmp0 = vextq_f32(in0, pad0, 1); - tmp1 = vextq_f32(in0, pad0, 2); - tmp2 = vextq_f32(in2, pad1, 1); - tmp3 = vextq_f32(in2, pad1, 2); - tmp4 = vextq_f32(in4, pad2, 1); - tmp5 = vextq_f32(in4, pad2, 2); - - max = vmaxq_f32(in0, tmp0); - max = vmaxq_f32(max, tmp1); - max = vmaxq_f32(max, in2); - max = vmaxq_f32(max, tmp2); - max = 
vmaxq_f32(max, tmp3); - max = vmaxq_f32(max, in4); - max = vmaxq_f32(max, tmp4); - max = vmaxq_f32(max, tmp5); - - for (int i = 0; i < c_mid; ++i) { - if (i == 0) { - vst1q_lane_f32(output_ptr + i, max, 0); - } - if (i == 1) { - vst1q_lane_f32(output_ptr + i, max, 1); - } - if (i == 2) { - vst1q_lane_f32(output_ptr + i, max, 2); - } - } - } - // input_data += inputdata_channel_stride; - // out_data += outputdata_channel_stride; - } - input_data += input_batch_stride; - out_data += output_batch_stride; - } -#else - -#endif -} - -void Pool3x3Max(vector strides, vector paddings, const Tensor *input, - Tensor *output) { -#if __ARM_NEON - const int batch_size = input->dims()[0]; - - const int input_height = input->dims()[2]; - - const int input_width = input->dims()[3]; - - const int output_channels = output->dims()[1]; - - const int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; - // const int _kernel_size = 3; - const int stride = strides[0]; - // const int stride_width = strides[1]; - const int padding = paddings[0]; - // const int padding_width = paddings[1]; - const float negative_max = -INT_MAX; - const int input_channel_stride = input_height * input_width; - const int output_channel_stride = output_height * output_width; - - const float *input_data = input->data(); - float *output_data = output->mutable_data(); - - const int input_batch_stride = output_channels * input_channel_stride; - const int output_batch_stride = output_channels * output_channel_stride; - const float *pos1, *output_ptr; - int hstart, wstart, hend, wend; - for (int i = 0; i < batch_size; ++i) { -#pragma omp parallel for - for (int c = 0; c < output_channels; ++c) { - const float *input_seg = input_data + c * input_channel_stride; - float *output_seg = output_data + c * output_channel_stride; - for (int ph = 0; ph < output_height; ph++) { - int hstart = ph * stride - padding; - int hend = min(hstart + 3, input_height); - hstart = max(hstart, 0); - for (int pw = 0; pw < output_width; pw++) { - int wstart = pw * stride - padding; - int wend = min(wstart + 3, input_width); - wstart = max(wstart, 0); - const float *pos1 = input_seg + hstart * input_width + wstart; - const float *pos2 = input_seg + (hstart + 1) * input_width + wstart; - const float *pos3 = input_seg + (hstart + 2) * input_width + wstart; - output_ptr = output_seg + ph * output_width + pw; - - if (hend - hstart != 3 || wend - wstart != 3) { - float max_value = -INT_MAX; - for (int h = hstart; h < hend; h++) { - for (int w = wstart; w < wend; w++) { - float value = input_seg[h * input_width + w]; - if (value > max_value) { - max_value = value; - } - } - } - output_seg[ph * output_width + pw] = max_value; - } else { -#if __aarch64__ - const float32x4_t data1 = vld1q_f32(pos1); - const float32x4_t data2 = vld1q_f32(pos1 + input_width); - const float32x4_t data3 = vld1q_f32(pos1 + 2 * input_width); - const float32x4_t max_data = - vmaxq_f32(vmaxq_f32(data1, data2), data3); - float32x2_t res = - vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)), - vget_low_f32(max_data)); - res = vpmax_f32(res, res); - output_seg[ph * output_width + pw] = vget_lane_f32(res, 0); -#else - asm volatile( - "vld1.32 {q1}, [%[pos1]] \n\t" - "vld1.32 {q2}, [%[pos2]] \n\t" - "vld1.32 {q3}, [%[pos3]] \n\t" - "vmax.f32 q1, q1, q2 \n\t" - "vmax.f32 q2, q1, q3 \n\t" - "vmov.f32 d5[1], %[negative_max] \n\t" - "vpmax.f32 d6, d4, d5 \n\t" - "vpmax.f32 d7, d6, d6 \n\t" - "vst1.32 {d7[0]},[%[output_ptr]] \n\t" - : - : [input_seg] "r"(input_seg), 
[pos1] "r"(pos1), - [pos2] "r"(pos2), [pos3] "r"(pos3), - [output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max) - : "memory", "q1", "q2", "q3", "q4"); -#endif - } - } - } - } - input_data += input_batch_stride; - output_data += output_batch_stride; - } -#endif -} - -void Pool3x3Avg(vector strides, vector paddings, const Tensor *input, - Tensor *output) { -#if __ARM_NEON - const int batch_size = input->dims()[0]; - - const int input_height = input->dims()[2]; - - const int input_width = input->dims()[3]; - - const int output_channels = output->dims()[1]; - - const int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; - const int stride = strides[0]; - const int padding = paddings[0]; - - const int input_channel_stride = input_height * input_width; - const int output_channel_stride = output_height * output_width; - - const float *input_data = input->data(); - float *output_data = output->mutable_data(); - const float zero = 0; - const float nine = 1.0 / 9.0; - const float nine_ptr[] = {nine, nine}; - - const int input_batch_stride = output_channels * input_channel_stride; - const int output_batch_stride = output_channels * output_channel_stride; - for (int i = 0; i < batch_size; ++i) { -#pragma omp parallel for - for (int c = 0; c < output_channels; ++c) { - const float *input_seg = input_data + c * input_channel_stride; - float *output_seg = output_data + c * output_channel_stride; - for (int ph = 0; ph < output_height; ph++) { - for (int pw = 0; pw < output_width; pw++) { - int hstart = ph * stride - padding; - int wstart = pw * stride - padding; - int hend = min(hstart + 3, input_height + padding); - int wend = min(wstart + 3, input_width + padding); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, input_height); - wend = min(wend, input_width); - - const float *pos1 = input_seg + hstart * input_width + wstart; - const float *pos2 = input_seg + (hstart + 1) * input_width + wstart; - const float *pos3 = input_seg + (hstart + 2) * input_width + wstart; - float *output_ptr = output_seg + ph * output_width + pw; - - if (hend - hstart != 3 || wend - wstart != 3) { - float sum = 0; - for (int h = hstart; h < hend; h++) { - for (int w = wstart; w < wend; w++) { - sum += input_seg[h * input_width + w]; - } - } - output_seg[ph * output_width + pw] = - sum / ((hend - hstart) * (wend - wstart) * 1.0); - } else { -#if __aarch64__ -#else - asm volatile( - "vld1.32 {q1}, [%[pos1]] \n\t" - "vld1.32 {q2}, [%[pos2]] \n\t" - "vld1.32 {q3}, [%[pos3]] \n\t" - "vadd.f32 q1, q1, q2 \n\t" - "vadd.f32 q2, q1, q3 \n\t" - "vmov.f32 d5[1], %[zero] \n\t" - "vpadd.f32 d6, d4, d5 \n\t" - "vpadd.f32 d6, d6, d6 \n\t" - "vld1.f32 d7, [%[nine_ptr]]! 
\n\t" - "vmul.f32 d6,d7 \n\t" - "vst1.32 {d6[0]},[%[output_ptr]] \n\t" - : - : [input_seg] "r"(input_seg), [pos1] "r"(pos1), - [pos2] "r"(pos2), [pos3] "r"(pos3), - [output_ptr] "r"(output_ptr), [zero] "r"(zero), - [nine_ptr] "r"(nine_ptr) - : "memory", "r6", "q1", "q2", "q3", "q4"); -#endif - const float32x4_t data1 = vld1q_f32(pos1); - const float32x4_t data2 = vld1q_f32(pos2); - const float32x4_t data3 = vld1q_f32(pos3); - const float32x4_t sum_data = - vaddq_f32(vaddq_f32(data1, data3), data2); - float32x2_t res = - vpadd_f32(vget_high_f32(vsetq_lane_f32(0, sum_data, 3)), - vget_low_f32(sum_data)); - res = vpadd_f32(res, res); - output_seg[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0; - } - } - } - } - input_data += input_batch_stride; - output_data += output_batch_stride; - } -#else -#endif -} -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/math/pool_3x3.h b/src/operators/math/pool_3x3.h deleted file mode 100644 index a13cb6ab37..0000000000 --- a/src/operators/math/pool_3x3.h +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef POOL_OP - -#pragma once -#ifdef _OPENMP -#include -#endif -#include -#include -#include "framework/tensor.h" -#if __ARM_NEON -#include -#endif // __ARM_NEON - -namespace paddle_mobile { -namespace operators { -namespace math { -void Pool3x3Avgs1p1(const framework::Tensor *input, framework::Tensor *output); -void Pool3x3Maxs1p1(const framework::Tensor *input, framework::Tensor *output); -void Pool3x3Max(std::vector strides, std::vector paddings, - const framework::Tensor *input, framework::Tensor *output); - -void Pool3x3Avg(std::vector strides, std::vector paddings, - const framework::Tensor *in_x, framework::Tensor *out); - -void Pool3x3Maxs1_int8(const framework::Tensor *input, - framework::Tensor *output, int32_t pad_h, int32_t pad_w); -void Pool3x3Maxs2_int8(const framework::Tensor *input, - framework::Tensor *output, int32_t pad_h, int32_t pad_w); -void Pool3x3Max_int8(const std::vector &strides, - const std::vector &paddings, - const framework::Tensor *input, framework::Tensor *output); -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/math/pool_3x3_int8.cpp b/src/operators/math/pool_3x3_int8.cpp deleted file mode 100644 index d344c489ae..0000000000 --- a/src/operators/math/pool_3x3_int8.cpp +++ /dev/null @@ -1,564 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef POOL_OP -#ifdef _OPENMP -#include -#endif -#include "framework/tensor.h" -#include "operators/math/pool_3x3.h" -#if __ARM_NEON -#include -#endif // __ARM_NEON -#include -#include -namespace paddle_mobile { -namespace operators { -namespace math { -using framework::Tensor; -using std::max; -using std::min; -using std::vector; -template -static void make_paddings(const Tensor *input, Tensor *padded_input, - int32_t top, int32_t bottom, int32_t left, - int32_t right, T value) { - const int32_t batch_size = input->dims()[0]; - const int32_t c_in = input->dims()[1]; - const int32_t h_in = input->dims()[2]; - const int32_t w_in = input->dims()[3]; - const int32_t h_padded = h_in + top + bottom; - const int32_t w_padded = w_in + left + right; - padded_input->Resize({batch_size, c_in, h_padded, w_padded}); - T *padded_input_data = padded_input->mutable_data(); - const T *input_data = input->data(); - const int32_t input_channel_stride = h_in * w_in; - const int32_t input_batch_stride = c_in * input_channel_stride; - const int32_t padded_channel_stride = h_padded * w_padded; - const int32_t padded_batch_stride = c_in * padded_channel_stride; - for (int i = 0; i < batch_size; ++i) { -#pragma omp parallel for - for (int j = 0; j < c_in; ++j) { - const T *img_in = input_data + j * input_channel_stride; - T *img_padded = padded_input_data + j * padded_channel_stride; - int k = 0; - for (; k < top; ++k) { - for (int l = 0; l < w_padded; ++l) { - img_padded[l] = value; - } - img_padded += w_padded; - } - for (; k < top + h_in; ++k) { - int l = 0; - for (; l < left; ++l) { - img_padded[l] = value; - } - memcpy(img_padded + left, img_in, w_in * sizeof(T)); - l += w_in; - img_in += w_in; - for (; l < w_padded; ++l) { - img_padded[l] = value; - } - img_padded += w_padded; - } - for (; k < h_padded; ++k) { - for (int l = 0; l < w_padded; ++l) { - img_padded[l] = value; - } - img_padded += w_padded; - } - } - input_data += input_batch_stride; - padded_input_data += padded_batch_stride; - } - // input_data = input->data(); - // std::cout << "+++++++++++++++++++Origin begin++++++++++++++++++++" - // << std::endl; - // for (int i = 0; i < 1; ++i) { - // for (int j = 0; j < 1; ++j) { - // const T *img_in = input_data + j * input_channel_stride; - // for (int k = 0; k < h_in; ++k) { - // for (int l = 0; l < w_in; ++l) { - // std::cout << (int32_t)*img_in << "\t"; - // img_in++; - // } - // std::cout << std::endl; - // } - // } - // input_data += input_batch_stride; - // } - // std::cout << "+++++++++++++++++++Origin end++++++++++++++++++++" << - // std::endl; - // - // padded_input_data = padded_input->data(); - // std::cout << "******************Padding begin**********************" - // << std::endl; - // for (int i = 0; i < 1; ++i) { - // for (int j = 0; j < 1; ++j) { - // T *img_padded = padded_input_data + j * padded_channel_stride; - // for (int k = 0; k < h_padded; ++k) { - // for (int l = 0; l < w_padded; ++l) { - // std::cout << (int32_t)*img_padded << "\t"; - // img_padded++; - // } - // std::cout << std::endl; - // } - // } - // padded_input_data += padded_batch_stride; - // } - // std::cout << "******************Padding end**********************" - // << std::endl; -} -void Pool3x3Maxs1_int8(const Tensor *input, Tensor *output, int32_t pad_h, - int32_t pad_w) { - Tensor padded_input; - if (pad_h != 0 && pad_w != 0) { - int8_t value = -SCHAR_MAX; - make_paddings(input, &padded_input, pad_h, 
pad_h, pad_w, pad_w, value); - input = &padded_input; - } - const int32_t batch_size = input->dims()[0]; - const int32_t h_in = input->dims()[2]; - const int32_t w_in = input->dims()[3]; - const int8_t *input_data = input->data(); - const int32_t output_channels = output->dims()[1]; - const int32_t h_out = output->dims()[2]; - const int32_t w_out = output->dims()[3]; - int8_t *output_data = output->mutable_data(); - const int32_t outputdata_channel_stride = h_out * w_out; - const int32_t inputdata_channel_stride = h_in * w_in; - const int32_t input_batch_stride = output_channels * inputdata_channel_stride; - const int32_t output_batch_stride = - output_channels * outputdata_channel_stride; - // std::cout << "h_out = " << h_out << ", w_out=" << w_out << std::endl; - for (int i = 0; i < batch_size; ++i) { -#pragma omp parallel for - for (int j = 0; j < output_channels; ++j) { - const int8_t *img_in = input_data + j * inputdata_channel_stride; - int8_t *img_out = output_data + j * outputdata_channel_stride; - for (int k = 0; k < h_out; ++k) { - const int8_t *row0 = img_in + k * w_in; - const int8_t *row1 = img_in + (k + 1) * w_in; - const int8_t *row2 = img_in + (k + 2) * w_in; -#if __ARM_NEON - int32_t nw = w_out >> 4; - int32_t left_w = w_out & 0xf; - int32_t nw1 = left_w >> 3; - int32_t left_w1 = left_w & 0x7; -#if __aarch64__ - // TODO -#else - if (nw > 0) { -#define LOOP_LABEL "1" - // result: q15 - asm volatile( - "vld1.8 {q0}, [%[row0]]! \n\t" // q0=0-15 - "vld1.8 {q2}, [%[row1]]! \n\t" - "vld1.8 {q4}, [%[row2]]! \n\t" - - LOOP_LABEL - ": \n\t" - "vld1.8 {q1}, [%[row0]]! \n\t" // q1=16-31 - "vext.8 q6, q0, q1, #1 \n\t" - "vext.8 q7, q0, q1, #2 \n\t" - "vld1.8 {q3}, [%[row1]]! \n\t" - "vmax.s8 q15, q0, q6 \n\t" - "vmax.s8 q15, q15, q7 \n\t" - "vext.8 q6, q2, q3, #1 \n\t" - "vext.8 q7, q2, q3, #2 \n\t" - "vld1.8 {q5}, [%[row2]]! \n\t" - "vmax.s8 q14, q2, q6 \n\t" - "vmax.s8 q14, q14, q7 \n\t" - "vext.8 q6, q4, q5, #1 \n\t" - "vext.8 q7, q4, q5, #2 \n\t" - "vmax.s8 q13, q4, q6 \n\t" - "vmax.s8 q13, q13, q7 \n\t" - "vmax.s8 q15, q15, q14 \n\t" - "vmax.s8 q15, q15, q13 \n\t" - "vmov.s8 q0, q1 \n\t" - "vmov.s8 q2, q3 \n\t" - "vmov.s8 q4, q5 \n\t" - "vst1.8 {q15}, [%[img_out]]! \n\t" - "subs %[nw], #1 \n\t" - "bne " LOOP_LABEL - "b \n\t" - "sub %[row0], #16 \n\t" - "sub %[row1], #16 \n\t" - "sub %[row2], #16 \n\t" - : [nw] "+r"(nw), [row0] "+r"(row0), [row1] "+r"(row1), - [row2] "+r"(row2), [img_out] "+r"(img_out) - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q13", "q14", "q15"); -#undef LOOP_LABEL - } - if (nw1 > 0 || left_w1 > 0) { -#define PADDLE_LABEL_LESS8 "1" -#define PADDLE_LABEL_LESS8_SAVE "2" -#define PADDLE_LABEL_OVER "3" - // result: d15 - asm volatile( - "vld1.8 {d0}, [%[row0]]! \n\t" // d0=0-8 - "vld1.8 {d2}, [%[row1]]! \n\t" - "vld1.8 {d4}, [%[row2]]! \n\t" - "mov r0, #1 \n\t" - "cmp %[nw1], #0 \n\t" - "beq " PADDLE_LABEL_LESS8 - "f\n\t" - "vld1.8 {d1}, [%[row0]]! \n\t" // d1=9-15 - "vext.8 d6, d0, d1, #1 \n\t" - "vext.8 d7, d0, d1, #2 \n\t" - "vld1.8 {d3}, [%[row1]]! \n\t" - "vmax.s8 d15, d0, d6 \n\t" - "vmax.s8 d15, d15, d7 \n\t" - "vext.8 d6, d2, d3, #1 \n\t" - "vext.8 d7, d2, d3, #2 \n\t" - "vld1.8 {d5}, [%[row2]]! 
\n\t" - "vmax.s8 d14, d2, d6 \n\t" - "vmax.s8 d14, d14, d7 \n\t" - "vext.8 d6, d4, d5, #1 \n\t" - "vext.8 d7, d4, d5, #2 \n\t" - "vmax.s8 d13, d4, d6 \n\t" - "vmax.s8 d13, d13, d7 \n\t" - "vmax.s8 d15, d15, d14 \n\t" - "vmax.s8 d15, d15, d13 \n\t" - "vmov.s8 d0, d1 \n\t" - "vmov.s8 d2, d3 \n\t" - "vmov.s8 d4, d5 \n\t" - "vst1.8 {d15}, [%[img_out]]! \n\t" - - PADDLE_LABEL_LESS8 - ": \n\t" - "cmp %[left_w1], #0 \n\t" - "beq " PADDLE_LABEL_OVER - "f\n\t" - "vld1.8 {d1}, [%[row0]] \n\t" // d1=9-15 - "vext.8 d6, d0, d1, #1 \n\t" - "vext.8 d7, d0, d1, #2 \n\t" - "vld1.8 {d3}, [%[row1]] \n\t" - "vmax.s8 d15, d0, d6 \n\t" - "vmax.s8 d15, d15, d7 \n\t" - "vext.8 d6, d2, d3, #1 \n\t" - "vext.8 d7, d2, d3, #2 \n\t" - "vld1.8 {d5}, [%[row2]] \n\t" - "vmax.s8 d14, d2, d6 \n\t" - "vmax.s8 d14, d14, d7 \n\t" - "vext.8 d6, d4, d5, #1 \n\t" - "vext.8 d7, d4, d5, #2 \n\t" - "vmax.s8 d13, d4, d6 \n\t" - "vmax.s8 d13, d13, d7 \n\t" - "vmax.s8 d15, d15, d14 \n\t" - "vmax.s8 d15, d15, d13 \n\t" - - PADDLE_LABEL_LESS8_SAVE - ": \n\t" - "vst1.8 {d15[0]}, [%[img_out]], r0\n\t" - "add %[row0], %[row0], #1 \n\t" - "add %[row1], %[row1], #1 \n\t" - "add %[row2], %[row2], #1 \n\t" - "vext.8 d15, d15, d15, #1 \n\t" - "subs %[left_w1], #1 \n\t" - "bgt " PADDLE_LABEL_LESS8_SAVE "b \n\t" - - PADDLE_LABEL_OVER ": \n\t" - : [nw1] "+r"(nw1), [left_w1] "+r"(left_w1), [row0] "+r"(row0), - [row1] "+r"(row1), [row2] "+r"(row2), [img_out] "+r"(img_out) - : - : "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", - "d7", "d13", "d14", "d15"); -#undef PADDLE_LABEL_OVER -#undef PADDLE_LABEL_LESS8_SAVE -#undef PADDLE_LABEL_LESS8 - } -#endif // __aarch64__ -#else - int32_t left = w_out; - while (left > 0) { - const int8_t max0 = std::max(std::max(row0[0], row0[1]), row0[2]); - const int8_t max1 = std::max(std::max(row1[0], row1[1]), row1[2]); - const int8_t max2 = std::max(std::max(row2[0], row2[1]), row2[2]); - *img_out = std::max(std::max(max0, max1), max2); - row0 += 1; - row1 += 1; - row2 += 1; - img_out++; - left--; - } -#endif // __ARM_NEON - } - } - input_data += input_batch_stride; - output_data += output_batch_stride; - } -} -void Pool3x3Maxs2_int8(const Tensor *input, Tensor *output, int32_t pad_h, - int32_t pad_w) { - Tensor padded_input; - if (pad_h != 0 && pad_w != 0) { - int8_t value = -SCHAR_MAX; - make_paddings(input, &padded_input, pad_h, pad_h, pad_w, pad_w, value); - input = &padded_input; - } - const int32_t batch_size = input->dims()[0]; - const int32_t h_in = input->dims()[2]; - const int32_t w_in = input->dims()[3]; - const int32_t output_channels = output->dims()[1]; - const int32_t h_out = output->dims()[2]; - const int32_t w_out = output->dims()[3]; - const int32_t outputdata_channel_stride = h_out * w_out; - const int32_t inputdata_channel_stride = h_in * w_in; - const int32_t output_batch_stride = - output_channels * outputdata_channel_stride; - const int32_t input_batch_stride = output_channels * inputdata_channel_stride; - const int8_t *input_data = input->data(); - int8_t *output_data = output->mutable_data(); - for (int i = 0; i < batch_size; ++i) { -#pragma omp parallel for - for (int j = 0; j < output_channels; ++j) { - const int8_t *img_in = input_data + j * inputdata_channel_stride; - int8_t *img_out = output_data + j * outputdata_channel_stride; - for (int k = 0; k < h_out; ++k) { - const int8_t *row0 = img_in + 2 * k * w_in; - const int8_t *row1 = img_in + (2 * k + 1) * w_in; - const int8_t *row2 = img_in + (2 * k + 2) * w_in; -#if __ARM_NEON - int32_t nw = w_out >> 4; - int32_t left_w 
= w_out & 0xf; - int32_t nw1 = left_w >> 3; - int32_t left_w1 = left_w & 0x7; -#if __aarch64__ - // TODO -#else - if (nw > 0) { -#define LOOP_LABEL "1" - // result: q15 - asm volatile( - "vld2.8 {q0, q1}, [%[row0]]! \n\t" // q0=0-30, q1=1-31 - "vld2.8 {q2, q3}, [%[row1]]! \n\t" - "vld2.8 {q4, q5}, [%[row2]]! \n\t" - - LOOP_LABEL - ": \n\t" - "vmax.s8 q15, q0, q1 \n\t" - "vld2.8 {q6, q7}, [%[row0]]! \n\t" // q0=32-62, q1=33-63 - "vmax.s8 q14, q2, q3 \n\t" - "vmax.s8 q13, q4, q5 \n\t" - "vld2.8 {q8, q9}, [%[row1]]! \n\t" - "vext.8 q0, q0, q6, #1 \n\t" - "vmax.s8 q15, q15, q0 \n\t" - "vld2.8 {q10, q11}, [%[row2]]! \n\t" - "vext.8 q2, q2, q8, #1 \n\t" - "vmax.s8 q14, q14, q2 \n\t" - "vext.8 q4, q4, q10, #1 \n\t" - "vmax.s8 q13, q13, q4 \n\t" - "vmax.s8 q15, q15, q14 \n\t" - "vmax.s8 q15, q15, q13 \n\t" - "vmov.s8 q0, q6 \n\t" - "vmov.s8 q1, q7 \n\t" - "vmov.s8 q2, q8 \n\t" - "vmov.s8 q3, q9 \n\t" - "vmov.s8 q4, q10 \n\t" - "vmov.s8 q5, q11 \n\t" - "vst1.8 {q15}, [%[img_out]]! \n\t" - "subs %[nw], #1 \n\t" - "bne " LOOP_LABEL - "b \n\t" - "sub %[row0], #32 \n\t" - "sub %[row1], #32 \n\t" - "sub %[row2], #32 \n\t" - : [nw] "+r"(nw), [row0] "+r"(row0), [row1] "+r"(row1), - [row2] "+r"(row2), [img_out] "+r"(img_out) - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q13", "q14", "q15"); -#undef LOOP_LABEL - } - if (nw1 > 0 || left_w1 > 0) { -#define PADDLE_LABEL_LESS8 "1" -#define PADDLE_LABEL_LESS8_SAVE "2" -#define PADDLE_LABEL_OVER "3" - // result: d15 - asm volatile( - "vld2.8 {d0, d1}, [%[row0]]! \n\t" // d0=0-14, d1=1-15 - "vld2.8 {d2, d3}, [%[row1]]! \n\t" - "vld2.8 {d4, d5}, [%[row2]]! \n\t" - "mov r0, #1 \n\t" - "cmp %[nw1], #0 \n\t" - "beq " PADDLE_LABEL_LESS8 - "f\n\t" - "vmax.s8 d15, d0, d1 \n\t" - "vld2.8 {d6, d7}, [%[row0]]! \n\t" // d0=32-62, d1=33-63 - "vmax.s8 d14, d2, d3 \n\t" - "vmax.s8 d13, d4, d5 \n\t" - "vld2.8 {d8, d9}, [%[row1]]! \n\t" - "vext.8 d0, d0, d6, #1 \n\t" - "vmax.s8 d15, d15, d0 \n\t" - "vld2.8 {d10, d11}, [%[row2]]! \n\t" - "vext.8 d2, d2, d8, #1 \n\t" - "vmax.s8 d14, d14, d2 \n\t" - "vext.8 d4, d4, d10, #1 \n\t" - "vmax.s8 d13, d13, d4 \n\t" - "vmax.s8 d15, d15, d14 \n\t" - "vmax.s8 d15, d15, d13 \n\t" - "vmov.s8 d0, d6 \n\t" - "vmov.s8 d1, d7 \n\t" - "vmov.s8 d2, d8 \n\t" - "vmov.s8 d3, d9 \n\t" - "vmov.s8 d4, d10 \n\t" - "vmov.s8 d5, d11 \n\t" - "vst1.8 {d15}, [%[img_out]]! 
\n\t" - - PADDLE_LABEL_LESS8 - ": \n\t" - "cmp %[left_w1], #0 \n\t" - "beq " PADDLE_LABEL_OVER - "f\n\t" - "vmax.s8 d15, d0, d1 \n\t" - "vld2.8 {d6, d7}, [%[row0]] \n\t" // d0=32-62, d1=33-63 - "vmax.s8 d14, d2, d3 \n\t" - "vmax.s8 d13, d4, d5 \n\t" - "vld2.8 {d8, d9}, [%[row1]] \n\t" - "vext.8 d0, d0, d6, #1 \n\t" - "vmax.s8 d15, d15, d0 \n\t" - "vld2.8 {d10, d11}, [%[row2]] \n\t" - "vext.8 d2, d2, d8, #1 \n\t" - "vmax.s8 d14, d14, d2 \n\t" - "vext.8 d4, d4, d10, #1 \n\t" - "vmax.s8 d13, d13, d4 \n\t" - "vmax.s8 d15, d15, d14 \n\t" - "vmax.s8 d15, d15, d13 \n\t" - - PADDLE_LABEL_LESS8_SAVE - ": \n\t" - "vst1.8 {d15[0]}, [%[img_out]], r0\n\t" - "add %[row0], %[row0], #2 \n\t" - "add %[row1], %[row1], #2 \n\t" - "add %[row2], %[row2], #2 \n\t" - "vext.8 d15, d15, d15, #1 \n\t" - "subs %[left_w1], #1 \n\t" - "bgt " PADDLE_LABEL_LESS8_SAVE "b \n\t" - - PADDLE_LABEL_OVER ": \n\t" - : [nw1] "+r"(nw1), [left_w1] "+r"(left_w1), [row0] "+r"(row0), - [row1] "+r"(row1), [row2] "+r"(row2), [img_out] "+r"(img_out) - : - : "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", - "d7", "d8", "d9", "d10", "d11", "d13", "d14", "d15"); -#undef PADDLE_LABEL_OVER -#undef PADDLE_LABEL_LESS8_SAVE -#undef PADDLE_LABEL_LESS8 - } -#endif // __aarch64__ -#else - int32_t left = w_out; - while (left > 0) { - const int8_t max0 = std::max(std::max(row0[0], row0[1]), row0[2]); - const int8_t max1 = std::max(std::max(row1[0], row1[1]), row1[2]); - const int8_t max2 = std::max(std::max(row2[0], row2[1]), row2[2]); - *img_out = std::max(std::max(max0, max1), max2); - row0 += 2; - row1 += 2; - row2 += 2; - img_out++; - left--; - } -#endif // __ARM_NEON - } - } - input_data += input_batch_stride; - output_data += output_batch_stride; - } -} -void Pool3x3Max_int8(const vector &strides, const vector &paddings, - const Tensor *input, Tensor *output) { - const int batch_size = input->dims()[0]; - const int input_height = input->dims()[2]; - const int input_width = input->dims()[3]; - const int output_channels = output->dims()[1]; - const int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; - // const int _kernel_size = 3; - const int stride = strides[0]; - // const int stride_width = strides[1]; - const int padding = paddings[0]; - // const int padding_width = paddings[1]; - const int8_t negative_max = -SCHAR_MAX; - const int input_channel_stride = input_height * input_width; - const int output_channel_stride = output_height * output_width; - const int8_t *input_data = input->data(); - int8_t *output_data = output->mutable_data(); - const int input_batch_stride = output_channels * input_channel_stride; - const int output_batch_stride = output_channels * output_channel_stride; - for (int i = 0; i < batch_size; ++i) { -#pragma omp parallel for - for (int c = 0; c < output_channels; ++c) { - const int8_t *input_seg = input_data + c * input_channel_stride; - int8_t *output_seg = output_data + c * output_channel_stride; - for (int ph = 0; ph < output_height; ph++) { - int hstart = ph * stride - padding; - int hend = min(hstart + 3, input_height); - hstart = max(hstart, 0); - for (int pw = 0; pw < output_width; pw++) { - int wstart = pw * stride - padding; - int wend = min(wstart + 3, input_width); - wstart = max(wstart, 0); - const int8_t *pos1 = input_seg + hstart * input_width + wstart; - const int8_t *pos2 = input_seg + (hstart + 1) * input_width + wstart; - const int8_t *pos3 = input_seg + (hstart + 2) * input_width + wstart; - int8_t *output_ptr = output_seg + ph * output_width + pw; - 
if (hend - hstart != 3 || wend - wstart != 3) { - int8_t max_value = -SCHAR_MAX; - for (int h = hstart; h < hend; h++) { - for (int w = wstart; w < wend; w++) { - int8_t value = input_seg[h * input_width + w]; - if (value > max_value) { - max_value = value; - } - } - } - output_seg[ph * output_width + pw] = max_value; - } else { -#if __ARM_NEON -#if __aarch64__ - // TODO -#else - asm volatile( - "vld1.8 {d0}, [%[pos1]] \n\t" - "vld1.8 {d1}, [%[pos2]] \n\t" - "vld1.8 {d2}, [%[pos3]] \n\t" - "vmax.s8 d3, d0, d1 \n\t" - "vmax.s8 d4, d2, d3 \n\t" - "vmov.s8 d4[3], %[negative_max] \n\t" - "vpmax.s8 d5, d4, d4 \n\t" - "vpmax.s8 d6, d5, d5 \n\t" - "vst1.8 {d6[0]},[%[output_ptr]] \n\t" - : - : [pos1] "r"(pos1), [pos2] "r"(pos2), [pos3] "r"(pos3), - [output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max) - : "memory", "q0", "q1", "q2", "q3"); -#endif -#else - const int8_t max0 = std::max(std::max(pos1[0], pos1[1]), pos1[2]); - const int8_t max1 = std::max(std::max(pos2[0], pos2[1]), pos2[2]); - const int8_t max2 = std::max(std::max(pos3[0], pos3[1]), pos3[2]); - *output_ptr = std::max(std::max(max0, max1), max2); -#endif // __ARM_NEON - } - } - } - } - input_data += input_batch_stride; - output_data += output_batch_stride; - } -} -} // namespace math -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/src/operators/math/pooling.cpp b/src/operators/math/pooling.cpp index f2be3f29db..3bb6c27c5a 100644 --- a/src/operators/math/pooling.cpp +++ b/src/operators/math/pooling.cpp @@ -15,87 +15,68 @@ limitations under the License. */ #ifdef POOL_OP #include "operators/math/pooling.h" -#include -#include -#include "common/types.h" -#ifdef _OPENMP -#include -#endif - namespace paddle_mobile { namespace operators { namespace math { -/* - * All tensors are in NCHW format. - * Ksize, strides, paddings are two elements. These two elements represent - * height and width, respectively. 
- */ -template -class PoolFunctor { - public: - void operator()(const framework::Tensor &input, const std::vector &ksize, - const std::vector &strides, - const std::vector &paddings, PoolProcess pool_process, - framework::Tensor *output) { - const int batch_size = input.dims()[0]; - - const int input_height = input.dims()[2]; - - const int input_width = input.dims()[3]; - - const int output_channels = output->dims()[1]; - - const int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; - const int ksize_height = ksize[0]; - const int ksize_width = ksize[1]; - const int stride_height = strides[0]; - const int stride_width = strides[1]; - const int padding_height = paddings[0]; - const int padding_width = paddings[1]; - - const int input_stride = input_height * input_width; - const int output_stride = output_height * output_width; - - const T *input_data = input.data(); - T *output_data = output->mutable_data(); - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - #pragma omp parallel for - for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - - auto ele = pool_process.initial(); - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - pool_process.compute(input_data[h * input_width + w], &ele); - } +template +void Pooling

<P>::operator()(const framework::Tensor &input,
+                        const std::vector<int> &kernel_size,
+                        const std::vector<int> &strides,
+                        const std::vector<int> &paddings,
+                        framework::Tensor *output) {
+  const int batch_size = input.dims()[0];
+  const int input_height = input.dims()[2];
+  const int input_width = input.dims()[3];
+  const int output_channels = output->dims()[1];
+  const int output_height = output->dims()[2];
+  const int output_width = output->dims()[3];
+  const int ksize_height = kernel_size[0];
+  const int ksize_width = kernel_size[1];
+  const int stride_height = strides[0];
+  const int stride_width = strides[1];
+  const int padding_height = paddings[0];
+  const int padding_width = paddings[1];
+
+  const float *input_data = input.data<float>();
+  float *output_data = output->mutable_data<float>();
+  const size_t input_spatial_size = input_height * input_width;
+  const size_t output_spatial_size = output_height * output_width;
+
+  #pragma omp parallel for collapse(2)
+  for (int i = 0; i < batch_size; i++) {
+    for (int c = 0; c < output_channels; ++c) {
+      int channel = i * output_channels + c;
+      const float *input_ptr = input_data + channel * input_spatial_size;
+      float *output_ptr = output_data + channel * output_spatial_size;
+
+      for (int ph = 0; ph < output_height; ++ph) {
+        int hstart = ph * stride_height - padding_height;
+        int hend = std::min(hstart + ksize_height, input_height);
+        hstart = std::max(hstart, 0);
+        for (int pw = 0; pw < output_width; ++pw) {
+          int wstart = pw * stride_width - padding_width;
+          int wend = std::min(wstart + ksize_width, input_width);
+          wstart = std::max(wstart, 0);
+
+          PoolingVal<P> val;

+          for (int h = hstart; h < hend; ++h) {
+            for (int w = wstart; w < wend; ++w) {
+              val += input_ptr[h * input_width + w];
             }
-          int pool_size = (hend - hstart) * (wend - wstart);
-          pool_process.finalize(static_cast<T>(pool_size), &ele);
-          output_data[ph * output_width + pw] = static_cast<T>(ele);
           }
+          output_ptr[ph * output_width + pw] = val.Value();
         }
-      input_data += input_stride;
-      output_data += output_stride;
       }
     }
   }
-};
+}
+
+template struct Pooling<Max>;
+template struct Pooling<Avg>;
-template class PoolFunctor, float>;
-template class PoolFunctor, float>;
-template class PoolFunctor, int8_t>;
-template class PoolFunctor, int8_t>;
 
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
-#endif
+#endif  // POOL_OP
diff --git a/src/operators/math/pooling.h b/src/operators/math/pooling.h
index 4d94550cc3..9407270a47 100644
--- a/src/operators/math/pooling.h
+++ b/src/operators/math/pooling.h
@@ -16,75 +16,143 @@ limitations under the License. */
 
 #pragma once
 
-#include <climits>
+#include <algorithm>
 #include <cmath>
-#include "common/log.h"
+#include <limits>
+#include <vector>
+#include "common/types.h"
 #include "framework/tensor.h"
-#include "pool_2x2.h"
-#include "pool_3x3.h"
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#endif
 
 namespace paddle_mobile {
 namespace operators {
 namespace math {
 
-#define FLT_MAX __FLT_MAX__
-
-/*
- * \brief Extracting simple operations from pooling.
- *        Both MaxPool and AvgPool need "initial", "compute" and "finalize"
- * operation.
- *        MaxPool initializes temp variable to the negative maximum to find the
- * maximum value in the pooling field.
- *        AvgPool initializes temp variable to the zero to accumulate all values
- * in pool pooling, and finally takes the average.
- *        MaxPoolGrad and AvgPoolGrad are gradient operations respectively.
- */
-template
-class MaxPool {
- public:
-  inline T initial() {
-    if (typeid(T) == typeid(int8_t)) {
-      return static_cast<T>(-SCHAR_MAX);
+template <PoolingType P>
+struct PoolingVal {
+  float val;
+  int count;
+  PoolingVal() {
+    val = -std::numeric_limits<float>::max();
+    count = 0;
+  }
+  inline PoolingVal<P> &operator+=(const float &x) {

+    val = std::max(val, x);
+    count += 1;
+    return *this;
+  }
+  float Value() const {
+    if (count > 0) {
+      return val;
     }
-    return static_cast<T>(-FLT_MAX);
+    return 0.f;
   }
-
-  inline void compute(const T &x, T *y) { *y = *y > x ? *y : x; }
-
-  inline void finalize(const T &pool_field, T *y) {}
 };
 
-template
-class AvgPool {
- public:
-  inline Otype initial() { return static_cast<Otype>(0); }
-
-  inline void compute(const Itype &x, Otype *y) { *y += x; }
-
-  inline void finalize(const float &pool_field, Otype *y) {
-    if (typeid(Itype) == typeid(int8_t)) {
-      float tmp = *y / pool_field;
-      if (tmp > SCHAR_MAX) {
-        *y = SCHAR_MAX;
-      } else if (tmp < -SCHAR_MAX) {
-        *y = -SCHAR_MAX;
-      } else {
-        *y = static_cast<Otype>(std::round(tmp));
-      }
-    } else {
-      *y /= pool_field;
+template <>
+struct PoolingVal<Avg> {
+  float val;
+  int count;
+  PoolingVal() {
+    val = 0.f;
+    count = 0;
+  }
+  inline PoolingVal<Avg> &operator+=(const float &x) {
+    val += x;
+    count += 1;
+    return *this;
+  }
+  float Value() const {
+    if (count > 0) {
+      return val / count;
     }
+    return 0.f;
   }
 };
 
-template
-class PoolFunctor {
- public:
-  void operator()(const framework::Tensor &input, const std::vector<int> &ksize,
-                  const std::vector<int> &strides,
-                  const std::vector<int> &paddings, PoolProcess pool_compute,
-                  framework::Tensor *output);
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+template <PoolingType P>
+inline float32x4_t vPoolPreq_f32(const float32x4_t &x1, const float32x4_t &x2) {
+  return vmaxq_f32(x1, x2);
+}
+
+template <>
+inline float32x4_t vPoolPreq_f32<Avg>(const float32x4_t &x1,
+                                      const float32x4_t &x2) {
+  return vaddq_f32(x1, x2);
+}
+
+template <PoolingType P>
+inline float32x4_t vPoolPostq_f32(const float32x4_t &x) {
+  return x;
+}
+
+template <>
+inline float32x4_t vPoolPostq_f32<Avg>(const float32x4_t &x) {
+  float32x4_t avg = vdupq_n_f32(1.f / 9);
+  return vmulq_f32(avg, x);
+}
+#endif  // __ARM_NEON__
+
+template <PoolingType P>
+inline float PoolPre(const float &x1, const float &x2) {
+  return std::max(x1, x2);
+}
+
+template <>
+inline float PoolPre<Avg>(const float &x1, const float &x2) {
+  return x1 + x2;
+}
+
+template <PoolingType P>
+inline float PoolPost(const float &x) {
+  return x;
+}
+
+template <>
+inline float PoolPost<Avg>(const float &x) {
+  return 1.f / 9 * x;
+}
+
+template <PoolingType P>
+struct Pooling {
+  inline void operator()(const framework::Tensor &input,
+                         const std::vector<int> &kernel_size,
+                         const std::vector<int> &strides,
+                         const std::vector<int> &paddings,
+                         framework::Tensor *output);
+};
+
+template <PoolingType P, int Stride>
+struct Pooling2x2 {
+  inline void operator()(const framework::Tensor &input,
+                         const std::vector<int> &paddings,
+                         framework::Tensor *output);
+};
+
+template <PoolingType P, int Stride>
+struct Pooling3x3 {
+  inline void operator()(const framework::Tensor &input,
+                         const std::vector<int> &paddings,
+                         framework::Tensor *output);
+};
+
+template <PoolingType P, int Stride>
+struct Pooling5x5 {
+  inline void operator()(const framework::Tensor &input,
+                         const std::vector<int> &paddings,
+                         framework::Tensor *output);
+};
+
+template <PoolingType P, int Stride>
+struct Pooling7x7 {
+  inline void operator()(const framework::Tensor &input,
+                         const std::vector<int> &paddings,
+                         framework::Tensor *output);
+};
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/math/pooling3x3.cpp b/src/operators/math/pooling3x3.cpp
new file mode 100644
index 0000000000..3918001d76
--- /dev/null
+++ b/src/operators/math/pooling3x3.cpp
@@ -0,0 +1,819 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef POOL_OP + +#include "operators/math/pooling.h" +#if defined(__ARM_NEON) || defined(__ARM_NEON__) +#include +#endif // __ARM_NEON + +namespace paddle_mobile { +namespace operators { +namespace math { + +#define POOLING3X3_NORMAL_BORDER(start, end) \ + for (int w = start; w < end; ++w) { \ + const int w_in_start = -padding_w + w * Stride; \ + const int w_in_end = w_in_start + 3; \ + const int w_start = w_in_start > 0 ? w_in_start : 0; \ + const int w_end = w_in_end < input_w ? w_in_end : input_w; \ + PoolingVal

<P> val;                                                        \
+    for (int h_in = h_start; h_in < h_end; ++h_in) {             \
+      for (int w_in = w_start; w_in < w_end; ++w_in) {           \
+        val += input[h_in * input_w + w_in];                     \
+      }                                                          \
+    }                                                            \
+    output_ptr[w] = val.Value();                                 \
+  }
+
+template <PoolingType P, int Stride>
+inline void Pooling3x3ValidCol(const float *input, const int h_output,
+                               const int h_output_end, const int w_output,
+                               const int input_h, const int input_w,
+                               const int padding_h, const int padding_w,
+                               const int output_w, float *output) {
+  const int w_in_start = -padding_w + w_output * Stride;
+  const int w_in_end = w_in_start + 3;
+  const int w_start = w_in_start > 0 ? w_in_start : 0;
+  const int w_end = w_in_end < input_w ? w_in_end : input_w;
+  for (int h = h_output; h < h_output_end; ++h) {
+    PoolingVal<P> val;

+    const int h_in_start = -padding_h + h * Stride;
+    for (int i = 0; i < 3; ++i) {
+      for (int w_in = w_start; w_in < w_end; ++w_in) {
+        val += input[(h_in_start + i) * input_w + w_in];
+      }
+    }
+    output[h * output_w + w_output] = val.Value();
+  }
+}
+
+template <PoolingType P, int Stride>
+inline void Pooling3x3NormalRow(const float *input, const int h_output,
+                                const int input_h, const int input_w,
+                                const int padding_h, const int padding_w,
+                                const int output_w, float *output) {
+  const int h_in_start = -padding_h + h_output * Stride;
+  const int h_in_end = h_in_start + 3;
+  const int h_start = h_in_start > 0 ? h_in_start : 0;
+  const int h_end = h_in_end < input_h ? h_in_end : input_h;
+
+  int valid_w_start = (padding_w + Stride - 1) / Stride;
+  int valid_w_end = output_w - valid_w_start;
+
+  float *output_ptr = output + h_output * output_w;
+  // border left
+  POOLING3X3_NORMAL_BORDER(0, valid_w_start)
+  // middle
+  for (int w = valid_w_start; w < valid_w_end; ++w) {
+    PoolingVal<P>

val; + int input_start = -padding_w + w * Stride; + for (int h_in = h_start; h_in < h_end; ++h_in) { + for (int j = 0; j < 3; ++j) { + val += input[h_in * input_w + j + input_start]; + } + } + output_ptr[w] = val.Value(); + } + // border right + POOLING3X3_NORMAL_BORDER(valid_w_end, output_w) +} + +template +struct Pooling3x3 { + inline void operator()(const framework::Tensor &input, + const std::vector &paddings, + framework::Tensor *output) { + const float *input_data = input.data(); + float *output_data = output->mutable_data(); + int input_h = input.dims()[2]; + int input_w = input.dims()[3]; + int output_h = output->dims()[2]; + int output_w = output->dims()[3]; + int padding_h = paddings[0]; + int padding_w = paddings[1]; + int image_size = input_h * input_w; + int out_image_size = output_h * output_w; + int valid_h_start = padding_h; + int valid_h_end = output_h - valid_h_start; + int valid_h = valid_h_end - valid_h_start; + int valid_w_start = padding_w; + int valid_w_end = output_w - valid_w_start; + int valid_w = valid_w_end - valid_w_start; + + #pragma omp parallel for + for (int c = 0; c < output->dims()[1]; ++c) { + const float *input_ptr = input_data + c * image_size; + float *output_ptr = output_data + c * out_image_size; + // top + for (int h = 0; h < valid_h_start; ++h) { + Pooling3x3NormalRow(input_ptr, h, input_h, input_w, padding_h, + padding_w, output_w, output_ptr); + } + // left + for (int w = 0; w < valid_w_start; ++w) { + Pooling3x3ValidCol(input_ptr, valid_h_start, valid_h_end, w, + input_h, input_w, padding_h, padding_w, + output_w, output_ptr); + } + // right + for (int w = valid_w_end; w < output_w; ++w) { + Pooling3x3ValidCol(input_ptr, valid_h_start, valid_h_end, w, + input_h, input_w, padding_h, padding_w, + output_w, output_ptr); + } + // bottom + for (int h = valid_h_end; h < output_h; ++h) { + Pooling3x3NormalRow(input_ptr, h, input_h, input_w, padding_h, + padding_w, output_w, output_ptr); + } + // valid + int output_w_tiles = valid_w / 6; + int output_w_remain = valid_w - output_w_tiles * 6; + for (int h = valid_h_start; h < valid_h_end - 3; h += 4) { + const float *input_ptr0 = input_ptr + (h - padding_h) * input_w; + const float *input_ptr1 = input_ptr0 + input_w; + const float *input_ptr2 = input_ptr1 + input_w; + const float *input_ptr3 = input_ptr2 + input_w; + const float *input_ptr4 = input_ptr3 + input_w; + const float *input_ptr5 = input_ptr4 + input_w; + float *output_ptr0 = output_ptr + h * output_w + valid_w_start; + float *output_ptr1 = output_ptr0 + output_w; + float *output_ptr2 = output_ptr1 + output_w; + float *output_ptr3 = output_ptr2 + output_w; + int remain = output_w_remain; +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + float32x4x2_t x0, x1, x2; + float32x4x2_t y0, y1, y2; + for (int loop = 0; loop < output_w_tiles; ++loop) { + x0.val[0] = vld1q_f32(input_ptr0); + x0.val[1] = vld1q_f32(input_ptr0 + 4); + x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); + x1.val[1] = vextq_f32(x0.val[0], x0.val[1], 1); + x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); + x2.val[1] = vextq_f32(x0.val[0], x0.val[1], 2); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); + x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); + + x0.val[0] = vld1q_f32(input_ptr1); + x0.val[1] = vld1q_f32(input_ptr1 + 4); + x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); + x1.val[1] = vextq_f32(x0.val[0], x0.val[1], 1); + x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); + x2.val[1] = vextq_f32(x0.val[0], x0.val[1], 2); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); + x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); + y1.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y1.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); + y0.val[0] = vPoolPreq_f32

(y1.val[0], y0.val[0]); + y0.val[1] = vPoolPreq_f32

(y1.val[1], y0.val[1]); + + x0.val[0] = vld1q_f32(input_ptr2); + x0.val[1] = vld1q_f32(input_ptr2 + 4); + x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); + x1.val[1] = vextq_f32(x0.val[0], x0.val[1], 1); + x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); + x2.val[1] = vextq_f32(x0.val[0], x0.val[1], 2); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); + x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); + y2.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y2.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); + y1.val[0] = vPoolPreq_f32

(y2.val[0], y1.val[0]); + y1.val[1] = vPoolPreq_f32

(y2.val[1], y1.val[1]); + y0.val[0] = vPoolPreq_f32

(y2.val[0], y0.val[0]); + y0.val[1] = vPoolPreq_f32

(y2.val[1], y0.val[1]); + y0.val[0] = vPoolPostq_f32

(y0.val[0]); + y0.val[1] = vPoolPostq_f32

(y0.val[1]); + vst1q_f32(output_ptr0, y0.val[0]); + vst1_f32(output_ptr0 + 4, vget_low_f32(y0.val[1])); + + x0.val[0] = vld1q_f32(input_ptr3); + x0.val[1] = vld1q_f32(input_ptr3 + 4); + x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); + x1.val[1] = vextq_f32(x0.val[0], x0.val[1], 1); + x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); + x2.val[1] = vextq_f32(x0.val[0], x0.val[1], 2); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); + x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); + y1.val[0] = vPoolPreq_f32

(y0.val[0], y1.val[0]); + y1.val[1] = vPoolPreq_f32

(y0.val[1], y1.val[1]); + y2.val[0] = vPoolPreq_f32

(y0.val[0], y2.val[0]); + y2.val[1] = vPoolPreq_f32

(y0.val[1], y2.val[1]); + y1.val[0] = vPoolPostq_f32

(y1.val[0]); + y1.val[1] = vPoolPostq_f32

(y1.val[1]); + vst1q_f32(output_ptr1, y1.val[0]); + vst1_f32(output_ptr1 + 4, vget_low_f32(y1.val[1])); + + x0.val[0] = vld1q_f32(input_ptr4); + x0.val[1] = vld1q_f32(input_ptr4 + 4); + x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); + x1.val[1] = vextq_f32(x0.val[0], x0.val[1], 1); + x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); + x2.val[1] = vextq_f32(x0.val[0], x0.val[1], 2); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); + x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); + y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); + y2.val[0] = vPoolPreq_f32

(x0.val[0], y2.val[0]); + y2.val[1] = vPoolPreq_f32

(x0.val[1], y2.val[1]); + y2.val[0] = vPoolPostq_f32

(y2.val[0]); + y2.val[1] = vPoolPostq_f32

(y2.val[1]); + vst1q_f32(output_ptr2, y2.val[0]); + vst1_f32(output_ptr2 + 4, vget_low_f32(y2.val[1])); + + x0.val[0] = vld1q_f32(input_ptr5); + x0.val[1] = vld1q_f32(input_ptr5 + 4); + x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); + x1.val[1] = vextq_f32(x0.val[0], x0.val[1], 1); + x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); + x2.val[1] = vextq_f32(x0.val[0], x0.val[1], 2); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); + x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); + y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); + y0.val[0] = vPoolPostq_f32

(y0.val[0]); + y0.val[1] = vPoolPostq_f32

(y0.val[1]); + vst1q_f32(output_ptr3, y0.val[0]); + vst1_f32(output_ptr3 + 4, vget_low_f32(y0.val[1])); + + input_ptr0 += 6; + input_ptr1 += 6; + input_ptr2 += 6; + input_ptr3 += 6; + input_ptr4 += 6; + input_ptr5 += 6; + output_ptr0 += 6; + output_ptr1 += 6; + output_ptr2 += 6; + output_ptr3 += 6; + } + // remain w + if (remain >= 4) { + x0.val[0] = vld1q_f32(input_ptr0); + x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); + x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + + x0.val[0] = vld1q_f32(input_ptr1); + x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); + x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); + y1.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y0.val[0] = vPoolPreq_f32

(y1.val[0], y0.val[0]); + + x0.val[0] = vld1q_f32(input_ptr2); + x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); + x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); + y2.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y1.val[0] = vPoolPreq_f32

(y2.val[0], y1.val[0]); + y0.val[0] = vPoolPreq_f32

(y2.val[0], y0.val[0]); + y0.val[0] = vPoolPostq_f32

(y0.val[0]); + vst1q_f32(output_ptr0, y0.val[0]); + + x0.val[0] = vld1q_f32(input_ptr3); + x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); + x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y1.val[0] = vPoolPreq_f32

(y0.val[0], y1.val[0]); + y2.val[0] = vPoolPreq_f32

(y0.val[0], y2.val[0]); + y1.val[0] = vPoolPostq_f32

(y1.val[0]); + vst1q_f32(output_ptr1, y1.val[0]); + + x0.val[0] = vld1q_f32(input_ptr4); + x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); + x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); + y2.val[0] = vPoolPreq_f32

(x0.val[0], y2.val[0]); + y2.val[0] = vPoolPostq_f32

(y2.val[0]); + vst1q_f32(output_ptr2, y2.val[0]); + + x0.val[0] = vld1q_f32(input_ptr5); + x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); + x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); + y0.val[0] = vPoolPostq_f32

(y0.val[0]); + vst1q_f32(output_ptr3, y0.val[0]); + + input_ptr0 += 4; + input_ptr1 += 4; + input_ptr2 += 4; + input_ptr3 += 4; + input_ptr4 += 4; + input_ptr5 += 4; + output_ptr0 += 4; + output_ptr1 += 4; + output_ptr2 += 4; + output_ptr3 += 4; + remain -= 4; + } +#endif // __ARM_NEON__ + for (int r = 0; r < remain; ++r) { + float m0 = PoolPre

(input_ptr0[r], input_ptr0[r + 1]); + m0 = PoolPre

(m0, input_ptr0[r + 2]); + float m1 = PoolPre

(input_ptr1[r], input_ptr1[r + 1]); + m1 = PoolPre

(m1, input_ptr1[r + 2]); + float m2 = PoolPre

(input_ptr2[r], input_ptr2[r + 1]); + m2 = PoolPre

(m2, input_ptr2[r + 2]); + float m3 = PoolPre

(input_ptr3[r], input_ptr3[r + 1]); + m3 = PoolPre

(m3, input_ptr3[r + 2]); + float m4 = PoolPre

(input_ptr4[r], input_ptr4[r + 1]); + m4 = PoolPre

(m4, input_ptr4[r + 2]); + float m5 = PoolPre

(input_ptr5[r], input_ptr5[r + 1]); + m5 = PoolPre

(m5, input_ptr5[r + 2]); + + m0 = PoolPre

(PoolPre

(m0, m1), m2); + m1 = PoolPre

(PoolPre

(m1, m2), m3); + m2 = PoolPre

(PoolPre

(m2, m3), m4); + m3 = PoolPre

(PoolPre

(m3, m4), m5); + output_ptr0[r] = PoolPost

(m0); + output_ptr1[r] = PoolPost

(m1); + output_ptr2[r] = PoolPost

(m2); + output_ptr3[r] = PoolPost

(m3); + } + } + // remain h + int start_h = valid_h_start + (valid_h & 0xFFFC); + for (int h = start_h; h < valid_h_end; ++h) { + const float *input_ptr0 = input_ptr + (h - padding_h) * input_w; + const float *input_ptr1 = input_ptr0 + input_w; + const float *input_ptr2 = input_ptr1 + input_w; + float *output_ptr0 = output_ptr + h * output_w + valid_w_start; + int remain = output_w_remain; +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + float32x4x2_t x0, x1, x2, y0; + for (int loop = 0; loop < output_w_tiles; ++loop) { + x0.val[0] = vld1q_f32(input_ptr0); + x0.val[1] = vld1q_f32(input_ptr0 + 4); + x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); + x1.val[1] = vextq_f32(x0.val[0], x0.val[1], 1); + x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); + x2.val[1] = vextq_f32(x0.val[0], x0.val[1], 2); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); + x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); + + x0.val[0] = vld1q_f32(input_ptr1); + x0.val[1] = vld1q_f32(input_ptr1 + 4); + x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); + x1.val[1] = vextq_f32(x0.val[0], x0.val[1], 1); + x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); + x2.val[1] = vextq_f32(x0.val[0], x0.val[1], 2); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); + x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); + y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); + + x0.val[0] = vld1q_f32(input_ptr2); + x0.val[1] = vld1q_f32(input_ptr2 + 4); + x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); + x1.val[1] = vextq_f32(x0.val[0], x0.val[1], 1); + x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); + x2.val[1] = vextq_f32(x0.val[0], x0.val[1], 2); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); + x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); + y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); + y0.val[0] = vPoolPostq_f32

(y0.val[0]); + y0.val[1] = vPoolPostq_f32

(y0.val[1]); + vst1q_f32(output_ptr0, y0.val[0]); + vst1_f32(output_ptr0 + 4, vget_low_f32(y0.val[1])); + + input_ptr0 += 6; + input_ptr1 += 6; + input_ptr2 += 6; + output_ptr0 += 6; + } + // remain w + if (remain >= 4) { + x0.val[0] = vld1q_f32(input_ptr0); + x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); + x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + + x0.val[0] = vld1q_f32(input_ptr1); + x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); + x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); + + x0.val[0] = vld1q_f32(input_ptr2); + x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); + x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); + y0.val[0] = vPoolPostq_f32

(y0.val[0]); + vst1q_f32(output_ptr0, y0.val[0]); + + input_ptr0 += 4; + input_ptr1 += 4; + input_ptr2 += 4; + output_ptr0 += 4; + remain -= 4; + } +#endif // __ARM_NEON__ + for (int r = 0; r < remain; ++r) { + float m0 = PoolPre

(input_ptr0[r], input_ptr0[r + 1]); + m0 = PoolPre

(m0, input_ptr0[r + 2]); + float m1 = PoolPre

(input_ptr1[r], input_ptr1[r + 1]); + m1 = PoolPre

(m1, input_ptr1[r + 2]); + float m2 = PoolPre

(input_ptr2[r], input_ptr2[r + 1]); + m2 = PoolPre

(m2, input_ptr2[r + 2]); + + m0 = PoolPre

(PoolPre

(m0, m1), m2); + output_ptr0[r] = PoolPost

(m0); + } + } + } + } +}; + +template +struct Pooling3x3 { + inline void operator()(const framework::Tensor &input, + const std::vector &paddings, + framework::Tensor *output) { + const float *input_data = input.data(); + float *output_data = output->mutable_data(); + int input_h = input.dims()[2]; + int input_w = input.dims()[3]; + int output_h = output->dims()[2]; + int output_w = output->dims()[3]; + int padding_h = paddings[0]; + int padding_w = paddings[1]; + int image_size = input_h * input_w; + int out_image_size = output_h * output_w; + int valid_h_start = (padding_h + 1) / 2; + int valid_h_end = output_h - valid_h_start; + int valid_h = valid_h_end - valid_h_start; + int valid_w_start = (padding_w + 1) / 2; + int valid_w_end = output_w - valid_w_start; + int valid_w = valid_w_end - valid_w_start; + + #pragma omp parallel for + for (int c = 0; c < output->dims()[1]; ++c) { + const float *input_ptr = input_data + c * image_size; + float *output_ptr = output_data + c * out_image_size; + // top + for (int h = 0; h < valid_h_start; ++h) { + Pooling3x3NormalRow(input_ptr, h, input_h, input_w, padding_h, + padding_w, output_w, output_ptr); + } + // left + for (int w = 0; w < valid_w_start; ++w) { + Pooling3x3ValidCol(input_ptr, valid_h_start, valid_h_end, w, + input_h, input_w, padding_h, padding_w, + output_w, output_ptr); + } + // right + for (int w = valid_w_end; w < output_w; ++w) { + Pooling3x3ValidCol(input_ptr, valid_h_start, valid_h_end, w, + input_h, input_w, padding_h, padding_w, + output_w, output_ptr); + } + // bottom + for (int h = valid_h_end; h < output_h; ++h) { + Pooling3x3NormalRow(input_ptr, h, input_h, input_w, padding_h, + padding_w, output_w, output_ptr); + } + // valid + int input_w_start = 2 * valid_w_start - padding_w; + int output_w_tiles = valid_w / 6; + int output_w_remain = valid_w - output_w_tiles * 6; + for (int h = valid_h_start; h < valid_h_end - 2; h += 3) { + size_t offset = (2 * h - padding_h) * input_w + input_w_start; + const float *input_ptr0 = input_ptr + offset; + const float *input_ptr1 = input_ptr0 + input_w; + const float *input_ptr2 = input_ptr1 + input_w; + const float *input_ptr3 = input_ptr2 + input_w; + const float *input_ptr4 = input_ptr3 + input_w; + const float *input_ptr5 = input_ptr4 + input_w; + const float *input_ptr6 = input_ptr5 + input_w; + float *output_ptr0 = output_ptr + h * output_w + valid_w_start; + float *output_ptr1 = output_ptr0 + output_w; + float *output_ptr2 = output_ptr1 + output_w; + int remain = output_w_remain; +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + float32x4x2_t x0, x1, x2; + float32x4x2_t y0, y1, y2; + for (int loop = 0; loop < output_w_tiles; ++loop) { + x0 = vld2q_f32(input_ptr0); + x1 = vld2q_f32(input_ptr0 + 8); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); + + x0 = vld2q_f32(input_ptr1); + x1 = vld2q_f32(input_ptr1 + 8); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); + y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); + + x0 = vld2q_f32(input_ptr2); + x1 = vld2q_f32(input_ptr2 + 8); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); + y1.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y1.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); + y0.val[0] = vPoolPreq_f32

(y1.val[0], y0.val[0]); + y0.val[1] = vPoolPreq_f32

(y1.val[1], y0.val[1]); + y0.val[0] = vPoolPostq_f32

(y0.val[0]); + y0.val[1] = vPoolPostq_f32

(y0.val[1]); + vst1q_f32(output_ptr0, y0.val[0]); + vst1_f32(output_ptr0 + 4, vget_low_f32(y0.val[1])); + + x0 = vld2q_f32(input_ptr3); + x1 = vld2q_f32(input_ptr3 + 8); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); + y1.val[0] = vPoolPreq_f32

(x0.val[0], y1.val[0]); + y1.val[1] = vPoolPreq_f32

(x0.val[1], y1.val[1]); + + x0 = vld2q_f32(input_ptr4); + x1 = vld2q_f32(input_ptr4 + 8); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); + y1.val[0] = vPoolPreq_f32

(y0.val[0], y1.val[0]); + y1.val[1] = vPoolPreq_f32

(y0.val[1], y1.val[1]); + y1.val[0] = vPoolPostq_f32

(y1.val[0]); + y1.val[1] = vPoolPostq_f32

(y1.val[1]); + vst1q_f32(output_ptr1, y1.val[0]); + vst1_f32(output_ptr1 + 4, vget_low_f32(y1.val[1])); + + x0 = vld2q_f32(input_ptr5); + x1 = vld2q_f32(input_ptr5 + 8); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); + y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); + + x0 = vld2q_f32(input_ptr6); + x1 = vld2q_f32(input_ptr6 + 8); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); + y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); + y0.val[0] = vPoolPostq_f32

(y0.val[0]); + y0.val[1] = vPoolPostq_f32

(y0.val[1]); + vst1q_f32(output_ptr2, y0.val[0]); + vst1_f32(output_ptr2 + 4, vget_low_f32(y0.val[1])); + + input_ptr0 += 12; + input_ptr1 += 12; + input_ptr2 += 12; + input_ptr3 += 12; + input_ptr4 += 12; + input_ptr5 += 12; + output_ptr0 += 6; + output_ptr1 += 6; + output_ptr2 += 6; + } + // remain w + if (remain >= 4) { + x0 = vld2q_f32(input_ptr0); + x1.val[0] = vdupq_n_f32(input_ptr0[8]); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + + x0 = vld2q_f32(input_ptr1); + x1.val[0] = vdupq_n_f32(input_ptr1[8]); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); + + x0 = vld2q_f32(input_ptr2); + x1.val[0] = vdupq_n_f32(input_ptr2[8]); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + y1.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y0.val[0] = vPoolPreq_f32

(y1.val[0], y0.val[0]); + y0.val[0] = vPoolPostq_f32

(y0.val[0]); + vst1q_f32(output_ptr0, y0.val[0]); + + x0 = vld2q_f32(input_ptr3); + x1.val[0] = vdupq_n_f32(input_ptr3[8]); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y1.val[0] = vPoolPreq_f32

(x0.val[0], y1.val[0]); + + x0 = vld2q_f32(input_ptr4); + x1.val[0] = vdupq_n_f32(input_ptr4[8]); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y1.val[0] = vPoolPreq_f32

(y0.val[0], y1.val[0]); + y1.val[0] = vPoolPostq_f32

(y1.val[0]); + vst1q_f32(output_ptr1, y1.val[0]); + + x0 = vld2q_f32(input_ptr5); + x1.val[0] = vdupq_n_f32(input_ptr5[8]); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); + + x0 = vld2q_f32(input_ptr6); + x1.val[0] = vdupq_n_f32(input_ptr6[8]); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); + y0.val[0] = vPoolPostq_f32

(y0.val[0]); + vst1q_f32(output_ptr2, y0.val[0]); + + input_ptr0 += 8; + input_ptr1 += 8; + input_ptr2 += 8; + input_ptr3 += 8; + input_ptr4 += 8; + input_ptr5 += 8; + output_ptr0 += 4; + output_ptr1 += 4; + output_ptr2 += 4; + remain -= 4; + } +#endif // __ARM_NEON__ + for (int r = 0; r < remain; ++r) { + float m0 = PoolPre

(input_ptr0[2 * r], input_ptr0[2 * r + 1]); + m0 = PoolPre

(m0, input_ptr0[2 * r + 2]); + float m1 = PoolPre

(input_ptr1[2 * r], input_ptr1[2 * r + 1]); + m1 = PoolPre

(m1, input_ptr1[2 * r + 2]); + float m2 = PoolPre

(input_ptr2[2 * r], input_ptr2[2 * r + 1]); + m2 = PoolPre

(m2, input_ptr2[2 * r + 2]); + float m3 = PoolPre

(input_ptr3[2 * r], input_ptr3[2 * r + 1]); + m3 = PoolPre

(m3, input_ptr3[2 * r + 2]); + float m4 = PoolPre

(input_ptr4[2 * r], input_ptr4[2 * r + 1]); + m4 = PoolPre

(m4, input_ptr4[2 * r + 2]); + float m5 = PoolPre

(input_ptr5[2 * r], input_ptr5[2 * r + 1]); + m5 = PoolPre

(m5, input_ptr5[2 * r + 2]); + float m6 = PoolPre

(input_ptr6[2 * r], input_ptr6[2 * r + 1]); + m6 = PoolPre

(m6, input_ptr6[2 * r + 2]); + + m0 = PoolPre

(PoolPre

(m0, m1), m2); + m1 = PoolPre

(PoolPre

(m2, m3), m4); + m2 = PoolPre

(PoolPre

(m4, m5), m6); + output_ptr0[r] = PoolPost

(m0); + output_ptr1[r] = PoolPost

(m1); + output_ptr2[r] = PoolPost

(m2); + } + } + // remain h + int start_h = valid_h_start + valid_h / 3 * 3; + for (int h = start_h; h < valid_h_end; ++h) { + size_t offset = (2 * h - padding_h) * input_w + input_w_start; + const float *input_ptr0 = input_ptr + offset; + const float *input_ptr1 = input_ptr0 + input_w; + const float *input_ptr2 = input_ptr1 + input_w; + float *output_ptr0 = output_ptr + h * output_w + valid_w_start; + int remain = output_w_remain; +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + float32x4x2_t x0, x1, x2, y0; + for (int loop = 0; loop < output_w_tiles; ++loop) { + x0 = vld2q_f32(input_ptr0); + x1 = vld2q_f32(input_ptr0 + 8); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); + + x0 = vld2q_f32(input_ptr1); + x1 = vld2q_f32(input_ptr1 + 8); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); + y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); + + x0 = vld2q_f32(input_ptr2); + x1 = vld2q_f32(input_ptr2 + 8); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); + y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); + y0.val[0] = vPoolPostq_f32

(y0.val[0]); + y0.val[1] = vPoolPostq_f32

(y0.val[1]); + vst1q_f32(output_ptr0, y0.val[0]); + vst1_f32(output_ptr0 + 4, vget_low_f32(y0.val[1])); + + input_ptr0 += 12; + input_ptr1 += 12; + input_ptr2 += 12; + output_ptr0 += 6; + } + // remain w + if (remain >= 4) { + x0 = vld2q_f32(input_ptr0); + x1.val[0] = vdupq_n_f32(input_ptr0[8]); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + + x0 = vld2q_f32(input_ptr1); + x1.val[0] = vdupq_n_f32(input_ptr1[8]); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); + + x0 = vld2q_f32(input_ptr2); + x1.val[0] = vdupq_n_f32(input_ptr2[8]); + x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); + x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); + y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); + y0.val[0] = vPoolPostq_f32

(y0.val[0]); + vst1q_f32(output_ptr0, y0.val[0]); + + input_ptr0 += 8; + input_ptr1 += 8; + input_ptr2 += 8; + output_ptr0 += 4; + remain -= 4; + } +#endif // __ARM_NEON__ + for (int r = 0; r < remain; ++r) { + float m0 = PoolPre

(input_ptr0[2 * r], input_ptr0[2 * r + 1]); + m0 = PoolPre

(m0, input_ptr0[2 * r + 2]); + float m1 = PoolPre

(input_ptr1[2 * r], input_ptr1[2 * r + 1]); + m1 = PoolPre

(m1, input_ptr1[2 * r + 2]); + float m2 = PoolPre

(input_ptr2[2 * r], input_ptr2[2 * r + 1]); + m2 = PoolPre

(m2, input_ptr2[2 * r + 2]); + + m0 = PoolPre

(PoolPre

(m0, m1), m2); + output_ptr0[r] = PoolPost

(m0); + } + } + } + } +}; + +template struct Pooling3x3; +template struct Pooling3x3; +template struct Pooling3x3; +template struct Pooling3x3; + +} // namespace math +} // namespace operators +} // namespace paddle_mobile + +#endif // POOL_OP diff --git a/test/operators/test_pool_op.cpp b/test/operators/test_pool_op.cpp index 5784ac0654..ae5ff9d3f7 100644 --- a/test/operators/test_pool_op.cpp +++ b/test/operators/test_pool_op.cpp @@ -14,10 +14,13 @@ limitations under the License. */ #include #include "../test_include.h" -#include "operators/kernel/central-arm-func/pool_arm_func.h" +#include "operators/math/pooling.h" #include "operators/pool_op.h" namespace paddle_mobile { + +namespace math = operators::math; + static int PoolOutputSize(int input_size, int filter_size, int padding, int stride, bool ceil_mode) { int output_size; @@ -30,70 +33,6 @@ static int PoolOutputSize(int input_size, int filter_size, int padding, return output_size; } -template -static void PoolAvgPad0(std::vector ksize, std::vector strides, - const framework::Tensor *input, - framework::Tensor *out) { - const int32_t batch_size = input->dims()[0]; - const int32_t input_c = input->dims()[1]; - const int32_t input_h = input->dims()[2]; - const int32_t input_w = input->dims()[3]; - const int32_t out_c = out->dims()[1]; - const int32_t out_h = out->dims()[2]; - const int32_t out_w = out->dims()[3]; - const int32_t kernel_h = ksize[0]; - const int32_t kernel_w = ksize[1]; - const int32_t stride_h = strides[0]; - const int32_t stride_w = strides[1]; - const int32_t inputdata_channel_stride = input_h * input_w; - const int32_t input_batch_stride = input_c * inputdata_channel_stride; - const int32_t outputdata_channel_stride = out_h * out_w; - const int32_t output_batch_stride = out_c * outputdata_channel_stride; - T *out_data = out->mutable_data(); - const T *input_data = input->data(); - const T **rows = new const T *[kernel_h]; - for (int i = 0; i < batch_size; ++i) { - for (int j = 0; j < out_c; ++j) { - const T *img_in = input_data + j * inputdata_channel_stride; - T *img_out = out_data + j * outputdata_channel_stride; - for (int k = 0; k < out_h; ++k) { - for (int m = 0; m < kernel_h; ++m) { - rows[m] = img_in + (stride_h * k + m) * input_w; - } - int32_t left = out_w; - while (left > 0) { - float tmp = 0; - for (int m = 0; m < kernel_h; ++m) { - for (int l = 0; l < kernel_w; ++l) { - tmp += rows[m][l]; - } - } - if (typeid(T) == typeid(int8_t)) { - tmp = tmp / (kernel_h * kernel_w); - if (tmp < -127) { - *img_out = -127; - } else if (tmp > 127) { - *img_out = 127; - } else { - *img_out = static_cast(std::round(tmp)); - } - } else { - *img_out = static_cast(tmp / (kernel_h * kernel_w)); - } - for (int m = 0; m < kernel_h; ++m) { - rows[m] += stride_w; - } - img_out++; - left--; - } - } - } - input_data += input_batch_stride; - out_data += output_batch_stride; - } - delete[] rows; -} - template int TestPoolOp(int in_channels, int in_height, int in_width) { @@ -149,41 +88,27 @@ int TestPoolOp(int in_channels, int in_height, int in_width) { framework::Tensor output_cmp; output_cmp.mutable_data(output_shape); - if (pooling_type == "avg" && pad_h == 0 && pad_h == pad_w) { - PoolAvgPad0(std::vector{kernel_h, kernel_w}, - std::vector{stride_h, stride_w}, input, &output_cmp); + + if (pooling_type == "avg") { + math::Pooling()(*input, std::vector{kernel_h, kernel_w}, + std::vector{stride_h, stride_w}, + std::vector{pad_h, pad_w}, &output_cmp); } else { - if (typeid(T) == typeid(int8_t)) { - operators::PoolBasic( - pooling_type, 
std::vector{kernel_h, kernel_w}, - std::vector{stride_h, stride_w}, std::vector{pad_h, pad_w}, - input, &output_cmp); - } else { - operators::PoolBasic( - pooling_type, std::vector{kernel_h, kernel_w}, - std::vector{stride_h, stride_w}, std::vector{pad_h, pad_w}, - input, &output_cmp); - } + math::Pooling()(*input, std::vector{kernel_h, kernel_w}, + std::vector{stride_h, stride_w}, + std::vector{pad_h, pad_w}, &output_cmp); } // compare results - int eq = 0; - int neq = 0; auto output = output_var->template Get(); const T *output_data = output->data(); T *output_cmp_data = output_cmp.data(); for (int i = 0; i < output->numel(); ++i) { PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i], - "The execution of test_pool_op is failed!"); - if (output_data[i] == output_cmp_data[i]) { - ++eq; - } else { - ++neq; - } + "output[%d] = %d, output_cmp[%d] = %d", i, + output_data[i], i, output_cmp_data[i]); } - std::cout << "eq = " << eq << ", neq = " << neq << std::endl; delete op; - return 0; } } // namespace paddle_mobile @@ -202,7 +127,6 @@ int main(int argc, char *argv[]) { int in_channels = atoi(argv[1]); int in_height = atoi(argv[2]); int in_width = atoi(argv[3]); -#if __ARM_NEON // kernel = 3, pad = 1, stride = 1 LOG(paddle_mobile::kLOG_INFO) << "float, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=1"; @@ -213,67 +137,16 @@ int main(int argc, char *argv[]) { << "float, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=2"; paddle_mobile::TestPoolOp(in_channels, in_height, in_width); -#endif - // kernel = 3, pad = 0, stride = 1 - LOG(paddle_mobile::kLOG_INFO) - << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=1"; - paddle_mobile::TestPoolOp(in_channels, in_height, - in_width); - // kernel = 3, pad = 1, stride = 1 - LOG(paddle_mobile::kLOG_INFO) - << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=1"; - paddle_mobile::TestPoolOp(in_channels, in_height, - in_width); - // kernel = 3, pad = 2, stride = 1 - LOG(paddle_mobile::kLOG_INFO) - << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=2, stride=1"; - paddle_mobile::TestPoolOp(in_channels, in_height, - in_width); - // kernel = 3, pad = 0, stride = 2 - LOG(paddle_mobile::kLOG_INFO) - << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=2"; - paddle_mobile::TestPoolOp(in_channels, in_height, - in_width); - // kernel = 3, pad = 1, stride = 2 - LOG(paddle_mobile::kLOG_INFO) - << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=2"; - paddle_mobile::TestPoolOp(in_channels, in_height, - in_width); - // kernel = 3, pad = 0, stride = 2 - LOG(paddle_mobile::kLOG_INFO) - << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=2, stride=2"; - paddle_mobile::TestPoolOp(in_channels, in_height, - in_width); - // kernel = 3, pad = 3, stride = 3 - LOG(paddle_mobile::kLOG_INFO) - << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=3, stride=3"; - paddle_mobile::TestPoolOp(in_channels, in_height, - in_width); - // kernel = 7, pad = 0, stride = 1 - LOG(paddle_mobile::kLOG_INFO) - << "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=1"; - paddle_mobile::TestPoolOp(in_channels, in_height, - in_width); - // kernel = 7, pad = 0, stride = 2 - LOG(paddle_mobile::kLOG_INFO) - << "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=2"; - paddle_mobile::TestPoolOp(in_channels, in_height, - in_width); - // kernel = 7, pad = 0, stride = 3 - LOG(paddle_mobile::kLOG_INFO) - << "int8_t, ceil_mode=false, 
pooling_type=avg, kernel=7, pad=0, stride=3"; - paddle_mobile::TestPoolOp(in_channels, in_height, - in_width); - // kernel = 3, pad = 0, stride = 1 + // kernel = 5, pad = 0, stride = 1 LOG(paddle_mobile::kLOG_INFO) - << "int8_t, ceil_mode=false, pooling_type=avg, kernel=3, pad=0, stride=1"; - paddle_mobile::TestPoolOp(in_channels, in_height, - in_width); - // kernel = 3, pad = 0, stride = 3 + << "float, ceil_mode=false, pooling_type=avg, kernel=5, pad=0, stride=1"; + paddle_mobile::TestPoolOp(in_channels, in_height, + in_width); + // kernel = 5, pad = 0, stride = 2 LOG(paddle_mobile::kLOG_INFO) - << "int8_t, ceil_mode=false, pooling_type=avg, kernel=3, pad=0, stride=3"; - paddle_mobile::TestPoolOp(in_channels, in_height, - in_width); + << "float, ceil_mode=false, pooling_type=avg, kernel=5, pad=0, stride=1"; + paddle_mobile::TestPoolOp(in_channels, in_height, + in_width); // kernel = 7, pad = 0, stride = 1 LOG(paddle_mobile::kLOG_INFO) << "float, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=1"; @@ -284,9 +157,4 @@ int main(int argc, char *argv[]) { << "float, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=4"; paddle_mobile::TestPoolOp(in_channels, in_height, in_width); - // kernel = 5, pad = 0, stride = 1 - LOG(paddle_mobile::kLOG_INFO) - << "float, ceil_mode=false, pooling_type=avg, kernel=5, pad=0, stride=1"; - paddle_mobile::TestPoolOp(in_channels, in_height, - in_width); } -- GitLab