diff --git a/mace/kernels/neon/avg_pooling_neon_2x2.cc b/mace/kernels/neon/avg_pooling_neon_2x2.cc
new file mode 100644
index 0000000000000000000000000000000000000000..586e3f4af990da52132c2f535512132657489dd9
--- /dev/null
+++ b/mace/kernels/neon/avg_pooling_neon_2x2.cc
@@ -0,0 +1,177 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include <arm_neon.h>
+#include <float.h>
+#include <limits>
+
+#include "mace/core/common.h"
+
+namespace mace {
+namespace kernels {
+
+void PoolingAvgNeonK2x2S2x2(const float *input,
+                            const index_t *in_shape,
+                            float *output,
+                            const index_t *out_shape,
+                            const int *paddings) {
+  index_t batch = in_shape[0];
+  index_t channels = in_shape[1];
+  index_t in_height = in_shape[2];
+  index_t in_width = in_shape[3];
+
+  index_t out_height = out_shape[2];
+  index_t out_width = out_shape[3];
+
+  int padding_top = paddings[0] / 2;
+  int padding_bottom = paddings[0] - padding_top;
+  int padding_left = paddings[1] / 2;
+  int padding_right = paddings[1] - padding_left;
+
+  int in_image_size = in_height * in_width;
+  int out_image_size = out_height * out_width;
+  float avg_factors[4] = {0.25, 0.25, 0.25, 0.25};
+
+#pragma omp parallel for collapse(2)
+  for (int b = 0; b < batch; ++b) {
+    for (int c = 0; c < channels; ++c) {
+      // Derive the offsets from (b, c): the collapsed OpenMP loops must
+      // not share accumulating state across iterations.
+      index_t input_offset = (b * channels + c) * in_image_size;
+      index_t output_offset = (b * channels + c) * out_image_size;
+      float *outptr = output + output_offset;
+      const float *r0, *r1;
+
+      for (int h = 0; h < out_height; ++h) {
+        int w = 0;
+        int num_vectors = 0;
+        if (!((h == 0 && padding_top > 0) ||
+              (h == out_height - 1 && padding_bottom > 0))) {
+          r0 = input + input_offset + (h * 2 - padding_top) * in_width;
+          r1 = r0 + in_width;
+          if (padding_left > 0) {
+            *outptr = (r0[0] + r1[0]) * 0.25;
+            ++r0;
+            ++r1;
+            ++outptr;
+            ++w;
+          }
+          if (padding_right > 0) {
+            num_vectors = (out_width - w - 1) >> 2;
+          } else {
+            num_vectors = (out_width - w) >> 2;
+          }
+        }
+
+        w += num_vectors << 2;
+        float32x4_t factors = vld1q_f32(avg_factors);
+        for (; num_vectors > 0; --num_vectors) {
+          float32x4_t r00 = vld1q_f32(r0);
+          float32x4_t r10 = vld1q_f32(r1);
+          float32x4_t r01 = vld1q_f32(r0 + 4);
+          float32x4_t r11 = vld1q_f32(r1 + 4);
+
+          float32x4_t sum0 = vaddq_f32(r00, r10);
+          float32x4_t sum1 = vaddq_f32(r01, r11);
+
+          // The pairwise add folds each 2x2 window sum into one lane.
+          float32x4_t sum_result = vpaddq_f32(sum0, sum1);
+          float32x4_t avg_result = vmulq_f32(sum_result, factors);
+
+          vst1q_f32(outptr, avg_result);
+
+          r0 += 8;
+          r1 += 8;
+          outptr += 4;
+        }
+
+        for (; w < out_width; ++w) {
+          float sum = 0.0;
+          for (int kh = 0; kh < 2; ++kh) {
+            for (int kw = 0; kw < 2; ++kw) {
+              int inh = h * 2 - padding_top + kh;
+              int inw = w * 2 - padding_left + kw;
+              if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
+                sum += input[input_offset + inh * in_width + inw];
+              }
+            }
+          }
+
+          *outptr = sum * 0.25;
+          ++outptr;
+        }
+      }
+    }
+  }
+}
+
+// Assumes the input has already been padded.
+void PoolingAvgNeonK2x2S2x2Padded(const float *input,
+                                  const index_t *in_shape,
+                                  float *output,
+                                  const index_t *out_shape) {
+  index_t batch = in_shape[0];
+  index_t channels = in_shape[1];
+  index_t in_height = in_shape[2];
+  index_t in_width = in_shape[3];
+
+  index_t out_height = out_shape[2];
+  index_t out_width = out_shape[3];
+
+  int in_image_size = in_height * in_width;
+  int out_image_size = out_height * out_width;
+  float avg_factors[4] = {0.25, 0.25, 0.25, 0.25};
+
+#pragma omp parallel for collapse(2)
+  for (int b = 0; b < batch; ++b) {
+    for (int c = 0; c < channels; ++c) {
+      index_t input_offset = (b * channels + c) * in_image_size;
+      index_t output_offset = (b * channels + c) * out_image_size;
+      const float *img0 = input + input_offset;
+      float *outptr = output + output_offset;
+
+      const float *r0 = img0;
+      const float *r1 = img0 + in_width;
+
+      for (int h = 0; h < out_height; ++h) {
+        int num_vectors = out_width >> 2;
+        int remain = out_width - (num_vectors << 2);
+
+        float32x4_t factors = vld1q_f32(avg_factors);
+        for (; num_vectors > 0; --num_vectors) {
+          float32x4_t r00 = vld1q_f32(r0);
+          float32x4_t r10 = vld1q_f32(r1);
+          float32x4_t r01 = vld1q_f32(r0 + 4);
+          float32x4_t r11 = vld1q_f32(r1 + 4);
+
+          float32x4_t sum0 = vaddq_f32(r00, r10);
+          float32x4_t sum1 = vaddq_f32(r01, r11);
+
+          float32x4_t sum_result = vpaddq_f32(sum0, sum1);
+          float32x4_t avg_result = vmulq_f32(sum_result, factors);
+
+          vst1q_f32(outptr, avg_result);
+
+          r0 += 8;
+          r1 += 8;
+          outptr += 4;
+        }
+
+        for (; remain > 0; --remain) {
+          *outptr = (r0[0] + r0[1] + r1[0] + r1[1]) * 0.25;
+
+          r0 += 2;
+          r1 += 2;
+          outptr++;
+        }
+        // Skip the second input row consumed by this output row.
+        r0 += in_width;
+        r1 += in_width;
+      }
+    }
+  }
+}
+
+} // namespace kernels
+} // namespace mace
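For reference, the vpaddq_f32 inner loop above produces four outputs per iteration: the two vaddq_f32 calls form vertical column sums over the two input rows, and the pairwise add then folds adjacent columns, so each lane holds a complete 2x2 window sum before scaling by 0.25. A scalar sketch of the same per-row computation (AvgPool2x2S2RowRef is a hypothetical helper for illustration, not part of this patch):

// Scalar equivalent of the NEON loop of the 2x2/stride-2 kernel:
// each output is the mean of one 2x2 input window.
static inline void AvgPool2x2S2RowRef(const float *r0, const float *r1,
                                      float *out, int out_width) {
  for (int i = 0; i < out_width; ++i) {
    out[i] = (r0[2 * i] + r0[2 * i + 1] +
              r1[2 * i] + r1[2 * i + 1]) * 0.25f;
  }
}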
diff --git a/mace/kernels/neon/avg_pooling_neon_3x3.cc b/mace/kernels/neon/avg_pooling_neon_3x3.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3c977f59daebf4a1e2a152439e4d6d8508be7a60
--- /dev/null
+++ b/mace/kernels/neon/avg_pooling_neon_3x3.cc
@@ -0,0 +1,223 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include <arm_neon.h>
+#include <float.h>
+#include <limits>
+
+#include "mace/core/common.h"
+
+namespace mace {
+namespace kernels {
+
+void PoolingAvgNeonK3x3S2x2(const float *input,
+                            const index_t *in_shape,
+                            float *output,
+                            const index_t *out_shape,
+                            const int *paddings) {
+  index_t batch = in_shape[0];
+  index_t channels = in_shape[1];
+  index_t in_height = in_shape[2];
+  index_t in_width = in_shape[3];
+
+  index_t out_height = out_shape[2];
+  index_t out_width = out_shape[3];
+
+  int padding_top = paddings[0] / 2;
+  int padding_bottom = paddings[0] - padding_top;
+  int padding_left = paddings[1] / 2;
+  int padding_right = paddings[1] - padding_left;
+
+  int in_image_size = in_height * in_width;
+  int out_image_size = out_height * out_width;
+  float avg_factors[4] = {1.0 / 9.0, 1.0 / 9.0, 1.0 / 9.0, 1.0 / 9.0};
+
+#pragma omp parallel for collapse(2)
+  for (int b = 0; b < batch; ++b) {
+    for (int c = 0; c < channels; ++c) {
+      // Derive the offsets from (b, c): the collapsed OpenMP loops must
+      // not share accumulating state across iterations.
+      index_t input_offset = (b * channels + c) * in_image_size;
+      index_t output_offset = (b * channels + c) * out_image_size;
+      float *outptr = output + output_offset;
+
+      for (int h = 0; h < out_height; ++h) {
+        int w = 0;
+        int num_vectors = 0;
+        const float *r0, *r1, *r2;
+        if (!((h == 0 && padding_top > 0) ||
+              (h == out_height - 1 && padding_bottom > 0))) {
+          r0 = input + input_offset + (h * 2 - padding_top) * in_width;
+          r1 = r0 + in_width;
+          r2 = r1 + in_width;
+
+          if (padding_left > 0) {
+            if (padding_left == 1) {
+              *outptr = (r0[0] + r0[1] + r1[0] + r1[1] + r2[0] + r2[1]) / 9.0;
+              ++r0;
+              ++r1;
+              ++r2;
+            } else {  // padding_left == 2
+              *outptr = (r0[0] + r1[0] + r2[0]) / 9.0;
+            }
+            ++outptr;
+            ++w;
+          }
+          if (padding_right > 0) {
+            num_vectors = (out_width - w - 1) >> 2;
+          } else {
+            num_vectors = (out_width - w) >> 2;
+          }
+        }
+
+        w += num_vectors << 2;
+        float32x4_t factors = vld1q_f32(avg_factors);
+        // Preload only when there is vector work: the row pointers are
+        // left unset for rows that fall entirely within the padding.
+        float32x4x2_t row0, row1, row2;
+        if (num_vectors > 0) {
+          row0 = vld2q_f32(r0);
+          row1 = vld2q_f32(r1);
+          row2 = vld2q_f32(r2);
+        }
+        for (; num_vectors > 0; --num_vectors) {
+          float32x4x2_t row0_next = vld2q_f32(r0 + 8);
+          float32x4x2_t row1_next = vld2q_f32(r1 + 8);
+          float32x4x2_t row2_next = vld2q_f32(r2 + 8);
+
+          // vld2q_f32 splits each row into even/odd columns; vextq_f32
+          // shifts the even lanes by one to supply each window's third
+          // column.
+          float32x4_t sum0 = vaddq_f32(row0.val[0], row0.val[1]);
+          float32x4_t sum1 = vaddq_f32(row1.val[0], row1.val[1]);
+          float32x4_t sum2 = vaddq_f32(row2.val[0], row2.val[1]);
+
+          float32x4_t row02 = vextq_f32(row0.val[0], row0_next.val[0], 1);
+          float32x4_t row12 = vextq_f32(row1.val[0], row1_next.val[0], 1);
+          float32x4_t row22 = vextq_f32(row2.val[0], row2_next.val[0], 1);
+
+          sum0 = vaddq_f32(sum0, row02);
+          sum1 = vaddq_f32(sum1, row12);
+          sum2 = vaddq_f32(sum2, row22);
+
+          float32x4_t sum_result = vaddq_f32(vaddq_f32(sum0, sum1), sum2);
+          float32x4_t avg_result = vmulq_f32(sum_result, factors);
+
+          vst1q_f32(outptr, avg_result);
+
+          row0 = row0_next;
+          row1 = row1_next;
+          row2 = row2_next;
+
+          r0 += 8;
+          r1 += 8;
+          r2 += 8;
+          outptr += 4;
+        }
+
+        for (; w < out_width; ++w) {
+          float sum = 0.0;
+          for (int kh = 0; kh < 3; ++kh) {
+            for (int kw = 0; kw < 3; ++kw) {
+              int inh = h * 2 - padding_top + kh;
+              int inw = w * 2 - padding_left + kw;
+              if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
+                sum += input[input_offset + inh * in_width + inw];
+              }
+            }
+          }
+
+          *outptr = sum / 9.0;
+          ++outptr;
+        }
+      }
+    }
+  }
+}
+
+// Assumes the input has already been padded.
+void PoolingAvgNeonK3x3S2x2Padded(const float *input,
+                                  const index_t *in_shape,
+                                  float *output,
+                                  const index_t *out_shape) {
+  index_t batch = in_shape[0];
+  index_t channels = in_shape[1];
+  index_t in_height = in_shape[2];
+  index_t in_width = in_shape[3];
+
+  index_t out_height = out_shape[2];
+  index_t out_width = out_shape[3];
+
+  int in_image_size = in_height * in_width;
+  int out_image_size = out_height * out_width;
+  float avg_factors[4] = {1.0 / 9.0, 1.0 / 9.0, 1.0 / 9.0, 1.0 / 9.0};
+
+#pragma omp parallel for collapse(2)
+  for (int b = 0; b < batch; ++b) {
+    for (int c = 0; c < channels; ++c) {
+      index_t input_offset = (b * channels + c) * in_image_size;
+      index_t output_offset = (b * channels + c) * out_image_size;
+      const float *img0 = input + input_offset;
+      float *outptr = output + output_offset;
+
+      const float *r0 = img0;
+      const float *r1 = r0 + in_width;
+      const float *r2 = r1 + in_width;
+
+      for (int h = 0; h < out_height; h++) {
+        int num_vectors = out_width >> 2;
+        int remain = out_width - (num_vectors << 2);
+
+        float32x4_t factors = vld1q_f32(avg_factors);
+        // Guard the preload so rows narrower than eight floats are not
+        // over-read by the vector loads.
+        float32x4x2_t row0, row1, row2;
+        if (num_vectors > 0) {
+          row0 = vld2q_f32(r0);
+          row1 = vld2q_f32(r1);
+          row2 = vld2q_f32(r2);
+        }
+        for (; num_vectors > 0; --num_vectors) {
+          float32x4x2_t row0_next = vld2q_f32(r0 + 8);
+          float32x4x2_t row1_next = vld2q_f32(r1 + 8);
+          float32x4x2_t row2_next = vld2q_f32(r2 + 8);
+
+          float32x4_t sum0 = vaddq_f32(row0.val[0], row0.val[1]);
+          float32x4_t sum1 = vaddq_f32(row1.val[0], row1.val[1]);
+          float32x4_t sum2 = vaddq_f32(row2.val[0], row2.val[1]);
+
+          float32x4_t row02 = vextq_f32(row0.val[0], row0_next.val[0], 1);
+          float32x4_t row12 = vextq_f32(row1.val[0], row1_next.val[0], 1);
+          float32x4_t row22 = vextq_f32(row2.val[0], row2_next.val[0], 1);
+
+          sum0 = vaddq_f32(sum0, row02);
+          sum1 = vaddq_f32(sum1, row12);
+          sum2 = vaddq_f32(sum2, row22);
+
+          float32x4_t sum_result = vaddq_f32(vaddq_f32(sum0, sum1), sum2);
+          float32x4_t avg_result = vmulq_f32(sum_result, factors);
+
+          vst1q_f32(outptr, avg_result);
+
+          row0 = row0_next;
+          row1 = row1_next;
+          row2 = row2_next;
+
+          r0 += 8;
+          r1 += 8;
+          r2 += 8;
+          outptr += 4;
+        }
+
+        for (; remain > 0; remain--) {
+          *outptr = (r0[0] + r0[1] + r0[2] + r1[0] + r1[1] + r1[2] +
+                     r2[0] + r2[1] + r2[2]) / 9.0;
+
+          r0 += 2;
+          r1 += 2;
+          r2 += 2;
+          outptr++;
+        }
+
+        // +1 steps over the trailing odd column; +in_width skips the
+        // second input row consumed by this output row.
+        r0 += 1 + in_width;
+        r1 += 1 + in_width;
+        r2 += 1 + in_width;
+      }
+    }
+  }
+}
+
+} // namespace kernels
+} // namespace mace
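The 3x3 kernels walk each row with vld2q_f32, which deinterleaves eight consecutive floats into even and odd lanes, and use vextq_f32 to shift the even lanes by one element, yielding the three column streams of four stride-2 windows per iteration. A minimal standalone sketch of that trick (Sum3x1S2Row is a hypothetical name; it assumes at least 16 readable floats at r):

#include <arm_neon.h>

// Sums the three columns of four stride-2 windows in one row:
// result[i] = r[2i] + r[2i + 1] + r[2i + 2] for i = 0..3.
// Note the loads touch r[0..15] even though only r[0..8] contribute.
static inline float32x4_t Sum3x1S2Row(const float *r) {
  float32x4x2_t v = vld2q_f32(r);           // val[0] = {r0, r2, r4, r6}
  float32x4x2_t v_next = vld2q_f32(r + 8);  // val[0] = {r8, r10, r12, r14}
  float32x4_t col2 = vextq_f32(v.val[0], v_next.val[0], 1);  // {r2, r4, r6, r8}
  return vaddq_f32(vaddq_f32(v.val[0], v.val[1]), col2);
}

Summing this quantity over three consecutive rows and multiplying by 1/9 gives the four window averages, which is exactly what the vector loops above compute.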
diff --git a/mace/kernels/neon/pooling_neon.cc b/mace/kernels/neon/pooling_neon.cc
index 0e57f02e045dcd1aed45cb7893e8a4c33dba142f..06efdeaa5258455f5c9779487ef5fe562d2aede9 100644
--- a/mace/kernels/neon/pooling_neon.cc
+++ b/mace/kernels/neon/pooling_neon.cc
@@ -15,51 +15,95 @@ extern void PoolingMaxNeonK2x2S2x2(const float *input,
                                    const index_t *out_shape,
                                    const int *paddings);
 
+extern void PoolingAvgNeonK2x2S2x2(const float *input,
+                                   const index_t *in_shape,
+                                   float *output,
+                                   const index_t *out_shape,
+                                   const int *paddings);
+
 extern void PoolingMaxNeonK3x3S2x2(const float *input,
                                    const index_t *in_shape,
                                    float *output,
                                    const index_t *out_shape,
                                    const int *paddings);
 
+extern void PoolingAvgNeonK3x3S2x2(const float *input,
+                                   const index_t *in_shape,
+                                   float *output,
+                                   const index_t *out_shape,
+                                   const int *paddings);
+
 #ifdef __COPY_MAKE_PADDING
 extern void PoolingMaxNeonK2x2S2x2Padded(const float *input,
                                          const index_t *in_shape,
                                          float *output,
                                          const index_t *out_shape);
+
+extern void PoolingAvgNeonK2x2S2x2Padded(const float *input,
+                                         const index_t *in_shape,
+                                         float *output,
+                                         const index_t *out_shape);
+
 extern void PoolingMaxNeonK3x3S2x2Padded(const float *input,
                                          const index_t *in_shape,
                                          float *output,
                                          const index_t *out_shape);
+
+extern void PoolingAvgNeonK3x3S2x2Padded(const float *input,
+                                         const index_t *in_shape,
+                                         float *output,
+                                         const index_t *out_shape);
 #endif
 
-template <>
+template<>
 void PoolingFunctor<DeviceType::NEON, float>::operator()(
     const float *input,
     const index_t *input_shape,
     float *output,
     const index_t *output_shape) {
+#ifdef __COPY_MAKE_PADDING
+  Tensor padded_input;
+  ConstructInputWithPadding(input, input_shape, paddings_, &padded_input);
+  input = padded_input.data<float>();
+  input_shape = padded_input.shape().data();
+#endif
+
   if (kernels_[0] == 2 && kernels_[1] == 2 && strides_[0] == 2 &&
-      strides_[1] == 2 && pooling_type_ == MAX) {
+      strides_[1] == 2) {
+    // kernel_size: 2x2, strides: 2x2
+    if (pooling_type_ == MAX) {  // MAX_POOL_2x2s2x2
+#ifdef __COPY_MAKE_PADDING
+      PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape);
+#else
+      PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape,
+                             paddings_);
+#endif
+    } else {  // AVG_POOL_2x2s2x2
 #ifdef __COPY_MAKE_PADDING
-    Tensor padded_input;
-    ConstructInputWithPadding(input, input_shape, paddings_, &padded_input);
-    input = padded_input.data<float>();
-    input_shape = padded_input.shape().data();
-    PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape);
+      PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape);
 #else
-    PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape, paddings_);
+      PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape,
+                             paddings_);
 #endif
+    }
   } else if (kernels_[0] == 3 && kernels_[1] == 3 && strides_[0] == 2 &&
-             strides_[1] == 2 && pooling_type_ == MAX) {
+             strides_[1] == 2) {
+    // kernel_size: 3x3, strides: 2x2
+    if (pooling_type_ == MAX) {  // MAX_POOL_3x3s2x2
+#ifdef __COPY_MAKE_PADDING
+      PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape);
+#else
+      PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape,
+                             paddings_);
+#endif
+    } else {  // AVG_POOL_3x3s2x2
 #ifdef __COPY_MAKE_PADDING
-    Tensor padded_input;
-    ConstructInputWithPadding(input, input_shape, paddings_, &padded_input);
-    input = padded_input.data<float>();
-    input_shape = padded_input.shape().data();
-    PoolingMaxNeonK3x3S2x2V2Padded(input, input_shape, output, output_shape);
+      PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape);
 #else
-    PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape, paddings_);
+      PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape,
+                             paddings_);
 #endif
+    }
   } else {  // not implemented yet
     PoolingFunctor<DeviceType::CPU, float>(pooling_type_, kernels_, strides_,
                                            paddings_, dilations_)(
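A detail shared by the non-padded kernels above: the total paddings are split so that any extra row or column goes to the bottom/right. A quick standalone check of that arithmetic (the totals here are illustrative values, not taken from the patch):

#include <cstdio>

int main() {
  int paddings[2] = {1, 2};  // illustrative {height, width} padding totals
  int padding_top = paddings[0] / 2;               // 0
  int padding_bottom = paddings[0] - padding_top;  // 1
  int padding_left = paddings[1] / 2;              // 1
  int padding_right = paddings[1] - padding_left;  // 1
  std::printf("top=%d bottom=%d left=%d right=%d\n",
              padding_top, padding_bottom, padding_left, padding_right);
  return 0;
}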
diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc
index 3972743c76df9971ba88473551b80b4f96df6d9a..f5e9b599f20722766ace7ad98fb4b8011a239867 100644
--- a/mace/ops/pooling_test.cc
+++ b/mace/ops/pooling_test.cc
@@ -148,14 +148,14 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
   net.AddIntsArg("dilations", {1, 1});
 
   // Add input data
-  net.AddInputFromArray<float>(
-      "Input", {1, 1, 4, 5},
-      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19});
+  net.AddInputFromArray<float>("Input", {1, 1, 2, 9},
+                               {0, 1, 2, 3, 4, 5, 6, 7, 8,
+                                9, 10, 11, 12, 13, 14, 15, 16, 17});
   // Run
   net.RunOp(DeviceType::NEON);
 
   // Check
-  auto expected = CreateTensor<float>({1, 1, 2, 3}, {6, 8, 9, 16, 18, 19});
+  auto expected = CreateTensor<float>({1, 1, 1, 5}, {10, 12, 14, 16, 17});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
@@ -172,18 +172,50 @@ TEST_F(PoolingOpTest, MAX_k3x3s2x2) {
   net.AddIntArg("pooling_type", PoolingType::MAX);
   net.AddIntsArg("kernels", {3, 3});
   net.AddIntsArg("strides", {2, 2});
+  net.AddIntArg("padding", Padding::VALID);
+  net.AddIntsArg("dilations", {1, 1});
+
+  // Add input data
+  net.AddInputFromArray<float>("Input", {1, 1, 3, 9},
+                               {0, 1, 2, 3, 4, 5, 6, 7, 8,
+                                9, 10, 11, 12, 13, 14, 15, 16, 17,
+                                18, 19, 20, 21, 22, 23, 24, 25, 26});
+  // Run
+  net.RunOp(DeviceType::NEON);
+
+  // Check
+  auto expected = CreateTensor<float>({1, 1, 1, 4}, {20, 22, 24, 26});
+
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
+}
+
+TEST_F(PoolingOpTest, AVG_k2x2s2x2) {
+  // Construct graph
+  auto& net = test_net();
+  OpDefBuilder("Pooling", "PoolingTest")
+      .Input("Input")
+      .Output("Output")
+      .Finalize(net.operator_def());
+
+  // Add args
+  net.AddIntArg("pooling_type", PoolingType::AVG);
+  net.AddIntsArg("kernels", {2, 2});
+  net.AddIntsArg("strides", {2, 2});
   net.AddIntArg("padding", Padding::SAME);
   net.AddIntsArg("dilations", {1, 1});
 
   // Add input data
-  net.AddInputFromArray<float>(
-      "Input", {1, 1, 4, 5},
-      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19});
+  net.AddInputFromArray<float>(
+      "Input", {1, 1, 2, 8},
+      {0, 1, 2, 3, 4, 5, 6, 7,
+       8, 9, 10, 11, 12, 13, 14, 15});
   // Run
   net.RunOp(DeviceType::NEON);
 
   // Check
-  auto expected = CreateTensor<float>({1, 1, 2, 3}, {11, 13, 14, 16, 18, 19});
+  auto expected = CreateTensor<float>({1, 1, 1, 4},
+                                      {4.5, 6.5, 8.5, 10.5});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
+
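The AVG_k2x2s2x2 expectation can be checked by hand: a 2x8 input under a 2x2/stride-2 window grid needs no SAME padding, so each output is the plain mean of one 2x2 block. A standalone sketch reproducing {4.5, 6.5, 8.5, 10.5}:

#include <cstdio>

int main() {
  float in[2][8];
  for (int r = 0; r < 2; ++r)
    for (int c = 0; c < 8; ++c)
      in[r][c] = static_cast<float>(r * 8 + c);  // 0..15, as in the test
  for (int w = 0; w < 4; ++w) {
    float avg = (in[0][2 * w] + in[0][2 * w + 1] +
                 in[1][2 * w] + in[1][2 * w + 1]) * 0.25f;
    std::printf("%g ", avg);  // prints 4.5 6.5 8.5 10.5
  }
  std::printf("\n");
  return 0;
}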