From 67a2f0ea94cb8ea51cd8f84e7e0b53ff5aa8ea14 Mon Sep 17 00:00:00 2001 From: ZhenWang Date: Tue, 4 Dec 2018 12:19:47 +0800 Subject: [PATCH] add int8_t type pooling support. --- .../kernel/central-arm-func/pool_arm_func.h | 85 +-- src/operators/math/pool_3x3.cpp | 3 +- src/operators/math/pool_3x3.h | 24 +- src/operators/math/pool_3x3_int8.cpp | 562 ++++++++++++++++++ src/operators/math/pooling.cpp | 10 +- src/operators/math/pooling.h | 36 +- test/CMakeLists.txt | 4 +- .../test_fusion_conv_add_relu_int8_op.cpp | 2 +- test/operators/test_pool_op.cpp | 283 ++++++++- 9 files changed, 933 insertions(+), 76 deletions(-) create mode 100644 src/operators/math/pool_3x3_int8.cpp diff --git a/src/operators/kernel/central-arm-func/pool_arm_func.h b/src/operators/kernel/central-arm-func/pool_arm_func.h index 941c237865..95149732ca 100644 --- a/src/operators/kernel/central-arm-func/pool_arm_func.h +++ b/src/operators/kernel/central-arm-func/pool_arm_func.h @@ -23,20 +23,22 @@ namespace paddle_mobile { namespace operators { using framework::Tensor; -inline void PoolBasic(std::string pooling_type, std::vector ksize, - std::vector strides, std::vector paddings, - const Tensor *in_x, Tensor *out) { +template +void PoolBasic(std::string pooling_type, std::vector ksize, + std::vector strides, std::vector paddings, + const Tensor *in_x, Tensor *out) { if (pooling_type == "max") { - math::PoolFunctor, float> pool2d_forward; - math::MaxPool pool_process; + math::PoolFunctor, T> pool2d_forward; + math::MaxPool pool_process; pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out); } else if (pooling_type == "avg") { - math::PoolFunctor, float> pool2d_forward; - math::AvgPool pool_process; + math::PoolFunctor, T> pool2d_forward; + math::AvgPool pool_process; pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out); } } + template void PoolCompute(const PoolParam ¶m) { const Tensor *in_x = param.Input(); @@ -52,50 +54,65 @@ void PoolCompute(const PoolParam ¶m) { LOG(paddle_mobile::LogLevel::kLOG_ERROR) << "Pool op only supports 2D and 3D input."; } - if (param.isGlobalPooling()) { for (size_t i = 0; i < ksize.size(); ++i) { paddings[i] = 0; ksize[i] = static_cast(in_x->dims()[i + 2]); } } - if (ksize[0] == 3 && ksize[0] == ksize[1]) { - if (pooling_type == "max") { - if (strides[0] == strides[1] && strides[0] == 1 && - paddings[0] == paddings[1] && paddings[1] == 1) { - math::Pool3x3Maxs1p1(in_x, out); + if (in_x->type() == typeid(int8_t)) { + if (pooling_type == "max" && ksize[0] == 3 && ksize[0] == ksize[1]) { + if (strides[0] == strides[1] && strides[0] == 1) { + math::Pool3x3Maxs1_int8(in_x, out, paddings[0], paddings[1]); + } else if (strides[0] == strides[1] && strides[0] == 2) { + math::Pool3x3Maxs2_int8(in_x, out, paddings[0], paddings[1]); } else { - math::Pool3x3Max(strides, paddings, in_x, out); - } - } else if (pooling_type == "avg") { - if (strides[0] == strides[1] && strides[0] == 1 && - paddings[0] == paddings[1] && paddings[1] == 1) { - math::Pool3x3Avgs1p1(in_x, out); - } else { - math::Pool3x3Avg(strides, paddings, in_x, out); + math::Pool3x3Max_int8(strides, paddings, in_x, out); } + } else { + PoolBasic(pooling_type, ksize, strides, paddings, in_x, + out); } + } else { + if (ksize[0] == 3 && ksize[0] == ksize[1]) { + if (pooling_type == "max") { + if (strides[0] == strides[1] && strides[0] == 1 && + paddings[0] == paddings[1] && paddings[1] == 1) { + math::Pool3x3Maxs1p1(in_x, out); + } else { + math::Pool3x3Max(strides, paddings, in_x, out); + } + } else if (pooling_type == "avg") { + if (strides[0] == strides[1] && strides[0] == 1 && + paddings[0] == paddings[1] && paddings[1] == 1) { + math::Pool3x3Avgs1p1(in_x, out); + } else { + math::Pool3x3Avg(strides, paddings, in_x, out); + } + } - } else if (ksize[0] == 2 && ksize[0] == ksize[1] && strides[0] == 2 && - strides[0] == strides[1] && paddings[0] == paddings[1] && - paddings[1] == 0) { + } else if (ksize[0] == 2 && ksize[0] == ksize[1] && strides[0] == 2 && + strides[0] == strides[1] && paddings[0] == paddings[1] && + paddings[1] == 0) { #if __ARM_NEON #if __aarch64__ - PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); + PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); #else - /// todo: fix bug in Pool2x2 - if (pooling_type == "max") { - math::Pool2x2Maxs2p0(strides, paddings, in_x, out); - } else if (pooling_type == "avg") { - math::Pool2x2Avgs2p0(strides, paddings, in_x, out); - } + /// todo: fix bug in Pool2x2 + if (pooling_type == "max") { + math::Pool2x2Maxs2p0(strides, paddings, in_x, out); + } else if (pooling_type == "avg") { + math::Pool2x2Avgs2p0(strides, paddings, in_x, out); + } #endif #else - PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); + PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); #endif // __ARM_NEON - } else { - PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); + } else { + PoolBasic(pooling_type, ksize, strides, paddings, in_x, + out); + } } } diff --git a/src/operators/math/pool_3x3.cpp b/src/operators/math/pool_3x3.cpp index dadb5a67cf..a2b84a5b14 100644 --- a/src/operators/math/pool_3x3.cpp +++ b/src/operators/math/pool_3x3.cpp @@ -38,6 +38,7 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) { const int input_width = static_cast(input->dims()[3]); const int output_height = static_cast(output->dims()[2]); const int output_width = static_cast(output->dims()[3]); + output->mutable_data(); const int hxw = input_height * input_width; @@ -472,7 +473,7 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) { const int inputdata_channel_stride = h_in * w_in; const int input_batch_stride = output_channels * inputdata_channel_stride; const int output_batch_stride = output_channels * outputdata_channel_stride; - float *out_data = output->data(); + float *out_data = output->mutable_data(); const float *input_data = input->data(); for (int k = 0; k < batch_size; ++k) { #pragma omp parallel for diff --git a/src/operators/math/pool_3x3.h b/src/operators/math/pool_3x3.h index ac1eb16a4c..a13cb6ab37 100644 --- a/src/operators/math/pool_3x3.h +++ b/src/operators/math/pool_3x3.h @@ -28,15 +28,21 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { namespace math { -using framework::Tensor; -using std::vector; -void Pool3x3Avgs1p1(const Tensor *input, Tensor *output); -void Pool3x3Maxs1p1(const Tensor *input, Tensor *output); -void Pool3x3Max(vector strides, vector paddings, const Tensor *input, - Tensor *output); - -void Pool3x3Avg(vector strides, vector paddings, const Tensor *in_x, - Tensor *out); +void Pool3x3Avgs1p1(const framework::Tensor *input, framework::Tensor *output); +void Pool3x3Maxs1p1(const framework::Tensor *input, framework::Tensor *output); +void Pool3x3Max(std::vector strides, std::vector paddings, + const framework::Tensor *input, framework::Tensor *output); + +void Pool3x3Avg(std::vector strides, std::vector paddings, + const framework::Tensor *in_x, framework::Tensor *out); + +void Pool3x3Maxs1_int8(const framework::Tensor *input, + framework::Tensor *output, int32_t pad_h, int32_t pad_w); +void Pool3x3Maxs2_int8(const framework::Tensor *input, + framework::Tensor *output, int32_t pad_h, int32_t pad_w); +void Pool3x3Max_int8(const std::vector &strides, + const std::vector &paddings, + const framework::Tensor *input, framework::Tensor *output); } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/pool_3x3_int8.cpp b/src/operators/math/pool_3x3_int8.cpp new file mode 100644 index 0000000000..3698a2132b --- /dev/null +++ b/src/operators/math/pool_3x3_int8.cpp @@ -0,0 +1,562 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef POOL_OP +#ifdef _OPENMP +#include +#endif +#include "framework/tensor.h" +#include "operators/math/pool_3x3.h" +#if __ARM_NEON +#include +#endif // __ARM_NEON +#include +#include +namespace paddle_mobile { +namespace operators { +namespace math { +using framework::Tensor; +using std::max; +using std::min; +using std::vector; +template +static void make_paddings(const Tensor *input, Tensor *padded_input, + int32_t top, int32_t bottom, int32_t left, + int32_t right, T value) { + const int32_t batch_size = input->dims()[0]; + const int32_t c_in = input->dims()[1]; + const int32_t h_in = input->dims()[2]; + const int32_t w_in = input->dims()[3]; + const int32_t h_padded = h_in + top + bottom; + const int32_t w_padded = w_in + left + right; + padded_input->Resize({batch_size, c_in, h_padded, w_padded}); + T *padded_input_data = padded_input->mutable_data(); + const T *input_data = input->data(); + const int32_t input_channel_stride = h_in * w_in; + const int32_t input_batch_stride = c_in * input_channel_stride; + const int32_t padded_channel_stride = h_padded * w_padded; + const int32_t padded_batch_stride = c_in * padded_channel_stride; + for (int i = 0; i < batch_size; ++i) { +#pragma omp parallel for + for (int j = 0; j < c_in; ++j) { + const T *img_in = input_data + j * input_channel_stride; + T *img_padded = padded_input_data + j * padded_channel_stride; + int k = 0; + for (; k < top; ++k) { + for (int l = 0; l < w_padded; ++l) { + img_padded[l] = value; + } + img_padded += w_padded; + } + for (; k < top + h_in; ++k) { + int l = 0; + for (; l < left; ++l) { + img_padded[l] = value; + } + memcpy(img_padded + left, img_in, w_in * sizeof(T)); + l += w_in; + img_in += w_in; + for (; l < w_padded; ++l) { + img_padded[l] = value; + } + img_padded += w_padded; + } + for (; k < h_padded; ++k) { + for (int l = 0; l < w_padded; ++l) { + img_padded[l] = value; + } + img_padded += w_padded; + } + } + input_data += input_batch_stride; + padded_input_data += padded_batch_stride; + } + // input_data = input->data(); + // std::cout << "+++++++++++++++++++Origin begin++++++++++++++++++++" + // << std::endl; + // for (int i = 0; i < 1; ++i) { + // for (int j = 0; j < 1; ++j) { + // const T *img_in = input_data + j * input_channel_stride; + // for (int k = 0; k < h_in; ++k) { + // for (int l = 0; l < w_in; ++l) { + // std::cout << (int32_t)*img_in << "\t"; + // img_in++; + // } + // std::cout << std::endl; + // } + // } + // input_data += input_batch_stride; + // } + // std::cout << "+++++++++++++++++++Origin end++++++++++++++++++++" << + // std::endl; + // + // padded_input_data = padded_input->data(); + // std::cout << "******************Padding begin**********************" + // << std::endl; + // for (int i = 0; i < 1; ++i) { + // for (int j = 0; j < 1; ++j) { + // T *img_padded = padded_input_data + j * padded_channel_stride; + // for (int k = 0; k < h_padded; ++k) { + // for (int l = 0; l < w_padded; ++l) { + // std::cout << (int32_t)*img_padded << "\t"; + // img_padded++; + // } + // std::cout << std::endl; + // } + // } + // padded_input_data += padded_batch_stride; + // } + // std::cout << "******************Padding end**********************" + // << std::endl; +} +void Pool3x3Maxs1_int8(const Tensor *input, Tensor *output, int32_t pad_h, + int32_t pad_w) { + Tensor padded_input; + if (pad_h != 0 && pad_w != 0) { + int8_t value = -SCHAR_MAX; + make_paddings(input, &padded_input, pad_h, pad_h, pad_w, pad_w, value); + input = &padded_input; + } + const int32_t batch_size = input->dims()[0]; + const int32_t h_in = input->dims()[2]; + const int32_t w_in = input->dims()[3]; + const int8_t *input_data = input->data(); + const int32_t output_channels = output->dims()[1]; + const int32_t h_out = output->dims()[2]; + int32_t w_out = output->dims()[3]; + int8_t *output_data = output->mutable_data(); + const int32_t outputdata_channel_stride = h_out * w_out; + const int32_t inputdata_channel_stride = h_in * w_in; + const int32_t input_batch_stride = output_channels * inputdata_channel_stride; + const int32_t output_batch_stride = + output_channels * outputdata_channel_stride; + // std::cout << "h_out = " << h_out << ", w_out=" << w_out << std::endl; + for (int i = 0; i < batch_size; ++i) { +#pragma omp parallel for + for (int j = 0; j < output_channels; ++j) { + const int8_t *img_in = input_data + j * inputdata_channel_stride; + int8_t *img_out = output_data + j * outputdata_channel_stride; + for (int k = 0; k < h_out; ++k) { + const int8_t *row0 = img_in + k * w_in; + const int8_t *row1 = img_in + (k + 1) * w_in; + const int8_t *row2 = img_in + (k + 2) * w_in; +#if __ARM_NEON + int32_t nw = w_out >> 4; + int32_t left_w = w_out & 0xf; + int32_t nw1 = left_w >> 3; + int32_t left_w1 = left_w & 0x7; +#if __aarch64__ + // TODO(wzzju) +#else + if (nw > 0) { +#define LOOP_LABEL "1" + // result: q15 + asm volatile( + "vld1.8 {q0}, [%[row0]]! \n\t" // q0=0-15 + "vld1.8 {q2}, [%[row1]]! \n\t" + "vld1.8 {q4}, [%[row2]]! \n\t" + + LOOP_LABEL + ": \n\t" + "vld1.8 {q1}, [%[row0]]! \n\t" // q1=16-31 + "vext.8 q6, q0, q1, #1 \n\t" + "vext.8 q7, q0, q1, #2 \n\t" + "vld1.8 {q3}, [%[row1]]! \n\t" + "vmax.s8 q15, q0, q6 \n\t" + "vmax.s8 q15, q15, q7 \n\t" + "vext.8 q6, q2, q3, #1 \n\t" + "vext.8 q7, q2, q3, #2 \n\t" + "vld1.8 {q5}, [%[row2]]! \n\t" + "vmax.s8 q14, q2, q6 \n\t" + "vmax.s8 q14, q14, q7 \n\t" + "vext.8 q6, q4, q5, #1 \n\t" + "vext.8 q7, q4, q5, #2 \n\t" + "vmax.s8 q13, q4, q6 \n\t" + "vmax.s8 q13, q13, q7 \n\t" + "vmax.s8 q15, q15, q14 \n\t" + "vmax.s8 q15, q15, q13 \n\t" + "vmov.s8 q0, q1 \n\t" + "vmov.s8 q2, q3 \n\t" + "vmov.s8 q4, q5 \n\t" + "vst1.8 {q15}, [%[img_out]]! \n\t" + "subs %[nw], #1 \n\t" + "bne " LOOP_LABEL + "b \n\t" + "sub %[row0], #16 \n\t" + "sub %[row1], #16 \n\t" + "sub %[row2], #16 \n\t" + : [nw] "+r"(nw), [row0] "+r"(row0), [row1] "+r"(row1), + [row2] "+r"(row2), [img_out] "+r"(img_out) + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q13", "q14", "q15"); +#undef LOOP_LABEL + } + if (nw1 > 0 || left_w1 > 0) { +#define PADDLE_LABEL_LESS8 "1" +#define PADDLE_LABEL_LESS8_SAVE "2" +#define PADDLE_LABEL_OVER "3" + // result: d15 + asm volatile( + "vld1.8 {d0}, [%[row0]]! \n\t" // d0=0-8 + "vld1.8 {d2}, [%[row1]]! \n\t" + "vld1.8 {d4}, [%[row2]]! \n\t" + "mov r0, #1 \n\t" + "cmp %[nw1], #0 \n\t" + "beq " PADDLE_LABEL_LESS8 + "f\n\t" + "vld1.8 {d1}, [%[row0]]! \n\t" // d1=9-15 + "vext.8 d6, d0, d1, #1 \n\t" + "vext.8 d7, d0, d1, #2 \n\t" + "vld1.8 {d3}, [%[row1]]! \n\t" + "vmax.s8 d15, d0, d6 \n\t" + "vmax.s8 d15, d15, d7 \n\t" + "vext.8 d6, d2, d3, #1 \n\t" + "vext.8 d7, d2, d3, #2 \n\t" + "vld1.8 {d5}, [%[row2]]! \n\t" + "vmax.s8 d14, d2, d6 \n\t" + "vmax.s8 d14, d14, d7 \n\t" + "vext.8 d6, d4, d5, #1 \n\t" + "vext.8 d7, d4, d5, #2 \n\t" + "vmax.s8 d13, d4, d6 \n\t" + "vmax.s8 d13, d13, d7 \n\t" + "vmax.s8 d15, d15, d14 \n\t" + "vmax.s8 d15, d15, d13 \n\t" + "vmov.s8 d0, d1 \n\t" + "vmov.s8 d2, d3 \n\t" + "vmov.s8 d4, d5 \n\t" + "vst1.8 {d15}, [%[img_out]]! \n\t" + + PADDLE_LABEL_LESS8 + ": \n\t" + "cmp %[left_w1], #0 \n\t" + "beq " PADDLE_LABEL_OVER + "f\n\t" + "vld1.8 {d1}, [%[row0]] \n\t" // d1=9-15 + "vext.8 d6, d0, d1, #1 \n\t" + "vext.8 d7, d0, d1, #2 \n\t" + "vld1.8 {d3}, [%[row1]] \n\t" + "vmax.s8 d15, d0, d6 \n\t" + "vmax.s8 d15, d15, d7 \n\t" + "vext.8 d6, d2, d3, #1 \n\t" + "vext.8 d7, d2, d3, #2 \n\t" + "vld1.8 {d5}, [%[row2]] \n\t" + "vmax.s8 d14, d2, d6 \n\t" + "vmax.s8 d14, d14, d7 \n\t" + "vext.8 d6, d4, d5, #1 \n\t" + "vext.8 d7, d4, d5, #2 \n\t" + "vmax.s8 d13, d4, d6 \n\t" + "vmax.s8 d13, d13, d7 \n\t" + "vmax.s8 d15, d15, d14 \n\t" + "vmax.s8 d15, d15, d13 \n\t" + + PADDLE_LABEL_LESS8_SAVE + ": \n\t" + "vst1.8 {d15}, [%[img_out]], r0\n\t" + "add %[row0], %[row0], #1 \n\t" + "add %[row1], %[row1], #1 \n\t" + "add %[row2], %[row2], #1 \n\t" + "vext.8 d15, d15, d15, #1 \n\t" + "subs %[left_w1], #1 \n\t" + "bgt " PADDLE_LABEL_LESS8_SAVE "b \n\t" + + PADDLE_LABEL_OVER ": \n\t" + : [nw1] "+r"(nw1), [left_w1] "+r"(left_w1), [row0] "+r"(row0), + [row1] "+r"(row1), [row2] "+r"(row2), [img_out] "+r"(img_out) + : + : "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7", "d13", "d14", "d15"); +#undef PADDLE_LABEL_OVER +#undef PADDLE_LABEL_LESS8_SAVE +#undef PADDLE_LABEL_LESS8 + } +#endif // __aarch64__ +#else + int32_t left = w_out; + while (left > 0) { + const int8_t max0 = std::max(std::max(row0[0], row0[1]), row0[2]); + const int8_t max1 = std::max(std::max(row1[0], row1[1]), row1[2]); + const int8_t max2 = std::max(std::max(row2[0], row2[1]), row2[2]); + *img_out = std::max(std::max(max0, max1), max2); + row0 += 1; + row1 += 1; + row2 += 1; + img_out++; + left--; + } +#endif // __ARM_NEON + } + } + input_data += input_batch_stride; + output_data += output_batch_stride; + } +} +void Pool3x3Maxs2_int8(const Tensor *input, Tensor *output, int32_t pad_h, + int32_t pad_w) { + Tensor padded_input; + if (pad_h != 0 && pad_w != 0) { + int8_t value = -SCHAR_MAX; + make_paddings(input, &padded_input, pad_h, pad_h, pad_w, pad_w, value); + input = &padded_input; + } + const int32_t batch_size = input->dims()[0]; + const int32_t h_in = input->dims()[2]; + const int32_t w_in = input->dims()[3]; + const int32_t output_channels = output->dims()[1]; + const int32_t h_out = output->dims()[2]; + int32_t w_out = output->dims()[3]; + const int32_t outputdata_channel_stride = h_out * w_out; + const int32_t inputdata_channel_stride = h_in * w_in; + const int32_t output_batch_stride = + output_channels * outputdata_channel_stride; + const int32_t input_batch_stride = output_channels * inputdata_channel_stride; + const int8_t *input_data = input->data(); + int8_t *output_data = output->mutable_data(); + for (int i = 0; i < batch_size; ++i) { +#pragma omp parallel for + for (int j = 0; j < output_channels; ++j) { + const int8_t *img_in = input_data + j * inputdata_channel_stride; + int8_t *img_out = output_data + j * outputdata_channel_stride; + for (int k = 0; k < h_out; ++k) { + const int8_t *row0 = img_in + 2 * k * w_in; + const int8_t *row1 = img_in + (2 * k + 1) * w_in; + const int8_t *row2 = img_in + (2 * k + 2) * w_in; +#if __ARM_NEON + int32_t nw = w_out >> 4; + int32_t left_w = w_out & 0xf; + int32_t nw1 = left_w >> 3; + int32_t left_w1 = left_w & 0x7; +#if __aarch64__ + // TODO(wzzju) +#else + if (nw > 0) { +#define LOOP_LABEL "1" + // result: q15 + asm volatile( + "vld2.8 {q0, q1}, [%[row0]]! \n\t" // q0=0-30, q1=1-31 + "vld2.8 {q2, q3}, [%[row1]]! \n\t" + "vld2.8 {q4, q5}, [%[row2]]! \n\t" LOOP_LABEL + ": \n\t" + "vmax.s8 q15, q0, q1 \n\t" + "vld2.8 {q6, q7}, [%[row0]]! \n\t" // q0=32-62, q1=33-63 + "vmax.s8 q14, q2, q3 \n\t" + "vmax.s8 q13, q4, q5 \n\t" + "vld2.8 {q8, q9}, [%[row1]]! \n\t" + "vext.8 q0, q0, q6, #1 \n\t" + "vmax.s8 q15, q15, q0 \n\t" + "vld2.8 {q10, q11}, [%[row2]]! \n\t" + "vext.8 q2, q2, q8, #1 \n\t" + "vmax.s8 q14, q14, q2 \n\t" + "vext.8 q4, q4, q10, #1 \n\t" + "vmax.s8 q13, q13, q4 \n\t" + "vmax.s8 q15, q15, q14 \n\t" + "vmax.s8 q15, q15, q13 \n\t" + "vmov.s8 q0, q6 \n\t" + "vmov.s8 q1, q7 \n\t" + "vmov.s8 q2, q8 \n\t" + "vmov.s8 q3, q9 \n\t" + "vmov.s8 q4, q10 \n\t" + "vmov.s8 q5, q11 \n\t" + "vst1.8 {q15}, [%[img_out]]! \n\t" + "subs %[nw], #1 \n\t" + "bne " LOOP_LABEL + "b \n\t" + "sub %[row0], #32 \n\t" + "sub %[row1], #32 \n\t" + "sub %[row2], #32 \n\t" + : [nw] "+r"(nw), [row0] "+r"(row0), [row1] "+r"(row1), + [row2] "+r"(row2), [img_out] "+r"(img_out) + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q13", "q14", "q15"); +#undef LOOP_LABEL + } + if (nw1 > 0 || left_w1 > 0) { +#define PADDLE_LABEL_LESS8 "1" +#define PADDLE_LABEL_LESS8_SAVE "2" +#define PADDLE_LABEL_OVER "3" + // result: d15 + asm volatile( + "vld2.8 {d0, d1}, [%[row0]]! \n\t" // d0=0-14, d1=1-15 + "vld2.8 {d2, d3}, [%[row1]]! \n\t" + "vld2.8 {d4, d5}, [%[row2]]! \n\t" + "mov r0, #1 \n\t" + "cmp %[nw1], #0 \n\t" + "beq " PADDLE_LABEL_LESS8 + "f\n\t" + "vmax.s8 d15, d0, d1 \n\t" + "vld2.8 {d6, d7}, [%[row0]]! \n\t" // d0=32-62, d1=33-63 + "vmax.s8 d14, d2, d3 \n\t" + "vmax.s8 d13, d4, d5 \n\t" + "vld2.8 {d8, d9}, [%[row1]]! \n\t" + "vext.8 d0, d0, d6, #1 \n\t" + "vmax.s8 d15, d15, d0 \n\t" + "vld2.8 {d10, d11}, [%[row2]]! \n\t" + "vext.8 d2, d2, d8, #1 \n\t" + "vmax.s8 d14, d14, d2 \n\t" + "vext.8 d4, d4, d10, #1 \n\t" + "vmax.s8 d13, d13, d4 \n\t" + "vmax.s8 d15, d15, d14 \n\t" + "vmax.s8 d15, d15, d13 \n\t" + "vmov.s8 d0, d6 \n\t" + "vmov.s8 d1, d7 \n\t" + "vmov.s8 d2, d8 \n\t" + "vmov.s8 d3, d9 \n\t" + "vmov.s8 d4, d10 \n\t" + "vmov.s8 d5, d11 \n\t" + "vst1.8 {d15}, [%[img_out]]! \n\t" + + PADDLE_LABEL_LESS8 + ": \n\t" + "cmp %[left_w1], #0 \n\t" + "beq " PADDLE_LABEL_OVER + "f\n\t" + "vmax.s8 d15, d0, d1 \n\t" + "vld2.8 {d6, d7}, [%[row0]] \n\t" // d0=32-62, d1=33-63 + "vmax.s8 d14, d2, d3 \n\t" + "vmax.s8 d13, d4, d5 \n\t" + "vld2.8 {d8, d9}, [%[row1]] \n\t" + "vext.8 d0, d0, d6, #1 \n\t" + "vmax.s8 d15, d15, d0 \n\t" + "vld2.8 {d10, d11}, [%[row2]] \n\t" + "vext.8 d2, d2, d8, #1 \n\t" + "vmax.s8 d14, d14, d2 \n\t" + "vext.8 d4, d4, d10, #1 \n\t" + "vmax.s8 d13, d13, d4 \n\t" + "vmax.s8 d15, d15, d14 \n\t" + "vmax.s8 d15, d15, d13 \n\t" + + PADDLE_LABEL_LESS8_SAVE + ": \n\t" + "vst1.8 {d15}, [%[img_out]], r0\n\t" + "add %[row0], %[row0], #2 \n\t" + "add %[row1], %[row1], #2 \n\t" + "add %[row2], %[row2], #2 \n\t" + "vext.8 d15, d15, d15, #1 \n\t" + "subs %[left_w1], #1 \n\t" + "bgt " PADDLE_LABEL_LESS8_SAVE "b \n\t" + + PADDLE_LABEL_OVER ": \n\t" + : [nw1] "+r"(nw1), [left_w1] "+r"(left_w1), [row0] "+r"(row0), + [row1] "+r"(row1), [row2] "+r"(row2), [img_out] "+r"(img_out) + : + : "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7", "d8", "d9", "d10", "d11", "d13", "d14", "d15"); +#undef PADDLE_LABEL_OVER +#undef PADDLE_LABEL_LESS8_SAVE +#undef PADDLE_LABEL_LESS8 + } +#endif // __aarch64__ +#else + int32_t left = w_out; + while (left > 0) { + const int8_t max0 = std::max(std::max(row0[0], row0[1]), row0[2]); + const int8_t max1 = std::max(std::max(row1[0], row1[1]), row1[2]); + const int8_t max2 = std::max(std::max(row2[0], row2[1]), row2[2]); + *img_out = std::max(std::max(max0, max1), max2); + row0 += 2; + row1 += 2; + row2 += 2; + img_out++; + left--; + } +#endif // __ARM_NEON + } + } + input_data += input_batch_stride; + output_data += output_batch_stride; + } +} +void Pool3x3Max_int8(const vector &strides, const vector &paddings, + const Tensor *input, Tensor *output) { + const int batch_size = input->dims()[0]; + const int input_height = input->dims()[2]; + const int input_width = input->dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + // const int _kernel_size = 3; + const int stride = strides[0]; + // const int stride_width = strides[1]; + const int padding = paddings[0]; + // const int padding_width = paddings[1]; + const int8_t negative_max = -SCHAR_MAX; + const int input_channel_stride = input_height * input_width; + const int output_channel_stride = output_height * output_width; + const int8_t *input_data = input->data(); + int8_t *output_data = output->mutable_data(); + const int input_batch_stride = output_channels * input_channel_stride; + const int output_batch_stride = output_channels * output_channel_stride; + for (int i = 0; i < batch_size; ++i) { +#pragma omp parallel for + for (int c = 0; c < output_channels; ++c) { + const int8_t *input_seg = input_data + c * input_channel_stride; + int8_t *output_seg = output_data + c * output_channel_stride; + for (int ph = 0; ph < output_height; ph++) { + int hstart = ph * stride - padding; + int hend = min(hstart + 3, input_height); + hstart = max(hstart, 0); + for (int pw = 0; pw < output_width; pw++) { + int wstart = pw * stride - padding; + int wend = min(wstart + 3, input_width); + wstart = max(wstart, 0); + const int8_t *pos1 = input_seg + hstart * input_width + wstart; + const int8_t *pos2 = input_seg + (hstart + 1) * input_width + wstart; + const int8_t *pos3 = input_seg + (hstart + 2) * input_width + wstart; + int8_t *output_ptr = output_seg + ph * output_width + pw; + if (hend - hstart != 3 || wend - wstart != 3) { + int8_t max_value = -SCHAR_MAX; + for (int h = hstart; h < hend; h++) { + for (int w = wstart; w < wend; w++) { + int8_t value = input_seg[h * input_width + w]; + if (value > max_value) { + max_value = value; + } + } + } + output_seg[ph * output_width + pw] = max_value; + } else { +#if __ARM_NEON +#if __aarch64__ + // TODO(wzzju) +#else + asm volatile( + "vld1.8 {d0}, [%[pos1]] \n\t" + "vld1.8 {d1}, [%[pos2]] \n\t" + "vld1.8 {d2}, [%[pos3]] \n\t" + "vmax.s8 d3, d0, d1 \n\t" + "vmax.s8 d4, d2, d3 \n\t" + "vmov.s8 d4[3], %[negative_max] \n\t" + "vpmax.s8 d5, d4, d4 \n\t" + "vpmax.s8 d6, d5, d5 \n\t" + "vst1.8 {d6[0]},[%[output_ptr]] \n\t" + : + : [pos1] "r"(pos1), [pos2] "r"(pos2), [pos3] "r"(pos3), + [output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max) + : "memory", "q0", "q1", "q2", "q3"); +#endif +#else + const int8_t max0 = std::max(std::max(pos1[0], pos1[1]), pos1[2]); + const int8_t max1 = std::max(std::max(pos2[0], pos2[1]), pos2[2]); + const int8_t max2 = std::max(std::max(pos3[0], pos3[1]), pos3[2]); + *output_ptr = std::max(std::max(max0, max1), max2); +#endif // __ARM_NEON + } + } + } + } + input_data += input_batch_stride; + output_data += output_batch_stride; + } +} +} // namespace math +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/math/pooling.cpp b/src/operators/math/pooling.cpp index f5bcdf7fdb..17df4a26aa 100644 --- a/src/operators/math/pooling.cpp +++ b/src/operators/math/pooling.cpp @@ -70,15 +70,15 @@ class PoolFunctor { int wend = std::min(wstart + ksize_width, input_width); wstart = std::max(wstart, 0); - T ele = pool_process.initial(); + auto ele = pool_process.initial(); for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { pool_process.compute(input_data[h * input_width + w], &ele); } } int pool_size = (hend - hstart) * (wend - wstart); - pool_process.finalize(static_cast(pool_size), &ele); - output_data[ph * output_width + pw] = ele; + pool_process.finalize(static_cast(pool_size), &ele); + output_data[ph * output_width + pw] = static_cast(ele); } } input_data += input_stride; @@ -88,8 +88,10 @@ class PoolFunctor { } }; -template class PoolFunctor, float>; +template class PoolFunctor, float>; template class PoolFunctor, float>; +template class PoolFunctor, int8_t>; +template class PoolFunctor, int8_t>; } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/pooling.h b/src/operators/math/pooling.h index 3ca868fa4d..4d94550cc3 100644 --- a/src/operators/math/pooling.h +++ b/src/operators/math/pooling.h @@ -16,6 +16,8 @@ limitations under the License. */ #pragma once +#include +#include #include "common/log.h" #include "framework/tensor.h" #include "pool_2x2.h" @@ -37,24 +39,42 @@ namespace math { * in pool pooling, and finally takes the average. * MaxPoolGrad and AvgPoolGrad are gradient operations respectively. */ -template +template class MaxPool { public: - inline T initial() { return static_cast(-FLT_MAX); } + inline T initial() { + if (typeid(T) == typeid(int8_t)) { + return static_cast(-SCHAR_MAX); + } + return static_cast(-FLT_MAX); + } inline void compute(const T &x, T *y) { *y = *y > x ? *y : x; } inline void finalize(const T &pool_field, T *y) {} }; -template +template class AvgPool { public: - inline T initial() { return static_cast(0); } - - inline void compute(const T &x, T *y) { *y += x; } - - inline void finalize(const T &pool_field, T *y) { *y /= pool_field; } + inline Otype initial() { return static_cast(0); } + + inline void compute(const Itype &x, Otype *y) { *y += x; } + + inline void finalize(const float &pool_field, Otype *y) { + if (typeid(Itype) == typeid(int8_t)) { + float tmp = *y / pool_field; + if (tmp > SCHAR_MAX) { + *y = SCHAR_MAX; + } else if (tmp < -SCHAR_MAX) { + *y = -SCHAR_MAX; + } else { + *y = static_cast(std::round(tmp)); + } + } else { + *y /= pool_field; + } + } }; template diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 0f489995cb..3d202e2bd1 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -269,8 +269,8 @@ if (NOT FOUND_MATCH) #gen test - ADD_EXECUTABLE(test-pool operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-pool paddle-mobile) + ADD_EXECUTABLE(test-pool-op operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-pool-op paddle-mobile) #gen test ADD_EXECUTABLE(test-softmax operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h) diff --git a/test/operators/test_fusion_conv_add_relu_int8_op.cpp b/test/operators/test_fusion_conv_add_relu_int8_op.cpp index add38b34f1..42c68e5d04 100644 --- a/test/operators/test_fusion_conv_add_relu_int8_op.cpp +++ b/test/operators/test_fusion_conv_add_relu_int8_op.cpp @@ -251,7 +251,7 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) { attrs["groups"].Set(1); attrs["axis"].Set(0); - auto *op = new operators::FusionConvAddReluInt8Op( + auto *op = new operators::FusionConvAddReluInt8Op( "fusion_conv_add_relu_int8", inputs, outputs, attrs, scope); op->InferShape(); op->Init(); diff --git a/test/operators/test_pool_op.cpp b/test/operators/test_pool_op.cpp index 09470caf82..48bf00c2ac 100644 --- a/test/operators/test_pool_op.cpp +++ b/test/operators/test_pool_op.cpp @@ -12,30 +12,279 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include "../test_include.h" +#include "operators/kernel/central-arm-func/pool_arm_func.h" #include "operators/pool_op.h" -int main() { - paddle_mobile::framework::Loader loader; - auto program = loader.Load(std::string(g_googlenet)); - if (program.originProgram == nullptr) { - DLOG << "program read file"; +namespace paddle_mobile { +static int PoolOutputSize(int input_size, int filter_size, int padding, + int stride, bool ceil_mode) { + int output_size; + if (!ceil_mode) { + output_size = (input_size - filter_size + 2 * padding) / stride + 1; + } else { + output_size = + (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; } + return output_size; +} + +template +static void PoolAvgPad0(std::vector ksize, std::vector strides, + const framework::Tensor *input, + framework::Tensor *out) { + const int32_t batch_size = input->dims()[0]; + const int32_t input_c = input->dims()[1]; + const int32_t input_h = input->dims()[2]; + const int32_t input_w = input->dims()[3]; + const int32_t out_c = out->dims()[1]; + const int32_t out_h = out->dims()[2]; + const int32_t out_w = out->dims()[3]; + const int32_t kernel_h = ksize[0]; + const int32_t kernel_w = ksize[1]; + const int32_t stride_h = strides[0]; + const int32_t stride_w = strides[1]; + const int32_t inputdata_channel_stride = input_h * input_w; + const int32_t input_batch_stride = input_c * inputdata_channel_stride; + const int32_t outputdata_channel_stride = out_h * out_w; + const int32_t output_batch_stride = out_c * outputdata_channel_stride; + T *out_data = out->mutable_data(); + const T *input_data = input->data(); + const T **rows = new const T *[kernel_h]; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < out_c; ++j) { + const T *img_in = input_data + j * inputdata_channel_stride; + T *img_out = out_data + j * outputdata_channel_stride; + for (int k = 0; k < out_h; ++k) { + for (int m = 0; m < kernel_h; ++m) { + rows[m] = img_in + (stride_h * k + m) * input_w; + } + int32_t left = out_w; + while (left > 0) { + float tmp = 0; + for (int m = 0; m < kernel_h; ++m) { + for (int l = 0; l < kernel_w; ++l) { + tmp += rows[m][l]; + } + } + if (typeid(T) == typeid(int8_t)) { + tmp = tmp / (kernel_h * kernel_w); + if (tmp < -127) { + *img_out = -127; + } else if (tmp > 127) { + *img_out = 127; + } else { + *img_out = static_cast(std::round(tmp)); + } + } else { + *img_out = static_cast(tmp / (kernel_h * kernel_w)); + } + for (int m = 0; m < kernel_h; ++m) { + rows[m] += stride_w; + } + img_out++; + left--; + } + } + } + input_data += input_batch_stride; + out_data += output_batch_stride; + } + delete[] rows; +} + +template +int TestPoolOp(int in_channels, int in_height, int in_width) { + int kernel_h = Kernel; + int kernel_w = Kernel; + int pad_h = Pad; + int pad_w = Pad; + int stride_h = Stride; + int stride_w = Stride; + bool ceil_mode = CeilMode != 0; + std::string pooling_type = (PoolType == 0 ? "max" : "avg"); + + int batch_size = 1; + int input_c = in_channels; + int input_h = in_height; + int input_w = in_width; + + framework::DDim input_shape = + framework::make_ddim({batch_size, input_c, input_h, input_w}); + + std::vector output_shape_v({batch_size, input_c}); + output_shape_v.push_back( + PoolOutputSize(input_h, kernel_h, pad_h, stride_h, ceil_mode)); + output_shape_v.push_back( + PoolOutputSize(input_w, kernel_w, pad_w, stride_w, ceil_mode)); + + framework::DDim output_shape = framework::make_ddim(output_shape_v); - Executor4Test> - executor(program, "pool2d"); + VariableNameMap inputs; + VariableNameMap outputs; + auto scope = std::make_shared(); + inputs["X"] = std::vector({"input"}); + outputs["Out"] = std::vector({"output"}); - paddle_mobile::framework::Tensor input; - SetupTensor(&input, {1, 64, 112, 112}, static_cast(0), - static_cast(1)); - auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 56, 56}); - auto output = - executor.Predict(input, "conv2d_0.tmp_1", "pool2d_0.tmp_0", out_ddim); + auto input_var = scope.get()->Var("input"); + auto input = input_var->template GetMutable(); + SetupTensor(input, input_shape, -127, 127); - float *output_ptr = output->data(); - for (int j = 0; j < output->numel(); ++j) { - DLOG << " value of output: " << output_ptr[j]; + auto output_var = scope.get()->Var("output"); + framework::AttributeMap attrs; + attrs["pooling_type"].SetString(pooling_type); + attrs["ksize"].Set>(std::vector({kernel_h, kernel_w})); + attrs["strides"].Set>(std::vector({stride_h, stride_w})); + attrs["paddings"].Set>(std::vector({pad_h, pad_w})); + attrs["ceil_mode"].Set(false); + attrs["global_pooling"].Set(false); + + auto *op = new operators::PoolOp("pool2d", inputs, outputs, attrs, + scope); + op->InferShape(); + op->Init(); + op->Run(); + + framework::Tensor output_cmp; + output_cmp.mutable_data(output_shape); + if (pooling_type == "avg" && pad_h == 0 && pad_h == pad_w) { + PoolAvgPad0(std::vector{kernel_h, kernel_w}, + std::vector{stride_h, stride_w}, input, &output_cmp); + } else { + if (typeid(T) == typeid(int8_t)) { + operators::PoolBasic( + pooling_type, std::vector{kernel_h, kernel_w}, + std::vector{stride_h, stride_w}, std::vector{pad_h, pad_w}, + input, &output_cmp); + } else { + operators::PoolBasic( + pooling_type, std::vector{kernel_h, kernel_w}, + std::vector{stride_h, stride_w}, std::vector{pad_h, pad_w}, + input, &output_cmp); + } + } + + // compare results + int eq = 0; + int neq = 0; + auto output = output_var->template Get(); + const T *output_data = output->data(); + T *output_cmp_data = output_cmp.data(); + for (int i = 0; i < output->numel(); ++i) { + PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i], + "The execution of test_pool_op is failed!"); + if (output_data[i] == output_cmp_data[i]) { + ++eq; + } else { + ++neq; + } } + std::cout << "eq = " << eq << ", neq = " << neq << std::endl; + delete op; + return 0; } +} // namespace paddle_mobile + +int main(int argc, char *argv[]) { + if (argc < 4) { + LOG(paddle_mobile::kLOG_INFO) + << "Usage:\n" + << " ./test-pool-op in_channels in_height in_width \n" + << " params:\n" + << " -in_channels: int, input image's channels\n" + << " -in_height: int, input image's height\n" + << " -in_width: int, input image's width\n"; + return 1; + } + int in_channels = atoi(argv[1]); + int in_height = atoi(argv[2]); + int in_width = atoi(argv[3]); + // kernel = 3, pad = 1, stride = 1 + LOG(paddle_mobile::kLOG_INFO) + << "float, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=1"; + paddle_mobile::TestPoolOp(in_channels, in_height, + in_width); + // kernel = 3, pad = 0, stride = 2 + LOG(paddle_mobile::kLOG_INFO) + << "float, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=2"; + paddle_mobile::TestPoolOp(in_channels, in_height, + in_width); + // kernel = 3, pad = 0, stride = 1 + LOG(paddle_mobile::kLOG_INFO) + << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=1"; + paddle_mobile::TestPoolOp(in_channels, in_height, + in_width); + // kernel = 3, pad = 1, stride = 1 + LOG(paddle_mobile::kLOG_INFO) + << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=1"; + paddle_mobile::TestPoolOp(in_channels, in_height, + in_width); + // kernel = 3, pad = 2, stride = 1 + LOG(paddle_mobile::kLOG_INFO) + << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=2, stride=1"; + paddle_mobile::TestPoolOp(in_channels, in_height, + in_width); + // kernel = 3, pad = 0, stride = 2 + LOG(paddle_mobile::kLOG_INFO) + << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=2"; + paddle_mobile::TestPoolOp(in_channels, in_height, + in_width); + // kernel = 3, pad = 1, stride = 2 + LOG(paddle_mobile::kLOG_INFO) + << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=2"; + paddle_mobile::TestPoolOp(in_channels, in_height, + in_width); + // kernel = 3, pad = 0, stride = 2 + LOG(paddle_mobile::kLOG_INFO) + << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=2, stride=2"; + paddle_mobile::TestPoolOp(in_channels, in_height, + in_width); + // kernel = 3, pad = 3, stride = 3 + LOG(paddle_mobile::kLOG_INFO) + << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=3, stride=3"; + paddle_mobile::TestPoolOp(in_channels, in_height, + in_width); + // kernel = 7, pad = 0, stride = 1 + LOG(paddle_mobile::kLOG_INFO) + << "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=1"; + paddle_mobile::TestPoolOp(in_channels, in_height, + in_width); + // kernel = 7, pad = 0, stride = 2 + LOG(paddle_mobile::kLOG_INFO) + << "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=2"; + paddle_mobile::TestPoolOp(in_channels, in_height, + in_width); + // kernel = 7, pad = 0, stride = 3 + LOG(paddle_mobile::kLOG_INFO) + << "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=3"; + paddle_mobile::TestPoolOp(in_channels, in_height, + in_width); + // kernel = 3, pad = 0, stride = 1 + LOG(paddle_mobile::kLOG_INFO) + << "int8_t, ceil_mode=false, pooling_type=avg, kernel=3, pad=0, stride=1"; + paddle_mobile::TestPoolOp(in_channels, in_height, + in_width); + // kernel = 3, pad = 0, stride = 3 + LOG(paddle_mobile::kLOG_INFO) + << "int8_t, ceil_mode=false, pooling_type=avg, kernel=3, pad=0, stride=3"; + paddle_mobile::TestPoolOp(in_channels, in_height, + in_width); + // kernel = 7, pad = 0, stride = 1 + LOG(paddle_mobile::kLOG_INFO) + << "float, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=1"; + paddle_mobile::TestPoolOp(in_channels, in_height, + in_width); + // kernel = 7, pad = 0, stride = 4 + LOG(paddle_mobile::kLOG_INFO) + << "float, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=4"; + paddle_mobile::TestPoolOp(in_channels, in_height, + in_width); + // kernel = 5, pad = 0, stride = 1 + LOG(paddle_mobile::kLOG_INFO) + << "float, ceil_mode=false, pooling_type=avg, kernel=5, pad=0, stride=1"; + paddle_mobile::TestPoolOp(in_channels, in_height, + in_width); +} -- GitLab