Commit d197de00 authored by HappyAngel, committed by Yuan Shuai

fix pool bug and speed, test=develop (#2385)

* fix pooling bug and speed

* fix build error

* delete VLOG in pool, test=develop

* add openmp, test=develop

* fix lite/kernels/arm/pool_compute_test basic_pooling compute error bug, test=develop
Parent 52e0db46
The source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -116,6 +116,27 @@ void pooling3x3s2p1_max(const float* din,
int hin,
int win);
void pooling3x3s1p0_max(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win);
void pooling3x3s1p0_avg(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
bool exclusive);
void pooling3x3s2p1_avg(const float* din,
float* dout,
int num,
......
......@@ -66,7 +66,6 @@ void PoolCompute::Run() {
in_dims[1],
in_dims[2],
in_dims[3]);
VLOG(3) << "invoking pooling_global_max";
return;
} else if (pooling_type == "avg") {
lite::arm::math::pooling_global_avg(din,
......@@ -78,7 +77,6 @@ void PoolCompute::Run() {
in_dims[1],
in_dims[2],
in_dims[3]);
VLOG(3) << "invoking pooling_global_ave";
return;
}
} else {
......@@ -93,7 +91,6 @@ void PoolCompute::Run() {
in_dims[1],
in_dims[2],
in_dims[3]);
VLOG(3) << "invoking pooling2x2s2_max";
return;
} else if (pooling_type == "avg") {
lite::arm::math::pooling2x2s2_avg(din,
......@@ -106,7 +103,6 @@ void PoolCompute::Run() {
in_dims[2],
in_dims[3],
exclusive);
VLOG(3) << "invoking pooling2x2s2_avg";
return;
}
} else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 &&
......@@ -121,7 +117,6 @@ void PoolCompute::Run() {
in_dims[1],
in_dims[2],
in_dims[3]);
VLOG(3) << "invokingpooling3x3s1p1_max";
return;
} else if (pooling_type == "avg") {
lite::arm::math::pooling3x3s1p1_avg(din,
......@@ -134,7 +129,32 @@ void PoolCompute::Run() {
in_dims[2],
in_dims[3],
exclusive);
VLOG(3) << "invoking pooling3x3s1p1_avg";
return;
}
} else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 0 &&
kps_equal) {
if (pooling_type == "max") {
lite::arm::math::pooling3x3s1p0_max(din,
dout,
out_dims[0],
out_dims[1],
out_dims[2],
out_dims[3],
in_dims[1],
in_dims[2],
in_dims[3]);
return;
} else if (pooling_type == "avg") {
lite::arm::math::pooling3x3s1p0_avg(din,
dout,
out_dims[0],
out_dims[1],
out_dims[2],
out_dims[3],
in_dims[1],
in_dims[2],
in_dims[3],
exclusive);
return;
}
} else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 0 &&
......@@ -149,7 +169,6 @@ void PoolCompute::Run() {
in_dims[1],
in_dims[2],
in_dims[3]);
VLOG(3) << "pooling3x3s2p0_max";
return;
} else if (pooling_type == "avg") {
lite::arm::math::pooling3x3s2p0_avg(din,
......@@ -162,7 +181,6 @@ void PoolCompute::Run() {
in_dims[2],
in_dims[3],
exclusive);
VLOG(3) << "invoking pooling3x3s2p0_avg";
return;
}
} else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 1 &&
......@@ -177,7 +195,6 @@ void PoolCompute::Run() {
in_dims[1],
in_dims[2],
in_dims[3]);
VLOG(3) << "invoking pooling3x3s2p1_max";
return;
} else if (pooling_type == "avg") {
lite::arm::math::pooling3x3s2p1_avg(din,
......@@ -190,7 +207,6 @@ void PoolCompute::Run() {
in_dims[2],
in_dims[3],
exclusive);
VLOG(3) << "invoking pooling3x3s2p1_avg";
return;
}
}
......@@ -213,7 +229,6 @@ void PoolCompute::Run() {
ceil_mode,
use_quantizer,
pooling_type);
VLOG(3) << "invoking pooling_basic";
}
} // namespace arm
......
......@@ -68,8 +68,8 @@ void pool_compute_ref(const operators::PoolParam& param) {
auto& in_dims = param.x->dims();
auto& out_dims = param.output->dims();
const float* src_ptr = param.x->data<const float>();
float* dst_ptr = param.output->mutable_data<float>();
const float* din = param.x->data<const float>();
float* dout = param.output->mutable_data<float>();
std::vector<int> ksize = param.ksize;
std::vector<int> strides = param.strides;
......@@ -83,84 +83,120 @@ void pool_compute_ref(const operators::PoolParam& param) {
bool use_quantizer = param.use_quantizer;
std::string data_format = param.data_format;
int in_n = in_dims[0];
int in_c = in_dims[1];
int in_h = in_dims[2];
int in_w = in_dims[3];
int size_in_n = in_c * in_h * in_w;
int size_in_c = in_h * in_w;
int num = in_dims[0];
int chin = in_dims[1];
int hin = in_dims[2];
int win = in_dims[3];
int out_h = out_dims[2];
int out_w = out_dims[3];
int size_out_n = in_c * out_h * out_w;
int size_out_c = out_h * out_w;
int chout = out_dims[1];
int hout = out_dims[2];
int wout = out_dims[3];
int window_h = ksize[0];
int window_w = ksize[1];
// no need to pad input tensor, border is zero pad inside this function
memset(dout, 0, num * chout * hout * wout * sizeof(float));
int kernel_h = ksize[0];
int kernel_w = ksize[1];
int stride_h = strides[0];
int stride_w = strides[1];
int pad_h = paddings[0];
int pad_w = paddings[1];
if (global_pooling == true) {
for (int n = 0; n < in_n; ++n) {
for (int c = 0; c < in_c; ++c) {
const float* src = src_ptr + n * size_in_n + c * size_in_c;
float res = src[0];
if (pooling_type == "max") {
for (int i = 1; i < size_in_c; ++i) {
float cur_val = src[i];
res = cur_val > res ? cur_val : res;
int size_channel_in = win * hin;
int size_channel_out = wout * hout;
if (global_pooling) {
if (pooling_type == "max") { // Pooling_max
for (int n = 0; n < num; ++n) {
float* dout_batch = dout + n * chout * size_channel_out;
const float* din_batch = din + n * chin * size_channel_in;
#pragma omp parallel for
for (int c = 0; c < chout; ++c) {
const float* din_ch = din_batch + c * size_channel_in; // in address
float tmp1 = din_ch[0];
for (int i = 0; i < size_channel_in; ++i) {
float tmp2 = din_ch[i];
tmp1 = tmp1 > tmp2 ? tmp1 : tmp2;
}
} else if (pooling_type == "avg") {
for (int i = 1; i < size_in_c; ++i) {
float cur_val = src[i];
res += cur_val;
dout_batch[c] = tmp1;
}
}
} else if (pooling_type == "avg") {
// Pooling_average_include_padding
// Pooling_average_exclude_padding
for (int n = 0; n < num; ++n) {
float* dout_batch = dout + n * chout * size_channel_out;
const float* din_batch = din + n * chin * size_channel_in;
#pragma omp parallel for
for (int c = 0; c < chout; ++c) {
const float* din_ch = din_batch + c * size_channel_in; // in address
float sum = 0.f;
for (int i = 0; i < size_channel_in; ++i) {
sum += din_ch[i];
}
res /= size_in_c;
dout_batch[c] = sum / size_channel_in;
}
dst_ptr[n * size_out_n + c] = res;
}
} else {
LOG(FATAL) << "unsupported pooling type: " << pooling_type;
}
} else {
for (int n = 0; n < in_n; ++n) {
for (int c = 0; c < in_c; ++c) {
for (int h = 0; h < out_h; ++h) {
int sh = h * stride_h;
int eh = sh + window_h;
for (int ind_n = 0; ind_n < num; ++ind_n) {
#pragma omp parallel for
for (int ind_c = 0; ind_c < chin; ++ind_c) {
for (int ind_h = 0; ind_h < hout; ++ind_h) {
int sh = ind_h * stride_h;
int eh = sh + kernel_h;
sh = (sh - pad_h) < 0 ? 0 : sh - pad_h;
eh = (eh - pad_h) > in_h ? in_h : eh - pad_h;
for (int w = 0; w < out_w; ++w) {
int sw = w * stride_w;
int ew = sw + window_w;
eh = (eh - pad_h) > hin ? hin : eh - pad_h;
for (int ind_w = 0; ind_w < wout; ++ind_w) {
int sw = ind_w * stride_w;
int ew = sw + kernel_w;
sw = (sw - pad_w) < 0 ? 0 : sw - pad_w;
ew = (ew - pad_w) > in_w ? in_w : ew - pad_w;
int pooling_size = (ew - sw) * (eh - sh);
if (pooling_size == 0) continue;
float res = 0.f;
ew = (ew - pad_w) > win ? win : ew - pad_w;
float result = static_cast<float>(0);
int dst_ind = (ind_n * chout + ind_c) * size_channel_out +
ind_h * wout + ind_w;
for (int kh = sh; kh < eh; ++kh) {
for (int kw = sw; kw < ew; ++kw) {
int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw;
int src_ind =
(ind_n * chin + ind_c) * size_channel_in + kh * win + kw;
if (kh == sh && kw == sw) {
res = src_ptr[src_idx];
result = din[src_ind];
} else {
if (pooling_type == "max") {
res = res >= src_ptr[src_idx] ? res : src_ptr[src_idx];
}
if (pooling_type == "avg") {
res += src_ptr[src_idx];
result = result >= din[src_ind] ? result : din[src_ind];
} else if (pooling_type == "avg") {
result += din[src_ind];
}
}
}
}
if (pooling_type == "avg") {
if (exclusive) {
res /= pooling_size;
int div = (ew - sw) * (eh - sh);
div = div > 0 ? div : 1;
result /= div;
} else {
res /= window_h * window_w;
int bh = kernel_h;
int bw = kernel_w;
if (ew == win) {
bw = sw + kernel_w >= win + pad_w ? win + pad_w
: sw + kernel_w;
bw -= sw;
if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) {
bw += pad_w;
}
}
if (eh == hin) {
bh = sh + kernel_h >= hin + pad_h ? hin + pad_h
: sh + kernel_h;
bh -= sh;
if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) {
bh += pad_h;
}
}
result /= bh * bw;
}
}
dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res;
dout[dst_ind] = result;
}
}
}
......
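The averaging divisor is the substance of the basic_pooling fix above: in exclusive mode the sum is divided by the number of valid input elements actually covered by the window, while in inclusive mode it is divided by the kernel footprint. A minimal standalone sketch of how the two divisors differ at a border window (the values and the helper name avg_divisor are illustrative, not part of the diff, and the window is assumed to stay inside the padded extent):

#include <algorithm>
#include <cstdio>

// Hypothetical helper: divisor for one output element of a 3x3, stride-1,
// pad-1 average pool over a 4x4 input, mirroring the clipping done in
// pool_compute_ref above.
int avg_divisor(int oh, int ow, bool exclusive) {
  const int k = 3, stride = 1, pad = 1, in = 4;
  int sh = std::max(oh * stride - pad, 0);
  int eh = std::min(oh * stride - pad + k, in);
  int sw = std::max(ow * stride - pad, 0);
  int ew = std::min(ow * stride - pad + k, in);
  return exclusive ? (eh - sh) * (ew - sw)  // valid elements only: 2 * 2 = 4 at the corner
                   : k * k;                 // full kernel footprint: 9
}

int main() {
  std::printf("corner divisor, exclusive: %d, inclusive: %d\n",
              avg_divisor(0, 0, true), avg_divisor(0, 0, false));
  return 0;
}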
......@@ -5,4 +5,5 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH
lite_cc_test(conv_compute_test SRCS conv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(conv_transpose_compute_test SRCS conv_transpose_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(conv_int8_compute_test SRCS conv_int8_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(pool_compute_test SRCS pool_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "lite/core/context.h"
#include "lite/operators/op_params.h"
#include "lite/tests/utils/naive_math_impl.h"
#include "lite/tests/utils/tensor_utils.h"
#include "lite/tests/utils/timer.h"
#ifdef LITE_WITH_ARM
#include "lite/kernels/arm/pool_compute.h"
#endif // LITE_WITH_ARM
DEFINE_int32(power_mode,
3,
"power mode: "
"0 for POWER_HIGH;"
"1 for POWER_LOW;"
"2 for POWER_FULL;"
"3 for NO_BIND");
DEFINE_int32(threads, 1, "threads num");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
DEFINE_bool(basic_test, false, "do all tests");
DEFINE_bool(check_result, true, "check the result");
DEFINE_int32(batch, 1, "batch size");
DEFINE_int32(in_channel, 32, "input channel");
DEFINE_int32(in_height, 112, "input height");
DEFINE_int32(in_width, 112, "input width");
DEFINE_int32(kernel_h, 3, "kernel height");
DEFINE_int32(kernel_w, 3, "kernel width");
DEFINE_int32(pad_h, 1, "pad height");
DEFINE_int32(pad_w, 1, "pad width");
DEFINE_int32(stride_h, 1, "stride height");
DEFINE_int32(stride_w, 1, "stride width");
DEFINE_bool(ceil_mode, true, "do ceil_mode");
DEFINE_bool(flag_global, true, "global pooling");
DEFINE_bool(exclusive, true, "do exclusive");
DEFINE_bool(adaptive, false, "no do adaptive");
DEFINE_bool(use_quantizer, false, "no do use_quantizer");
DEFINE_string(pooling_type, "max", "do max pooling");
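// The flags above are plain gflags, so the tests can be driven from the
// command line. An illustrative invocation (the binary path is assumed; it is
// wherever CMake places the lite_cc_test target pool_compute_test):
//   ./pool_compute_test --basic_test=true --pooling_type=avg \
//       --in_height=56 --in_width=56 --stride_h=2 --stride_w=2 --threads=2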
typedef paddle::lite::DDim DDim;
typedef paddle::lite::Tensor Tensor;
typedef paddle::lite::operators::PoolParam PoolParam;
using paddle::lite::Timer;
DDim compute_out_dim(const DDim& dim_in,
const paddle::lite::operators::PoolParam& param) {
DDim dim_out = dim_in;
auto kernel_h = param.ksize[0];
auto kernel_w = param.ksize[1];
auto h = dim_in[2];
auto w = dim_in[3];
int pad_h = param.paddings[0];
int pad_w = param.paddings[1];
int stride_h = param.strides[0];
int stride_w = param.strides[1];
bool ceil_mode = param.ceil_mode;
bool flag_global = param.global_pooling;
int hout = 1;
int wout = 1;
if (!flag_global) {
if (!ceil_mode) {
hout = (h - kernel_h + 2 * pad_h) / stride_h + 1;
wout = (w - kernel_w + 2 * pad_w) / stride_w + 1;
} else {
hout = (h - kernel_h + 2 * pad_h + stride_h - 1) / stride_h + 1;
wout = (w - kernel_w + 2 * pad_w + stride_w - 1) / stride_w + 1;
}
}
dim_out[2] = hout;
dim_out[3] = wout;
return dim_out;
}
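// Worked example of the formula above (illustrative values):
// h = 112, kernel_h = 3, pad_h = 0, stride_h = 2
//   floor mode: hout = (112 - 3 + 0) / 2 + 1 = 55
//   ceil mode:  hout = (112 - 3 + 0 + 2 - 1) / 2 + 1 = 56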
void pooling_basic(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
bool global_pooling,
bool exclusive,
bool adaptive,
bool ceil_mode,
bool use_quantizer,
const std::string& pooling_type) {
// no need to pad input tensor, border is zero pad inside this function
memset(dout, 0, num * chout * hout * wout * sizeof(float));
int kernel_h = ksize[0];
int kernel_w = ksize[1];
int stride_h = strides[0];
int stride_w = strides[1];
int pad_h = paddings[0];
int pad_w = paddings[1];
int size_channel_in = win * hin;
int size_channel_out = wout * hout;
if (global_pooling) {
if (pooling_type == "max") { // Pooling_max
for (int n = 0; n < num; ++n) {
float* dout_batch = dout + n * chout * size_channel_out;
const float* din_batch = din + n * chin * size_channel_in;
#pragma omp parallel for
for (int c = 0; c < chout; ++c) {
const float* din_ch = din_batch + c * size_channel_in; // in address
float tmp1 = din_ch[0];
for (int i = 0; i < size_channel_in; ++i) {
float tmp2 = din_ch[i];
tmp1 = tmp1 > tmp2 ? tmp1 : tmp2;
}
dout_batch[c] = tmp1;
}
}
} else if (pooling_type == "avg") {
// Pooling_average_include_padding
// Pooling_average_exclude_padding
for (int n = 0; n < num; ++n) {
float* dout_batch = dout + n * chout * size_channel_out;
const float* din_batch = din + n * chin * size_channel_in;
#pragma omp parallel for
for (int c = 0; c < chout; ++c) {
const float* din_ch = din_batch + c * size_channel_in; // in address
float sum = 0.f;
for (int i = 0; i < size_channel_in; ++i) {
sum += din_ch[i];
}
dout_batch[c] = sum / size_channel_in;
}
}
} else {
LOG(FATAL) << "unsupported pooling type: " << pooling_type;
}
} else {
for (int ind_n = 0; ind_n < num; ++ind_n) {
#pragma omp parallel for
for (int ind_c = 0; ind_c < chin; ++ind_c) {
for (int ind_h = 0; ind_h < hout; ++ind_h) {
int sh = ind_h * stride_h;
int eh = sh + kernel_h;
sh = (sh - pad_h) < 0 ? 0 : sh - pad_h;
eh = (eh - pad_h) > hin ? hin : eh - pad_h;
for (int ind_w = 0; ind_w < wout; ++ind_w) {
int sw = ind_w * stride_w;
int ew = sw + kernel_w;
sw = (sw - pad_w) < 0 ? 0 : sw - pad_w;
ew = (ew - pad_w) > win ? win : ew - pad_w;
float result = static_cast<float>(0);
int dst_ind = (ind_n * chout + ind_c) * size_channel_out +
ind_h * wout + ind_w;
for (int kh = sh; kh < eh; ++kh) {
for (int kw = sw; kw < ew; ++kw) {
int src_ind =
(ind_n * chin + ind_c) * size_channel_in + kh * win + kw;
if (kh == sh && kw == sw) {
result = din[src_ind];
} else {
if (pooling_type == "max") {
result = result >= din[src_ind] ? result : din[src_ind];
} else if (pooling_type == "avg") {
result += din[src_ind];
}
}
}
}
if (pooling_type == "avg") {
if (exclusive) {
int div = (ew - sw) * (eh - sh);
div = div > 0 ? div : 1;
result /= div;
} else {
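          // Non-exclusive (count-include-pad) average: the divisor starts from
          // the full kernel footprint and is clipped below so that windows
          // produced by ceil_mode which run past the zero-padded border are not
          // divided by area that does not exist even in the padded input.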
int bh = kernel_h;
int bw = kernel_w;
if (ew == win) {
bw = sw + kernel_w >= win + pad_w ? win + pad_w
: sw + kernel_w;
bw -= sw;
if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) {
bw += pad_w;
}
}
if (eh == hin) {
bh = sh + kernel_h >= hin + pad_h ? hin + pad_h
: sh + kernel_h;
bh -= sh;
if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) {
bh += pad_h;
}
}
result /= bh * bw;
}
}
dout[dst_ind] = result;
}
}
}
}
}
}
#ifdef LITE_WITH_ARM
void test_pool_fp32(const std::vector<DDim>& input_dims,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& pads,
bool ceil_mode,
bool flag_global,
bool exclusive,
bool adaptive,
bool use_quantizer,
std::string pooling_type,
const std::vector<int>& thread_num,
const std::vector<int>& power_mode) {
#ifdef LITE_WITH_ARM
paddle::lite::DeviceInfo::Init();
#endif
PoolParam param;
param.x = new Tensor;
param.x->set_precision(PRECISION(kFloat));
param.ksize = ksize;
param.strides = strides;
param.paddings = pads;
param.ceil_mode = ceil_mode;
param.global_pooling = flag_global;
param.pooling_type = pooling_type;
param.exclusive = exclusive;
param.adaptive = adaptive;
param.use_quantizer = use_quantizer;
param.output = new Tensor;
param.output->set_precision(PRECISION(kFloat));
for (auto& cls : power_mode) {
for (auto& th : thread_num) {
paddle::lite::kernels::arm::PoolCompute pool;
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(cls), th);
/// set param and context
pool.SetParam(param);
pool.SetContext(std::move(ctx1));
/// prepare for run
pool.PrepareForRun();
for (auto& dim_in : input_dims) {
DDim dim_out = compute_out_dim(dim_in, param);
if (dim_out[2] < 1 || dim_out[3] < 1) {
continue;
}
param.x->Resize(dim_in);
param.output->Resize(dim_out);
paddle::lite::fill_tensor_rand(*param.x, -1.f, 1.f);
// paddle::lite::fill_tensor_const(*param.x, 1.f);
auto din = param.x->data<float>();
Tensor tout_basic;
if (FLAGS_check_result) {
LOG(INFO) << "basic compute";
tout_basic.set_precision(PRECISION(kFloat));
tout_basic.Resize(dim_out);
fill_tensor_const(tout_basic, 0.f);
auto dout_basic = tout_basic.mutable_data<float>();
pooling_basic(din,
dout_basic,
dim_in[0],
dim_out[1],
dim_out[2],
dim_out[3],
dim_in[1],
dim_in[2],
dim_in[3],
ksize,
strides,
pads,
flag_global,
exclusive,
adaptive,
ceil_mode,
use_quantizer,
pooling_type);
}
LOG(INFO) << "lite compute";
/// warm up
for (int i = 0; i < FLAGS_warmup; ++i) {
pool.Launch();
}
/// compute
Timer t0;
for (int i = 0; i < FLAGS_repeats; ++i) {
t0.start();
pool.Launch();
t0.end();
}
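        // rough operation count used for the throughput numbers below:
        // ~2 ops per kernel element per output element (an estimate, not a
        // measured instruction count)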
double gops = 2.0 * dim_out.production() * ksize[0] * ksize[1];
LOG(INFO) << "pool fp32: input shape: " << dim_in << ", output shape"
<< dim_out << ", running time, avg: " << t0.get_average_ms()
<< ", min time: " << t0.get_min_time()
<< ", total GOPS: " << 1e-9 * gops
<< " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms()
<< " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time();
if (FLAGS_check_result) {
double max_ratio = 0;
double max_diff = 0;
tensor_cmp_host(tout_basic, *param.output, max_ratio, max_diff);
LOG(INFO) << "compare result, max diff: " << max_diff
<< ", max ratio: " << max_ratio;
if (std::abs(max_ratio) > 1e-3f) {
if (max_diff > 5e-4f) {
LOG(WARNING) << "din";
print_tensor(*param.x);
LOG(WARNING) << "basic result";
print_tensor(tout_basic);
LOG(WARNING) << "lite result";
print_tensor(*param.output);
Tensor tdiff;
tdiff.Resize(tout_basic.dims());
tdiff.set_precision(PRECISION(kFloat));
tensor_diff(tout_basic, *param.output, tdiff);
print_tensor(tdiff);
LOG(FATAL) << "test fp32 pool: input: " << dim_in
<< ", output: " << dim_out
<< ", kernel dim: " << ksize[0] << ", " << ksize[1]
<< ", pad: " << pads[0] << ", " << pads[1]
<< ", stride: " << strides[0] << ", " << strides[1]
<< ", global_pooling: "
<< (flag_global ? "global" : "false")
<< ", pooling_type: " << pooling_type
<< ", ceil_mode: " << (ceil_mode ? "true" : "false")
<< ", exclusive: " << (exclusive ? "true" : "false")
<< ", threads: " << th << ", power_mode: " << cls
<< " failed!!\n";
}
}
}
LOG(INFO) << "test fp32 pool: input: " << dim_in
<< ", output: " << dim_out << ", kernel dim: " << ksize[0]
<< ", " << ksize[1] << ", pad: " << pads[0] << ", " << pads[1]
<< ", stride: " << strides[0] << ", " << strides[1]
<< ", global_pooling: " << (flag_global ? "global" : "false")
<< ", pooling_type: " << pooling_type
<< ", ceil_mode: " << (ceil_mode ? "true" : "false")
<< ", exclusive: " << (exclusive ? "true" : "false")
<< ", threads: " << th << ", power_mode: " << cls
<< " successed!!\n";
}
}
}
delete param.x;
delete param.output;
}
#else
void test_pool_fp32(const std::vector<DDim>& input_dims,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& pads,
bool ceil_mode,
bool flag_global,
bool exclusive,
bool adaptive,
bool use_quantizer,
std::string pooling_type,
const std::vector<int>& thread_num,
const std::vector<int>& power_mode) {}
#endif // LITE_WITH_ARM
#if 1 /// random param pool
TEST(TestPoolRand, test_pool_rand) {
if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 8, 16}) {
for (auto& kw : {1, 2, 3}) {
for (auto& kh : {1, 2, 3}) {
for (auto& stride : {1, 2}) {
for (auto& pad : {0, 1, 2}) {
for (auto& flag_global : {false, true}) {
for (auto& exclusive : {false, true}) {
for (auto& ceil_mode : {false, true}) {
for (auto& pooling_type : {"max", "avg"}) {
bool adaptive = false;
bool use_quantizer = false;
std::vector<DDim> dims;
for (auto& batch : {1, 2}) {
for (auto& h : {1, 2, 3, 4, 11, 19, 32, 28}) {
dims.push_back(DDim({batch, cin, h, h}));
}
}
test_pool_fp32(dims,
{kh, kw},
{stride, stride},
{pad, pad},
ceil_mode,
flag_global,
exclusive,
adaptive,
use_quantizer,
pooling_type,
{1, 2, 4},
{FLAGS_power_mode});
}
}
}
}
}
}
}
}
}
}
}
#endif /// random param conv
#if 1 /// custom
TEST(TesPoolCustom, test_pool_fp32_custom_size) {
test_pool_fp32(
{DDim({FLAGS_batch, FLAGS_in_channel, FLAGS_in_height, FLAGS_in_width})},
{FLAGS_kernel_h, FLAGS_kernel_w},
{FLAGS_stride_h, FLAGS_stride_w},
{FLAGS_pad_h, FLAGS_pad_w},
FLAGS_ceil_mode,
FLAGS_flag_global,
FLAGS_exclusive,
FLAGS_adaptive,
FLAGS_use_quantizer,
FLAGS_pooling_type,
{FLAGS_threads},
{FLAGS_power_mode});
}
#endif // custom