From 67a2f0ea94cb8ea51cd8f84e7e0b53ff5aa8ea14 Mon Sep 17 00:00:00 2001
From: ZhenWang <wangzhen31@baidu.com>
Date: Tue, 4 Dec 2018 12:19:47 +0800
Subject: [PATCH] add int8_t type pooling support.

---
 .../kernel/central-arm-func/pool_arm_func.h   |  85 +--
 src/operators/math/pool_3x3.cpp               |   3 +-
 src/operators/math/pool_3x3.h                 |  24 +-
 src/operators/math/pool_3x3_int8.cpp          | 562 ++++++++++++++++++
 src/operators/math/pooling.cpp                |  10 +-
 src/operators/math/pooling.h                  |  36 +-
 test/CMakeLists.txt                           |   4 +-
 .../test_fusion_conv_add_relu_int8_op.cpp     |   2 +-
 test/operators/test_pool_op.cpp               | 283 ++++++++-
 9 files changed, 933 insertions(+), 76 deletions(-)
 create mode 100644 src/operators/math/pool_3x3_int8.cpp
diff --git a/src/operators/kernel/central-arm-func/pool_arm_func.h b/src/operators/kernel/central-arm-func/pool_arm_func.h
index 941c237865..95149732ca 100644
--- a/src/operators/kernel/central-arm-func/pool_arm_func.h
+++ b/src/operators/kernel/central-arm-func/pool_arm_func.h
@@ -23,20 +23,22 @@ namespace paddle_mobile {
 namespace operators {
 using framework::Tensor;
 
-inline void PoolBasic(std::string pooling_type, std::vector<int> ksize,
-                      std::vector<int> strides, std::vector<int> paddings,
-                      const Tensor *in_x, Tensor *out) {
+template <typename T, typename S>
+void PoolBasic(std::string pooling_type, std::vector<int> ksize,
+               std::vector<int> strides, std::vector<int> paddings,
+               const Tensor *in_x, Tensor *out) {
   if (pooling_type == "max") {
-    math::PoolFunctor<CPU, math::MaxPool<float>, float> pool2d_forward;
-    math::MaxPool<float> pool_process;
+    math::PoolFunctor<CPU, math::MaxPool<T>, T> pool2d_forward;
+    math::MaxPool<T> pool_process;
     pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
 
   } else if (pooling_type == "avg") {
-    math::PoolFunctor<CPU, math::AvgPool<float>, float> pool2d_forward;
-    math::AvgPool<float> pool_process;
+    math::PoolFunctor<CPU, math::AvgPool<T, S>, T> pool2d_forward;
+    math::AvgPool<T, S> pool_process;
     pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
   }
 }
+
 template <typename P>
 void PoolCompute(const PoolParam<CPU> &param) {
   const Tensor *in_x = param.Input();
@@ -52,50 +54,65 @@ void PoolCompute(const PoolParam<CPU> &param) {
     LOG(paddle_mobile::LogLevel::kLOG_ERROR)
         << "Pool op only supports 2D and 3D input.";
   }
-
   if (param.isGlobalPooling()) {
     for (size_t i = 0; i < ksize.size(); ++i) {
       paddings[i] = 0;
       ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
     }
   }
-  if (ksize[0] == 3 && ksize[0] == ksize[1]) {
-    if (pooling_type == "max") {
-      if (strides[0] == strides[1] && strides[0] == 1 &&
-          paddings[0] == paddings[1] && paddings[1] == 1) {
-        math::Pool3x3Maxs1p1(in_x, out);
+  if (in_x->type() == typeid(int8_t)) {
+    if (pooling_type == "max" && ksize[0] == 3 && ksize[0] == ksize[1]) {
+      if (strides[0] == strides[1] && strides[0] == 1) {
+        math::Pool3x3Maxs1_int8(in_x, out, paddings[0], paddings[1]);
+      } else if (strides[0] == strides[1] && strides[0] == 2) {
+        math::Pool3x3Maxs2_int8(in_x, out, paddings[0], paddings[1]);
       } else {
-        math::Pool3x3Max(strides, paddings, in_x, out);
-      }
-    } else if (pooling_type == "avg") {
-      if (strides[0] == strides[1] && strides[0] == 1 &&
-          paddings[0] == paddings[1] && paddings[1] == 1) {
-        math::Pool3x3Avgs1p1(in_x, out);
-      } else {
-        math::Pool3x3Avg(strides, paddings, in_x, out);
+        math::Pool3x3Max_int8(strides, paddings, in_x, out);
       }
+    } else {
+      PoolBasic<int8_t, int32_t>(pooling_type, ksize, strides, paddings, in_x,
+                                 out);
     }
+  } else {
+    if (ksize[0] == 3 && ksize[0] == ksize[1]) {
+      if (pooling_type == "max") {
+        if (strides[0] == strides[1] && strides[0] == 1 &&
+            paddings[0] == paddings[1] && paddings[1] == 1) {
+          math::Pool3x3Maxs1p1(in_x, out);
+        } else {
+          math::Pool3x3Max(strides, paddings, in_x, out);
+        }
+      } else if (pooling_type == "avg") {
+        if (strides[0] == strides[1] && strides[0] == 1 &&
+            paddings[0] == paddings[1] && paddings[1] == 1) {
+          math::Pool3x3Avgs1p1(in_x, out);
+        } else {
+          math::Pool3x3Avg(strides, paddings, in_x, out);
+        }
+      }
 
-  } else if (ksize[0] == 2 && ksize[0] == ksize[1] && strides[0] == 2 &&
-             strides[0] == strides[1] && paddings[0] == paddings[1] &&
-             paddings[1] == 0) {
+    } else if (ksize[0] == 2 && ksize[0] == ksize[1] && strides[0] == 2 &&
+               strides[0] == strides[1] && paddings[0] == paddings[1] &&
+               paddings[1] == 0) {
 #if __ARM_NEON
 #if __aarch64__
-    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
+      PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x, out);
 #else
-    /// todo: fix bug in Pool2x2
-    if (pooling_type == "max") {
-      math::Pool2x2Maxs2p0(strides, paddings, in_x, out);
-    } else if (pooling_type == "avg") {
-      math::Pool2x2Avgs2p0(strides, paddings, in_x, out);
-    }
+      /// todo: fix bug in Pool2x2
+      if (pooling_type == "max") {
+        math::Pool2x2Maxs2p0(strides, paddings, in_x, out);
+      } else if (pooling_type == "avg") {
+        math::Pool2x2Avgs2p0(strides, paddings, in_x, out);
+      }
 #endif
 #else
-    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
+      PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x, out);
 #endif  // __ARM_NEON
 
-  } else {
-    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
+    } else {
+      PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
+                              out);
+    }
   }
 }
 
diff --git a/src/operators/math/pool_3x3.cpp b/src/operators/math/pool_3x3.cpp
index dadb5a67cf..a2b84a5b14 100644
--- a/src/operators/math/pool_3x3.cpp
+++ b/src/operators/math/pool_3x3.cpp
@@ -38,6 +38,7 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
   const int input_width = static_cast<int>(input->dims()[3]);
   const int output_height = static_cast<int>(output->dims()[2]);
   const int output_width = static_cast<int>(output->dims()[3]);
+  output->mutable_data<float>();
 
   const int hxw = input_height * input_width;
 
@@ -472,7 +473,7 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
   const int inputdata_channel_stride = h_in * w_in;
   const int input_batch_stride = output_channels * inputdata_channel_stride;
   const int output_batch_stride = output_channels * outputdata_channel_stride;
-  float *out_data = output->data<float>();
+  float *out_data = output->mutable_data<float>();
   const float *input_data = input->data<float>();
   for (int k = 0; k < batch_size; ++k) {
 #pragma omp parallel for
diff --git a/src/operators/math/pool_3x3.h b/src/operators/math/pool_3x3.h
index ac1eb16a4c..a13cb6ab37 100644
--- a/src/operators/math/pool_3x3.h
+++ b/src/operators/math/pool_3x3.h
@@ -28,15 +28,21 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 namespace math {
-using framework::Tensor;
-using std::vector;
-void Pool3x3Avgs1p1(const Tensor *input, Tensor *output);
-void Pool3x3Maxs1p1(const Tensor *input, Tensor *output);
-void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
-                Tensor *output);
-
-void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *in_x,
-                Tensor *out);
+void Pool3x3Avgs1p1(const framework::Tensor *input, framework::Tensor *output);
+void Pool3x3Maxs1p1(const framework::Tensor *input, framework::Tensor *output);
+void Pool3x3Max(std::vector<int> strides, std::vector<int> paddings,
+                const framework::Tensor *input, framework::Tensor *output);
+
+void Pool3x3Avg(std::vector<int> strides, std::vector<int> paddings,
+                const framework::Tensor *in_x, framework::Tensor *out);
+
+void Pool3x3Maxs1_int8(const framework::Tensor *input,
+                       framework::Tensor *output, int32_t pad_h, int32_t pad_w);
+void Pool3x3Maxs2_int8(const framework::Tensor *input,
+                       framework::Tensor *output, int32_t pad_h, int32_t pad_w);
+void Pool3x3Max_int8(const std::vector<int> &strides,
+                     const std::vector<int> &paddings,
+                     const framework::Tensor *input, framework::Tensor *output);
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/math/pool_3x3_int8.cpp b/src/operators/math/pool_3x3_int8.cpp
new file mode 100644
index 0000000000..3698a2132b
--- /dev/null
+++ b/src/operators/math/pool_3x3_int8.cpp
@@ -0,0 +1,562 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef POOL_OP
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include "framework/tensor.h"
+#include "operators/math/pool_3x3.h"
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif  // __ARM_NEON
+#include <climits>
+#include <iostream>
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+using framework::Tensor;
+using std::max;
+using std::min;
+using std::vector;
+template <typename T>
+static void make_paddings(const Tensor *input, Tensor *padded_input,
+                          int32_t top, int32_t bottom, int32_t left,
+                          int32_t right, T value) {
+  const int32_t batch_size = input->dims()[0];
+  const int32_t c_in = input->dims()[1];
+  const int32_t h_in = input->dims()[2];
+  const int32_t w_in = input->dims()[3];
+  const int32_t h_padded = h_in + top + bottom;
+  const int32_t w_padded = w_in + left + right;
+  padded_input->Resize({batch_size, c_in, h_padded, w_padded});
+  T *padded_input_data = padded_input->mutable_data<T>();
+  const T *input_data = input->data<T>();
+  const int32_t input_channel_stride = h_in * w_in;
+  const int32_t input_batch_stride = c_in * input_channel_stride;
+  const int32_t padded_channel_stride = h_padded * w_padded;
+  const int32_t padded_batch_stride = c_in * padded_channel_stride;
+  for (int i = 0; i < batch_size; ++i) {
+#pragma omp parallel for
+    for (int j = 0; j < c_in; ++j) {
+      const T *img_in = input_data + j * input_channel_stride;
+      T *img_padded = padded_input_data + j * padded_channel_stride;
+      int k = 0;
+      for (; k < top; ++k) {
+        for (int l = 0; l < w_padded; ++l) {
+          img_padded[l] = value;
+        }
+        img_padded += w_padded;
+      }
+      for (; k < top + h_in; ++k) {
+        int l = 0;
+        for (; l < left; ++l) {
+          img_padded[l] = value;
+        }
+        memcpy(img_padded + left, img_in, w_in * sizeof(T));
+        l += w_in;
+        img_in += w_in;
+        for (; l < w_padded; ++l) {
+          img_padded[l] = value;
+        }
+        img_padded += w_padded;
+      }
+      for (; k < h_padded; ++k) {
+        for (int l = 0; l < w_padded; ++l) {
+          img_padded[l] = value;
+        }
+        img_padded += w_padded;
+      }
+    }
+    input_data += input_batch_stride;
+    padded_input_data += padded_batch_stride;
+  }
+  //  input_data = input->data<T>();
+  //  std::cout << "+++++++++++++++++++Origin begin++++++++++++++++++++"
+  //            << std::endl;
+  //  for (int i = 0; i < 1; ++i) {
+  //    for (int j = 0; j < 1; ++j) {
+  //      const T *img_in = input_data + j * input_channel_stride;
+  //      for (int k = 0; k < h_in; ++k) {
+  //        for (int l = 0; l < w_in; ++l) {
+  //          std::cout << (int32_t)*img_in << "\t";
+  //          img_in++;
+  //        }
+  //        std::cout << std::endl;
+  //      }
+  //    }
+  //    input_data += input_batch_stride;
+  //  }
+  //  std::cout << "+++++++++++++++++++Origin end++++++++++++++++++++" <<
+  //  std::endl;
+  //
+  //  padded_input_data = padded_input->data<T>();
+  //  std::cout << "******************Padding begin**********************"
+  //            << std::endl;
+  //  for (int i = 0; i < 1; ++i) {
+  //    for (int j = 0; j < 1; ++j) {
+  //      T *img_padded = padded_input_data + j * padded_channel_stride;
+  //      for (int k = 0; k < h_padded; ++k) {
+  //        for (int l = 0; l < w_padded; ++l) {
+  //          std::cout << (int32_t)*img_padded << "\t";
+  //          img_padded++;
+  //        }
+  //        std::cout << std::endl;
+  //      }
+  //    }
+  //    padded_input_data += padded_batch_stride;
+  //  }
+  //  std::cout << "******************Padding end**********************"
+  //            << std::endl;
+}
+void Pool3x3Maxs1_int8(const Tensor *input, Tensor *output, int32_t pad_h,
+                       int32_t pad_w) {
+  Tensor padded_input;
+  if (pad_h != 0 && pad_w != 0) {
+    int8_t value = -SCHAR_MAX;
+    make_paddings(input, &padded_input, pad_h, pad_h, pad_w, pad_w, value);
+    input = &padded_input;
+  }
+  const int32_t batch_size = input->dims()[0];
+  const int32_t h_in = input->dims()[2];
+  const int32_t w_in = input->dims()[3];
+  const int8_t *input_data = input->data<int8_t>();
+  const int32_t output_channels = output->dims()[1];
+  const int32_t h_out = output->dims()[2];
+  int32_t w_out = output->dims()[3];
+  int8_t *output_data = output->mutable_data<int8_t>();
+  const int32_t outputdata_channel_stride = h_out * w_out;
+  const int32_t inputdata_channel_stride = h_in * w_in;
+  const int32_t input_batch_stride = output_channels * inputdata_channel_stride;
+  const int32_t output_batch_stride =
+      output_channels * outputdata_channel_stride;
+  //    std::cout << "h_out = " << h_out << ", w_out=" << w_out << std::endl;
+  for (int i = 0; i < batch_size; ++i) {
+#pragma omp parallel for
+    for (int j = 0; j < output_channels; ++j) {
+      const int8_t *img_in = input_data + j * inputdata_channel_stride;
+      int8_t *img_out = output_data + j * outputdata_channel_stride;
+      for (int k = 0; k < h_out; ++k) {
+        const int8_t *row0 = img_in + k * w_in;
+        const int8_t *row1 = img_in + (k + 1) * w_in;
+        const int8_t *row2 = img_in + (k + 2) * w_in;
+#if __ARM_NEON
+        int32_t nw = w_out >> 4;
+        int32_t left_w = w_out & 0xf;
+        int32_t nw1 = left_w >> 3;
+        int32_t left_w1 = left_w & 0x7;
+#if __aarch64__
+        // TODO(wzzju)
+#else
+        if (nw > 0) {
+#define LOOP_LABEL "1"
+          // result: q15
+          asm volatile(
+              "vld1.8 {q0}, [%[row0]]! \n\t"  // q0=0-15
+              "vld1.8 {q2}, [%[row1]]! \n\t"
+              "vld1.8 {q4}, [%[row2]]! \n\t"
+
+              LOOP_LABEL
+              ": \n\t"
+              "vld1.8 {q1}, [%[row0]]! \n\t"  // q1=16-31
+              "vext.8 q6, q0, q1, #1 \n\t"
+              "vext.8 q7, q0, q1, #2 \n\t"
+              "vld1.8 {q3}, [%[row1]]! \n\t"
+              "vmax.s8 q15, q0, q6 \n\t"
+              "vmax.s8 q15, q15, q7 \n\t"
+              "vext.8 q6, q2, q3, #1 \n\t"
+              "vext.8 q7, q2, q3, #2 \n\t"
+              "vld1.8 {q5}, [%[row2]]! \n\t"
+              "vmax.s8 q14, q2, q6 \n\t"
+              "vmax.s8 q14, q14, q7 \n\t"
+              "vext.8 q6, q4, q5, #1 \n\t"
+              "vext.8 q7, q4, q5, #2 \n\t"
+              "vmax.s8 q13, q4, q6 \n\t"
+              "vmax.s8 q13, q13, q7 \n\t"
+              "vmax.s8 q15, q15, q14 \n\t"
+              "vmax.s8 q15, q15, q13 \n\t"
+              "vmov.s8 q0, q1 \n\t"
+              "vmov.s8 q2, q3 \n\t"
+              "vmov.s8 q4, q5 \n\t"
+              "vst1.8 {q15}, [%[img_out]]! \n\t"
+              "subs %[nw], #1 \n\t"
+              "bne " LOOP_LABEL
+              "b \n\t"
+              "sub %[row0], #16 \n\t"
+              "sub %[row1], #16 \n\t"
+              "sub %[row2], #16 \n\t"
+              : [nw] "+r"(nw), [row0] "+r"(row0), [row1] "+r"(row1),
+                [row2] "+r"(row2), [img_out] "+r"(img_out)
+              :
+              : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+                "q13", "q14", "q15");
+#undef LOOP_LABEL
+        }
+        if (nw1 > 0 || left_w1 > 0) {
+#define PADDLE_LABEL_LESS8 "1"
+#define PADDLE_LABEL_LESS8_SAVE "2"
+#define PADDLE_LABEL_OVER "3"
+          // result: d15
+          asm volatile(
+              "vld1.8 {d0}, [%[row0]]! \n\t"  // d0=0-8
+              "vld1.8 {d2}, [%[row1]]! \n\t"
+              "vld1.8 {d4}, [%[row2]]! \n\t"
+              "mov r0, #1 \n\t"
+              "cmp %[nw1], #0 \n\t"
+              "beq " PADDLE_LABEL_LESS8
+              "f\n\t"
+              "vld1.8 {d1}, [%[row0]]! \n\t"  // d1=9-15
+              "vext.8 d6, d0, d1, #1 \n\t"
+              "vext.8 d7, d0, d1, #2 \n\t"
+              "vld1.8 {d3}, [%[row1]]! \n\t"
+              "vmax.s8 d15, d0, d6 \n\t"
+              "vmax.s8 d15, d15, d7 \n\t"
+              "vext.8 d6, d2, d3, #1 \n\t"
+              "vext.8 d7, d2, d3, #2 \n\t"
+              "vld1.8 {d5}, [%[row2]]! \n\t"
+              "vmax.s8 d14, d2, d6 \n\t"
+              "vmax.s8 d14, d14, d7 \n\t"
+              "vext.8 d6, d4, d5, #1 \n\t"
+              "vext.8 d7, d4, d5, #2 \n\t"
+              "vmax.s8 d13, d4, d6 \n\t"
+              "vmax.s8 d13, d13, d7 \n\t"
+              "vmax.s8 d15, d15, d14 \n\t"
+              "vmax.s8 d15, d15, d13 \n\t"
+              "vmov.s8 d0, d1 \n\t"
+              "vmov.s8 d2, d3 \n\t"
+              "vmov.s8 d4, d5 \n\t"
+              "vst1.8 {d15}, [%[img_out]]! \n\t"
+
+              PADDLE_LABEL_LESS8
+              ": \n\t"
+              "cmp %[left_w1], #0 \n\t"
+              "beq " PADDLE_LABEL_OVER
+              "f\n\t"
+              "vld1.8 {d1}, [%[row0]] \n\t"  // d1=9-15
+              "vext.8 d6, d0, d1, #1 \n\t"
+              "vext.8 d7, d0, d1, #2 \n\t"
+              "vld1.8 {d3}, [%[row1]] \n\t"
+              "vmax.s8 d15, d0, d6 \n\t"
+              "vmax.s8 d15, d15, d7 \n\t"
+              "vext.8 d6, d2, d3, #1 \n\t"
+              "vext.8 d7, d2, d3, #2 \n\t"
+              "vld1.8 {d5}, [%[row2]] \n\t"
+              "vmax.s8 d14, d2, d6 \n\t"
+              "vmax.s8 d14, d14, d7 \n\t"
+              "vext.8 d6, d4, d5, #1 \n\t"
+              "vext.8 d7, d4, d5, #2 \n\t"
+              "vmax.s8 d13, d4, d6 \n\t"
+              "vmax.s8 d13, d13, d7 \n\t"
+              "vmax.s8 d15, d15, d14 \n\t"
+              "vmax.s8 d15, d15, d13 \n\t"
+
+              PADDLE_LABEL_LESS8_SAVE
+              ": \n\t"
+              "vst1.8 {d15}, [%[img_out]], r0\n\t"
+              "add %[row0], %[row0], #1 \n\t"
+              "add %[row1], %[row1], #1 \n\t"
+              "add %[row2], %[row2], #1 \n\t"
+              "vext.8 d15, d15, d15, #1 \n\t"
+              "subs %[left_w1], #1 \n\t"
+              "bgt " PADDLE_LABEL_LESS8_SAVE "b \n\t"
+
+              PADDLE_LABEL_OVER ": \n\t"
+              : [nw1] "+r"(nw1), [left_w1] "+r"(left_w1), [row0] "+r"(row0),
+                [row1] "+r"(row1), [row2] "+r"(row2), [img_out] "+r"(img_out)
+              :
+              : "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
+                "d7", "d13", "d14", "d15");
+#undef PADDLE_LABEL_OVER
+#undef PADDLE_LABEL_LESS8_SAVE
+#undef PADDLE_LABEL_LESS8
+        }
+#endif  // __aarch64__
+#else
+        int32_t left = w_out;
+        while (left > 0) {
+          const int8_t max0 = std::max(std::max(row0[0], row0[1]), row0[2]);
+          const int8_t max1 = std::max(std::max(row1[0], row1[1]), row1[2]);
+          const int8_t max2 = std::max(std::max(row2[0], row2[1]), row2[2]);
+          *img_out = std::max(std::max(max0, max1), max2);
+          row0 += 1;
+          row1 += 1;
+          row2 += 1;
+          img_out++;
+          left--;
+        }
+#endif  // __ARM_NEON
+      }
+    }
+    input_data += input_batch_stride;
+    output_data += output_batch_stride;
+  }
+}
+void Pool3x3Maxs2_int8(const Tensor *input, Tensor *output, int32_t pad_h,
+                       int32_t pad_w) {
+  Tensor padded_input;
+  if (pad_h != 0 && pad_w != 0) {
+    int8_t value = -SCHAR_MAX;
+    make_paddings(input, &padded_input, pad_h, pad_h, pad_w, pad_w, value);
+    input = &padded_input;
+  }
+  const int32_t batch_size = input->dims()[0];
+  const int32_t h_in = input->dims()[2];
+  const int32_t w_in = input->dims()[3];
+  const int32_t output_channels = output->dims()[1];
+  const int32_t h_out = output->dims()[2];
+  int32_t w_out = output->dims()[3];
+  const int32_t outputdata_channel_stride = h_out * w_out;
+  const int32_t inputdata_channel_stride = h_in * w_in;
+  const int32_t output_batch_stride =
+      output_channels * outputdata_channel_stride;
+  const int32_t input_batch_stride = output_channels * inputdata_channel_stride;
+  const int8_t *input_data = input->data<int8_t>();
+  int8_t *output_data = output->mutable_data<int8_t>();
+  for (int i = 0; i < batch_size; ++i) {
+#pragma omp parallel for
+    for (int j = 0; j < output_channels; ++j) {
+      const int8_t *img_in = input_data + j * inputdata_channel_stride;
+      int8_t *img_out = output_data + j * outputdata_channel_stride;
+      for (int k = 0; k < h_out; ++k) {
+        const int8_t *row0 = img_in + 2 * k * w_in;
+        const int8_t *row1 = img_in + (2 * k + 1) * w_in;
+        const int8_t *row2 = img_in + (2 * k + 2) * w_in;
+#if __ARM_NEON
+        int32_t nw = w_out >> 4;
+        int32_t left_w = w_out & 0xf;
+        int32_t nw1 = left_w >> 3;
+        int32_t left_w1 = left_w & 0x7;
+#if __aarch64__
+        // TODO(wzzju)
+#else
+        if (nw > 0) {
+#define LOOP_LABEL "1"
+          // result: q15
+          asm volatile(
+              "vld2.8 {q0, q1}, [%[row0]]! \n\t"  // q0=0-30, q1=1-31
+              "vld2.8 {q2, q3}, [%[row1]]! \n\t"
+              "vld2.8 {q4, q5}, [%[row2]]! \n\t" LOOP_LABEL
+              ": \n\t"
+              "vmax.s8 q15, q0, q1 \n\t"
+              "vld2.8 {q6, q7}, [%[row0]]! \n\t"  // q0=32-62, q1=33-63
+              "vmax.s8 q14, q2, q3 \n\t"
+              "vmax.s8 q13, q4, q5 \n\t"
+              "vld2.8 {q8, q9}, [%[row1]]! \n\t"
+              "vext.8 q0, q0, q6, #1 \n\t"
+              "vmax.s8 q15, q15, q0 \n\t"
+              "vld2.8 {q10, q11}, [%[row2]]! \n\t"
+              "vext.8 q2, q2, q8, #1 \n\t"
+              "vmax.s8 q14, q14, q2 \n\t"
+              "vext.8 q4, q4, q10, #1 \n\t"
+              "vmax.s8 q13, q13, q4 \n\t"
+              "vmax.s8 q15, q15, q14 \n\t"
+              "vmax.s8 q15, q15, q13 \n\t"
+              "vmov.s8 q0, q6 \n\t"
+              "vmov.s8 q1, q7 \n\t"
+              "vmov.s8 q2, q8 \n\t"
+              "vmov.s8 q3, q9 \n\t"
+              "vmov.s8 q4, q10 \n\t"
+              "vmov.s8 q5, q11 \n\t"
+              "vst1.8 {q15}, [%[img_out]]! \n\t"
+              "subs %[nw], #1 \n\t"
+              "bne " LOOP_LABEL
+              "b \n\t"
+              "sub %[row0], #32 \n\t"
+              "sub %[row1], #32 \n\t"
+              "sub %[row2], #32 \n\t"
+              : [nw] "+r"(nw), [row0] "+r"(row0), [row1] "+r"(row1),
+                [row2] "+r"(row2), [img_out] "+r"(img_out)
+              :
+              : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+                "q8", "q9", "q10", "q11", "q13", "q14", "q15");
+#undef LOOP_LABEL
+        }
+        if (nw1 > 0 || left_w1 > 0) {
+#define PADDLE_LABEL_LESS8 "1"
+#define PADDLE_LABEL_LESS8_SAVE "2"
+#define PADDLE_LABEL_OVER "3"
+          // result: d15
+          asm volatile(
+              "vld2.8 {d0, d1}, [%[row0]]! \n\t"  // d0=0-14, d1=1-15
+              "vld2.8 {d2, d3}, [%[row1]]! \n\t"
+              "vld2.8 {d4, d5}, [%[row2]]! \n\t"
+              "mov r0, #1 \n\t"
+              "cmp %[nw1], #0 \n\t"
+              "beq " PADDLE_LABEL_LESS8
+              "f\n\t"
+              "vmax.s8 d15, d0, d1 \n\t"
+              "vld2.8 {d6, d7}, [%[row0]]! \n\t"  // d0=32-62, d1=33-63
+              "vmax.s8 d14, d2, d3 \n\t"
+              "vmax.s8 d13, d4, d5 \n\t"
+              "vld2.8 {d8, d9}, [%[row1]]! \n\t"
+              "vext.8 d0, d0, d6, #1 \n\t"
+              "vmax.s8 d15, d15, d0 \n\t"
+              "vld2.8 {d10, d11}, [%[row2]]! \n\t"
+              "vext.8 d2, d2, d8, #1 \n\t"
+              "vmax.s8 d14, d14, d2 \n\t"
+              "vext.8 d4, d4, d10, #1 \n\t"
+              "vmax.s8 d13, d13, d4 \n\t"
+              "vmax.s8 d15, d15, d14 \n\t"
+              "vmax.s8 d15, d15, d13 \n\t"
+              "vmov.s8 d0, d6 \n\t"
+              "vmov.s8 d1, d7 \n\t"
+              "vmov.s8 d2, d8 \n\t"
+              "vmov.s8 d3, d9 \n\t"
+              "vmov.s8 d4, d10 \n\t"
+              "vmov.s8 d5, d11 \n\t"
+              "vst1.8 {d15}, [%[img_out]]! \n\t"
+
+              PADDLE_LABEL_LESS8
+              ": \n\t"
+              "cmp %[left_w1], #0 \n\t"
+              "beq " PADDLE_LABEL_OVER
+              "f\n\t"
+              "vmax.s8 d15, d0, d1 \n\t"
+              "vld2.8 {d6, d7}, [%[row0]] \n\t"  // d0=32-62, d1=33-63
+              "vmax.s8 d14, d2, d3 \n\t"
+              "vmax.s8 d13, d4, d5 \n\t"
+              "vld2.8 {d8, d9}, [%[row1]] \n\t"
+              "vext.8 d0, d0, d6, #1 \n\t"
+              "vmax.s8 d15, d15, d0 \n\t"
+              "vld2.8 {d10, d11}, [%[row2]] \n\t"
+              "vext.8 d2, d2, d8, #1 \n\t"
+              "vmax.s8 d14, d14, d2 \n\t"
+              "vext.8 d4, d4, d10, #1 \n\t"
+              "vmax.s8 d13, d13, d4 \n\t"
+              "vmax.s8 d15, d15, d14 \n\t"
+              "vmax.s8 d15, d15, d13 \n\t"
+
+              PADDLE_LABEL_LESS8_SAVE
+              ": \n\t"
+              "vst1.8 {d15}, [%[img_out]], r0\n\t"
+              "add %[row0], %[row0], #2 \n\t"
+              "add %[row1], %[row1], #2 \n\t"
+              "add %[row2], %[row2], #2 \n\t"
+              "vext.8 d15, d15, d15, #1 \n\t"
+              "subs %[left_w1], #1 \n\t"
+              "bgt " PADDLE_LABEL_LESS8_SAVE "b \n\t"
+
+              PADDLE_LABEL_OVER ": \n\t"
+              : [nw1] "+r"(nw1), [left_w1] "+r"(left_w1), [row0] "+r"(row0),
+                [row1] "+r"(row1), [row2] "+r"(row2), [img_out] "+r"(img_out)
+              :
+              : "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
+                "d7", "d8", "d9", "d10", "d11", "d13", "d14", "d15");
+#undef PADDLE_LABEL_OVER
+#undef PADDLE_LABEL_LESS8_SAVE
+#undef PADDLE_LABEL_LESS8
+        }
+#endif  // __aarch64__
+#else
+        int32_t left = w_out;
+        while (left > 0) {
+          const int8_t max0 = std::max(std::max(row0[0], row0[1]), row0[2]);
+          const int8_t max1 = std::max(std::max(row1[0], row1[1]), row1[2]);
+          const int8_t max2 = std::max(std::max(row2[0], row2[1]), row2[2]);
+          *img_out = std::max(std::max(max0, max1), max2);
+          row0 += 2;
+          row1 += 2;
+          row2 += 2;
+          img_out++;
+          left--;
+        }
+#endif  // __ARM_NEON
+      }
+    }
+    input_data += input_batch_stride;
+    output_data += output_batch_stride;
+  }
+}
+void Pool3x3Max_int8(const vector<int> &strides, const vector<int> &paddings,
+                     const Tensor *input, Tensor *output) {
+  const int batch_size = input->dims()[0];
+  const int input_height = input->dims()[2];
+  const int input_width = input->dims()[3];
+  const int output_channels = output->dims()[1];
+  const int output_height = output->dims()[2];
+  const int output_width = output->dims()[3];
+  //  const int _kernel_size = 3;
+  const int stride = strides[0];
+  //  const int stride_width = strides[1];
+  const int padding = paddings[0];
+  //  const int padding_width = paddings[1];
+  const int8_t negative_max = -SCHAR_MAX;
+  const int input_channel_stride = input_height * input_width;
+  const int output_channel_stride = output_height * output_width;
+  const int8_t *input_data = input->data<int8_t>();
+  int8_t *output_data = output->mutable_data<int8_t>();
+  const int input_batch_stride = output_channels * input_channel_stride;
+  const int output_batch_stride = output_channels * output_channel_stride;
+  for (int i = 0; i < batch_size; ++i) {
+#pragma omp parallel for
+    for (int c = 0; c < output_channels; ++c) {
+      const int8_t *input_seg = input_data + c * input_channel_stride;
+      int8_t *output_seg = output_data + c * output_channel_stride;
+      for (int ph = 0; ph < output_height; ph++) {
+        int hstart = ph * stride - padding;
+        int hend = min(hstart + 3, input_height);
+        hstart = max(hstart, 0);
+        for (int pw = 0; pw < output_width; pw++) {
+          int wstart = pw * stride - padding;
+          int wend = min(wstart + 3, input_width);
+          wstart = max(wstart, 0);
+          const int8_t *pos1 = input_seg + hstart * input_width + wstart;
+          const int8_t *pos2 = input_seg + (hstart + 1) * input_width + wstart;
+          const int8_t *pos3 = input_seg + (hstart + 2) * input_width + wstart;
+          int8_t *output_ptr = output_seg + ph * output_width + pw;
+          if (hend - hstart != 3 || wend - wstart != 3) {
+            int8_t max_value = -SCHAR_MAX;
+            for (int h = hstart; h < hend; h++) {
+              for (int w = wstart; w < wend; w++) {
+                int8_t value = input_seg[h * input_width + w];
+                if (value > max_value) {
+                  max_value = value;
+                }
+              }
+            }
+            output_seg[ph * output_width + pw] = max_value;
+          } else {
+#if __ARM_NEON
+#if __aarch64__
+          // TODO(wzzju)
+#else
+            asm volatile(
+                "vld1.8  {d0}, [%[pos1]]        \n\t"
+                "vld1.8  {d1}, [%[pos2]]        \n\t"
+                "vld1.8  {d2}, [%[pos3]]        \n\t"
+                "vmax.s8 d3, d0, d1            \n\t"
+                "vmax.s8 d4, d2, d3            \n\t"
+                "vmov.s8 d4[3],  %[negative_max] \n\t"
+                "vpmax.s8  d5, d4, d4            \n\t"
+                "vpmax.s8  d6, d5, d5             \n\t"
+                "vst1.8 {d6[0]},[%[output_ptr]]    \n\t"
+                :
+                : [pos1] "r"(pos1), [pos2] "r"(pos2), [pos3] "r"(pos3),
+                  [output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max)
+                : "memory", "q0", "q1", "q2", "q3");
+#endif
+#else
+            const int8_t max0 = std::max(std::max(pos1[0], pos1[1]), pos1[2]);
+            const int8_t max1 = std::max(std::max(pos2[0], pos2[1]), pos2[2]);
+            const int8_t max2 = std::max(std::max(pos3[0], pos3[1]), pos3[2]);
+            *output_ptr = std::max(std::max(max0, max1), max2);
+#endif  // __ARM_NEON
+          }
+        }
+      }
+    }
+    input_data += input_batch_stride;
+    output_data += output_batch_stride;
+  }
+}
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
diff --git a/src/operators/math/pooling.cpp b/src/operators/math/pooling.cpp
index f5bcdf7fdb..17df4a26aa 100644
--- a/src/operators/math/pooling.cpp
+++ b/src/operators/math/pooling.cpp
@@ -70,15 +70,15 @@ class PoolFunctor<CPU, PoolProcess, T> {
             int wend = std::min(wstart + ksize_width, input_width);
             wstart = std::max(wstart, 0);
 
-            T ele = pool_process.initial();
+            auto ele = pool_process.initial();
             for (int h = hstart; h < hend; ++h) {
               for (int w = wstart; w < wend; ++w) {
                 pool_process.compute(input_data[h * input_width + w], &ele);
               }
             }
             int pool_size = (hend - hstart) * (wend - wstart);
-            pool_process.finalize(static_cast<T>(pool_size), &ele);
-            output_data[ph * output_width + pw] = ele;
+            pool_process.finalize(static_cast<float>(pool_size), &ele);
+            output_data[ph * output_width + pw] = static_cast<T>(ele);
           }
         }
         input_data += input_stride;
@@ -88,8 +88,10 @@ class PoolFunctor<CPU, PoolProcess, T> {
   }
 };
 
-template class PoolFunctor<CPU, math::AvgPool<float>, float>;
+template class PoolFunctor<CPU, math::AvgPool<float, float>, float>;
 template class PoolFunctor<CPU, math::MaxPool<float>, float>;
+template class PoolFunctor<CPU, math::AvgPool<int8_t, int32_t>, int8_t>;
+template class PoolFunctor<CPU, math::MaxPool<int8_t>, int8_t>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/math/pooling.h b/src/operators/math/pooling.h
index 3ca868fa4d..4d94550cc3 100644
--- a/src/operators/math/pooling.h
+++ b/src/operators/math/pooling.h
@@ -16,6 +16,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <climits>
+#include <cmath>
 #include "common/log.h"
 #include "framework/tensor.h"
 #include "pool_2x2.h"
@@ -37,24 +39,42 @@ namespace math {
  * in pool pooling, and finally takes the average.
  *        MaxPoolGrad and AvgPoolGrad are gradient operations respectively.
  */
-template <class T>
+template <typename T>
 class MaxPool {
  public:
-  inline T initial() { return static_cast<T>(-FLT_MAX); }
+  inline T initial() {
+    if (typeid(T) == typeid(int8_t)) {
+      return static_cast<T>(-SCHAR_MAX);
+    }
+    return static_cast<T>(-FLT_MAX);
+  }
 
   inline void compute(const T &x, T *y) { *y = *y > x ? *y : x; }
 
   inline void finalize(const T &pool_field, T *y) {}
 };
 
-template <class T>
+template <typename Itype, typename Otype>
 class AvgPool {
  public:
-  inline T initial() { return static_cast<T>(0); }
-
-  inline void compute(const T &x, T *y) { *y += x; }
-
-  inline void finalize(const T &pool_field, T *y) { *y /= pool_field; }
+  inline Otype initial() { return static_cast<Otype>(0); }
+
+  inline void compute(const Itype &x, Otype *y) { *y += x; }
+
+  inline void finalize(const float &pool_field, Otype *y) {
+    if (typeid(Itype) == typeid(int8_t)) {
+      float tmp = *y / pool_field;
+      if (tmp > SCHAR_MAX) {
+        *y = SCHAR_MAX;
+      } else if (tmp < -SCHAR_MAX) {
+        *y = -SCHAR_MAX;
+      } else {
+        *y = static_cast<Otype>(std::round(tmp));
+      }
+    } else {
+      *y /= pool_field;
+    }
+  }
 };
 
 template <typename DeviceType, typename PoolProcess, typename T>
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 0f489995cb..3d202e2bd1 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -269,8 +269,8 @@ if (NOT FOUND_MATCH)
 
 
     #gen test
-    ADD_EXECUTABLE(test-pool operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-pool paddle-mobile)
+    ADD_EXECUTABLE(test-pool-op operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h)
+    target_link_libraries(test-pool-op paddle-mobile)
 
     #gen test
     ADD_EXECUTABLE(test-softmax operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h)
diff --git a/test/operators/test_fusion_conv_add_relu_int8_op.cpp b/test/operators/test_fusion_conv_add_relu_int8_op.cpp
index add38b34f1..42c68e5d04 100644
--- a/test/operators/test_fusion_conv_add_relu_int8_op.cpp
+++ b/test/operators/test_fusion_conv_add_relu_int8_op.cpp
@@ -251,7 +251,7 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
   attrs["groups"].Set<int>(1);
   attrs["axis"].Set<int>(0);
 
-  auto *op = new operators::FusionConvAddReluInt8Op<CPU, int8_t>(
+  auto *op = new operators::FusionConvAddReluInt8Op<CPU, T>(
       "fusion_conv_add_relu_int8", inputs, outputs, attrs, scope);
   op->InferShape();
   op->Init();
diff --git a/test/operators/test_pool_op.cpp b/test/operators/test_pool_op.cpp
index 09470caf82..48bf00c2ac 100644
--- a/test/operators/test_pool_op.cpp
+++ b/test/operators/test_pool_op.cpp
@@ -12,30 +12,279 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <iostream>
 #include "../test_include.h"
+#include "operators/kernel/central-arm-func/pool_arm_func.h"
 #include "operators/pool_op.h"
 
-int main() {
-  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
-  auto program = loader.Load(std::string(g_googlenet));
-  if (program.originProgram == nullptr) {
-    DLOG << "program read file";
+namespace paddle_mobile {
+static int PoolOutputSize(int input_size, int filter_size, int padding,
+                          int stride, bool ceil_mode) {
+  int output_size;
+  if (!ceil_mode) {
+    output_size = (input_size - filter_size + 2 * padding) / stride + 1;
+  } else {
+    output_size =
+        (input_size - filter_size + 2 * padding + stride - 1) / stride + 1;
   }
+  return output_size;
+}
+
+template <typename T>
+static void PoolAvgPad0(std::vector<int> ksize, std::vector<int> strides,
+                        const framework::Tensor *input,
+                        framework::Tensor *out) {
+  const int32_t batch_size = input->dims()[0];
+  const int32_t input_c = input->dims()[1];
+  const int32_t input_h = input->dims()[2];
+  const int32_t input_w = input->dims()[3];
+  const int32_t out_c = out->dims()[1];
+  const int32_t out_h = out->dims()[2];
+  const int32_t out_w = out->dims()[3];
+  const int32_t kernel_h = ksize[0];
+  const int32_t kernel_w = ksize[1];
+  const int32_t stride_h = strides[0];
+  const int32_t stride_w = strides[1];
+  const int32_t inputdata_channel_stride = input_h * input_w;
+  const int32_t input_batch_stride = input_c * inputdata_channel_stride;
+  const int32_t outputdata_channel_stride = out_h * out_w;
+  const int32_t output_batch_stride = out_c * outputdata_channel_stride;
+  T *out_data = out->mutable_data<T>();
+  const T *input_data = input->data<T>();
+  const T **rows = new const T *[kernel_h];
+  for (int i = 0; i < batch_size; ++i) {
+    for (int j = 0; j < out_c; ++j) {
+      const T *img_in = input_data + j * inputdata_channel_stride;
+      T *img_out = out_data + j * outputdata_channel_stride;
+      for (int k = 0; k < out_h; ++k) {
+        for (int m = 0; m < kernel_h; ++m) {
+          rows[m] = img_in + (stride_h * k + m) * input_w;
+        }
+        int32_t left = out_w;
+        while (left > 0) {
+          float tmp = 0;
+          for (int m = 0; m < kernel_h; ++m) {
+            for (int l = 0; l < kernel_w; ++l) {
+              tmp += rows[m][l];
+            }
+          }
+          if (typeid(T) == typeid(int8_t)) {
+            tmp = tmp / (kernel_h * kernel_w);
+            if (tmp < -127) {
+              *img_out = -127;
+            } else if (tmp > 127) {
+              *img_out = 127;
+            } else {
+              *img_out = static_cast<T>(std::round(tmp));
+            }
+          } else {
+            *img_out = static_cast<T>(tmp / (kernel_h * kernel_w));
+          }
+          for (int m = 0; m < kernel_h; ++m) {
+            rows[m] += stride_w;
+          }
+          img_out++;
+          left--;
+        }
+      }
+    }
+    input_data += input_batch_stride;
+    out_data += output_batch_stride;
+  }
+  delete[] rows;
+}
+
+template <typename T, int CeilMode, int PoolType, int Kernel, int Pad,
+          int Stride>
+int TestPoolOp(int in_channels, int in_height, int in_width) {
+  int kernel_h = Kernel;
+  int kernel_w = Kernel;
+  int pad_h = Pad;
+  int pad_w = Pad;
+  int stride_h = Stride;
+  int stride_w = Stride;
+  bool ceil_mode = CeilMode != 0;
+  std::string pooling_type = (PoolType == 0 ? "max" : "avg");
+
+  int batch_size = 1;
+  int input_c = in_channels;
+  int input_h = in_height;
+  int input_w = in_width;
+
+  framework::DDim input_shape =
+      framework::make_ddim({batch_size, input_c, input_h, input_w});
+
+  std::vector<int64_t> output_shape_v({batch_size, input_c});
+  output_shape_v.push_back(
+      PoolOutputSize(input_h, kernel_h, pad_h, stride_h, ceil_mode));
+  output_shape_v.push_back(
+      PoolOutputSize(input_w, kernel_w, pad_w, stride_w, ceil_mode));
+
+  framework::DDim output_shape = framework::make_ddim(output_shape_v);
 
-  Executor4Test<paddle_mobile::CPU,
-                paddle_mobile::operators::PoolOp<paddle_mobile::CPU, float>>
-      executor(program, "pool2d");
+  VariableNameMap inputs;
+  VariableNameMap outputs;
+  auto scope = std::make_shared<framework::Scope>();
+  inputs["X"] = std::vector<std::string>({"input"});
+  outputs["Out"] = std::vector<std::string>({"output"});
 
-  paddle_mobile::framework::Tensor input;
-  SetupTensor<float>(&input, {1, 64, 112, 112}, static_cast<float>(0),
-                     static_cast<float>(1));
-  auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 56, 56});
-  auto output =
-      executor.Predict(input, "conv2d_0.tmp_1", "pool2d_0.tmp_0", out_ddim);
+  auto input_var = scope.get()->Var("input");
+  auto input = input_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<T>(input, input_shape, -127, 127);
 
-  float *output_ptr = output->data<float>();
-  for (int j = 0; j < output->numel(); ++j) {
-    DLOG << " value of output: " << output_ptr[j];
+  auto output_var = scope.get()->Var("output");
+  framework::AttributeMap attrs;
+  attrs["pooling_type"].SetString(pooling_type);
+  attrs["ksize"].Set<vector<int>>(std::vector<int>({kernel_h, kernel_w}));
+  attrs["strides"].Set<vector<int>>(std::vector<int>({stride_h, stride_w}));
+  attrs["paddings"].Set<vector<int>>(std::vector<int>({pad_h, pad_w}));
+  attrs["ceil_mode"].Set<bool>(false);
+  attrs["global_pooling"].Set<bool>(false);
+
+  auto *op = new operators::PoolOp<CPU, float>("pool2d", inputs, outputs, attrs,
+                                               scope);
+  op->InferShape();
+  op->Init();
+  op->Run();
+
+  framework::Tensor output_cmp;
+  output_cmp.mutable_data<T>(output_shape);
+  if (pooling_type == "avg" && pad_h == 0 && pad_h == pad_w) {
+    PoolAvgPad0<T>(std::vector<int>{kernel_h, kernel_w},
+                   std::vector<int>{stride_h, stride_w}, input, &output_cmp);
+  } else {
+    if (typeid(T) == typeid(int8_t)) {
+      operators::PoolBasic<int8_t, int32_t>(
+          pooling_type, std::vector<int>{kernel_h, kernel_w},
+          std::vector<int>{stride_h, stride_w}, std::vector<int>{pad_h, pad_w},
+          input, &output_cmp);
+    } else {
+      operators::PoolBasic<float, float>(
+          pooling_type, std::vector<int>{kernel_h, kernel_w},
+          std::vector<int>{stride_h, stride_w}, std::vector<int>{pad_h, pad_w},
+          input, &output_cmp);
+    }
+  }
+
+  // compare results
+  int eq = 0;
+  int neq = 0;
+  auto output = output_var->template Get<framework::LoDTensor>();
+  const T *output_data = output->data<T>();
+  T *output_cmp_data = output_cmp.data<T>();
+  for (int i = 0; i < output->numel(); ++i) {
+    PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],
+                          "The execution of test_pool_op is failed!");
+    if (output_data[i] == output_cmp_data[i]) {
+      ++eq;
+    } else {
+      ++neq;
+    }
   }
+  std::cout << "eq = " << eq << ", neq = " << neq << std::endl;
+  delete op;
+
   return 0;
 }
+}  // namespace paddle_mobile
+
+int main(int argc, char *argv[]) {
+  if (argc < 4) {
+    LOG(paddle_mobile::kLOG_INFO)
+        << "Usage:\n"
+        << "  ./test-pool-op in_channels in_height in_width \n"
+        << "  params:\n"
+        << "   -in_channels: int, input image's channels\n"
+        << "   -in_height: int, input image's height\n"
+        << "   -in_width: int, input image's width\n";
+    return 1;
+  }
+  int in_channels = atoi(argv[1]);
+  int in_height = atoi(argv[2]);
+  int in_width = atoi(argv[3]);
+  // kernel = 3, pad = 1, stride = 1
+  LOG(paddle_mobile::kLOG_INFO)
+      << "float, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=1";
+  paddle_mobile::TestPoolOp<float, 0, 0, 3, 1, 1>(in_channels, in_height,
+                                                  in_width);
+  // kernel = 3, pad = 0, stride = 2
+  LOG(paddle_mobile::kLOG_INFO)
+      << "float, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=2";
+  paddle_mobile::TestPoolOp<float, 0, 0, 3, 0, 2>(in_channels, in_height,
+                                                  in_width);
+  // kernel = 3, pad = 0, stride = 1
+  LOG(paddle_mobile::kLOG_INFO)
+      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=1";
+  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 0, 1>(in_channels, in_height,
+                                                   in_width);
+  // kernel = 3, pad = 1, stride = 1
+  LOG(paddle_mobile::kLOG_INFO)
+      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=1";
+  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 1, 1>(in_channels, in_height,
+                                                   in_width);
+  // kernel = 3, pad = 2, stride = 1
+  LOG(paddle_mobile::kLOG_INFO)
+      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=2, stride=1";
+  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 2, 1>(in_channels, in_height,
+                                                   in_width);
+  // kernel = 3, pad = 0, stride = 2
+  LOG(paddle_mobile::kLOG_INFO)
+      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=2";
+  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 0, 2>(in_channels, in_height,
+                                                   in_width);
+  // kernel = 3, pad = 1, stride = 2
+  LOG(paddle_mobile::kLOG_INFO)
+      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=2";
+  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 1, 2>(in_channels, in_height,
+                                                   in_width);
+  // kernel = 3, pad = 0, stride = 2
+  LOG(paddle_mobile::kLOG_INFO)
+      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=2, stride=2";
+  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 2, 2>(in_channels, in_height,
+                                                   in_width);
+  // kernel = 3, pad = 3, stride = 3
+  LOG(paddle_mobile::kLOG_INFO)
+      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=3, stride=3";
+  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 3, 3>(in_channels, in_height,
+                                                   in_width);
+  // kernel = 7, pad = 0, stride = 1
+  LOG(paddle_mobile::kLOG_INFO)
+      << "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=1";
+  paddle_mobile::TestPoolOp<int8_t, 0, 1, 7, 0, 1>(in_channels, in_height,
+                                                   in_width);
+  // kernel = 7, pad = 0, stride = 2
+  LOG(paddle_mobile::kLOG_INFO)
+      << "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=2";
+  paddle_mobile::TestPoolOp<int8_t, 0, 1, 7, 0, 2>(in_channels, in_height,
+                                                   in_width);
+  // kernel = 7, pad = 0, stride = 3
+  LOG(paddle_mobile::kLOG_INFO)
+      << "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=3";
+  paddle_mobile::TestPoolOp<int8_t, 0, 1, 7, 0, 3>(in_channels, in_height,
+                                                   in_width);
+  // kernel = 3, pad = 0, stride = 1
+  LOG(paddle_mobile::kLOG_INFO)
+      << "int8_t, ceil_mode=false, pooling_type=avg, kernel=3, pad=0, stride=1";
+  paddle_mobile::TestPoolOp<int8_t, 0, 1, 3, 0, 1>(in_channels, in_height,
+                                                   in_width);
+  // kernel = 3, pad = 0, stride = 3
+  LOG(paddle_mobile::kLOG_INFO)
+      << "int8_t, ceil_mode=false, pooling_type=avg, kernel=3, pad=0, stride=3";
+  paddle_mobile::TestPoolOp<int8_t, 0, 1, 3, 0, 3>(in_channels, in_height,
+                                                   in_width);
+  // kernel = 7, pad = 0, stride = 1
+  LOG(paddle_mobile::kLOG_INFO)
+      << "float, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=1";
+  paddle_mobile::TestPoolOp<float, 0, 1, 7, 0, 1>(in_channels, in_height,
+                                                  in_width);
+  // kernel = 7, pad = 0, stride = 4
+  LOG(paddle_mobile::kLOG_INFO)
+      << "float, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=4";
+  paddle_mobile::TestPoolOp<float, 0, 1, 7, 0, 4>(in_channels, in_height,
+                                                  in_width);
+  // kernel = 5, pad = 0, stride = 1
+  LOG(paddle_mobile::kLOG_INFO)
+      << "float, ceil_mode=false, pooling_type=avg, kernel=5, pad=0, stride=1";
+  paddle_mobile::TestPoolOp<float, 0, 1, 5, 0, 1>(in_channels, in_height,
+                                                  in_width);
+}
-- 
GitLab