[LITE][ARM] Add pool operator of arm cpu. test=develop

e1a9d563 · 开心的小妮 · Tensor Tang · e6c158fb · e1a9d563 · e1a9d563
12 changed files
--- a/paddle/fluid/lite/arm/math/CMakeLists.txt
+++ b/paddle/fluid/lite/arm/math/CMakeLists.txt
@@ -9,6 +9,7 @@ cc_library(math_arm SRCS
    packed_sgemm.cc 
    softmax.cc 
    scale.cc
+    pooling.cc
    elementwise.cc
    sgemv.cc
    type_trans.cpp

--- a/paddle/fluid/lite/arm/math/pooling.cc
+++ b/paddle/fluid/lite/arm/math/pooling.cc
--- a/paddle/fluid/lite/arm/math/pooling.h
+++ b/paddle/fluid/lite/arm/math/pooling.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/lite/utils/cp_logging.h"
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+// fp32 pooling routines for ARM CPUs.
+//
+// Every function below shares one signature, so a caller may pick one at run
+// time and invoke it through a single call site:
+//   din, dout          input / output buffers; the ARM pool kernel passes
+//                      float pointers for both.
+//   num, chout, hout,  output batch, channel, height and width (the ARM pool
+//   wout               kernel passes out_dims[0..3]).
+//   chin, hin, win     input channel, height and width (in_dims[1..3]).
+//   ksize, strides,    per-axis window size, stride and padding; index 0 is the
+//   paddings           height axis and index 1 the width axis.
+//   global_pooling, exclusive, adaptive, ceil_mode, use_quantizer,
+//   pooling_type       forwarded unchanged from operators::PoolParam; the ARM
+//                      pool kernel distinguishes "max" and "avg".
+// NOTE(review): the definitions live in pooling.cc; which of the forwarded
+// flags each routine actually honours must be confirmed there.
+//
+// Generic routine; the ARM pool kernel uses it when no specialised one matches.
+void pooling_basic(const void* din, void* dout, int num, int chout, int hout,
+                   int wout, int chin, int hin, int win,
+                   const std::vector<int>& ksize,
+                   const std::vector<int>& strides,
+                   const std::vector<int>& paddings, bool global_pooling,
+                   bool exclusive, bool adaptive, bool ceil_mode,
+                   bool use_quantizer, const std::string& pooling_type);
+// Used by the ARM pool kernel whenever global_pooling is set.
+void pooling_global(const void* din, void* dout, int num, int chout, int hout,
+                    int wout, int chin, int hin, int win,
+                    const std::vector<int>& ksize,
+                    const std::vector<int>& strides,
+                    const std::vector<int>& paddings, bool global_pooling,
+                    bool exclusive, bool adaptive, bool ceil_mode,
+                    bool use_quantizer, const std::string& pooling_type);
+// Max pooling, selected for a 2x2 window with stride 2.
+void pooling2x2s2_max(const void* din, void* dout, int num, int chout, int hout,
+                      int wout, int chin, int hin, int win,
+                      const std::vector<int>& ksize,
+                      const std::vector<int>& strides,
+                      const std::vector<int>& paddings, bool global_pooling,
+                      bool exclusive, bool adaptive, bool ceil_mode,
+                      bool use_quantizer, const std::string& pooling_type);
+// Average pooling, selected for a 2x2 window with stride 2.
+void pooling2x2s2_ave(const void* din, void* dout, int num, int chout, int hout,
+                      int wout, int chin, int hin, int win,
+                      const std::vector<int>& ksize,
+                      const std::vector<int>& strides,
+                      const std::vector<int>& paddings, bool global_pooling,
+                      bool exclusive, bool adaptive, bool ceil_mode,
+                      bool use_quantizer, const std::string& pooling_type);
+// Max pooling, selected for a 3x3 window with stride 1 and padding 1.
+void pooling3x3s1p1_max(const void* din, void* dout, int num, int chout,
+                        int hout, int wout, int chin, int hin, int win,
+                        const std::vector<int>& ksize,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings, bool global_pooling,
+                        bool exclusive, bool adaptive, bool ceil_mode,
+                        bool use_quantizer, const std::string& pooling_type);
+// Average pooling, selected for a 3x3 window with stride 1 and padding 1.
+void pooling3x3s1p1_ave(const void* din, void* dout, int num, int chout,
+                        int hout, int wout, int chin, int hin, int win,
+                        const std::vector<int>& ksize,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings, bool global_pooling,
+                        bool exclusive, bool adaptive, bool ceil_mode,
+                        bool use_quantizer, const std::string& pooling_type);
+// Max pooling, selected for a 3x3 window with stride 2 and padding 1.
+void pooling3x3s2p1_max(const void* din, void* dout, int num, int chout,
+                        int hout, int wout, int chin, int hin, int win,
+                        const std::vector<int>& ksize,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings, bool global_pooling,
+                        bool exclusive, bool adaptive, bool ceil_mode,
+                        bool use_quantizer, const std::string& pooling_type);
+// Max pooling, selected for a 3x3 window with stride 2 and padding 0.
+void pooling3x3s2p0_max(const void* din, void* dout, int num, int chout,
+                        int hout, int wout, int chin, int hin, int win,
+                        const std::vector<int>& ksize,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings, bool global_pooling,
+                        bool exclusive, bool adaptive, bool ceil_mode,
+                        bool use_quantizer, const std::string& pooling_type);
+// Average pooling, selected for a 3x3 window with stride 2 and padding 1.
+void pooling3x3s2p1_ave(const void* din, void* dout, int num, int chout,
+                        int hout, int wout, int chin, int hin, int win,
+                        const std::vector<int>& ksize,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings, bool global_pooling,
+                        bool exclusive, bool adaptive, bool ceil_mode,
+                        bool use_quantizer, const std::string& pooling_type);
+// Average pooling, selected for a 3x3 window with stride 2 and padding 0.
+void pooling3x3s2p0_ave(const void* din, void* dout, int num, int chout,
+                        int hout, int wout, int chin, int hin, int win,
+                        const std::vector<int>& ksize,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings, bool global_pooling,
+                        bool exclusive, bool adaptive, bool ceil_mode,
+                        bool use_quantizer, const std::string& pooling_type);
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
--- a/paddle/fluid/lite/kernels/arm/CMakeLists.txt
+++ b/paddle/fluid/lite/kernels/arm/CMakeLists.txt
@@ -11,12 +11,14 @@ cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math
 cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(conv_compute_arm SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(elementwise_add_compute_arm SRCS elementwise_add_compute.cc DEPS ${lite_kernel_deps} math_arm)
+cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
 lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm)
 lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
 lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
 lite_cc_test(test_conv_compute_arm SRCS conv_compute_test.cc DEPS conv_compute_arm)
 lite_cc_test(test_elementwise_add_compute_arm SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_arm)
+lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm)
 set(arm_kernels
    fc_compute_arm
@@ -26,6 +28,7 @@ set(arm_kernels
    softmax_compute_arm
    conv_compute_arm
    elementwise_add_compute_arm
+    pool_compute_arm
    )
 set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels")

--- a/paddle/fluid/lite/kernels/arm/pool_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/pool_compute.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/lite/kernels/arm/pool_compute.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/lite/arm/math/funcs.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+#include "paddle/fluid/lite/core/type_system.h"
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+// Executes fp32 NCHW pooling on ARM.
+//
+// Reads the attached operators::PoolParam, picks the most specialised math
+// routine for the window / stride / padding combination and invokes it once:
+//   global_pooling                       -> pooling_global
+//   2x2 window, stride 2                 -> pooling2x2s2_{max,ave}
+//   3x3 window, stride 1, padding 1      -> pooling3x3s1p1_{max,ave}
+//   3x3 window, stride 2, padding 0 / 1  -> pooling3x3s2p{0,1}_{max,ave}
+//   everything else                      -> pooling_basic
+// A specialised routine is chosen only when the height and width axes agree on
+// window size, stride and (for 3x3) padding, and pooling_type is "max" or
+// "avg". Any other combination goes to pooling_basic, so exactly one routine
+// is always invoked.
+//
+// Side effect: under global pooling param.ksize / param.paddings are rewritten
+// in place so the window spans the whole input plane with zero padding.
+void PoolCompute::Run() {
+  auto& param = Param<operators::PoolParam>();
+  auto& in_dims = param.x->dims();
+  auto& out_dims = param.output->dims();
+  const float* din = param.x->data<float>();
+  float* dout = param.output->mutable_data<float>();
+  std::vector<int>& ksize = param.ksize;
+  std::vector<int>& strides = param.strides;
+  std::vector<int>& paddings = param.paddings;
+  const std::string& pooling_type = param.pooling_type;
+  const bool global_pooling = param.global_pooling;
+  const bool exclusive = param.exclusive;
+  const bool adaptive = param.adaptive;
+  const bool ceil_mode = param.ceil_mode;
+  const bool use_quantizer = param.use_quantizer;
+  if (global_pooling) {
+    // The window covers the full spatial extent, so padding is meaningless.
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      paddings[i] = 0;
+      ksize[i] = static_cast<int>(in_dims[i + 2]);
+    }
+  }
+  const bool is_max = (pooling_type == "max");
+  const bool is_avg = (pooling_type == "avg");
+  // All math routines share one signature; start from the generic one and
+  // upgrade to a specialised routine only when every precondition holds.
+  auto kernel = &lite::arm::math::pooling_basic;
+  if (global_pooling) {
+    kernel = &lite::arm::math::pooling_global;
+  } else if ((is_max || is_avg) && ksize[0] == ksize[1] &&
+             strides[0] == strides[1]) {
+    const int window = ksize[0];
+    const int stride = strides[0];
+    if (window == 2 && stride == 2) {
+      // NOTE(review): padding is not inspected for the 2x2s2 routines (same
+      // as before this change) - confirm they handle non-zero padding.
+      kernel = is_max ? &lite::arm::math::pooling2x2s2_max
+                      : &lite::arm::math::pooling2x2s2_ave;
+    } else if (window == 3 && paddings[0] == paddings[1]) {
+      // The pN routines are named for a single padding value, so they are
+      // only valid when both axes use that same padding.
+      const int pad = paddings[0];
+      if (stride == 1 && pad == 1) {
+        kernel = is_max ? &lite::arm::math::pooling3x3s1p1_max
+                        : &lite::arm::math::pooling3x3s1p1_ave;
+      } else if (stride == 2 && pad == 0) {
+        kernel = is_max ? &lite::arm::math::pooling3x3s2p0_max
+                        : &lite::arm::math::pooling3x3s2p0_ave;
+      } else if (stride == 2 && pad == 1) {
+        kernel = is_max ? &lite::arm::math::pooling3x3s2p1_max
+                        : &lite::arm::math::pooling3x3s2p1_ave;
+      }
+    }
+  }
+  kernel(din, dout, out_dims[0], out_dims[1], out_dims[2], out_dims[3],
+         in_dims[1], in_dims[2], in_dims[3], ksize, strides, paddings,
+         global_pooling, exclusive, adaptive, ceil_mode, use_quantizer,
+         pooling_type);
+}
+// Reports the target this kernel is built for: TARGET(kARM).
+TargetType PoolCompute::target() const {
+  return TARGET(kARM);
+}
+// Reports the precision this kernel computes in: PRECISION(kFloat).
+PrecisionType PoolCompute::precision() const {
+  return PRECISION(kFloat);
+}
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+// Register PoolCompute as the "def" variant of the pool kernel for
+// kARM / kFloat / kNCHW, binding input "X" and output "Out" to ARM tensors.
+REGISTER_LITE_KERNEL(pool, kARM, kFloat, kNCHW,
+                     paddle::lite::kernels::arm::PoolCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+    .Finalize();
--- a/paddle/fluid/lite/kernels/arm/pool_compute.h
+++ b/paddle/fluid/lite/kernels/arm/pool_compute.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/operators/pool_op.h"
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+// ARM fp32 pooling kernel. Run() reads its operators::PoolParam and forwards
+// to the routines declared in paddle/fluid/lite/arm/math.
+class PoolCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  // Parameter struct this kernel expects to be attached.
+  using param_t = operators::PoolParam;
+  // Performs the pooling described by the attached parameters.
+  void Run() override;
+  // Returns TARGET(kARM).
+  TargetType target() const override;
+  // Returns PRECISION(kFloat).
+  PrecisionType precision() const override;
+  virtual ~PoolCompute() = default;
+};
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
--- a/paddle/fluid/lite/kernels/arm/pool_compute_test.cc
+++ b/paddle/fluid/lite/kernels/arm/pool_compute_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/lite/kernels/arm/pool_compute.h"
+#include <gtest/gtest.h>
+#include <limits>
+#include <string>
+#include <vector>
+#include "paddle/fluid/lite/arm/math/funcs.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+// Naive NCHW reference pooling used to validate the ARM kernel.
+//
+// For every output element the window is clipped to the input plane, then
+// reduced with max ("max") or a sum followed by a division ("avg"):
+//   exclusive == true   divide by the number of in-bounds elements;
+//   exclusive == false  divide by the window area, counting padding cells.
+// Under global pooling the window is the whole input plane with zero padding;
+// this is resolved BEFORE the window extents are used (previously ksize was
+// overridden only after window_h / window_w had already been read from it).
+// Any other pooling_type leaves the first in-window element as the result.
+void pool_compute_ref(const operators::PoolParam& param) {
+  auto& in_dims = param.x->dims();
+  auto& out_dims = param.output->dims();
+  const float* src_ptr = param.x->data<const float>();
+  float* dst_ptr = param.output->mutable_data<float>();
+  const bool is_max = (param.pooling_type == "max");
+  const bool is_avg = (param.pooling_type == "avg");
+  const bool exclusive = param.exclusive;
+  const int in_n = static_cast<int>(in_dims[0]);
+  const int in_c = static_cast<int>(in_dims[1]);
+  const int in_h = static_cast<int>(in_dims[2]);
+  const int in_w = static_cast<int>(in_dims[3]);
+  const int out_h = static_cast<int>(out_dims[2]);
+  const int out_w = static_cast<int>(out_dims[3]);
+  // Element counts of one image / one channel plane, input and output.
+  const int size_in_n = in_c * in_h * in_w;
+  const int size_in_c = in_h * in_w;
+  const int size_out_n = in_c * out_h * out_w;
+  const int size_out_c = out_h * out_w;
+  const int stride_h = param.strides[0];
+  const int stride_w = param.strides[1];
+  // Global pooling defaults; overwritten from the attributes otherwise.
+  int window_h = in_h;
+  int window_w = in_w;
+  int pad_h = 0;
+  int pad_w = 0;
+  if (!param.global_pooling) {
+    window_h = param.ksize[0];
+    window_w = param.ksize[1];
+    pad_h = param.paddings[0];
+    pad_w = param.paddings[1];
+  }
+  for (int ind_n = 0; ind_n < in_n; ++ind_n) {
+    for (int ind_c = 0; ind_c < in_c; ++ind_c) {
+      for (int ind_h = 0; ind_h < out_h; ++ind_h) {
+        // Window rows [sh, eh) in input coordinates, clipped to the plane.
+        int sh = ind_h * stride_h;
+        int eh = sh + window_h;
+        sh = (sh - pad_h) < 0 ? 0 : sh - pad_h;
+        eh = (eh - pad_h) > in_h ? in_h : eh - pad_h;
+        for (int ind_w = 0; ind_w < out_w; ++ind_w) {
+          // Window columns [sw, ew), clipped the same way.
+          int sw = ind_w * stride_w;
+          int ew = sw + window_w;
+          sw = (sw - pad_w) < 0 ? 0 : sw - pad_w;
+          ew = (ew - pad_w) > in_w ? in_w : ew - pad_w;
+          float result = static_cast<float>(0);
+          const int dst_ind =
+              ind_n * size_out_n + ind_c * size_out_c + ind_h * out_w + ind_w;
+          for (int kh = sh; kh < eh; ++kh) {
+            for (int kw = sw; kw < ew; ++kw) {
+              const int src_ind =
+                  ind_n * size_in_n + ind_c * size_in_c + kh * in_w + kw;
+              const float value = src_ptr[src_ind];
+              if (kh == sh && kw == sw) {
+                // The first element seeds both the max and the running sum.
+                result = value;
+              } else if (is_max) {
+                result = result >= value ? result : value;
+              } else if (is_avg) {
+                result += value;
+              }
+            }
+          }
+          if (is_avg) {
+            if (exclusive) {
+              // Pooling_average_exclude_padding
+              result /= (ew - sw) * (eh - sh);
+            } else {
+              // Pooling_average_include_padding: when the window touches the
+              // bottom / right edge, count cells up to the padded border.
+              int bh = window_h;
+              int bw = window_w;
+              if (ew == in_w) {
+                bw = sw + window_w >= in_w + pad_w ? in_w + pad_w
+                                                   : sw + window_w;
+                bw -= sw;
+              }
+              if (eh == in_h) {
+                bh = sh + window_h >= in_h + pad_h ? in_h + pad_h
+                                                   : sh + window_h;
+                bh -= sh;
+              }
+              result /= bh * bw;
+            }
+          }
+          dst_ptr[dst_ind] = result;
+        }
+      }
+    }
+  }
+}
+// A default-constructed kernel must report fp32 precision and the ARM target.
+TEST(pool_arm, init) {
+  PoolCompute pool;
+  ASSERT_EQ(pool.precision(), PRECISION(kFloat));
+  ASSERT_EQ(pool.target(), TARGET(kARM));
+}
+// Cross-checks PoolCompute against pool_compute_ref for global pooling over a
+// sweep of batch / channel / height / width values and both pooling types.
+// Each input element is set to its own flat index.
+// NOTE(review): the absolute tolerance is 1, which is loose; presumably it
+// absorbs fp32 rounding on large index values - confirm before tightening.
+TEST(pool_arm, compute) {
+  PoolCompute pool;
+  operators::PoolParam param;
+  lite::Tensor x;
+  lite::Tensor output;
+  lite::Tensor output_ref;
+  // Attributes that stay fixed for the whole sweep.
+  param.x = &x;
+  param.exclusive = true;
+  param.adaptive = false;
+  param.ceil_mode = false;
+  param.use_quantizer = false;
+  for (const char* pooling_type : {"avg", "max"}) {
+    for (const bool global_pooling : {true}) {
+      for (const int stride : {2}) {
+        for (const int pad : {0}) {
+          for (const int n : {1, 3, 4, 11}) {
+            for (const int c : {1, 3, 11, 4, 1024}) {
+              for (const int h : {3, 1, 11, 4, 1}) {
+                for (const int w : {1, 3, 4, 12, 1}) {
+                  LOG(INFO) << "n:" << n << " c:" << c << " h:" << h
+                            << " w:" << w << " stride:" << stride
+                            << " pad:" << pad
+                            << " pooling_type:" << pooling_type
+                            << " global_pooling:" << global_pooling;
+                  // Global pooling collapses each plane to a single value.
+                  const std::vector<int64_t> in_shape{n, c, h, w};
+                  const std::vector<int64_t> out_shape{n, c, 1, 1};
+                  x.Resize(DDim(in_shape));
+                  output.Resize(DDim(out_shape));
+                  output_ref.Resize(DDim(out_shape));
+                  float* x_data = x.mutable_data<float>();
+                  const auto x_size = x.dims().production();
+                  for (int i = 0; i < x_size; ++i) {
+                    x_data[i] = i;
+                  }
+                  // Per-iteration attributes.
+                  param.output = &output;
+                  param.pooling_type = pooling_type;
+                  param.ksize = {h, w};
+                  param.global_pooling = global_pooling;
+                  param.strides = {stride, stride};
+                  param.paddings = {pad, pad};
+                  // Kernel under test writes into `output`.
+                  pool.SetParam(param);
+                  pool.Run();
+                  // Reference writes into `output_ref`.
+                  param.output = &output_ref;
+                  pool_compute_ref(param);
+                  LOG(INFO) << "pool_compute_ref(param) end";
+                  auto* output_data = output.mutable_data<float>();
+                  auto* output_ref_data = output_ref.mutable_data<float>();
+                  const auto out_size = output.dims().production();
+                  for (int i = 0; i < out_size; i++) {
+                    EXPECT_NEAR(output_data[i], output_ref_data[i], 1);
+                  }
+                  LOG(INFO) << "compare pass";
+                }  // w
+              }    // h
+            }      // c
+          }        // n
+        }          // pad
+      }            // stride
+    }              // global_pooling
+  }                // pooling_type
+}
+// The global kernel registry must yield at least one non-null kARM / kFloat
+// kernel for the op type "pool".
+TEST(pool, retrive_op) {
+  auto pool =
+      KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>("pool");
+  ASSERT_FALSE(pool.empty());
+  ASSERT_TRUE(pool.front());
+}
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+// Reference the kARM / kFloat / kNCHW "def" pool kernel from this test binary.
+// NOTE(review): presumably this keeps the registration from being dropped at
+// link time - confirm against the USE_LITE_KERNEL definition.
+USE_LITE_KERNEL(pool, kARM, kFloat, kNCHW, def);
--- a/paddle/fluid/lite/kernels/arm/use_kernels.h
+++ b/paddle/fluid/lite/kernels/arm/use_kernels.h
@@ -19,5 +19,6 @@ USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(pool, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(feed, kARM, kAny, kAny, def);
 USE_LITE_KERNEL(fetch, kARM, kAny, kAny, def);
--- a/paddle/fluid/lite/operators/CMakeLists.txt
+++ b/paddle/fluid/lite/operators/CMakeLists.txt
@@ -18,6 +18,7 @@ cc_library(fill_constant_op_lite SRCS fill_constant_op.cc DEPS ${op_DEPS})
 cc_library(op_params_lite SRCS op_params.cc DEPS ${tensor_lite} any_lite framework_proto_lite)
 cc_library(dropout_op_lite SRCS dropout_op.cc DEPS ${op_DEPS})
 cc_library(concat_op_lite SRCS concat_op.cc DEPS ${op_DEPS})
+cc_library(pool_op_lite SRCS pool_op.cc DEPS ${op_DEPS})
 set(ops_lite
        conv_op_lite
@@ -46,3 +47,6 @@ lite_cc_test(test_scale_op_lite SRCS scale_op_test.cc DEPS scale_op_lite memory_
 lite_cc_test(test_softmax_op_lite SRCS softmax_op_test.cc DEPS softmax_op_lite memory_lite)
 lite_cc_test(test_reshape_op_lite SRCS reshape_op_test.cc DEPS reshape_op_lite memory_lite)
 lite_cc_test(test_concat_op_lite SRCS concat_op_test.cc DEPS concat_op_lite memory_lite)
+lite_cc_test(test_pool_op_lite SRCS pool_op_test.cc
+             DEPS pool_op_lite memory_lite
+             ARM_DEPS pool_compute_arm)
--- a/paddle/fluid/lite/operators/pool_op.cc
+++ b/paddle/fluid/lite/operators/pool_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/lite/operators/pool_op.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+namespace paddle {
+namespace lite {
+namespace operators {
+// Validates the attached parameters before shape inference: input and output
+// tensors are set, the input is 4-D or 5-D, and ksize / strides / paddings
+// each have one entry per spatial axis (input rank minus 2).
+// Returns true when every check passes.
+// NOTE(review): CHECK_OR_FALSE presumably returns false on a failed
+// condition - confirm against its definition.
+bool PoolOpLite::CheckShape() const {
+  CHECK_OR_FALSE(param_.x);
+  CHECK_OR_FALSE(param_.output);
+  const auto& x_dims = param_.x->dims();
+  const auto& ksize = param_.ksize;
+  const auto& strides = param_.strides;
+  const auto& paddings = param_.paddings;
+  // Pooling input should be a 4-D or 5-D tensor.
+  CHECK_OR_FALSE(x_dims.size() == 4 || x_dims.size() == 5);
+  // Input size and pooling size should be consistent.
+  CHECK_OR_FALSE(x_dims.size() - ksize.size() == 2U);
+  // Strides size and pooling size should be the same.
+  CHECK_OR_FALSE(ksize.size() == strides.size());
+  // Paddings size and pooling size should be the same.
+  CHECK_OR_FALSE(ksize.size() == paddings.size());
+  return true;
+}
+// Computes the pooled extent of one spatial axis.
+//   input_size   extent of the input along the axis
+//   filter_size  window extent along the axis
+//   padding      padding applied to each side of the axis
+//   stride       step between consecutive windows
+//   ceil_mode    when true, add stride - 1 before dividing so that a final
+//                partial window is counted (for non-negative spans)
+// Returns (input_size - filter_size + 2 * padding [+ stride - 1]) / stride + 1
+// using integer division.
+int PoolOutputSize(int input_size, int filter_size, int padding, int stride,
+                   bool ceil_mode) {
+  // Distance available for sliding once the first window has been placed.
+  const int span = input_size - filter_size + 2 * padding;
+  const int numerator = ceil_mode ? span + stride - 1 : span;
+  return numerator / stride + 1;
+}
+// Derives the output tensor shape from the input shape and the pooling
+// attributes, then resizes param_.output accordingly.
+//   * Batch and channel extents are copied from the input.
+//   * Global pooling first rewrites param_.ksize to the input's spatial
+//     extents and zeroes param_.paddings.
+//   * Adaptive pooling uses ksize directly as the output spatial extents;
+//     otherwise each axis goes through PoolOutputSize.
+// Always returns true.
+bool PoolOpLite::InferShape() const {
+  const auto in_shape = param_.x->dims();
+  std::vector<int>& window = param_.ksize;
+  if (param_.global_pooling) {
+    const size_t spatial_rank = static_cast<size_t>(in_shape.size()) - 2;
+    window.resize(spatial_rank);
+    for (size_t axis = 0; axis < spatial_rank; ++axis) {
+      window[axis] = static_cast<int>(in_shape[axis + 2]);
+      param_.paddings[axis] = 0;
+    }
+  }
+  std::vector<int64_t> out_shape;
+  out_shape.reserve(window.size() + 2);
+  out_shape.push_back(in_shape[0]);
+  out_shape.push_back(in_shape[1]);
+  for (size_t axis = 0; axis < window.size(); ++axis) {
+    const int extent =
+        param_.adaptive
+            ? window[axis]
+            : PoolOutputSize(in_shape[axis + 2], window[axis],
+                             param_.paddings[axis], param_.strides[axis],
+                             param_.ceil_mode);
+    out_shape.push_back(extent);
+  }
+  param_.output->Resize(lite::DDim(out_shape));
+  return true;
+}
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+// Register PoolOpLite under the op type name "pool".
+REGISTER_LITE_OP(pool, paddle::lite::operators::PoolOpLite);
--- a/paddle/fluid/lite/operators/pool_op.h
+++ b/paddle/fluid/lite/operators/pool_op.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include <vector>
+#include "paddle/fluid/lite/core/compatible_tensor.h"
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_lite.h"
+#include "paddle/fluid/lite/core/scope.h"
+#include "paddle/fluid/lite/operators/op_params.h"
+#include "paddle/fluid/lite/utils/all.h"
+namespace paddle {
+namespace lite {
+namespace operators {
+// Lite operator for pooling. Holds a PoolParam filled from the op description
+// and hands a copy of it to whichever kernel gets attached.
+class PoolOpLite : public OpLite {
+ public:
+  PoolOpLite() {}
+  explicit PoolOpLite(const std::string &type) : OpLite(type) {}
+  // Checks tensor presence, input rank and attribute vector lengths.
+  bool CheckShape() const override;
+  // Computes the output shape and resizes the output tensor.
+  bool InferShape() const override;
+  /*
+  bool Run() override {
+    CHECK(kernel_);
+    kernel_->Run();
+    return true;
+  }
+   */
+  // TODO(Superjomn) replace framework::OpDesc with a lite one.
+  // Binds the first "X" input and first "Out" output variables found in
+  // `scope` and copies the pooling attributes out of `op_desc`.
+  // Both variables must already exist in the scope (enforced with CHECK).
+  bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override {
+    auto x = op_desc.Input("X").front();
+    auto out = op_desc.Output("Out").front();
+    CHECK(scope->FindVar(x));
+    CHECK(scope->FindVar(out));
+    param_.x = scope->FindVar(x)->GetMutable<lite::Tensor>();
+    param_.output = scope->FindVar(out)->GetMutable<lite::Tensor>();
+    param_.pooling_type = op_desc.GetAttr<std::string>("pooling_type");
+    param_.ksize = op_desc.GetAttr<std::vector<int>>("ksize");
+    param_.global_pooling = op_desc.GetAttr<bool>("global_pooling");
+    param_.strides = op_desc.GetAttr<std::vector<int>>("strides");
+    param_.paddings = op_desc.GetAttr<std::vector<int>>("paddings");
+    param_.exclusive = op_desc.GetAttr<bool>("exclusive");
+    param_.adaptive = op_desc.GetAttr<bool>("adaptive");
+    param_.ceil_mode = op_desc.GetAttr<bool>("ceil_mode");
+    param_.use_quantizer = op_desc.GetAttr<bool>("use_quantizer");
+    // data_format is not read from the op description; the line below is
+    // disabled.
+    // param_.data_format = op_desc.GetAttr<bool>("data_format");
+    return true;
+  }
+  // Passes the current parameters to the kernel.
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+  std::string DebugString() const override { return "pool"; }
+ private:
+  // mutable because the const InferShape() rewrites ksize / paddings and
+  // resizes the output tensor through it.
+  mutable PoolParam param_;
+};
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
--- a/paddle/fluid/lite/operators/pool_op_test.cc
+++ b/paddle/fluid/lite/operators/pool_op_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/lite/operators/pool_op.h"
+#include <gtest/gtest.h>
+#include "paddle/fluid/lite/core/op_registry.h"
+namespace paddle {
+namespace lite {
+namespace operators {
+// Builds a max-pool op description, attaches it to a PoolOpLite and checks
+// that at least one kARM / kFloat kernel can be created for it.
+// NOTE(review): with ksize 2, stride 1 and padding 0, PoolOutputSize gives 223
+// for a 224 input, not the 112 the output tensor is resized to here. The test
+// does not call InferShape itself, so the mismatch may go unexercised -
+// confirm.
+TEST(pool_op_lite, test) {
+  // Variables the op will bind to.
+  Scope scope;
+  auto* x = scope.Var("x")->GetMutable<Tensor>();
+  auto* output = scope.Var("output")->GetMutable<Tensor>();
+  x->Resize(DDim(std::vector<int64_t>({1, 3, 224, 224})));
+  output->Resize(DDim(std::vector<int64_t>{1, 3, 112, 112}));
+  // Input holds its own flat index, output starts zeroed.
+  float* x_data = x->mutable_data<float>();
+  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) {
+    x_data[i] = i;
+  }
+  float* output_data = output->mutable_data<float>();
+  for (int i = 0; i < 1 * 3 * 112 * 112; ++i) {
+    output_data[i] = 0.;
+  }
+  // Attribute values, typed exactly as the op reads them back.
+  std::string pooling_type = "max";
+  std::vector<int> ksize = {2, 2};
+  bool global_pooling = false;
+  std::vector<int> strides = {1, 1};
+  std::vector<int> paddings = {0, 0};
+  bool exclusive = true;
+  bool adaptive = false;
+  bool ceil_mode = false;
+  bool use_quantizer = false;
+  // Op description.
+  cpp::OpDesc desc;
+  desc.SetType("pool");
+  desc.SetInput("X", {"x"});
+  desc.SetOutput("Out", {"output"});
+  desc.SetAttr("pooling_type", pooling_type);
+  desc.SetAttr("ksize", ksize);
+  desc.SetAttr("global_pooling", global_pooling);
+  desc.SetAttr("strides", strides);
+  desc.SetAttr("paddings", paddings);
+  desc.SetAttr("exclusive", exclusive);
+  desc.SetAttr("adaptive", adaptive);
+  desc.SetAttr("ceil_mode", ceil_mode);
+  desc.SetAttr("use_quantizer", use_quantizer);
+  // Attach and ask for ARM fp32 kernels.
+  PoolOpLite pool("pool");
+  pool.SetValidPlaces({Place{TARGET(kARM), PRECISION(kFloat)}});
+  pool.Attach(desc, &scope);
+  auto kernels = pool.CreateKernels({Place{TARGET(kARM), PRECISION(kFloat)}});
+  LOG(INFO) << "kernels.size(): " << kernels.size();
+  ASSERT_FALSE(kernels.empty());
+}
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+// The ARM pool kernel is referenced only when LITE_WITH_ARM is defined.
+#ifdef LITE_WITH_ARM
+USE_LITE_KERNEL(pool, kARM, kFloat, kNCHW, def);
+#endif