update pooling 2-padding to 4-padding (#2410)

* fix pooling bug and speed * fix build error * delete VLOG in pool, test=develop * add openmp, test=develop * fix lite/kernels/arm/pool_compute_test basic_pooling compute error bug, test=develop * update pooling 2-pad to 4-pad, test=develop * fix 2-pad to 4-pad in operators/pool_op.h, AttachKernel will set param, so 2-pad to 4-pad funcs should be put in AttachKernel. test=develop * put 2-pad to 4-pad in AttachImpl, test=develop * according to reviews, fix some format errors. test=develop * fix format error, add (). test=develop * change paddings type to support dynamic modification, test=develop * update padding type in other devices, test=develop * fix x86 build error on shared_ptr, test=develop * fix format in operators pool_op.cc, test=develop

update pooling 2-padding to 4-padding (#2410)
* fix pooling bug and speed * fix build error * delete VLOG in pool, test=develop * add openmp, test=develop * fix lite/kernels/arm/pool_compute_test basic_pooling compute error bug, test=develop * update pooling 2-pad to 4-pad, test=develop * fix 2-pad to 4-pad in operators/pool_op.h, AttachKernel will set param, so 2-pad to 4-pad funcs should be put in AttachKernel. test=develop * put 2-pad to 4-pad in AttachImpl, test=develop * according to reviews, fix some format errors. test=develop * fix format error, add (). test=develop * change paddings type to support dynamic modification, test=develop * update padding type in other devices, test=develop * fix x86 build error on shared_ptr, test=develop * fix format in operators pool_op.cc, test=develop
a7f7d49b · HappyAngel · GitHub · ee7ba3ab · a7f7d49b · a7f7d49b
18 changed files
--- a/lite/backends/arm/math/pooling.cc
+++ b/lite/backends/arm/math/pooling.cc
@@ -46,7 +46,7 @@ void pooling_basic(const float* din,
  int stride_h = strides[0];
  int stride_w = strides[1];
  int pad_h = paddings[0];
-  int pad_w = paddings[1];
+  int pad_w = paddings[2];
  int size_channel_in = win * hin;
  int size_channel_out = wout * hout;
  if (global_pooling) {
@@ -125,18 +125,22 @@ void pooling_basic(const float* din,
                int bh = kernel_h;
                int bw = kernel_w;
                if (ew == win) {
-                  bw = sw + kernel_w >= win + pad_w ? win + pad_w
-                                                    : sw + kernel_w;
+                  bw = (sw + kernel_w) >= (win + paddings[3])
+                           ? (win + paddings[3])
+                           : (sw + kernel_w);
                  bw -= sw;
-                  if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) {
+                  if ((sw - pad_w) < 0 &&
+                      (sw + kernel_w) > (win + paddings[3])) {
                    bw += pad_w;
                  }
                }
                if (eh == hin) {
-                  bh = sh + kernel_h >= hin + pad_h ? hin + pad_h
-                                                    : sh + kernel_h;
+                  bh = (sh + kernel_h) >= (hin + paddings[1])
+                           ? (hin + paddings[1])
+                           : (sh + kernel_h);
                  bh -= sh;
-                  if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) {
+                  if ((sh - pad_h) < 0 &&
+                      (sh + kernel_h) > (hin + paddings[1])) {
                    bh += pad_h;
                  }
                }

--- a/lite/backends/fpga/KD/pes/pooling_pe.hpp
+++ b/lite/backends/fpga/KD/pes/pooling_pe.hpp
@@ -45,13 +45,14 @@ class PoolingPE : public PE {

    PoolingArgs args = {0};
    args.mode = param_.type;
+    auto paddings = *param_.paddings;
    args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height));
    args.image.address = input->data<float16>();
    args.image.channels = input->shape().channel();
    args.image.height = input->shape().height();
    args.image.width = input->shape().width();
-    args.image.pad_height = param_.paddings[0];
-    args.image.pad_width = param_.paddings[1];
+    args.image.pad_height = paddings[0];
+    args.image.pad_width = paddings[2];
    args.image.scale_address = input->scale();
    args.output.address = output->mutableData<float16>();
    args.output.scale_address = output->scale();
@@ -76,12 +77,13 @@ class PoolingPE : public PE {
    float* image_addr = float_input.mutableData<float>(FP32, input->shape());
    float_input.copyFrom(input);
    float16* data_out = output->data<float16>();
+    auto paddings = *param_.paddings;

    int image_height = input->shape().height();
    int image_width = input->shape().width();
    int image_channels = input->shape().channel();
-    int image_pad_h = param_.paddings[0];
-    int image_pad_w = param_.paddings[1];
+    int image_pad_h = paddings[0];
+    int image_pad_w = paddings[2];
    int kernel_height = param_.kernelSize[1];
    int kernel_width = param_.kernelSize[0];
    int kernel_step_h = param_.strides[0];

--- a/lite/backends/x86/math/pooling.cc
+++ b/lite/backends/x86/math/pooling.cc
@@ -49,7 +49,7 @@ class Pool2dFunctor<lite::TargetType::kX86, PoolProcess, T> {
    const int stride_height = strides[0];
    const int stride_width = strides[1];
    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
+    const int padding_width = paddings[2];

    const int input_stride = input_height * input_width;
    const int output_stride = output_height * output_width;
@@ -130,7 +130,7 @@ class Pool2dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
    const int stride_height = strides[0];
    const int stride_width = strides[1];
    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
+    const int padding_width = paddings[2];
    const int input_stride = input_height * input_width;
    const int output_stride = output_height * output_width;

@@ -213,7 +213,7 @@ class MaxPool2dGradFunctor<lite::TargetType::kX86, T> {
    const int stride_height = strides[0];
    const int stride_width = strides[1];
    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
+    const int padding_width = paddings[2];
    const int input_stride = input_height * input_width;
    const int output_stride = output_height * output_width;

@@ -629,7 +629,7 @@ class MaxPool2dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
    const int stride_height = strides[0];
    const int stride_width = strides[1];
    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
+    const int padding_width = paddings[2];
    const int input_stride = input_height * input_width;
    const int output_stride = output_height * output_width;


--- a/lite/kernels/arm/pool_compute.cc
+++ b/lite/kernels/arm/pool_compute.cc
@@ -38,7 +38,7 @@ void PoolCompute::Run() {

  std::vector<int>& ksize = param.ksize;
  std::vector<int>& strides = param.strides;
-  std::vector<int>& paddings = param.paddings;
+  std::vector<int>& paddings = *param.paddings;

  std::string& pooling_type = param.pooling_type;
  bool global_pooling = param.global_pooling;
@@ -48,12 +48,15 @@ void PoolCompute::Run() {
  bool use_quantizer = param.use_quantizer;
  std::string& data_format = param.data_format;

-  bool kps_equal = (ksize[0] == ksize[1]) && (strides[0] == strides[1]) &&
-                   (paddings[0] == paddings[1]);
+  bool pads_equal =
+      (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]);

+  bool kps_equal = (ksize[0] == ksize[1]) && (strides[0] == strides[1]) &&
+                   (paddings[0] == paddings[2]);
  if (global_pooling) {
    for (size_t i = 0; i < ksize.size(); ++i) {
-      paddings[i] = 0;
+      paddings[2 * i] = 0;
+      paddings[2 * i + 1] = 0;
      ksize[i] = static_cast<int>(in_dims[i + 2]);
    }
    if (pooling_type == "max") {
@@ -80,7 +83,8 @@ void PoolCompute::Run() {
      return;
    }
  } else {
-    if (ksize[0] == 2 && strides[0] == 2 && paddings[0] == 0 && kps_equal) {
+    if (ksize[0] == 2 && strides[0] == 2 && paddings[0] == 0 && pads_equal &&
+        kps_equal) {
      if (pooling_type == "max") {
        lite::arm::math::pooling2x2s2_max(din,
                                          dout,
@@ -106,7 +110,7 @@ void PoolCompute::Run() {
        return;
      }
    } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 &&
-               kps_equal) {
+               pads_equal && kps_equal) {
      if (pooling_type == "max") {
        lite::arm::math::pooling3x3s1p1_max(din,
                                            dout,
@@ -132,7 +136,7 @@ void PoolCompute::Run() {
        return;
      }
    } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 0 &&
-               kps_equal) {
+               pads_equal && kps_equal) {
      if (pooling_type == "max") {
        lite::arm::math::pooling3x3s1p0_max(din,
                                            dout,
@@ -158,7 +162,7 @@ void PoolCompute::Run() {
        return;
      }
    } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 0 &&
-               kps_equal) {
+               pads_equal && kps_equal) {
      if (pooling_type == "max") {
        lite::arm::math::pooling3x3s2p0_max(din,
                                            dout,
@@ -184,7 +188,7 @@ void PoolCompute::Run() {
        return;
      }
    } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 1 &&
-               kps_equal) {
+               pads_equal && kps_equal) {
      if (pooling_type == "max") {
        lite::arm::math::pooling3x3s2p1_max(din,
                                            dout,

--- a/lite/kernels/arm/pool_compute_test.cc
+++ b/lite/kernels/arm/pool_compute_test.cc
@@ -15,6 +15,7 @@
 #include "lite/kernels/arm/pool_compute.h"
 #include <gtest/gtest.h>
 #include <limits>
+#include <memory>
 #include <string>
 #include <vector>
 #include "lite/backends/arm/math/funcs.h"
@@ -25,14 +26,21 @@ namespace lite {
 namespace kernels {
 namespace arm {

-int PoolOutputSize(
-    int input_size, int filter_size, int padding, int stride, bool ceil_mode) {
+int PoolOutputSize(int input_size,
+                   int filter_size,
+                   int pad_left,
+                   int pad_right,
+                   int stride,
+                   bool ceil_mode) {
  int output_size;
  if (!ceil_mode) {
-    output_size = (input_size - filter_size + 2 * padding) / stride + 1;
+    output_size =
+        (input_size - filter_size + pad_left + pad_right) / stride + 1;
  } else {
    output_size =
-        (input_size - filter_size + 2 * padding + stride - 1) / stride + 1;
+        (input_size - filter_size + pad_left + pad_right + stride - 1) /
+            stride +
+        1;
  }
  return output_size;
 }
@@ -40,10 +48,12 @@ int PoolOutputSize(
 std::vector<int64_t> compute_output_shape(operators::PoolParam* param_) {
  const auto x_dims = param_->x->dims();
  std::vector<int>& ksize = param_->ksize;
+  auto paddings = *param_->paddings;
  if (param_->global_pooling) {
    ksize.resize(static_cast<size_t>(x_dims.size()) - 2);
    for (size_t i = 0; i < ksize.size(); ++i) {
-      param_->paddings[i] = 0;
+      paddings[2 * i] = 0;
+      paddings[2 * i + 1] = 0;
      ksize[i] = static_cast<int>(x_dims[i + 2]);
    }
  }
@@ -56,7 +66,8 @@ std::vector<int64_t> compute_output_shape(operators::PoolParam* param_) {
    for (size_t i = 0; i < param_->ksize.size(); ++i) {
      output_shape.push_back(PoolOutputSize(x_dims[i + 2],
                                            param_->ksize[i],
-                                            param_->paddings[i],
+                                            paddings[2 * i],
+                                            paddings[2 * i + 1],
                                            param_->strides[i],
                                            param_->ceil_mode));
    }
@@ -73,7 +84,7 @@ void pool_compute_ref(const operators::PoolParam& param) {

  std::vector<int> ksize = param.ksize;
  std::vector<int> strides = param.strides;
-  std::vector<int> paddings = param.paddings;
+  std::vector<int> paddings = *param.paddings;

  std::string pooling_type = param.pooling_type;
  bool global_pooling = param.global_pooling;
@@ -99,7 +110,7 @@ void pool_compute_ref(const operators::PoolParam& param) {
  int stride_h = strides[0];
  int stride_w = strides[1];
  int pad_h = paddings[0];
-  int pad_w = paddings[1];
+  int pad_w = paddings[2];
  int size_channel_in = win * hin;
  int size_channel_out = wout * hout;
  if (global_pooling) {
@@ -178,18 +189,22 @@ void pool_compute_ref(const operators::PoolParam& param) {
                int bh = kernel_h;
                int bw = kernel_w;
                if (ew == win) {
-                  bw = sw + kernel_w >= win + pad_w ? win + pad_w
-                                                    : sw + kernel_w;
+                  bw = (sw + kernel_w) >= (win + paddings[3])
+                           ? (win + paddings[3])
+                           : (sw + kernel_w);
                  bw -= sw;
-                  if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) {
+                  if ((sw - pad_w) < 0 &&
+                      (sw + kernel_w) > (win + paddings[3])) {
                    bw += pad_w;
                  }
                }
                if (eh == hin) {
-                  bh = sh + kernel_h >= hin + pad_h ? hin + pad_h
-                                                    : sh + kernel_h;
+                  bh = (sh + kernel_h) >= (hin + paddings[1])
+                           ? (hin + paddings[1])
+                           : (sh + kernel_h);
                  bh -= sh;
-                  if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) {
+                  if ((sh - pad_h) < 0 &&
+                      (sh + kernel_h) > (hin + paddings[1])) {
                    bh += pad_h;
                  }
                }
@@ -225,75 +240,92 @@ TEST(pool_arm, compute) {
        for (auto exclusive : {true, false}) {
          for (auto ksize : {2, 3}) {
            for (auto stride : {1, 2}) {
-              for (auto pad : {0, 1}) {
-                for (auto n : {1, 2}) {
-                  for (auto c : {1, 3}) {
+              for (auto pad_left : {0, 1}) {
+                for (auto pad_right : {0, 1}) {
+                  for (auto pad_top : {0, 1}) {
+                    for (auto pad_bottom : {0, 1}) {
+                      for (auto n : {1, 2}) {
+                        for (auto c : {1, 3}) {
 #if 1
-                    for (auto h : {2, 3, 4, 11}) {
-                      for (auto w : {2, 3, 4, 11}) {
+                          for (auto h : {2, 3, 4, 11}) {
+                            for (auto w : {2, 3, 4, 11}) {
 #else
-                    for (int h = 2; h < 25; h++) {
-                      for (int w = 2; w < 25; w++) {
+                          for (int h = 2; h < 25; h++) {
+                            for (int w = 2; w < 25; w++) {
 #endif
-                        VLOG(3) << "n:" << n << " c:" << c << " h:" << h
-                                << " w:" << w << " ksize:" << ksize
-                                << " stride:" << stride << " pad:" << pad
-                                << " exclusive:" << exclusive
-                                << " global_pooling:" << global_pooling
-                                << " ceil_mode: " << ceil_mode
-                                << " pooling_type:" << pooling_type;
+                              VLOG(3) << "n:" << n << " c:" << c << " h:" << h
+                                      << " w:" << w << " ksize:" << ksize
+                                      << " stride:" << stride
+                                      << " pad_left:" << pad_left
+                                      << " pad_right:" << pad_right
+                                      << " pad_top:" << pad_top
+                                      << " pad_bottom:" << pad_bottom
+                                      << " exclusive:" << exclusive
+                                      << " global_pooling:" << global_pooling
+                                      << " ceil_mode: " << ceil_mode
+                                      << " pooling_type:" << pooling_type;

-                        // init x, output
-                        x.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
-                        auto* x_data = x.mutable_data<float>();
-                        for (int i = 0; i < x.dims().production(); ++i) {
-                          float sign = i % 3 == 0 ? -0.03 : 0.05f;
-                          x_data[i] = sign * (i % 128);
-                        }
+                              // init x, output
+                              x.Resize(
+                                  DDim(std::vector<int64_t>({n, c, h, w})));
+                              auto* x_data = x.mutable_data<float>();
+                              for (int i = 0; i < x.dims().production(); ++i) {
+                                float sign = i % 3 == 0 ? -0.03 : 0.05f;
+                                x_data[i] = sign * (i % 128);
+                              }

-                        // fill param
-                        param.x = &x;
-                        param.output = &output;
-                        param.pooling_type = pooling_type;
-                        if (global_pooling) {
-                          param.ksize = {h, w};
-                        } else {
-                          param.ksize = {ksize, ksize};
-                        }
-                        param.global_pooling = global_pooling;
-                        param.strides = {stride, stride};
-                        param.paddings = {pad, pad};
-                        param.exclusive = exclusive;
-                        param.ceil_mode = ceil_mode;
-                        param.adaptive = false;
-                        param.use_quantizer = false;
+                              // fill param
+                              param.x = &x;
+                              param.output = &output;
+                              param.pooling_type = pooling_type;
+                              if (global_pooling) {
+                                param.ksize = {h, w};
+                              } else {
+                                param.ksize = {ksize, ksize};
+                              }
+                              param.global_pooling = global_pooling;
+                              param.strides = {stride, stride};
+                              std::vector<int> paddings = {
+                                  pad_top, pad_bottom, pad_left, pad_right};
+                              param.exclusive = exclusive;
+                              param.paddings =
+                                  std::make_shared<std::vector<int>>(paddings);
+                              param.ceil_mode = ceil_mode;
+                              param.adaptive = false;
+                              param.use_quantizer = false;

-                        const std::vector<int64_t>& output_shape =
-                            compute_output_shape(&param);
-                        output.Resize(DDim(output_shape));
-                        output_ref.Resize(DDim(output_shape));
+                              const std::vector<int64_t>& output_shape =
+                                  compute_output_shape(&param);
+                              output.Resize(DDim(output_shape));
+                              output_ref.Resize(DDim(output_shape));

-                        auto* output_data = output.mutable_data<float>();
-                        auto* output_ref_data =
-                            output_ref.mutable_data<float>();
-                        for (int i = 0; i < output.dims().production(); ++i) {
-                          output_data[i] = -2;
-                          output_ref_data[i] = -2;
-                        }
+                              auto* output_data = output.mutable_data<float>();
+                              auto* output_ref_data =
+                                  output_ref.mutable_data<float>();
+                              for (int i = 0; i < output.dims().production();
+                                   ++i) {
+                                output_data[i] = -2;
+                                output_ref_data[i] = -2;
+                              }

-                        // compute
-                        pool.SetParam(param);
-                        pool.Run();
+                              // compute
+                              pool.SetParam(param);
+                              pool.Run();

-                        // compute ref
-                        param.output = &output_ref;
-                        pool_compute_ref(param);
+                              // compute ref
+                              param.output = &output_ref;
+                              pool_compute_ref(param);

-                        // compare
-                        for (int i = 0; i < output.dims().production(); i++) {
-                          EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-4);
+                              // compare
+                              for (int i = 0; i < output.dims().production();
+                                   i++) {
+                                EXPECT_NEAR(
+                                    output_data[i], output_ref_data[i], 1e-4);
+                              }
+                              VLOG(3) << "compare pass";
+                            }
+                          }
                        }
-                        VLOG(3) << "compare pass";
                      }
                    }
                  }

--- a/lite/kernels/cuda/pool_compute.cu
+++ b/lite/kernels/cuda/pool_compute.cu
@@ -256,6 +256,7 @@ void PoolCompute::Run() {
  bool adaptive = param.adaptive;
  auto x_dims = param.x->dims();
  auto out_dims = param.output->dims();
+  auto paddings = *param.paddings;
  const int in_h = x_dims[2];
  const int in_w = x_dims[3];
  const int out_h = out_dims[2];
@@ -266,8 +267,8 @@ void PoolCompute::Run() {
  const int win_w = param.ksize[1];
  const int stride_h = param.strides[0];
  const int stride_w = param.strides[1];
-  const int pad_h = param.paddings[0];
-  const int pad_w = param.paddings[1];
+  const int pad_h = paddings[0];
+  const int pad_w = paddings[2];
  const int total_threads = out_dims.production();
  const int threads = 512;
  const int blocks = (total_threads + threads - 1) / threads;

--- a/lite/kernels/cuda/pool_compute_test.cc
+++ b/lite/kernels/cuda/pool_compute_test.cc
@@ -27,14 +27,21 @@ namespace cuda {
 using Tensor = lite::Tensor;
 using DDim = lite::DDim;

-static int PoolOutputSize(
-    int input_size, int filter_size, int padding, int stride, bool ceil_mode) {
+static int PoolOutputSize(int input_size,
+                          int filter_size,
+                          int pad_left,
+                          int pad_right,
+                          int stride,
+                          bool ceil_mode) {
  int output_size;
  if (!ceil_mode) {
-    output_size = (input_size - filter_size + 2 * padding) / stride + 1;
+    output_size =
+        (input_size - filter_size + pad_left + pad_right) / stride + 1;
  } else {
    output_size =
-        (input_size - filter_size + 2 * padding + stride - 1) / stride + 1;
+        (input_size - filter_size + pad_left + pad_right + stride - 1) /
+            stride +
+        1;
  }
  return output_size;
 }
@@ -44,8 +51,10 @@ static std::vector<int64_t> compute_output_shape(operators::PoolParam* param_) {
  std::vector<int>& ksize = param_->ksize;
  if (param_->global_pooling) {
    ksize.resize(static_cast<size_t>(x_dims.size()) - 2);
+    auto paddings = *param_->paddings;
    for (size_t i = 0; i < ksize.size(); ++i) {
-      param_->paddings[i] = 0;
+      paddings[2 * i] = 0;
+      paddings[2 * i + 1] = 0;
      ksize[i] = static_cast<int>(x_dims[i + 2]);
    }
  }
@@ -58,7 +67,8 @@ static std::vector<int64_t> compute_output_shape(operators::PoolParam* param_) {
    for (size_t i = 0; i < param_->ksize.size(); ++i) {
      output_shape.push_back(PoolOutputSize(x_dims[i + 2],
                                            param_->ksize[i],
-                                            param_->paddings[i],
+                                            paddings[2 * i],
+                                            paddings[2 * i + 1],
                                            param_->strides[i],
                                            param_->ceil_mode));
    }
@@ -75,7 +85,7 @@ static void pool_compute_ref(const operators::PoolParam& param) {

  std::vector<int> ksize = param.ksize;
  std::vector<int> strides = param.strides;
-  std::vector<int> paddings = param.paddings;
+  std::vector<int> paddings = *param.paddings;

  std::string pooling_type = param.pooling_type;
  bool global_pooling = param.global_pooling;
@@ -99,7 +109,7 @@ static void pool_compute_ref(const operators::PoolParam& param) {
  int stride_h = strides[0];
  int stride_w = strides[1];
  int pad_h = paddings[0];
-  int pad_w = paddings[1];
+  int pad_w = paddings[2];

  if (global_pooling == true) {
    for (int n = 0; n < in_n; ++n) {
@@ -226,7 +236,9 @@ TEST(pool_cuda, compute) {
                        }
                        param.global_pooling = global_pooling;
                        param.strides = {stride, stride};
-                        param.paddings = {pad, pad};
+                        std::vector<int> paddings = {pad, pad, pad, pad};
+                        param.paddings =
+                            std::make_shared<std::vector<int>>(paddings);
                        param.exclusive = exclusive;
                        param.ceil_mode = ceil_mode;
                        param.adaptive = false;

--- a/lite/kernels/npu/bridges/pool_op.cc
+++ b/lite/kernels/npu/bridges/pool_op.cc
@@ -48,8 +48,13 @@ node_map_type PoolConverter(const std::shared_ptr<lite::OpLite> pool_op,
  auto npu_window = ge::AttrValue::LIST_INT(ksize.begin(), ksize.end());

  auto padding = op_info->GetAttr<std::vector<int>>("paddings");
+  bool pads_equal = (padding[0] == padding[1]) && (padding[2] == padding[3]);
+  if (!pads_equal) {
+    LOG(FATAL)
+        << "padding requires pad_left == pad_right, pad_top == pad_bottom";
+  }
  auto npu_pad =
-      ge::AttrValue::LIST_INT{padding[0], padding[0], padding[1], padding[1]};
+      ge::AttrValue::LIST_INT{padding[0], padding[1], padding[2], padding[3]};
  auto strides = op_info->GetAttr<std::vector<int>>("strides");
  auto npu_stride = ge::AttrValue::LIST_INT(strides.begin(), strides.end());
  int npu_ceil_mode = 0;

--- a/lite/kernels/npu/bridges/pool_op_test.cc
+++ b/lite/kernels/npu/bridges/pool_op_test.cc
@@ -61,7 +61,7 @@ void pool_ref(const std::shared_ptr<operators::PoolOpLite> op) {
  int stride_h = strides[0];
  int stride_w = strides[1];
  int pad_h = paddings[0];
-  int pad_w = paddings[1];
+  int pad_w = paddings[2];

  if (global_pooling == true) {
    for (int n = 0; n < in_n; ++n) {
@@ -163,7 +163,8 @@ void test_pool(int bs,
  opdesc.SetAttr("global_pooling", global_pooling);
  opdesc.SetAttr("exclusive", exclusive);
  opdesc.SetAttr("strides", std::vector<int>({stride, stride}));
-  opdesc.SetAttr("paddings", std::vector<int>({padding, padding}));
+  opdesc.SetAttr("paddings",
+                 std::vector<int>({padding, padding, padding, padding}));

  // create and convert op to NPU model, then run it on NPU
  auto op = CreateOp<operators::PoolOpLite>(opdesc, &scope);

--- a/lite/kernels/opencl/pool_compute.cc
+++ b/lite/kernels/opencl/pool_compute.cc
@@ -44,16 +44,22 @@ class PoolCompute
    const auto& out_dims = param.output->dims();
    const std::string pooling_type = param.pooling_type;
    const bool global_pooling = param.global_pooling;
-    std::vector<int> paddings = param.paddings;
+    std::vector<int> paddings = *param.paddings;
    std::vector<int> strides = param.strides;
    std::vector<int> ksize = param.ksize;
    if (global_pooling) {
      for (size_t i = 0; i < ksize.size(); ++i) {
-        paddings[i] = 0;
+        paddings[2 * i] = 0;
+        paddings[2 * i + 1] = 0;
        ksize[i] = static_cast<int>(in_dims[i + 2]);
      }
    }
-
+    bool pads_equal =
+        (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]);
+    if (!pads_equal) {
+      LOG(FATAL)
+          << "padding requires pad_left == pad_right, pad_top == pad_bottom";
+    }
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    auto* input_buf = param.x->data<float, cl::Buffer>();
@@ -89,7 +95,7 @@ class PoolCompute
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[0]));
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[1]));
+    status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[2]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *output_buf);
    CL_CHECK_FATAL(status);

--- a/lite/kernels/opencl/pool_compute_test.cc
+++ b/lite/kernels/opencl/pool_compute_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include <gtest/gtest.h>
+#include <memory>
 #include <random>
 #include "lite/backends/opencl/target_wrapper.h"
 #include "lite/core/op_registry.h"
@@ -88,9 +89,10 @@ TEST(pool2d, compute) {
  param.output = &out;
  param.global_pooling = true;
  param.pooling_type = "avg";
-  param.paddings = std::vector<int>{0, 0};
+  std::vector<int> paddings = {0, 0, 0, 0};
  param.strides = std::vector<int>{1, 1};
  param.ksize = std::vector<int>{7, 7};
+  param.paddings = std::make_shared<std::vector<int>>(paddings);

  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();

--- a/lite/kernels/x86/pool_compute.h
+++ b/lite/kernels/x86/pool_compute.h
@@ -35,7 +35,6 @@ class PoolCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
    auto& param = *param_.get_mutable<param_t>();
    if (param.global_pooling) {
      for (size_t i = 0; i < param.ksize.size(); ++i) {
-        param.paddings[i] = 0;
        param.ksize[i] = static_cast<int>(param.x->dims()[i + 2]);
      }
    }
@@ -52,7 +51,7 @@ class PoolCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
                         param.x,
                         param.ksize,
                         param.strides,
-                         param.paddings,
+                         *param.paddings,
                         pool_process,
                         true,
                         false,
@@ -68,7 +67,7 @@ class PoolCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
                         param.x,
                         param.ksize,
                         param.strides,
-                         param.paddings,
+                         *param.paddings,
                         pool_process,
                         param.exclusive,
                         param.adaptive,

--- a/lite/kernels/x86/pool_compute_test.cc
+++ b/lite/kernels/x86/pool_compute_test.cc
@@ -60,7 +60,8 @@ TEST(pool2d_x86, run_test) {
  param.x = &x;
  param.output = &out;
  param.strides = {2, 2};
-  param.paddings = {0, 0};
+  std::vector<int> paddings = {0, 0, 0, 0};
+  param.paddings = std::make_shared<std::vector<int>>(paddings);
  param.ksize = {2, 2};
  param.pooling_type = "max";
  std::unique_ptr<KernelContext> ctx(new KernelContext);

--- a/lite/kernels/xpu/bridges/pool_op_test.cc
+++ b/lite/kernels/xpu/bridges/pool_op_test.cc
@@ -60,7 +60,7 @@ void pool_ref(const std::shared_ptr<operators::PoolOpLite> op) {
  int stride_h = strides[0];
  int stride_w = strides[1];
  int pad_h = paddings[0];
-  int pad_w = paddings[1];
+  int pad_w = paddings[2];

  if (global_pooling == true) {
    for (int n = 0; n < in_n; ++n) {
@@ -162,7 +162,8 @@ void test_pool(int bs,
  opdesc.SetAttr("global_pooling", global_pooling);
  opdesc.SetAttr("exclusive", exclusive);
  opdesc.SetAttr("strides", std::vector<int>({stride, stride}));
-  opdesc.SetAttr("paddings", std::vector<int>({padding, padding}));
+  opdesc.SetAttr("paddings",
+                 std::vector<int>({padding, padding, padding, padding}));
  opdesc.SetAttr("ceil_mode", ceil_mode);

  // create and convert op to XPU model, then run it on XPU

--- a/lite/operators/op_params.h
+++ b/lite/operators/op_params.h
@@ -13,6 +13,7 @@
 // limitations under the License.

 #pragma once
+#include <memory>
 #include <string>
 #include <utility>
 #include <vector>
@@ -302,7 +303,12 @@ struct PoolParam {
  bool global_pooling{
      false};  // if true, knernel size and paddings will be ignored
  std::vector<int> strides{1, 1};
-  std::vector<int> paddings{0, 0};
+  /* The type of paddings was changed
+  * from std::vector<int> to std::shared_ptr<std::vector<int>>
+  * so that the padding can be modified dynamically and
+  * the kernel param and the operator param are updated synchronously.
+  */
+  std::shared_ptr<std::vector<int>> paddings;
  bool exclusive{true};
  bool adaptive{false};
  bool ceil_mode{false};

--- a/lite/operators/pool_op.cc
+++ b/lite/operators/pool_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "lite/operators/pool_op.h"
+#include <algorithm>
 #include "lite/core/op_registry.h"

 namespace paddle {
@@ -26,7 +27,7 @@ bool PoolOpLite::CheckShape() const {
  const auto& x_dims = param_.x->dims();
  const auto& ksize = param_.ksize;
  const auto& strides = param_.strides;
-  const auto& paddings = param_.paddings;
+  const auto& paddings = *param_.paddings;

  // "Pooling intput should be 4-D or 5-D tensor."
  CHECK_OR_FALSE(x_dims.size() == 4 || x_dims.size() == 5);
@@ -34,20 +35,60 @@ bool PoolOpLite::CheckShape() const {
  CHECK_OR_FALSE(x_dims.size() - ksize.size() == 2U);
  // Strides size and pooling size should be the same.
  CHECK_OR_FALSE(ksize.size() == strides.size());
-  // Paddings size and pooling size should be the same.
-  CHECK_OR_FALSE(ksize.size() == paddings.size());
+  // Paddings size must be 4.
+  CHECK_OR_FALSE(paddings.size() == 4L);

  return true;
 }

-int PoolOutputSize(
-    int input_size, int filter_size, int padding, int stride, bool ceil_mode) {
+// Updates *paddings in place (2 entries per entry of strides are written).
+// "SAME": pads computed from data_dims[i + 2], strides[i] and ksize[i];
+// "VALID": all pads set to 0. Other algorithm strings leave them untouched.
+inline void UpdatePadding(std::vector<int>* paddings,
+                          const bool global_pooling,
+                          const bool adaptive,
+                          const std::string& padding_algorithm,
+                          const lite::DDim& data_dims,
+                          const std::vector<int>& strides,
+                          const std::vector<int>& ksize) {
+  if (padding_algorithm == "SAME") {
+    for (size_t i = 0; i < strides.size(); ++i) {
+      // out_size is ceil(in_size / stride); pad_sum is clamped at zero.
+      const int64_t in_size = data_dims[i + 2];
+      const int64_t out_size = (in_size + strides[i] - 1) / strides[i];
+      const int64_t pad_sum = std::max<int64_t>(
+          (out_size - 1) * strides[i] + ksize[i] - in_size, 0);
+      // When pad_sum is odd, index 2 * i + 1 receives the larger share.
+      const int pad_0 = static_cast<int>(pad_sum / 2);
+      const int pad_1 = static_cast<int>(pad_sum - pad_0);
+      (*paddings)[2 * i] = pad_0;
+      (*paddings)[2 * i + 1] = pad_1;
+    }
+  } else if (padding_algorithm == "VALID") {
+    std::fill(paddings->begin(), paddings->end(), 0);
+  }
+
+  // if global_pooling == true or adaptive == true, padding will be ignored
+  if (global_pooling || adaptive) {
+    std::fill(paddings->begin(), paddings->end(), 0);
+  }
+}
+
+int PoolOutputSize(int input_size,
+                   int filter_size,
+                   int pad_left,
+                   int pad_right,
+                   int stride,
+                   bool ceil_mode) {
  int output_size;
  if (!ceil_mode) {
-    output_size = (input_size - filter_size + 2 * padding) / stride + 1;
+    output_size =
+        (input_size - filter_size + pad_left + pad_right) / stride + 1;
  } else {
    output_size =
-        (input_size - filter_size + 2 * padding + stride - 1) / stride + 1;
+        (input_size - filter_size + pad_left + pad_right + stride - 1) /
+            stride +
+        1;
  }
  return output_size;
 }
@@ -55,14 +96,21 @@ int PoolOutputSize(
 bool PoolOpLite::InferShape() const {
  const auto x_dims = param_.x->dims();
  std::vector<int>& ksize = param_.ksize;
+  // dynamically update the 4-element paddings
+  UpdatePadding(param_.paddings.get(),
+                param_.global_pooling,
+                param_.adaptive,
+                padding_algorithm_,
+                x_dims,
+                param_.strides,
+                ksize);
  if (param_.global_pooling) {
    ksize.resize(static_cast<size_t>(x_dims.size()) - 2);
    for (size_t i = 0; i < ksize.size(); ++i) {
-      param_.paddings[i] = 0;
      ksize[i] = static_cast<int>(x_dims[i + 2]);
    }
  }
-
+  auto paddings = *param_.paddings;
  std::vector<int64_t> output_shape({x_dims[0], x_dims[1]});
  if (param_.adaptive) {
    output_shape.insert(
@@ -71,15 +119,14 @@ bool PoolOpLite::InferShape() const {
    for (size_t i = 0; i < param_.ksize.size(); ++i) {
      output_shape.push_back(PoolOutputSize(x_dims[i + 2],
                                            param_.ksize[i],
-                                            param_.paddings[i],
+                                            paddings[2 * i],
+                                            paddings[2 * i + 1],
                                            param_.strides[i],
                                            param_.ceil_mode));
    }
  }
  param_.output->Resize(lite::DDim(output_shape));

-  // ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
-  // ctx->ShareLoD("X", "Out");
  return true;
 }


--- a/lite/operators/pool_op.h
+++ b/lite/operators/pool_op.h
@@ -14,6 +14,7 @@

 #pragma once

+#include <memory>
 #include <string>
 #include <vector>
 #include "lite/core/kernel.h"
@@ -51,7 +52,7 @@ class PoolOpLite : public OpLite {
    param_.ksize = op_desc.GetAttr<std::vector<int>>("ksize");
    param_.global_pooling = op_desc.GetAttr<bool>("global_pooling");
    param_.strides = op_desc.GetAttr<std::vector<int>>("strides");
-    param_.paddings = op_desc.GetAttr<std::vector<int>>("paddings");
+    auto paddings = op_desc.GetAttr<std::vector<int>>("paddings");

    if (op_desc.HasAttr("exclusive")) {
      param_.exclusive = op_desc.GetAttr<bool>("exclusive");
@@ -65,7 +66,23 @@ class PoolOpLite : public OpLite {
    if (op_desc.HasAttr("use_quantizer")) {
      param_.use_quantizer = op_desc.GetAttr<bool>("use_quantizer");
    }
-    // param_.data_format = op_desc.GetAttr<bool>("data_format");
+    if (op_desc.HasAttr("padding_algorithm")) {
+      padding_algorithm_ = op_desc.GetAttr<std::string>("padding_algorithm");
+    }
+    // 2-pad to 4-pad
+    if (paddings.size() == 2L) {
+      for (size_t i = 0; i < 2L; ++i) {
+        int copy_pad = *(paddings.begin() + 2 * i);
+        paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
+      }
+    } else {
+      if (paddings.size() != 4L) {
+        LOG(FATAL)
+            << "Paddings size should be the same or twice as the inputs size.";
+      }
+    }
+    param_.paddings = std::make_shared<std::vector<int>>(paddings);
+
    return true;
  }

@@ -75,6 +92,7 @@ class PoolOpLite : public OpLite {

 private:
  mutable PoolParam param_;
+  std::string padding_algorithm_{""};
 };

 }  // namespace operators

--- a/lite/tests/math/pool_compute_test.cc
+++ b/lite/tests/math/pool_compute_test.cc
@@ -69,8 +69,7 @@ DDim compute_out_dim(const DDim& dim_in,
  auto kernel_w = param.ksize[1];
  auto h = dim_in[2];
  auto w = dim_in[3];
-  int pad_h = param.paddings[0];
-  int pad_w = param.paddings[1];
+  auto paddings = *param.paddings;
  int stride_h = param.strides[0];
  int stride_w = param.strides[1];
  bool ceil_mode = param.ceil_mode;
@@ -79,11 +78,15 @@ DDim compute_out_dim(const DDim& dim_in,
  int wout = 1;
  if (!flag_global) {
    if (!ceil_mode) {
-      hout = (h - kernel_h + 2 * pad_h) / stride_h + 1;
-      wout = (w - kernel_w + 2 * pad_w) / stride_w + 1;
+      hout = (h - kernel_h + paddings[0] + paddings[1]) / stride_h + 1;
+      wout = (w - kernel_w + paddings[2] + paddings[3]) / stride_w + 1;
    } else {
-      hout = (h - kernel_h + 2 * pad_h + stride_h - 1) / stride_h + 1;
-      wout = (w - kernel_w + 2 * pad_w + stride_w - 1) / stride_w + 1;
+      hout =
+          (h - kernel_h + paddings[0] + paddings[1] + stride_h - 1) / stride_h +
+          1;
+      wout =
+          (w - kernel_w + paddings[2] + paddings[3] + stride_w - 1) / stride_w +
+          1;
    }
  }
  dim_out[2] = hout;
@@ -116,7 +119,7 @@ void pooling_basic(const float* din,
  int stride_h = strides[0];
  int stride_w = strides[1];
  int pad_h = paddings[0];
-  int pad_w = paddings[1];
+  int pad_w = paddings[2];
  int size_channel_in = win * hin;
  int size_channel_out = wout * hout;
  if (global_pooling) {
@@ -195,18 +198,22 @@ void pooling_basic(const float* din,
                int bh = kernel_h;
                int bw = kernel_w;
                if (ew == win) {
-                  bw = sw + kernel_w >= win + pad_w ? win + pad_w
-                                                    : sw + kernel_w;
+                  bw = (sw + kernel_w) >= (win + paddings[3])
+                           ? (win + paddings[3])
+                           : (sw + kernel_w);
                  bw -= sw;
-                  if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) {
+                  if ((sw - pad_w) < 0 &&
+                      (sw + kernel_w) > (win + paddings[3])) {
                    bw += pad_w;
                  }
                }
                if (eh == hin) {
-                  bh = sh + kernel_h >= hin + pad_h ? hin + pad_h
-                                                    : sh + kernel_h;
+                  bh = (sh + kernel_h) >= (hin + paddings[1])
+                           ? (hin + paddings[1])
+                           : (sh + kernel_h);
                  bh -= sh;
-                  if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) {
+                  if ((sh - pad_h) < 0 &&
+                      (sh + kernel_h) > (hin + paddings[1])) {
                    bh += pad_h;
                  }
                }
@@ -243,7 +250,7 @@ void test_pool_fp32(const std::vector<DDim>& input_dims,
  param.ksize = ksize;

  param.strides = strides;
-  param.paddings = pads;
+  param.paddings = std::make_shared<std::vector<int>>(pads);
  param.ceil_mode = ceil_mode;
  param.global_pooling = flag_global;
  param.pooling_type = pooling_type;
@@ -399,31 +406,38 @@ TEST(TestPoolRand, test_pool_rand) {
      for (auto& kw : {1, 2, 3}) {
        for (auto& kh : {1, 2, 3}) {
          for (auto& stride : {1, 2}) {
-            for (auto& pad : {0, 1, 2}) {
-              for (auto& flag_global : {false, true}) {
-                for (auto& exclusive : {false, true}) {
-                  for (auto& ceil_mode : {false, true}) {
-                    for (auto& pooling_type : {"max", "avg"}) {
-                      bool adaptive = false;
-                      bool use_quantizer = false;
-                      std::vector<DDim> dims;
-                      for (auto& batch : {1, 2}) {
-                        for (auto& h : {1, 2, 3, 4, 11, 19, 32, 28}) {
-                          dims.push_back(DDim({batch, cin, h, h}));
+            for (auto& pad_top : {0, 1, 2}) {
+              for (auto& pad_bottom : {0, 1, 2}) {
+                for (auto& pad_left : {0, 1, 2}) {
+                  for (auto& pad_right : {0, 1, 2}) {
+                    for (auto& flag_global : {false, true}) {
+                      for (auto& exclusive : {false, true}) {
+                        for (auto& ceil_mode : {false, true}) {
+                          for (auto& pooling_type : {"max", "avg"}) {
+                            bool adaptive = false;
+                            bool use_quantizer = false;
+                            std::vector<DDim> dims;
+                            for (auto& batch : {1, 2}) {
+                              for (auto& h : {1, 2, 3, 4, 11, 19, 32, 28}) {
+                                dims.push_back(DDim({batch, cin, h, h}));
+                              }
+                            }
+                            test_pool_fp32(
+                                dims,
+                                {kh, kw},
+                                {stride, stride},
+                                {pad_top, pad_bottom, pad_left, pad_right},
+                                ceil_mode,
+                                flag_global,
+                                exclusive,
+                                adaptive,
+                                use_quantizer,
+                                pooling_type,
+                                {1, 2, 4},
+                                {FLAGS_power_mode});
+                          }
                        }
                      }
-                      test_pool_fp32(dims,
-                                     {kh, kw},
-                                     {stride, stride},
-                                     {pad, pad},
-                                     ceil_mode,
-                                     flag_global,
-                                     exclusive,
-                                     adaptive,
-                                     use_quantizer,
-                                     pooling_type,
-                                     {1, 2, 4},
-                                     {FLAGS_power_mode});
                    }
                  }
                }
@@ -443,7 +457,7 @@ TEST(TesPoolCustom, test_pool_fp32_custom_size) {
      {DDim({FLAGS_batch, FLAGS_in_channel, FLAGS_in_height, FLAGS_in_width})},
      {FLAGS_kernel_h, FLAGS_kernel_w},
      {FLAGS_stride_h, FLAGS_stride_w},
-      {FLAGS_pad_h, FLAGS_pad_w},
+      {FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w},
      FLAGS_ceil_mode,
      FLAGS_flag_global,
      FLAGS_exclusive,