From 24010472d4287b3ae2c740e166c52178dfa58a71 Mon Sep 17 00:00:00 2001
From: liym27 <33742067+liym27@users.noreply.github.com>
Date: Sat, 28 Sep 2019 22:13:48 +0800
Subject: [PATCH] fix pool2d pool3d, support asymmetric padding and channel_last (#19739)

* fix pool2d pool3d:
  1. support asymmetric padding;
  2. support padding algorithm: "SAME" and "VALID";
  3. support channel_last: data_format NHWC and NDHWC;
  4. support inferring shape when input with negative dims in compile time;
  5. change doc of python API and c++;
  6. fix bug in cuda kernel when Attr(adaptive) is true.
  test=develop,test=document_preview

* fix 'tensors' to 'Tensors'. test=develop,test=document_preview

* add test for ValueError coverage. test=develop,test=document_preview

* resolve conflict in test_pool2d. test=develop
---
 paddle/fluid/API.spec                       |    4 +-
 paddle/fluid/operators/math/pooling.cc      |  890 +++++++++++++-
 paddle/fluid/operators/math/pooling.cu      |  605 +++++++--
 paddle/fluid/operators/math/pooling.h       |   59 +-
 paddle/fluid/operators/pool_cudnn_op.cu.cc  |  246 +++-
 paddle/fluid/operators/pool_op.cc           |  235 ++--
 paddle/fluid/operators/pool_op.h            |  140 ++-
 paddle/fluid/platform/cudnn_helper.h        |   30 +-
 paddle/fluid/platform/dynload/cudnn.h       |    3 +-
 python/paddle/fluid/layers/nn.py            |  266 +++-
 .../fluid/tests/unittests/test_pool2d_op.py |  866 ++++++++++++-
 .../fluid/tests/unittests/test_pool3d_op.py | 1086 ++++++++++++++---
 12 files changed, 3933 insertions(+), 497 deletions(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index f435a79871e..8cf96441635 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -144,8 +144,8 @@ paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size',
 paddle.fluid.layers.sequence_pool (ArgSpec(args=['input', 'pool_type', 'is_test', 'pad_value'], varargs=None, keywords=None, defaults=(False, 0.0)), ('document', 'e90a93251c52dc4e6fb34fb3991b3f82'))
 paddle.fluid.layers.sequence_softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'eaa9d0bbd3d4e017c8bc4ecdac483711'))
 paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name', 'axis'], varargs=None, keywords=None, defaults=(False, None, -1)), ('document', 'cee673c79e3ff4582656a24e04f841e5'))
-paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', 'be7e530dcbd603962e25573a63eb145e'))
-paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '053b1a855f13a066d005759171724bc6'))
+paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive', 'data_format'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True, 'NCHW')), ('document', '630cae697d46b4b575b15d56cf8be25a'))
+paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive', 'data_format'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True, 'NCDHW')), ('document', 
'db0035a3132b1dfb12e53c57591fb9f6')) paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '52343203de40afe29607397e13aaf0d2')) paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '55db6ae7275fb9678a6814aebab81a9c')) paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '9e5a9f4f6d82d34a33d9ca632379cbcc')) diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc index 30873e9f87f..1e86c2e7a32 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/fluid/operators/math/pooling.cc @@ -13,17 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/pooling.h" #include +#include #include +#include "paddle/fluid/operators/math/math_function.h" namespace paddle { namespace operators { namespace math { /* - * All tensors are in NCHW format. - * Ksize, strides, paddings are two elements. These two elements represent - * height and width, respectively. - */ +* Tensors are in NCHW or NHWC format. +* Ksize, strides are two elements. These two elements represent height +* and width, respectively. +* Paddings are four elements. These four elements represent height_up, +* height_down, width_left and width_right, respectively. +*/ template class Pool2dFunctor { public: @@ -92,12 +96,137 @@ class Pool2dFunctor { } } } + + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, PoolProcess pool_process, + bool exclusive, bool adaptive, framework::Tensor* output) { + bool channel_last = (data_format == "NHWC"); + + const int batch_size = input.dims()[0]; + const int input_channels = channel_last ? input.dims()[3] : input.dims()[1]; + const int input_height = channel_last ? input.dims()[1] : input.dims()[2]; + const int input_width = channel_last ? input.dims()[2] : input.dims()[3]; + + const int output_channels = + channel_last ? output->dims()[3] : output->dims()[1]; + const int output_height = + channel_last ? output->dims()[1] : output->dims()[2]; + const int output_width = + channel_last ? 
output->dims()[2] : output->dims()[3]; + + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + + const int stride_height = strides[0]; + const int stride_width = strides[1]; + + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + int hstart, hend; + int wstart, wend; + if (!channel_last) { + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } else { + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } + for (int pw = 0; pw < output_width; ++pw) { + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } + + T ele = pool_process.initial(); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + pool_process.compute(input_data[h * input_width + w], &ele); + } + } + int pool_size = (exclusive || adaptive) + ? (hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; + pool_process.finalize(static_cast(pool_size), &ele); + output_data[ph * output_width + pw] = ele; + } + } + input_data += input_stride; + output_data += output_stride; + } + } + } else { + const int input_stride = input_height * input_width * input_channels; + const int output_stride = output_height * output_width * output_channels; + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } else { + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } + for (int pw = 0; pw < output_width; ++pw) { + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } + T ele = pool_process.initial(); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + pool_process.compute( + input_data[h * input_width * input_channels + + w * input_channels + c], + &ele); + } + } + int pool_size = (exclusive || adaptive) + ? (hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; + + pool_process.finalize(static_cast(pool_size), &ele); + output_data[ph * output_width * output_channels + + pw * output_channels + c] = ele; + } + } + } + input_data += input_stride; + output_data += output_stride; + } + } + } }; /* -* All tensors are in NCHW format. -* Ksize, strides, paddings are two elements. These two elements represent height +* tensors are in NCHW or NHWC format. +* Ksize, strides are two elements. These two elements represent height * and width, respectively. +* Paddings are four elements. 
These four elements represent height_up, +* height_down, width_left and width_right, respectively. */ template class Pool2dGradFunctor { @@ -173,13 +302,147 @@ class Pool2dGradFunctor { } } } + + void operator()( + const platform::CPUDeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, const framework::Tensor& output_grad, + const std::vector& ksize, const std::vector& strides, + const std::vector& paddings, const std::string data_format, + PoolProcess pool_grad_process, bool exclusive, bool adaptive, + framework::Tensor* input_grad) { + bool channel_last = (data_format == "NHWC"); + + const int batch_size = input.dims()[0]; + + const int input_channels = channel_last ? input.dims()[3] : input.dims()[1]; + const int input_height = channel_last ? input.dims()[1] : input.dims()[2]; + const int input_width = channel_last ? input.dims()[2] : input.dims()[3]; + + const int output_channels = + channel_last ? output.dims()[3] : output.dims()[1]; + const int output_height = + channel_last ? output.dims()[1] : output.dims()[2]; + const int output_width = channel_last ? output.dims()[2] : output.dims()[3]; + + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + + const int stride_height = strides[0]; + const int stride_width = strides[1]; + + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int hstart, hend; + int wstart, wend; + if (!channel_last) { + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } else { + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } + for (int pw = 0; pw < output_width; ++pw) { + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } + int pool_size = (exclusive || adaptive) + ? 
(hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; + float scale = 1.0 / pool_size; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + pool_grad_process.compute( + input_data[h * input_width + w], + output_data[ph * output_width + pw], + output_grad_data[ph * output_width + pw], + static_cast(scale), + input_grad_data + h * input_width + w); + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } else { + const int input_stride = input_height * input_width * input_channels; + const int output_stride = output_height * output_width * output_channels; + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } else { + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } + for (int pw = 0; pw < output_width; ++pw) { + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } + int pool_size = (exclusive || adaptive) + ? (hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; + float scale = 1.0 / pool_size; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + auto input_idx = + h * input_width * input_channels + w * input_channels + c; + auto output_idx = ph * output_width * output_channels + + pw * output_channels + c; + pool_grad_process.compute( + input_data[input_idx], output_data[output_idx], + output_grad_data[output_idx], static_cast(scale), + input_grad_data + input_idx); + } + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } }; /* - * All tensors are in NCHW format. - * Ksize, strides, paddings are two elements. These two elements represent - * height and width, respectively. - */ +* Tensors are in NCHW or NHWC format. +* Ksize, strides are two elements. These two elements represent height +* and width, respectively. +* Paddings are four elements. These four elements represent height_up, +* height_down, width_left and width_right, respectively. +*/ template class MaxPool2dGradFunctor { public: @@ -239,8 +502,112 @@ class MaxPool2dGradFunctor { } } } -}; + void operator()( + const platform::CPUDeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, const framework::Tensor& output_grad, + const std::vector& ksize, const std::vector& strides, + const std::vector& paddings, const std::string data_format, + framework::Tensor* input_grad) { + bool channel_last = (data_format == "NHWC"); + + const int batch_size = input.dims()[0]; + + const int input_channels = channel_last ? input.dims()[3] : input.dims()[1]; + const int input_height = channel_last ? input.dims()[1] : input.dims()[2]; + const int input_width = channel_last ? input.dims()[2] : input.dims()[3]; + + const int output_channels = + channel_last ? output.dims()[3] : output.dims()[1]; + const int output_height = + channel_last ? 
output.dims()[1] : output.dims()[2]; + const int output_width = channel_last ? output.dims()[2] : output.dims()[3]; + + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + + const int stride_height = strides[0]; + const int stride_width = strides[1]; + + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + if (!channel_last) { + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + + bool stop = false; + for (int h = hstart; h < hend && !stop; ++h) { + for (int w = wstart; w < wend && !stop; ++w) { + int input_idx = h * input_width + w; + int output_idx = ph * output_width + pw; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + stop = true; + } + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } else { + const int input_stride = input_height * input_width * input_channels; + const int output_stride = output_height * output_width * output_channels; + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + + bool stop = false; + for (int h = hstart; h < hend && !stop; ++h) { + for (int w = wstart; w < wend && !stop; ++w) { + int input_idx = + h * input_width * input_channels + w * input_channels + c; + int output_idx = ph * output_width * output_channels + + pw * output_channels + c; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + stop = true; + } + } + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } +}; template class MaxPool2dGradFunctor; template class MaxPool2dGradFunctor; @@ -266,10 +633,13 @@ template class Pool2dGradFunctor; /* - * All tensors are in NCDHW format. - * Ksize, strides, paddings are three elements. These three elements represent - * depth, height and width, respectively. - */ +* Tensors are in NCDHW or NDHWC format. +* Ksize, strides, paddings are three elements. These three elements represent +* depth, height and width, respectively. +* Paddings are six elements. These six elements represent depth_forth, +* depth_back, +* height_up, height_down, width_left and width_right, respectively. 
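The functors above switch between two flat-index layouts. As a minimal standalone sketch (not part of the patch; the helper names are illustrative), the NCHW/NHWC offset arithmetic they use is:

    // Flat offsets for the same logical element under both layouts.
    // NCHW: channel-major; NHWC: channel-last (channel varies fastest).
    #include <iostream>

    int IndexNCHW(int n, int c, int h, int w, int C, int H, int W) {
      return ((n * C + c) * H + h) * W + w;
    }

    int IndexNHWC(int n, int c, int h, int w, int C, int H, int W) {
      return ((n * H + h) * W + w) * C + c;
    }

    int main() {
      // Element (n=0, c=1, h=2, w=3) of a 1x4x8x8 tensor:
      std::cout << IndexNCHW(0, 1, 2, 3, 4, 8, 8) << "\n";  // 83
      std::cout << IndexNHWC(0, 1, 2, 3, 4, 8, 8) << "\n";  // 77
      return 0;
    }

This is why the channel-last branches above advance input_data by input_height * input_width * input_channels per batch rather than per channel.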
+*/ template class Pool3dFunctor { public: @@ -359,13 +729,180 @@ class Pool3dFunctor { } } } + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, PoolProcess pool_process, + bool exclusive, bool adaptive, framework::Tensor* output) { + bool channel_last = (data_format == "NDHWC"); + const int batch_size = input.dims()[0]; + + const int input_channels = channel_last ? input.dims()[4] : input.dims()[1]; + const int input_depth = channel_last ? input.dims()[1] : input.dims()[2]; + const int input_height = channel_last ? input.dims()[2] : input.dims()[3]; + const int input_width = channel_last ? input.dims()[3] : input.dims()[4]; + + const int output_channels = + channel_last ? output->dims()[4] : output->dims()[1]; + const int output_depth = + channel_last ? output->dims()[1] : output->dims()[2]; + const int output_height = + channel_last ? output->dims()[2] : output->dims()[3]; + const int output_width = + channel_last ? output->dims()[3] : output->dims()[4]; + + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + int dstart, dend; + int hstart, hend; + int wstart, wend; + + if (!channel_last) { + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + if (adaptive) { + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); + } else { + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + } + for (int ph = 0; ph < output_height; ++ph) { + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } else { + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } + for (int pw = 0; pw < output_width; ++pw) { + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } + int output_idx = (pd * output_height + ph) * output_width + pw; + T ele = pool_process.initial(); + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + pool_process.compute( + input_data[(d * input_height + h) * input_width + w], + &ele); + } + } + } + int pool_size = + (exclusive || adaptive) + ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) + : ksize_depth * ksize_height * ksize_width; + pool_process.finalize(static_cast(pool_size), &ele); + output_data[output_idx] = ele; + } + } + } + input_data += input_stride; + output_data += output_stride; + } + } + } else { + const int input_stride = + input_depth * input_height * input_width * input_channels; + const int output_stride = + output_depth * output_height * output_width * output_channels; + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + if (adaptive) { + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); + } else { + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + } + for (int ph = 0; ph < output_height; ++ph) { + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } else { + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } + for (int pw = 0; pw < output_width; ++pw) { + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } + + T ele = pool_process.initial(); + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_idx = + ((d * input_height + h) * input_width + w) * + input_channels + + c; + pool_process.compute(input_data[input_idx], &ele); + } + } + } + int pool_size = + (exclusive || adaptive) + ? (dend - dstart) * (hend - hstart) * (wend - wstart) + : ksize_depth * ksize_height * ksize_width; + pool_process.finalize(static_cast(pool_size), &ele); + int output_idx = + ((pd * output_height + ph) * output_width + pw) * + output_channels + + c; + output_data[output_idx] = ele; + } + } + } + } + input_data += input_stride; + output_data += output_stride; + } + } + } }; /* - * All tensors are in NCDHW format. - * Ksize, strides, paddings are three elements. These three elements represent - * depth, height and width, respectively. - */ +* Tensors are in NCDHW or NDHWC format. +* Ksize, strides, paddings are three elements. These three elements represent +* depth, height and width, respectively. +* Paddings are six elements. These six elements represent depth_forth, +* depth_back, +* height_up, height_down, width_left and width_right, respectively. +*/ template class Pool3dGradFunctor { public: @@ -461,13 +998,187 @@ class Pool3dGradFunctor { } } } + void operator()( + const platform::CPUDeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, const framework::Tensor& output_grad, + const std::vector& ksize, const std::vector& strides, + const std::vector& paddings, const std::string data_format, + PoolProcess pool_grad_process, bool exclusive, bool adaptive, + framework::Tensor* input_grad) { + bool channel_last = (data_format == "NDHWC"); + + const int batch_size = input.dims()[0]; + const int input_channels = channel_last ? input.dims()[4] : input.dims()[1]; + const int input_depth = channel_last ? input.dims()[1] : input.dims()[2]; + const int input_height = channel_last ? 
input.dims()[2] : input.dims()[3]; + const int input_width = channel_last ? input.dims()[3] : input.dims()[4]; + + const int output_channels = + channel_last ? output.dims()[4] : output.dims()[1]; + const int output_depth = channel_last ? output.dims()[1] : output.dims()[2]; + const int output_height = + channel_last ? output.dims()[2] : output.dims()[3]; + const int output_width = channel_last ? output.dims()[3] : output.dims()[4]; + + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int dstart, dend; + int hstart, hend; + int wstart, wend; + if (!channel_last) { + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + if (adaptive) { + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); + } else { + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + } + for (int ph = 0; ph < output_height; ++ph) { + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } else { + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } + for (int pw = 0; pw < output_width; ++pw) { + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } + + int pool_size = + (exclusive || adaptive) + ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) + : ksize_depth * ksize_height * ksize_width; + float scale = 1.0 / pool_size; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_idx = (d * input_height + h) * input_width + w; + int output_idx = + (pd * output_height + ph) * output_width + pw; + pool_grad_process.compute( + input_data[input_idx], output_data[output_idx], + output_grad_data[output_idx], static_cast(scale), + input_grad_data + input_idx); + } + } + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } else { + const int input_stride = + input_depth * input_height * input_width * input_channels; + const int output_stride = + output_depth * output_height * output_width * output_channels; + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + if (adaptive) { + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); + } else { + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + } + for (int ph = 0; ph < output_height; ++ph) { + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } else { + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } + for (int pw = 0; pw < output_width; ++pw) { + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } + + int pool_size = + (exclusive || adaptive) + ? (dend - dstart) * (hend - hstart) * (wend - wstart) + : ksize_depth * ksize_height * ksize_width; + float scale = 1.0 / pool_size; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_idx = + ((d * input_height + h) * input_width + w) * + input_channels + + c; + int output_idx = + ((pd * output_height + ph) * output_width + pw) * + output_channels + + c; + pool_grad_process.compute( + input_data[input_idx], output_data[output_idx], + output_grad_data[output_idx], static_cast(scale), + input_grad_data + input_idx); + } + } + } + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } }; /* - * All tensors are in NCDHW format. - * Ksize, strides, paddings are three elements. These three elements represent - * depth, height and width, respectively. - */ +* Tensors are in NCDHW or NDHWC format. +* Ksize, strides, paddings are three elements. These three elements represent +* depth, height and width, respectively. +* Paddings are six elements. These six elements represent depth_forth, +* depth_back, +* height_up, height_down, width_left and width_right, respectively. 
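The exclusive flag above changes the averaging denominator at the borders. A minimal sketch (not part of the patch), assuming a 1-D window for brevity:

    #include <algorithm>
    #include <iostream>

    int main() {
      const int input_h = 5, ksize = 3, stride = 2, pad = 1;
      for (int ph = 0; ph < 3; ++ph) {
        int hstart = ph * stride - pad;
        int hend = std::min(hstart + ksize, input_h);
        hstart = std::max(hstart, 0);
        // exclusive: divide by the in-bounds count (hend - hstart);
        // inclusive: divide by the full window size (ksize).
        std::cout << "window " << ph << ": exclusive=" << (hend - hstart)
                  << " inclusive=" << ksize << "\n";
      }
      return 0;
    }

With exclusive averaging, padded (out-of-bounds) positions are left out of pool_size, so border windows divide by fewer elements; the gradient scale 1.0 / pool_size above follows the same rule.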
+*/ template class MaxPool3dGradFunctor { public: @@ -541,8 +1252,139 @@ class MaxPool3dGradFunctor { } } } -}; + void operator()( + const platform::CPUDeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, const framework::Tensor& output_grad, + const std::vector& ksize, const std::vector& strides, + const std::vector& paddings, const std::string data_format, + framework::Tensor* input_grad) { + bool channel_last = (data_format == "NDHWC"); + const int batch_size = input.dims()[0]; + + const int input_channels = channel_last ? input.dims()[4] : input.dims()[1]; + const int input_depth = channel_last ? input.dims()[1] : input.dims()[2]; + const int input_height = channel_last ? input.dims()[2] : input.dims()[3]; + const int input_width = channel_last ? input.dims()[3] : input.dims()[4]; + + const int output_channels = + channel_last ? output.dims()[4] : output.dims()[1]; + const int output_depth = channel_last ? output.dims()[1] : output.dims()[2]; + const int output_height = + channel_last ? output.dims()[2] : output.dims()[3]; + const int output_width = channel_last ? output.dims()[3] : output.dims()[4]; + + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + if (!channel_last) { + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + bool stop = false; + for (int d = dstart; d < dend && !stop; ++d) { + for (int h = hstart; h < hend && !stop; ++h) { + for (int w = wstart; w < wend && !stop; ++w) { + int input_idx = (d * input_height + h) * input_width + w; + int output_idx = + (pd * output_height + ph) * output_width + pw; + + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += + output_grad_data[output_idx]; + stop = true; + } + } + } + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } else { + const int input_stride = + input_depth * input_height * input_width * input_channels; + const int output_stride = + output_depth * output_height * output_width * output_channels; + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, 
input_depth); + dstart = std::max(dstart, 0); + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + bool stop = false; + + for (int d = dstart; d < dend && !stop; ++d) { + for (int h = hstart; h < hend && !stop; ++h) { + for (int w = wstart; w < wend && !stop; ++w) { + int input_idx = + ((d * input_height + h) * input_width + w) * + input_channels + + c; + int output_idx = + ((pd * output_height + ph) * output_width + pw) * + output_channels + + c; + + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += + output_grad_data[output_idx]; + stop = true; + } + } + } + } + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } +}; template class MaxPool3dGradFunctor; template class MaxPool3dGradFunctor; diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index efce3f899a4..29c0a85d402 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -29,13 +29,22 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, const int padding_width, PoolProcess pool_process, - bool exclusive, bool adaptive, T* output_data) { + bool exclusive, bool adaptive, T* output_data, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int pw = index % output_width; - int ph = (index / output_width) % output_height; - int c = (index / output_width / output_height) % channels; - int batch_idx = index / output_width / output_height / channels; + int pw, ph, c, batch_idx; + if (!channel_last) { /*NCHW*/ + pw = index % output_width; + ph = (index / output_width) % output_height; + c = (index / output_width / output_height) % channels; + batch_idx = index / output_width / output_height / channels; + } else { /*NHWC*/ + c = index % channels; + pw = (index / channels) % output_width; + ph = (index / channels / output_width) % output_height; + batch_idx = index / channels / output_width / output_height; + } int hstart, hend; int wstart, wend; @@ -55,11 +64,17 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, wstart = max(wstart, 0); } - input_data += (batch_idx * channels + c) * input_height * input_width; + if (!channel_last) { + input_data += (batch_idx * channels + c) * input_height * input_width; + } else { + input_data += batch_idx * input_height * input_width * channels; + } T ele = pool_process.initial(); for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - pool_process.compute(input_data[h * input_width + w], &ele); + auto input_idx = channel_last ? (h * input_width + w) * channels + c + : h * input_width + w; + pool_process.compute(input_data[input_idx], &ele); } } int pool_size = (exclusive || adaptive) ? 
(hend - hstart) * (wend - wstart) @@ -68,7 +83,6 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, output_data[index] = ele; } } - template __global__ void KernelPool2DGrad( const int nthreads, const T* input_data, const T* output_data, @@ -76,13 +90,23 @@ __global__ void KernelPool2DGrad( const int input_width, const int output_height, const int output_width, const int ksize_height, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, const int padding_width, - PoolProcess pool_process, bool exclusive, bool adaptive, T* input_grad) { + PoolProcess pool_process, bool exclusive, bool adaptive, T* input_grad, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int w_offset = index % input_width + padding_width; - int h_offset = (index / input_width) % input_height + padding_height; - int offsetC = (index / input_width / input_height) % channels; - int batch_idx = index / input_width / input_height / channels; + int w_offset, h_offset, offsetC, batch_idx; + if (!channel_last) { /* NCHW */ + w_offset = index % input_width + padding_width; + h_offset = (index / input_width) % input_height + padding_height; + offsetC = (index / input_width / input_height) % channels; + batch_idx = index / input_width / input_height / channels; + } else { /* NHWC */ + offsetC = index % channels; + w_offset = (index / channels) % input_width + padding_width; + h_offset = + (index / channels / input_width) % input_height + padding_height; + batch_idx = index / channels / input_width / input_height; + } int phstart, phend; int pwstart, pwend; @@ -105,10 +129,18 @@ __global__ void KernelPool2DGrad( } T gradient = 0; T input = input_data[index]; - int output_idx = - (batch_idx * channels + offsetC) * output_height * output_width; - output_data += output_idx; - output_grad += output_idx; + + int output_stride; + if (!channel_last) { + output_stride = + (batch_idx * channels + offsetC) * output_height * output_width; + } else { + output_stride = batch_idx * output_height * output_width * channels; + } + + output_data += output_stride; + output_grad += output_stride; + for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { int pool_size; @@ -127,7 +159,9 @@ __global__ void KernelPool2DGrad( pool_size = exclusive ? (hend - hstart) * (wend - wstart) : ksize_height * ksize_width; } - int output_sub_idx = ph * output_width + pw; + int output_sub_idx = channel_last + ? 
(ph * output_width + pw) * channels + offsetC + : ph * output_width + pw; pool_process.compute(input, output_data[output_sub_idx], output_grad[output_sub_idx], static_cast(1.0 / pool_size), &gradient); @@ -144,14 +178,21 @@ __global__ void KernelMaxPool2DGrad( const int input_width, const int output_height, const int output_width, const int ksize_height, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, const int padding_width, - T* input_grad) { + T* input_grad, bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int pw = index % output_width; - int ph = (index / output_width) % output_height; - int c = (index / output_width / output_height) % channels; - int batch_idx = index / output_width / output_height / channels; - + int pw, ph, c, batch_idx; + if (!channel_last) { /* NCHW */ + pw = index % output_width; + ph = (index / output_width) % output_height; + c = (index / output_width / output_height) % channels; + batch_idx = index / output_width / output_height / channels; + } else { /* NHWC */ + c = index % channels; + pw = (index / channels) % output_width; + ph = (index / channels / output_width) % output_height; + batch_idx = index / channels / output_width / output_height; + } int hstart = ph * stride_height - padding_height; int hend = min(hstart + ksize_height, input_height); hstart = max(hstart, 0); @@ -160,16 +201,24 @@ __global__ void KernelMaxPool2DGrad( int wend = min(wstart + ksize_width, input_width); wstart = max(wstart, 0); - input_data += (batch_idx * channels + c) * input_height * input_width; - input_grad += (batch_idx * channels + c) * input_height * input_width; + int input_stride; + if (!channel_last) { + input_stride = (batch_idx * channels + c) * input_height * input_width; + } else { + input_stride = batch_idx * input_height * input_width * channels; + } + input_data += input_stride; + input_grad += input_stride; T ele = output_data[index]; int maxIndex = -1; bool stop = false; for (int h = hstart; h < hend && !stop; ++h) { for (int w = wstart; w < wend && !stop; ++w) { - if (ele == input_data[h * input_width + w]) { - maxIndex = h * input_width + w; + int input_data_idx = channel_last ? (h * input_width + w) * channels + c + : h * input_width + w; + if (ele == input_data[input_data_idx]) { + maxIndex = input_data_idx; stop = true; } } @@ -214,10 +263,12 @@ void Pool2dDirectCUDAFunctor::operator()( } /* - * All tensors are in NCHW format. - * Ksize, strides, paddings are two elements. These two elements represent - * height and width, respectively. - */ +* Tensors are in NCHW or NHWC format. +* Ksize, strides are two elements. These two elements represent height +* and width, respectively. +* Paddings are four elements. These four elements represent height_up, +* height_down, width_left and width_right, respectively. 
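Given the four-element padding convention above, the pooled output extent along one axis follows the usual formula. A minimal standalone sketch (not part of the patch; PoolOutputSize is an illustrative name):

    #include <iostream>

    // pad_before/pad_after correspond to height_up/height_down
    // (or width_left/width_right) in the comment above.
    int PoolOutputSize(int input, int ksize, int pad_before, int pad_after,
                       int stride, bool ceil_mode) {
      int numerator = input + pad_before + pad_after - ksize;
      return ceil_mode ? (numerator + stride - 1) / stride + 1
                       : numerator / stride + 1;
    }

    int main() {
      // 7-row input, 3x3 kernel, stride 2, asymmetric height padding {1, 2}:
      // floor((7 + 1 + 2 - 3) / 2) + 1 = 4 output rows.
      std::cout << PoolOutputSize(7, 3, 1, 2, 2, false) << "\n";  // 4
      return 0;
    }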
+*/ template class Pool2dFunctor { public: @@ -254,13 +305,57 @@ class Pool2dFunctor { stride_width, padding_height, padding_width, pool_process, exclusive, adaptive, output_data); } -}; + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, PoolProcess pool_process, + bool exclusive, bool adaptive, framework::Tensor* output) { + bool channel_last = (data_format == "NHWC"); + const int batch_size = input.dims()[0]; + + const int input_channels = channel_last ? input.dims()[3] : input.dims()[1]; + const int input_height = channel_last ? input.dims()[1] : input.dims()[2]; + const int input_width = channel_last ? input.dims()[2] : input.dims()[3]; + + const int output_channels = + channel_last ? output->dims()[3] : output->dims()[1]; + const int output_height = + channel_last ? output->dims()[1] : output->dims()[2]; + const int output_width = + channel_last ? output->dims()[2] : output->dims()[3]; + + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + + const int stride_height = strides[0]; + const int stride_width = strides[1]; + + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + int nthreads = batch_size * output_channels * output_height * output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelPool2D<<>>( + nthreads, input_data, input_channels, input_height, input_width, + output_height, output_width, ksize_height, ksize_width, stride_height, + stride_width, padding_height, padding_width, pool_process, exclusive, + adaptive, output_data, channel_last); + } +}; /* - * All tensors are in NCHW format. - * Ksize, strides, paddings are two elements. These two elements represent - * height and width, respectively. - */ +* Tensors are in NCHW or NHWC format. +* Ksize, strides are two elements. These two elements represent height +* and width, respectively. +* Paddings are four elements. These four elements represent height_up, +* height_down, width_left and width_right, respectively. +*/ template class Pool2dGradFunctor { public: @@ -302,13 +397,62 @@ class Pool2dGradFunctor { ksize_width, stride_height, stride_width, padding_height, padding_width, pool_process, exclusive, adaptive, input_grad_data); } + void operator()( + const platform::CUDADeviceContext& context, + const framework::Tensor& input, const framework::Tensor& output, + const framework::Tensor& output_grad, const std::vector& ksize, + const std::vector& strides, const std::vector& paddings, + const std::string data_format, PoolProcess pool_process, bool exclusive, + bool adaptive, framework::Tensor* input_grad) { + bool channel_last = (data_format == "NHWC"); + + const int batch_size = input.dims()[0]; + + const int input_channels = channel_last ? input.dims()[3] : input.dims()[1]; + const int input_height = channel_last ? input.dims()[1] : input.dims()[2]; + const int input_width = channel_last ? input.dims()[2] : input.dims()[3]; + + const int output_channels = + channel_last ? output.dims()[3] : output.dims()[1]; + const int output_height = + channel_last ? output.dims()[1] : output.dims()[2]; + const int output_width = channel_last ? 
output.dims()[2] : output.dims()[3]; + + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + + const int stride_height = strides[0]; + const int stride_width = strides[1]; + + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * input_channels * input_height * input_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelPool2DGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_channels, + input_height, input_width, output_height, output_width, ksize_height, + ksize_width, stride_height, stride_width, padding_height, padding_width, + pool_process, exclusive, adaptive, input_grad_data, channel_last); + } }; /* - * All tensors are in NCHW format. - * Ksize, strides, paddings are two elements. These two elements represent - * height and width, respectively. - */ +* Tensors are in NCHW or NHWC format. +* Ksize, strides are two elements. These two elements represent height +* and width, respectively. +* Paddings are four elements. These four elements represent height_up, +* height_down, width_left and width_right, respectively. +*/ template class MaxPool2dGradFunctor { public: @@ -350,6 +494,51 @@ class MaxPool2dGradFunctor { ksize_width, stride_height, stride_width, padding_height, padding_width, input_grad_data); } + void operator()( + const platform::CUDADeviceContext& context, + const framework::Tensor& input, const framework::Tensor& output, + const framework::Tensor& output_grad, const std::vector& ksize, + const std::vector& strides, const std::vector& paddings, + const std::string data_format, framework::Tensor* input_grad) { + bool channel_last = (data_format == "NHWC"); + + const int batch_size = input.dims()[0]; + + const int input_channels = channel_last ? input.dims()[3] : input.dims()[1]; + const int input_height = channel_last ? input.dims()[1] : input.dims()[2]; + const int input_width = channel_last ? input.dims()[2] : input.dims()[3]; + + const int output_channels = + channel_last ? output.dims()[3] : output.dims()[1]; + const int output_height = + channel_last ? output.dims()[1] : output.dims()[2]; + const int output_width = channel_last ? 
output.dims()[2] : output.dims()[3]; + + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + + const int stride_height = strides[0]; + const int stride_width = strides[1]; + + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_height * output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxPool2DGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_channels, + input_height, input_width, output_height, output_width, ksize_height, + ksize_width, stride_height, stride_width, padding_height, padding_width, + input_grad_data, channel_last); + } }; template class Pool2dDirectCUDAFunctor, @@ -389,15 +578,26 @@ __global__ void KernelPool3D( const int ksize_depth, const int ksize_height, const int ksize_width, const int stride_depth, const int stride_height, const int stride_width, const int padding_depth, const int padding_height, const int padding_width, - PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data) { + PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int pw = index % output_width; - int ph = (index / output_width) % output_height; - int pd = (index / output_width / output_height) % output_depth; - int c = (index / output_width / output_height / output_depth) % channels; - int batch_idx = - index / output_width / output_height / output_depth / channels; + int pw, ph, pd, c, batch_idx; + if (!channel_last) { + pw = index % output_width; + ph = (index / output_width) % output_height; + pd = (index / output_width / output_height) % output_depth; + c = (index / output_width / output_height / output_depth) % channels; + batch_idx = + index / output_width / output_height / output_depth / channels; + } else { + c = index % channels; + pw = (index / channels) % output_width; + ph = (index / channels / output_width) % output_height; + pd = (index / channels / output_width / output_height) % output_depth; + batch_idx = + index / channels / output_width / output_height / output_depth; + } int dstart, dend; int hstart, hend; @@ -422,14 +622,26 @@ __global__ void KernelPool3D( hstart = max(hstart, 0); wstart = max(wstart, 0); } + + int input_data_stride; + if (!channel_last) { /* NCDHW */ + input_data_stride = + (batch_idx * channels + c) * input_depth * input_height * input_width; + } else { /* NDHWC */ + input_data_stride = + batch_idx * input_depth * input_height * input_width * channels; + } + input_data += input_data_stride; + T ele = pool_process.initial(); - input_data += - (batch_idx * channels + c) * input_depth * input_height * input_width; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - pool_process.compute( - input_data[(d * input_height + h) * input_width + w], &ele); + auto input_data_idx = + channel_last + ? 
((d * input_height + h) * input_width + w) * channels + c + : (d * input_height + h) * input_width + w; + pool_process.compute(input_data[input_data_idx], &ele); } } } @@ -450,15 +662,27 @@ __global__ void KernelPool3DGrad( const int ksize_height, const int ksize_width, const int stride_depth, const int stride_height, const int stride_width, const int padding_depth, const int padding_height, const int padding_width, PoolProcess pool_process, - bool exclusive, bool adaptive, T* input_grad) { + bool exclusive, bool adaptive, T* input_grad, bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int w_offset = index % input_width + padding_width; - int h_offset = (index / input_width) % input_height + padding_height; - int d_offset = - (index / input_width / input_height) % input_depth + padding_depth; - int offsetC = (index / input_width / input_height / input_depth) % channels; - int batch_idx = index / input_width / input_height / input_depth / channels; + int w_offset, h_offset, d_offset, offsetC, batch_idx; + if (!channel_last) { /* "NCDHW" */ + w_offset = index % input_width + padding_width; + h_offset = (index / input_width) % input_height + padding_height; + d_offset = + (index / input_width / input_height) % input_depth + padding_depth; + offsetC = (index / input_width / input_height / input_depth) % channels; + batch_idx = index / input_width / input_height / input_depth / channels; + + } else { /* "NDHWC" */ + offsetC = index % channels; + w_offset = (index / channels) % input_width + padding_width; + h_offset = + (index / channels / input_width) % input_height + padding_height; + d_offset = (index / channels / input_width / input_height) % input_depth + + padding_depth; + batch_idx = index / channels / input_width / input_height / input_depth; + } int pdstart, pdend; int phstart, phend; @@ -490,10 +714,17 @@ __global__ void KernelPool3DGrad( T gradient = 0; T input = input_data[index]; - int output_idx = (batch_idx * channels + offsetC) * output_depth * - output_height * output_width; - output_data += output_idx; - output_grad += output_idx; + + int output_stride; + if (!channel_last) { + output_stride = (batch_idx * channels + offsetC) * output_depth * + output_height * output_width; + } else { + output_stride = + batch_idx * output_depth * output_height * output_width * channels; + } + output_data += output_stride; + output_grad += output_stride; for (int pd = pdstart; pd < pdend; ++pd) { for (int ph = phstart; ph < phend; ++ph) { @@ -522,7 +753,13 @@ __global__ void KernelPool3DGrad( exclusive ? (dend - dstart) * (hend - hstart) * (wend - wstart) : ksize_depth * ksize_height * ksize_width; } - int output_sub_idx = (pd * output_height + ph) * output_width + pw; + + int output_sub_idx = + channel_last + ? 
((pd * output_height + ph) * output_width + pw) * channels + + offsetC : (pd * output_height + ph) * output_width + pw; + pool_process.compute(input, output_data[output_sub_idx], output_grad[output_sub_idx], static_cast(1.0 / pool_size), &gradient); @@ -541,38 +778,64 @@ __global__ void KernelMaxPool3DGrad( const int output_height, const int output_width, const int ksize_depth, const int ksize_height, const int ksize_width, const int stride_depth, const int stride_height, const int stride_width, const int padding_depth, - const int padding_height, const int padding_width, T* input_grad) { + const int padding_height, const int padding_width, T* input_grad, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int pw = index % output_width; - int ph = (index / output_width) % output_height; - int pd = (index / output_width / output_height) % output_depth; - int c = (index / output_width / output_height / output_depth) % channels; - int batch_idx = - index / output_width / output_height / output_depth / channels; + int pw, ph, pd, c, batch_idx; + + if (!channel_last) { /*NCDHW*/ + pw = index % output_width; + ph = (index / output_width) % output_height; + pd = (index / output_width / output_height) % output_depth; + c = (index / output_width / output_height / output_depth) % channels; + batch_idx = + index / output_width / output_height / output_depth / channels; + } else { /*NDHWC*/ + c = index % channels; + pw = (index / channels) % output_width; + ph = (index / channels / output_width) % output_height; + pd = (index / channels / output_width / output_height) % output_depth; + batch_idx = + index / channels / output_width / output_height / output_depth; + } + int dstart = pd * stride_depth - padding_depth; int hstart = ph * stride_height - padding_height; int wstart = pw * stride_width - padding_width; + int dend = min(dstart + ksize_depth, input_depth); int hend = min(hstart + ksize_height, input_height); int wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); hstart = max(hstart, 0); wstart = max(wstart, 0); + T ele = output_data[index]; bool stop = false; int maxIdx = -1; - input_data += - (batch_idx * channels + c) * input_depth * input_height * input_width; - input_grad += - (batch_idx * channels + c) * input_depth * input_height * input_width; + int input_stride; + if (!channel_last) { + input_stride = + (batch_idx * channels + c) * input_depth * input_height * input_width; + } else { + input_stride = + batch_idx * input_depth * input_height * input_width * channels; + } + input_data += input_stride; + input_grad += input_stride; for (int d = dstart; d < dend && !stop; ++d) { for (int h = hstart; h < hend && !stop; ++h) { for (int w = wstart; w < wend && !stop; ++w) { - if (ele == input_data[(d * input_height + h) * input_width + w]) { + int input_data_idx = + channel_last + ? ((d * input_height + h) * input_width + w) * channels + c + : (d * input_height + h) * input_width + w; + if (ele == input_data[input_data_idx]) { stop = true; - maxIdx = (d * input_height + h) * input_width + w; + maxIdx = input_data_idx; } } } @@ -585,10 +848,13 @@ } /* - * All tensors are in NCDHW format. - * Ksize, strides, paddings are three elements. These three elements represent - * depth, height and width, respectively. - */ +* Tensors are in NCDHW or NDHWC format. +* Ksize, strides are three elements. These three elements represent +* depth, height and width, respectively. +* Paddings are six elements. These six elements represent depth_front, +* depth_back, +* height_up, height_down, width_left and width_right, respectively. +*/ template class Pool3dFunctor { public: @@ -632,13 +898,67 @@ class Pool3dFunctor { padding_depth, padding_height, padding_width, pool_process, exclusive, adaptive, output_data); } + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, PoolProcess pool_process, + bool exclusive, bool adaptive, framework::Tensor* output) { + bool channel_last = (data_format == "NDHWC"); + const int batch_size = input.dims()[0]; + + const int input_channels = channel_last ? input.dims()[4] : input.dims()[1]; + const int input_depth = channel_last ? input.dims()[1] : input.dims()[2]; + const int input_height = channel_last ? input.dims()[2] : input.dims()[3]; + const int input_width = channel_last ? input.dims()[3] : input.dims()[4]; + + const int output_channels = + channel_last ? output->dims()[4] : output->dims()[1]; + const int output_depth = + channel_last ? output->dims()[1] : output->dims()[2]; + const int output_height = + channel_last ? output->dims()[2] : output->dims()[3]; + const int output_width = + channel_last ? output->dims()[3] : output->dims()[4]; + + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_depth * output_height * + output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelPool3D<<>>( + nthreads, input_data, input_channels, input_depth, input_height, + input_width, output_depth, output_height, output_width, ksize_depth, + ksize_height, ksize_width, stride_depth, stride_height, stride_width, + padding_depth, padding_height, padding_width, pool_process, exclusive, + adaptive, output_data, channel_last); + } }; /* - * All tensors are in NCDHW format. - * Ksize, strides, paddings are three elements. These three elements represent - * depth, height and width, respectively. - */ +* Tensors are in NCDHW or NDHWC format. +* Ksize, strides are three elements. These three elements represent +* depth, height and width, respectively. +* Paddings are six elements. These six elements represent depth_front, +* depth_back, +* height_up, height_down, width_left and width_right, respectively. +*/ template class Pool3dGradFunctor { public: @@ -688,13 +1008,69 @@ class Pool3dGradFunctor { stride_height, stride_width, padding_depth, padding_height, padding_width, pool_process, exclusive, adaptive, input_grad_data); } + void operator()( + const platform::CUDADeviceContext& context, + const framework::Tensor& input, const framework::Tensor& output, + const framework::Tensor& output_grad, const std::vector& ksize, + const std::vector& strides, const std::vector& paddings, + const std::string data_format, PoolProcess pool_process, bool exclusive, + bool adaptive, framework::Tensor* input_grad) { + bool channel_last = (data_format == "NDHWC"); + + const int batch_size = input.dims()[0]; + const int input_channels = channel_last ? input.dims()[4] : input.dims()[1]; + const int input_depth = channel_last ? input.dims()[1] : input.dims()[2]; + const int input_height = channel_last ? input.dims()[2] : input.dims()[3]; + const int input_width = channel_last ? input.dims()[3] : input.dims()[4]; + + const int output_channels = + channel_last ? output.dims()[4] : output.dims()[1]; + const int output_depth = channel_last ? output.dims()[1] : output.dims()[2]; + const int output_height = + channel_last ? output.dims()[2] : output.dims()[3]; + const int output_width = channel_last ? output.dims()[3] : output.dims()[4]; + + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = + batch_size * input_channels * input_depth * input_height * input_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelPool3DGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_channels, + input_depth, input_height, input_width, output_depth, output_height, + output_width, ksize_depth, ksize_height, ksize_width, stride_depth, + stride_height, stride_width, padding_depth, padding_height, + padding_width, pool_process, exclusive, adaptive, input_grad_data, + channel_last); // add channel_last + } }; /* - * All tensors are in NCDHW format. - * Ksize, strides, paddings are three elements. These three elements represent - * depth, height and width, respectively. - */ +* Tensors are in NCDHW or NDHWC format. +* Ksize, strides are three elements. These three elements represent +* depth, height and width, respectively. +* Paddings are six elements. These six elements represent depth_front, +* depth_back, +* height_up, height_down, width_left and width_right, respectively. 
+*/ template class MaxPool3dGradFunctor { public: @@ -743,6 +1119,57 @@ class MaxPool3dGradFunctor { stride_height, stride_width, padding_depth, padding_height, padding_width, input_grad_data); } + void operator()( + const platform::CUDADeviceContext& context, + const framework::Tensor& input, const framework::Tensor& output, + const framework::Tensor& output_grad, const std::vector& ksize, + const std::vector& strides, const std::vector& paddings, + const std::string data_format, framework::Tensor* input_grad) { + bool channel_last = (data_format == "NDHWC"); + const int batch_size = input.dims()[0]; + + const int input_channels = channel_last ? input.dims()[4] : input.dims()[1]; + const int input_depth = channel_last ? input.dims()[1] : input.dims()[2]; + const int input_height = channel_last ? input.dims()[2] : input.dims()[3]; + const int input_width = channel_last ? input.dims()[3] : input.dims()[4]; + + const int output_channels = + channel_last ? output.dims()[4] : output.dims()[1]; + const int output_depth = channel_last ? output.dims()[1] : output.dims()[2]; + const int output_height = + channel_last ? output.dims()[2] : output.dims()[3]; + const int output_width = channel_last ? output.dims()[3] : output.dims()[4]; + + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_depth * output_height * + output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxPool3DGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_channels, + input_depth, input_height, input_width, output_depth, output_height, + output_width, ksize_depth, ksize_height, ksize_width, stride_depth, + stride_height, stride_width, padding_depth, padding_height, + padding_width, input_grad_data, channel_last); // add channel_last + } }; template class MaxPool3dGradFunctor; diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h index e1f8e6df1d1..548612e8de2 100644 --- a/paddle/fluid/operators/math/pooling.h +++ b/paddle/fluid/operators/math/pooling.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" @@ -83,10 +84,11 @@ HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) { /* * \brief Getting pooling results, and calculating gradient. * - * In pool2d, all tensors are in NCHW format. Where N is batch size, C is the - * number of channels, H and W is the height and width of feature. - * In pool3d, all tensors are in NCDHW format. Where N is batch size, C is the - * number of channels, D, H and W is the depth, height and width of feature. + * In pool2d, all Tensors are in NCHW or NHWC format. Where N is batch size, C + * is the number of channels, H and W is the height and width of feature. + * In pool3d, all Tensors are in NCDHW or NDHWC format. 
Where N is batch size, C + * is the number of channels, D, H and W is the depth, height and width of + * feature. * * In max pooling, it is possible that the pooling region has multiple maximum * elements. In this case, we should compute the gradient of the first maximum @@ -115,6 +117,14 @@ class Pool2dFunctor { const std::vector& strides, const std::vector& paddings, PoolProcess pool_compute, bool exclusive, bool adaptive, framework::Tensor* output); + + // overload operator() to support argument data_format + void operator()(const DeviceContext& context, const framework::Tensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, PoolProcess pool_compute, + bool exclusive, bool adaptive, framework::Tensor* output); }; template @@ -127,6 +137,15 @@ class Pool2dGradFunctor { const std::vector& strides, const std::vector& paddings, PoolProcess pool_compute, bool exclusive, bool adaptive, framework::Tensor* input_grad); + // overload operator() to support argument data_format + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, PoolProcess pool_compute, + bool exclusive, bool adaptive, framework::Tensor* input_grad); }; template @@ -139,6 +158,14 @@ class MaxPool2dGradFunctor { const std::vector& strides, const std::vector& paddings, framework::Tensor* input_grad); + // overload operator() to support argument data_format + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, framework::Tensor* input_grad); }; template @@ -149,6 +176,13 @@ class Pool3dFunctor { const std::vector& strides, const std::vector& paddings, PoolProcess pool_compute, bool exclusive, bool adaptive, framework::Tensor* output); + // overload operator() to support argument data_format + void operator()(const DeviceContext& context, const framework::Tensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, PoolProcess pool_compute, + bool exclusive, bool adaptive, framework::Tensor* output); }; template @@ -161,6 +195,15 @@ class Pool3dGradFunctor { const std::vector& strides, const std::vector& paddings, PoolProcess pool_compute, bool exclusive, bool adaptive, framework::Tensor* input_grad); + // overload operator() to support argument data_format + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, PoolProcess pool_compute, + bool exclusive, bool adaptive, framework::Tensor* input_grad); }; template @@ -173,6 +216,14 @@ class MaxPool3dGradFunctor { const std::vector& strides, const std::vector& paddings, framework::Tensor* input_grad); + // overload operator() to support argument data_format + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const 
std::vector& paddings, + const std::string data_format, framework::Tensor* input_grad); }; /* diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index b26f1270268..78df73ae183 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/pool_op.h" #include "paddle/fluid/platform/cudnn_helper.h" @@ -27,47 +29,117 @@ using PoolingMode = platform::PoolingMode; template using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; +DataLayout getLayoutFromStr(std::string data_format) { + if (data_format == "NHWC") { + return DataLayout::kNHWC; + } else if (data_format == "NCHW") { + return DataLayout::kNCHW; + } else if (data_format == "NCDHW") { + return DataLayout::kNCDHW; + } else { + return DataLayout::kNCDHW; + } +} + template class PoolCUDNNOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use CUDAPlace."); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + "It must use CUDAPlace."); const Tensor *input = ctx.Input("X"); Tensor *output = ctx.Output("Out"); - - const T *input_data = input->data(); - T *output_data = output->mutable_data(ctx.GetPlace()); - + output->mutable_data(ctx.GetPlace()); std::string pooling_type = ctx.Attr("pooling_type"); bool exclusive = ctx.Attr("exclusive"); + bool adaptive = ctx.Attr("adaptive"); std::vector ksize = ctx.Attr>("ksize"); std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); - if (ctx.Attr("global_pooling")) { - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(input->dims()[i + 2]); + std::string data_format = ctx.Attr("data_format"); + bool global_pooling = ctx.Attr("global_pooling"); + std::string padding_algorithm = ctx.Attr("padding_algorithm"); + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // update paddings + auto in_x_dims = input->dims(); + framework::DDim data_dims; + if (channel_last) { + data_dims = framework::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); + } else { + data_dims = framework::slice_ddim(in_x_dims, 2, in_x_dims.size()); + } + UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, + data_dims, strides, ksize); + if (data_dims.size() * 2 == paddings.size()) { + for (size_t i = 0; i < data_dims.size(); ++i) { + paddings.erase(paddings.begin() + i + 1); } } - // ------------------- cudnn descriptors --------------------- - ScopedTensorDescriptor input_desc; - ScopedTensorDescriptor output_desc; - ScopedPoolingDescriptor pool_desc; + if (global_pooling) { + UpdateKsize(&ksize, data_dims); + } + + const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; + const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; + + // -----------------transformed tensor ------------------------ + + Tensor transformed_input(input->type()); + Tensor transformed_output(output->type()); DataLayout layout; - if (strides.size() == 2U) { - layout = DataLayout::kNCHW; - } else { + if (data_format == str_NDHWC) { layout = 
DataLayout::kNCDHW; + auto &dev_ctx = + ctx.template device_context(); + std::vector axis{0, 4, 1, 2, 3}; + + // input + transformed_input.Resize(input->dims()); + + auto in_dims_vec = framework::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[4]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + in_dims_vec[4] = input->dims()[3]; + transformed_input.Resize(framework::make_ddim(in_dims_vec)); + transformed_input.mutable_data(ctx.GetPlace(), input->type()); + + math::Transpose trans5; + trans5(dev_ctx, *input, &transformed_input, axis); + + // output + transformed_output.Resize(output->dims()); + + auto out_dims_vec = framework::vectorize(output->dims()); + out_dims_vec[1] = output->dims()[4]; + out_dims_vec[2] = output->dims()[1]; + out_dims_vec[3] = output->dims()[2]; + out_dims_vec[4] = output->dims()[3]; + transformed_output.Resize(framework::make_ddim(out_dims_vec)); + + } else { + layout = getLayoutFromStr(data_format); + transformed_input = *input; + transformed_output = *output; } + const T *tranformed_input_data = transformed_input.data(); + T *tranformed_output_data = transformed_output.mutable_data( + transformed_output.dims(), ctx.GetPlace()); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedPoolingDescriptor pool_desc; + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, framework::vectorize(input->dims())); + layout, framework::vectorize(transformed_input.dims())); cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, framework::vectorize(output->dims())); + layout, framework::vectorize(transformed_output.dims())); PoolingMode pooling_mode; if (pooling_type == "max") { @@ -83,9 +155,19 @@ class PoolCUDNNOpKernel : public framework::OpKernel { // ------------------- cudnn pool algorithm --------------------- auto handle = ctx.cuda_device_context().cudnn_handle(); ScalingParamType alpha = 1.0f, beta = 0.0f; + CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward( - handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta, - cudnn_output_desc, output_data)); + handle, cudnn_pool_desc, &alpha, cudnn_input_desc, + tranformed_input_data, &beta, cudnn_output_desc, + tranformed_output_data)); + // add + if (data_format == str_NDHWC) { + auto &dev_ctx = + ctx.template device_context(); + std::vector axis{0, 2, 3, 4, 1}; + math::Transpose trans5_v2; + trans5_v2(dev_ctx, transformed_output, output, axis); + } } }; @@ -93,8 +175,8 @@ template class PoolCUDNNGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use CUDAPlace."); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + "It must use CUDAPlace."); const Tensor *input = ctx.Input("X"); const Tensor *output = ctx.Input("Out"); @@ -104,37 +186,109 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { std::string pooling_type = ctx.Attr("pooling_type"); bool exclusive = ctx.Attr("exclusive"); + bool adaptive = ctx.Attr("adaptive"); std::vector ksize = ctx.Attr>("ksize"); std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); - - if (ctx.Attr("global_pooling")) { - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(input->dims()[i + 2]); + std::string data_format = ctx.Attr("data_format"); + bool global_pooling = 
ctx.Attr("global_pooling"); + std::string padding_algorithm = ctx.Attr("padding_algorithm"); + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // update paddings + auto in_x_dims = input->dims(); + framework::DDim data_dims; + if (channel_last) { + data_dims = framework::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); + } else { + data_dims = framework::slice_ddim(in_x_dims, 2, in_x_dims.size()); + } + UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, + data_dims, strides, ksize); + if (data_dims.size() * 2 == paddings.size()) { + for (size_t i = 0; i < data_dims.size(); ++i) { + paddings.erase(paddings.begin() + i + 1); } } - const T *input_data = input->data(); - const T *output_data = output->data(); - const T *output_grad_data = output_grad->data(); + if (global_pooling) { + UpdateKsize(&ksize, data_dims); + } - // ------------------- cudnn descriptors --------------------- - ScopedTensorDescriptor input_desc; - ScopedTensorDescriptor output_desc; - ScopedPoolingDescriptor pool_desc; + // ------- tensor grad -------------- + Tensor transformed_input(input->type()); + Tensor transformed_output(output->type()); + Tensor transformed_output_grad(output_grad->type()); + + input_grad->mutable_data(ctx.GetPlace()); + Tensor transformed_input_grad(input_grad->type()); DataLayout layout; + const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; + const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; + if (data_format == str_NDHWC) { + layout = DataLayout::kNCDHW; + auto &dev_ctx = + ctx.template device_context(); + std::vector axis{0, 4, 1, 2, 3}; + + // input + transformed_input.Resize(input->dims()); + auto in_dims_vec = framework::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[4]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + in_dims_vec[4] = input->dims()[3]; + transformed_input.Resize(framework::make_ddim(in_dims_vec)); + transformed_input.mutable_data(ctx.GetPlace(), input->type()); + + math::Transpose trans5; + trans5(dev_ctx, *input, &transformed_input, axis); + + // output + transformed_output.Resize(output->dims()); + auto out_dims_vec = framework::vectorize(output->dims()); + out_dims_vec[1] = output->dims()[4]; + out_dims_vec[2] = output->dims()[1]; + out_dims_vec[3] = output->dims()[2]; + out_dims_vec[4] = output->dims()[3]; + transformed_output.Resize(framework::make_ddim(out_dims_vec)); + + transformed_output.mutable_data(ctx.GetPlace(), output->type()); + + math::Transpose trans5_v2; + trans5_v2(dev_ctx, *output, &transformed_output, axis); + + // output grad + transformed_output_grad.Resize(framework::make_ddim(out_dims_vec)); + transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type()); + + math::Transpose trans5_v3; + trans5_v3(dev_ctx, *output_grad, &transformed_output_grad, axis); + + // input grad + transformed_input_grad.Resize(framework::make_ddim(in_dims_vec)); - if (strides.size() == 2U) { - layout = DataLayout::kNCHW; } else { - layout = DataLayout::kNCDHW; + layout = getLayoutFromStr(data_format); + transformed_input = *input; + transformed_output = *output; + transformed_output_grad = *output_grad; + transformed_input_grad = *input_grad; } + const T *input_data = transformed_input.data(); + const T *output_data = transformed_output.data(); + const T *output_grad_data = transformed_output_grad.data(); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + 
ScopedPoolingDescriptor pool_desc; + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, framework::vectorize(input->dims())); + layout, framework::vectorize(transformed_input.dims())); cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, framework::vectorize(output->dims())); + layout, framework::vectorize(transformed_output.dims())); PoolingMode pooling_mode; if (pooling_type == "max") { @@ -155,13 +309,21 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { auto handle = ctx.cuda_device_context().cudnn_handle(); ScalingParamType alpha = 1.0f, beta = 0.0f; if (input_grad) { - T *input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + T *input_grad_data = transformed_input_grad.mutable_data( + transformed_input_grad.dims(), ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. - CUDNN_ENFORCE(platform::dynload::cudnnPoolingBackward( handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, &beta, cudnn_input_desc, input_grad_data)); + + if (data_format == str_NDHWC) { + auto &dev_ctx = + ctx.template device_context(); + std::vector axis{0, 2, 3, 4, 1}; + math::Transpose trans5_v4; + trans5_v4(dev_ctx, transformed_input_grad, input_grad, axis); + } } } }; diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 073c7fe7561..bde087e080d 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -24,29 +24,32 @@ limitations under the License. */ namespace paddle { namespace operators { -int PoolOutputSize(int input_size, int filter_size, int padding, int stride, - bool ceil_mode) { +int PoolOutputSize(int input_size, int filter_size, int padding_1, + int padding_2, int stride, bool ceil_mode) { int output_size; if (!ceil_mode) { - output_size = (input_size - filter_size + 2 * padding) / stride + 1; + output_size = + (input_size - filter_size + padding_1 + padding_2) / stride + 1; } else { output_size = - (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; + (input_size - filter_size + padding_1 + padding_2 + stride - 1) / + stride + + 1; } - PADDLE_ENFORCE(output_size > 0, - "Due to the settings of padding(%d), filter_size(%d) and " - "stride(%d), the output size is less than 0, please check " - "again. Input_size:%d", - padding, filter_size, stride, input_size); + PADDLE_ENFORCE_GT( + output_size, 0, + "Due to the settings of padding(%d,%d), filter_size(%d) and " + "stride(%d), the output size is less than 0, please check " + "again. 
Input_size:%d", + padding_1, padding_2, filter_size, stride, input_size); return output_size; } void PoolOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) of Pooling should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Out(Output) of Pooling should not be null."); - - auto in_x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "X(Input) of Pooling should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Out(Output) of Pooling should not be null."); std::string pooling_type = ctx->Attrs().Get("pooling_type"); std::vector ksize = ctx->Attrs().Get>("ksize"); @@ -54,38 +57,60 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const { std::vector paddings = ctx->Attrs().Get>("paddings"); bool ceil_mode = ctx->Attrs().Get("ceil_mode"); bool adaptive = ctx->Attrs().Get("adaptive"); + bool global_pooling = ctx->Attrs().Get("global_pooling"); + std::string data_format = ctx->Attrs().Get("data_format"); + std::string padding_algorithm = + ctx->Attrs().Get("padding_algorithm"); - PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, - "Pooling intput should be 4-D or 5-D tensor."); - - if (ctx->Attrs().Get("global_pooling")) { - ksize.resize(static_cast(in_x_dims.size()) - 2); - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x_dims[i + 2]); - } - } + auto in_x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(in_x_dims.size() == 4 || in_x_dims.size() == 5, true, + "Pooling intput should be 4-D or 5-D tensor."); - PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U, - "Input size and pooling size should be consistent."); + PADDLE_ENFORCE_EQ(in_x_dims.size() - ksize.size(), 2U, + "Input size and pooling size should be consistent."); PADDLE_ENFORCE_EQ(ksize.size(), strides.size(), "Strides size and pooling size should be the same."); - PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(), - "Paddings size and pooling size should be the same."); - std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // update paddings if "SAME" or global_pooling + framework::DDim data_dims; + if (channel_last) { + data_dims = framework::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); + } else { + data_dims = framework::slice_ddim(in_x_dims, 2, in_x_dims.size()); + } + UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, + data_dims, strides, ksize); + + if (global_pooling) { + UpdateKsize(&ksize, data_dims); + } + + std::vector output_shape; if (adaptive) { output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); } else { - for (size_t i = 0; i < ksize.size(); ++i) { - if (!ctx->IsRuntime() && in_x_dims[i + 2] <= 0) { - output_shape.push_back(-1); + for (size_t i = 0; i < data_dims.size(); ++i) { + if ((!ctx->IsRuntime()) && (data_dims[i] < 0)) { + output_shape.push_back(in_x_dims[i]); } else { - output_shape.push_back(PoolOutputSize( - in_x_dims[i + 2], ksize[i], paddings[i], strides[i], ceil_mode)); + output_shape.push_back( + PoolOutputSize(data_dims[i], ksize[i], paddings[2 * i], + paddings[2 * i + 1], strides[i], ceil_mode)); } } } + + // output_N = input_N + output_shape.insert(output_shape.begin(), in_x_dims[0]); + // output_C = input_C + if (channel_last) { + output_shape.push_back(in_x_dims[in_x_dims.size() - 1]); + } else { + output_shape.insert(output_shape.begin() + 1, in_x_dims[1]); + } + ctx->SetOutputDim("Out", 
framework::make_ddim(output_shape)); ctx->ShareLoD("X", "Out"); } @@ -93,7 +118,9 @@ framework::OpKernelType PoolOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; - std::string data_format = ctx.Attr("data_format"); + // The data_format attr is handled inside the pool kernels, so the kernel + // type is chosen with AnyLayout here. + std::string data_format = "AnyLayout"; framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA @@ -114,16 +141,18 @@ framework::OpKernelType PoolOp::GetExpectedKernelType( } void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), - "Input(X@GRAD) should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) must not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, + "Input(X@GRAD) should not be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } framework::OpKernelType PoolOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; - std::string data_format = ctx.Attr("data_format"); + // The data_format attr is handled inside the pool kernels, so the kernel + // type is chosen with AnyLayout here. + std::string data_format = "AnyLayout"; framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA @@ -186,8 +215,8 @@ void Pool2dOpMaker::Make() { // TypedAttrChecker don't support vector type.) AddAttr>( "paddings", - "(vector, default {0,0}), paddings(height, width) of pooling " - "operator." + "(vector, default {0,0}), paddings(height_top, height_bottom, " + "width_left, width_right) of pooling operator." "If global_pooling = true, paddings and kernel size will be ignored.") .SetDefault({0, 0}); AddAttr( "use_cudnn", - "(bool, default false) Only used in cudnn kernel, need install cudnn") + "(bool, default false) Only used in cudnn kernel; cudnn needs to be installed.") .SetDefault(false); AddAttr( "ceil_mode", "(bool, default false) Whether to use the ceil function to calculate " "output height and width. False is the default. If it is set to False, " "the floor function will be used.") .SetDefault(false); AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel" + "(bool, default false) Only used in mkldnn kernel.") .SetDefault(false); AddAttr("use_quantizer", "(bool, default false) " @@ -229,18 +258,24 @@ "An optional string from: \"NHWC\", \"NCHW\". " "Defaults to \"NHWC\". Specify the data format of the output data, " "the input will be transformed automatically. ") - .SetDefault("AnyLayout"); + .SetDefault("NCHW"); AddAttr("is_test", "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); + AddAttr( + "padding_algorithm", + "(string, default \"EXPLICIT\") An optional string from: \"EXPLICIT\"," + "\"SAME\",\"VALID\". Set to \"EXPLICIT\" for explicit padding. " + "Set to \"SAME\" or \"VALID\" for algorithm of padding. ") + .SetDefault("EXPLICIT"); // TODO(dzhwinter): need to registered layout transform function AddComment(R"DOC( The pooling2d operation calculates the output based on the input, pooling_type and ksize, strides, paddings parameters. 
-Input(X) and output(Out) are in NCHW format, where N is batch size, C is the +Input(X) and output(Out) are in NCHW or NHWC format, where N is batch size, C is the number of channels, H is the height of the feature, and W is the width of the feature. Parameters(ksize, strides, paddings) are two elements. These two elements represent height and width, respectively. @@ -256,30 +291,47 @@ Example: Out shape: $(N, C, H_{out}, W_{out})$ + For pool_padding = "SAME": + $$ + H_{out} = \\frac{(H_{in} + strides[0] - 1)}{strides[0]} + $$ + $$ + W_{out} = \\frac{(W_{in} + strides[1] - 1)}{strides[1]} + $$ + + For pool_padding = "VALID": + $$ + H_{out} = \\frac{(H_{in} - ksize[0] + strides[0])}{strides[0]} + $$ + $$ + W_{out} = \\frac{(W_{in} - ksize[1] + strides[1])}{strides[1]} + $$ + For ceil_mode = false: $$ - H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 + H_{out} = \\frac{(H_{in} - ksize[0] + pad_height_top + pad_height_bottom)}{strides[0]} + 1 $$ $$ - W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 + W_{out} = \\frac{(W_{in} - ksize[1] + pad_width_left + pad_width_right)}{strides[1]} + 1 $$ + For ceil_mode = true: $$ - H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1 + H_{out} = \\frac{(H_{in} - ksize[0] + pad_height_top + pad_height_bottom + strides[0] - 1)}{strides[0]} + 1 $$ $$ - W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1 + W_{out} = \\frac{(W_{in} - ksize[1] + pad_width_left + pad_width_right + strides[1] - 1)}{strides[1]} + 1 $$ For exclusive = false: $$ - hstart = i * strides[0] - paddings[0] + hstart = i * strides[0] - pad_height_top $$ $$ hend = hstart + ksize[0] $$ $$ - wstart = j * strides[1] - paddings[1] + wstart = j * strides[1] - pad_width_left $$ $$ wend = wstart + ksize[1] @@ -290,13 +342,13 @@ Example: For exclusive = true: $$ - hstart = max(0, i * strides[0] - paddings[0]) + hstart = max(0, i * strides[0] - pad_height_top) $$ $$ hend = min(H, hstart + ksize[0]) $$ $$ - wstart = max(0, j * strides[1] - paddings[1]) + wstart = max(0, j * strides[1] - pad_width_left) $$ $$ wend = min(W, wstart + ksize[1]) @@ -319,13 +371,14 @@ class PoolOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { void Pool3dOpMaker::Make() { AddInput("X", "(Tensor) The input tensor of pooling operator. " - "The format of input tensor is NCDHW, where N is batch size, C is " + "The format of input tensor is NCDHW or NDHWC, where N is batch " + "size, C is " "the number of channels, and D, H and W is the depth, height and " "width of " "the feature, respectively."); AddOutput("Out", "(Tensor) The output tensor of pooling operator." - "The format of output tensor is also NCDHW, " + "The format of output tensor is also NCDHW or NDHWC, " "where N is batch size, C is " "the number of channels, and D, H and W is the depth, height and " "width of the feature, respectively."); @@ -355,8 +408,10 @@ void Pool3dOpMaker::Make() { // TypedAttrChecker don't support vector type.) AddAttr>( "paddings", - "(vector, default {0,0,0}), paddings(depth, height, " - "width) of pooling operator. " + "(vector, default {0,0,0}), paddings(pad_depth_front, " + "pad_depth_back, " + "pad_height_top, pad_height_bottom, pad_width_left, pad_width_right" + ") of pooling operator. " "If global_pooling = true, ksize and paddings will be ignored.") .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, // TypedAttrChecker don't support vector type.) 
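The output-size rules documented above can be summarized per spatial dimension; a minimal standalone Python sketch (helper names are illustrative, not part of the operator code; integer floor division mirrors the \frac formulas):

    def pool_output_size(in_size, ksize, pad_1, pad_2, stride,
                         ceil_mode=False, padding_algorithm="EXPLICIT"):
        if padding_algorithm == "SAME":
            # Output depends only on the stride; padding is derived to fit.
            return (in_size + stride - 1) // stride
        if padding_algorithm == "VALID":
            # No padding at all.
            return (in_size - ksize + stride) // stride
        # "EXPLICIT": asymmetric pad_1 (front/top/left) and pad_2 (back/bottom/right).
        if ceil_mode:
            return (in_size - ksize + pad_1 + pad_2 + stride - 1) // stride + 1
        return (in_size - ksize + pad_1 + pad_2) // stride + 1

    def same_padding(in_size, ksize, stride):
        # Mirrors UpdatePadding (pool_op.h, below) for padding_algorithm == "SAME":
        # distribute the required total padding, with the extra unit at the end.
        out_size = (in_size + stride - 1) // stride
        pad_sum = max((out_size - 1) * stride + ksize - in_size, 0)
        return pad_sum // 2, pad_sum - pad_sum // 2

    assert pool_output_size(32, 3, 1, 0, 1) == 31  # asymmetric explicit padding
    assert pool_output_size(32, 3, 0, 0, 2, padding_algorithm="SAME") == 16
    assert same_padding(32, 3, 2) == (0, 1)        # pad_top = 0, pad_bottom = 1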
@@ -376,7 +431,7 @@ void Pool3dOpMaker::Make() { AddAttr( "use_cudnn", - "(bool, default false) Only used in cudnn kernel, need install cudnn") + "(bool, default false) Only used in cudnn kernel; cudnn needs to be installed.") .SetDefault(false); AddAttr( "ceil_mode", @@ -389,11 +444,17 @@ .SetDefault(false); AddAttr( "data_format", - "(string, default NCHW) Only used in " - "An optional string from: \"NHWC\", \"NCHW\". " - "Defaults to \"NHWC\". Specify the data format of the output data, " + "(string, default NCDHW) An optional string from: " + "\"NDHWC\", \"NCDHW\". " + "Defaults to \"NCDHW\". Specify the data format of the output data, " "the input will be transformed automatically. ") - .SetDefault("AnyLayout"); + .SetDefault("NCDHW"); + AddAttr( + "padding_algorithm", + "(string, default \"EXPLICIT\") An optional string from: \"EXPLICIT\"," + "\"SAME\",\"VALID\". Set to \"EXPLICIT\" for explicit padding. " + "Set to \"SAME\" or \"VALID\" for algorithm of padding. ") + .SetDefault("EXPLICIT"); // TODO(dzhwinter): need to registered layout transform function AddComment(R"DOC( Pool3d Operator. The pooling3d operation calculates the output based on the input, pooling_type, ksize, strides, and paddings parameters. -Input(X) and output(Out) are in NCDHW format, where N is batch +Input(X) and output(Out) are in NCDHW or NDHWC format, where N is batch size, C is the number of channels, and D, H and W are the depth, height and width of the feature, respectively. Parameters(ksize, strides, paddings) are three elements. These three elements represent depth, height and @@ -412,42 +473,65 @@ Example: X shape: $(N, C, D_{in}, H_{in}, W_{in})$ Output: Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ + + For pool_padding = "SAME": + $$ + D_{out} = \\frac{(D_{in} + strides[0] - 1)}{strides[0]} + $$ + $$ + H_{out} = \\frac{(H_{in} + strides[1] - 1)}{strides[1]} + $$ + $$ + W_{out} = \\frac{(W_{in} + strides[2] - 1)}{strides[2]} + $$ + + For pool_padding = "VALID": + $$ + D_{out} = \\frac{(D_{in} - ksize[0] + strides[0])}{strides[0]} + $$ + $$ + H_{out} = \\frac{(H_{in} - ksize[1] + strides[1])}{strides[1]} + $$ + $$ + W_{out} = \\frac{(W_{in} - ksize[2] + strides[2])}{strides[2]} + $$ + For ceil_mode = false: $$ - D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 + D_{out} = \\frac{(D_{in} - ksize[0] + pad_depth_front + pad_depth_back)}{strides[0]} + 1 $$ $$ - H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[2]} + 1 + H_{out} = \\frac{(H_{in} - ksize[1] + pad_height_top + pad_height_bottom)}{strides[1]} + 1 $$ $$ - W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 + W_{out} = \\frac{(W_{in} - ksize[2] + pad_width_left + pad_width_right)}{strides[2]} + 1 $$ For ceil_mode = true: $$ - D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1 + D_{out} = \\frac{(D_{in} - ksize[0] + pad_depth_front + pad_depth_back + strides[0] -1)}{strides[0]} + 1 $$ $$ - H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 + H_{out} = \\frac{(H_{in} - ksize[1] + pad_height_top + pad_height_bottom + strides[1] -1)}{strides[1]} + 1 $$ $$ - W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1 + W_{out} = \\frac{(W_{in} - ksize[2] + pad_width_left + pad_width_right + strides[2] -1)}{strides[2]} + 1 $$ For exclusive = false: $$ - dstart = i * strides[0] - paddings[0] + dstart = i * strides[0] - pad_depth_front $$ $$ 
dend = dstart + ksize[0] $$ $$ - hstart = j * strides[1] - paddings[1] + hstart = j * strides[1] - pad_height_top $$ $$ hend = hstart + ksize[1] $$ $$ - wstart = k * strides[2] - paddings[2] + wstart = k * strides[2] - pad_width_left $$ $$ wend = wstart + ksize[2] @@ -458,16 +542,19 @@ Example: For exclusive = true: $$ - dstart = max(0, i * strides[0] - paddings[0]) + dstart = max(0, i * strides[0] - pad_depth_front) $$ $$ dend = min(D, dstart + ksize[0]) $$ $$ + hstart = max(0, j * strides[1] - pad_height_top) + $$ + $$ hend = min(H, hstart + ksize[1]) $$ $$ - wstart = max(0, k * strides[2] - paddings[2]) + wstart = max(0, k * strides[2] - pad_width_left) $$ $$ wend = min(W, wstart + ksize[2]) diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index 6c5900bd0f5..d7c6c6230a8 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -14,13 +14,13 @@ limitations under the License. */ #pragma once +#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/pooling.h" - namespace paddle { namespace operators { @@ -57,6 +57,57 @@ class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override; }; +inline void UpdatePadding(std::vector* paddings, const bool global_pooling, + const bool adaptive, + const std::string padding_algorithm, + const framework::DDim data_dims, + const std::vector& strides, + const std::vector& ksize) { + // set padding size == data_dims.size() * 2 + auto data_shape = framework::vectorize(data_dims); + if (paddings->size() == data_dims.size()) { + for (size_t i = 0; i < data_dims.size(); ++i) { + int copy_pad = *(paddings->begin() + 2 * i); + paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); + } + } else { + PADDLE_ENFORCE_EQ( + data_dims.size() * 2, paddings->size(), + "Paddings size should be the same or twice as the pooling size."); + } + + // when padding_algorithm is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (size_t i = 0; i < data_dims.size(); ++i) { + int out_size = (data_dims[i] + strides[i] - 1) / strides[i]; + int pad_sum = + std::max((out_size - 1) * strides[i] + ksize[i] - data_shape[i], 0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + } + } else if (padding_algorithm == "VALID") { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } + + // if global_pooling == true or adaptive == true, padding will be ignored + if (global_pooling || adaptive) { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } +} + +inline void UpdateKsize(std::vector* ksize, + const framework::DDim data_dims) { + ksize->resize(static_cast(data_dims.size())); + for (size_t i = 0; i < ksize->size(); ++i) { + *(ksize->begin() + i) = static_cast(data_dims[i]); + } +} template class PoolKernel : public framework::OpKernel { @@ -69,14 +120,36 @@ class PoolKernel : public framework::OpKernel { std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + std::string data_format = context.Attr("data_format"); bool exclusive = context.Attr("exclusive"); bool adaptive = context.Attr("adaptive"); - if (context.Attr("global_pooling")) { - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; 
- ksize[i] = static_cast(in_x->dims()[i + 2]); + bool global_pooling = context.Attr("global_pooling"); + std::string padding_algorithm = + context.Attr("padding_algorithm"); + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // update paddings + auto in_x_dims = in_x->dims(); + framework::DDim data_dims; + if (channel_last) { + data_dims = framework::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); + } else { + data_dims = framework::slice_ddim(in_x_dims, 2, in_x_dims.size()); + } + + UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, + data_dims, strides, ksize); + if (data_dims.size() * 2 == paddings.size()) { + for (size_t i = 0; i < data_dims.size(); ++i) { + paddings.erase(paddings.begin() + i + 1); } } + + if (global_pooling) { + UpdateKsize(&ksize, data_dims); + } + auto& dev_ctx = context.template device_context(); switch (ksize.size()) { case 2: { @@ -85,16 +158,16 @@ class PoolKernel : public framework::OpKernel { DeviceContext, paddle::operators::math::MaxPool, T> pool2d_forward; paddle::operators::math::MaxPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - true, false, out); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, + pool_process, true, false, out); } else if (pooling_type == "avg") { paddle::operators::math::Pool2dFunctor< DeviceContext, paddle::operators::math::AvgPool, T> pool2d_forward; paddle::operators::math::AvgPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - exclusive, adaptive, out); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, + pool_process, exclusive, adaptive, out); } } break; case 3: { @@ -103,15 +176,16 @@ class PoolKernel : public framework::OpKernel { DeviceContext, paddle::operators::math::MaxPool, T> pool3d_forward; paddle::operators::math::MaxPool pool_process; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - true, false, out); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, + pool_process, true, false, out); + } else if (pooling_type == "avg") { paddle::operators::math::Pool3dFunctor< DeviceContext, paddle::operators::math::AvgPool, T> pool3d_forward; paddle::operators::math::AvgPool pool_process; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - exclusive, adaptive, out); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, + pool_process, exclusive, adaptive, out); } } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } @@ -135,13 +209,33 @@ class PoolGradKernel : public framework::OpKernel { std::vector paddings = context.Attr>("paddings"); bool exclusive = context.Attr("exclusive"); bool adaptive = context.Attr("adaptive"); + std::string data_format = context.Attr("data_format"); + bool global_pooling = context.Attr("global_pooling"); + std::string padding_algorithm = + context.Attr("padding_algorithm"); + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - if (context.Attr("global_pooling")) { - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x->dims()[i + 2]); + // update paddings + auto in_x_dims = in_x->dims(); + framework::DDim data_dims; + if (channel_last) { + data_dims = framework::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); + } else { + data_dims = framework::slice_ddim(in_x_dims, 2, in_x_dims.size()); + } + UpdatePadding(&paddings, global_pooling, adaptive, 
padding_algorithm, + data_dims, strides, ksize); + if (data_dims.size() * 2 == paddings.size()) { + for (size_t i = 0; i < data_dims.size(); ++i) { + paddings.erase(paddings.begin() + i + 1); } } + + if (global_pooling) { + UpdateKsize(&ksize, data_dims); + } + auto& dev_ctx = context.template device_context(); if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); @@ -154,15 +248,15 @@ class PoolGradKernel : public framework::OpKernel { paddle::operators::math::MaxPool2dGradFunctor pool2d_backward; pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, in_x_grad); + paddings, data_format, in_x_grad); } else if (pooling_type == "avg") { paddle::operators::math::Pool2dGradFunctor< DeviceContext, paddle::operators::math::AvgPoolGrad, T> pool2d_backward; paddle::operators::math::AvgPoolGrad pool_process; pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, pool_process, exclusive, adaptive, - in_x_grad); + paddings, data_format, pool_process, exclusive, + adaptive, in_x_grad); } } break; case 3: { @@ -170,15 +264,15 @@ class PoolGradKernel : public framework::OpKernel { paddle::operators::math::MaxPool3dGradFunctor pool3d_backward; pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, in_x_grad); + paddings, data_format, in_x_grad); } else if (pooling_type == "avg") { paddle::operators::math::Pool3dGradFunctor< DeviceContext, paddle::operators::math::AvgPoolGrad, T> pool3d_backward; paddle::operators::math::AvgPoolGrad pool_process; pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, pool_process, exclusive, adaptive, - in_x_grad); + paddings, data_format, pool_process, exclusive, + adaptive, in_x_grad); } } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 8c124e71583..eb312895cf1 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -72,6 +72,7 @@ enum class DataLayout { // Not use kNHWC, kNCHW, kNCDHW, + kNDHWC, // add, liyamei kNCHW_VECT_C, }; @@ -212,6 +213,8 @@ inline cudnnTensorFormat_t GetCudnnTensorFormat( return CUDNN_TENSOR_NCHW; case DataLayout::kNCDHW: return CUDNN_TENSOR_NCHW; // NOTE: cudnn treat NdTensor as the same + case DataLayout::kNDHWC: + return CUDNN_TENSOR_NHWC; // add, liyamei default: PADDLE_THROW("Unknown cudnn equivalent for order"); } @@ -238,14 +241,31 @@ class ScopedTensorDescriptor { strides[i] = dims[i + 1] * strides[i + 1]; } // Update tensor descriptor dims setting if groups > 1 - // NOTE: Assume using NCHW or NCDHW order - std::vector dims_with_group(dims.begin(), dims.end()); // copy + // NOTE: Here, Assume using NCHW or NCDHW order + std::vector dims_with_group(dims.begin(), dims.end()); if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor( - desc_, type, dims_with_group.size(), dims_with_group.data(), - strides.data())); + + if (dims.size() == 4) { + if (format == CUDNN_TENSOR_NCHW) { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor( + desc_, type, dims_with_group.size(), dims_with_group.data(), + strides.data())); + } else { // CUDNN_TENSOR_NHWC + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensor4dDescriptor( + desc_, format, type, dims[0], dims[3], dims[1], dims[2])); + } + } else if (dims.size() == 5) { + if (format == CUDNN_TENSOR_NCHW) { + 
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor( + desc_, type, dims_with_group.size(), dims_with_group.data(), + strides.data())); + } else { // CUDNN_TENSOR_NHWC + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptorEx( + desc_, format, type, dims.size(), dims.data())); + } + } return desc_; } diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 67e2a18dd37..0f743801582 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -126,7 +126,8 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(cudnnRNNBackwardWeights); \ __macro(cudnnRNNForwardInference); \ __macro(cudnnDestroyDropoutDescriptor); \ - __macro(cudnnDestroyRNNDescriptor); + __macro(cudnnDestroyRNNDescriptor); \ + __macro(cudnnSetTensorNdDescriptorEx); CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 0632ad9ad2c..2cdfbf482a1 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2871,15 +2871,16 @@ def pool2d(input, use_cudnn=True, ceil_mode=False, name=None, - exclusive=True): + exclusive=True, + data_format="NCHW"): """ ${comment} Args: input (Variable): The input tensor of pooling operator. The format of - input tensor is NCHW, where N is batch size, C is - the number of channels, H is the height of the - feature, and W is the width of the feature. + input tensor is `"NCHW"` or `"NHWC"`, where `N` is batch size, `C` is + the number of channels, `H` is the height of the + feature, and `W` is the width of the feature. pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two integers, (pool_size_Height, pool_size_Width). Otherwise, the pool kernel size will be a square of an int. @@ -2887,8 +2888,13 @@ def pool2d(input, pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, it must contain two integers, (pool_stride_Height, pool_stride_Width). Otherwise, the pool stride size will be a square of an int. - pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple, - it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width). + pool_padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, + it could be in three forms: `[pad_height, pad_width]` or + `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`, + `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. + When `data_format` is `"NHWC"`, `pool_padding` can be in the form + `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Otherwise, the pool padding size will be a square of an int. global_pooling (bool): ${global_pooling_comment} use_cudnn (bool): ${use_cudnn_comment} @@ -2896,55 +2902,125 @@ def pool2d(input, name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. exclusive (bool): Whether to exclude padding points in average pooling - mode, default is true + mode, default is `true`. + data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`. + The default is `"NCHW"`. 
When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. Returns: Variable: The pooling result. Raises: - ValueError: If 'pool_type' is not "max" nor "avg" - ValueError: If 'global_pooling' is False and 'pool_size' is -1 - ValueError: If 'use_cudnn' is not a bool value. + ValueError: If `pool_type` is not "max" nor "avg" + ValueError: If `global_pooling` is False and `pool_size` is -1 + ValueError: If `use_cudnn` is not a bool value. Examples: .. code-block:: python import paddle.fluid as fluid + data = fluid.layers.data( - name='data', shape=[3, 32, 32], dtype='float32') - pool2d = fluid.layers.pool2d( - input=data, - pool_size=2, - pool_type='max', - pool_stride=1, - global_pooling=False) + name='data', shape=[10, 3, 32, 32], append_batch_size=False, dtype='float32') + + # example 1: + # Attr(pool_padding) is a list with 4 elements, Attr(data_format) is "NCHW". + out_1 = fluid.layers.pool2d( + input = data, + pool_size = 3, + pool_type = "avg", + pool_stride = 1, + pool_padding = [1, 2, 1, 0], + data_format = "NCHW") + + # example 2: + # Attr(pool_padding) is a string, Attr(data_format) is "NCHW". + out_2 = fluid.layers.pool2d( + input = data, + pool_size = 3, + pool_type = "avg", + pool_stride = 1, + pool_padding = "VALID", + data_format = "NCHW") """ if pool_type not in ["max", "avg"]: raise ValueError( - "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", + "Unknown Attr(pool_type): '%s'. It can only be 'max' or 'avg'.", str(pool_type)) if global_pooling is False and pool_size == -1: raise ValueError( - "When the global_pooling is False, pool_size must be passed " - "and be a valid value. Received pool_size: " + str(pool_size)) + "When Attr(global_pooling) is False, Attr(pool_size) must be passed " + "and be a valid value. Received pool_size: %s." % str(pool_size)) + + if not isinstance(use_cudnn, bool): + raise ValueError("Attr(use_cudnn) should be True or False. Received " + "Attr(use_cudnn): %s." % str(use_cudnn)) + + if data_format not in ["NCHW", "NHWC"]: + raise ValueError( + "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " + "Attr(data_format): %s." % str(data_format)) pool_size = utils.convert_to_list(pool_size, 2, 'pool_size') - pool_padding = utils.convert_to_list(pool_padding, 2, 'pool_padding') pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride') - if not isinstance(use_cudnn, bool): - raise ValueError("use_cudnn should be True or False") + def update_padding(padding, data_format): + def is_list_or_tuple(ele): + if isinstance(ele, list) or isinstance(ele, tuple): + return True + return False + + if is_list_or_tuple(padding) and len(padding) == 4: + if is_list_or_tuple(padding[0]) and (data_format == "NCHW"): + if not (padding[0] == [0, 0] and padding[1] == [0, 0]): + raise ValueError( + "Non-zero pool_padding(%s) in the batch or channel dimensions " + "is not supported." % str(padding)) + padding = padding[2:4] + padding = [ele for a_list in padding for ele in a_list] + elif is_list_or_tuple(padding[0]) and (data_format == "NHWC"): + if not (padding[0] == [0, 0] and padding[3] == [0, 0]): + raise ValueError( + "Non-zero pool_padding(%s) in the batch or channel dimensions " + "is not supported." 
% str(padding)) + padding = padding[1:3] + padding = [ele for a_list in padding for ele in a_list] + padding = utils.convert_to_list(padding, 4, 'padding') - l_type = 'pool2d' + else: + padding = utils.convert_to_list(padding, 2, 'padding') - helper = LayerHelper(l_type, **locals()) + return padding + + padding_algorithm = "EXPLICIT" + if isinstance(pool_padding, str): + pool_padding = pool_padding.upper() + if pool_padding not in ["SAME", "VALID"]: + raise ValueError( + "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'." + % str(pool_padding)) + if pool_padding == "VALID": + padding_algorithm = "VALID" + pool_padding = [0, 0, 0, 0] + if ceil_mode != False: + raise ValueError( + "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode) must be False. " + "Received ceil_mode: True.") + elif pool_padding == "SAME": + padding_algorithm = "SAME" + pool_padding = [0, 0, 0, 0] + + pool_padding = update_padding(pool_padding, data_format) + + op_type = 'pool2d' + helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) helper.append_op( - type=l_type, + type=op_type, inputs={"X": input}, outputs={"Out": pool_out}, attrs={ @@ -2953,10 +3029,12 @@ def pool2d(input, "global_pooling": global_pooling, "strides": pool_stride, "paddings": pool_padding, + "padding_algorithm": padding_algorithm, "use_cudnn": use_cudnn, "ceil_mode": ceil_mode, "use_mkldnn": False, "exclusive": exclusive, + "data_format": data_format, }) return pool_out @@ -2972,30 +3050,43 @@ def pool3d(input, use_cudnn=True, ceil_mode=False, name=None, - exclusive=True): + exclusive=True, + data_format="NCDHW"): """ ${comment} Args: input (Variable): The input tensor of pooling operator. The format of - input tensor is NCDHW, where N is batch size, C is - the number of channels, D is the depth of the feature, - H is the height of the feature, and W is the width + input tensor is `"NCDHW"` or `"NDHWC"`, where `N` is batch size, `C` is + the number of channels, `D` is the depth of the feature, + `H` is the height of the feature, and `W` is the width of the feature. pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three integers, (pool_size_Depth, pool_size_Height, pool_size_Width). Otherwise, the pool kernel size will be the cube of an int. pool_type (string): ${pooling_type_comment} - pool_stride (int): stride of the pooling layer. - pool_padding (int): padding size. + pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`. + Otherwise, the pool stride size will be a cube of an int. + pool_padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. If pool padding size is a tuple or list, + it could be in three forms: `[pad_depth, pad_height, pad_width]` or + `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, + and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form + `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. + When `data_format` is `"NDHWC"`, `pool_padding` can be in the form + `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. 
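            For example (an illustrative editorial sketch, not part of the original
            patch), with `data_format` `"NCDHW"` the following three settings request
            the same explicit padding:

            .. code-block:: python

                # symmetric form: one pad per spatial dim (depth, height, width)
                pool_padding = [1, 2, 0]
                # explicit per-side form: front/back, top/bottom, left/right
                pool_padding = [1, 1, 2, 2, 0, 0]
                # nested form: the batch and channel entries must stay [0, 0]
                pool_padding = [[0, 0], [0, 0], [1, 1], [2, 2], [0, 0]]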
global_pooling (bool): ${global_pooling_comment} use_cudnn (bool): ${use_cudnn_comment} ceil_mode (bool): ${ceil_mode_comment} name (str): A name for this layer(optional). If set None, the layer will be named automatically. exclusive (bool): Whether to exclude padding points in average pooling - mode, default is true + mode, default is true. + data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. + The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. Returns: Variable: output of pool3d layer. @@ -3005,39 +3096,114 @@ def pool3d(input, .. code-block:: python import paddle.fluid as fluid + data = fluid.layers.data( - name='data', shape=[3, 32, 32, 32], dtype='float32') - pool3d = fluid.layers.pool3d( - input=data, - pool_size=2, - pool_type='max', - pool_stride=1, - global_pooling=False) + name='data', shape=[10, 3, 32, 32, 32], append_batch_size=False, dtype='float32') + + # example 1: + # Attr(pool_padding) is a list with 6 elements, Attr(data_format) is "NCDHW". + out_1 = fluid.layers.pool3d( + input = data, + pool_size = 2, + pool_type = "avg", + pool_stride = 1, + pool_padding = [1, 2, 1, 0, 1, 2], + global_pooling = False, + data_format = "NCDHW") + + # example 2: + # Attr(pool_padding) is a string, Attr(data_format) is "NCDHW". + out_2 = fluid.layers.pool3d( + input = data, + pool_size = 3, + pool_type = "avg", + pool_stride = 1, + pool_padding = "VALID", + global_pooling = False, + data_format = "NCDHW") + """ if pool_type not in ["max", "avg"]: raise ValueError( - "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", + "Unknown Attr(pool_type): '%s'. It can only be 'max' or 'avg'.", str(pool_type)) if global_pooling is False and pool_size == -1: raise ValueError( - "When the global_pooling is False, pool_size must be passed " - "and be a valid value. Received pool_size: " + str(pool_size)) + "When Attr(global_pooling) is False, Attr(pool_size) must be passed " + "and be a valid value. Received Attr(pool_size): %s." % + str(pool_size)) + + if not isinstance(use_cudnn, bool): + raise ValueError("Attr(use_cudnn) should be True or False. Received " + "Attr(use_cudnn): %s. " % str(use_cudnn)) + + if data_format not in ["NCDHW", "NDHWC"]: + raise ValueError( + "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received " + "Attr(data_format): %s" % str(data_format)) pool_size = utils.convert_to_list(pool_size, 3, 'pool_size') - pool_padding = utils.convert_to_list(pool_padding, 3, 'pool_padding') pool_stride = utils.convert_to_list(pool_stride, 3, 'pool_stride') - if not isinstance(use_cudnn, bool): - raise ValueError("use_cudnn should be True or False") + def update_padding(padding, data_format): + def is_list_or_tuple(ele): + if isinstance(ele, (list, tuple)): + return True + return False + + if is_list_or_tuple(padding) and len(padding) == 5: + if is_list_or_tuple(padding[0]) and (data_format == "NCDHW"): + if not (padding[0] == [0, 0] and padding[1] == [0, 0]): + raise ValueError( + "Non-zero pool_padding(%s) in the batch or channel dimensions " + "is not supported." % str(padding)) + padding = padding[2:5] + padding = [ele for a_list in padding for ele in a_list] + elif is_list_or_tuple(padding[0]) and (data_format == "NDHWC"): + if not (padding[0] == [0, 0] and padding[4] == [0, 0]): + raise ValueError( + "Non-zero pool_padding(%s) in the batch or channel dimensions " + "is not supported." 
% str(padding)) + padding = padding[1:4] + padding = [ele for a_list in padding for ele in a_list] + padding = utils.convert_to_list(padding, 6, 'padding') + + elif is_list_or_tuple(padding) and len(padding) == 6: + padding = utils.convert_to_list(padding, 6, 'padding') - l_type = "pool3d" - helper = LayerHelper(l_type, **locals()) + else: + padding = utils.convert_to_list(padding, 3, 'padding') + + return padding + + padding_algorithm = "EXPLICIT" + if isinstance(pool_padding, str): + pool_padding = pool_padding.upper() + if pool_padding not in ["SAME", "VALID"]: + raise ValueError( + "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'." + % str(pool_padding)) + if pool_padding == "VALID": + padding_algorithm = "VALID" + pool_padding = [0, 0, 0, 0, 0, 0] + if ceil_mode != False: + raise ValueError( + "When Attr(pool_padding) is \"VALID\", ceil_mode must be False. " + "Received ceil_mode: True.") + elif pool_padding == "SAME": + padding_algorithm = "SAME" + pool_padding = [0, 0, 0, 0, 0, 0] + + pool_padding = update_padding(pool_padding, data_format) + + op_type = "pool3d" + helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) helper.append_op( - type=l_type, + type=op_type, inputs={"X": input}, outputs={"Out": pool_out}, attrs={ @@ -3046,10 +3212,12 @@ def pool3d(input, "global_pooling": global_pooling, "strides": pool_stride, "paddings": pool_padding, + "padding_algorithm": padding_algorithm, "use_cudnn": use_cudnn, "ceil_mode": ceil_mode, "use_mkldnn": False, "exclusive": exclusive, + "data_format": data_format, }) return pool_out diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index 9d501b70942..b2f09b6a135 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -20,6 +20,7 @@ import numpy as np import paddle.fluid.core as core from op_test import OpTest +import paddle.fluid as fluid def adaptive_start_index(index, input_size, output_size): @@ -107,7 +108,7 @@ def avg_pool2D_forward_naive(x, x_masked = x[:, :, r_start:r_end, c_start:c_end] field_size = ((r_end - r_start) * (c_end - c_start)) \ - if (exclusive or adaptive) else (ksize[0] * ksize[1]) + if (exclusive or adaptive) else (ksize[0] * ksize[1]) if data_type == np.int8 or data_type == np.uint8: out[:, :, i, j] = (np.rint( np.sum(x_masked, axis=(2, 3)) / @@ -118,26 +119,139 @@ def avg_pool2D_forward_naive(x, return out +def pool2D_forward_naive(x, + ksize, + strides, + paddings, + global_pool=0, + ceil_mode=False, + exclusive=True, + adaptive=False, + data_format='NCHW', + pool_type="max", + padding_algorithm="EXPLICIT"): + + # update paddings + def _get_padding_with_SAME(input_shape, pool_size, pool_stride): + padding = [] + for input_size, filter_size, stride_size in zip(input_shape, pool_size, + pool_stride): + out_size = int((input_size + stride_size - 1) / stride_size) + pad_sum = np.max(( + (out_size - 1) * stride_size + filter_size - input_size, 0)) + pad_0 = int(pad_sum / 2) + pad_1 = int(pad_sum - pad_0) + padding.append(pad_0) + padding.append(pad_1) + return padding + + if isinstance(padding_algorithm, str): + padding_algorithm = padding_algorithm.upper() + if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: + raise ValueError("Unknown Attr(padding_algorithm): '%s'. " + "It can only be 'SAME' or 'VALID'." 
% + str(padding_algorithm)) + + if padding_algorithm == "VALID": + paddings = [0, 0, 0, 0] + if ceil_mode != False: + raise ValueError( + "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode)" + " must be False. " + "Received ceil_mode: True.") + elif padding_algorithm == "SAME": + input_data_shape = [] + if data_format == "NCHW": + input_data_shape = x.shape[2:4] + elif data_format == "NHWC": + input_data_shape = x.shape[1:3] + paddings = _get_padding_with_SAME(input_data_shape, ksize, strides) + + assert len(paddings) == 2 or len(paddings) == 4 + is_sys = True if len(paddings) == 2 else False + + N = x.shape[0] + C, H, W = [x.shape[1], x.shape[2], x.shape[3]] if data_format == 'NCHW' \ + else [x.shape[3], x.shape[1], x.shape[2]] + + if global_pool == 1: + ksize = [H, W] + paddings = [0 for _ in range(len(paddings))] + + pad_h_up = paddings[0] if is_sys else paddings[0] + pad_h_down = paddings[0] if is_sys else paddings[1] + pad_w_left = paddings[1] if is_sys else paddings[2] + pad_w_right = paddings[1] if is_sys else paddings[3] + + if adaptive: + H_out, W_out = ksize + else: + H_out = (H - ksize[0] + pad_h_up + pad_h_down + strides[0] - 1) // strides[0] + 1 \ + if ceil_mode else (H - ksize[0] + pad_h_up + pad_h_down) // strides[0] + 1 + W_out = (W - ksize[1] + pad_w_left + pad_w_right + strides[1] - 1) // strides[1] + 1 \ + if ceil_mode else (W - ksize[1] + pad_w_left + pad_w_right) // strides[1] + 1 + + out = np.zeros((N, C, H_out, W_out)) if data_format=='NCHW' \ + else np.zeros((N, H_out, W_out, C)) + for i in range(H_out): + if adaptive: + in_h_start = adaptive_start_index(i, H, ksize[0]) + in_h_end = adaptive_end_index(i, H, ksize[0]) + else: + in_h_start = np.max((i * strides[0] - pad_h_up, 0)) + in_h_end = np.min((i * strides[0] + ksize[0] - pad_h_up, H)) + + for j in range(W_out): + if adaptive: + in_w_start = adaptive_start_index(j, W, ksize[1]) + in_w_end = adaptive_end_index(j, W, ksize[1]) + else: + in_w_start = np.max((j * strides[1] - pad_w_left, 0)) + in_w_end = np.min((j * strides[1] + ksize[1] - pad_w_left, W)) + + if data_format == 'NCHW': + x_masked = x[:, :, in_h_start:in_h_end, in_w_start:in_w_end] + if pool_type == 'avg': + field_size = ((in_h_end - in_h_start) * (in_w_end - in_w_start)) \ + if (exclusive or adaptive) else (ksize[0] * ksize[1]) + out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size + elif pool_type == 'max': + out[:, :, i, j] = np.max(x_masked, axis=(2, 3)) + elif data_format == 'NHWC': + x_masked = x[:, in_h_start:in_h_end, in_w_start:in_w_end, :] + if pool_type == 'avg': + field_size = ((in_h_end - in_h_start) * (in_w_end - in_w_start)) \ + if (exclusive or adaptive) else (ksize[0] * ksize[1]) + out[:, i, j, :] = np.sum(x_masked, axis=(1, 2)) / field_size + elif pool_type == 'max': + out[:, i, j, :] = np.max(x_masked, axis=(1, 2)) + return out + + class TestPool2D_Op(OpTest): def setUp(self): self.op_type = "pool2d" self.use_cudnn = False + self.init_kernel_type() self.use_mkldnn = False self.init_data_type() self.init_test_case() + self.padding_algorithm = "EXPLICIT" + self.init_paddings() self.init_global_pool() self.init_kernel_type() self.init_pool_type() self.init_ceil_mode() self.init_exclusive() self.init_adaptive() - if self.global_pool: - self.paddings = [0 for _ in range(len(self.paddings))] + self.init_data_format() + self.init_shape() + input = np.random.random(self.shape).astype(self.dtype) - output = (self.pool2D_forward_naive( + output = pool2D_forward_naive( input, self.ksize, self.strides, self.paddings, 
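            # the remaining positional arguments mirror the operator attrs set
            # below: global_pool, ceil_mode, exclusive, adaptive, data_format,
            # pool_type and padding_algorithm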
self.global_pool, - self.ceil_mode, self.exclusive, self.adaptive, - self.dtype)).astype(self.dtype) + self.ceil_mode, self.exclusive, self.adaptive, self.data_format, + self.pool_type, self.padding_algorithm).astype(self.dtype) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)} self.attrs = { @@ -149,10 +263,10 @@ class TestPool2D_Op(OpTest): 'use_cudnn': self.use_cudnn, 'use_mkldnn': self.use_mkldnn, 'ceil_mode': self.ceil_mode, - 'data_format': - 'AnyLayout', # TODO(dzhwinter) : should be fix latter + 'data_format': self.data_format, 'exclusive': self.exclusive, - 'adaptive': self.adaptive + 'adaptive': self.adaptive, + "padding_algorithm": self.padding_algorithm, } self.outputs = {'Out': output} @@ -177,14 +291,22 @@ class TestPool2D_Op(OpTest): elif self.pool_type != "max": self.check_grad(set(['X']), 'Out', max_relative_error=0.07) - def init_test_case(self): + def init_data_format(self): + self.data_format = "NCHW" + + def init_shape(self): self.shape = [2, 3, 5, 5] + + def init_test_case(self): self.ksize = [3, 3] self.strides = [1, 1] + + def init_paddings(self): self.paddings = [0, 0] + self.padding_algorithm = "EXPLICIT" def init_kernel_type(self): - pass + self.use_cudnn = False def init_data_type(self): self.dtype = np.float32 @@ -208,9 +330,10 @@ class TestPool2D_Op(OpTest): class TestCase1(TestPool2D_Op): def init_test_case(self): - self.shape = [2, 3, 7, 7] self.ksize = [3, 3] self.strides = [1, 1] + + def init_paddings(self): self.paddings = [0, 0] def init_pool_type(self): @@ -220,12 +343,16 @@ class TestCase1(TestPool2D_Op): def init_global_pool(self): self.global_pool = False + def init_shape(self): + self.shape = [2, 3, 7, 7] + class TestCase2(TestPool2D_Op): def init_test_case(self): - self.shape = [2, 3, 7, 7] self.ksize = [3, 3] self.strides = [1, 1] + + def init_paddings(self): self.paddings = [1, 1] def init_pool_type(self): @@ -235,6 +362,9 @@ class TestCase2(TestPool2D_Op): def init_global_pool(self): self.global_pool = False + def init_shape(self): + self.shape = [2, 3, 7, 7] + class TestCase3(TestPool2D_Op): def init_pool_type(self): @@ -366,5 +496,715 @@ class TestAvgPoolAdaptive(TestCase1): self.adaptive = True +#-------test pool2d with asymmetric padding----- + + +class TestPool2D_AsyPadding(TestPool2D_Op): + def init_test_case(self): + self.ksize = [3, 3] + self.strides = [1, 1] + self.paddings = [1, 0, 1, 2] + + def init_shape(self): + self.shape = [2, 3, 5, 5] + + +class TestCase1_AsyPadding(TestCase1): + def init_test_case(self): + self.ksize = [3, 3] + self.strides = [1, 1] + self.paddings = [1, 0, 1, 0] + + def init_shape(self): + self.shape = [2, 3, 7, 7] + + +class TestCase2_AsyPadding(TestCase2): + def init_test_case(self): + self.ksize = [3, 3] + self.strides = [1, 1] + self.paddings = [1, 2, 1, 2] + + def init_shape(self): + self.shape = [2, 3, 7, 7] + + +class TestCase3_AsyPadding(TestCase3): + def init_test_case(self): + self.ksize = [3, 3] + self.strides = [1, 1] + self.paddings = [1, 0, 1, 2] + + def init_shape(self): + self.shape = [2, 3, 5, 5] + + +class TestCase4_AsyPadding(TestCase4): + def init_test_case(self): + self.ksize = [3, 3] + self.strides = [1, 1] + self.paddings = [1, 0, 1, 0] + + def init_shape(self): + self.shape = [2, 3, 7, 7] + + +class TestCase5_AsyPadding((TestCase5)): + def init_test_case(self): + self.ksize = [3, 3] + self.strides = [1, 1] + self.paddings = [2, 2, 1, 2] + + def init_shape(self): + self.shape = [2, 3, 7, 7] + + +create_test_cudnn_class(TestPool2D_AsyPadding) 
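# Editorial sketch (not part of the original patch): with ceil_mode=False the
# naive reference sizes each spatial dim as
#     out = (in - ksize + pad_begin + pad_end) // stride + 1
# For TestCase2_AsyPadding above (7x7 input, 3x3 kernel, stride 1, paddings
# [1, 2, 1, 2]) that gives (7 - 3 + 1 + 2) // 1 + 1 == 8 per dim:
_x = np.ones([2, 3, 7, 7]).astype("float32")
_y = pool2D_forward_naive(
    _x, ksize=[3, 3], strides=[1, 1], paddings=[1, 2, 1, 2], pool_type="avg")
assert _y.shape == (2, 3, 8, 8)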
+create_test_cudnn_class(TestCase1_AsyPadding) +create_test_cudnn_class(TestCase2_AsyPadding) +create_test_cudnn_class(TestCase3_AsyPadding) +create_test_cudnn_class(TestCase4_AsyPadding) +create_test_cudnn_class(TestCase5_AsyPadding) + +create_test_cudnn_fp16_class(TestPool2D_AsyPadding) +create_test_cudnn_fp16_class(TestCase1_AsyPadding, check_grad=False) +create_test_cudnn_fp16_class(TestCase2_AsyPadding) +create_test_cudnn_fp16_class(TestCase3_AsyPadding) +create_test_cudnn_fp16_class(TestCase4_AsyPadding) +create_test_cudnn_fp16_class(TestCase5_AsyPadding) + +create_test_cudnn_use_ceil_class(TestPool2D_AsyPadding) +create_test_cudnn_use_ceil_class(TestCase1_AsyPadding) + +create_test_use_ceil_class(TestCase1_AsyPadding) +create_test_use_ceil_class(TestCase2_AsyPadding) + + +class TestAvgInclude_AsyPadding(TestCase2): + def init_exclusive(self): + self.exclusive = False + + def init_test_case(self): + self.ksize = [3, 3] + self.strides = [1, 1] + self.paddings = [1, 2, 1, 2] + + def init_shape(self): + self.shape = [2, 3, 7, 7] + + +class TestCUDNNAvgInclude_AsyPadding(TestCase2): + def init_kernel_type(self): + self.use_cudnn = True + + def init_exclusive(self): + self.exclusive = False + + def init_test_case(self): + self.ksize = [3, 3] + self.strides = [1, 1] + self.paddings = [2, 1, 1, 1] + + def init_shape(self): + self.shape = [2, 3, 7, 7] + + +class TestAvgPoolAdaptive_AsyPadding(TestCase1): + def init_adaptive(self): + self.adaptive = True + + def init_test_case(self): + self.ksize = [3, 3] + self.strides = [1, 1] + self.paddings = [1, 1, 0, 2] + + def init_shape(self): + self.shape = [2, 3, 7, 7] + + +#----------- test channel_last -------------- +class TestPool2D_channel_last(TestPool2D_Op): + def init_data_format(self): + self.data_format = "NHWC" + + def init_shape(self): + self.shape = [2, 5, 5, 3] + + +class TestCase1_channel_last(TestCase1): + def init_data_format(self): + self.data_format = "NHWC" + + def init_shape(self): + self.shape = [2, 7, 7, 3] + + +class TestCase2_channel_last(TestCase2): + def init_data_format(self): + self.data_format = "NHWC" + + def init_shape(self): + self.shape = [2, 7, 7, 3] + + +class TestCase3_channel_last(TestCase3): + def init_data_format(self): + self.data_format = "NHWC" + + def init_shape(self): + self.shape = [2, 5, 5, 3] + + +class TestCase4_channel_last(TestCase4): + def init_data_format(self): + self.data_format = "NHWC" + + def init_shape(self): + self.shape = [2, 7, 7, 3] + + +class TestCase5_channel_last(TestCase5): + def init_data_format(self): + self.data_format = "NHWC" + + def init_shape(self): + self.shape = [2, 7, 7, 3] + + +create_test_cudnn_class(TestPool2D_channel_last) +create_test_cudnn_class(TestCase1_channel_last) +create_test_cudnn_class(TestCase2_channel_last) +create_test_cudnn_class(TestCase3_channel_last) +create_test_cudnn_class(TestCase4_channel_last) +create_test_cudnn_class(TestCase5_channel_last) + +create_test_cudnn_fp16_class(TestPool2D_channel_last) +create_test_cudnn_fp16_class(TestCase1_channel_last, check_grad=False) +create_test_cudnn_fp16_class(TestCase2_channel_last) +create_test_cudnn_fp16_class(TestCase3_channel_last) +create_test_cudnn_fp16_class(TestCase4_channel_last) +create_test_cudnn_fp16_class(TestCase5_channel_last) + +create_test_cudnn_use_ceil_class(TestPool2D_channel_last) +create_test_cudnn_use_ceil_class(TestCase1_channel_last) + +create_test_use_ceil_class(TestCase1_channel_last) +create_test_use_ceil_class(TestCase2_channel_last) + + +class TestCase5_Max(TestCase2): + def 
init_pool_type(self): + self.pool_type = "max" + + def test_check_grad(self): + if self.dtype == np.float16: + return + if self.has_cudnn() and self.pool_type == "max": + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, set(['X']), 'Out', max_relative_error=1.00) + elif self.pool_type == "max": + self.check_grad(set(['X']), 'Out', max_relative_error=1.00) + + +class TestCase5_channel_last_Max(TestCase5_Max): + def init_data_format(self): + self.data_format = "NHWC" + + def init_shape(self): + self.shape = [2, 7, 7, 3] + + +create_test_cudnn_class(TestCase5_Max) +create_test_cudnn_class(TestCase5_channel_last_Max) + + +class TestAvgInclude_channel_last(TestCase2_channel_last): + def init_exclusive(self): + self.exclusive = False + + +class TestCUDNNAvgInclude_channel_last(TestCase2_channel_last): + def init_kernel_type(self): + self.use_cudnn = True + + def init_exclusive(self): + self.exclusive = False + + +class TestAvgPoolAdaptive_channel_last(TestCase1_channel_last): + def init_adaptive(self): + self.adaptive = True + + +class TestPool2D_AsyPadding_channel_last(TestPool2D_AsyPadding): + def init_data_format(self): + self.data_format = "NHWC" + + def init_shape(self): + self.shape = [2, 5, 5, 3] + + +class TestCase1_AsyPadding_channel_last(TestCase1_AsyPadding): + def init_data_format(self): + self.data_format = "NHWC" + + def init_shape(self): + self.shape = [2, 7, 7, 3] + + +class TestCase2_AsyPadding_channel_last(TestCase2_AsyPadding): + def init_data_format(self): + self.data_format = "NHWC" + + def init_shape(self): + self.shape = [2, 7, 7, 3] + + +class TestCase3_AsyPadding_channel_last(TestCase3_AsyPadding): + def init_data_format(self): + self.data_format = "NHWC" + + def init_shape(self): + self.shape = [2, 5, 5, 3] + + +class TestCase4_AsyPadding_channel_last(TestCase4_AsyPadding): + def init_data_format(self): + self.data_format = "NHWC" + + def init_shape(self): + self.shape = [2, 7, 7, 3] + + +class TestCase5_AsyPadding_channel_last(TestCase5_AsyPadding): + def init_data_format(self): + self.data_format = "NHWC" + + def init_shape(self): + self.shape = [2, 7, 7, 3] + + +create_test_cudnn_class(TestPool2D_AsyPadding_channel_last) +create_test_cudnn_class(TestCase1_AsyPadding_channel_last) +create_test_cudnn_class(TestCase2_AsyPadding_channel_last) +create_test_cudnn_class(TestCase3_AsyPadding_channel_last) +create_test_cudnn_class(TestCase4_AsyPadding_channel_last) +create_test_cudnn_class(TestCase5_AsyPadding_channel_last) + +create_test_cudnn_fp16_class(TestPool2D_AsyPadding_channel_last) +create_test_cudnn_fp16_class( + TestCase1_AsyPadding_channel_last, check_grad=False) +create_test_cudnn_fp16_class(TestCase2_AsyPadding_channel_last) +create_test_cudnn_fp16_class(TestCase3_AsyPadding_channel_last) +create_test_cudnn_fp16_class(TestCase4_AsyPadding_channel_last) +create_test_cudnn_fp16_class(TestCase5_AsyPadding_channel_last) + +create_test_cudnn_use_ceil_class(TestPool2D_AsyPadding_channel_last) +create_test_cudnn_use_ceil_class(TestCase1_AsyPadding_channel_last) + +create_test_use_ceil_class(TestCase1_AsyPadding_channel_last) +create_test_use_ceil_class(TestCase2_AsyPadding_channel_last) + + +class TestAvgInclude_AsyPadding_channel_last(TestAvgInclude_AsyPadding): + def init_data_format(self): + self.data_format = "NHWC" + + def init_shape(self): + self.shape = [2, 7, 7, 3] + + +class TestCUDNNAvgInclude_AsyPadding_channel_last( + TestCUDNNAvgInclude_AsyPadding): + def init_data_format(self): + self.data_format = "NHWC" + + def init_shape(self): + 
self.shape = [2, 7, 7, 3] + + +class TestAvgPoolAdaptive_AsyPadding_channel_last( + TestAvgPoolAdaptive_AsyPadding): + def init_data_format(self): + self.data_format = "NHWC" + + def init_shape(self): + self.shape = [2, 7, 7, 3] + + +# test paddings: SAME VALID + + +def create_test_padding_SAME_class(parent): + class TestPaddingSMAECase(parent): + def init_paddings(self): + self.paddings = [0, 0] + self.padding_algorithm = "SAME" + + cls_name = "{0}_{1}".format(parent.__name__, "PaddingSAMEOp") + TestPaddingSMAECase.__name__ = cls_name + globals()[cls_name] = TestPaddingSMAECase + + +create_test_padding_SAME_class(TestPool2D_Op) +create_test_padding_SAME_class(TestCase1) +create_test_padding_SAME_class(TestCase2) +create_test_padding_SAME_class(TestCase3) +create_test_padding_SAME_class(TestCase4) +create_test_padding_SAME_class(TestCase5) + +create_test_padding_SAME_class(TestPool2D_channel_last) +create_test_padding_SAME_class(TestCase1_channel_last) +create_test_padding_SAME_class(TestCase2_channel_last) +create_test_padding_SAME_class(TestCase3_channel_last) +create_test_padding_SAME_class(TestCase4_channel_last) +create_test_padding_SAME_class(TestCase5_channel_last) + + +def create_test_cudnn_padding_SAME_class(parent): + @unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") + class TestCUDNNPaddingSMAECase(parent): + def init_kernel_type(self): + self.use_cudnn = True + + def init_paddings(self): + self.paddings = [1, 1] + self.padding_algorithm = "SAME" + + cls_name = "{0}_{1}".format(parent.__name__, "CudnnPaddingSAMEOp") + TestCUDNNPaddingSMAECase.__name__ = cls_name + globals()[cls_name] = TestCUDNNPaddingSMAECase + + +create_test_cudnn_padding_SAME_class(TestPool2D_Op) +create_test_cudnn_padding_SAME_class(TestCase1) +create_test_cudnn_padding_SAME_class(TestCase2) +create_test_cudnn_padding_SAME_class(TestCase3) +create_test_cudnn_padding_SAME_class(TestCase4) +create_test_cudnn_padding_SAME_class(TestCase5) + +create_test_cudnn_padding_SAME_class(TestPool2D_channel_last) +create_test_cudnn_padding_SAME_class(TestCase1_channel_last) +create_test_cudnn_padding_SAME_class(TestCase2_channel_last) +create_test_cudnn_padding_SAME_class(TestCase3_channel_last) +create_test_cudnn_padding_SAME_class(TestCase4_channel_last) +create_test_cudnn_padding_SAME_class(TestCase5_channel_last) + + +def create_test_padding_VALID_class(parent): + class TestPaddingVALIDCase(parent): + def init_paddings(self): + self.paddings = [1, 1] + self.padding_algorithm = "VALID" + + cls_name = "{0}_{1}".format(parent.__name__, "PaddingVALIDOp") + TestPaddingVALIDCase.__name__ = cls_name + globals()[cls_name] = TestPaddingVALIDCase + + +create_test_padding_VALID_class(TestPool2D_Op) +create_test_padding_VALID_class(TestCase1) +create_test_padding_VALID_class(TestCase2) +create_test_padding_VALID_class(TestCase3) +create_test_padding_VALID_class(TestCase4) +create_test_padding_VALID_class(TestCase5) + +create_test_padding_VALID_class(TestPool2D_channel_last) +create_test_padding_VALID_class(TestCase1_channel_last) +create_test_padding_VALID_class(TestCase2_channel_last) +create_test_padding_VALID_class(TestCase3_channel_last) +create_test_padding_VALID_class(TestCase4_channel_last) +create_test_padding_VALID_class(TestCase5_channel_last) + + +def create_test_cudnn_padding_VALID_class(parent): + @unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") + class TestCUDNNPaddingVALIDCase(parent): + def init_kernel_type(self): + self.use_cudnn = True + 
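        # Note (editorial): with Attr(padding_algorithm) "VALID" the explicit
        # paddings below are ignored (the operator resets them to zero), so
        # these generated cases exercise the algorithm switch rather than the
        # padding values themselves.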
+ def init_paddings(self): + self.paddings = [1, 1] + self.padding_algorithm = "VALID" + + cls_name = "{0}_{1}".format(parent.__name__, "CudnnPaddingVALIDOp") + TestCUDNNPaddingVALIDCase.__name__ = cls_name + globals()[cls_name] = TestCUDNNPaddingVALIDCase + + +create_test_cudnn_padding_VALID_class(TestPool2D_Op) +create_test_cudnn_padding_VALID_class(TestCase1) +create_test_cudnn_padding_VALID_class(TestCase2) +create_test_cudnn_padding_VALID_class(TestCase3) +create_test_cudnn_padding_VALID_class(TestCase4) +create_test_cudnn_padding_VALID_class(TestCase5) + +create_test_cudnn_padding_VALID_class(TestPool2D_channel_last) +create_test_cudnn_padding_VALID_class(TestCase1_channel_last) +create_test_cudnn_padding_VALID_class(TestCase2_channel_last) +create_test_cudnn_padding_VALID_class(TestCase3_channel_last) +create_test_cudnn_padding_VALID_class(TestCase4_channel_last) +create_test_cudnn_padding_VALID_class(TestCase5_channel_last) + + +# ----- test API +class TestPool2dAPI(OpTest): + def test_api(self): + x_NHWC = np.random.random([2, 5, 5, 3]).astype("float32") + x_NCHW = np.random.random([2, 3, 5, 5]).astype("float32") + + input_NHWC = fluid.layers.data( + name="input_NHWC", + shape=[2, 5, 5, 3], + append_batch_size=False, + dtype="float32") + + input_NCHW = fluid.layers.data( + name="input_NCHW", + shape=[2, 3, 5, 5], + append_batch_size=False, + dtype="float32") + + ksize = [3, 3] + out_1 = fluid.layers.pool2d( + input=input_NHWC, + pool_size=ksize, + pool_type="max", + pool_padding=[1, 1], + use_cudnn=False, + data_format="NHWC") + + out_2 = fluid.layers.pool2d( + input=input_NHWC, + pool_size=ksize, + pool_type="avg", + pool_padding=[[0, 0], [1, 1], [1, 1], [0, 0]], + use_cudnn=False, + data_format="NHWC") + + out_3 = fluid.layers.pool2d( + input=input_NCHW, + pool_size=ksize, + pool_type="avg", + pool_padding=[[0, 0], [0, 0], [1, 1], [1, 1]], + use_cudnn=False, + data_format="NCHW") + + out_4 = fluid.layers.pool2d( + input=input_NCHW, + pool_size=ksize, + pool_type="avg", + pool_padding=[1, 2, 1, 0], + use_cudnn=False, + data_format="NCHW") + # test VALID + out_5 = fluid.layers.pool2d( + input=input_NCHW, + pool_size=ksize, + pool_type="avg", + pool_padding="VALID", + use_cudnn=False, + data_format="NCHW") + + out_6 = fluid.layers.pool2d( + input=input_NHWC, + pool_size=ksize, + pool_type="max", + pool_padding="VALID", + use_cudnn=False, + data_format="NHWC") + + # test SAME + out_7 = fluid.layers.pool2d( + input=input_NCHW, + pool_size=[4, 4], + pool_type="avg", + pool_padding="SAME", + use_cudnn=False, + data_format="NCHW") + + out_8 = fluid.layers.pool2d( + input=input_NHWC, + pool_size=[4, 4], + pool_type="max", + pool_padding="SAME", + use_cudnn=False, + data_format="NHWC") + + exe = fluid.Executor(place=fluid.CPUPlace()) + [res_1, res_2, res_3, res_4, res_5, res_6, res_7, res_8] = exe.run( + fluid.default_main_program(), + feed={"input_NHWC": x_NHWC, + "input_NCHW": x_NCHW}, + fetch_list=[ + out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8 + ]) + + assert np.allclose( + res_1, + pool2D_forward_naive( + x=x_NHWC, + ksize=ksize, + pool_type="max", + strides=[1, 1], + paddings=[1, 1], + data_format="NHWC")) + + assert np.allclose( + res_2, + pool2D_forward_naive( + x=x_NHWC, + ksize=ksize, + pool_type="avg", + strides=[1, 1], + paddings=[1, 1, 1, 1], + data_format="NHWC")) + assert np.allclose( + res_3, + pool2D_forward_naive( + x=x_NCHW, + ksize=ksize, + pool_type="avg", + strides=[1, 1], + paddings=[1, 1, 1, 1], + data_format="NCHW"), + rtol=0.07, + atol=1e-05) + + 
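        # Editorial aside: for the "SAME" checks on the 5x5 inputs further
        # below (pool_size=[4, 4], stride 1), _get_padding_with_SAME computes
        # out_size = ceil(5 / 1) = 5 and pad_sum = max((5 - 1) * 1 + 4 - 5, 0) = 3,
        # split asymmetrically into pad_begin = 1 and pad_end = 2 per spatial dim.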
assert np.allclose( + res_4, + pool2D_forward_naive( + x=x_NCHW, + ksize=ksize, + pool_type="avg", + strides=[1, 1], + paddings=[1, 2, 1, 0], + data_format="NCHW"), + rtol=0.07, + atol=1e-05) + + # VALID + assert np.allclose( + res_5, + pool2D_forward_naive( + x=x_NCHW, + ksize=ksize, + pool_type="avg", + strides=[1, 1], + paddings=[10, 20], # any ele is ok + padding_algorithm="VALID", + data_format="NCHW"), + rtol=0.07, + atol=1e-05) + assert np.allclose( + res_6, + pool2D_forward_naive( + x=x_NHWC, + ksize=ksize, + pool_type="max", + strides=[1, 1], + paddings=[10, 20], + padding_algorithm="VALID", + data_format="NHWC")) + # SAME + assert np.allclose( + res_7, + pool2D_forward_naive( + x=x_NCHW, + ksize=[4, 4], + pool_type="avg", + strides=[1, 1], + paddings=[10, 20], + padding_algorithm="SAME", + data_format="NCHW"), + rtol=0.07, + atol=1e-05) + + assert np.allclose( + res_8, + pool2D_forward_naive( + x=x_NHWC, + ksize=[4, 4], + pool_type="max", + strides=[1, 1], + paddings=[10, 20], + padding_algorithm="SAME", + data_format="NHWC")) + + +class TestPool2dAPI_Error(OpTest): + def test_api(self): + input_NHWC = fluid.layers.data( + name="input_NHWC", + shape=[2, 5, 5, 3], + append_batch_size=False, + dtype="float32") + ksize = [3, 3] + + # cudnn value error + def run_1(): + out_1 = fluid.layers.pool2d( + input=input_NHWC, + pool_size=ksize, + pool_type="max", + pool_padding=[1, 1], + use_cudnn=[0], + data_format="NHWC") + + self.assertRaises(ValueError, run_1) + + # data_format value error + def run_2(): + out_2 = fluid.layers.pool2d( + input=input_NHWC, + pool_size=ksize, + pool_type="max", + pool_padding=[1, 1], + use_cudnn=False, + data_format="NHWCC") + + self.assertRaises(ValueError, run_2) + + # padding str value error + def run_3(): + out_3 = fluid.layers.pool2d( + input=input_NHWC, + pool_size=ksize, + pool_type="max", + pool_padding="VALIDSAME", + use_cudnn=False, + data_format="NHWC") + + self.assertRaises(ValueError, run_3) + + # padding str valid and ceil_mode value error + def run_4(): + out_4 = fluid.layers.pool2d( + input=input_NHWC, + pool_size=ksize, + pool_type="max", + pool_padding="VALID", + use_cudnn=False, + ceil_mode=True, + data_format="NHWC") + + self.assertRaises(ValueError, run_4) + + # padding with 8 ele. 
value error + def run_5(): + out_5 = fluid.layers.pool2d( + input=input_NHWC, + pool_size=ksize, + pool_type="max", + pool_padding=[[1, 1], [0, 0], [0, 0], [1, 1]], + use_cudnn=False, + data_format="NHWC") + + self.assertRaises(ValueError, run_5) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py index 5898c5a67ee..6bbc3981b1c 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py @@ -20,6 +20,7 @@ import numpy as np import paddle.fluid.core as core from op_test import OpTest +import paddle.fluid as fluid def adaptive_start_index(index, input_size, output_size): @@ -30,54 +31,155 @@ def adaptive_end_index(index, input_size, output_size): return int(np.ceil((index + 1) * input_size / output_size)) -def max_pool3D_forward_naive(x, - ksize, - strides, - paddings, - global_pool=0, - ceil_mode=False, - exclusive=True, - adaptive=False): - N, C, D, H, W = x.shape +def pool3D_forward_naive(x, + ksize, + strides, + paddings, + global_pool=0, + ceil_mode=False, + exclusive=True, + adaptive=False, + data_format='NCDHW', + pool_type='max', + padding_algorithm="EXPLICIT"): + # update paddings + def _get_padding_with_SAME(input_shape, pool_size, pool_stride): + padding = [] + for input_size, filter_size, stride_size in zip(input_shape, pool_size, + pool_stride): + out_size = int((input_size + stride_size - 1) / stride_size) + pad_sum = np.max(( + (out_size - 1) * stride_size + filter_size - input_size, 0)) + pad_0 = int(pad_sum / 2) + pad_1 = int(pad_sum - pad_0) + padding.append(pad_0) + padding.append(pad_1) + return padding + + if isinstance(padding_algorithm, str): + padding_algorithm = padding_algorithm.upper() + if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: + raise ValueError("Unknown Attr(padding_algorithm): '%s'. " + "It can only be 'SAME' or 'VALID'." % + str(padding_algorithm)) + + if padding_algorithm == "VALID": + paddings = [0, 0, 0, 0, 0, 0] + if ceil_mode != False: + raise ValueError( + "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode)" + " must be False. 
" + "Received ceil_mode: True.") + elif padding_algorithm == "SAME": + input_data_shape = [] + if data_format == "NCDHW": + input_data_shape = x.shape[2:5] + elif data_format == "NDHWC": + input_data_shape = x.shape[1:4] + paddings = _get_padding_with_SAME(input_data_shape, ksize, strides) + + assert len(paddings) == 3 or len(paddings) == 6 + is_sys = True if len(paddings) == 3 else False + + N = x.shape[0] + C,D, H, W = [x.shape[1], x.shape[2], x.shape[3], x.shape[4]] \ + if data_format == 'NCDHW' else [x.shape[4], x.shape[1], x.shape[2],x.shape[3]] + if global_pool == 1: ksize = [D, H, W] + paddings = [0 for _ in range(len(paddings))] + + pad_d_forth = paddings[0] if is_sys else paddings[0] + pad_d_back = paddings[0] if is_sys else paddings[1] + pad_h_up = paddings[1] if is_sys else paddings[2] + pad_h_down = paddings[1] if is_sys else paddings[3] + pad_w_left = paddings[2] if is_sys else paddings[4] + pad_w_right = paddings[2] if is_sys else paddings[5] + if adaptive: D_out, H_out, W_out = ksize else: - D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) // strides[0] + 1 if ceil_mode else ( - H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) // strides[1] + 1 if ceil_mode else ( - W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 - W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 - ) // strides[2] + 1 if ceil_mode else ( - W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 - out = np.zeros((N, C, D_out, H_out, W_out)) + + D_out = (D - ksize[0] + pad_d_forth+pad_d_back + strides[0] - 1) // strides[0] + 1 \ + if ceil_mode else (D - ksize[0] + pad_d_forth+pad_d_back) // strides[0] + 1 + + H_out = (H - ksize[1] + pad_h_up + pad_h_down + strides[1] - 1) // strides[1] + 1 \ + if ceil_mode else (H - ksize[1] + pad_h_up + pad_h_down) // strides[1] + 1 + + W_out = (W - ksize[2] + pad_w_left + pad_w_right + strides[2] - 1) // strides[2] + 1 \ + if ceil_mode else (W - ksize[2] + pad_w_left + pad_w_right) // strides[2] + 1 + + + out = np.zeros((N, C, D_out, H_out, W_out)) if data_format=='NCDHW' \ + else np.zeros((N, D_out, H_out, W_out, C)) for k in range(D_out): if adaptive: d_start = adaptive_start_index(k, D, ksize[0]) d_end = adaptive_end_index(k, D, ksize[0]) else: - d_start = np.max((k * strides[0] - paddings[0], 0)) - d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) + d_start = np.max((k * strides[0] - pad_d_forth, 0)) + d_end = np.min((k * strides[0] + ksize[0] - pad_d_forth, D)) + for i in range(H_out): if adaptive: h_start = adaptive_start_index(i, H, ksize[1]) h_end = adaptive_end_index(i, H, ksize[1]) else: - h_start = np.max((i * strides[1] - paddings[1], 0)) - h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H)) + h_start = np.max((i * strides[1] - pad_h_up, 0)) + h_end = np.min((i * strides[1] + ksize[1] - pad_h_up, H)) + for j in range(W_out): if adaptive: w_start = adaptive_start_index(j, W, ksize[2]) w_end = adaptive_end_index(j, W, ksize[2]) else: - w_start = np.max((j * strides[2] - paddings[2], 0)) - w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W)) - x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] + w_start = np.max((j * strides[2] - pad_w_left, 0)) + w_end = np.min((j * strides[2] + ksize[2] - pad_w_left, W)) + + if data_format == 'NCDHW': + x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start: + w_end] + if pool_type == 'avg': + field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \ + if (exclusive or adaptive) else ksize[0] * 
ksize[1] * ksize[2] + out[:, :, k, i, j] = np.sum(x_masked, + axis=(2, 3, 4)) / field_size + elif pool_type == 'max': + out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4)) + + elif data_format == 'NDHWC': + x_masked = x[:, d_start:d_end, h_start:h_end, w_start: + w_end, :] + if pool_type == 'avg': + field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \ + if (exclusive or adaptive) else ksize[0] * ksize[1] * ksize[2] + out[:, k, i, j, :] = np.sum(x_masked, + axis=(1, 2, 3)) / field_size + elif pool_type == 'max': + out[:, k, i, j, :] = np.max(x_masked, axis=(1, 2, 3)) - out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4)) + return out + + +def max_pool3D_forward_naive(x, + ksize, + strides, + paddings, + global_pool=0, + ceil_mode=False, + exclusive=True, + adaptive=False): + out = pool3D_forward_naive( + x=x, + ksize=ksize, + strides=strides, + paddings=paddings, + global_pool=global_pool, + ceil_mode=ceil_mode, + exclusive=exclusive, + adaptive=adaptive, + data_format='NCDHW', + pool_type="max") return out @@ -89,56 +191,24 @@ def avg_pool3D_forward_naive(x, ceil_mode=False, exclusive=True, adaptive=False): - N, C, D, H, W = x.shape - if global_pool == 1: - ksize = [D, H, W] - if adaptive: - D_out, H_out, W_out = ksize - else: - D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) // strides[0] + 1 if ceil_mode else ( - H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) // strides[1] + 1 if ceil_mode else ( - W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 - W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 - ) // strides[2] + 1 if ceil_mode else ( - W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 - out = np.zeros((N, C, D_out, H_out, W_out)) - for k in range(D_out): - if adaptive: - d_start = adaptive_start_index(k, D, ksize[0]) - d_end = adaptive_end_index(k, D, ksize[0]) - else: - d_start = np.max((k * strides[0] - paddings[0], 0)) - d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) - for i in range(H_out): - if adaptive: - h_start = adaptive_start_index(i, H, ksize[1]) - h_end = adaptive_end_index(i, H, ksize[1]) - else: - h_start = np.max((i * strides[1] - paddings[1], 0)) - h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H)) - for j in range(W_out): - if adaptive: - w_start = adaptive_start_index(j, W, ksize[2]) - w_end = adaptive_end_index(j, W, ksize[2]) - else: - w_start = np.max((j * strides[2] - paddings[2], 0)) - w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W)) - x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] - - field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \ - if (exclusive or adaptive) else ksize[0] * ksize[1] * ksize[2] - out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3, - 4)) / field_size + out = pool3D_forward_naive( + x=x, + ksize=ksize, + strides=strides, + paddings=paddings, + global_pool=global_pool, + ceil_mode=ceil_mode, + exclusive=exclusive, + adaptive=adaptive, + data_format='NCDHW', + pool_type="avg") return out class TestPool3d_Op(OpTest): def setUp(self): self.op_type = "pool3d" - self.use_cudnn = False + self.init_kernel_type() self.dtype = np.float32 self.init_test_case() self.init_global_pool() @@ -147,13 +217,15 @@ class TestPool3d_Op(OpTest): self.init_ceil_mode() self.init_exclusive() self.init_adaptive() + self.init_data_format() + self.init_shape() - if self.global_pool: - self.paddings = [0 for _ in range(len(self.paddings))] input = 
np.random.random(self.shape).astype(self.dtype) - output = self.pool3D_forward_naive( + output = pool3D_forward_naive( input, self.ksize, self.strides, self.paddings, self.global_pool, - self.ceil_mode, self.exclusive, self.adaptive).astype(self.dtype) + self.ceil_mode, self.exclusive, self.adaptive, self.data_format, + self.pool_type).astype(self.dtype) + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)} self.attrs = { @@ -164,8 +236,7 @@ class TestPool3d_Op(OpTest): 'global_pooling': self.global_pool, 'use_cudnn': self.use_cudnn, 'ceil_mode': self.ceil_mode, - 'data_format': - 'AnyLayout', # TODO(dzhwinter) : should be fix latter + 'data_format': self.data_format, 'exclusive': self.exclusive, 'adaptive': self.adaptive } @@ -192,18 +263,23 @@ class TestPool3d_Op(OpTest): elif self.pool_type != "max": self.check_grad(set(['X']), 'Out', max_relative_error=0.07) - def init_test_case(self): + def init_data_format(self): + self.data_format = "NCDHW" + + def init_shape(self): self.shape = [2, 3, 5, 5, 5] + + def init_test_case(self): self.ksize = [3, 3, 3] self.strides = [1, 1, 1] self.paddings = [0, 0, 0] def init_kernel_type(self): - pass + self.use_cudnn = False + #pass def init_pool_type(self): self.pool_type = "avg" - self.pool3D_forward_naive = avg_pool3D_forward_naive def init_global_pool(self): self.global_pool = True @@ -219,30 +295,32 @@ class TestPool3d_Op(OpTest): class TestCase1(TestPool3d_Op): - def init_test_case(self): + def init_shape(self): self.shape = [2, 3, 7, 7, 7] + + def init_test_case(self): self.ksize = [3, 3, 3] self.strides = [1, 1, 1] self.paddings = [0, 0, 0] def init_pool_type(self): self.pool_type = "avg" - self.pool3D_forward_naive = avg_pool3D_forward_naive def init_global_pool(self): self.global_pool = False class TestCase2(TestPool3d_Op): - def init_test_case(self): + def init_shape(self): self.shape = [2, 3, 7, 7, 7] + + def init_test_case(self): self.ksize = [3, 3, 3] self.strides = [1, 1, 1] self.paddings = [1, 1, 1] def init_pool_type(self): self.pool_type = "avg" - self.pool3D_forward_naive = avg_pool3D_forward_naive def init_global_pool(self): self.global_pool = False @@ -251,158 +329,824 @@ class TestCase2(TestPool3d_Op): class TestCase3(TestPool3d_Op): def init_pool_type(self): self.pool_type = "max" - self.pool3D_forward_naive = max_pool3D_forward_naive class TestCase4(TestCase1): def init_pool_type(self): self.pool_type = "max" - self.pool3D_forward_naive = max_pool3D_forward_naive class TestCase5(TestCase2): def init_pool_type(self): self.pool_type = "max" - self.pool3D_forward_naive = max_pool3D_forward_naive -#--------------------test pool3d-------------------- -class TestCUDNNCase1(TestPool3d_Op): - def init_kernel_type(self): - self.use_cudnn = True +#--------------------test pool3d cudnn-------------------- -class TestFP16CUDNNCase1(TestPool3d_Op): - def init_kernel_type(self): - self.use_cudnn = True - self.dtype = np.float16 +def create_test_cudnn_class(parent): + @unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") + class TestCUDNNCase(parent): + def init_kernel_type(self): + self.use_cudnn = True - def test_check_output(self): - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - if core.is_float16_supported(place): - self.check_output_with_place(place, atol=1e-3) + cls_name = "{0}_{1}".format(parent.__name__, "CUDNNOp") + TestCUDNNCase.__name__ = cls_name + globals()[cls_name] = TestCUDNNCase -class TestCUDNNCase2(TestCase1): - def init_kernel_type(self): - self.use_cudnn = True 
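# Editorial note: the factory registers each generated case through
# globals()[cls_name] because unittest's loader only discovers TestCase
# subclasses bound as module-level attributes; a class left local to the
# factory function would never be collected or run.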
+create_test_cudnn_class(TestPool3d_Op) +create_test_cudnn_class(TestCase1) +create_test_cudnn_class(TestCase2) +create_test_cudnn_class(TestCase3) +create_test_cudnn_class(TestCase4) +create_test_cudnn_class(TestCase5) -class TestFP16CUDNNCase2(TestCase1): - def init_kernel_type(self): - self.use_cudnn = True - self.dtype = np.float16 +def create_test_cudnn_fp16_class(parent): + @unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") + class TestCUDNNFp16Case(parent): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 - def test_check_output(self): - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - if core.is_float16_supported(place): - self.check_output_with_place(place, atol=1e-3) + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) + cls_name = "{0}_{1}".format(parent.__name__, "CUDNNFp16Op") + TestCUDNNFp16Case.__name__ = cls_name + globals()[cls_name] = TestCUDNNFp16Case -class TestCUDNNCase3(TestCase2): - def init_kernel_type(self): - self.use_cudnn = True +create_test_cudnn_fp16_class(TestPool3d_Op) +create_test_cudnn_fp16_class(TestCase1) +create_test_cudnn_fp16_class(TestCase2) +create_test_cudnn_fp16_class(TestCase3) +create_test_cudnn_fp16_class(TestCase4) +create_test_cudnn_fp16_class(TestCase5) -class TestFP16CUDNNCase3(TestCase2): - def init_kernel_type(self): - self.use_cudnn = True - self.dtype = np.float16 - def test_check_output(self): - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - if core.is_float16_supported(place): - self.check_output_with_place(place, atol=1e-3) +# ---- test ceil mode ------ +def create_test_cudnn_use_ceil_class(parent): + @unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") + class TestPool3DUseCeilCase(parent): + def init_kernel_type(self): + self.use_cudnn = True + def init_ceil_mode(self): + self.ceil_mode = True -class TestCUDNNCase4(TestCase3): - def init_kernel_type(self): - self.use_cudnn = True + cls_name = "{0}_{1}".format(parent.__name__, "CUDNNOpCeilMode") + TestPool3DUseCeilCase.__name__ = cls_name + globals()[cls_name] = TestPool3DUseCeilCase -class TestFP16CUDNNCase4(TestCase3): - def init_kernel_type(self): - self.use_cudnn = True - self.dtype = np.float16 +create_test_cudnn_use_ceil_class(TestPool3d_Op) +create_test_cudnn_use_ceil_class(TestCase1) - def test_check_output(self): - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - if core.is_float16_supported(place): - self.check_output_with_place(place, atol=1e-3) +def create_test_use_ceil_class(parent): + class TestPool3DUseCeilCase(parent): + def init_ceil_mode(self): + self.ceil_mode = True -class TestCUDNNCase5(TestCase4): - def init_kernel_type(self): - self.use_cudnn = True + cls_name = "{0}_{1}".format(parent.__name__, "CeilModeCast") + TestPool3DUseCeilCase.__name__ = cls_name + globals()[cls_name] = TestPool3DUseCeilCase -class TestFP16CUDNNCase5(TestCase4): - def init_kernel_type(self): - self.use_cudnn = True - self.dtype = np.float16 +create_test_use_ceil_class(TestCase1) +create_test_use_ceil_class(TestCase2) - def test_check_output(self): - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - if core.is_float16_supported(place): - self.check_output_with_place(place, atol=1e-3) + +class TestAvgInclude(TestCase2): + def init_exclusive(self): + self.exclusive = False -class TestCUDNNCase6(TestCase5): 
+@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestCUDNNAvgInclude(TestCase2): def init_kernel_type(self): self.use_cudnn = True + def init_exclusive(self): + self.exclusive = False + + +class TestAvgPoolAdaptive(TestCase1): + def init_adaptive(self): + self.adaptive = True + + +#-------test pool3d with asymmetric padding------ + + +class TestPool3d_Op_AsyPadding(TestPool3d_Op): + def init_test_case(self): + self.ksize = [3, 3, 3] + self.strides = [1, 1, 1] + self.paddings = [0, 0, 0, 2, 3, 0] + + def init_shape(self): + self.shape = [2, 3, 5, 5, 5] + + +class TestCase1_AsyPadding(TestCase1): + def init_test_case(self): + self.ksize = [3, 3, 3] + self.strides = [1, 1, 1] + self.paddings = [1, 0, 2, 1, 2, 1] + + def init_shape(self): + self.shape = [2, 3, 7, 7, 7] + + +class TestCase2_AsyPadding(TestCase2): + def init_test_case(self): + self.ksize = [3, 3, 3] + self.strides = [1, 1, 1] + self.paddings = [1, 2, 1, 1, 1, 0] + + def init_shape(self): + self.shape = [2, 3, 7, 7, 7] + + +class TestCase3_AsyPadding(TestCase3): + def init_test_case(self): + self.ksize = [3, 3, 3] + self.strides = [1, 1, 1] + self.paddings = [1, 0, 0, 0, 1, 0] + + def init_shape(self): + self.shape = [2, 3, 5, 5, 5] -class TestFP16CUDNNCase6(TestCase5): + +class TestCase4_AsyPadding(TestCase4): + def init_test_case(self): + self.ksize = [3, 3, 3] + self.strides = [1, 1, 1] + self.paddings = [1, 0, 2, 1, 2, 1] + + def init_shape(self): + self.shape = [2, 3, 7, 7, 7] + + +class TestCase5_AsyPadding(TestCase5): + def init_test_case(self): + self.shape = [2, 7, 7, 7, 3] + self.ksize = [3, 3, 3] + self.strides = [1, 1, 1] + self.paddings = [1, 2, 1, 1, 1, 0] + + def init_shape(self): + self.shape = [2, 3, 7, 7, 7] + + +create_test_cudnn_class(TestPool3d_Op_AsyPadding) +create_test_cudnn_class(TestCase1_AsyPadding) +create_test_cudnn_class(TestCase2_AsyPadding) +create_test_cudnn_class(TestCase3_AsyPadding) +create_test_cudnn_class(TestCase4_AsyPadding) +create_test_cudnn_class(TestCase5_AsyPadding) + +create_test_cudnn_fp16_class(TestPool3d_Op_AsyPadding) +create_test_cudnn_fp16_class(TestCase1_AsyPadding) +create_test_cudnn_fp16_class(TestCase2_AsyPadding) +create_test_cudnn_fp16_class(TestCase3_AsyPadding) +create_test_cudnn_fp16_class(TestCase4_AsyPadding) +create_test_cudnn_fp16_class(TestCase5_AsyPadding) + +create_test_cudnn_use_ceil_class(TestPool3d_Op_AsyPadding) +create_test_cudnn_use_ceil_class(TestCase1_AsyPadding) + +create_test_use_ceil_class(TestCase1_AsyPadding) +create_test_use_ceil_class(TestCase2_AsyPadding) + + +class TestAvgInclude_AsyPadding(TestCase2): + def init_exclusive(self): + self.exclusive = False + + def init_test_case(self): + self.ksize = [3, 3, 3] + self.strides = [1, 1, 1] + self.paddings = [1, 2, 1, 1, 1, 0] + + def init_shape(self): + self.shape = [2, 3, 7, 7, 7] + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestCUDNNAvgInclude_AsyPadding(TestCase2): def init_kernel_type(self): self.use_cudnn = True - self.dtype = np.float16 - def test_check_output(self): - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - if core.is_float16_supported(place): - self.check_output_with_place(place, atol=1e-3) + def init_exclusive(self): + self.exclusive = False + def init_test_case(self): + self.ksize = [3, 3, 3] + self.strides = [1, 1, 1] + self.paddings = [1, 0, 0, 0, 0, 0] -class TestCeilModeCase1(TestCUDNNCase1): - def init_ceil_mode(self): - self.ceil_mode = True + def init_shape(self): 
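        # NCDHW layout: [batch, channels, depth, height, width]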
+ self.shape = [2, 3, 5, 5, 5] -class TestCeilModeCase2(TestCUDNNCase2): - def init_ceil_mode(self): - self.ceil_mode = True +class TestAvgPoolAdaptive_AsyPadding(TestCase1): + def init_adaptive(self): + self.adaptive = True + def init_test_case(self): + self.ksize = [3, 3, 3] + self.strides = [1, 1, 1] + self.paddings = [1, 0, 2, 1, 2, 1] -class TestCeilModeCase3(TestCase1): - def init_ceil_mode(self): - self.ceil_mode = True + def init_shape(self): + self.shape = [2, 3, 7, 7, 7] -class TestCeilModeCase4(TestCase2): - def init_ceil_mode(self): - self.ceil_mode = True +# ------------ test channel_last -------------- +class TestPool3d_channel_last(TestPool3d_Op): + def init_data_format(self): + self.data_format = "NDHWC" + def init_shape(self): + self.shape = [2, 5, 5, 5, 3] -class TestAvgInclude(TestCase2): + +class TestCase1_channel_last(TestCase1): + def init_data_format(self): + self.data_format = "NDHWC" + + def init_shape(self): + self.shape = [2, 7, 7, 7, 3] + + +class TestCase2_channel_last(TestCase2): + def init_data_format(self): + self.data_format = "NDHWC" + + def init_shape(self): + self.shape = [2, 7, 7, 7, 3] + + +class TestCase3_channel_last(TestCase3): + def init_data_format(self): + self.data_format = "NDHWC" + + def init_shape(self): + self.shape = [2, 5, 5, 5, 3] + + +class TestCase4_channel_last(TestCase4): + def init_data_format(self): + self.data_format = "NDHWC" + + def init_shape(self): + self.shape = [2, 7, 7, 7, 3] + + +class TestCase5_channel_last(TestCase5): + def init_data_format(self): + self.data_format = "NDHWC" + + def init_shape(self): + self.shape = [2, 7, 7, 7, 3] + + +create_test_cudnn_class(TestPool3d_channel_last) +create_test_cudnn_class(TestCase1_channel_last) +create_test_cudnn_class(TestCase2_channel_last) +create_test_cudnn_class(TestCase3_channel_last) +create_test_cudnn_class(TestCase4_channel_last) +create_test_cudnn_class(TestCase5_channel_last) + +create_test_cudnn_use_ceil_class(TestPool3d_channel_last) +create_test_cudnn_use_ceil_class(TestCase1_channel_last) + +create_test_use_ceil_class(TestCase1_channel_last) +create_test_use_ceil_class(TestCase2_channel_last) + + +class TestCase5_Max(TestCase2): + def init_pool_type(self): + self.pool_type = "max" + + def test_check_grad(self): + if self.dtype == np.float16: + return + if self.has_cudnn() and self.pool_type == "max": + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, set(['X']), 'Out', max_relative_error=1.00) + elif self.pool_type == "max": + self.check_grad(set(['X']), 'Out', max_relative_error=1.00) + + +class TestCase5_channel_last_Max(TestCase5_Max): + def init_data_format(self): + self.data_format = "NDHWC" + + def init_shape(self): + self.shape = [2, 7, 7, 7, 3] + + +create_test_cudnn_class(TestCase5_Max) +create_test_cudnn_class(TestCase5_channel_last_Max) + + +class TestAvgInclude_channel_last(TestCase2_channel_last): def init_exclusive(self): self.exclusive = False -class TestCUDNNAvgInclude(TestCUDNNCase3): +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestCUDNNAvgInclude_channel_last(TestCase2_channel_last): + def init_kernel_type(self): + self.use_cudnn = True + def init_exclusive(self): self.exclusive = False -class TestAvgPoolAdaptive(TestCase1): +class TestAvgPoolAdaptive_channel_last(TestCase1_channel_last): def init_adaptive(self): self.adaptive = True +# --- asy padding +class TestPool3d_Op_AsyPadding_channel_last(TestPool3d_Op_AsyPadding): + def init_data_format(self): + self.data_format = "NDHWC" + + 
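    # Editorial note: in the "NDHWC" variants below the channel axis moves to
    # the end, so shapes read [batch, depth, height, width, channels].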
+
+
+# ------------ test asymmetric padding with channel_last ------------
+class TestPool3d_Op_AsyPadding_channel_last(TestPool3d_Op_AsyPadding):
+    def init_data_format(self):
+        self.data_format = "NDHWC"
+
+    def init_shape(self):
+        self.shape = [2, 5, 5, 5, 3]
+
+
+class TestCase1_AsyPadding_channel_last(TestCase1_AsyPadding):
+    def init_data_format(self):
+        self.data_format = "NDHWC"
+
+    def init_shape(self):
+        self.shape = [2, 7, 7, 7, 3]
+
+
+class TestCase2_AsyPadding_channel_last(TestCase2_AsyPadding):
+    def init_data_format(self):
+        self.data_format = "NDHWC"
+
+    def init_shape(self):
+        self.shape = [2, 7, 7, 7, 3]
+
+
+class TestCase3_AsyPadding_channel_last(TestCase3_AsyPadding):
+    def init_data_format(self):
+        self.data_format = "NDHWC"
+
+    def init_shape(self):
+        self.shape = [2, 5, 5, 5, 3]
+
+
+class TestCase4_AsyPadding_channel_last(TestCase4_AsyPadding):
+    def init_data_format(self):
+        self.data_format = "NDHWC"
+
+    def init_shape(self):
+        self.shape = [2, 7, 7, 7, 3]
+
+
+class TestCase5_AsyPadding_channel_last(TestCase5_AsyPadding):
+    def init_data_format(self):
+        self.data_format = "NDHWC"
+
+    def init_shape(self):
+        self.shape = [2, 7, 7, 7, 3]
+
+
+create_test_cudnn_class(TestPool3d_Op_AsyPadding_channel_last)
+create_test_cudnn_class(TestCase1_AsyPadding_channel_last)
+create_test_cudnn_class(TestCase2_AsyPadding_channel_last)
+create_test_cudnn_class(TestCase3_AsyPadding_channel_last)
+create_test_cudnn_class(TestCase4_AsyPadding_channel_last)
+create_test_cudnn_class(TestCase5_AsyPadding_channel_last)
+
+create_test_cudnn_use_ceil_class(TestPool3d_Op_AsyPadding_channel_last)
+create_test_cudnn_use_ceil_class(TestCase1_AsyPadding_channel_last)
+
+create_test_use_ceil_class(TestCase1_AsyPadding_channel_last)
+create_test_use_ceil_class(TestCase2_AsyPadding_channel_last)
+
+
+class TestAvgInclude_AsyPadding_channel_last(TestAvgInclude_AsyPadding):
+    def init_data_format(self):
+        self.data_format = "NDHWC"
+
+    def init_shape(self):
+        self.shape = [2, 7, 7, 7, 3]
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestCUDNNAvgInclude_AsyPadding_channel_last(
+        TestCUDNNAvgInclude_AsyPadding):
+    def init_data_format(self):
+        self.data_format = "NDHWC"
+
+    def init_shape(self):
+        self.shape = [2, 5, 5, 5, 3]
+
+
+class TestAvgPoolAdaptive_AsyPadding_channel_last(
+        TestAvgPoolAdaptive_AsyPadding):
+    def init_data_format(self):
+        self.data_format = "NDHWC"
+
+    def init_shape(self):
+        self.shape = [2, 7, 7, 7, 3]
+
+
+# ------------ test padding: SAME and VALID ------------
+def create_test_padding_SAME_class(parent):
+    class TestPaddingSAMECase(parent):
+        def init_paddings(self):
+            self.paddings = [0, 0]
+            self.padding_algorithm = "SAME"
+
+    cls_name = "{0}_{1}".format(parent.__name__, "PaddingSAMEOp")
+    TestPaddingSAMECase.__name__ = cls_name
+    globals()[cls_name] = TestPaddingSAMECase
+
+
+create_test_padding_SAME_class(TestPool3d_Op)
+create_test_padding_SAME_class(TestCase1)
+create_test_padding_SAME_class(TestCase2)
+create_test_padding_SAME_class(TestCase3)
+create_test_padding_SAME_class(TestCase4)
+create_test_padding_SAME_class(TestCase5)
+
+create_test_padding_SAME_class(TestPool3d_channel_last)
+create_test_padding_SAME_class(TestCase1_channel_last)
+create_test_padding_SAME_class(TestCase2_channel_last)
+create_test_padding_SAME_class(TestCase3_channel_last)
+create_test_padding_SAME_class(TestCase4_channel_last)
+create_test_padding_SAME_class(TestCase5_channel_last)
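+
+
+# A minimal sketch of the per-dimension arithmetic behind
+# padding_algorithm="SAME" (illustrative helper only, not used by the
+# generated tests): the output size is ceil(in / stride), and the padding
+# needed to reach it is split as evenly as possible, with any odd remainder
+# going to the end.
+def _same_pad_1d(in_size, ksize, stride):
+    out_size = (in_size + stride - 1) // stride  # ceil(in_size / stride)
+    pad = max((out_size - 1) * stride + ksize - in_size, 0)
+    return pad // 2, pad - pad // 2  # (pad_begin, pad_end)
+
+
+# e.g. _same_pad_1d(5, 3, 1) == (1, 1); _same_pad_1d(5, 4, 1) == (1, 2)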
+
+
+def create_test_cudnn_padding_SAME_class(parent):
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "core is not compiled with CUDA")
+    class TestCUDNNPaddingSAMECase(parent):
+        def init_kernel_type(self):
+            self.use_cudnn = True
+
+        def init_paddings(self):
+            self.paddings = [1, 1]
+            self.padding_algorithm = "SAME"
+
+    cls_name = "{0}_{1}".format(parent.__name__, "CudnnPaddingSAMEOp")
+    TestCUDNNPaddingSAMECase.__name__ = cls_name
+    globals()[cls_name] = TestCUDNNPaddingSAMECase
+
+
+create_test_cudnn_padding_SAME_class(TestPool3d_Op)
+create_test_cudnn_padding_SAME_class(TestCase1)
+create_test_cudnn_padding_SAME_class(TestCase2)
+create_test_cudnn_padding_SAME_class(TestCase3)
+create_test_cudnn_padding_SAME_class(TestCase4)
+create_test_cudnn_padding_SAME_class(TestCase5)
+
+create_test_cudnn_padding_SAME_class(TestPool3d_channel_last)
+create_test_cudnn_padding_SAME_class(TestCase1_channel_last)
+create_test_cudnn_padding_SAME_class(TestCase2_channel_last)
+create_test_cudnn_padding_SAME_class(TestCase3_channel_last)
+create_test_cudnn_padding_SAME_class(TestCase4_channel_last)
+create_test_cudnn_padding_SAME_class(TestCase5_channel_last)
+
+
+def create_test_padding_VALID_class(parent):
+    class TestPaddingVALIDCase(parent):
+        def init_paddings(self):
+            self.paddings = [1, 1]
+            self.padding_algorithm = "VALID"
+
+    cls_name = "{0}_{1}".format(parent.__name__, "PaddingVALIDOp")
+    TestPaddingVALIDCase.__name__ = cls_name
+    globals()[cls_name] = TestPaddingVALIDCase
+
+
+create_test_padding_VALID_class(TestPool3d_Op)
+create_test_padding_VALID_class(TestCase1)
+create_test_padding_VALID_class(TestCase2)
+create_test_padding_VALID_class(TestCase3)
+create_test_padding_VALID_class(TestCase4)
+create_test_padding_VALID_class(TestCase5)
+
+create_test_padding_VALID_class(TestPool3d_channel_last)
+create_test_padding_VALID_class(TestCase1_channel_last)
+create_test_padding_VALID_class(TestCase2_channel_last)
+create_test_padding_VALID_class(TestCase3_channel_last)
+create_test_padding_VALID_class(TestCase4_channel_last)
+create_test_padding_VALID_class(TestCase5_channel_last)
+
+
+def create_test_cudnn_padding_VALID_class(parent):
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "core is not compiled with CUDA")
+    class TestCUDNNPaddingVALIDCase(parent):
+        def init_kernel_type(self):
+            self.use_cudnn = True
+
+        def init_paddings(self):
+            self.paddings = [1, 1]
+            self.padding_algorithm = "VALID"
+
+    cls_name = "{0}_{1}".format(parent.__name__, "CudnnPaddingVALIDOp")
+    TestCUDNNPaddingVALIDCase.__name__ = cls_name
+    globals()[cls_name] = TestCUDNNPaddingVALIDCase
+
+
+create_test_cudnn_padding_VALID_class(TestPool3d_Op)
+create_test_cudnn_padding_VALID_class(TestCase1)
+create_test_cudnn_padding_VALID_class(TestCase2)
+create_test_cudnn_padding_VALID_class(TestCase3)
+create_test_cudnn_padding_VALID_class(TestCase4)
+create_test_cudnn_padding_VALID_class(TestCase5)
+
+create_test_cudnn_padding_VALID_class(TestPool3d_channel_last)
+create_test_cudnn_padding_VALID_class(TestCase1_channel_last)
+create_test_cudnn_padding_VALID_class(TestCase2_channel_last)
+create_test_cudnn_padding_VALID_class(TestCase3_channel_last)
+create_test_cudnn_padding_VALID_class(TestCase4_channel_last)
+create_test_cudnn_padding_VALID_class(TestCase5_channel_last)
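+
+
+# Companion sketch for padding_algorithm="VALID" (illustrative only): the
+# paddings supplied by the generated cases are ignored and reset to zero,
+# and only windows that fit entirely inside the input contribute, so the
+# output size is ceil((in - ksize + 1) / stride).
+def _valid_out_1d(in_size, ksize, stride):
+    return (in_size - ksize + stride) // stride
+
+
+# e.g. _valid_out_1d(5, 3, 1) == 3; _valid_out_1d(7, 3, 2) == 3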
+
+
+# ------------ test API ------------
+class TestPool3dAPI(OpTest):
+    def test_api(self):
+        x_NDHWC = np.random.random([2, 5, 5, 5, 3]).astype("float32")
+        x_NCDHW = np.random.random([2, 3, 5, 5, 5]).astype("float32")
+
+        input_NDHWC = fluid.layers.data(
+            name="input_NDHWC",
+            shape=[2, 5, 5, 5, 3],
+            append_batch_size=False,
+            dtype="float32")
+
+        input_NCDHW = fluid.layers.data(
+            name="input_NCDHW",
+            shape=[2, 3, 5, 5, 5],
+            append_batch_size=False,
+            dtype="float32")
+
+        ksize = [3, 3, 3]
+        out_1 = fluid.layers.pool3d(
+            input=input_NDHWC,
+            pool_size=ksize,
+            pool_type="max",
+            pool_padding=[1, 1, 1],
+            use_cudnn=False,
+            data_format="NDHWC")
+
+        out_2 = fluid.layers.pool3d(
+            input=input_NDHWC,
+            pool_size=ksize,
+            pool_type="avg",
+            pool_padding=[[0, 0], [1, 1], [1, 1], [1, 1], [0, 0]],
+            use_cudnn=False,
+            data_format="NDHWC")
+
+        out_3 = fluid.layers.pool3d(
+            input=input_NCDHW,
+            pool_size=ksize,
+            pool_type="avg",
+            pool_padding=[[0, 0], [0, 0], [1, 1], [1, 1], [1, 1]],
+            use_cudnn=False,
+            data_format="NCDHW")
+
+        out_4 = fluid.layers.pool3d(
+            input=input_NCDHW,
+            pool_size=ksize,
+            pool_type="avg",
+            pool_padding=[1, 2, 1, 0, 0, 1],
+            use_cudnn=False,
+            data_format="NCDHW")
+
+        # test VALID
+        out_5 = fluid.layers.pool3d(
+            input=input_NDHWC,
+            pool_size=ksize,
+            pool_type="avg",
+            pool_padding="VALID",
+            use_cudnn=False,
+            data_format="NDHWC")
+
+        out_6 = fluid.layers.pool3d(
+            input=input_NCDHW,
+            pool_size=ksize,
+            pool_type="avg",
+            pool_padding="VALID",
+            use_cudnn=False,
+            data_format="NCDHW")
+
+        # test SAME
+        out_7 = fluid.layers.pool3d(
+            input=input_NDHWC,
+            pool_size=ksize,
+            pool_type="avg",
+            pool_padding="SAME",
+            use_cudnn=False,
+            data_format="NDHWC")
+
+        out_8 = fluid.layers.pool3d(
+            input=input_NCDHW,
+            pool_size=[4, 4, 4],
+            pool_type="avg",
+            pool_padding="SAME",
+            use_cudnn=False,
+            data_format="NCDHW")
+
+        exe = fluid.Executor(place=fluid.CPUPlace())
+        [res_1, res_2, res_3, res_4, res_5, res_6, res_7, res_8] = exe.run(
+            fluid.default_main_program(),
+            feed={"input_NDHWC": x_NDHWC,
+                  "input_NCDHW": x_NCDHW},
+            fetch_list=[
+                out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8
+            ])
+
+        assert np.allclose(
+            res_1,
+            pool3D_forward_naive(
+                x=x_NDHWC,
+                ksize=ksize,
+                pool_type="max",
+                strides=[1, 1, 1],
+                paddings=[1, 1, 1],
+                data_format="NDHWC"))
+
+        assert np.allclose(
+            res_2,
+            pool3D_forward_naive(
+                x=x_NDHWC,
+                ksize=ksize,
+                pool_type="avg",
+                strides=[1, 1, 1],
+                paddings=[1, 1, 1, 1, 1, 1],
+                data_format="NDHWC"))
+
+        assert np.allclose(
+            res_3,
+            pool3D_forward_naive(
+                x=x_NCDHW,
+                ksize=ksize,
+                pool_type="avg",
+                strides=[1, 1, 1],
+                paddings=[1, 1, 1, 1, 1, 1],
+                data_format="NCDHW"),
+            rtol=0.07,
+            atol=1e-05)
+
+        assert np.allclose(
+            res_4,
+            pool3D_forward_naive(
+                x=x_NCDHW,
+                ksize=ksize,
+                pool_type="avg",
+                strides=[1, 1, 1],
+                paddings=[1, 2, 1, 0, 0, 1],
+                data_format="NCDHW"),
+            rtol=0.07,
+            atol=1e-05)
+
+        # VALID
+        assert np.allclose(
+            res_5,
+            pool3D_forward_naive(
+                x=x_NDHWC,
+                ksize=ksize,
+                pool_type="avg",
+                strides=[1, 1, 1],
+                paddings=[10, 20],
+                padding_algorithm="VALID",
+                data_format="NDHWC"))
+
+        assert np.allclose(
+            res_6,
+            pool3D_forward_naive(
+                x=x_NCDHW,
+                ksize=ksize,
+                pool_type="avg",
+                strides=[1, 1, 1],
+                paddings=[10, 20],
+                padding_algorithm="VALID",
+                data_format="NCDHW"),
+            rtol=0.07,
+            atol=1e-05)
+
+        # SAME
+        assert np.allclose(
+            res_7,
+            pool3D_forward_naive(
+                x=x_NDHWC,
+                ksize=ksize,
+                pool_type="avg",
+                strides=[1, 1, 1],
+                paddings=[10, 20],
+                padding_algorithm="SAME",
+                data_format="NDHWC"))
+
+        assert np.allclose(
+            res_8,
+            pool3D_forward_naive(
+                x=x_NCDHW,
+                ksize=[4, 4, 4],
+                pool_type="avg",
+                strides=[1, 1, 1],
+                paddings=[10, 20],
+                padding_algorithm="SAME",
+                data_format="NCDHW"),
+            rtol=0.07,
+            atol=1e-05)
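+
+
+# The eight calls above cover every accepted pool_padding form. As a quick
+# reference, this hypothetical helper (illustration only, not part of the
+# fluid API) reduces the list forms to the canonical 6-element
+# [begin, end] x (D, H, W) layout that the asserts feed to
+# pool3D_forward_naive:
+def _flatten_pool3d_padding(padding, data_format="NCDHW"):
+    if isinstance(padding, int):
+        padding = [padding] * 3
+    if len(padding) == 5:  # one [begin, end] pair per axis of data_format
+        spatial = padding[1:4] if data_format == "NDHWC" else padding[2:5]
+        return [p for pair in spatial for p in pair]
+    if len(padding) == 3:  # symmetric pad per spatial dimension
+        return [p for pad in padding for p in (pad, pad)]
+    return list(padding)  # already [begin, end] per spatial dimension
+
+
+# e.g. out_2's nested NDHWC padding flattens to [1, 1, 1, 1, 1, 1], matching
+# the paddings used in the corresponding res_2 assert.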
pool_type="max", + pool_padding=[1, 1, 1], + use_cudnn=[0], + data_format="NDHWC") + + self.assertRaises(ValueError, run_1) + + # data_format value error + def run_2(): + out_2 = fluid.layers.pool3d( + input=input_NDHWC, + pool_size=ksize, + pool_type="max", + pool_padding=[1, 1, 1], + use_cudnn=False, + data_format="NDHWCC") + + self.assertRaises(ValueError, run_2) + + # padding str value error + def run_3(): + out_3 = fluid.layers.pool3d( + input=input_NDHWC, + pool_size=ksize, + pool_type="max", + pool_padding="VALIDSAME", + use_cudnn=False, + data_format="NDHWC") + + self.assertRaises(ValueError, run_3) + + # padding str valid and ceil_mode value error + def run_4(): + out_4 = fluid.layers.pool3d( + input=input_NDHWC, + pool_size=ksize, + pool_type="max", + pool_padding="VALID", + use_cudnn=False, + ceil_mode=True, + data_format="NDHWC") + + self.assertRaises(ValueError, run_4) + + # padding with 8 ele. value error + def run_5(): + out_5 = fluid.layers.pool3d( + input=input_NDHWC, + pool_size=ksize, + pool_type="max", + pool_padding=[[1, 1], [0, 0], [0, 0], [1, 1], [1, 1]], + use_cudnn=False, + data_format="NDHWC") + + self.assertRaises(ValueError, run_5) + + if __name__ == '__main__': unittest.main() -- GitLab