Commit 24010472 authored by liym27, committed by Tao Luo

fix pool2d pool3d, support asymmetric padding and channel_last (#19739)

* fix pool2d pool3d:
1. support asymmetric padding;
2. support padding algorithm:"SAME" and "VALID";
3. support channel_last: data_format NHWC and NDHWC;
4. support inferring shape when the input has negative dims at compile time;
5. update the doc of the Python API and C++;
6. fix a bug in the CUDA kernel when Attr(adaptive) is true.

test=develop,test=document_preview

* fix 'tensors' to 'Tensors'. test=develop,test=document_preview

* add test for coverage of ValueError. test=develop,test=document_preview

* resolve conflict in test_pool2d. test=develop
Parent fe581b0e
......@@ -144,8 +144,8 @@ paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size',
paddle.fluid.layers.sequence_pool (ArgSpec(args=['input', 'pool_type', 'is_test', 'pad_value'], varargs=None, keywords=None, defaults=(False, 0.0)), ('document', 'e90a93251c52dc4e6fb34fb3991b3f82'))
paddle.fluid.layers.sequence_softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'eaa9d0bbd3d4e017c8bc4ecdac483711'))
paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name', 'axis'], varargs=None, keywords=None, defaults=(False, None, -1)), ('document', 'cee673c79e3ff4582656a24e04f841e5'))
paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', 'be7e530dcbd603962e25573a63eb145e'))
paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '053b1a855f13a066d005759171724bc6'))
paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive', 'data_format'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True, 'NCHW')), ('document', '630cae697d46b4b575b15d56cf8be25a'))
paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive', 'data_format'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True, 'NCDHW')), ('document', 'db0035a3132b1dfb12e53c57591fb9f6'))
paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '52343203de40afe29607397e13aaf0d2'))
paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '55db6ae7275fb9678a6814aebab81a9c'))
paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '9e5a9f4f6d82d34a33d9ca632379cbcc'))
......
......@@ -13,17 +13,21 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/pooling.h"
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
namespace math {
/*
* All tensors are in NCHW format.
* Ksize, strides, paddings are two elements. These two elements represent
* height and width, respectively.
*/
* Tensors are in NCHW or NHWC format.
* Ksize, strides are two elements. These two elements represent height
* and width, respectively.
* Paddings are four elements. These four elements represent height_up,
* height_down, width_left and width_right, respectively.
*/
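/*
* A hedged illustrative sketch (an assumption, not taken from this file):
* with the four-element padding order above, one spatial output dimension
* follows
*   out = (in + pad_before + pad_after - ksize) / stride + 1.
* For example, in = 5, ksize = 3, stride = 2, pad_before = 1, pad_after = 2
* gives out = (5 + 1 + 2 - 3) / 2 + 1 = 3.
*/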
template <typename PoolProcess, typename T>
class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
public:
......@@ -92,12 +96,137 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
}
}
}
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_process,
bool exclusive, bool adaptive, framework::Tensor* output) {
bool channel_last = (data_format == "NHWC");
const int batch_size = input.dims()[0];
const int input_channels = channel_last ? input.dims()[3] : input.dims()[1];
const int input_height = channel_last ? input.dims()[1] : input.dims()[2];
const int input_width = channel_last ? input.dims()[2] : input.dims()[3];
const int output_channels =
channel_last ? output->dims()[3] : output->dims()[1];
const int output_height =
channel_last ? output->dims()[1] : output->dims()[2];
const int output_width =
channel_last ? output->dims()[2] : output->dims()[3];
const int ksize_height = ksize[0];
const int ksize_width = ksize[1];
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const T* input_data = input.data<T>();
T* output_data = output->mutable_data<T>(context.GetPlace());
int hstart, hend;
int wstart, wend;
if (!channel_last) {
const int input_stride = input_height * input_width;
const int output_stride = output_height * output_width;
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < output_height; ++ph) {
if (adaptive) {
hstart = AdaptStartIndex(ph, input_height, output_height);
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
wstart = AdaptStartIndex(pw, input_width, output_width);
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
}
T ele = pool_process.initial();
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
pool_process.compute(input_data[h * input_width + w], &ele);
}
}
int pool_size = (exclusive || adaptive)
? (hend - hstart) * (wend - wstart)
: ksize_height * ksize_width;
pool_process.finalize(static_cast<T>(pool_size), &ele);
output_data[ph * output_width + pw] = ele;
}
}
input_data += input_stride;
output_data += output_stride;
}
}
} else {
const int input_stride = input_height * input_width * input_channels;
const int output_stride = output_height * output_width * output_channels;
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < output_height; ++ph) {
if (adaptive) {
hstart = AdaptStartIndex(ph, input_height, output_height);
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
wstart = AdaptStartIndex(pw, input_width, output_width);
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
}
T ele = pool_process.initial();
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
pool_process.compute(
input_data[h * input_width * input_channels +
w * input_channels + c],
&ele);
}
}
int pool_size = (exclusive || adaptive)
? (hend - hstart) * (wend - wstart)
: ksize_height * ksize_width;
pool_process.finalize(static_cast<T>(pool_size), &ele);
output_data[ph * output_width * output_channels +
pw * output_channels + c] = ele;
}
}
}
input_data += input_stride;
output_data += output_stride;
}
}
}
};
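/*
* A minimal sketch of the NHWC addressing used above, assuming a row-major
* layout: element (n, h, w, c) lives at flattened offset
*   ((n * H + h) * W + w) * C + c,
* which is why the channel_last branch indexes
* input_data[h * input_width * input_channels + w * input_channels + c]
* after input_data has been advanced by n * input_stride.
*/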
/*
* All tensors are in NCHW format.
* Ksize, strides, paddings are two elements. These two elements represent height
* Tensors are in NCHW or NHWC format.
* Ksize, strides are two elements. These two elements represent height
* and width, respectively.
* Paddings are four elements. These four elements represent height_up,
* height_down, width_left and width_right, respectively.
*/
template <typename PoolProcess, class T>
class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
......@@ -173,13 +302,147 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
}
}
}
void operator()(
const platform::CPUDeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output_grad,
const std::vector<int>& ksize, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::string data_format,
PoolProcess pool_grad_process, bool exclusive, bool adaptive,
framework::Tensor* input_grad) {
bool channel_last = (data_format == "NHWC");
const int batch_size = input.dims()[0];
const int input_channels = channel_last ? input.dims()[3] : input.dims()[1];
const int input_height = channel_last ? input.dims()[1] : input.dims()[2];
const int input_width = channel_last ? input.dims()[2] : input.dims()[3];
const int output_channels =
channel_last ? output.dims()[3] : output.dims()[1];
const int output_height =
channel_last ? output.dims()[1] : output.dims()[2];
const int output_width = channel_last ? output.dims()[2] : output.dims()[3];
const int ksize_height = ksize[0];
const int ksize_width = ksize[1];
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
int hstart, hend;
int wstart, wend;
if (!channel_last) {
const int input_stride = input_height * input_width;
const int output_stride = output_height * output_width;
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < output_height; ++ph) {
if (adaptive) {
hstart = AdaptStartIndex(ph, input_height, output_height);
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
wstart = AdaptStartIndex(pw, input_width, output_width);
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
}
int pool_size = (exclusive || adaptive)
? (hend - hstart) * (wend - wstart)
: ksize_height * ksize_width;
float scale = 1.0 / pool_size;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
pool_grad_process.compute(
input_data[h * input_width + w],
output_data[ph * output_width + pw],
output_grad_data[ph * output_width + pw],
static_cast<T>(scale),
input_grad_data + h * input_width + w);
}
}
}
}
input_data += input_stride;
output_data += output_stride;
input_grad_data += input_stride;
output_grad_data += output_stride;
}
}
} else {
const int input_stride = input_height * input_width * input_channels;
const int output_stride = output_height * output_width * output_channels;
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < output_height; ++ph) {
if (adaptive) {
hstart = AdaptStartIndex(ph, input_height, output_height);
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
wstart = AdaptStartIndex(pw, input_width, output_width);
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
}
int pool_size = (exclusive || adaptive)
? (hend - hstart) * (wend - wstart)
: ksize_height * ksize_width;
float scale = 1.0 / pool_size;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
auto input_idx =
h * input_width * input_channels + w * input_channels + c;
auto output_idx = ph * output_width * output_channels +
pw * output_channels + c;
pool_grad_process.compute(
input_data[input_idx], output_data[output_idx],
output_grad_data[output_idx], static_cast<T>(scale),
input_grad_data + input_idx);
}
}
}
}
}
input_data += input_stride;
output_data += output_stride;
input_grad_data += input_stride;
output_grad_data += output_stride;
}
}
}
};
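/*
* A worked example of the gradient scale above, assuming AvgPoolGrad spreads
* the gradient uniformly over the window: with exclusive = true, a 3x3 kernel
* clipped to a 2x3 window at a border gives pool_size = 6 and scale = 1/6, so
* each input cell in that window accumulates output_grad * 1/6.
*/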
/*
* All tensors are in NCHW format.
* Ksize, strides, paddings are two elements. These two elements represent
* height and width, respectively.
*/
* Tensors are in NCHW or NHWC format.
* Ksize, strides are two elements. These two elements represent height
* and width, respectively.
* Paddings are four elements. These four elements represent height_up,
* height_down, width_left and width_right, respectively.
*/
template <class T>
class MaxPool2dGradFunctor<platform::CPUDeviceContext, T> {
public:
......@@ -239,53 +502,424 @@ class MaxPool2dGradFunctor<platform::CPUDeviceContext, T> {
}
}
}
};
template class MaxPool2dGradFunctor<platform::CPUDeviceContext, float>;
template class MaxPool2dGradFunctor<platform::CPUDeviceContext, double>;
template class Pool2dFunctor<platform::CPUDeviceContext,
paddle::operators::math::MaxPool<float>, float>;
template class Pool2dFunctor<platform::CPUDeviceContext,
paddle::operators::math::AvgPool<float>, float>;
template class Pool2dGradFunctor<platform::CPUDeviceContext,
paddle::operators::math::MaxPoolGrad<float>,
float>;
template class Pool2dGradFunctor<platform::CPUDeviceContext,
paddle::operators::math::AvgPoolGrad<float>,
float>;
template class Pool2dFunctor<platform::CPUDeviceContext,
paddle::operators::math::MaxPool<double>, double>;
template class Pool2dFunctor<platform::CPUDeviceContext,
paddle::operators::math::AvgPool<double>, double>;
template class Pool2dGradFunctor<platform::CPUDeviceContext,
paddle::operators::math::MaxPoolGrad<double>,
double>;
template class Pool2dGradFunctor<platform::CPUDeviceContext,
paddle::operators::math::AvgPoolGrad<double>,
double>;
void operator()(
const platform::CPUDeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output_grad,
const std::vector<int>& ksize, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::string data_format,
framework::Tensor* input_grad) {
bool channel_last = (data_format == "NHWC");
const int batch_size = input.dims()[0];
const int input_channels = channel_last ? input.dims()[3] : input.dims()[1];
const int input_height = channel_last ? input.dims()[1] : input.dims()[2];
const int input_width = channel_last ? input.dims()[2] : input.dims()[3];
const int output_channels =
channel_last ? output.dims()[3] : output.dims()[1];
const int output_height =
channel_last ? output.dims()[1] : output.dims()[2];
const int output_width = channel_last ? output.dims()[2] : output.dims()[3];
const int ksize_height = ksize[0];
const int ksize_width = ksize[1];
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
if (!channel_last) {
const int input_stride = input_height * input_width;
const int output_stride = output_height * output_width;
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
for (int pw = 0; pw < output_width; ++pw) {
int wstart = pw * stride_width - padding_width;
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
bool stop = false;
for (int h = hstart; h < hend && !stop; ++h) {
for (int w = wstart; w < wend && !stop; ++w) {
int input_idx = h * input_width + w;
int output_idx = ph * output_width + pw;
if (input_data[input_idx] == output_data[output_idx]) {
input_grad_data[input_idx] += output_grad_data[output_idx];
stop = true;
}
}
}
}
}
input_data += input_stride;
output_data += output_stride;
input_grad_data += input_stride;
output_grad_data += output_stride;
}
}
} else {
const int input_stride = input_height * input_width * input_channels;
const int output_stride = output_height * output_width * output_channels;
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
for (int pw = 0; pw < output_width; ++pw) {
int wstart = pw * stride_width - padding_width;
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
bool stop = false;
for (int h = hstart; h < hend && !stop; ++h) {
for (int w = wstart; w < wend && !stop; ++w) {
int input_idx =
h * input_width * input_channels + w * input_channels + c;
int output_idx = ph * output_width * output_channels +
pw * output_channels + c;
if (input_data[input_idx] == output_data[output_idx]) {
input_grad_data[input_idx] += output_grad_data[output_idx];
stop = true;
}
}
}
}
}
}
input_data += input_stride;
output_data += output_stride;
input_grad_data += input_stride;
output_grad_data += output_stride;
}
}
}
};
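/*
* A short note on the max-pool gradient routing above: the whole output
* gradient is added to the first input element in the window that equals the
* pooled output (the stop flag ends the scan), so ties go to the smallest
* index and each output gradient reaches exactly one input cell.
*/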
template class MaxPool2dGradFunctor<platform::CPUDeviceContext, float>;
template class MaxPool2dGradFunctor<platform::CPUDeviceContext, double>;
template class Pool2dFunctor<platform::CPUDeviceContext,
paddle::operators::math::MaxPool<float>, float>;
template class Pool2dFunctor<platform::CPUDeviceContext,
paddle::operators::math::AvgPool<float>, float>;
template class Pool2dGradFunctor<platform::CPUDeviceContext,
paddle::operators::math::MaxPoolGrad<float>,
float>;
template class Pool2dGradFunctor<platform::CPUDeviceContext,
paddle::operators::math::AvgPoolGrad<float>,
float>;
template class Pool2dFunctor<platform::CPUDeviceContext,
paddle::operators::math::MaxPool<double>, double>;
template class Pool2dFunctor<platform::CPUDeviceContext,
paddle::operators::math::AvgPool<double>, double>;
template class Pool2dGradFunctor<platform::CPUDeviceContext,
paddle::operators::math::MaxPoolGrad<double>,
double>;
template class Pool2dGradFunctor<platform::CPUDeviceContext,
paddle::operators::math::AvgPoolGrad<double>,
double>;
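// A hedged usage sketch (tensor setup, shapes, and the cpu_ctx/input/output
// names are assumed, not from this file): calling the new NHWC overload of
// the CPU Pool2dFunctor instantiated above.
//   paddle::operators::math::Pool2dFunctor<
//       platform::CPUDeviceContext, paddle::operators::math::AvgPool<float>,
//       float> pool2d;
//   pool2d(cpu_ctx, input, /*ksize=*/{2, 2}, /*strides=*/{2, 2},
//          /*paddings=*/{0, 0, 0, 0}, /*data_format=*/"NHWC",
//          paddle::operators::math::AvgPool<float>(), /*exclusive=*/true,
//          /*adaptive=*/false, &output);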
/*
* All tensors are in NCDHW format.
* Ksize, strides, paddings are three elements. These three elements represent
* depth, height and width, respectively.
*/
* Tensors are in NCDHW or NDHWC format.
* Ksize, strides, paddings are three elements. These three elements represent
* depth, height and width, respectively.
* Paddings are six elements. These six elements represent depth_front,
* depth_back, height_up, height_down, width_left and width_right,
* respectively.
*/
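/*
* Illustrative example, assuming the six-element order listed above:
* paddings = {1, 2, 0, 0, 3, 3} pads one plane in front of the depth axis,
* two planes behind it, nothing along height, and three columns on each side
* of the width axis.
*/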
template <typename PoolProcess, class T>
class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_process,
bool exclusive, bool adaptive, framework::Tensor* output) {
const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2];
const int input_height = input.dims()[3];
const int input_width = input.dims()[4];
const int output_channels = output->dims()[1];
const int output_depth = output->dims()[2];
const int output_height = output->dims()[3];
const int output_width = output->dims()[4];
const int ksize_depth = ksize[0];
const int ksize_height = ksize[1];
const int ksize_width = ksize[2];
const int stride_depth = strides[0];
const int stride_height = strides[1];
const int stride_width = strides[2];
const int padding_depth = paddings[0];
const int padding_height = paddings[1];
const int padding_width = paddings[2];
const int input_stride = input_depth * input_height * input_width;
const int output_stride = output_depth * output_height * output_width;
const T* input_data = input.data<T>();
T* output_data = output->mutable_data<T>(context.GetPlace());
int dstart, dend;
int hstart, hend;
int wstart, wend;
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
for (int pd = 0; pd < output_depth; ++pd) {
if (adaptive) {
dstart = AdaptStartIndex(pd, input_depth, output_depth);
dend = AdaptEndIndex(pd, input_depth, output_depth);
} else {
dstart = pd * stride_depth - padding_depth;
dend = std::min(dstart + ksize_depth, input_depth);
dstart = std::max(dstart, 0);
}
for (int ph = 0; ph < output_height; ++ph) {
if (adaptive) {
hstart = AdaptStartIndex(ph, input_height, output_height);
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
wstart = AdaptStartIndex(pw, input_width, output_width);
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
}
int output_idx = (pd * output_height + ph) * output_width + pw;
T ele = pool_process.initial();
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
pool_process.compute(
input_data[(d * input_height + h) * input_width + w],
&ele);
}
}
}
int pool_size =
(exclusive || adaptive)
? (dend - dstart) * (hend - hstart) * (wend - wstart)
: ksize_depth * ksize_height * ksize_width;
pool_process.finalize(static_cast<T>(pool_size), &ele);
output_data[output_idx] = ele;
}
}
}
input_data += input_stride;
output_data += output_stride;
}
}
}
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_process,
bool exclusive, bool adaptive, framework::Tensor* output) {
bool channel_last = (data_format == "NDHWC");
const int batch_size = input.dims()[0];
const int input_channels = channel_last ? input.dims()[4] : input.dims()[1];
const int input_depth = channel_last ? input.dims()[1] : input.dims()[2];
const int input_height = channel_last ? input.dims()[2] : input.dims()[3];
const int input_width = channel_last ? input.dims()[3] : input.dims()[4];
const int output_channels =
channel_last ? output->dims()[4] : output->dims()[1];
const int output_depth =
channel_last ? output->dims()[1] : output->dims()[2];
const int output_height =
channel_last ? output->dims()[2] : output->dims()[3];
const int output_width =
channel_last ? output->dims()[3] : output->dims()[4];
const int ksize_depth = ksize[0];
const int ksize_height = ksize[1];
const int ksize_width = ksize[2];
const int stride_depth = strides[0];
const int stride_height = strides[1];
const int stride_width = strides[2];
const int padding_depth = paddings[0];
const int padding_height = paddings[1];
const int padding_width = paddings[2];
const T* input_data = input.data<T>();
T* output_data = output->mutable_data<T>(context.GetPlace());
int dstart, dend;
int hstart, hend;
int wstart, wend;
if (!channel_last) {
const int input_stride = input_depth * input_height * input_width;
const int output_stride = output_depth * output_height * output_width;
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
for (int pd = 0; pd < output_depth; ++pd) {
if (adaptive) {
dstart = AdaptStartIndex(pd, input_depth, output_depth);
dend = AdaptEndIndex(pd, input_depth, output_depth);
} else {
dstart = pd * stride_depth - padding_depth;
dend = std::min(dstart + ksize_depth, input_depth);
dstart = std::max(dstart, 0);
}
for (int ph = 0; ph < output_height; ++ph) {
if (adaptive) {
hstart = AdaptStartIndex(ph, input_height, output_height);
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
wstart = AdaptStartIndex(pw, input_width, output_width);
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
}
int output_idx = (pd * output_height + ph) * output_width + pw;
T ele = pool_process.initial();
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
pool_process.compute(
input_data[(d * input_height + h) * input_width + w],
&ele);
}
}
}
int pool_size =
(exclusive || adaptive)
? (dend - dstart) * (hend - hstart) * (wend - wstart)
: ksize_depth * ksize_height * ksize_width;
pool_process.finalize(static_cast<T>(pool_size), &ele);
output_data[output_idx] = ele;
}
}
}
input_data += input_stride;
output_data += output_stride;
}
}
} else {
const int input_stride =
input_depth * input_height * input_width * input_channels;
const int output_stride =
output_depth * output_height * output_width * output_channels;
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
for (int pd = 0; pd < output_depth; ++pd) {
if (adaptive) {
dstart = AdaptStartIndex(pd, input_depth, output_depth);
dend = AdaptEndIndex(pd, input_depth, output_depth);
} else {
dstart = pd * stride_depth - padding_depth;
dend = std::min(dstart + ksize_depth, input_depth);
dstart = std::max(dstart, 0);
}
for (int ph = 0; ph < output_height; ++ph) {
if (adaptive) {
hstart = AdaptStartIndex(ph, input_height, output_height);
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
wstart = AdaptStartIndex(pw, input_width, output_width);
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
}
T ele = pool_process.initial();
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
int input_idx =
((d * input_height + h) * input_width + w) *
input_channels +
c;
pool_process.compute(input_data[input_idx], &ele);
}
}
}
int pool_size =
(exclusive || adaptive)
? (dend - dstart) * (hend - hstart) * (wend - wstart)
: ksize_depth * ksize_height * ksize_width;
pool_process.finalize(static_cast<T>(pool_size), &ele);
int output_idx =
((pd * output_height + ph) * output_width + pw) *
output_channels +
c;
output_data[output_idx] = ele;
}
}
}
}
input_data += input_stride;
output_data += output_stride;
}
}
}
};
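/*
* A minimal sketch of the NDHWC addressing used above, assuming a row-major
* layout: element (n, d, h, w, c) lives at flattened offset
*   (((n * D + d) * H + h) * W + w) * C + c,
* matching the input_idx and output_idx computations in the channel_last
* branch.
*/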
/*
* Tensors are in NCDHW or NDHWC format.
* Ksize, strides, paddings are three elements. These three elements represent
* depth, height and width, respectively.
* Paddings are six elements. These six elements represent depth_front,
* depth_back, height_up, height_down, width_left and width_right,
* respectively.
*/
template <typename PoolProcess, class T>
class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_process,
bool exclusive, bool adaptive, framework::Tensor* output) {
void operator()(
const platform::CPUDeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output_grad,
const std::vector<int>& ksize, const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_grad_process,
bool exclusive, bool adaptive, framework::Tensor* input_grad) {
const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2];
const int input_height = input.dims()[3];
const int input_width = input.dims()[4];
const int output_channels = output->dims()[1];
const int output_depth = output->dims()[2];
const int output_height = output->dims()[3];
const int output_width = output->dims()[4];
const int output_channels = output.dims()[1];
const int output_depth = output.dims()[2];
const int output_height = output.dims()[3];
const int output_width = output.dims()[4];
const int ksize_depth = ksize[0];
const int ksize_height = ksize[1];
const int ksize_width = ksize[2];
......@@ -295,12 +929,13 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const int padding_depth = paddings[0];
const int padding_height = paddings[1];
const int padding_width = paddings[2];
const int input_stride = input_depth * input_height * input_width;
const int output_stride = output_depth * output_height * output_width;
const T* input_data = input.data<T>();
T* output_data = output->mutable_data<T>(context.GetPlace());
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
int dstart, dend;
int hstart, hend;
......@@ -334,66 +969,68 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
}
int output_idx = (pd * output_height + ph) * output_width + pw;
T ele = pool_process.initial();
int pool_size =
(exclusive || adaptive)
? (dend - dstart) * (hend - hstart) * (wend - wstart)
: ksize_depth * ksize_height * ksize_width;
float scale = 1.0 / pool_size;
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
pool_process.compute(
input_data[(d * input_height + h) * input_width + w],
&ele);
int input_idx = (d * input_height + h) * input_width + w;
int output_idx =
(pd * output_height + ph) * output_width + pw;
pool_grad_process.compute(
input_data[input_idx], output_data[output_idx],
output_grad_data[output_idx], static_cast<T>(scale),
input_grad_data + input_idx);
}
}
}
int pool_size =
(exclusive || adaptive)
? (dend - dstart) * (hend - hstart) * (wend - wstart)
: ksize_depth * ksize_height * ksize_width;
pool_process.finalize(static_cast<T>(pool_size), &ele);
output_data[output_idx] = ele;
}
}
}
input_data += input_stride;
output_data += output_stride;
input_grad_data += input_stride;
output_grad_data += output_stride;
}
}
}
};
/*
* All tensors are in NCDHW format.
* Ksize, strides, paddings are three elements. These three elements represent
* depth, height and width, respectively.
*/
template <typename PoolProcess, class T>
class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
public:
void operator()(
const platform::CPUDeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output_grad,
const std::vector<int>& ksize, const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_grad_process,
bool exclusive, bool adaptive, framework::Tensor* input_grad) {
const std::vector<int>& paddings, const std::string data_format,
PoolProcess pool_grad_process, bool exclusive, bool adaptive,
framework::Tensor* input_grad) {
bool channel_last = (data_format == "NDHWC");
const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2];
const int input_height = input.dims()[3];
const int input_width = input.dims()[4];
const int output_channels = output.dims()[1];
const int output_depth = output.dims()[2];
const int output_height = output.dims()[3];
const int output_width = output.dims()[4];
const int input_channels = channel_last ? input.dims()[4] : input.dims()[1];
const int input_depth = channel_last ? input.dims()[1] : input.dims()[2];
const int input_height = channel_last ? input.dims()[2] : input.dims()[3];
const int input_width = channel_last ? input.dims()[3] : input.dims()[4];
const int output_channels =
channel_last ? output.dims()[4] : output.dims()[1];
const int output_depth = channel_last ? output.dims()[1] : output.dims()[2];
const int output_height =
channel_last ? output.dims()[2] : output.dims()[3];
const int output_width = channel_last ? output.dims()[3] : output.dims()[4];
const int ksize_depth = ksize[0];
const int ksize_height = ksize[1];
const int ksize_width = ksize[2];
const int stride_depth = strides[0];
const int stride_height = strides[1];
const int stride_width = strides[2];
const int padding_depth = paddings[0];
const int padding_height = paddings[1];
const int padding_width = paddings[2];
const int input_stride = input_depth * input_height * input_width;
const int output_stride = output_depth * output_height * output_width;
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
......@@ -403,6 +1040,9 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
int dstart, dend;
int hstart, hend;
int wstart, wend;
if (!channel_last) {
const int input_stride = input_depth * input_height * input_width;
const int output_stride = output_depth * output_height * output_width;
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
for (int pd = 0; pd < output_depth; ++pd) {
......@@ -460,14 +1100,85 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
output_grad_data += output_stride;
}
}
} else {
const int input_stride =
input_depth * input_height * input_width * input_channels;
const int output_stride =
output_depth * output_height * output_width * output_channels;
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
for (int pd = 0; pd < output_depth; ++pd) {
if (adaptive) {
dstart = AdaptStartIndex(pd, input_depth, output_depth);
dend = AdaptEndIndex(pd, input_depth, output_depth);
} else {
dstart = pd * stride_depth - padding_depth;
dend = std::min(dstart + ksize_depth, input_depth);
dstart = std::max(dstart, 0);
}
for (int ph = 0; ph < output_height; ++ph) {
if (adaptive) {
hstart = AdaptStartIndex(ph, input_height, output_height);
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
wstart = AdaptStartIndex(pw, input_width, output_width);
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
}
int pool_size =
(exclusive || adaptive)
? (dend - dstart) * (hend - hstart) * (wend - wstart)
: ksize_depth * ksize_height * ksize_width;
float scale = 1.0 / pool_size;
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
int input_idx =
((d * input_height + h) * input_width + w) *
input_channels +
c;
int output_idx =
((pd * output_height + ph) * output_width + pw) *
output_channels +
c;
pool_grad_process.compute(
input_data[input_idx], output_data[output_idx],
output_grad_data[output_idx], static_cast<T>(scale),
input_grad_data + input_idx);
}
}
}
}
}
}
}
input_data += input_stride;
output_data += output_stride;
input_grad_data += input_stride;
output_grad_data += output_stride;
}
}
}
};
/*
* All tensors are in NCDHW format.
* Ksize, strides, paddings are three elements. These three elements represent
* depth, height and width, respectively.
*/
* Tensors are in NCDHW or NDHWC format.
* Ksize, strides, paddings are three elements. These three elements represent
* depth, height and width, respectively.
* Paddings are six elements. These six elements represent depth_front,
* depth_back, height_up, height_down, width_left and width_right,
* respectively.
*/
template <class T>
class MaxPool3dGradFunctor<platform::CPUDeviceContext, T> {
public:
......@@ -541,8 +1252,139 @@ class MaxPool3dGradFunctor<platform::CPUDeviceContext, T> {
}
}
}
};
void operator()(
const platform::CPUDeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output_grad,
const std::vector<int>& ksize, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::string data_format,
framework::Tensor* input_grad) {
bool channel_last = (data_format == "NDHWC");
const int batch_size = input.dims()[0];
const int input_channels = channel_last ? input.dims()[4] : input.dims()[1];
const int input_depth = channel_last ? input.dims()[1] : input.dims()[2];
const int input_height = channel_last ? input.dims()[2] : input.dims()[3];
const int input_width = channel_last ? input.dims()[3] : input.dims()[4];
const int output_channels =
channel_last ? output.dims()[4] : output.dims()[1];
const int output_depth = channel_last ? output.dims()[1] : output.dims()[2];
const int output_height =
channel_last ? output.dims()[2] : output.dims()[3];
const int output_width = channel_last ? output.dims()[3] : output.dims()[4];
const int ksize_depth = ksize[0];
const int ksize_height = ksize[1];
const int ksize_width = ksize[2];
const int stride_depth = strides[0];
const int stride_height = strides[1];
const int stride_width = strides[2];
const int padding_depth = paddings[0];
const int padding_height = paddings[1];
const int padding_width = paddings[2];
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
if (!channel_last) {
const int input_stride = input_depth * input_height * input_width;
const int output_stride = output_depth * output_height * output_width;
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
for (int pd = 0; pd < output_depth; ++pd) {
int dstart = pd * stride_depth - padding_depth;
int dend = std::min(dstart + ksize_depth, input_depth);
dstart = std::max(dstart, 0);
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
for (int pw = 0; pw < output_width; ++pw) {
int wstart = pw * stride_width - padding_width;
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
bool stop = false;
for (int d = dstart; d < dend && !stop; ++d) {
for (int h = hstart; h < hend && !stop; ++h) {
for (int w = wstart; w < wend && !stop; ++w) {
int input_idx = (d * input_height + h) * input_width + w;
int output_idx =
(pd * output_height + ph) * output_width + pw;
if (input_data[input_idx] == output_data[output_idx]) {
input_grad_data[input_idx] +=
output_grad_data[output_idx];
stop = true;
}
}
}
}
}
}
}
input_data += input_stride;
output_data += output_stride;
input_grad_data += input_stride;
output_grad_data += output_stride;
}
}
} else {
const int input_stride =
input_depth * input_height * input_width * input_channels;
const int output_stride =
output_depth * output_height * output_width * output_channels;
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
for (int pd = 0; pd < output_depth; ++pd) {
int dstart = pd * stride_depth - padding_depth;
int dend = std::min(dstart + ksize_depth, input_depth);
dstart = std::max(dstart, 0);
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
for (int pw = 0; pw < output_width; ++pw) {
int wstart = pw * stride_width - padding_width;
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
bool stop = false;
for (int d = dstart; d < dend && !stop; ++d) {
for (int h = hstart; h < hend && !stop; ++h) {
for (int w = wstart; w < wend && !stop; ++w) {
int input_idx =
((d * input_height + h) * input_width + w) *
input_channels +
c;
int output_idx =
((pd * output_height + ph) * output_width + pw) *
output_channels +
c;
if (input_data[input_idx] == output_data[output_idx]) {
input_grad_data[input_idx] +=
output_grad_data[output_idx];
stop = true;
}
}
}
}
}
}
}
}
input_data += input_stride;
output_data += output_stride;
input_grad_data += input_stride;
output_grad_data += output_stride;
}
}
}
};
template class MaxPool3dGradFunctor<platform::CPUDeviceContext, float>;
template class MaxPool3dGradFunctor<platform::CPUDeviceContext, double>;
......
......@@ -29,13 +29,22 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
const int ksize_width, const int stride_height,
const int stride_width, const int padding_height,
const int padding_width, PoolProcess pool_process,
bool exclusive, bool adaptive, T* output_data) {
bool exclusive, bool adaptive, T* output_data,
bool channel_last = false) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
index += blockDim.x * gridDim.x) {
int pw = index % output_width;
int ph = (index / output_width) % output_height;
int c = (index / output_width / output_height) % channels;
int batch_idx = index / output_width / output_height / channels;
int pw, ph, c, batch_idx;
if (!channel_last) { /*NCHW*/
pw = index % output_width;
ph = (index / output_width) % output_height;
c = (index / output_width / output_height) % channels;
batch_idx = index / output_width / output_height / channels;
} else { /*NHWC*/
c = index % channels;
pw = (index / channels) % output_width;
ph = (index / channels / output_width) % output_height;
batch_idx = index / channels / output_width / output_height;
}
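// Worked example of the index decomposition above (values assumed for
// illustration): channels = 3, output_width = 4, output_height = 2,
// index = 29 in NHWC gives c = 29 % 3 = 2, pw = (29 / 3) % 4 = 1,
// ph = (29 / 3 / 4) % 2 = 0 and batch_idx = 29 / 3 / 4 / 2 = 1.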
int hstart, hend;
int wstart, wend;
......@@ -55,11 +64,17 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
wstart = max(wstart, 0);
}
if (!channel_last) {
input_data += (batch_idx * channels + c) * input_height * input_width;
} else {
input_data += batch_idx * input_height * input_width * channels;
}
T ele = pool_process.initial();
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
pool_process.compute(input_data[h * input_width + w], &ele);
auto input_idx = channel_last ? (h * input_width + w) * channels + c
: h * input_width + w;
pool_process.compute(input_data[input_idx], &ele);
}
}
int pool_size = (exclusive || adaptive) ? (hend - hstart) * (wend - wstart)
......@@ -68,7 +83,6 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
output_data[index] = ele;
}
}
template <typename PoolProcess, typename T>
__global__ void KernelPool2DGrad(
const int nthreads, const T* input_data, const T* output_data,
......@@ -76,13 +90,23 @@ __global__ void KernelPool2DGrad(
const int input_width, const int output_height, const int output_width,
const int ksize_height, const int ksize_width, const int stride_height,
const int stride_width, const int padding_height, const int padding_width,
PoolProcess pool_process, bool exclusive, bool adaptive, T* input_grad) {
PoolProcess pool_process, bool exclusive, bool adaptive, T* input_grad,
bool channel_last = false) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
index += blockDim.x * gridDim.x) {
int w_offset = index % input_width + padding_width;
int h_offset = (index / input_width) % input_height + padding_height;
int offsetC = (index / input_width / input_height) % channels;
int batch_idx = index / input_width / input_height / channels;
int w_offset, h_offset, offsetC, batch_idx;
if (!channel_last) { /* NCHW */
w_offset = index % input_width + padding_width;
h_offset = (index / input_width) % input_height + padding_height;
offsetC = (index / input_width / input_height) % channels;
batch_idx = index / input_width / input_height / channels;
} else { /* NHWC */
offsetC = index % channels;
w_offset = (index / channels) % input_width + padding_width;
h_offset =
(index / channels / input_width) % input_height + padding_height;
batch_idx = index / channels / input_width / input_height;
}
int phstart, phend;
int pwstart, pwend;
......@@ -105,10 +129,18 @@ __global__ void KernelPool2DGrad(
}
T gradient = 0;
T input = input_data[index];
int output_idx =
(batch_idx * channels + offsetC) * output_height * output_width;
output_data += output_idx;
output_grad += output_idx;
int output_stride;
if (!channel_last) {
output_stride =
(batch_idx * channels + offsetC) * output_height * output_width;
} else {
output_stride = batch_idx * output_height * output_width * channels;
}
output_data += output_stride;
output_grad += output_stride;
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
int pool_size;
......@@ -127,7 +159,9 @@ __global__ void KernelPool2DGrad(
pool_size = exclusive ? (hend - hstart) * (wend - wstart)
: ksize_height * ksize_width;
}
int output_sub_idx = ph * output_width + pw;
int output_sub_idx = channel_last
? (ph * output_width + pw) * channels + offsetC
: ph * output_width + pw;
pool_process.compute(input, output_data[output_sub_idx],
output_grad[output_sub_idx],
static_cast<T>(1.0 / pool_size), &gradient);
......@@ -144,14 +178,21 @@ __global__ void KernelMaxPool2DGrad(
const int input_width, const int output_height, const int output_width,
const int ksize_height, const int ksize_width, const int stride_height,
const int stride_width, const int padding_height, const int padding_width,
T* input_grad) {
T* input_grad, bool channel_last = false) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
index += blockDim.x * gridDim.x) {
int pw = index % output_width;
int ph = (index / output_width) % output_height;
int c = (index / output_width / output_height) % channels;
int batch_idx = index / output_width / output_height / channels;
int pw, ph, c, batch_idx;
if (!channel_last) { /* NCHW */
pw = index % output_width;
ph = (index / output_width) % output_height;
c = (index / output_width / output_height) % channels;
batch_idx = index / output_width / output_height / channels;
} else { /* NHWC */
c = index % channels;
pw = (index / channels) % output_width;
ph = (index / channels / output_width) % output_height;
batch_idx = index / channels / output_width / output_height;
}
int hstart = ph * stride_height - padding_height;
int hend = min(hstart + ksize_height, input_height);
hstart = max(hstart, 0);
......@@ -160,16 +201,24 @@ __global__ void KernelMaxPool2DGrad(
int wend = min(wstart + ksize_width, input_width);
wstart = max(wstart, 0);
input_data += (batch_idx * channels + c) * input_height * input_width;
input_grad += (batch_idx * channels + c) * input_height * input_width;
int input_stride;
if (!channel_last) {
input_stride = (batch_idx * channels + c) * input_height * input_width;
} else {
input_stride = batch_idx * input_height * input_width * channels;
}
input_data += input_stride;
input_grad += input_stride;
T ele = output_data[index];
int maxIndex = -1;
bool stop = false;
for (int h = hstart; h < hend && !stop; ++h) {
for (int w = wstart; w < wend && !stop; ++w) {
if (ele == input_data[h * input_width + w]) {
maxIndex = h * input_width + w;
int input_data_idx = channel_last ? (h * input_width + w) * channels + c
: h * input_width + w;
if (ele == input_data[input_data_idx]) {
maxIndex = input_data_idx;
stop = true;
}
}
......@@ -214,10 +263,12 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
}
/*
* All tensors are in NCHW format.
* Ksize, strides, paddings are two elements. These two elements represent
* height and width, respectively.
*/
* Tensors are in NCHW or NHWC format.
* Ksize, strides are two elements. These two elements represent height
* and width, respectively.
* Paddings are four elements. These four elements represent height_up,
* height_down, width_left and width_right, respectively.
*/
template <typename PoolProcess, typename T>
class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
public:
......@@ -254,13 +305,57 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
stride_width, padding_height, padding_width, pool_process, exclusive,
adaptive, output_data);
}
};
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_process,
bool exclusive, bool adaptive, framework::Tensor* output) {
bool channel_last = (data_format == "NHWC");
const int batch_size = input.dims()[0];
const int input_channels = channel_last ? input.dims()[3] : input.dims()[1];
const int input_height = channel_last ? input.dims()[1] : input.dims()[2];
const int input_width = channel_last ? input.dims()[2] : input.dims()[3];
const int output_channels =
channel_last ? output->dims()[3] : output->dims()[1];
const int output_height =
channel_last ? output->dims()[1] : output->dims()[2];
const int output_width =
channel_last ? output->dims()[2] : output->dims()[3];
const int ksize_height = ksize[0];
const int ksize_width = ksize[1];
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const T* input_data = input.data<T>();
T* output_data = output->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * output_channels * output_height * output_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelPool2D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
nthreads, input_data, input_channels, input_height, input_width,
output_height, output_width, ksize_height, ksize_width, stride_height,
stride_width, padding_height, padding_width, pool_process, exclusive,
adaptive, output_data, channel_last);
}
};
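/*
* A brief note on the launch configuration above: one thread handles one
* output element, so nthreads = N * C * H_out * W_out and the kernel runs in
* ceil(nthreads / 1024) blocks of 1024 threads; e.g. nthreads = 5000 gives
* blocks = (5000 + 1024 - 1) / 1024 = 5.
*/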
/*
* All tensors are in NCHW format.
* Ksize, strides, paddings are two elements. These two elements represent
* height and width, respectively.
*/
* Tensors are in NCHW or NHWC format.
* Ksize, strides are two elements. These two elements represent height
* and width, respectively.
* Paddings are four elements. These four elements represent height_up,
* height_down, width_left and width_right, respectively.
*/
template <typename PoolProcess, typename T>
class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
public:
......@@ -302,13 +397,62 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
ksize_width, stride_height, stride_width, padding_height, padding_width,
pool_process, exclusive, adaptive, input_grad_data);
}
void operator()(
const platform::CUDADeviceContext& context,
const framework::Tensor& input, const framework::Tensor& output,
const framework::Tensor& output_grad, const std::vector<int>& ksize,
const std::vector<int>& strides, const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_process, bool exclusive,
bool adaptive, framework::Tensor* input_grad) {
bool channel_last = (data_format == "NHWC");
const int batch_size = input.dims()[0];
const int input_channels = channel_last ? input.dims()[3] : input.dims()[1];
const int input_height = channel_last ? input.dims()[1] : input.dims()[2];
const int input_width = channel_last ? input.dims()[2] : input.dims()[3];
const int output_channels =
channel_last ? output.dims()[3] : output.dims()[1];
const int output_height =
channel_last ? output.dims()[1] : output.dims()[2];
const int output_width = channel_last ? output.dims()[2] : output.dims()[3];
const int ksize_height = ksize[0];
const int ksize_width = ksize[1];
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * input_channels * input_height * input_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelPool2DGrad<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
nthreads, input_data, output_data, output_grad_data, input_channels,
input_height, input_width, output_height, output_width, ksize_height,
ksize_width, stride_height, stride_width, padding_height, padding_width,
pool_process, exclusive, adaptive, input_grad_data, channel_last);
}
};
/*
* All tensors are in NCHW format.
* Ksize, strides, paddings are two elements. These two elements represent
* height and width, respectively.
*/
* Tensors are in NCHW or NHWC format.
* Ksize, strides are two elements. These two elements represent height
* and width, respectively.
* Paddings are four elements. These four elements represent height_up,
* height_down, width_left and width_right, respectively.
*/
template <typename T>
class MaxPool2dGradFunctor<platform::CUDADeviceContext, T> {
public:
......@@ -350,6 +494,51 @@ class MaxPool2dGradFunctor<platform::CUDADeviceContext, T> {
ksize_width, stride_height, stride_width, padding_height, padding_width,
input_grad_data);
}
void operator()(
const platform::CUDADeviceContext& context,
const framework::Tensor& input, const framework::Tensor& output,
const framework::Tensor& output_grad, const std::vector<int>& ksize,
const std::vector<int>& strides, const std::vector<int>& paddings,
const std::string data_format, framework::Tensor* input_grad) {
bool channel_last = (data_format == "NHWC");
const int batch_size = input.dims()[0];
const int input_channels = channel_last ? input.dims()[3] : input.dims()[1];
const int input_height = channel_last ? input.dims()[1] : input.dims()[2];
const int input_width = channel_last ? input.dims()[2] : input.dims()[3];
const int output_channels =
channel_last ? output.dims()[3] : output.dims()[1];
const int output_height =
channel_last ? output.dims()[1] : output.dims()[2];
const int output_width = channel_last ? output.dims()[2] : output.dims()[3];
const int ksize_height = ksize[0];
const int ksize_width = ksize[1];
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * output_channels * output_height * output_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelMaxPool2DGrad<T><<<grid, threads, 0, context.stream()>>>(
nthreads, input_data, output_data, output_grad_data, input_channels,
input_height, input_width, output_height, output_width, ksize_height,
ksize_width, stride_height, stride_width, padding_height, padding_width,
input_grad_data, channel_last);
}
};
template class Pool2dDirectCUDAFunctor<paddle::operators::math::MaxPool<float>,
......@@ -389,15 +578,26 @@ __global__ void KernelPool3D(
const int ksize_depth, const int ksize_height, const int ksize_width,
const int stride_depth, const int stride_height, const int stride_width,
const int padding_depth, const int padding_height, const int padding_width,
PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data) {
PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data,
bool channel_last = false) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
index += blockDim.x * gridDim.x) {
int pw = index % output_width;
int ph = (index / output_width) % output_height;
int pd = (index / output_width / output_height) % output_depth;
int c = (index / output_width / output_height / output_depth) % channels;
int batch_idx =
index / output_width / output_height / output_depth / channels;
int pw, ph, pd, c, batch_idx;
if (!channel_last) {
pw = index % output_width;
ph = (index / output_width) % output_height;
pd = (index / output_width / output_height) % output_depth;
c = (index / output_width / output_height / output_depth) % channels;
batch_idx =
index / output_width / output_height / output_depth / channels;
} else {
c = index % channels;
pw = (index / channels) % output_width;
ph = (index / channels / output_width) % output_height;
pd = (index / channels / output_width / output_height) % output_depth;
batch_idx =
index / channels / output_width / output_height / output_depth;
}
int dstart, dend;
int hstart, hend;
......@@ -422,14 +622,26 @@ __global__ void KernelPool3D(
hstart = max(hstart, 0);
wstart = max(wstart, 0);
}
int input_data_stride;
if (!channel_last) { /* NCDHW */
input_data_stride =
(batch_idx * channels + c) * input_depth * input_height * input_width;
} else { /* NDHWC */
input_data_stride =
batch_idx * input_depth * input_height * input_width * channels;
}
input_data += input_data_stride;
T ele = pool_process.initial();
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
auto input_data_idx =
channel_last
? ((d * input_height + h) * input_width + w) * channels + c
: (d * input_height + h) * input_width + w;
pool_process.compute(input_data[input_data_idx], &ele);
}
}
}
......@@ -450,15 +662,27 @@ __global__ void KernelPool3DGrad(
const int ksize_height, const int ksize_width, const int stride_depth,
const int stride_height, const int stride_width, const int padding_depth,
const int padding_height, const int padding_width, PoolProcess pool_process,
bool exclusive, bool adaptive, T* input_grad, bool channel_last = false) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
index += blockDim.x * gridDim.x) {
int w_offset, h_offset, d_offset, offsetC, batch_idx;
if (!channel_last) { /* "NCDHW" */
w_offset = index % input_width + padding_width;
h_offset = (index / input_width) % input_height + padding_height;
d_offset =
(index / input_width / input_height) % input_depth + padding_depth;
offsetC = (index / input_width / input_height / input_depth) % channels;
batch_idx = index / input_width / input_height / input_depth / channels;
} else { /* "NDHWC" */
offsetC = index % channels;
w_offset = (index / channels) % input_width + padding_width;
h_offset =
(index / channels / input_width) % input_height + padding_height;
d_offset = (index / channels / input_width / input_height) % input_depth +
padding_depth;
batch_idx = index / channels / input_width / input_height / input_depth;
}
int pdstart, pdend;
int phstart, phend;
......@@ -490,10 +714,17 @@ __global__ void KernelPool3DGrad(
T gradient = 0;
T input = input_data[index];
int output_stride;
if (!channel_last) {
output_stride = (batch_idx * channels + offsetC) * output_depth *
output_height * output_width;
} else {
output_stride =
batch_idx * output_depth * output_height * output_width * channels;
}
output_data += output_stride;
output_grad += output_stride;
for (int pd = pdstart; pd < pdend; ++pd) {
for (int ph = phstart; ph < phend; ++ph) {
......@@ -522,7 +753,13 @@ __global__ void KernelPool3DGrad(
exclusive ? (dend - dstart) * (hend - hstart) * (wend - wstart)
: ksize_depth * ksize_height * ksize_width;
}
int output_sub_idx =
channel_last
? ((pd * output_height + ph) * output_width + pw) * channels +
offsetC
: (pd * output_height + ph) * output_width + pw;
pool_process.compute(input, output_data[output_sub_idx],
output_grad[output_sub_idx],
static_cast<T>(1.0 / pool_size), &gradient);
......@@ -541,38 +778,64 @@ __global__ void KernelMaxPool3DGrad(
const int output_height, const int output_width, const int ksize_depth,
const int ksize_height, const int ksize_width, const int stride_depth,
const int stride_height, const int stride_width, const int padding_depth,
const int padding_height, const int padding_width, T* input_grad,
bool channel_last = false) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
index += blockDim.x * gridDim.x) {
int pw, ph, pd, c, batch_idx;
if (!channel_last) { /*NCDHW*/
pw = index % output_width;
ph = (index / output_width) % output_height;
pd = (index / output_width / output_height) % output_depth;
c = (index / output_width / output_height / output_depth) % channels;
batch_idx =
index / output_width / output_height / output_depth / channels;
} else { /*NDHWC*/
c = index % channels;
pw = (index / channels) % output_width;
ph = (index / channels / output_width) % output_height;
pd = (index / channels / output_width / output_height) % output_depth;
batch_idx =
index / channels / output_width / output_height / output_depth;
}
int dstart = pd * stride_depth - padding_depth;
int hstart = ph * stride_height - padding_height;
int wstart = pw * stride_width - padding_width;
int dend = min(dstart + ksize_depth, input_depth);
int hend = min(hstart + ksize_height, input_height);
int wend = min(wstart + ksize_width, input_width);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
T ele = output_data[index];
bool stop = false;
int maxIdx = -1;
int input_stride;
if (!channel_last) {
input_stride =
(batch_idx * channels + c) * input_depth * input_height * input_width;
} else {
input_stride =
batch_idx * input_depth * input_height * input_width * channels;
}
input_data += input_stride;
input_grad += input_stride;
for (int d = dstart; d < dend && !stop; ++d) {
for (int h = hstart; h < hend && !stop; ++h) {
for (int w = wstart; w < wend && !stop; ++w) {
int input_data_idx =
channel_last
? ((d * input_height + h) * input_width + w) * channels + c
: (d * input_height + h) * input_width + w;
if (ele == input_data[input_data_idx]) {
stop = true;
maxIdx = input_data_idx;
}
}
}
......@@ -585,10 +848,13 @@ __global__ void KernelMaxPool3DGrad(
}
/*
 * Tensors are in NCDHW or NDHWC format.
 * Ksize, strides are three elements. These three elements represent
 * depth, height and width, respectively.
 * Paddings are six elements. These six elements represent depth_front,
 * depth_back, height_up, height_down, width_left and width_right,
 * respectively.
 */
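For example, paddings = {1, 2, 0, 0, 3, 3} pads the input volume with one plane before the depth axis and two planes after it, nothing along the height axis, and three columns on each side of the width axis.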
template <typename PoolProcess, class T>
class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
public:
......@@ -632,13 +898,67 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
padding_depth, padding_height, padding_width, pool_process, exclusive,
adaptive, output_data);
}
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_process,
bool exclusive, bool adaptive, framework::Tensor* output) {
bool channel_last = (data_format == "NDHWC");
const int batch_size = input.dims()[0];
const int input_channels = channel_last ? input.dims()[4] : input.dims()[1];
const int input_depth = channel_last ? input.dims()[1] : input.dims()[2];
const int input_height = channel_last ? input.dims()[2] : input.dims()[3];
const int input_width = channel_last ? input.dims()[3] : input.dims()[4];
const int output_channels =
channel_last ? output->dims()[4] : output->dims()[1];
const int output_depth =
channel_last ? output->dims()[1] : output->dims()[2];
const int output_height =
channel_last ? output->dims()[2] : output->dims()[3];
const int output_width =
channel_last ? output->dims()[3] : output->dims()[4];
const int ksize_depth = ksize[0];
const int ksize_height = ksize[1];
const int ksize_width = ksize[2];
const int stride_depth = strides[0];
const int stride_height = strides[1];
const int stride_width = strides[2];
const int padding_depth = paddings[0];
const int padding_height = paddings[1];
const int padding_width = paddings[2];
const T* input_data = input.data<T>();
T* output_data = output->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * output_channels * output_depth * output_height *
output_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelPool3D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
nthreads, input_data, input_channels, input_depth, input_height,
input_width, output_depth, output_height, output_width, ksize_depth,
ksize_height, ksize_width, stride_depth, stride_height, stride_width,
padding_depth, padding_height, padding_width, pool_process, exclusive,
adaptive, output_data, channel_last);
}
};
/*
 * Tensors are in NCDHW or NDHWC format.
 * Ksize, strides are three elements. These three elements represent
 * depth, height and width, respectively.
 * Paddings are six elements. These six elements represent depth_front,
 * depth_back, height_up, height_down, width_left and width_right,
 * respectively.
 */
template <typename PoolProcess, class T>
class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
public:
......@@ -688,13 +1008,69 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
stride_height, stride_width, padding_depth, padding_height,
padding_width, pool_process, exclusive, adaptive, input_grad_data);
}
void operator()(
const platform::CUDADeviceContext& context,
const framework::Tensor& input, const framework::Tensor& output,
const framework::Tensor& output_grad, const std::vector<int>& ksize,
const std::vector<int>& strides, const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_process, bool exclusive,
bool adaptive, framework::Tensor* input_grad) {
bool channel_last = (data_format == "NDHWC");
const int batch_size = input.dims()[0];
const int input_channels = channel_last ? input.dims()[4] : input.dims()[1];
const int input_depth = channel_last ? input.dims()[1] : input.dims()[2];
const int input_height = channel_last ? input.dims()[2] : input.dims()[3];
const int input_width = channel_last ? input.dims()[3] : input.dims()[4];
const int output_channels =
channel_last ? output.dims()[4] : output.dims()[1];
const int output_depth = channel_last ? output.dims()[1] : output.dims()[2];
const int output_height =
channel_last ? output.dims()[2] : output.dims()[3];
const int output_width = channel_last ? output.dims()[3] : output.dims()[4];
const int ksize_depth = ksize[0];
const int ksize_height = ksize[1];
const int ksize_width = ksize[2];
const int stride_depth = strides[0];
const int stride_height = strides[1];
const int stride_width = strides[2];
const int padding_depth = paddings[0];
const int padding_height = paddings[1];
const int padding_width = paddings[2];
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
int nthreads =
batch_size * input_channels * input_depth * input_height * input_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelPool3DGrad<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
nthreads, input_data, output_data, output_grad_data, input_channels,
input_depth, input_height, input_width, output_depth, output_height,
output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
stride_height, stride_width, padding_depth, padding_height,
padding_width, pool_process, exclusive, adaptive, input_grad_data,
        channel_last);
}
};
/*
 * Tensors are in NCDHW or NDHWC format.
 * Ksize, strides are three elements. These three elements represent
 * depth, height and width, respectively.
 * Paddings are six elements. These six elements represent depth_front,
 * depth_back, height_up, height_down, width_left and width_right,
 * respectively.
 */
template <class T>
class MaxPool3dGradFunctor<platform::CUDADeviceContext, T> {
public:
......@@ -743,6 +1119,57 @@ class MaxPool3dGradFunctor<platform::CUDADeviceContext, T> {
stride_height, stride_width, padding_depth, padding_height,
padding_width, input_grad_data);
}
void operator()(
const platform::CUDADeviceContext& context,
const framework::Tensor& input, const framework::Tensor& output,
const framework::Tensor& output_grad, const std::vector<int>& ksize,
const std::vector<int>& strides, const std::vector<int>& paddings,
const std::string data_format, framework::Tensor* input_grad) {
bool channel_last = (data_format == "NDHWC");
const int batch_size = input.dims()[0];
const int input_channels = channel_last ? input.dims()[4] : input.dims()[1];
const int input_depth = channel_last ? input.dims()[1] : input.dims()[2];
const int input_height = channel_last ? input.dims()[2] : input.dims()[3];
const int input_width = channel_last ? input.dims()[3] : input.dims()[4];
const int output_channels =
channel_last ? output.dims()[4] : output.dims()[1];
const int output_depth = channel_last ? output.dims()[1] : output.dims()[2];
const int output_height =
channel_last ? output.dims()[2] : output.dims()[3];
const int output_width = channel_last ? output.dims()[3] : output.dims()[4];
const int ksize_depth = ksize[0];
const int ksize_height = ksize[1];
const int ksize_width = ksize[2];
const int stride_depth = strides[0];
const int stride_height = strides[1];
const int stride_width = strides[2];
const int padding_depth = paddings[0];
const int padding_height = paddings[1];
const int padding_width = paddings[2];
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * output_channels * output_depth * output_height *
output_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelMaxPool3DGrad<T><<<grid, threads, 0, context.stream()>>>(
nthreads, input_data, output_data, output_grad_data, input_channels,
input_depth, input_height, input_width, output_depth, output_height,
output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
stride_height, stride_width, padding_depth, padding_height,
        padding_width, input_grad_data, channel_last);
}
};
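Note that the two 3-D gradient kernels map threads differently: KernelPool3DGrad launches one thread per input element (nthreads spans batch_size * input_channels * input_depth * input_height * input_width) and gathers contributions from every pooling window covering that element, whereas KernelMaxPool3DGrad launches one thread per output element and writes the gradient back only to the input position that produced the maximum (the actual update to input_grad falls in a part of the kernel elided from this diff).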
template class MaxPool3dGradFunctor<platform::CUDADeviceContext, float>;
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
......@@ -83,10 +84,11 @@ HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) {
/*
* \brief Getting pooling results, and calculating gradient.
*
 * In pool2d, all Tensors are in NCHW or NHWC format, where N is batch size, C
 * is the number of channels, and H and W are the height and width of the
 * feature.
 * In pool3d, all Tensors are in NCDHW or NDHWC format, where N is batch size,
 * C is the number of channels, and D, H and W are the depth, height and width
 * of the feature.
*
* In max pooling, it is possible that the pooling region has multiple maximum
* elements. In this case, we should compute the gradient of the first maximum
......@@ -115,6 +117,14 @@ class Pool2dFunctor {
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_compute,
bool exclusive, bool adaptive, framework::Tensor* output);
// overload operator() to support argument data_format
void operator()(const DeviceContext& context, const framework::Tensor& input,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_compute,
bool exclusive, bool adaptive, framework::Tensor* output);
};
template <typename DeviceContext, typename PoolProcess, typename T>
......@@ -127,6 +137,15 @@ class Pool2dGradFunctor {
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_compute,
bool exclusive, bool adaptive, framework::Tensor* input_grad);
// overload operator() to support argument data_format
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output,
const framework::Tensor& output_grad,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_compute,
bool exclusive, bool adaptive, framework::Tensor* input_grad);
};
template <typename DeviceContext, class T>
......@@ -139,6 +158,14 @@ class MaxPool2dGradFunctor {
const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* input_grad);
// overload operator() to support argument data_format
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output,
const framework::Tensor& output_grad,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, framework::Tensor* input_grad);
};
template <typename DeviceContext, typename PoolProcess, typename T>
......@@ -149,6 +176,13 @@ class Pool3dFunctor {
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_compute,
bool exclusive, bool adaptive, framework::Tensor* output);
// overload operator() to support argument data_format
void operator()(const DeviceContext& context, const framework::Tensor& input,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_compute,
bool exclusive, bool adaptive, framework::Tensor* output);
};
template <typename DeviceContext, typename PoolProcess, typename T>
......@@ -161,6 +195,15 @@ class Pool3dGradFunctor {
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_compute,
bool exclusive, bool adaptive, framework::Tensor* input_grad);
// overload operator() to support argument data_format
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output,
const framework::Tensor& output_grad,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_compute,
bool exclusive, bool adaptive, framework::Tensor* input_grad);
};
template <typename DeviceContext, class T>
......@@ -173,6 +216,14 @@ class MaxPool3dGradFunctor {
const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* input_grad);
// overload operator() to support argument data_format
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output,
const framework::Tensor& output_grad,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, framework::Tensor* input_grad);
};
/*
......
......@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/pool_op.h"
#include "paddle/fluid/platform/cudnn_helper.h"
......@@ -27,47 +29,117 @@ using PoolingMode = platform::PoolingMode;
template <typename T>
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
DataLayout getLayoutFromStr(std::string data_format) {
if (data_format == "NHWC") {
return DataLayout::kNHWC;
} else if (data_format == "NCHW") {
return DataLayout::kNCHW;
} else if (data_format == "NCDHW") {
return DataLayout::kNCDHW;
} else {
return DataLayout::kNCDHW;
}
}
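Note that getLayoutFromStr maps any unrecognized string to DataLayout::kNCDHW; data_format is assumed to have been validated before reaching this point.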
template <typename T>
class PoolCUDNNOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
"It must use CUDAPlace.");
const Tensor *input = ctx.Input<Tensor>("X");
Tensor *output = ctx.Output<Tensor>("Out");
output->mutable_data<T>(ctx.GetPlace());
std::string pooling_type = ctx.Attr<std::string>("pooling_type");
bool exclusive = ctx.Attr<bool>("exclusive");
bool adaptive = ctx.Attr<bool>("adaptive");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
if (ctx.Attr<bool>("global_pooling")) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0;
ksize[i] = static_cast<int>(input->dims()[i + 2]);
std::string data_format = ctx.Attr<std::string>("data_format");
bool global_pooling = ctx.Attr<bool>("global_pooling");
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
// update paddings
auto in_x_dims = input->dims();
framework::DDim data_dims;
if (channel_last) {
data_dims = framework::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
} else {
data_dims = framework::slice_ddim(in_x_dims, 2, in_x_dims.size());
}
UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm,
data_dims, strides, ksize);
if (data_dims.size() * 2 == paddings.size()) {
for (size_t i = 0; i < data_dims.size(); ++i) {
paddings.erase(paddings.begin() + i + 1);
}
}
if (global_pooling) {
UpdateKsize(&ksize, data_dims);
}
const std::string str_NCHW = "NCHW", str_NHWC = "NHWC";
const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC";
// -----------------transformed tensor ------------------------
Tensor transformed_input(input->type());
Tensor transformed_output(output->type());
DataLayout layout;
if (data_format == str_NDHWC) {
layout = DataLayout::kNCDHW;
auto &dev_ctx =
ctx.template device_context<paddle::platform::CUDADeviceContext>();
std::vector<int> axis{0, 4, 1, 2, 3};
// input
transformed_input.Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[4];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
in_dims_vec[4] = input->dims()[3];
transformed_input.Resize(framework::make_ddim(in_dims_vec));
transformed_input.mutable_data(ctx.GetPlace(), input->type());
math::Transpose<paddle::platform::CUDADeviceContext, T, 5> trans5;
trans5(dev_ctx, *input, &transformed_input, axis);
// output
transformed_output.Resize(output->dims());
auto out_dims_vec = framework::vectorize(output->dims());
out_dims_vec[1] = output->dims()[4];
out_dims_vec[2] = output->dims()[1];
out_dims_vec[3] = output->dims()[2];
out_dims_vec[4] = output->dims()[3];
transformed_output.Resize(framework::make_ddim(out_dims_vec));
} else {
layout = getLayoutFromStr(data_format);
transformed_input = *input;
transformed_output = *output;
}
    const T *transformed_input_data = transformed_input.data<T>();
    T *transformed_output_data = transformed_output.mutable_data<T>(
        transformed_output.dims(), ctx.GetPlace());
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor output_desc;
ScopedPoolingDescriptor pool_desc;
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize<int>(transformed_input.dims()));
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
layout, framework::vectorize<int>(transformed_output.dims()));
PoolingMode pooling_mode;
if (pooling_type == "max") {
......@@ -83,9 +155,19 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn pool algorithm ---------------------
auto handle = ctx.cuda_device_context().cudnn_handle();
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward(
        handle, cudnn_pool_desc, &alpha, cudnn_input_desc,
        transformed_input_data, &beta, cudnn_output_desc,
        transformed_output_data));

    // Transpose the result back to NDHWC if the input was channel_last.
if (data_format == str_NDHWC) {
auto &dev_ctx =
ctx.template device_context<paddle::platform::CUDADeviceContext>();
std::vector<int> axis{0, 2, 3, 4, 1};
math::Transpose<paddle::platform::CUDADeviceContext, T, 5> trans5_v2;
trans5_v2(dev_ctx, transformed_output, output, axis);
}
}
};
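The NDHWC path above transposes the input to a channel-first layout, runs the cuDNN kernel, and transposes the result back. A minimal standalone sketch (standard library only, illustrative) checking that the two axis vectors used above are inverse permutations:

#include <array>
#include <cassert>

int main() {
  // NDHWC -> NCDHW gather permutation and its inverse, matching the
  // axis vectors {0, 4, 1, 2, 3} and {0, 2, 3, 4, 1} used in the kernel.
  const std::array<int, 5> fwd = {0, 4, 1, 2, 3};
  const std::array<int, 5> inv = {0, 2, 3, 4, 1};
  std::array<int, 5> dims = {8, 7, 6, 5, 4};  // N, D, H, W, C
  std::array<int, 5> t{}, back{};
  for (int i = 0; i < 5; ++i) t[i] = dims[fwd[i]];  // {8, 4, 7, 6, 5}: N, C, D, H, W
  for (int i = 0; i < 5; ++i) back[i] = t[inv[i]];  // recovers the original order
  assert(back == dims);
  return 0;
}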
......@@ -93,7 +175,7 @@ template <typename T>
class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
"It must use CUDAPlace.");
const Tensor *input = ctx.Input<Tensor>("X");
......@@ -104,37 +186,109 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
std::string pooling_type = ctx.Attr<std::string>("pooling_type");
bool exclusive = ctx.Attr<bool>("exclusive");
bool adaptive = ctx.Attr<bool>("adaptive");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::string data_format = ctx.Attr<std::string>("data_format");
bool global_pooling = ctx.Attr<bool>("global_pooling");
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
// update paddings
auto in_x_dims = input->dims();
framework::DDim data_dims;
if (channel_last) {
data_dims = framework::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
} else {
data_dims = framework::slice_ddim(in_x_dims, 2, in_x_dims.size());
}
UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm,
data_dims, strides, ksize);
if (data_dims.size() * 2 == paddings.size()) {
for (size_t i = 0; i < data_dims.size(); ++i) {
paddings.erase(paddings.begin() + i + 1);
}
}
if (ctx.Attr<bool>("global_pooling")) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0;
ksize[i] = static_cast<int>(input->dims()[i + 2]);
if (global_pooling) {
UpdateKsize(&ksize, data_dims);
}
// ------- tensor grad --------------
Tensor transformed_input(input->type());
Tensor transformed_output(output->type());
Tensor transformed_output_grad(output_grad->type());
input_grad->mutable_data<T>(ctx.GetPlace());
Tensor transformed_input_grad(input_grad->type());
DataLayout layout;
const std::string str_NCHW = "NCHW", str_NHWC = "NHWC";
const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC";
if (data_format == str_NDHWC) {
layout = DataLayout::kNCDHW;
auto &dev_ctx =
ctx.template device_context<paddle::platform::CUDADeviceContext>();
std::vector<int> axis{0, 4, 1, 2, 3};
// input
transformed_input.Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[4];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
in_dims_vec[4] = input->dims()[3];
transformed_input.Resize(framework::make_ddim(in_dims_vec));
transformed_input.mutable_data(ctx.GetPlace(), input->type());
math::Transpose<paddle::platform::CUDADeviceContext, T, 5> trans5;
trans5(dev_ctx, *input, &transformed_input, axis);
// output
transformed_output.Resize(output->dims());
auto out_dims_vec = framework::vectorize(output->dims());
out_dims_vec[1] = output->dims()[4];
out_dims_vec[2] = output->dims()[1];
out_dims_vec[3] = output->dims()[2];
out_dims_vec[4] = output->dims()[3];
transformed_output.Resize(framework::make_ddim(out_dims_vec));
transformed_output.mutable_data(ctx.GetPlace(), output->type());
math::Transpose<paddle::platform::CUDADeviceContext, T, 5> trans5_v2;
trans5_v2(dev_ctx, *output, &transformed_output, axis);
// output grad
transformed_output_grad.Resize(framework::make_ddim(out_dims_vec));
transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type());
math::Transpose<paddle::platform::CUDADeviceContext, T, 5> trans5_v3;
trans5_v3(dev_ctx, *output_grad, &transformed_output_grad, axis);
// input grad
transformed_input_grad.Resize(framework::make_ddim(in_dims_vec));
} else {
layout = getLayoutFromStr(data_format);
transformed_input = *input;
transformed_output = *output;
transformed_output_grad = *output_grad;
transformed_input_grad = *input_grad;
}
const T *input_data = transformed_input.data<T>();
const T *output_data = transformed_output.data<T>();
const T *output_grad_data = transformed_output_grad.data<T>();
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor output_desc;
ScopedPoolingDescriptor pool_desc;
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize<int>(transformed_input.dims()));
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
layout, framework::vectorize<int>(transformed_output.dims()));
PoolingMode pooling_mode;
if (pooling_type == "max") {
......@@ -155,13 +309,21 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
auto handle = ctx.cuda_device_context().cudnn_handle();
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
if (input_grad) {
T *input_grad_data = transformed_input_grad.mutable_data<T>(
transformed_input_grad.dims(), ctx.GetPlace());
// Because beta is zero, it is unnecessary to reset input_grad.
CUDNN_ENFORCE(platform::dynload::cudnnPoolingBackward(
handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data,
cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data,
&beta, cudnn_input_desc, input_grad_data));
if (data_format == str_NDHWC) {
auto &dev_ctx =
ctx.template device_context<paddle::platform::CUDADeviceContext>();
std::vector<int> axis{0, 2, 3, 4, 1};
math::Transpose<paddle::platform::CUDADeviceContext, T, 5> trans5_v4;
trans5_v4(dev_ctx, transformed_input_grad, input_grad, axis);
}
}
}
};
......
......@@ -24,68 +24,93 @@ limitations under the License. */
namespace paddle {
namespace operators {
int PoolOutputSize(int input_size, int filter_size, int padding_1,
int padding_2, int stride, bool ceil_mode) {
int output_size;
if (!ceil_mode) {
output_size =
(input_size - filter_size + padding_1 + padding_2) / stride + 1;
} else {
output_size =
(input_size - filter_size + padding_1 + padding_2 + stride - 1) /
stride +
1;
}
PADDLE_ENFORCE_GT(
output_size, 0,
"Due to the settings of padding(%d,%d), filter_size(%d) and "
"stride(%d), the output size is less than 0, please check "
"again. Input_size:%d",
padding_1, padding_2, filter_size, stride, input_size);
return output_size;
}
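A quick sanity check of the formula (hypothetical calls, assuming the function above is in scope):

  // Asymmetric padding (0, 1) with input 5, filter 3, stride 2:
  int a = PoolOutputSize(5, 3, 0, 1, 2, false);  // (5 - 3 + 0 + 1) / 2 + 1 == 2
  int b = PoolOutputSize(5, 3, 0, 1, 2, true);   // the extra (stride - 1) rounds up: 3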
void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) of Pooling should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
"X(Input) of Pooling should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
"Out(Output) of Pooling should not be null.");
auto in_x_dims = ctx->GetInputDim("X");
std::string pooling_type = ctx->Attrs().Get<std::string>("pooling_type");
std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
bool ceil_mode = ctx->Attrs().Get<bool>("ceil_mode");
bool adaptive = ctx->Attrs().Get<bool>("adaptive");
bool global_pooling = ctx->Attrs().Get<bool>("global_pooling");
std::string data_format = ctx->Attrs().Get<std::string>("data_format");
std::string padding_algorithm =
ctx->Attrs().Get<std::string>("padding_algorithm");
auto in_x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(in_x_dims.size() == 4 || in_x_dims.size() == 5, true,
"Pooling intput should be 4-D or 5-D tensor.");
if (ctx->Attrs().Get<bool>("global_pooling")) {
ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0;
ksize[i] = static_cast<int>(in_x_dims[i + 2]);
}
}
PADDLE_ENFORCE_EQ(in_x_dims.size() - ksize.size(), 2U,
"Input size and pooling size should be consistent.");
PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
"Strides size and pooling size should be the same.");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
// update paddings if "SAME" or global_pooling
framework::DDim data_dims;
if (channel_last) {
data_dims = framework::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
} else {
data_dims = framework::slice_ddim(in_x_dims, 2, in_x_dims.size());
}
UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm,
data_dims, strides, ksize);
if (global_pooling) {
UpdateKsize(&ksize, data_dims);
}
std::vector<int64_t> output_shape;
if (adaptive) {
output_shape.insert(output_shape.end(), ksize.begin(), ksize.end());
} else {
for (size_t i = 0; i < data_dims.size(); ++i) {
if ((!ctx->IsRuntime()) && (data_dims[i] < 0)) {
output_shape.push_back(in_x_dims[i]);
} else {
output_shape.push_back(
PoolOutputSize(data_dims[i], ksize[i], paddings[2 * i],
paddings[2 * i + 1], strides[i], ceil_mode));
}
}
}
// output_N = input_N
output_shape.insert(output_shape.begin(), in_x_dims[0]);
// output_C = input_C
if (channel_last) {
output_shape.push_back(in_x_dims[in_x_dims.size() - 1]);
} else {
output_shape.insert(output_shape.begin() + 1, in_x_dims[1]);
}
ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
ctx->ShareLoD("X", "Out");
}
......@@ -93,7 +118,9 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType PoolOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = ctx.Attr<std::string>("data_format");
// std::string data_format = ctx.Attr<std::string>("data_format"); // change:
// delete
std::string data_format = "AnyLayout";
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_CUDA
......@@ -114,8 +141,8 @@ framework::OpKernelType PoolOp::GetExpectedKernelType(
}
void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) must not be null.");
PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true,
"Input(X@GRAD) should not be null.");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
......@@ -123,7 +150,9 @@ void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = ctx.Attr<std::string>("data_format");
// std::string data_format = ctx.Attr<std::string>("data_format"); //
// change:delete
std::string data_format = "AnyLayout";
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_CUDA
......@@ -186,8 +215,8 @@ void Pool2dOpMaker::Make() {
// TypedAttrChecker don't support vector type.)
AddAttr<std::vector<int>>(
"paddings",
"(vector<int>, default {0,0}), paddings(height, width) of pooling "
"operator."
"(vector<int>, default {0,0}), paddings(height_top, height_bottom, "
"width_left, wifth_right) of pooling operator."
"If global_pooling = true, paddings and kernel size will be ignored.")
.SetDefault({0, 0});
AddAttr<bool>(
......@@ -206,7 +235,7 @@ void Pool2dOpMaker::Make() {
AddAttr<bool>(
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
"(bool, default false) Only used in cudnn kernel, need install cudnn.")
.SetDefault(false);
AddAttr<bool>(
"ceil_mode",
......@@ -215,7 +244,7 @@ void Pool2dOpMaker::Make() {
"the floor function will be used.")
.SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
"(bool, default false) Only used in mkldnn kernel.")
.SetDefault(false);
AddAttr<bool>("use_quantizer",
"(bool, default false) "
......@@ -229,18 +258,24 @@ void Pool2dOpMaker::Make() {
"An optional string from: \"NHWC\", \"NCHW\". "
"Defaults to \"NHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ")
.SetDefault("AnyLayout");
.SetDefault("NCHW");
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddAttr<std::string>(
"padding_algorithm",
"(string, default \"EXPLICIT\") An optional string from: \"EXPLICIT\","
"\"SAME\",\"VALID\". Set to \"EXPLICIT\" for explicit padding. "
"Set to \"SAME\" or \"VALID\" for algorithm of padding. ")
.SetDefault("EXPLICIT");
// TODO(dzhwinter): need to registered layout transform function
AddComment(R"DOC(
The pooling2d operation calculates the output based on
the input, pooling_type and ksize, strides, paddings parameters.
Input(X) and output(Out) are in NCHW or NHWC format, where N is batch size, C is the
number of channels, H is the height of the feature, and W is the width of the feature.
Parameters(ksize, strides) are two elements. These two elements represent
height and width, respectively. Parameters(paddings) are two or four elements:
(pad_height, pad_width) or (pad_height_top, pad_height_bottom, pad_width_left,
pad_width_right).
......@@ -256,30 +291,47 @@ Example:
Out shape: $(N, C, H_{out}, W_{out})$
For pool_padding = "SAME":
$$
H_{out} = \\frac{(H_{in} + strides[0] - 1)}{strides[0]}
$$
$$
W_{out} = \\frac{(W_{in} + strides[1] - 1)}{strides[1]}
$$
For pool_padding = "VALID":
$$
H_{out} = \\frac{(H_{in} - ksize[0] + strides[0])}{strides[0]}
$$
$$
W_{out} = \\frac{(W_{in} - ksize[1] + strides[1])}{strides[1]}
$$
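For example, with H_in = 5, ksize[0] = 3 and strides[0] = 2: "SAME" gives H_out = (5 + 2 - 1) / 2 = 3, while "VALID" gives H_out = (5 - 3 + 2) / 2 = 2.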
For ceil_mode = false:
$$
H_{out} = \\frac{(H_{in} - ksize[0] + pad_height_top + pad_height_bottom)}{strides[0]} + 1
$$
$$
W_{out} = \\frac{(W_{in} - ksize[1] + pad_width_left + pad_width_right)}{strides[1]} + 1
$$
For ceil_mode = true:
$$
H_{out} = \\frac{(H_{in} - ksize[0] + pad_height_top + pad_height_bottom + strides[0] - 1)}{strides[0]} + 1
$$
$$
W_{out} = \\frac{(W_{in} - ksize[1] + pad_width_left + pad_width_right + strides[1] - 1)}{strides[1]} + 1
$$
For exclusive = false:
$$
hstart = i * strides[0] - pad_height_top
$$
$$
hend = hstart + ksize[0]
$$
$$
wstart = j * strides[1] - pad_width_left
$$
$$
wend = wstart + ksize[1]
......@@ -290,13 +342,13 @@ Example:
For exclusive = true:
$$
hstart = max(0, i * strides[0] - pad_height_top)
$$
$$
hend = min(H, hstart + ksize[0])
$$
$$
wstart = max(0, j * strides[1] - pad_width_left)
$$
$$
wend = min(W, wstart + ksize[1])
......@@ -319,13 +371,14 @@ class PoolOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
void Pool3dOpMaker::Make() {
AddInput("X",
"(Tensor) The input tensor of pooling operator. "
"The format of input tensor is NCDHW, where N is batch size, C is "
"The format of input tensor is NCDHW or NDHWC, where N is batch "
"size, C is "
"the number of channels, and D, H and W is the depth, height and "
"width of "
"the feature, respectively.");
AddOutput("Out",
"(Tensor) The output tensor of pooling operator."
"The format of output tensor is also NCDHW, "
"The format of output tensor is also NCDHW or NDHWC, "
"where N is batch size, C is "
"the number of channels, and D, H and W is the depth, height and "
"width of the feature, respectively.");
......@@ -355,8 +408,10 @@ void Pool3dOpMaker::Make() {
// TypedAttrChecker don't support vector type.)
AddAttr<std::vector<int>>(
"paddings",
"(vector<int>, default {0,0,0}), paddings(depth, height, "
"width) of pooling operator. "
"(vector<int>, default {0,0,0}), paddings(pad_depth_front, "
"pad_depth_back, "
"pad_height_top, pad_height_bottom, pad_width_left, pad_width_right"
") of pooling operator. "
"If global_pooling = true, ksize and paddings will be ignored.")
.SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently,
// TypedAttrChecker don't support vector type.)
......@@ -376,7 +431,7 @@ void Pool3dOpMaker::Make() {
AddAttr<bool>(
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
"(bool, default false) Only used in cudnn kernel, need install cudnn.")
.SetDefault(false);
AddAttr<bool>(
"ceil_mode",
......@@ -389,11 +444,17 @@ void Pool3dOpMaker::Make() {
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
"An optional string from: \"NHWC\", \"NCHW\". "
"Defaults to \"NHWC\". Specify the data format of the output data, "
"(string, default NCDHW) Only used in "
"An optional string from: \"NDHWC\", \"NCDHW\". "
"Defaults to \"NDHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ")
.SetDefault("AnyLayout");
.SetDefault("NCDHW");
AddAttr<std::string>(
"padding_algorithm",
"(string, default \"EXPLICIT\") An optional string from: \"EXPLICIT\","
"\"SAME\",\"VALID\". Set to \"EXPLICIT\" for explicit padding. "
"Set to \"SAME\" or \"VALID\" for algorithm of padding. ")
.SetDefault("EXPLICIT");
// TODO(dzhwinter): need to registered layout transform function
AddComment(R"DOC(
......@@ -401,7 +462,7 @@ Pool3d Operator.
The pooling3d operation calculates the output based on
the input, pooling_type, ksize, strides, and paddings parameters.
Input(X) and output(Out) are in NCDHW or NDHWC format, where N is batch
size, C is the number of channels, and D, H and W are the depth, height and
width of the feature, respectively. Parameters(ksize, strides, paddings)
are three elements. These three elements represent depth, height and
......@@ -412,42 +473,65 @@ Example:
X shape: $(N, C, D_{in}, H_{in}, W_{in})$
Output:
Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
For pool_padding = "SAME":
$$
D_{out} = \\frac{(D_{in} + strides[0] - 1)}{strides[0]}
$$
$$
H_{out} = \\frac{(H_{in} + strides[1] - 1)}{strides[1]}
$$
$$
W_{out} = \\frac{(W_{in} + strides[2] - 1)}{strides[2]}
$$
For pool_padding = "VALID":
$$
D_{out} = \\frac{(D_{in} - ksize[0] + strides[0])}{strides[0]}
$$
$$
H_{out} = \\frac{(H_{in} - ksize[1] + strides[1])}{strides[1]}
$$
$$
W_{out} = \\frac{(W_{in} - ksize[2] + strides[2])}{strides[2]}
$$
For ceil_mode = false:
$$
D_{out} = \\frac{(D_{in} - ksize[0] + pad_depth_front + pad_depth_back)}{strides[0]} + 1
$$
$$
H_{out} = \\frac{(H_{in} - ksize[1] + pad_height_top + pad_height_bottom)}{strides[1]} + 1
$$
$$
W_{out} = \\frac{(W_{in} - ksize[2] + pad_width_left + pad_width_right)}{strides[2]} + 1
$$
For ceil_mode = true:
$$
D_{out} = \\frac{(D_{in} - ksize[0] + pad_depth_front + pad_depth_back + strides[0] -1)}{strides[0]} + 1
$$
$$
H_{out} = \\frac{(H_{in} - ksize[1] + pad_height_top + pad_height_bottom + strides[1] -1)}{strides[1]} + 1
$$
$$
W_{out} = \\frac{(W_{in} - ksize[2] + pad_width_left + pad_width_right + strides[2] -1)}{strides[2]} + 1
$$
For exclusive = false:
$$
dstart = i * strides[0] - pad_depth_front
$$
$$
dend = dstart + ksize[0]
$$
$$
hstart = j * strides[1] - pad_height_top
$$
$$
hend = hstart + ksize[1]
$$
$$
wstart = k * strides[2] - pad_width_left
$$
$$
wend = wstart + ksize[2]
......@@ -458,16 +542,19 @@ Example:
For exclusive = true:
$$
dstart = max(0, i * strides[0] - pad_depth_front)
$$
$$
dend = min(D, dstart + ksize[0])
$$
$$
hstart = max(0, j * strides[1] - pad_height_top)
$$
$$
hend = min(H, hstart + ksize[1])
$$
$$
wstart = max(0, k * strides[2] - pad_width_left)
$$
$$
wend = min(W, wstart + ksize[2])
......
......@@ -14,13 +14,13 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/pooling.h"
namespace paddle {
namespace operators {
......@@ -57,6 +57,57 @@ class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
inline void UpdatePadding(std::vector<int>* paddings, const bool global_pooling,
const bool adaptive,
const std::string padding_algorithm,
const framework::DDim data_dims,
const std::vector<int>& strides,
const std::vector<int>& ksize) {
// set padding size == data_dims.size() * 2
auto data_shape = framework::vectorize<int>(data_dims);
if (paddings->size() == data_dims.size()) {
for (size_t i = 0; i < data_dims.size(); ++i) {
int copy_pad = *(paddings->begin() + 2 * i);
paddings->insert(paddings->begin() + 2 * i + 1, copy_pad);
}
} else {
PADDLE_ENFORCE_EQ(
data_dims.size() * 2, paddings->size(),
"Paddings size should be the same or twice as the pooling size.");
}
// when padding_desc is "VALID" or "SAME"
if (padding_algorithm == "SAME") {
for (size_t i = 0; i < data_dims.size(); ++i) {
      int out_size = (data_dims[i] + strides[i] - 1) / strides[i];
int pad_sum =
std::max((out_size - 1) * strides[i] + ksize[i] - data_shape[i], 0);
int pad_0 = pad_sum / 2;
int pad_1 = pad_sum - pad_0;
*(paddings->begin() + i * 2) = pad_0;
*(paddings->begin() + i * 2 + 1) = pad_1;
}
} else if (padding_algorithm == "VALID") {
for (auto it = paddings->begin(); it != paddings->end(); it++) {
*it = 0;
}
}
  // if global_pooling == true or adaptive == true, paddings will be ignored
if (global_pooling || adaptive) {
for (auto it = paddings->begin(); it != paddings->end(); it++) {
*it = 0;
}
}
}
inline void UpdateKsize(std::vector<int>* ksize,
const framework::DDim data_dims) {
ksize->resize(static_cast<size_t>(data_dims.size()));
for (size_t i = 0; i < ksize->size(); ++i) {
*(ksize->begin() + i) = static_cast<int>(data_dims[i]);
}
}
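For intuition, here is a small framework-free sketch of the "SAME" rule implemented by UpdatePadding (the standalone form and names are illustrative, not Paddle API): each spatial dimension receives just enough total padding that out_size == ceil(in_size / stride), with the odd pixel going to the trailing side.

#include <algorithm>
#include <cassert>
#include <vector>

// Illustrative re-statement of the "SAME" branch above on plain vectors.
std::vector<int> SamePaddings(const std::vector<int>& dims,
                              const std::vector<int>& strides,
                              const std::vector<int>& ksize) {
  std::vector<int> pads(dims.size() * 2, 0);  // {pad_0, pad_1} per dimension
  for (size_t i = 0; i < dims.size(); ++i) {
    int out_size = (dims[i] + strides[i] - 1) / strides[i];
    int pad_sum = std::max((out_size - 1) * strides[i] + ksize[i] - dims[i], 0);
    pads[2 * i] = pad_sum / 2;
    pads[2 * i + 1] = pad_sum - pad_sum / 2;
  }
  return pads;
}

int main() {
  // H = W = 5, kernel 3, stride 2: out_size = 3, pad_sum = 2, one pixel per side.
  assert((SamePaddings({5, 5}, {2, 2}, {3, 3}) == std::vector<int>{1, 1, 1, 1}));
  return 0;
}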
template <typename DeviceContext, typename T>
class PoolKernel : public framework::OpKernel<T> {
......@@ -69,14 +120,36 @@ class PoolKernel : public framework::OpKernel<T> {
std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::string data_format = context.Attr<std::string>("data_format");
bool exclusive = context.Attr<bool>("exclusive");
bool adaptive = context.Attr<bool>("adaptive");
if (context.Attr<bool>("global_pooling")) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0;
ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
bool global_pooling = context.Attr<bool>("global_pooling");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
// update paddings
auto in_x_dims = in_x->dims();
framework::DDim data_dims;
if (channel_last) {
data_dims = framework::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
} else {
data_dims = framework::slice_ddim(in_x_dims, 2, in_x_dims.size());
}
UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm,
data_dims, strides, ksize);
if (data_dims.size() * 2 == paddings.size()) {
for (size_t i = 0; i < data_dims.size(); ++i) {
paddings.erase(paddings.begin() + i + 1);
}
}
if (global_pooling) {
UpdateKsize(&ksize, data_dims);
}
auto& dev_ctx = context.template device_context<DeviceContext>();
switch (ksize.size()) {
case 2: {
......@@ -85,16 +158,16 @@ class PoolKernel : public framework::OpKernel<T> {
DeviceContext, paddle::operators::math::MaxPool<T>, T>
pool2d_forward;
paddle::operators::math::MaxPool<T> pool_process;
pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
true, false, out);
pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format,
pool_process, true, false, out);
} else if (pooling_type == "avg") {
paddle::operators::math::Pool2dFunctor<
DeviceContext, paddle::operators::math::AvgPool<T>, T>
pool2d_forward;
paddle::operators::math::AvgPool<T> pool_process;
pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
exclusive, adaptive, out);
pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format,
pool_process, exclusive, adaptive, out);
}
} break;
case 3: {
......@@ -103,15 +176,16 @@ class PoolKernel : public framework::OpKernel<T> {
DeviceContext, paddle::operators::math::MaxPool<T>, T>
pool3d_forward;
paddle::operators::math::MaxPool<T> pool_process;
pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
true, false, out);
pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format,
pool_process, true, false, out);
} else if (pooling_type == "avg") {
paddle::operators::math::Pool3dFunctor<
DeviceContext, paddle::operators::math::AvgPool<T>, T>
pool3d_forward;
paddle::operators::math::AvgPool<T> pool_process;
pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
exclusive, adaptive, out);
pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format,
pool_process, exclusive, adaptive, out);
}
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
......@@ -135,13 +209,33 @@ class PoolGradKernel : public framework::OpKernel<T> {
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
bool exclusive = context.Attr<bool>("exclusive");
bool adaptive = context.Attr<bool>("adaptive");
std::string data_format = context.Attr<std::string>("data_format");
bool global_pooling = context.Attr<bool>("global_pooling");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
if (context.Attr<bool>("global_pooling")) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0;
ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
// update paddings
auto in_x_dims = in_x->dims();
framework::DDim data_dims;
if (channel_last) {
data_dims = framework::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
} else {
data_dims = framework::slice_ddim(in_x_dims, 2, in_x_dims.size());
}
UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm,
data_dims, strides, ksize);
if (data_dims.size() * 2 == paddings.size()) {
for (size_t i = 0; i < data_dims.size(); ++i) {
paddings.erase(paddings.begin() + i + 1);
}
}
if (global_pooling) {
UpdateKsize(&ksize, data_dims);
}
auto& dev_ctx = context.template device_context<DeviceContext>();
if (in_x_grad) {
in_x_grad->mutable_data<T>(context.GetPlace());
......@@ -154,15 +248,15 @@ class PoolGradKernel : public framework::OpKernel<T> {
paddle::operators::math::MaxPool2dGradFunctor<DeviceContext, T>
pool2d_backward;
pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
paddings, in_x_grad);
paddings, data_format, in_x_grad);
} else if (pooling_type == "avg") {
paddle::operators::math::Pool2dGradFunctor<
DeviceContext, paddle::operators::math::AvgPoolGrad<T>, T>
pool2d_backward;
paddle::operators::math::AvgPoolGrad<T> pool_process;
pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
paddings, pool_process, exclusive, adaptive,
in_x_grad);
paddings, data_format, pool_process, exclusive,
adaptive, in_x_grad);
}
} break;
case 3: {
......@@ -170,15 +264,15 @@ class PoolGradKernel : public framework::OpKernel<T> {
paddle::operators::math::MaxPool3dGradFunctor<DeviceContext, T>
pool3d_backward;
pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
paddings, in_x_grad);
paddings, data_format, in_x_grad);
} else if (pooling_type == "avg") {
paddle::operators::math::Pool3dGradFunctor<
DeviceContext, paddle::operators::math::AvgPoolGrad<T>, T>
pool3d_backward;
paddle::operators::math::AvgPoolGrad<T> pool_process;
pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
paddings, pool_process, exclusive, adaptive,
in_x_grad);
paddings, data_format, pool_process, exclusive,
adaptive, in_x_grad);
}
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
......
......@@ -72,6 +72,7 @@ enum class DataLayout { // Not use
kNHWC,
kNCHW,
kNCDHW,
  kNDHWC,
kNCHW_VECT_C,
};
......@@ -212,6 +213,8 @@ inline cudnnTensorFormat_t GetCudnnTensorFormat(
return CUDNN_TENSOR_NCHW;
case DataLayout::kNCDHW:
return CUDNN_TENSOR_NCHW; // NOTE: cudnn treat NdTensor as the same
case DataLayout::kNDHWC:
      return CUDNN_TENSOR_NHWC;
default:
PADDLE_THROW("Unknown cudnn equivalent for order");
}
......@@ -238,14 +241,31 @@ class ScopedTensorDescriptor {
strides[i] = dims[i + 1] * strides[i + 1];
}
// Update tensor descriptor dims setting if groups > 1
// NOTE: Assume using NCHW or NCDHW order
std::vector<int> dims_with_group(dims.begin(), dims.end()); // copy
// NOTE: Here, assume NCHW or NCDHW order
std::vector<int> dims_with_group(dims.begin(), dims.end());
if (groups > 1) {
dims_with_group[1] = dims_with_group[1] / groups;
}
if (dims.size() == 4) {
if (format == CUDNN_TENSOR_NCHW) {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor(
desc_, type, dims_with_group.size(), dims_with_group.data(),
strides.data()));
} else { // CUDNN_TENSOR_NHWC
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensor4dDescriptor(
desc_, format, type, dims[0], dims[3], dims[1], dims[2]));
}
} else if (dims.size() == 5) {
if (format == CUDNN_TENSOR_NCHW) {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor(
desc_, type, dims_with_group.size(), dims_with_group.data(),
strides.data()));
} else { // CUDNN_TENSOR_NHWC
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptorEx(
desc_, format, type, dims.size(), dims.data()));
}
}
return desc_;
}
......
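The descriptor hunk above computes plain row-major strides, which describe NCHW/NCDHW memory; for the channel-last formats it instead passes an explicit CUDNN_TENSOR_NHWC format to cudnnSetTensor4dDescriptor (4-D) or to the newly loaded cudnnSetTensorNdDescriptorEx (5-D). A quick sketch of the stride computation, for intuition:

def row_major_strides(dims):
    strides = [1] * len(dims)
    for i in range(len(dims) - 2, -1, -1):
        strides[i] = dims[i + 1] * strides[i + 1]
    return strides

print(row_major_strides([2, 3, 5, 5]))  # [75, 25, 5, 1], i.e. NCHW memory;
# an NHWC tensor cannot be described by these strides, hence the
# format-aware setter calls in the branch above.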
......@@ -126,7 +126,8 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnRNNBackwardWeights); \
__macro(cudnnRNNForwardInference); \
__macro(cudnnDestroyDropoutDescriptor); \
__macro(cudnnDestroyRNNDescriptor);
__macro(cudnnDestroyRNNDescriptor); \
__macro(cudnnSetTensorNdDescriptorEx);
CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
......
......@@ -2871,15 +2871,16 @@ def pool2d(input,
use_cudnn=True,
ceil_mode=False,
name=None,
exclusive=True):
exclusive=True,
data_format="NCHW"):
"""
${comment}
Args:
input (Variable): The input tensor of pooling operator. The format of
input tensor is NCHW, where N is batch size, C is
the number of channels, H is the height of the
feature, and W is the width of the feature.
input tensor is `"NCHW"` or `"NHWC"`, where `N` is batch size, `C` is
the number of channels, `H` is the height of the
feature, and `W` is the width of the feature.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be a square of an int.
......@@ -2887,8 +2888,13 @@ def pool2d(input,
pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
it must contain two integers, (pool_stride_Height, pool_stride_Width).
Otherwise, the pool stride size will be a square of an int.
pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple,
it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width).
pool_padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, it must be
'VALID' or 'SAME', which specifies the padding algorithm. If pool padding is a tuple or list,
it can take three forms: `[pad_height, pad_width]`,
`[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, or, when `data_format` is `"NCHW"`,
`[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`; when
`data_format` is `"NHWC"`, the nested form is
`[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Otherwise, the pool padding will be a square of an int.
global_pooling (bool): ${global_pooling_comment}
use_cudnn (bool): ${use_cudnn_comment}
......@@ -2896,55 +2902,125 @@ def pool2d(input,
name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically.
exclusive (bool): Whether to exclude padding points in average pooling
mode, default is true
mode, default is `True`.
data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
Returns:
Variable: The pooling result.
Raises:
ValueError: If 'pool_type' is not "max" nor "avg"
ValueError: If 'global_pooling' is False and 'pool_size' is -1
ValueError: If 'use_cudnn' is not a bool value.
ValueError: If `pool_type` is not "max" nor "avg"
ValueError: If `global_pooling` is False and `pool_size` is -1
ValueError: If `use_cudnn` is not a bool value.
Examples:
.. code-block:: python
import paddle.fluid as fluid
data = fluid.layers.data(
name='data', shape=[3, 32, 32], dtype='float32')
pool2d = fluid.layers.pool2d(
input=data,
pool_size=2,
pool_type='max',
pool_stride=1,
global_pooling=False)
name='data', shape=[10, 3, 32, 32], append_batch_size=False, dtype='float32')
# example 1:
# Attr(pool_padding) is a list with 4 elements, Attr(data_format) is "NCHW".
out_1 = fluid.layers.pool2d(
input = data,
pool_size = 3,
pool_type = "avg",
pool_stride = 1,
pool_padding = [1, 2, 1, 0],
data_format = "NCHW")
# example 2:
# Attr(pool_padding) is a string, Attr(data_format) is "NCHW".
out_2 = fluid.layers.pool2d(
input = data,
pool_size = 3,
pool_type = "avg",
pool_stride = 1,
pool_padding = "VALID",
data_format = "NCHW")
"""
if pool_type not in ["max", "avg"]:
raise ValueError(
"Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
"Unknown Attr(pool_type): '%s'. It can only be 'max' or 'avg'.",
str(pool_type))
if global_pooling is False and pool_size == -1:
raise ValueError(
"When the global_pooling is False, pool_size must be passed "
"and be a valid value. Received pool_size: " + str(pool_size))
"When Attr(global_pooling) is False, Attr(pool_size) must be passed "
"and be a valid value. Received pool_size: %s." % str(pool_size))
if not isinstance(use_cudnn, bool):
raise ValueError("Attr(use_cudnn) should be True or False. Received "
"Attr(use_cudnn): %s." % str(use_cudnn))
if data_format not in ["NCHW", "NHWC"]:
raise ValueError(
"Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
"Attr(data_format): %s." % str(data_format))
pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
pool_padding = utils.convert_to_list(pool_padding, 2, 'pool_padding')
pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride')
if not isinstance(use_cudnn, bool):
raise ValueError("use_cudnn should be True or False")
def update_padding(padding, data_format):
def is_list_or_tuple(ele):
if isinstance(ele, list) or isinstance(ele, tuple):
return True
return False
l_type = 'pool2d'
if is_list_or_tuple(padding) and len(padding) == 4:
if is_list_or_tuple(padding[0]) and (data_format == "NCHW"):
if not (padding[0] == [0, 0] and padding[1] == [0, 0]):
raise ValueError(
"Non-zero pool_padding(%s) in the batch or channel dimensions "
"is not supported." % str(padding))
padding = padding[2:4]
padding = [ele for a_list in padding for ele in a_list]
elif is_list_or_tuple(padding[0]) and (data_format == "NHWC"):
if not (padding[0] == [0, 0] and padding[3] == [0, 0]):
raise ValueError(
"Non-zero pool_padding(%s) in the batch or channel dimensions "
"is not supported." % str(padding))
padding = padding[1:3]
padding = [ele for a_list in padding for ele in a_list]
padding = utils.convert_to_list(padding, 4, 'padding')
helper = LayerHelper(l_type, **locals())
else:
padding = utils.convert_to_list(padding, 2, 'padding')
return padding
padding_algorithm = "EXPLICIT"
if isinstance(pool_padding, str):
pool_padding = pool_padding.upper()
if pool_padding not in ["SAME", "VALID"]:
raise ValueError(
"Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'."
% str(pool_padding))
if pool_padding == "VALID":
padding_algorithm = "VALID"
pool_padding = [0, 0, 0, 0]
if ceil_mode != False:
raise ValueError(
"When Attr(pool_padding) is \"VALID\", Attr(ceil_mode) must be False. "
"Received ceil_mode: True.")
elif pool_padding == "SAME":
padding_algorithm = "SAME"
pool_padding = [0, 0, 0, 0]
pool_padding = update_padding(pool_padding, data_format)
op_type = 'pool2d'
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type=l_type,
type=op_type,
inputs={"X": input},
outputs={"Out": pool_out},
attrs={
......@@ -2953,10 +3029,12 @@ def pool2d(input,
"global_pooling": global_pooling,
"strides": pool_stride,
"paddings": pool_padding,
"padding_algorithm": padding_algorithm,
"use_cudnn": use_cudnn,
"ceil_mode": ceil_mode,
"use_mkldnn": False,
"exclusive": exclusive,
"data_format": data_format,
})
return pool_out
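For an NCHW input the three accepted `pool_padding` spellings are equivalent; `update_padding` reduces each of them to the same flat attribute before `append_op`. A usage sketch (shapes chosen arbitrarily):

import paddle.fluid as fluid

data = fluid.layers.data(
    name='x', shape=[2, 3, 8, 8], append_batch_size=False, dtype='float32')
# All three spell pad_top = pad_bottom = 1, pad_left = pad_right = 2:
a = fluid.layers.pool2d(data, pool_size=3, pool_padding=[1, 2])
b = fluid.layers.pool2d(data, pool_size=3, pool_padding=[1, 1, 2, 2])
c = fluid.layers.pool2d(
    data, pool_size=3, pool_padding=[[0, 0], [0, 0], [1, 1], [2, 2]])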
......@@ -2972,30 +3050,43 @@ def pool3d(input,
use_cudnn=True,
ceil_mode=False,
name=None,
exclusive=True):
exclusive=True,
data_format="NCDHW"):
"""
${comment}
Args:
input (Variable): The input tensor of pooling operator. The format of
input tensor is NCDHW, where N is batch size, C is
the number of channels, D is the depth of the feature,
H is the height of the feature, and W is the width
input tensor is `"NCDHW"` or `"NDHWC"`, where `N` is batch size, `C` is
the number of channels, `D` is the depth of the feature,
`H` is the height of the feature, and `W` is the width
of the feature.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size
is a tuple or list, it must contain three integers,
(pool_size_Depth, pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be the cube of an int.
pool_type (string): ${pooling_type_comment}
pool_stride (int): stride of the pooling layer.
pool_padding (int): padding size.
pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`.
Otherwise, the pool stride size will be a cube of an int.
pool_padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, it must be
'VALID' or 'SAME', which specifies the padding algorithm. If pool padding is a tuple or list,
it can take three forms: `[pad_depth, pad_height, pad_width]`,
`[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
or, when `data_format` is `"NCDHW"`,
`[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`;
when `data_format` is `"NDHWC"`, the nested form is
`[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
global_pooling (bool): ${global_pooling_comment}
use_cudnn (bool): ${use_cudnn_comment}
ceil_mode (bool): ${ceil_mode_comment}
name (str): A name for this layer(optional). If set None, the layer
will be named automatically.
exclusive (bool): Whether to exclude padding points in average pooling
mode, default is true
mode, default is `True`.
data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_depth, input_height, input_width]`.
Returns:
Variable: output of pool3d layer.
......@@ -3005,39 +3096,114 @@ def pool3d(input,
.. code-block:: python
import paddle.fluid as fluid
data = fluid.layers.data(
name='data', shape=[3, 32, 32, 32], dtype='float32')
pool3d = fluid.layers.pool3d(
input=data,
pool_size=2,
pool_type='max',
pool_stride=1,
global_pooling=False)
name='data', shape=[10, 3, 32, 32, 32], append_batch_size=False, dtype='float32')
# example 1:
# Attr(pool_padding) is a list with 6 elements, Attr(data_format) is "NCDHW".
out_1 = fluid.layers.pool3d(
input = data,
pool_size = 2,
pool_type = "avg",
pool_stride = 1,
pool_padding = [1, 2, 1, 0, 1, 2],
global_pooling = False,
data_format = "NCDHW")
# example 2:
# Attr(pool_padding) is a string, Attr(data_format) is "NCDHW".
out_2 = fluid.layers.pool3d(
input = data,
pool_size = 3,
pool_type = "avg",
pool_stride = 1,
pool_padding = "VALID",
global_pooling = False,
data_format = "NCDHW")
"""
if pool_type not in ["max", "avg"]:
raise ValueError(
"Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
"Unknown Attr(pool_type): '%s'. It can only be 'max' or 'avg'.",
str(pool_type))
if global_pooling is False and pool_size == -1:
raise ValueError(
"When the global_pooling is False, pool_size must be passed "
"and be a valid value. Received pool_size: " + str(pool_size))
"When Attr(global_pooling) is False, Attr(pool_size) must be passed "
"and be a valid value. Received Attr(pool_size): %s." %
str(pool_size))
if not isinstance(use_cudnn, bool):
raise ValueError("Attr(use_cudnn) should be True or False. Received "
"Attr(use_cudnn): %s. " % str(use_cudnn))
if data_format not in ["NCDHW", "NDHWC"]:
raise ValueError(
"Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
"Attr(data_format): %s" % str(data_format))
pool_size = utils.convert_to_list(pool_size, 3, 'pool_size')
pool_padding = utils.convert_to_list(pool_padding, 3, 'pool_padding')
pool_stride = utils.convert_to_list(pool_stride, 3, 'pool_stride')
if not isinstance(use_cudnn, bool):
raise ValueError("use_cudnn should be True or False")
def update_padding(padding, data_format):
def is_list_or_tuple(ele):
if isinstance(ele, (list, tuple)):
return True
return False
l_type = "pool3d"
helper = LayerHelper(l_type, **locals())
if is_list_or_tuple(padding) and len(padding) == 5:
if is_list_or_tuple(padding[0]) and (data_format == "NCDHW"):
if not (padding[0] == [0, 0] and padding[1] == [0, 0]):
raise ValueError(
"Non-zero pool_padding(%s) in the batch or channel dimensions "
"is not supported." % str(padding))
padding = padding[2:5]
padding = [ele for a_list in padding for ele in a_list]
elif is_list_or_tuple(padding[0]) and (data_format == "NDHWC"):
if not (padding[0] == [0, 0] and padding[4] == [0, 0]):
raise ValueError(
"Non-zero pool_padding(%s) in the batch or channel dimensions "
"is not supported." % str(padding))
padding = padding[1:4]
padding = [ele for a_list in padding for ele in a_list]
padding = utils.convert_to_list(padding, 6, 'padding')
elif is_list_or_tuple(padding) and len(padding) == 6:
padding = utils.convert_to_list(padding, 6, 'padding')
else:
padding = utils.convert_to_list(padding, 3, 'padding')
return padding
padding_algorithm = "EXPLICIT"
if isinstance(pool_padding, str):
pool_padding = pool_padding.upper()
if pool_padding not in ["SAME", "VALID"]:
raise ValueError(
"Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'."
% str(pool_padding))
if pool_padding == "VALID":
padding_algorithm = "VALID"
pool_padding = [0, 0, 0, 0, 0, 0]
if ceil_mode != False:
raise ValueError(
"When Attr(pool_padding) is \"VALID\", ceil_mode must be False. "
"Received ceil_mode: True.")
elif pool_padding == "SAME":
padding_algorithm = "SAME"
pool_padding = [0, 0, 0, 0, 0, 0]
pool_padding = update_padding(pool_padding, data_format)
op_type = "pool3d"
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type=l_type,
type=op_type,
inputs={"X": input},
outputs={"Out": pool_out},
attrs={
......@@ -3046,10 +3212,12 @@ def pool3d(input,
"global_pooling": global_pooling,
"strides": pool_stride,
"paddings": pool_padding,
"padding_algorithm": padding_algorithm,
"use_cudnn": use_cudnn,
"ceil_mode": ceil_mode,
"use_mkldnn": False,
"exclusive": exclusive,
"data_format": data_format,
})
return pool_out
......
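pool3d applies the same normalization with a depth axis added; in the 5-D nested form the batch and channel pairs must be [0, 0] and are stripped by update_padding. A usage sketch for a channel-last input:

import paddle.fluid as fluid

vol = fluid.layers.data(
    name='vol', shape=[2, 8, 8, 8, 3], append_batch_size=False, dtype='float32')
out = fluid.layers.pool3d(
    vol,
    pool_size=3,
    pool_padding=[[0, 0], [1, 0], [1, 1], [2, 2], [0, 0]],  # -> [1, 0, 1, 1, 2, 2]
    data_format="NDHWC")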
......@@ -20,6 +20,7 @@ import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
import paddle.fluid as fluid
def adaptive_start_index(index, input_size, output_size):
......@@ -118,26 +119,139 @@ def avg_pool2D_forward_naive(x,
return out
def pool2D_forward_naive(x,
ksize,
strides,
paddings,
global_pool=0,
ceil_mode=False,
exclusive=True,
adaptive=False,
data_format='NCHW',
pool_type="max",
padding_algorithm="EXPLICIT"):
# update paddings
def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
padding = []
for input_size, filter_size, stride_size in zip(input_shape, pool_size,
pool_stride):
out_size = int((input_size + stride_size - 1) / stride_size)
pad_sum = np.max((
(out_size - 1) * stride_size + filter_size - input_size, 0))
pad_0 = int(pad_sum / 2)
pad_1 = int(pad_sum - pad_0)
padding.append(pad_0)
padding.append(pad_1)
return padding
if isinstance(padding_algorithm, str):
padding_algorithm = padding_algorithm.upper()
if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]:
raise ValueError("Unknown Attr(padding_algorithm): '%s'. "
"It can only be 'SAME' or 'VALID'." %
str(padding_algorithm))
if padding_algorithm == "VALID":
paddings = [0, 0, 0, 0]
if ceil_mode != False:
raise ValueError(
"When Attr(pool_padding) is \"VALID\", Attr(ceil_mode)"
" must be False. "
"Received ceil_mode: True.")
elif padding_algorithm == "SAME":
input_data_shape = []
if data_format == "NCHW":
input_data_shape = x.shape[2:4]
elif data_format == "NHWC":
input_data_shape = x.shape[1:3]
paddings = _get_padding_with_SAME(input_data_shape, ksize, strides)
assert len(paddings) == 2 or len(paddings) == 4
is_sys = True if len(paddings) == 2 else False
N = x.shape[0]
C, H, W = [x.shape[1], x.shape[2], x.shape[3]] if data_format == 'NCHW' \
else [x.shape[3], x.shape[1], x.shape[2]]
if global_pool == 1:
ksize = [H, W]
paddings = [0 for _ in range(len(paddings))]
pad_h_up = paddings[0] if is_sys else paddings[0]
pad_h_down = paddings[0] if is_sys else paddings[1]
pad_w_left = paddings[1] if is_sys else paddings[2]
pad_w_right = paddings[1] if is_sys else paddings[3]
if adaptive:
H_out, W_out = ksize
else:
H_out = (H - ksize[0] + pad_h_up + pad_h_down + strides[0] - 1) // strides[0] + 1 \
if ceil_mode else (H - ksize[0] + pad_h_up + pad_h_down) // strides[0] + 1
W_out = (W - ksize[1] + pad_w_left + pad_w_right + strides[1] - 1) // strides[1] + 1 \
if ceil_mode else (W - ksize[1] + pad_w_left + pad_w_right) // strides[1] + 1
out = np.zeros((N, C, H_out, W_out)) if data_format == 'NCHW' \
else np.zeros((N, H_out, W_out, C))
for i in range(H_out):
if adaptive:
in_h_start = adaptive_start_index(i, H, ksize[0])
in_h_end = adaptive_end_index(i, H, ksize[0])
else:
in_h_start = np.max((i * strides[0] - pad_h_up, 0))
in_h_end = np.min((i * strides[0] + ksize[0] - pad_h_up, H))
for j in range(W_out):
if adaptive:
in_w_start = adaptive_start_index(j, W, ksize[1])
in_w_end = adaptive_end_index(j, W, ksize[1])
else:
in_w_start = np.max((j * strides[1] - pad_w_left, 0))
in_w_end = np.min((j * strides[1] + ksize[1] - pad_w_left, W))
if data_format == 'NCHW':
x_masked = x[:, :, in_h_start:in_h_end, in_w_start:in_w_end]
if pool_type == 'avg':
field_size = ((in_h_end - in_h_start) * (in_w_end - in_w_start)) \
if (exclusive or adaptive) else (ksize[0] * ksize[1])
out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size
elif pool_type == 'max':
out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
elif data_format == 'NHWC':
x_masked = x[:, in_h_start:in_h_end, in_w_start:in_w_end, :]
if pool_type == 'avg':
field_size = ((in_h_end - in_h_start) * (in_w_end - in_w_start)) \
if (exclusive or adaptive) else (ksize[0] * ksize[1])
out[:, i, j, :] = np.sum(x_masked, axis=(1, 2)) / field_size
elif pool_type == 'max':
out[:, i, j, :] = np.max(x_masked, axis=(1, 2))
return out
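A quick numeric check of _get_padding_with_SAME and the output-size formula used above:

H, k, s = 5, 3, 2
out = (H + s - 1) // s                               # SAME keeps out = ceil(5/2) = 3
pad_sum = max((out - 1) * s + k - H, 0)              # (3-1)*2 + 3 - 5 = 2
pad_0, pad_1 = pad_sum // 2, pad_sum - pad_sum // 2  # 1, 1
H_out = (H - k + pad_0 + pad_1) // s + 1             # (5 - 3 + 2) // 2 + 1 = 3
assert H_out == out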
class TestPool2D_Op(OpTest):
def setUp(self):
self.op_type = "pool2d"
self.use_cudnn = False
self.init_kernel_type()
self.use_mkldnn = False
self.init_data_type()
self.init_test_case()
self.padding_algorithm = "EXPLICIT"
self.init_paddings()
self.init_global_pool()
self.init_kernel_type()
self.init_pool_type()
self.init_ceil_mode()
self.init_exclusive()
self.init_adaptive()
if self.global_pool:
self.paddings = [0 for _ in range(len(self.paddings))]
self.init_data_format()
self.init_shape()
input = np.random.random(self.shape).astype(self.dtype)
output = (self.pool2D_forward_naive(
output = pool2D_forward_naive(
input, self.ksize, self.strides, self.paddings, self.global_pool,
self.ceil_mode, self.exclusive, self.adaptive,
self.dtype)).astype(self.dtype)
self.ceil_mode, self.exclusive, self.adaptive, self.data_format,
self.pool_type, self.padding_algorithm).astype(self.dtype)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
self.attrs = {
......@@ -149,10 +263,10 @@ class TestPool2D_Op(OpTest):
'use_cudnn': self.use_cudnn,
'use_mkldnn': self.use_mkldnn,
'ceil_mode': self.ceil_mode,
'data_format':
'AnyLayout', # TODO(dzhwinter) : should be fix latter
'data_format': self.data_format,
'exclusive': self.exclusive,
'adaptive': self.adaptive
'adaptive': self.adaptive,
"padding_algorithm": self.padding_algorithm,
}
self.outputs = {'Out': output}
......@@ -177,14 +291,22 @@ class TestPool2D_Op(OpTest):
elif self.pool_type != "max":
self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
def init_test_case(self):
def init_data_format(self):
self.data_format = "NCHW"
def init_shape(self):
self.shape = [2, 3, 5, 5]
def init_test_case(self):
self.ksize = [3, 3]
self.strides = [1, 1]
def init_paddings(self):
self.paddings = [0, 0]
self.padding_algorithm = "EXPLICIT"
def init_kernel_type(self):
pass
self.use_cudnn = False
def init_data_type(self):
self.dtype = np.float32
......@@ -208,9 +330,10 @@ class TestPool2D_Op(OpTest):
class TestCase1(TestPool2D_Op):
def init_test_case(self):
self.shape = [2, 3, 7, 7]
self.ksize = [3, 3]
self.strides = [1, 1]
def init_paddings(self):
self.paddings = [0, 0]
def init_pool_type(self):
......@@ -220,12 +343,16 @@ class TestCase1(TestPool2D_Op):
def init_global_pool(self):
self.global_pool = False
def init_shape(self):
self.shape = [2, 3, 7, 7]
class TestCase2(TestPool2D_Op):
def init_test_case(self):
self.shape = [2, 3, 7, 7]
self.ksize = [3, 3]
self.strides = [1, 1]
def init_paddings(self):
self.paddings = [1, 1]
def init_pool_type(self):
......@@ -235,6 +362,9 @@ class TestCase2(TestPool2D_Op):
def init_global_pool(self):
self.global_pool = False
def init_shape(self):
self.shape = [2, 3, 7, 7]
class TestCase3(TestPool2D_Op):
def init_pool_type(self):
......@@ -366,5 +496,715 @@ class TestAvgPoolAdaptive(TestCase1):
self.adaptive = True
#-------test pool2d with asymmetric padding-----
class TestPool2D_AsyPadding(TestPool2D_Op):
def init_test_case(self):
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [1, 0, 1, 2]
def init_shape(self):
self.shape = [2, 3, 5, 5]
class TestCase1_AsyPadding(TestCase1):
def init_test_case(self):
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [1, 0, 1, 0]
def init_shape(self):
self.shape = [2, 3, 7, 7]
class TestCase2_AsyPadding(TestCase2):
def init_test_case(self):
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [1, 2, 1, 2]
def init_shape(self):
self.shape = [2, 3, 7, 7]
class TestCase3_AsyPadding(TestCase3):
def init_test_case(self):
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [1, 0, 1, 2]
def init_shape(self):
self.shape = [2, 3, 5, 5]
class TestCase4_AsyPadding(TestCase4):
def init_test_case(self):
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [1, 0, 1, 0]
def init_shape(self):
self.shape = [2, 3, 7, 7]
class TestCase5_AsyPadding((TestCase5)):
def init_test_case(self):
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [2, 2, 1, 2]
def init_shape(self):
self.shape = [2, 3, 7, 7]
create_test_cudnn_class(TestPool2D_AsyPadding)
create_test_cudnn_class(TestCase1_AsyPadding)
create_test_cudnn_class(TestCase2_AsyPadding)
create_test_cudnn_class(TestCase3_AsyPadding)
create_test_cudnn_class(TestCase4_AsyPadding)
create_test_cudnn_class(TestCase5_AsyPadding)
create_test_cudnn_fp16_class(TestPool2D_AsyPadding)
create_test_cudnn_fp16_class(TestCase1_AsyPadding, check_grad=False)
create_test_cudnn_fp16_class(TestCase2_AsyPadding)
create_test_cudnn_fp16_class(TestCase3_AsyPadding)
create_test_cudnn_fp16_class(TestCase4_AsyPadding)
create_test_cudnn_fp16_class(TestCase5_AsyPadding)
create_test_cudnn_use_ceil_class(TestPool2D_AsyPadding)
create_test_cudnn_use_ceil_class(TestCase1_AsyPadding)
create_test_use_ceil_class(TestCase1_AsyPadding)
create_test_use_ceil_class(TestCase2_AsyPadding)
class TestAvgInclude_AsyPadding(TestCase2):
def init_exclusive(self):
self.exclusive = False
def init_test_case(self):
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [1, 2, 1, 2]
def init_shape(self):
self.shape = [2, 3, 7, 7]
class TestCUDNNAvgInclude_AsyPadding(TestCase2):
def init_kernel_type(self):
self.use_cudnn = True
def init_exclusive(self):
self.exclusive = False
def init_test_case(self):
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [2, 1, 1, 1]
def init_shape(self):
self.shape = [2, 3, 7, 7]
class TestAvgPoolAdaptive_AsyPadding(TestCase1):
def init_adaptive(self):
self.adaptive = True
def init_test_case(self):
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [1, 1, 0, 2]
def init_shape(self):
self.shape = [2, 3, 7, 7]
#----------- test channel_last --------------
class TestPool2D_channel_last(TestPool2D_Op):
def init_data_format(self):
self.data_format = "NHWC"
def init_shape(self):
self.shape = [2, 5, 5, 3]
class TestCase1_channel_last(TestCase1):
def init_data_format(self):
self.data_format = "NHWC"
def init_shape(self):
self.shape = [2, 7, 7, 3]
class TestCase2_channel_last(TestCase2):
def init_data_format(self):
self.data_format = "NHWC"
def init_shape(self):
self.shape = [2, 7, 7, 3]
class TestCase3_channel_last(TestCase3):
def init_data_format(self):
self.data_format = "NHWC"
def init_shape(self):
self.shape = [2, 5, 5, 3]
class TestCase4_channel_last(TestCase4):
def init_data_format(self):
self.data_format = "NHWC"
def init_shape(self):
self.shape = [2, 7, 7, 3]
class TestCase5_channel_last(TestCase5):
def init_data_format(self):
self.data_format = "NHWC"
def init_shape(self):
self.shape = [2, 7, 7, 3]
create_test_cudnn_class(TestPool2D_channel_last)
create_test_cudnn_class(TestCase1_channel_last)
create_test_cudnn_class(TestCase2_channel_last)
create_test_cudnn_class(TestCase3_channel_last)
create_test_cudnn_class(TestCase4_channel_last)
create_test_cudnn_class(TestCase5_channel_last)
create_test_cudnn_fp16_class(TestPool2D_channel_last)
create_test_cudnn_fp16_class(TestCase1_channel_last, check_grad=False)
create_test_cudnn_fp16_class(TestCase2_channel_last)
create_test_cudnn_fp16_class(TestCase3_channel_last)
create_test_cudnn_fp16_class(TestCase4_channel_last)
create_test_cudnn_fp16_class(TestCase5_channel_last)
create_test_cudnn_use_ceil_class(TestPool2D_channel_last)
create_test_cudnn_use_ceil_class(TestCase1_channel_last)
create_test_use_ceil_class(TestCase1_channel_last)
create_test_use_ceil_class(TestCase2_channel_last)
class TestCase5_Max(TestCase2):
def init_pool_type(self):
self.pool_type = "max"
def test_check_grad(self):
if self.dtype == np.float16:
return
if self.has_cudnn() and self.pool_type == "max":
place = core.CUDAPlace(0)
self.check_grad_with_place(
place, set(['X']), 'Out', max_relative_error=1.00)
elif self.pool_type == "max":
self.check_grad(set(['X']), 'Out', max_relative_error=1.00)
class TestCase5_channel_last_Max(TestCase5_Max):
def init_data_format(self):
self.data_format = "NHWC"
def init_shape(self):
self.shape = [2, 7, 7, 3]
create_test_cudnn_class(TestCase5_Max)
create_test_cudnn_class(TestCase5_channel_last_Max)
class TestAvgInclude_channel_last(TestCase2_channel_last):
def init_exclusive(self):
self.exclusive = False
class TestCUDNNAvgInclude_channel_last(TestCase2_channel_last):
def init_kernel_type(self):
self.use_cudnn = True
def init_exclusive(self):
self.exclusive = False
class TestAvgPoolAdaptive_channel_last(TestCase1_channel_last):
def init_adaptive(self):
self.adaptive = True
class TestPool2D_AsyPadding_channel_last(TestPool2D_AsyPadding):
def init_data_format(self):
self.data_format = "NHWC"
def init_shape(self):
self.shape = [2, 5, 5, 3]
class TestCase1_AsyPadding_channel_last(TestCase1_AsyPadding):
def init_data_format(self):
self.data_format = "NHWC"
def init_shape(self):
self.shape = [2, 7, 7, 3]
class TestCase2_AsyPadding_channel_last(TestCase2_AsyPadding):
def init_data_format(self):
self.data_format = "NHWC"
def init_shape(self):
self.shape = [2, 7, 7, 3]
class TestCase3_AsyPadding_channel_last(TestCase3_AsyPadding):
def init_data_format(self):
self.data_format = "NHWC"
def init_shape(self):
self.shape = [2, 5, 5, 3]
class TestCase4_AsyPadding_channel_last(TestCase4_AsyPadding):
def init_data_format(self):
self.data_format = "NHWC"
def init_shape(self):
self.shape = [2, 7, 7, 3]
class TestCase5_AsyPadding_channel_last(TestCase5_AsyPadding):
def init_data_format(self):
self.data_format = "NHWC"
def init_shape(self):
self.shape = [2, 7, 7, 3]
create_test_cudnn_class(TestPool2D_AsyPadding_channel_last)
create_test_cudnn_class(TestCase1_AsyPadding_channel_last)
create_test_cudnn_class(TestCase2_AsyPadding_channel_last)
create_test_cudnn_class(TestCase3_AsyPadding_channel_last)
create_test_cudnn_class(TestCase4_AsyPadding_channel_last)
create_test_cudnn_class(TestCase5_AsyPadding_channel_last)
create_test_cudnn_fp16_class(TestPool2D_AsyPadding_channel_last)
create_test_cudnn_fp16_class(
TestCase1_AsyPadding_channel_last, check_grad=False)
create_test_cudnn_fp16_class(TestCase2_AsyPadding_channel_last)
create_test_cudnn_fp16_class(TestCase3_AsyPadding_channel_last)
create_test_cudnn_fp16_class(TestCase4_AsyPadding_channel_last)
create_test_cudnn_fp16_class(TestCase5_AsyPadding_channel_last)
create_test_cudnn_use_ceil_class(TestPool2D_AsyPadding_channel_last)
create_test_cudnn_use_ceil_class(TestCase1_AsyPadding_channel_last)
create_test_use_ceil_class(TestCase1_AsyPadding_channel_last)
create_test_use_ceil_class(TestCase2_AsyPadding_channel_last)
class TestAvgInclude_AsyPadding_channel_last(TestAvgInclude_AsyPadding):
def init_data_format(self):
self.data_format = "NHWC"
def init_shape(self):
self.shape = [2, 7, 7, 3]
class TestCUDNNAvgInclude_AsyPadding_channel_last(
TestCUDNNAvgInclude_AsyPadding):
def init_data_format(self):
self.data_format = "NHWC"
def init_shape(self):
self.shape = [2, 7, 7, 3]
class TestAvgPoolAdaptive_AsyPadding_channel_last(
TestAvgPoolAdaptive_AsyPadding):
def init_data_format(self):
self.data_format = "NHWC"
def init_shape(self):
self.shape = [2, 7, 7, 3]
# test paddings: SAME VALID
def create_test_padding_SAME_class(parent):
class TestPaddingSAMECase(parent):
def init_paddings(self):
self.paddings = [0, 0]
self.padding_algorithm = "SAME"
cls_name = "{0}_{1}".format(parent.__name__, "PaddingSAMEOp")
TestPaddingSAMECase.__name__ = cls_name
globals()[cls_name] = TestPaddingSAMECase
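All of the create_test_* helpers in this file follow the same pattern: subclass the parent, override one or two init_* hooks, rename the class, and publish it in globals() so unittest discovery picks up the variant. A generic sketch of the idiom (the helper name is hypothetical):

def make_padding_variant(parent, algorithm):
    class Variant(parent):
        def init_paddings(self):
            self.paddings = [0, 0]
            self.padding_algorithm = algorithm

    Variant.__name__ = "{0}_Padding{1}Op".format(parent.__name__, algorithm)
    globals()[Variant.__name__] = Variant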
create_test_padding_SAME_class(TestPool2D_Op)
create_test_padding_SAME_class(TestCase1)
create_test_padding_SAME_class(TestCase2)
create_test_padding_SAME_class(TestCase3)
create_test_padding_SAME_class(TestCase4)
create_test_padding_SAME_class(TestCase5)
create_test_padding_SAME_class(TestPool2D_channel_last)
create_test_padding_SAME_class(TestCase1_channel_last)
create_test_padding_SAME_class(TestCase2_channel_last)
create_test_padding_SAME_class(TestCase3_channel_last)
create_test_padding_SAME_class(TestCase4_channel_last)
create_test_padding_SAME_class(TestCase5_channel_last)
def create_test_cudnn_padding_SAME_class(parent):
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNNPaddingSAMECase(parent):
def init_kernel_type(self):
self.use_cudnn = True
def init_paddings(self):
self.paddings = [1, 1]
self.padding_algorithm = "SAME"
cls_name = "{0}_{1}".format(parent.__name__, "CudnnPaddingSAMEOp")
TestCUDNNPaddingSAMECase.__name__ = cls_name
globals()[cls_name] = TestCUDNNPaddingSAMECase
create_test_cudnn_padding_SAME_class(TestPool2D_Op)
create_test_cudnn_padding_SAME_class(TestCase1)
create_test_cudnn_padding_SAME_class(TestCase2)
create_test_cudnn_padding_SAME_class(TestCase3)
create_test_cudnn_padding_SAME_class(TestCase4)
create_test_cudnn_padding_SAME_class(TestCase5)
create_test_cudnn_padding_SAME_class(TestPool2D_channel_last)
create_test_cudnn_padding_SAME_class(TestCase1_channel_last)
create_test_cudnn_padding_SAME_class(TestCase2_channel_last)
create_test_cudnn_padding_SAME_class(TestCase3_channel_last)
create_test_cudnn_padding_SAME_class(TestCase4_channel_last)
create_test_cudnn_padding_SAME_class(TestCase5_channel_last)
def create_test_padding_VALID_class(parent):
class TestPaddingVALIDCase(parent):
def init_paddings(self):
self.paddings = [1, 1]
self.padding_algorithm = "VALID"
cls_name = "{0}_{1}".format(parent.__name__, "PaddingVALIDOp")
TestPaddingVALIDCase.__name__ = cls_name
globals()[cls_name] = TestPaddingVALIDCase
create_test_padding_VALID_class(TestPool2D_Op)
create_test_padding_VALID_class(TestCase1)
create_test_padding_VALID_class(TestCase2)
create_test_padding_VALID_class(TestCase3)
create_test_padding_VALID_class(TestCase4)
create_test_padding_VALID_class(TestCase5)
create_test_padding_VALID_class(TestPool2D_channel_last)
create_test_padding_VALID_class(TestCase1_channel_last)
create_test_padding_VALID_class(TestCase2_channel_last)
create_test_padding_VALID_class(TestCase3_channel_last)
create_test_padding_VALID_class(TestCase4_channel_last)
create_test_padding_VALID_class(TestCase5_channel_last)
def create_test_cudnn_padding_VALID_class(parent):
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNNPaddingVALIDCase(parent):
def init_kernel_type(self):
self.use_cudnn = True
def init_paddings(self):
self.paddings = [1, 1]
self.padding_algorithm = "VALID"
cls_name = "{0}_{1}".format(parent.__name__, "CudnnPaddingVALIDOp")
TestCUDNNPaddingVALIDCase.__name__ = cls_name
globals()[cls_name] = TestCUDNNPaddingVALIDCase
create_test_cudnn_padding_VALID_class(TestPool2D_Op)
create_test_cudnn_padding_VALID_class(TestCase1)
create_test_cudnn_padding_VALID_class(TestCase2)
create_test_cudnn_padding_VALID_class(TestCase3)
create_test_cudnn_padding_VALID_class(TestCase4)
create_test_cudnn_padding_VALID_class(TestCase5)
create_test_cudnn_padding_VALID_class(TestPool2D_channel_last)
create_test_cudnn_padding_VALID_class(TestCase1_channel_last)
create_test_cudnn_padding_VALID_class(TestCase2_channel_last)
create_test_cudnn_padding_VALID_class(TestCase3_channel_last)
create_test_cudnn_padding_VALID_class(TestCase4_channel_last)
create_test_cudnn_padding_VALID_class(TestCase5_channel_last)
# ----- test API
class TestPool2dAPI(OpTest):
def test_api(self):
x_NHWC = np.random.random([2, 5, 5, 3]).astype("float32")
x_NCHW = np.random.random([2, 3, 5, 5]).astype("float32")
input_NHWC = fluid.layers.data(
name="input_NHWC",
shape=[2, 5, 5, 3],
append_batch_size=False,
dtype="float32")
input_NCHW = fluid.layers.data(
name="input_NCHW",
shape=[2, 3, 5, 5],
append_batch_size=False,
dtype="float32")
ksize = [3, 3]
out_1 = fluid.layers.pool2d(
input=input_NHWC,
pool_size=ksize,
pool_type="max",
pool_padding=[1, 1],
use_cudnn=False,
data_format="NHWC")
out_2 = fluid.layers.pool2d(
input=input_NHWC,
pool_size=ksize,
pool_type="avg",
pool_padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
use_cudnn=False,
data_format="NHWC")
out_3 = fluid.layers.pool2d(
input=input_NCHW,
pool_size=ksize,
pool_type="avg",
pool_padding=[[0, 0], [0, 0], [1, 1], [1, 1]],
use_cudnn=False,
data_format="NCHW")
out_4 = fluid.layers.pool2d(
input=input_NCHW,
pool_size=ksize,
pool_type="avg",
pool_padding=[1, 2, 1, 0],
use_cudnn=False,
data_format="NCHW")
# test VALID
out_5 = fluid.layers.pool2d(
input=input_NCHW,
pool_size=ksize,
pool_type="avg",
pool_padding="VALID",
use_cudnn=False,
data_format="NCHW")
out_6 = fluid.layers.pool2d(
input=input_NHWC,
pool_size=ksize,
pool_type="max",
pool_padding="VALID",
use_cudnn=False,
data_format="NHWC")
# test SAME
out_7 = fluid.layers.pool2d(
input=input_NCHW,
pool_size=[4, 4],
pool_type="avg",
pool_padding="SAME",
use_cudnn=False,
data_format="NCHW")
out_8 = fluid.layers.pool2d(
input=input_NHWC,
pool_size=[4, 4],
pool_type="max",
pool_padding="SAME",
use_cudnn=False,
data_format="NHWC")
exe = fluid.Executor(place=fluid.CPUPlace())
[res_1, res_2, res_3, res_4, res_5, res_6, res_7, res_8] = exe.run(
fluid.default_main_program(),
feed={"input_NHWC": x_NHWC,
"input_NCHW": x_NCHW},
fetch_list=[
out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8
])
assert np.allclose(
res_1,
pool2D_forward_naive(
x=x_NHWC,
ksize=ksize,
pool_type="max",
strides=[1, 1],
paddings=[1, 1],
data_format="NHWC"))
assert np.allclose(
res_2,
pool2D_forward_naive(
x=x_NHWC,
ksize=ksize,
pool_type="avg",
strides=[1, 1],
paddings=[1, 1, 1, 1],
data_format="NHWC"))
assert np.allclose(
res_3,
pool2D_forward_naive(
x=x_NCHW,
ksize=ksize,
pool_type="avg",
strides=[1, 1],
paddings=[1, 1, 1, 1],
data_format="NCHW"),
rtol=0.07,
atol=1e-05)
assert np.allclose(
res_4,
pool2D_forward_naive(
x=x_NCHW,
ksize=ksize,
pool_type="avg",
strides=[1, 1],
paddings=[1, 2, 1, 0],
data_format="NCHW"),
rtol=0.07,
atol=1e-05)
# VALID
assert np.allclose(
res_5,
pool2D_forward_naive(
x=x_NCHW,
ksize=ksize,
pool_type="avg",
strides=[1, 1],
paddings=[10, 20],  # any value is OK: "VALID" overrides paddings
padding_algorithm="VALID",
data_format="NCHW"),
rtol=0.07,
atol=1e-05)
assert np.allclose(
res_6,
pool2D_forward_naive(
x=x_NHWC,
ksize=ksize,
pool_type="max",
strides=[1, 1],
paddings=[10, 20],
padding_algorithm="VALID",
data_format="NHWC"))
# SAME
assert np.allclose(
res_7,
pool2D_forward_naive(
x=x_NCHW,
ksize=[4, 4],
pool_type="avg",
strides=[1, 1],
paddings=[10, 20],
padding_algorithm="SAME",
data_format="NCHW"),
rtol=0.07,
atol=1e-05)
assert np.allclose(
res_8,
pool2D_forward_naive(
x=x_NHWC,
ksize=[4, 4],
pool_type="max",
strides=[1, 1],
paddings=[10, 20],
padding_algorithm="SAME",
data_format="NHWC"))
class TestPool2dAPI_Error(OpTest):
def test_api(self):
input_NHWC = fluid.layers.data(
name="input_NHWC",
shape=[2, 5, 5, 3],
append_batch_size=False,
dtype="float32")
ksize = [3, 3]
# cudnn value error
def run_1():
out_1 = fluid.layers.pool2d(
input=input_NHWC,
pool_size=ksize,
pool_type="max",
pool_padding=[1, 1],
use_cudnn=[0],
data_format="NHWC")
self.assertRaises(ValueError, run_1)
# data_format value error
def run_2():
out_2 = fluid.layers.pool2d(
input=input_NHWC,
pool_size=ksize,
pool_type="max",
pool_padding=[1, 1],
use_cudnn=False,
data_format="NHWCC")
self.assertRaises(ValueError, run_2)
# padding str value error
def run_3():
out_3 = fluid.layers.pool2d(
input=input_NHWC,
pool_size=ksize,
pool_type="max",
pool_padding="VALIDSAME",
use_cudnn=False,
data_format="NHWC")
self.assertRaises(ValueError, run_3)
# padding str valid and ceil_mode value error
def run_4():
out_4 = fluid.layers.pool2d(
input=input_NHWC,
pool_size=ksize,
pool_type="max",
pool_padding="VALID",
use_cudnn=False,
ceil_mode=True,
data_format="NHWC")
self.assertRaises(ValueError, run_4)
# padding with 8 ele. value error
def run_5():
out_5 = fluid.layers.pool2d(
input=input_NHWC,
pool_size=ksize,
pool_type="max",
pool_padding=[[1, 1], [0, 0], [0, 0], [1, 1]],
use_cudnn=False,
data_format="NHWC")
self.assertRaises(ValueError, run_5)
if __name__ == '__main__':
unittest.main()
......@@ -20,6 +20,7 @@ import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
import paddle.fluid as fluid
def adaptive_start_index(index, input_size, output_size):
......@@ -30,58 +31,137 @@ def adaptive_end_index(index, input_size, output_size):
return int(np.ceil((index + 1) * input_size / output_size))
def max_pool3D_forward_naive(x,
def pool3D_forward_naive(x,
ksize,
strides,
paddings,
global_pool=0,
ceil_mode=False,
exclusive=True,
adaptive=False):
N, C, D, H, W = x.shape
adaptive=False,
data_format='NCDHW',
pool_type='max',
padding_algorithm="EXPLICIT"):
# update paddings
def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
padding = []
for input_size, filter_size, stride_size in zip(input_shape, pool_size,
pool_stride):
out_size = int((input_size + stride_size - 1) / stride_size)
pad_sum = np.max((
(out_size - 1) * stride_size + filter_size - input_size, 0))
pad_0 = int(pad_sum / 2)
pad_1 = int(pad_sum - pad_0)
padding.append(pad_0)
padding.append(pad_1)
return padding
if isinstance(padding_algorithm, str):
padding_algorithm = padding_algorithm.upper()
if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]:
raise ValueError("Unknown Attr(padding_algorithm): '%s'. "
"It can only be 'SAME' or 'VALID'." %
str(padding_algorithm))
if padding_algorithm == "VALID":
paddings = [0, 0, 0, 0, 0, 0]
if ceil_mode != False:
raise ValueError(
"When Attr(pool_padding) is \"VALID\", Attr(ceil_mode)"
" must be False. "
"Received ceil_mode: True.")
elif padding_algorithm == "SAME":
input_data_shape = []
if data_format == "NCDHW":
input_data_shape = x.shape[2:5]
elif data_format == "NDHWC":
input_data_shape = x.shape[1:4]
paddings = _get_padding_with_SAME(input_data_shape, ksize, strides)
assert len(paddings) == 3 or len(paddings) == 6
is_sys = True if len(paddings) == 3 else False
N = x.shape[0]
C, D, H, W = [x.shape[1], x.shape[2], x.shape[3], x.shape[4]] \
if data_format == 'NCDHW' else [x.shape[4], x.shape[1], x.shape[2], x.shape[3]]
if global_pool == 1:
ksize = [D, H, W]
paddings = [0 for _ in range(len(paddings))]
pad_d_forth = paddings[0] if is_sys else paddings[0]
pad_d_back = paddings[0] if is_sys else paddings[1]
pad_h_up = paddings[1] if is_sys else paddings[2]
pad_h_down = paddings[1] if is_sys else paddings[3]
pad_w_left = paddings[2] if is_sys else paddings[4]
pad_w_right = paddings[2] if is_sys else paddings[5]
if adaptive:
D_out, H_out, W_out = ksize
else:
D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
) // strides[0] + 1 if ceil_mode else (
H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1
) // strides[1] + 1 if ceil_mode else (
W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1
) // strides[2] + 1 if ceil_mode else (
W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
out = np.zeros((N, C, D_out, H_out, W_out))
D_out = (D - ksize[0] + pad_d_forth + pad_d_back + strides[0] - 1) // strides[0] + 1 \
if ceil_mode else (D - ksize[0] + pad_d_forth + pad_d_back) // strides[0] + 1
H_out = (H - ksize[1] + pad_h_up + pad_h_down + strides[1] - 1) // strides[1] + 1 \
if ceil_mode else (H - ksize[1] + pad_h_up + pad_h_down) // strides[1] + 1
W_out = (W - ksize[2] + pad_w_left + pad_w_right + strides[2] - 1) // strides[2] + 1 \
if ceil_mode else (W - ksize[2] + pad_w_left + pad_w_right) // strides[2] + 1
out = np.zeros((N, C, D_out, H_out, W_out)) if data_format == 'NCDHW' \
else np.zeros((N, D_out, H_out, W_out, C))
for k in range(D_out):
if adaptive:
d_start = adaptive_start_index(k, D, ksize[0])
d_end = adaptive_end_index(k, D, ksize[0])
else:
d_start = np.max((k * strides[0] - paddings[0], 0))
d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
d_start = np.max((k * strides[0] - pad_d_forth, 0))
d_end = np.min((k * strides[0] + ksize[0] - pad_d_forth, D))
for i in range(H_out):
if adaptive:
h_start = adaptive_start_index(i, H, ksize[1])
h_end = adaptive_end_index(i, H, ksize[1])
else:
h_start = np.max((i * strides[1] - paddings[1], 0))
h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
h_start = np.max((i * strides[1] - pad_h_up, 0))
h_end = np.min((i * strides[1] + ksize[1] - pad_h_up, H))
for j in range(W_out):
if adaptive:
w_start = adaptive_start_index(j, W, ksize[2])
w_end = adaptive_end_index(j, W, ksize[2])
else:
w_start = np.max((j * strides[2] - paddings[2], 0))
w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
w_start = np.max((j * strides[2] - pad_w_left, 0))
w_end = np.min((j * strides[2] + ksize[2] - pad_w_left, W))
if data_format == 'NCDHW':
x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:
w_end]
if pool_type == 'avg':
field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \
if (exclusive or adaptive) else ksize[0] * ksize[1] * ksize[2]
out[:, :, k, i, j] = np.sum(x_masked,
axis=(2, 3, 4)) / field_size
elif pool_type == 'max':
out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
elif data_format == 'NDHWC':
x_masked = x[:, d_start:d_end, h_start:h_end, w_start:
w_end, :]
if pool_type == 'avg':
field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \
if (exclusive or adaptive) else ksize[0] * ksize[1] * ksize[2]
out[:, k, i, j, :] = np.sum(x_masked,
axis=(1, 2, 3)) / field_size
elif pool_type == 'max':
out[:, k, i, j, :] = np.max(x_masked, axis=(1, 2, 3))
return out
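The same output-size rule holds per spatial dimension in 3-D; a quick check with the asymmetric padding [1, 0, 2, 1, 2, 1] used by the tests below (ksize 3, stride 1, ceil_mode False, D = H = W = 7):

D = H = W = 7
D_out = (D - 3 + 1 + 0) // 1 + 1  # pad_d_forth=1, pad_d_back=0  -> 6
H_out = (H - 3 + 2 + 1) // 1 + 1  # pad_h_up=2, pad_h_down=1     -> 8
W_out = (W - 3 + 2 + 1) // 1 + 1  # pad_w_left=2, pad_w_right=1  -> 8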
def avg_pool3D_forward_naive(x,
def max_pool3D_forward_naive(x,
ksize,
strides,
paddings,
......@@ -89,56 +169,46 @@ def avg_pool3D_forward_naive(x,
ceil_mode=False,
exclusive=True,
adaptive=False):
N, C, D, H, W = x.shape
if global_pool == 1:
ksize = [D, H, W]
if adaptive:
D_out, H_out, W_out = ksize
else:
D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
) // strides[0] + 1 if ceil_mode else (
H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1
) // strides[1] + 1 if ceil_mode else (
W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1
) // strides[2] + 1 if ceil_mode else (
W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
out = np.zeros((N, C, D_out, H_out, W_out))
for k in range(D_out):
if adaptive:
d_start = adaptive_start_index(k, D, ksize[0])
d_end = adaptive_end_index(k, D, ksize[0])
else:
d_start = np.max((k * strides[0] - paddings[0], 0))
d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
for i in range(H_out):
if adaptive:
h_start = adaptive_start_index(i, H, ksize[1])
h_end = adaptive_end_index(i, H, ksize[1])
else:
h_start = np.max((i * strides[1] - paddings[1], 0))
h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
for j in range(W_out):
if adaptive:
w_start = adaptive_start_index(j, W, ksize[2])
w_end = adaptive_end_index(j, W, ksize[2])
else:
w_start = np.max((j * strides[2] - paddings[2], 0))
w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
out = pool3D_forward_naive(
x=x,
ksize=ksize,
strides=strides,
paddings=paddings,
global_pool=global_pool,
ceil_mode=ceil_mode,
exclusive=exclusive,
adaptive=adaptive,
data_format='NCDHW',
pool_type="max")
return out
field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \
if (exclusive or adaptive) else ksize[0] * ksize[1] * ksize[2]
out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3,
4)) / field_size
def avg_pool3D_forward_naive(x,
ksize,
strides,
paddings,
global_pool=0,
ceil_mode=False,
exclusive=True,
adaptive=False):
out = pool3D_forward_naive(
x=x,
ksize=ksize,
strides=strides,
paddings=paddings,
global_pool=global_pool,
ceil_mode=ceil_mode,
exclusive=exclusive,
adaptive=adaptive,
data_format='NCDHW',
pool_type="avg")
return out
class TestPool3d_Op(OpTest):
def setUp(self):
self.op_type = "pool3d"
self.use_cudnn = False
self.init_kernel_type()
self.dtype = np.float32
self.init_test_case()
self.init_global_pool()
......@@ -147,13 +217,15 @@ class TestPool3d_Op(OpTest):
self.init_ceil_mode()
self.init_exclusive()
self.init_adaptive()
self.init_data_format()
self.init_shape()
if self.global_pool:
self.paddings = [0 for _ in range(len(self.paddings))]
input = np.random.random(self.shape).astype(self.dtype)
output = self.pool3D_forward_naive(
output = pool3D_forward_naive(
input, self.ksize, self.strides, self.paddings, self.global_pool,
self.ceil_mode, self.exclusive, self.adaptive).astype(self.dtype)
self.ceil_mode, self.exclusive, self.adaptive, self.data_format,
self.pool_type).astype(self.dtype)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
self.attrs = {
......@@ -164,8 +236,7 @@ class TestPool3d_Op(OpTest):
'global_pooling': self.global_pool,
'use_cudnn': self.use_cudnn,
'ceil_mode': self.ceil_mode,
'data_format':
'AnyLayout', # TODO(dzhwinter) : should be fix latter
'data_format': self.data_format,
'exclusive': self.exclusive,
'adaptive': self.adaptive
}
......@@ -192,18 +263,23 @@ class TestPool3d_Op(OpTest):
elif self.pool_type != "max":
self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
def init_test_case(self):
def init_data_format(self):
self.data_format = "NCDHW"
def init_shape(self):
self.shape = [2, 3, 5, 5, 5]
def init_test_case(self):
self.ksize = [3, 3, 3]
self.strides = [1, 1, 1]
self.paddings = [0, 0, 0]
def init_kernel_type(self):
pass
self.use_cudnn = False
def init_pool_type(self):
self.pool_type = "avg"
self.pool3D_forward_naive = avg_pool3D_forward_naive
def init_global_pool(self):
self.global_pool = True
......@@ -219,30 +295,32 @@ class TestPool3d_Op(OpTest):
class TestCase1(TestPool3d_Op):
def init_test_case(self):
def init_shape(self):
self.shape = [2, 3, 7, 7, 7]
def init_test_case(self):
self.ksize = [3, 3, 3]
self.strides = [1, 1, 1]
self.paddings = [0, 0, 0]
def init_pool_type(self):
self.pool_type = "avg"
self.pool3D_forward_naive = avg_pool3D_forward_naive
def init_global_pool(self):
self.global_pool = False
class TestCase2(TestPool3d_Op):
def init_test_case(self):
def init_shape(self):
self.shape = [2, 3, 7, 7, 7]
def init_test_case(self):
self.ksize = [3, 3, 3]
self.strides = [1, 1, 1]
self.paddings = [1, 1, 1]
def init_pool_type(self):
self.pool_type = "avg"
self.pool3D_forward_naive = avg_pool3D_forward_naive
def init_global_pool(self):
self.global_pool = False
......@@ -251,45 +329,45 @@ class TestCase2(TestPool3d_Op):
class TestCase3(TestPool3d_Op):
def init_pool_type(self):
self.pool_type = "max"
self.pool3D_forward_naive = max_pool3D_forward_naive
class TestCase4(TestCase1):
def init_pool_type(self):
self.pool_type = "max"
self.pool3D_forward_naive = max_pool3D_forward_naive
class TestCase5(TestCase2):
def init_pool_type(self):
self.pool_type = "max"
self.pool3D_forward_naive = max_pool3D_forward_naive
#--------------------test pool3d--------------------
class TestCUDNNCase1(TestPool3d_Op):
def init_kernel_type(self):
self.use_cudnn = True
#--------------------test pool3d cudnn--------------------
class TestFP16CUDNNCase1(TestPool3d_Op):
def create_test_cudnn_class(parent):
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNNCase(parent):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
cls_name = "{0}_{1}".format(parent.__name__, "CUDNNOp")
TestCUDNNCase.__name__ = cls_name
globals()[cls_name] = TestCUDNNCase
class TestCUDNNCase2(TestCase1):
def init_kernel_type(self):
self.use_cudnn = True
create_test_cudnn_class(TestPool3d_Op)
create_test_cudnn_class(TestCase1)
create_test_cudnn_class(TestCase2)
create_test_cudnn_class(TestCase3)
create_test_cudnn_class(TestCase4)
create_test_cudnn_class(TestCase5)
class TestFP16CUDNNCase2(TestCase1):
def create_test_cudnn_fp16_class(parent):
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNNFp16Case(parent):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
......@@ -300,109 +378,775 @@ class TestFP16CUDNNCase2(TestCase1):
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
cls_name = "{0}_{1}".format(parent.__name__, "CUDNNFp16Op")
TestCUDNNFp16Case.__name__ = cls_name
globals()[cls_name] = TestCUDNNFp16Case
class TestCUDNNCase3(TestCase2):
def init_kernel_type(self):
self.use_cudnn = True
create_test_cudnn_fp16_class(TestPool3d_Op)
create_test_cudnn_fp16_class(TestCase1)
create_test_cudnn_fp16_class(TestCase2)
create_test_cudnn_fp16_class(TestCase3)
create_test_cudnn_fp16_class(TestCase4)
create_test_cudnn_fp16_class(TestCase5)
class TestFP16CUDNNCase3(TestCase2):
# ---- test ceil mode ------
def create_test_cudnn_use_ceil_class(parent):
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestPool3DUseCeilCase(parent):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
def init_ceil_mode(self):
self.ceil_mode = True
cls_name = "{0}_{1}".format(parent.__name__, "CUDNNOpCeilMode")
TestPool3DUseCeilCase.__name__ = cls_name
globals()[cls_name] = TestPool3DUseCeilCase
class TestCUDNNCase4(TestCase3):
def init_kernel_type(self):
self.use_cudnn = True
create_test_cudnn_use_ceil_class(TestPool3d_Op)
create_test_cudnn_use_ceil_class(TestCase1)
class TestFP16CUDNNCase4(TestCase3):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
def create_test_use_ceil_class(parent):
class TestPool3DUseCeilCase(parent):
def init_ceil_mode(self):
self.ceil_mode = True
cls_name = "{0}_{1}".format(parent.__name__, "CeilModeCast")
TestPool3DUseCeilCase.__name__ = cls_name
globals()[cls_name] = TestPool3DUseCeilCase
class TestCUDNNCase5(TestCase4):
def init_kernel_type(self):
self.use_cudnn = True
create_test_use_ceil_class(TestCase1)
create_test_use_ceil_class(TestCase2)
class TestFP16CUDNNCase5(TestCase4):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestAvgInclude(TestCase2):
def init_exclusive(self):
self.exclusive = False
class TestCUDNNCase6(TestCase5):
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNNAvgInclude(TestCase2):
def init_kernel_type(self):
self.use_cudnn = True
def init_exclusive(self):
self.exclusive = False
class TestAvgPoolAdaptive(TestCase1):
def init_adaptive(self):
self.adaptive = True
#-------test pool3d with asymmetric padding------
class TestPool3d_Op_AsyPadding(TestPool3d_Op):
def init_test_case(self):
self.ksize = [3, 3, 3]
self.strides = [1, 1, 1]
self.paddings = [0, 0, 0, 2, 3, 0]
def init_shape(self):
self.shape = [2, 3, 5, 5, 5]
class TestCase1_AsyPadding(TestCase1):
def init_test_case(self):
self.ksize = [3, 3, 3]
self.strides = [1, 1, 1]
self.paddings = [1, 0, 2, 1, 2, 1]
def init_shape(self):
self.shape = [2, 3, 7, 7, 7]
class TestCase2_AsyPadding(TestCase2):
def init_test_case(self):
self.ksize = [3, 3, 3]
self.strides = [1, 1, 1]
self.paddings = [1, 2, 1, 1, 1, 0]
def init_shape(self):
self.shape = [2, 3, 7, 7, 7]
class TestCase3_AsyPadding(TestCase3):
def init_test_case(self):
self.ksize = [3, 3, 3]
self.strides = [1, 1, 1]
self.paddings = [1, 0, 0, 0, 1, 0]
def init_shape(self):
self.shape = [2, 3, 5, 5, 5]
class TestCase4_AsyPadding(TestCase4):
def init_test_case(self):
self.ksize = [3, 3, 3]
self.strides = [1, 1, 1]
self.paddings = [1, 0, 2, 1, 2, 1]
def init_shape(self):
self.shape = [2, 3, 7, 7, 7]
class TestCase5_AsyPadding(TestCase5):
def init_test_case(self):
self.shape = [2, 7, 7, 7, 3]
self.ksize = [3, 3, 3]
self.strides = [1, 1, 1]
self.paddings = [1, 2, 1, 1, 1, 0]
def init_shape(self):
self.shape = [2, 3, 7, 7, 7]
create_test_cudnn_class(TestPool3d_Op_AsyPadding)
create_test_cudnn_class(TestCase1_AsyPadding)
create_test_cudnn_class(TestCase2_AsyPadding)
create_test_cudnn_class(TestCase3_AsyPadding)
create_test_cudnn_class(TestCase4_AsyPadding)
create_test_cudnn_class(TestCase5_AsyPadding)
create_test_cudnn_fp16_class(TestPool3d_Op_AsyPadding)
create_test_cudnn_fp16_class(TestCase1_AsyPadding)
create_test_cudnn_fp16_class(TestCase2_AsyPadding)
create_test_cudnn_fp16_class(TestCase3_AsyPadding)
create_test_cudnn_fp16_class(TestCase4_AsyPadding)
create_test_cudnn_fp16_class(TestCase5_AsyPadding)
create_test_cudnn_use_ceil_class(TestPool3d_Op_AsyPadding)
create_test_cudnn_use_ceil_class(TestCase1_AsyPadding)
create_test_use_ceil_class(TestCase1_AsyPadding)
create_test_use_ceil_class(TestCase2_AsyPadding)
class TestAvgInclude_AsyPadding(TestCase2):
def init_exclusive(self):
self.exclusive = False
def init_test_case(self):
self.ksize = [3, 3, 3]
self.strides = [1, 1, 1]
self.paddings = [1, 2, 1, 1, 1, 0]
def init_shape(self):
self.shape = [2, 3, 7, 7, 7]
class TestFP16CUDNNCase6(TestCase5):
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNNAvgInclude_AsyPadding(TestCase2):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
def init_exclusive(self):
self.exclusive = False
def init_test_case(self):
self.ksize = [3, 3, 3]
self.strides = [1, 1, 1]
self.paddings = [1, 0, 0, 0, 0, 0]
def init_shape(self):
self.shape = [2, 3, 5, 5, 5]
class TestAvgPoolAdaptive_AsyPadding(TestCase1):
    def init_adaptive(self):
        self.adaptive = True

    def init_test_case(self):
        self.ksize = [3, 3, 3]
        self.strides = [1, 1, 1]
        self.paddings = [1, 0, 2, 1, 2, 1]

    def init_shape(self):
        self.shape = [2, 3, 7, 7, 7]
# ------------ test channel_last --------------
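# "NDHWC" stores tensors as [N, D, H, W, C] rather than the default "NCDHW"
# [N, C, D, H, W]; the pooling math is unchanged, only the data layout (and
# hence the expected `shape` in each case below) differs.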
class TestPool3d_channel_last(TestPool3d_Op):
def init_data_format(self):
self.data_format = "NDHWC"
    def init_shape(self):
        self.shape = [2, 5, 5, 5, 3]

class TestCase1_channel_last(TestCase1):
def init_data_format(self):
self.data_format = "NDHWC"
def init_shape(self):
self.shape = [2, 7, 7, 7, 3]
class TestCase2_channel_last(TestCase2):
def init_data_format(self):
self.data_format = "NDHWC"
def init_shape(self):
self.shape = [2, 7, 7, 7, 3]
class TestCase3_channel_last(TestCase3):
def init_data_format(self):
self.data_format = "NDHWC"
def init_shape(self):
self.shape = [2, 5, 5, 5, 3]
class TestCase4_channel_last(TestCase4):
def init_data_format(self):
self.data_format = "NDHWC"
def init_shape(self):
self.shape = [2, 7, 7, 7, 3]
class TestCase5_channel_last(TestCase5):
def init_data_format(self):
self.data_format = "NDHWC"
def init_shape(self):
self.shape = [2, 7, 7, 7, 3]
create_test_cudnn_class(TestPool3d_channel_last)
create_test_cudnn_class(TestCase1_channel_last)
create_test_cudnn_class(TestCase2_channel_last)
create_test_cudnn_class(TestCase3_channel_last)
create_test_cudnn_class(TestCase4_channel_last)
create_test_cudnn_class(TestCase5_channel_last)
create_test_cudnn_use_ceil_class(TestPool3d_channel_last)
create_test_cudnn_use_ceil_class(TestCase1_channel_last)
create_test_use_ceil_class(TestCase1_channel_last)
create_test_use_ceil_class(TestCase2_channel_last)
class TestCase5_Max(TestCase2):
def init_pool_type(self):
self.pool_type = "max"
def test_check_grad(self):
if self.dtype == np.float16:
return
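        # max pooling propagates the gradient only to the argmax element, so
        # the numeric gradient check runs with a very loose tolerance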
if self.has_cudnn() and self.pool_type == "max":
place = core.CUDAPlace(0)
self.check_grad_with_place(
place, set(['X']), 'Out', max_relative_error=1.00)
elif self.pool_type == "max":
self.check_grad(set(['X']), 'Out', max_relative_error=1.00)
class TestCase5_channel_last_Max(TestCase5_Max):
def init_data_format(self):
self.data_format = "NDHWC"
def init_shape(self):
self.shape = [2, 7, 7, 7, 3]
create_test_cudnn_class(TestCase5_Max)
create_test_cudnn_class(TestCase5_channel_last_Max)
class TestAvgInclude_channel_last(TestCase2_channel_last):
def init_exclusive(self):
self.exclusive = False
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNNAvgInclude_channel_last(TestCase2_channel_last):
def init_kernel_type(self):
self.use_cudnn = True
def init_exclusive(self):
self.exclusive = False
class TestAvgPoolAdaptive_channel_last(TestCase1_channel_last):
def init_adaptive(self):
self.adaptive = True
# --- asymmetric padding combined with channel_last
class TestPool3d_Op_AsyPadding_channel_last(TestPool3d_Op_AsyPadding):
def init_data_format(self):
self.data_format = "NDHWC"
def init_shape(self):
self.shape = [2, 5, 5, 5, 3]
class TestCase1_AsyPadding_channel_last(TestCase1_AsyPadding):
def init_data_format(self):
self.data_format = "NDHWC"
def init_shape(self):
self.shape = [2, 7, 7, 7, 3]
class TestCase2_AsyPadding_channel_last(TestCase2_AsyPadding):
def init_data_format(self):
self.data_format = "NDHWC"
def init_shape(self):
self.shape = [2, 7, 7, 7, 3]
class TestCase3_AsyPadding_channel_last(TestCase3_AsyPadding):
def init_data_format(self):
self.data_format = "NDHWC"
def init_shape(self):
self.shape = [2, 5, 5, 5, 3]
class TestCase4_AsyPadding_channel_last(TestCase4_AsyPadding):
def init_data_format(self):
self.data_format = "NDHWC"
def init_shape(self):
self.shape = [2, 7, 7, 7, 3]
class TestCase5_AsyPadding_channel_last(TestCase5_AsyPadding):
def init_data_format(self):
self.data_format = "NDHWC"
def init_shape(self):
self.shape = [2, 7, 7, 7, 3]
create_test_cudnn_class(TestPool3d_Op_AsyPadding_channel_last)
create_test_cudnn_class(TestCase1_AsyPadding_channel_last)
create_test_cudnn_class(TestCase2_AsyPadding_channel_last)
create_test_cudnn_class(TestCase3_AsyPadding_channel_last)
create_test_cudnn_class(TestCase4_AsyPadding_channel_last)
create_test_cudnn_class(TestCase5_AsyPadding_channel_last)
create_test_cudnn_use_ceil_class(TestPool3d_Op_AsyPadding_channel_last)
create_test_cudnn_use_ceil_class(TestCase1_AsyPadding_channel_last)
create_test_use_ceil_class(TestCase1_AsyPadding_channel_last)
create_test_use_ceil_class(TestCase2_AsyPadding_channel_last)
class TestAvgInclude_AsyPadding_channel_last(TestAvgInclude_AsyPadding):
def init_data_format(self):
self.data_format = "NDHWC"
def init_shape(self):
self.shape = [2, 7, 7, 7, 3]
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNNAvgInclude_AsyPadding_channel_last(
TestCUDNNAvgInclude_AsyPadding):
def init_data_format(self):
self.data_format = "NDHWC"
def init_shape(self):
self.shape = [2, 5, 5, 5, 3]
class TestAvgPoolAdaptive_AsyPadding_channel_last(
TestAvgPoolAdaptive_AsyPadding):
def init_data_format(self):
self.data_format = "NDHWC"
def init_shape(self):
self.shape = [2, 7, 7, 7, 3]
#-------test padding algorithm: "SAME" and "VALID"------
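# "SAME" chooses pads so that out = ceil(in / stride) per spatial dim; the
# total pad, max((out - 1) * stride + ksize - in, 0), is split between begin
# and end, with the odd row/column going to the end side. "VALID" uses no
# padding: out = (in - ksize) // stride + 1. In both modes any explicit
# `paddings` value is ignored, which is why the factories below can leave
# arbitrary values such as [0, 0] or [1, 1] in place.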
def create_test_padding_SAME_class(parent):
    class TestPaddingSAMECase(parent):
        def init_paddings(self):
            self.paddings = [0, 0]
            self.padding_algorithm = "SAME"

    cls_name = "{0}_{1}".format(parent.__name__, "PaddingSAMEOp")
    TestPaddingSAMECase.__name__ = cls_name
    globals()[cls_name] = TestPaddingSAMECase
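# Registering the generated class in globals() puts it at module level, where
# unittest test discovery can find it; the other create_test_* factories in
# this file follow the same pattern.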
create_test_padding_SAME_class(TestPool3d_Op)
create_test_padding_SAME_class(TestCase1)
create_test_padding_SAME_class(TestCase2)
create_test_padding_SAME_class(TestCase3)
create_test_padding_SAME_class(TestCase4)
create_test_padding_SAME_class(TestCase5)
create_test_padding_SAME_class(TestPool3d_channel_last)
create_test_padding_SAME_class(TestCase1_channel_last)
create_test_padding_SAME_class(TestCase2_channel_last)
create_test_padding_SAME_class(TestCase3_channel_last)
create_test_padding_SAME_class(TestCase4_channel_last)
create_test_padding_SAME_class(TestCase5_channel_last)
def create_test_cudnn_padding_SAME_class(parent):
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
    class TestCUDNNPaddingSAMECase(parent):
        def init_kernel_type(self):
            self.use_cudnn = True

        def init_paddings(self):
            self.paddings = [1, 1]
            self.padding_algorithm = "SAME"

    cls_name = "{0}_{1}".format(parent.__name__, "CudnnPaddingSAMEOp")
    TestCUDNNPaddingSAMECase.__name__ = cls_name
    globals()[cls_name] = TestCUDNNPaddingSAMECase
create_test_cudnn_padding_SAME_class(TestPool3d_Op)
create_test_cudnn_padding_SAME_class(TestCase1)
create_test_cudnn_padding_SAME_class(TestCase2)
create_test_cudnn_padding_SAME_class(TestCase3)
create_test_cudnn_padding_SAME_class(TestCase4)
create_test_cudnn_padding_SAME_class(TestCase5)
create_test_cudnn_padding_SAME_class(TestPool3d_channel_last)
create_test_cudnn_padding_SAME_class(TestCase1_channel_last)
create_test_cudnn_padding_SAME_class(TestCase2_channel_last)
create_test_cudnn_padding_SAME_class(TestCase3_channel_last)
create_test_cudnn_padding_SAME_class(TestCase4_channel_last)
create_test_cudnn_padding_SAME_class(TestCase5_channel_last)
def create_test_padding_VALID_class(parent):
class TestPaddingVALIDCase(parent):
def init_paddings(self):
self.paddings = [1, 1]
self.padding_algorithm = "VALID"
cls_name = "{0}_{1}".format(parent.__name__, "PaddingVALIDOp")
TestPaddingVALIDCase.__name__ = cls_name
globals()[cls_name] = TestPaddingVALIDCase
create_test_padding_VALID_class(TestPool3d_Op)
create_test_padding_VALID_class(TestCase1)
create_test_padding_VALID_class(TestCase2)
create_test_padding_VALID_class(TestCase3)
create_test_padding_VALID_class(TestCase4)
create_test_padding_VALID_class(TestCase5)
create_test_padding_VALID_class(TestPool3d_channel_last)
create_test_padding_VALID_class(TestCase1_channel_last)
create_test_padding_VALID_class(TestCase2_channel_last)
create_test_padding_VALID_class(TestCase3_channel_last)
create_test_padding_VALID_class(TestCase4_channel_last)
create_test_padding_VALID_class(TestCase5_channel_last)
def create_test_cudnn_padding_VALID_class(parent):
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNNPaddingVALIDCase(parent):
def init_kernel_type(self):
self.use_cudnn = True
def init_paddings(self):
self.paddings = [1, 1]
self.padding_algorithm = "VALID"
cls_name = "{0}_{1}".format(parent.__name__, "CudnnPaddingVALIDOp")
TestCUDNNPaddingVALIDCase.__name__ = cls_name
globals()[cls_name] = TestCUDNNPaddingVALIDCase
create_test_cudnn_padding_VALID_class(TestPool3d_Op)
create_test_cudnn_padding_VALID_class(TestCase1)
create_test_cudnn_padding_VALID_class(TestCase2)
create_test_cudnn_padding_VALID_class(TestCase3)
create_test_cudnn_padding_VALID_class(TestCase4)
create_test_cudnn_padding_VALID_class(TestCase5)
create_test_cudnn_padding_VALID_class(TestPool3d_channel_last)
create_test_cudnn_padding_VALID_class(TestCase1_channel_last)
create_test_cudnn_padding_VALID_class(TestCase2_channel_last)
create_test_cudnn_padding_VALID_class(TestCase3_channel_last)
create_test_cudnn_padding_VALID_class(TestCase4_channel_last)
create_test_cudnn_padding_VALID_class(TestCase5_channel_last)
#test API
class TestPool3dAPI(OpTest):
def test_api(self):
x_NDHWC = np.random.random([2, 5, 5, 5, 3]).astype("float32")
x_NCDHW = np.random.random([2, 3, 5, 5, 5]).astype("float32")
input_NDHWC = fluid.layers.data(
name="input_NDHWC",
shape=[2, 5, 5, 5, 3],
append_batch_size=False,
dtype="float32")
input_NCDHW = fluid.layers.data(
name="input_NCDHW",
shape=[2, 3, 5, 5, 5],
append_batch_size=False,
dtype="float32")
ksize = [3, 3, 3]
out_1 = fluid.layers.pool3d(
input=input_NDHWC,
pool_size=ksize,
pool_type="max",
pool_padding=[1, 1, 1],
use_cudnn=False,
data_format="NDHWC")
out_2 = fluid.layers.pool3d(
input=input_NDHWC,
pool_size=ksize,
pool_type="avg",
pool_padding=[[0, 0], [1, 1], [1, 1], [1, 1], [0, 0]],
use_cudnn=False,
data_format="NDHWC")
out_3 = fluid.layers.pool3d(
input=input_NCDHW,
pool_size=ksize,
pool_type="avg",
pool_padding=[[0, 0], [0, 0], [1, 1], [1, 1], [1, 1]],
use_cudnn=False,
data_format="NCDHW")
out_4 = fluid.layers.pool3d(
input=input_NCDHW,
pool_size=ksize,
pool_type="avg",
pool_padding=[1, 2, 1, 0, 0, 1],
use_cudnn=False,
data_format="NCDHW")
# test VALID
out_5 = fluid.layers.pool3d(
input=input_NDHWC,
pool_size=ksize,
pool_type="avg",
pool_padding="VALID",
use_cudnn=False,
data_format="NDHWC")
out_6 = fluid.layers.pool3d(
input=input_NCDHW,
pool_size=ksize,
pool_type="avg",
pool_padding="VALID",
use_cudnn=False,
data_format="NCDHW")
# test SAME
out_7 = fluid.layers.pool3d(
input=input_NDHWC,
pool_size=ksize,
pool_type="avg",
pool_padding="SAME",
use_cudnn=False,
data_format="NDHWC")
out_8 = fluid.layers.pool3d(
input=input_NCDHW,
pool_size=[4, 4, 4],
pool_type="avg",
pool_padding="SAME",
use_cudnn=False,
data_format="NCDHW")
exe = fluid.Executor(place=fluid.CPUPlace())
[res_1, res_2, res_3, res_4, res_5, res_6, res_7, res_8] = exe.run(
fluid.default_main_program(),
feed={"input_NDHWC": x_NDHWC,
"input_NCDHW": x_NCDHW},
fetch_list=[
out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8
])
assert np.allclose(
res_1,
pool3D_forward_naive(
x=x_NDHWC,
ksize=ksize,
pool_type="max",
strides=[1, 1, 1],
paddings=[1, 1, 1],
data_format="NDHWC"))
assert np.allclose(
res_2,
pool3D_forward_naive(
x=x_NDHWC,
ksize=ksize,
pool_type="avg",
strides=[1, 1, 1],
paddings=[1, 1, 1, 1, 1, 1],
data_format="NDHWC"))
assert np.allclose(
res_3,
pool3D_forward_naive(
x=x_NCDHW,
ksize=ksize,
pool_type="avg",
strides=[1, 1, 1],
paddings=[1, 1, 1, 1, 1, 1],
data_format="NCDHW"),
rtol=0.07,
atol=1e-05)
assert np.allclose(
res_4,
pool3D_forward_naive(
x=x_NCDHW,
ksize=ksize,
pool_type="avg",
strides=[1, 1, 1],
paddings=[1, 2, 1, 0, 0, 1],
data_format="NCDHW"),
rtol=0.07,
atol=1e-05)
# VALID
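        # the [10, 20] value passed for `paddings` below is deliberately
        # meaningless: the naive reference derives its own pads once a
        # padding_algorithm is given, mirroring the op's behavior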
assert np.allclose(
res_5,
pool3D_forward_naive(
x=x_NDHWC,
ksize=ksize,
pool_type="avg",
strides=[1, 1, 1],
paddings=[10, 20],
padding_algorithm="VALID",
data_format="NDHWC"))
assert np.allclose(
res_6,
pool3D_forward_naive(
x=x_NCDHW,
ksize=ksize,
pool_type="avg",
strides=[1, 1, 1],
paddings=[10, 20],
padding_algorithm="VALID",
data_format="NCDHW"),
rtol=0.07,
atol=1e-05)
# SAME
assert np.allclose(
res_7,
pool3D_forward_naive(
x=x_NDHWC,
ksize=ksize,
pool_type="avg",
strides=[1, 1, 1],
paddings=[10, 20],
padding_algorithm="SAME",
data_format="NDHWC"))
assert np.allclose(
res_8,
pool3D_forward_naive(
x=x_NCDHW,
ksize=[4, 4, 4],
pool_type="avg",
strides=[1, 1, 1],
paddings=[10, 20],
padding_algorithm="SAME",
data_format="NCDHW"),
rtol=0.07,
atol=1e-05)
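# TestPool3dAPI above exercises every accepted `pool_padding` form: a 3-int
# list, a per-axis [begin, end] pair list (zero pads required on the batch and
# channel axes), a 6-int asymmetric list, and the "SAME"/"VALID" strings. The
# cases below check that malformed arguments raise ValueError.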
class TestPool3dAPI_Error(OpTest):
def test_api(self):
input_NDHWC = fluid.layers.data(
name="input_NDHWC",
shape=[2, 5, 5, 5, 3],
append_batch_size=False,
dtype="float32")
ksize = [3, 3, 3]
        # use_cudnn must be a bool: a non-bool value raises ValueError
def run_1():
out_1 = fluid.layers.pool3d(
input=input_NDHWC,
pool_size=ksize,
pool_type="max",
pool_padding=[1, 1, 1],
use_cudnn=[0],
data_format="NDHWC")
self.assertRaises(ValueError, run_1)
        # data_format must be "NCDHW" or "NDHWC": ValueError otherwise
def run_2():
out_2 = fluid.layers.pool3d(
input=input_NDHWC,
pool_size=ksize,
pool_type="max",
pool_padding=[1, 1, 1],
use_cudnn=False,
data_format="NDHWCC")
self.assertRaises(ValueError, run_2)
        # pool_padding string must be "SAME" or "VALID": ValueError otherwise
def run_3():
out_3 = fluid.layers.pool3d(
input=input_NDHWC,
pool_size=ksize,
pool_type="max",
pool_padding="VALIDSAME",
use_cudnn=False,
data_format="NDHWC")
self.assertRaises(ValueError, run_3)
        # pool_padding="VALID" combined with ceil_mode=True raises ValueError
def run_4():
out_4 = fluid.layers.pool3d(
input=input_NDHWC,
pool_size=ksize,
pool_type="max",
pool_padding="VALID",
use_cudnn=False,
ceil_mode=True,
data_format="NDHWC")
self.assertRaises(ValueError, run_4)
        # non-zero padding on the batch and channel axes raises ValueError
def run_5():
out_5 = fluid.layers.pool3d(
input=input_NDHWC,
pool_size=ksize,
pool_type="max",
pool_padding=[[1, 1], [0, 0], [0, 0], [1, 1], [1, 1]],
use_cudnn=False,
data_format="NDHWC")
self.assertRaises(ValueError, run_5)
if __name__ == '__main__':
unittest.main()