Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into my_maxout_op

350cc61f · sweetsky0901 · 4e5c9896 · 9db4d019 · 350cc61f · 350cc61f
10 changed file
--- a/benchmark/paddle/image/googlenet.py
+++ b/benchmark/paddle/image/googlenet.py
@@ -5,6 +5,7 @@ height = 224
 width = 224
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
+use_gpu = get_config_arg('use_gpu', bool, True)

 args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
 define_py_data_sources2(
@@ -16,6 +17,8 @@ settings(
    learning_method=MomentumOptimizer(0.9),
    regularization=L2Regularization(0.0005 * batch_size))

+conv_projection = conv_projection if use_gpu else img_conv_layer
+
 def inception2(name, input, channels, \
    filter1,
    filter3R, filter3,
@@ -138,7 +141,7 @@ def inception(name, input, channels, \
    cat = concat_layer(
        name=name,
        input=[cov1, cov3, cov5, covprj],
-        bias_attr=True,
+        bias_attr=True if use_gpu else False,
        act=ReluActivation())
    return cat


--- a/benchmark/paddle/image/run_mkldnn.sh
+++ b/benchmark/paddle/image/run_mkldnn.sh
@@ -40,6 +40,7 @@ fi
 for use_mkldnn in True False; do
  for batchsize in 64 128 256; do
    train vgg 19 $batchsize $use_mkldnn
-    train resnet 50  $batchsize $use_mkldnn
+    train resnet 50 $batchsize $use_mkldnn
+    train googlenet v1 $batchsize $use_mkldnn
  done
 done
--- a/paddle/operators/math/pooling.cc
+++ b/paddle/operators/math/pooling.cc
@@ -498,8 +498,8 @@ template class Pool3dGradFunctor<
 * Ksize, strides, paddings are two elements. These two elements represent
 * height and width, respectively.
 */
-template <typename T>
-class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool2dWithIndexFunctor<platform::CPUPlace, T1, T2> {
 public:
  void operator()(const platform::DeviceContext& context,
                  const framework::Tensor& input, std::vector<int>& ksize,
@@ -520,9 +520,9 @@ class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
    const int input_stride = input_height * input_width;
    const int output_stride = output_height * output_width;

-    const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    T* mask_data = mask->mutable_data<T>(context.GetPlace());
+    const T1* input_data = input.data<T1>();
+    T1* output_data = output->mutable_data<T1>(context.GetPlace());
+    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());

    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
@@ -535,7 +535,7 @@ class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
            int wend = std::min(wstart + ksize_width, input_width);
            wstart = std::max(wstart, 0);

-            T ele = static_cast<T>(-FLT_MAX);
+            T1 ele = static_cast<T1>(-FLT_MAX);
            int index = -1;
            for (int h = hstart; h < hend; ++h) {
              for (int w = wstart; w < wend; ++w) {
@@ -563,8 +563,8 @@ class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
 * Ksize, strides, paddings are two elements. These two elements represent
 * height and width, respectively.
 */
-template <typename T>
-class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T1, T2> {
 public:
  void operator()(const platform::DeviceContext& context,
                  const framework::Tensor& output_grad,
@@ -580,9 +580,9 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {
    const int input_stride = input_height * input_width;
    const int output_stride = output_height * output_width;

-    const T* mask_data = mask.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    const T2* mask_data = mask.data<T2>();
+    const T1* output_grad_data = output_grad.data<T1>();
+    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());

    for (int n = 0; n < batch_size; ++n) {
      for (int c = 0; c < output_channels; ++c) {
@@ -602,18 +602,18 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {
  }
 };

-template class MaxPool2dWithIndexFunctor<platform::CPUPlace, float>;
-template class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, float>;
-template class MaxPool2dWithIndexFunctor<platform::CPUPlace, double>;
-template class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, double>;
+template class MaxPool2dWithIndexFunctor<platform::CPUPlace, float, int>;
+template class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, float, int>;
+template class MaxPool2dWithIndexFunctor<platform::CPUPlace, double, int>;
+template class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, double, int>;

 /*
 * All tensors are in NCDHW format.
 * Ksize, strides, paddings are three elements. These three elements represent
 * depth, height and width, respectively.
 */
-template <typename T>
-class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool3dWithIndexFunctor<platform::CPUPlace, T1, T2> {
 public:
  void operator()(const platform::DeviceContext& context,
                  const framework::Tensor& input, std::vector<int>& ksize,
@@ -639,9 +639,9 @@ class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
    const int input_stride = input_depth * input_height * input_width;
    const int output_stride = output_depth * output_height * output_width;

-    const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    T* mask_data = mask->mutable_data<T>(context.GetPlace());
+    const T1* input_data = input.data<T1>();
+    T1* output_data = output->mutable_data<T1>(context.GetPlace());
+    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());

    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
@@ -659,7 +659,7 @@ class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
              wstart = std::max(wstart, 0);

              int output_idx = (pd * output_height + ph) * output_width + pw;
-              T ele = static_cast<T>(-FLT_MAX);
+              T1 ele = static_cast<T1>(-FLT_MAX);
              int index = -1;
              for (int d = dstart; d < dend; ++d) {
                for (int h = hstart; h < hend; ++h) {
@@ -691,8 +691,8 @@ class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
 * Ksize, strides, paddings are three elements. These three elements represent
 * depth, height and width, respectively.
 */
-template <typename T>
-class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T1, T2> {
 public:
  void operator()(const platform::DeviceContext& context,
                  const framework::Tensor& output_grad,
@@ -710,9 +710,9 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {
    const int input_stride = input_depth * input_height * input_width;
    const int output_stride = output_depth * output_height * output_width;

-    const T* mask_data = mask.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    const T2* mask_data = mask.data<T2>();
+    const T1* output_grad_data = output_grad.data<T1>();
+    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());

    for (int n = 0; n < batch_size; ++n) {
      for (int c = 0; c < output_channels; ++c) {
@@ -735,10 +735,10 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {
  }
 };

-template class MaxPool3dWithIndexFunctor<platform::CPUPlace, float>;
-template class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, float>;
-template class MaxPool3dWithIndexFunctor<platform::CPUPlace, double>;
-template class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, double>;
+template class MaxPool3dWithIndexFunctor<platform::CPUPlace, float, int>;
+template class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, float, int>;
+template class MaxPool3dWithIndexFunctor<platform::CPUPlace, double, int>;
+template class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, double, int>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/math/pooling.cu
+++ b/paddle/operators/math/pooling.cu
@@ -658,13 +658,13 @@ template class Pool3dGradFunctor<
 template class Pool3dGradFunctor<
    platform::GPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;

-template <typename T>
+template <typename T1, typename T2>
 __global__ void KernelMaxPool2dWithIdx(
-    const int nthreads, const T* input_data, const int channels,
+    const int nthreads, const T1* input_data, const int channels,
    const int input_height, const int input_width, const int output_height,
    const int output_width, const int ksize_height, const int ksize_width,
    const int stride_height, const int stride_width, const int padding_height,
-    const int padding_width, T* output_data, T* mask_data) {
+    const int padding_width, T1* output_data, T2* mask_data) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int pw = index % output_width;
@@ -681,7 +681,7 @@ __global__ void KernelMaxPool2dWithIdx(
    wstart = max(wstart, 0);

    input_data += (batch_idx * channels + c) * input_height * input_width;
-    T ele = -FLT_MAX;
+    T1 ele = -FLT_MAX;
    int max_index = -1;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
@@ -697,13 +697,13 @@ __global__ void KernelMaxPool2dWithIdx(
  }
 }

-template <typename T>
+template <typename T1, typename T2>
 __global__ void KernelMaxPool2DWithIdxGrad(
-    const int nthreads, const T* output_grad, const T* mask_data,
+    const int nthreads, const T1* output_grad, const T2* mask_data,
    const int channels, const int input_height, const int input_width,
    const int output_height, const int output_width, const int ksize_height,
    const int ksize_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width, T* input_grad) {
+    const int padding_height, const int padding_width, T1* input_grad) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int w_offset = index % input_width;
@@ -724,7 +724,7 @@ __global__ void KernelMaxPool2DWithIdxGrad(
    int pw_end =
        min((w_offset + padding_width) / stride_width + 1, output_width);

-    T gradient = 0;
+    T1 gradient = 0;
    int input_current_featuremap_idx = h_offset * input_width + w_offset;
    int output_idx =
        (batch_idx * channels + c_offset) * output_height * output_width;
@@ -746,8 +746,8 @@ __global__ void KernelMaxPool2DWithIdxGrad(
 * Ksize, strides, paddings are two elements. These two elements represent
 * height and width, respectively.
 */
-template <typename T>
-class MaxPool2dWithIndexFunctor<platform::GPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool2dWithIndexFunctor<platform::GPUPlace, T1, T2> {
 public:
  void operator()(const platform::DeviceContext& context,
                  const framework::Tensor& input, std::vector<int>& ksize,
@@ -767,9 +767,9 @@ class MaxPool2dWithIndexFunctor<platform::GPUPlace, T> {
    const int padding_height = paddings[0];
    const int padding_width = paddings[1];

-    const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    T* mask_data = mask->mutable_data<T>(context.GetPlace());
+    const T1* input_data = input.data<T1>();
+    T1* output_data = output->mutable_data<T1>(context.GetPlace());
+    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());

    int nthreads = batch_size * output_channels * output_height * output_width;
    int blocks = (nthreads + 1024 - 1) / 1024;
@@ -777,9 +777,9 @@ class MaxPool2dWithIndexFunctor<platform::GPUPlace, T> {
    dim3 grid(blocks, 1);

    KernelMaxPool2dWithIdx<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(
+        T1, T2><<<grid, threads, 0,
+                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                      .stream()>>>(
        nthreads, input_data, input_channels, input_height, input_width,
        output_height, output_width, ksize_height, ksize_width, stride_height,
        stride_width, padding_height, padding_width, output_data, mask_data);
@@ -791,8 +791,8 @@ class MaxPool2dWithIndexFunctor<platform::GPUPlace, T> {
 * Ksize, strides, paddings are two elements. These two elements represent
 * height and width, respectively.
 */
-template <typename T>
-class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T1, T2> {
 public:
  void operator()(const platform::DeviceContext& context,
                  const framework::Tensor& output_grad,
@@ -812,9 +812,9 @@ class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T> {
    const int padding_height = paddings[0];
    const int padding_width = paddings[1];

-    const T* mask_data = mask.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    const T2* mask_data = mask.data<T2>();
+    const T1* output_grad_data = output_grad.data<T1>();
+    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());

    int nthreads = batch_size * input_channels * input_height * input_width;
    int blocks = (nthreads + 1024 - 1) / 1024;
@@ -822,30 +822,30 @@ class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T> {
    dim3 grid(blocks, 1);

    KernelMaxPool2DWithIdxGrad<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(nthreads, output_grad_data, mask_data,
-                              input_channels, input_height, input_width,
-                              output_height, output_width, ksize_height,
-                              ksize_width, stride_height, stride_width,
-                              padding_height, padding_width, input_grad_data);
+        T1, T2><<<grid, threads, 0,
+                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                      .stream()>>>(
+        nthreads, output_grad_data, mask_data, input_channels, input_height,
+        input_width, output_height, output_width, ksize_height, ksize_width,
+        stride_height, stride_width, padding_height, padding_width,
+        input_grad_data);
  }
 };

-template class MaxPool2dWithIndexFunctor<platform::GPUPlace, float>;
-template class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, float>;
-template class MaxPool2dWithIndexFunctor<platform::GPUPlace, double>;
-template class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, double>;
+template class MaxPool2dWithIndexFunctor<platform::GPUPlace, float, int>;
+template class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, float, int>;
+template class MaxPool2dWithIndexFunctor<platform::GPUPlace, double, int>;
+template class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, double, int>;

-template <typename T>
+template <typename T1, typename T2>
 __global__ void KernelMaxPool3DWithIdx(
-    const int nthreads, const T* input_data, const int channels,
+    const int nthreads, const T1* input_data, const int channels,
    const int input_depth, const int input_height, const int input_width,
    const int output_depth, const int output_height, const int output_width,
    const int ksize_depth, const int ksize_height, const int ksize_width,
    const int stride_depth, const int stride_height, const int stride_width,
    const int padding_depth, const int padding_height, const int padding_width,
-    T* output_data, T* mask_data) {
+    T1* output_data, T2* mask_data) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int pw = index % output_width;
@@ -865,7 +865,7 @@ __global__ void KernelMaxPool3DWithIdx(
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);

-    T ele = -FLT_MAX;
+    T1 ele = -FLT_MAX;
    int max_index = -1;
    input_data +=
        (batch_idx * channels + c) * input_depth * input_height * input_width;
@@ -885,15 +885,15 @@ __global__ void KernelMaxPool3DWithIdx(
  }
 }

-template <typename T>
+template <typename T1, typename T2>
 __global__ void KernelMaxPool3DWithIdxGrad(
-    const int nthreads, const T* output_grad, const T* mask, const int channels,
-    const int input_depth, const int input_height, const int input_width,
-    const int output_depth, const int output_height, const int output_width,
-    const int ksize_depth, const int ksize_height, const int ksize_width,
-    const int stride_depth, const int stride_height, const int stride_width,
-    const int padding_depth, const int padding_height, const int padding_width,
-    T* input_grad) {
+    const int nthreads, const T1* output_grad, const T2* mask,
+    const int channels, const int input_depth, const int input_height,
+    const int input_width, const int output_depth, const int output_height,
+    const int output_width, const int ksize_depth, const int ksize_height,
+    const int ksize_width, const int stride_depth, const int stride_height,
+    const int stride_width, const int padding_depth, const int padding_height,
+    const int padding_width, T1* input_grad) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int w_offset = index % input_width;
@@ -922,7 +922,7 @@ __global__ void KernelMaxPool3DWithIdxGrad(
    int pw_end =
        min((w_offset + padding_width) / stride_width + 1, output_width);

-    T gradient = 0;
+    T1 gradient = 0;
    int input_current_feature_map_idx =
        (d_offset * input_height + h_offset) * input_width + w_offset;
    int output_idx = (batch_idx * channels + c_offset) * output_depth *
@@ -949,8 +949,8 @@ __global__ void KernelMaxPool3DWithIdxGrad(
 * Ksize, strides, paddings are three elements. These three elements represent
 * depth, height and width, respectively.
 */
-template <typename T>
-class MaxPool3dWithIndexFunctor<platform::GPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool3dWithIndexFunctor<platform::GPUPlace, T1, T2> {
 public:
  void operator()(const platform::DeviceContext& context,
                  const framework::Tensor& input, std::vector<int>& ksize,
@@ -975,9 +975,9 @@ class MaxPool3dWithIndexFunctor<platform::GPUPlace, T> {
    const int padding_height = paddings[1];
    const int padding_width = paddings[2];

-    const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    T* mask_data = mask->mutable_data<T>(context.GetPlace());
+    const T1* input_data = input.data<T1>();
+    T1* output_data = output->mutable_data<T1>(context.GetPlace());
+    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());

    int nthreads = batch_size * output_channels * output_depth * output_height *
                   output_width;
@@ -986,9 +986,9 @@ class MaxPool3dWithIndexFunctor<platform::GPUPlace, T> {
    dim3 grid(blocks, 1);

    KernelMaxPool3DWithIdx<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(
+        T1, T2><<<grid, threads, 0,
+                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                      .stream()>>>(
        nthreads, input_data, input_channels, input_depth, input_height,
        input_width, output_depth, output_height, output_width, ksize_depth,
        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
@@ -1001,8 +1001,8 @@ class MaxPool3dWithIndexFunctor<platform::GPUPlace, T> {
 * Ksize, strides, paddings are three elements. These three elements represent
 * depth, height and width, respectively.
 */
-template <typename T>
-class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T1, T2> {
 public:
  void operator()(const platform::DeviceContext& context,
                  const framework::Tensor& output_grad,
@@ -1027,9 +1027,9 @@ class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T> {
    const int padding_height = paddings[1];
    const int padding_width = paddings[2];

-    const T* output_grad_data = output_grad.data<T>();
-    const T* mask_data = mask.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    const T1* output_grad_data = output_grad.data<T1>();
+    const T2* mask_data = mask.data<T2>();
+    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());

    int nthreads =
        batch_size * input_channels * input_depth * input_height * input_width;
@@ -1038,9 +1038,9 @@ class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T> {
    dim3 grid(blocks, 1);

    KernelMaxPool3DWithIdxGrad<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(
+        T1, T2><<<grid, threads, 0,
+                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                      .stream()>>>(
        nthreads, output_grad_data, mask_data, input_channels, input_depth,
        input_height, input_width, output_depth, output_height, output_width,
        ksize_depth, ksize_height, ksize_width, stride_depth, stride_height,
@@ -1049,10 +1049,10 @@ class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T> {
  }
 };

-template class MaxPool3dWithIndexFunctor<platform::GPUPlace, float>;
-template class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, float>;
-template class MaxPool3dWithIndexFunctor<platform::GPUPlace, double>;
-template class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, double>;
+template class MaxPool3dWithIndexFunctor<platform::GPUPlace, float, int>;
+template class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, float, int>;
+template class MaxPool3dWithIndexFunctor<platform::GPUPlace, double, int>;
+template class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, double, int>;

 }  // namespace math
 }  // namespace operators

--- a/paddle/operators/math/pooling.h
+++ b/paddle/operators/math/pooling.h
@@ -153,7 +153,7 @@ class MaxPool3dGradFunctor {
 * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in
 * NCDHW format.
 */
-template <typename Place, typename T>
+template <typename Place, typename T1, typename T2>
 class MaxPool2dWithIndexFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
@@ -162,7 +162,7 @@ class MaxPool2dWithIndexFunctor {
                  framework::Tensor* output, framework::Tensor* mask);
 };

-template <typename Place, typename T>
+template <typename Place, typename T1, typename T2>
 class MaxPool2dWithIndexGradFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
@@ -172,7 +172,7 @@ class MaxPool2dWithIndexGradFunctor {
                  framework::Tensor* input_grad);
 };

-template <typename Place, typename T>
+template <typename Place, typename T1, typename T2>
 class MaxPool3dWithIndexFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
@@ -181,7 +181,7 @@ class MaxPool3dWithIndexFunctor {
                  framework::Tensor* output, framework::Tensor* mask);
 };

-template <typename Place, typename T>
+template <typename Place, typename T1, typename T2>
 class MaxPool3dWithIndexGradFunctor {
 public:
  void operator()(const platform::DeviceContext& context,

--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
@@ -29,11 +29,11 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {

  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "X(Input) of Pooling should not be null.");
+                   "Input(X) of Pooling should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Out(Output) of Pooling should not be null.");
+                   "Output(Out) of Pooling should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Mask"),
-                   "Mask(Output) of Pooling should not be null.");
+                   "Output(Mask) of Pooling should not be null.");

    auto in_x_dims = ctx->GetInputDim("X");

@@ -67,6 +67,14 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
    ctx->SetOutputDim("Mask", framework::make_ddim(output_shape));
  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
 };

 class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
@@ -80,6 +88,14 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
                   "Input(X@GRAD) should not be null.");
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
 };

 class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -116,7 +132,7 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
    // TypedAttrChecker don't support vector type.)
    AddAttr<bool>(
        "global_pooling",
-        "(bool, default false) Whether to use the global pooling. "
+        "(bool, default:false) Whether to use the global pooling. "
        "If global_pooling = true, ksize and paddings will be ignored.")
        .SetDefault(false);
    AddAttr<std::vector<int>>("strides",
@@ -126,7 +142,7 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
    // TypedAttrChecker don't support vector type.)
    AddAttr<std::vector<int>>(
        "paddings",
-        "(vector<int>, defalut {0, 0}), paddings(height, width) of pooling "
+        "(vector<int>, defalut:{0, 0}), paddings(height, width) of pooling "
        "operator. "
        "If global_pooling = true, paddings and will be ignored.")
        .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
@@ -250,12 +266,12 @@ REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp,

 REGISTER_OP_CPU_KERNEL(
    max_pool2d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, float>,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, double>);
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, float, int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, double, int>);
 REGISTER_OP_CPU_KERNEL(
    max_pool2d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, float>,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, double>)
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, float, int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, double, int>)

 REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
            ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad,
@@ -263,9 +279,9 @@ REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp,

 REGISTER_OP_CPU_KERNEL(
    max_pool3d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, float>,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, double>);
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, float, int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, double, int>);
 REGISTER_OP_CPU_KERNEL(
    max_pool3d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, float>,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, double>)
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, float, int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, double, int>)
--- a/paddle/operators/pool_with_index_op.cu.cc
+++ b/paddle/operators/pool_with_index_op.cu.cc
@@ -18,18 +18,18 @@ namespace ops = paddle::operators;

 REGISTER_OP_GPU_KERNEL(
    max_pool2d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, float>,
-    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, double>);
+    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, float, int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, double, int>);
 REGISTER_OP_GPU_KERNEL(
    max_pool2d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, float>,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, double>)
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, float, int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, double, int>)

 REGISTER_OP_GPU_KERNEL(
    max_pool3d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, float>,
-    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, double>);
+    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, float, int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, double, int>);
 REGISTER_OP_GPU_KERNEL(
    max_pool3d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, float>,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, double>)
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, float, int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, double, int>)
--- a/paddle/operators/pool_with_index_op.h
+++ b/paddle/operators/pool_with_index_op.h
@@ -24,8 +24,8 @@ namespace operators {

 using Tensor = framework::Tensor;

-template <typename Place, typename T>
-class MaxPoolWithIndexKernel : public framework::OpKernel<T> {
+template <typename Place, typename T1, typename T2>
+class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* in_x = context.Input<Tensor>("X");
@@ -44,13 +44,13 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T> {

    switch (ksize.size()) {
      case 2: {
-        paddle::operators::math::MaxPool2dWithIndexFunctor<Place, T>
+        paddle::operators::math::MaxPool2dWithIndexFunctor<Place, T1, T2>
            pool2d_forward;
        pool2d_forward(context.device_context(), *in_x, ksize, strides,
                       paddings, out, mask);
      } break;
      case 3: {
-        paddle::operators::math::MaxPool3dWithIndexFunctor<Place, T>
+        paddle::operators::math::MaxPool3dWithIndexFunctor<Place, T1, T2>
            pool3d_forward;
        pool3d_forward(context.device_context(), *in_x, ksize, strides,
                       paddings, out, mask);
@@ -60,8 +60,8 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T> {
  }
 };

-template <typename Place, typename T>
-class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> {
+template <typename Place, typename T1, typename T2>
+class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* mask = context.Input<Tensor>("Mask");
@@ -80,19 +80,19 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> {
    }

    if (in_x_grad) {
-      in_x_grad->mutable_data<T>(context.GetPlace());
+      in_x_grad->mutable_data<T1>(context.GetPlace());
      auto& device_ctx = context.device_context();
      math::set_constant(device_ctx, in_x_grad, 0);

      switch (ksize.size()) {
        case 2: {
-          paddle::operators::math::MaxPool2dWithIndexGradFunctor<Place, T>
+          paddle::operators::math::MaxPool2dWithIndexGradFunctor<Place, T1, T2>
              pool2d_backward;
          pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides,
                          paddings, in_x_grad);
        } break;
        case 3: {
-          paddle::operators::math::MaxPool3dWithIndexGradFunctor<Place, T>
+          paddle::operators::math::MaxPool3dWithIndexGradFunctor<Place, T1, T2>
              pool3d_backward;
          pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides,
                          paddings, in_x_grad);

--- a/python/paddle/v2/fluid/layers.py
+++ b/python/paddle/v2/fluid/layers.py
@@ -661,7 +661,7 @@ def conv2d(input,
    if groups is None:
        num_filter_channels = num_channels
    else:
-        if num_channels % groups is not 0:
+        if num_channels % groups != 0:
            raise ValueError("num_channels must be divisible by groups.")
        num_filter_channels = num_channels / groups


--- a/python/paddle/v2/fluid/tests/test_pool_max_op.py
+++ b/python/paddle/v2/fluid/tests/test_pool_max_op.py
@@ -3,11 +3,13 @@ import numpy as np
 from op_test import OpTest


-def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
+def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False):

    N, C, D, H, W = x.shape
-    if global_pool == 1:
+    if global_pool:
        ksize = [D, H, W]
+        paddings = [0, 0, 0]
+
    D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
    H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
    W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
@@ -40,11 +42,13 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
    return out, mask


-def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
+def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False):

    N, C, H, W = x.shape
-    if global_pool == 1:
+    if global_pool:
        ksize = [H, W]
+        paddings = [0, 0]
+
    H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
    W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
    out = np.zeros((N, C, H_out, W_out))
@@ -74,13 +78,13 @@ def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
 class TestMaxPoolWithIndex_Op(OpTest):
    def setUp(self):
        self.init_test_case()
-        if self.global_pool:
-            self.paddings = [0 for _ in range(len(self.paddings))]
+        self.init_global()
+
        input = np.random.random(self.shape).astype("float32")
        output, mask = self.pool_forward_naive(input, self.ksize, self.strides,
                                               self.paddings, self.global_pool)
        output = output.astype("float32")
-        mask = mask.astype("float32")
+        mask = mask.astype("int32")

        self.attrs = {
            'strides': self.strides,
@@ -99,41 +103,24 @@ class TestMaxPoolWithIndex_Op(OpTest):
    #     self.check_grad(set(['X']), ['Out'], max_relative_error=0.07)

    def init_test_case(self):
-        self.global_pool = True
-        self.index = "max_pool3d_with_index"
-        self.op_type = "%s" % self.index
+        self.op_type = "max_pool3d_with_index"
        self.pool_forward_naive = max_pool3D_forward_naive
        self.shape = [2, 3, 5, 5, 5]
        self.ksize = [3, 3, 3]
        self.strides = [1, 1, 1]
        self.paddings = [1, 1, 1]

+    def init_global(self):
+        self.global_pool = False
+

 class TestCase1(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
+    def init_global(self):
        self.global_pool = True
-        self.op_type = "max_pool3d_with_index"
-        self.pool_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 5, 5, 5]
-        self.ksize = [3, 3, 3]
-        self.strides = [1, 1, 1]
-        self.paddings = [1, 1, 1]


 class TestCase2(TestMaxPoolWithIndex_Op):
    def init_test_case(self):
-        self.global_pool = False
-        self.op_type = "max_pool3d_with_index"
-        self.pool_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 7, 7, 7]
-        self.ksize = [3, 3, 3]
-        self.strides = [1, 1, 1]
-        self.paddings = [1, 1, 1]
-
-
-class TestCase3(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
-        self.global_pool = False
        self.op_type = "max_pool3d_with_index"
        self.pool_forward_naive = max_pool3D_forward_naive
        self.shape = [2, 3, 7, 7, 7]
@@ -141,32 +128,18 @@ class TestCase3(TestMaxPoolWithIndex_Op):
        self.strides = [2, 2, 2]
        self.paddings = [0, 0, 0]

-
-class TestCase4(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
+    def init_global(self):
        self.global_pool = True
-        self.op_type = "max_pool3d_with_index"
-        self.pool_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 5, 5, 5]
-        self.ksize = [3, 3, 3]
-        self.strides = [1, 1, 1]
-        self.paddings = [1, 1, 1]


-class TestCase5(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
-        self.global_pool = True
-        self.op_type = "max_pool3d_with_index"
-        self.pool_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 5, 5, 5]
-        self.ksize = [3, 3, 3]
-        self.strides = [2, 2, 2]
-        self.paddings = [0, 0, 0]
+class TestCase3(TestCase2):
+    def init_global(self):
+        self.global_pool = False


-class TestCase6(TestMaxPoolWithIndex_Op):
+#----------------max_pool2d_with_index----------------
+class TestCase4(TestMaxPoolWithIndex_Op):
    def init_test_case(self):
-        self.global_pool = False
        self.op_type = "max_pool2d_with_index"
        self.pool_forward_naive = max_pool2D_forward_naive
        self.shape = [2, 3, 7, 7]
@@ -174,10 +147,17 @@ class TestCase6(TestMaxPoolWithIndex_Op):
        self.strides = [1, 1]
        self.paddings = [1, 1]

+    def init_global(self):
+        self.global_pool = True
+

-class TestCase7(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
+class TestCase5(TestCase4):
+    def init_global(self):
        self.global_pool = False
+
+
+class TestCase6(TestMaxPoolWithIndex_Op):
+    def init_test_case(self):
        self.op_type = "max_pool2d_with_index"
        self.pool_forward_naive = max_pool2D_forward_naive
        self.shape = [2, 3, 7, 7]
@@ -185,27 +165,13 @@ class TestCase7(TestMaxPoolWithIndex_Op):
        self.strides = [2, 2]
        self.paddings = [0, 0]

-
-class TestCase8(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
+    def init_global(self):
        self.global_pool = True
-        self.op_type = "max_pool2d_with_index"
-        self.pool_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 5, 5]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [1, 1]


-class TestCase9(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
-        self.global_pool = True
-        self.op_type = "max_pool2d_with_index"
-        self.pool_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 5, 5]
-        self.ksize = [3, 3]
-        self.strides = [2, 2]
-        self.paddings = [0, 0]
+class TestCase7(TestCase6):
+    def init_global(self):
+        self.global_pool = False


 if __name__ == '__main__':