From 191fafe355717c506663b5ddd13d8a18ae944924 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Sun, 9 Oct 2016 13:53:30 +0800 Subject: [PATCH] support rectangle padding, stride, window and input for PoolProjection (#115) * support rectangle padding, stride, window and input for PoolProjection * Follow comments. 1. Remove start 2. refine img_pool_a/b.conf for test_NetworkCompare 3. Split unit test * Modify the test in img_layers.py --- paddle/cuda/include/hl_cnn.h | 88 ++++--- paddle/cuda/include/stub/hl_cnn_stub.h | 48 ++-- paddle/cuda/src/hl_cuda_cnn.cu | 237 +++++++++++------- paddle/gserver/layers/CudnnPoolLayer.cpp | 1 - paddle/gserver/layers/CudnnPoolLayer.h | 10 - paddle/gserver/layers/PoolLayer.cpp | 17 -- paddle/gserver/layers/PoolLayer.h | 12 +- paddle/gserver/layers/PoolProjectionLayer.cpp | 28 ++- paddle/gserver/tests/img_pool_a.conf | 46 ++++ paddle/gserver/tests/img_pool_b.conf | 44 ++++ paddle/gserver/tests/test_LayerGrad.cpp | 36 +-- paddle/gserver/tests/test_NetworkCompare.cpp | 14 ++ paddle/math/Matrix.cpp | 155 +++++++----- paddle/math/Matrix.h | 79 ++++-- paddle/math/tests/test_matrixCompare.cpp | 153 +++++++++++ proto/ModelConfig.proto.m4 | 3 +- python/paddle/trainer/config_parser.py | 25 +- .../paddle/trainer_config_helpers/layers.py | 43 +++- .../paddle/trainer_config_helpers/networks.py | 12 +- .../paddle/trainer_config_helpers/poolings.py | 23 +- .../tests/configs/check.md5 | 2 +- .../tests/configs/img_layers.py | 6 +- 22 files changed, 757 insertions(+), 325 deletions(-) create mode 100644 paddle/gserver/tests/img_pool_a.conf create mode 100644 paddle/gserver/tests/img_pool_b.conf diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h index dcae62d06b2..5d750333e1e 100644 --- a/paddle/cuda/include/hl_cnn.h +++ b/paddle/cuda/include/hl_cnn.h @@ -84,16 +84,23 @@ extern void hl_expand_feature2col( * @param[in] width image width. * @param[in] pooledH output image height. * @param[in] pooledW output image width. - * @param[in] sizeX size of pooling window. - * @param[in] stride pooling stride. - * @param[in] start pooling start. + * @param[in] sizeX width of pooling window. + * @param[in] sizeY height of pooling window. + * @param[in] strideH pooling stride height. + * @param[in] strideW pooling stride width. + * @param[in] paddingH padding height. + * @param[in] paddingW padding width. * @param[out] tgtData output data. * */ extern void hl_maxpool_forward( - int frameCnt, const real* inputData, int channels, - int height, int width, int pooledH, int pooledW, - int sizeX, int stride, int start, real* tgtData); + const int frameCnt, const real* inputData, + const int channels, + const int height, const int width, + const int pooledH, const int pooledW, + const int sizeX, const int sizeY, + const int strideH, const int strideW, + const int paddingH, const int paddingW, real* tgtData); /** * @brief Maximum pool backward. @@ -107,21 +114,28 @@ extern void hl_maxpool_forward( * @param[in] width image width. * @param[in] pooledH output image height. * @param[in] pooledW output image width. - * @param[in] sizeX size of pooling window. - * @param[in] stride pooling stride. - * @param[in] start pooling start. - * @param[out] targetGrad output grad. + * @param[in] sizeX width of pooling window. + * @param[in] sizeY height of pooling window. + * @param[in] strideH pooling stride height. + * @param[in] strideW pooling stride width. * @param[in] scaleA scale. * @param[in] scaleB scale. + * @param[in] paddingH padding height. + * @param[in] paddingW padding width. 
+ * @param[out] targetGrad  output grad.
 *
 */
extern void hl_maxpool_backward(
-    int frameCnt, const real* inputData,
+    const int frameCnt, const real* inputData,
     const real* outData, const real* outGrad,
-    int channels, int height, int width,
-    int pooledH, int pooledW, int sizeX,
-    int stride, int start, real* targetGrad,
-    real scaleA, real scaleB);
+    const int channels, const int height,
+    const int width,
+    const int pooledH, const int pooledW,
+    const int sizeX, const int sizeY,
+    const int strideH, const int strideW,
+    const int paddingH, const int paddingW,
+    real scaleA, real scaleB,
+    real* targetGrad);

/**
 * @brief Average pool forward.
 *
 * @param[in] frameCnt  batch size of input image.
 * @param[in] inputData input data.
 * @param[in] channels  number of channel.
 * @param[in] height    image height.
 * @param[in] width     image width.
 * @param[in] pooledH   output image height.
 * @param[in] pooledW   output image width.
- * @param[in] sizeX     size of pooling window.
- * @param[in] stride    pooling stride.
- * @param[in] start     pooling start.
+ * @param[in] sizeX     width of pooling window.
+ * @param[in] sizeY     height of pooling window.
+ * @param[in] strideH   pooling stride height.
+ * @param[in] strideW   pooling stride width.
+ * @param[in] paddingH  padding height.
+ * @param[in] paddingW  padding width.
 * @param[out] tgtData  output data.
 *
 */
extern void hl_avgpool_forward(
-    int frameCnt, const real* inputData, int channels,
-    int height, int width, int pooledH, int pooledW,
-    int sizeX, int stride, int start, real* tgtData);
+    const int frameCnt, const real* inputData,
+    const int channels,
+    const int height, const int width,
+    const int pooledH, const int pooledW,
+    const int sizeX, const int sizeY,
+    const int strideH, const int strideW,
+    const int paddingH, const int paddingW, real* tgtData);

/**
 * @brief Average pool backward.
 *
 * @param[in] frameCnt  batch size of input image.
 * @param[in] outGrad   output grad data.
 * @param[in] channels  number of channel.
 * @param[in] height    image height.
 * @param[in] width     image width.
 * @param[in] pooledH   output image height.
 * @param[in] pooledW   output image width.
- * @param[in] sizeX     size of pooling window.
- * @param[in] stride    pooling stride.
- * @param[in] start     pooling start.
- * @param[out] backGrad output grad.
+ * @param[in] sizeX     width of pooling window.
+ * @param[in] sizeY     height of pooling window.
+ * @param[in] strideH   pooling stride height.
+ * @param[in] strideW   pooling stride width.
+ * @param[in] paddingH  padding height.
+ * @param[in] paddingW  padding width.
 * @param[in] scaleA    scale.
 * @param[in] scaleB    scale.
+ * @param[out] backGrad output grad.
 *
 */
extern void hl_avgpool_backward(
-    int frameCnt, const real* outGrad,
-    int channels, int height, int width,
-    int pooledH, int pooledW, int sizeX,
-    int stride, int start, real* backGrad,
-    real scaleA, real scaleB);
+    const int frameCnt, const real* outGrad,
+    const int channels, const int height,
+    const int width,
+    const int pooledH, const int pooledW,
+    const int sizeX, const int sizeY,
+    const int strideH, const int strideW,
+    int paddingH, int paddingW,
+    real scaleA, real scaleB,
+    real* backGrad);

/**
 * @brief Cross-map-response normalize forward.
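A minimal call sketch of the updated interface, for orientation only; inputData and
tgtData stand in for device buffers and are not part of the patch:

    // 2 frames of 16 8x8 feature maps, pooled with a 3x5 window
    // (sizeX = 3, sizeY = 5), strides (strideH, strideW) = (2, 1) and
    // paddings (paddingH, paddingW) = (2, 1); the outputSize formula in
    // PoolLayer.h gives pooledH = 5 and pooledW = 8.
    hl_maxpool_forward(2, inputData, 16, 8, 8, 5, 8,
                       3, 5, 2, 1, 2, 1, tgtData);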
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h index e4d46e4fb18..38e359c3eb2 100644 --- a/paddle/cuda/include/stub/hl_cnn_stub.h +++ b/paddle/cuda/include/stub/hl_cnn_stub.h @@ -38,29 +38,45 @@ inline void hl_expand_feature2col( real* dataCol) {} inline void hl_maxpool_forward( - int frameCnt, const real* inputData, int channels, - int height, int width, int pooledH, int pooledW, - int sizeX, int stride, int start, real* tgtData) {} + const int frameCnt, const real* inputData, + const int channels, + const int height, const int width, + const int pooledH, const int pooledW, + const int sizeX, const int sizeY, + const int strideH, const int strideW, + const int paddingH, const int paddingW, real* tgtData) {} inline void hl_maxpool_backward( - int frameCnt, const real* inputData, + const int frameCnt, const real* inputData, const real* outData, const real* outGrad, - int channels, int height, int width, - int pooledH, int pooledW, int sizeX, - int stride, int start, real* targetGrad, - real scaleA, real scaleB) {} + const int channels, const int height, + const int width, + const int pooledH, const int pooledW, + const int sizeX, const int sizeY, + const int strideH, const int strideW, + const int paddingH, const int paddingW, + real scaleA, real scaleB, + real* targetGrad) {} inline void hl_avgpool_forward( - int frameCnt, const real* inputData, int channels, - int height, int width, int pooledH, int pooledW, - int sizeX, int stride, int start, real* tgtData) {} + const int frameCnt, const real* inputData, + const int channels, + const int height, const int width, + const int pooledH, const int pooledW, + const int sizeX, const int sizeY, + const int strideH, const int strideW, + const int paddingH, const int paddingW, real* tgtData) {} inline void hl_avgpool_backward( - int frameCnt, const real* outGrad, - int channels, int height, int width, - int pooledH, int pooledW, int sizeX, - int stride, int start, real* backGrad, - real scaleA, real scaleB) {} + const int frameCnt, const real* outGrad, + const int channels, const int height, + const int width, + const int pooledH, const int pooledW, + const int sizeX, const int sizeY, + const int strideH, const int strideW, + int paddingH, int paddingW, + real scaleA, real scaleB, + real* backGrad) {} inline void hl_CMRNorm_forward( size_t frameCnt, const real* in, real* scale, real* out, diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index b3695a2c7f8..abac83a3e04 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -145,24 +145,28 @@ void hl_shrink_col2feature(const real * dataCol, size_t channels, CHECK_SYNC("hl_shrink_col2feature failed"); } -__global__ void KeMaxPoolForward(int nthreads, const real* inputData, - int channels, int height, int width, - int pooledH, int pooledW, - int ksize, int stride, int start, +__global__ void KeMaxPoolForward(const int nthreads, const real* inputData, + const int channels, const int height, + const int width, + const int pooledH, const int pooledW, + const int ksizeW, const int ksizeH, + const int strideH, const int strideW, + const int offsetH, const int offsetW, real* tgtData) { - int index = blockIdx.y * blockDim.x + threadIdx.x; + int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int pw = index % pooledW; int ph = (index / pooledW) % pooledH; int c = (index / pooledW / pooledH) % channels; - int frameNum = blockIdx.x; - int hstart = ph * stride + start; - int hend = 
min(hstart + ksize, height); - int wstart = pw * stride + start; - int wend = min(wstart + ksize, width); + int frameNum = index / pooledW / pooledH / channels; + int hstart = ph * strideH - offsetH; + int wstart = pw * strideW - offsetW; + int hend = min(hstart + ksizeH, height); + int wend = min(wstart + ksizeW, width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); real maxval = -FLT_MAX; inputData += (frameNum * channels + c) * height * width; - tgtData += (frameNum * channels) * pooledW * pooledH; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { if (maxval < inputData[h * width + w]) @@ -173,44 +177,54 @@ __global__ void KeMaxPoolForward(int nthreads, const real* inputData, } } -void hl_maxpool_forward(int frameCnt, const real* inputData, int channels, - int height, int width, int pooledH, int pooledW, - int sizeX, int stride, int start, real* tgtData) { - int num_kernels = pooledH * pooledW * channels; - int blocksX = frameCnt; - int blocksY = (num_kernels + 1024 -1) / 1024; +void hl_maxpool_forward(const int frameCnt, const real* inputData, + const int channels, + const int height, const int width, + const int pooledH, const int pooledW, + const int sizeX, const int sizeY, + const int strideH, const int strideW, + const int paddingH, const int paddingW, + real* tgtData) { + + int num_kernels = pooledH * pooledW * channels * frameCnt; + int blocks = (num_kernels + 1024 - 1) / 1024; dim3 threads(1024, 1); - dim3 grid(blocksX, blocksY); + dim3 grid(blocks, 1); + KeMaxPoolForward<<< grid, threads, 0, STREAM_DEFAULT >>> (num_kernels, inputData, channels, height, width, - pooledH, pooledW, sizeX, stride, start, tgtData); + pooledH, pooledW, sizeX, sizeY, strideH, strideW, + paddingH, paddingW, tgtData); CHECK_SYNC("hl_maxpool_forward failed"); } -__global__ void KeMaxPoolBackward(int nthreads, const real* inputData, +__global__ void KeMaxPoolBackward(const int nthreads, const real* inputData, const real* outData, const real* outGrad, - int channels, int height, int width, - int pooledH, int pooledW, int sizeX, - int stride, int start, real* targetGrad, - real scaleA, real scaleB) { - int index = blockIdx.y * blockDim.x + threadIdx.x; + const int channels, const int height, + const int width, + const int pooledH, const int pooledW, + const int sizeX, const int sizeY, + const int strideH, const int strideW, + const int padH, const int padW, + real scaleA, real scaleB, + real* targetGrad) { + int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { // find out the local index // find out the local offset - int offsetW = index % width + start; - int offsetH = (index / width) % height + start; + int offsetW = index % width + padW; + int offsetH = (index / width) % height + padH; int offsetC = (index / width / height) % channels; - int frameNum = blockIdx.x; - int phstart = (offsetH < sizeX) ? 0 : (offsetH - sizeX) / stride + 1; - int phend = min(offsetH / stride + 1, pooledH); - int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / stride + 1; - int pwend = min(offsetW / stride + 1, pooledW); + + int frameNum = index / width / height / channels; + int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1; + int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1; + int phend = offsetH >= 0 ? min(offsetH / strideH + 1, pooledH) : 0; + int pwend = offsetW >= 0 ? 
min(offsetW / strideW + 1, pooledW) : 0; real gradient = 0; - inputData += (frameNum * channels) * height * width; real input = inputData[index]; outData += (frameNum * channels + offsetC) * pooledH * pooledW; outGrad += (frameNum * channels + offsetC) * pooledH * pooledW; - targetGrad += (frameNum * channels) * height * width; for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { if (input == outData[ph * pooledW + pw]) { @@ -223,90 +237,114 @@ __global__ void KeMaxPoolBackward(int nthreads, const real* inputData, } } -void hl_maxpool_backward(int frameCnt, const real* inputData, +void hl_maxpool_backward(const int frameCnt, const real* inputData, const real* outData, const real* outGrad, - int channels, int height, int width, - int pooledH, int pooledW, int sizeX, - int stride, int start, real* targetGrad, - real scaleA, real scaleB) { - int num_kernels = (height - start) * (width - start) * channels; - int blocksX = frameCnt; - int blocksY = (num_kernels + 1024 -1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocksX, blocksY); + const int channels, const int height, + const int width, + const int pooledH, const int pooledW, + const int sizeX, const int sizeY, + const int strideH, const int strideW, + const int paddingH, const int paddingW, + real scaleA, real scaleB, + real* targetGrad) { - KeMaxPoolBackward<<< grid, threads, 0, STREAM_DEFAULT >>> + int num_kernels = height * width * channels * frameCnt; + int blocks = (num_kernels + 1024 - 1) / 1024; + + KeMaxPoolBackward<<< blocks, 1024, 0, STREAM_DEFAULT >>> (num_kernels, inputData, outData, outGrad, channels, - height, width, pooledH, pooledW, sizeX, stride, start, - targetGrad, scaleA, scaleB); + height, width, pooledH, pooledW, sizeX, sizeY, + strideH, strideW, + paddingH, paddingW, + scaleA, scaleB, + targetGrad); CHECK_SYNC("hl_maxpool_backward"); } -__global__ void KeAvePoolForward(int nthreads, const real* inputData, - int channels, int height, int width, - int pooledH, int pooledW, int sizeX, - int stride, int start, real* tgtData) { - int index = blockIdx.y * blockDim.x + threadIdx.x; +__global__ void KeAvgPoolForward(const int nthreads, const real* inputData, + const int channels, + const int height, const int width, + const int pooledH, const int pooledW, + const int sizeX, const int sizeY, + const int strideH, const int strideW, + const int padH, const int padW, + real* tgtData) { + int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int pw = index % pooledW; int ph = (index / pooledW) % pooledH; int c = (index / pooledW / pooledH) % channels; - int frameNum = blockIdx.x; - int hstart = ph * stride + start; - int hend = min(hstart + sizeX, height); - int wstart = pw * stride + start; - int wend = min(wstart + sizeX, width); + int frameNum = index / pooledW / pooledH / channels; + + int hstart = ph * strideH - padH; + int wstart = pw * strideW - padW; + int hend = min(hstart + sizeY, height + padH); + int wend = min(wstart + sizeX, width + padW); + int pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, height); + wend = min(wend, width); + real aveval = 0; inputData += (frameNum * channels + c) * height * width; - tgtData += (frameNum * channels) * pooledH * pooledW; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { aveval += inputData[h * width + w]; } } - tgtData[index] = aveval / ((hend - hstart) * (wend - wstart)); + tgtData[index] = aveval / pool_size; } } -void 
hl_avgpool_forward(int frameCnt, const real* inputData, int channels, - int height, int width, int pooledH, int pooledW, - int sizeX, int stride, int start, real* tgtData) { - int num_kernels = pooledH * pooledW * channels; - int blocksX = frameCnt; - int blocksY = (num_kernels + 1024 -1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocksX, blocksY); - KeAvePoolForward<<< grid, threads, 0, STREAM_DEFAULT >>> +void hl_avgpool_forward(const int frameCnt, const real* inputData, + const int channels, + const int height, const int width, + const int pooledH, const int pooledW, + const int sizeX, const int sizeY, + const int strideH, const int strideW, + const int paddingH, const int paddingW, real* tgtData) { + int num_kernels = pooledH * pooledW * channels * frameCnt; + int blocks = (num_kernels + 1024 - 1) / 1024; + KeAvgPoolForward<<< blocks, 1024, 0, STREAM_DEFAULT >>> (num_kernels, inputData, channels, height, width, pooledH, pooledW, - sizeX, stride, start, tgtData); + sizeX, sizeY, strideH, strideW, + paddingH, paddingW, tgtData); CHECK_SYNC("hl_avgpool_forward failed"); } -__global__ void KeAvgPoolBackward(int nthreads, const real* outGrad, - int channels, int height, int width, - int pooledH, int pooledW, int sizeX, - int stride, int start, real* tgtGrad, - real scaleA, real scaleB) { - int index = blockIdx.y * blockDim.x + threadIdx.x; +__global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad, + const int channels, const int height, + const int width, + const int pooledH, const int pooledW, + const int sizeX, const int sizeY, + const int strideH, const int strideW, + const int padH, const int padW, + real scaleA, real scaleB, + real* tgtGrad) { + int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { - int offsetW = index % width + start; - int offsetH = (index / width) % height + start; + int offsetW = index % width + padW; + int offsetH = (index / width) % height + padH; int offsetC = (index / width / height) % channels; - int frameNum = blockIdx.x; - int phstart = (offsetH < sizeX) ? 0 : (offsetH - sizeX) / stride + 1; - int phend = min(offsetH / stride + 1, pooledH); - int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / stride + 1; - int pwend = min(offsetW / stride + 1, pooledW); + int frameNum = index / width / height / channels; + + int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1; + int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1; + int phend = offsetH >= 0 ? min(offsetH / strideH + 1, pooledH) : 0; + int pwend = offsetW >= 0 ? 
min(offsetW / strideW + 1, pooledW) : 0; real gradient = 0; outGrad += (frameNum * channels + offsetC) * pooledH * pooledW; - tgtGrad += (frameNum * channels) * height * width; for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { // figure out the pooling size - int poolsize = (min(ph * stride + sizeX, height) - ph * stride) * - (min(pw * stride + sizeX, width) - pw * stride); + int hstart = ph * strideH - padH; + int wstart = pw * strideW - padW; + int hend = min(hstart + sizeY, height + padH); + int wend = min(wstart + sizeX, width + padW); + int poolsize = (hend - hstart) * (wend - wstart); gradient += outGrad[ph * pooledW + pw]/poolsize; } } @@ -314,20 +352,25 @@ __global__ void KeAvgPoolBackward(int nthreads, const real* outGrad, } } -void hl_avgpool_backward(int frameCnt, const real* outGrad, - int channels, int height, int width, - int pooledH, int pooledW, int sizeX, - int stride, int start, real* backGrad, - real scaleA, real scaleB) { - int num_kernels = (height - start) * (width - start) * channels; - int blocksX = frameCnt; - int blocksY = (num_kernels + 1024 -1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocksX, blocksY); +void hl_avgpool_backward(const int frameCnt, const real* outGrad, + const int channels, + const int height, const int width, + const int pooledH, const int pooledW, + const int sizeX, const int sizeY, + const int strideH, const int strideW, + const int paddingH, const int paddingW, + real scaleA, real scaleB, + real* backGrad) { + int num_kernels = height * width * channels * frameCnt; + int blocks = (num_kernels + 1024 - 1) / 1024; - KeAvgPoolBackward <<< grid, threads, 0, STREAM_DEFAULT >>> + KeAvgPoolBackward <<< blocks, 1024, 0, STREAM_DEFAULT >>> (num_kernels, outGrad, channels, height, width, - pooledH, pooledW, sizeX, stride, start, backGrad, scaleA, scaleB); + pooledH, pooledW, sizeX, sizeY, + strideH, strideW, + paddingH, paddingW, + scaleA, scaleB, + backGrad); CHECK_SYNC("hl_avgpool_backward failed"); } diff --git a/paddle/gserver/layers/CudnnPoolLayer.cpp b/paddle/gserver/layers/CudnnPoolLayer.cpp index 86c056ef569..4c733591b37 100644 --- a/paddle/gserver/layers/CudnnPoolLayer.cpp +++ b/paddle/gserver/layers/CudnnPoolLayer.cpp @@ -51,7 +51,6 @@ bool CudnnPoolLayer::init(const LayerMap &layerMap, PoolLayer::init(layerMap, parameterMap); CHECK(useGpu_) << "CudnnPoolLayer only support gpu"; - CHECK_EQ(start_, 0) << poolType_ << " dose not support 'start'"; hl_create_tensor_descriptor(&inputDesc_); hl_create_tensor_descriptor(&outputDesc_); diff --git a/paddle/gserver/layers/CudnnPoolLayer.h b/paddle/gserver/layers/CudnnPoolLayer.h index df97ef2edfd..2ef94720d2b 100644 --- a/paddle/gserver/layers/CudnnPoolLayer.h +++ b/paddle/gserver/layers/CudnnPoolLayer.h @@ -56,16 +56,6 @@ public: void reshape(int batchSize); virtual void forward(PassType passType); virtual void backward(const UpdateCallback& callback = nullptr); - - /** - * Calculate output size according window size of pooling. 
-   */
-  int outputSize(int imageSize, int windowSize, int padding, int stride) {
-    int outputSize;
-    outputSize =
-        (imageSize - windowSize + 2 * padding + stride - 1) / stride + 1;
-    return outputSize;
-  }
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp
index 0ff7f374abb..7fc27ac0bd8 100644
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
@@ -35,7 +35,6 @@ bool PoolLayer::init(const LayerMap& layerMap,
   poolType_ = conf.pool_type();
   channels_ = conf.channels();
   sizeX_ = conf.size_x();
-  start_ = conf.start();
   stride_ = conf.stride();
   outputX_ = conf.output_x();
   imgSize_ = conf.img_size();
@@ -47,22 +46,6 @@ bool PoolLayer::init(const LayerMap& layerMap,
   confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
   outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
 
-  bool cudnnTypeCheck = true;
-#ifndef PADDLE_ONLY_CPU
-  cudnnTypeCheck = !CudnnPoolLayer::typeCheck(poolType_);
-#endif
-
-  if ((sizeY_ != sizeX_ || imgSizeY_ != imgSize_ || strideY_ != stride_ ||
-       confPaddingY_ != confPadding_ || outputY_ != outputX_) &&
-      cudnnTypeCheck) {
-    LOG(FATAL) << poolType_ << " does not supported non-square "
-                               "filter, image, stride or padding";
-  }
-
-  if (confPadding_ != 0 && cudnnTypeCheck) {
-    LOG(FATAL) << poolType_ << " does not supported 'padding'";
-  }
-
   return true;
 }
 
diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h
index b7a1dfd7632..bde1f5b8dcb 100644
--- a/paddle/gserver/layers/PoolLayer.h
+++ b/paddle/gserver/layers/PoolLayer.h
@@ -28,7 +28,7 @@ namespace paddle {
 class PoolLayer : public Layer {
 protected:
   size_t channels_, sizeX_, stride_, outputX_, imgSize_;
-  int start_, confPadding_;
+  int confPadding_;
 
   size_t sizeY_;
   size_t imgSizeY_;
@@ -47,6 +47,16 @@ public:
   static Layer* create(const LayerConfig& config);
 
   virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  /**
+   * Calculate output size according to window size and padding size.
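+   *
+   * outputSize = (imageSize - windowSize + 2 * padding + stride - 1) / stride + 1
+   *
+   * For example, a 14x14 image with a 3x3 window, stride 2 and no padding
+   * gives (14 - 3 + 0 + 2 - 1) / 2 + 1 = 7.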
+ */ + int outputSize(int imageSize, int windowSize, int padding, int stride) { + int outputSize; + outputSize = + (imageSize - windowSize + 2 * padding + stride - 1) / stride + 1; + return outputSize; + } }; } // namespace paddle diff --git a/paddle/gserver/layers/PoolProjectionLayer.cpp b/paddle/gserver/layers/PoolProjectionLayer.cpp index 9c2d6d2164a..5a2e9afb6e1 100644 --- a/paddle/gserver/layers/PoolProjectionLayer.cpp +++ b/paddle/gserver/layers/PoolProjectionLayer.cpp @@ -25,13 +25,15 @@ size_t PoolProjectionLayer::getSize() { imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); if (imgSizeH_ == 0) { - imgSizeH_ = imgSize_; + imgSizeH_ = imgSizeY_; } if (imgSizeW_ == 0) { imgSizeW_ = imgSize_; } - outputH_ = 1 + (imgSizeH_ - start_ - sizeX_ + stride_ - 1) / stride_; - outputW_ = 1 + (imgSizeW_ - start_ - sizeX_ + stride_ - 1) / stride_; + + outputH_ = outputSize(imgSizeH_, sizeY_, confPaddingY_, strideY_); + outputW_ = outputSize(imgSizeW_, sizeX_, confPadding_, stride_); + layerSize = outputH_ * outputW_ * channels_; getOutput().setFrameHeight(outputH_); @@ -51,8 +53,9 @@ void MaxPoolProjectionLayer::forward(PassType passType) { MatrixPtr outV = getOutputValue(); - outV->maxPoolForward(*input, imgSizeH_, imgSizeW_, channels_, sizeX_, start_, - stride_, outputH_, outputW_); + outV->maxPoolForward(*input, imgSizeH_, imgSizeW_, channels_, + sizeX_, sizeY_, strideY_, stride_, + outputH_, outputW_, confPaddingY_, confPadding_); } void MaxPoolProjectionLayer::backward(const UpdateCallback& callback) { @@ -69,7 +72,9 @@ void MaxPoolProjectionLayer::backward(const UpdateCallback& callback) { MatrixPtr inputGrad = getInputGrad(0); inputGrad->maxPoolBackward(*inputV, imgSizeH_, imgSizeW_, *outGrad, *outV, - sizeX_, start_, stride_, outputH_, outputW_, 1, 1); + sizeX_, sizeY_, + strideY_, stride_, outputH_, outputW_, 1, 1, + confPaddingY_, confPadding_); } void AvgPoolProjectionLayer::forward(PassType passType) { @@ -84,8 +89,9 @@ void AvgPoolProjectionLayer::forward(PassType passType) { MatrixPtr outV = getOutputValue(); - outV->avgPoolForward(*input, imgSizeH_, imgSizeW_, channels_, sizeX_, start_, - stride_, outputH_, outputW_); + outV->avgPoolForward(*input, imgSizeH_, imgSizeW_, channels_, + sizeX_, sizeY_, strideY_, stride_, + outputH_, outputW_, confPaddingY_, confPadding_); } void AvgPoolProjectionLayer::backward(const UpdateCallback& callback) { @@ -97,7 +103,9 @@ void AvgPoolProjectionLayer::backward(const UpdateCallback& callback) { /* Do derivation */ MatrixPtr outputGrad = getOutputGrad(); MatrixPtr inputGrad = getInputGrad(0); - inputGrad->avgPoolBackward(*outputGrad, imgSizeH_, imgSizeW_, sizeX_, start_, - stride_, outputH_, outputW_, 1, 1); + inputGrad->avgPoolBackward(*outputGrad, imgSizeH_, imgSizeW_, + sizeX_, sizeY_, strideY_, stride_, + outputH_, outputW_, 1, 1, + confPaddingY_, confPadding_); } } // namespace paddle diff --git a/paddle/gserver/tests/img_pool_a.conf b/paddle/gserver/tests/img_pool_a.conf new file mode 100644 index 00000000000..5938e761120 --- /dev/null +++ b/paddle/gserver/tests/img_pool_a.conf @@ -0,0 +1,46 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +settings(batch_size=10) +data = data_layer(name ="input", size=8*16*16) +conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, + num_filters=8,stride=1) +maxpool = img_pool_layer(input=conv, + pool_size=3, + pool_size_y=5, + num_channels=8, + stride=1, + stride_y=2, + padding=1, + padding_y=2, + img_width=16, + pool_type=MaxPooling(), +) +avgpool = img_pool_layer(input=conv, + pool_size=3, + pool_size_y=5, + num_channels=8, + stride=1, + stride_y=2, + padding=1, + padding_y=2, + img_width=16, + pool_type=AvgPooling(), +) + +outputs([maxpool, avgpool]) diff --git a/paddle/gserver/tests/img_pool_b.conf b/paddle/gserver/tests/img_pool_b.conf new file mode 100644 index 00000000000..6ea9649b3f1 --- /dev/null +++ b/paddle/gserver/tests/img_pool_b.conf @@ -0,0 +1,44 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
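+#
+# This config is the cudnn counterpart of img_pool_a.conf; the img_pool case
+# in test_NetworkCompare checks that the two configs produce the same results
+# on GPU.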
+ +from paddle.trainer_config_helpers import * + +settings(batch_size=10) +data = data_layer(name ="input", size=8*16*16) +conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, num_filters=8, stride=1) +maxpool = img_pool_layer(input=conv, + pool_size=3, + pool_size_y=5, + num_channels=8, + stride=1, + stride_y=2, + padding=1, + padding_y=2, + pool_type=CudnnMaxPooling(), +) + +avgpool = img_pool_layer(input=conv, + pool_size=3, + pool_size_y=5, + num_channels=8, + stride=1, + stride_y=2, + padding=1, + padding_y=2, + pool_type=CudnnAvgPooling(), +) + +outputs([maxpool, avgpool]) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 3150c31e490..c5723f8574a 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -791,21 +791,24 @@ void setPoolConfig(TestConfig* config, PoolConfig* pool, (*config).biasSize = 0; (*config).layerConfig.set_type("pool"); (*config).layerConfig.set_num_filters(16); - (*config).layerConfig.set_partial_sum(1); - (*config).layerConfig.set_shared_biases(true); + int kw = 3, kh = 3; + int pw = 0, ph = 0; + int sw = 2, sh = 2; pool->set_pool_type(poolType); pool->set_channels(16); - pool->set_size_x(3); - if (poolType == "cudnn-max-pool" || poolType == "cudnn-avg-pool") { - pool->set_padding(0); - } else { - pool->set_start(0); - } - pool->set_stride(2); - pool->set_output_x((pool->img_size() - pool->start() - pool->size_x()) / - ((float)pool->stride()) + - 1.5); + pool->set_size_x(kw); + pool->set_size_y(kh); + pool->set_start(0); + pool->set_padding(pw); + pool->set_padding_y(ph); + pool->set_stride(sw); + pool->set_stride_y(sh); + + int ow = (pool->img_size() - kw + 2 * pw + sw - 1) / sw + 1; + int oh = (pool->img_size_y() - kh + 2 * ph + sh - 1) / sh + 1; + pool->set_output_x(ow); + pool->set_output_y(oh); } void testPoolLayer(const string& poolType, bool trans, bool useGpu) { @@ -814,9 +817,10 @@ void testPoolLayer(const string& poolType, bool trans, bool useGpu) { LayerInputConfig* input = config.layerConfig.add_inputs(); PoolConfig* pool = input->mutable_pool_conf(); - setPoolConfig(&config, pool, poolType); pool->set_img_size(14); - config.layerConfig.set_size(pool->output_x() * pool->output_x() * + pool->set_img_size_y(14); + setPoolConfig(&config, pool, poolType); + config.layerConfig.set_size(pool->output_x() * pool->output_y() * pool->channels()); testLayerGrad(config, "pool", 100, trans, useGpu); @@ -829,11 +833,11 @@ void testPoolLayer2(const string& poolType, bool trans, bool useGpu) { LayerInputConfig* input = config.layerConfig.add_inputs(); PoolConfig* pool = input->mutable_pool_conf(); - setPoolConfig(&config, pool, poolType); pool->set_size_y(4); pool->set_stride_y(3); pool->set_img_size(10); pool->set_img_size_y(20); + setPoolConfig(&config, pool, poolType); pool->set_output_y((pool->img_size_y() - pool->start() - pool->size_y()) / ((float)pool->stride_y()) + 1.5); @@ -1252,8 +1256,6 @@ TEST(Layer, MultiplexLayer) { } } - - int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp index 1c6a8b0017f..b3ef5306730 100644 --- a/paddle/gserver/tests/test_NetworkCompare.cpp +++ b/paddle/gserver/tests/test_NetworkCompare.cpp @@ -116,6 +116,8 @@ void calcGradient(DataIn& in, DataOut& out, const std::string& configPath) { gradientMachine->start(trainer.getConfig(), nullptr); 
gradientMachine->forward(in.inArgs, &outArgs, PASS_TRAIN);
   for (size_t i = 0; i < in.outGrads.size(); i++) {
+    // If none of the layers in the config has parameters and NeedGradient()
+    // is not set, outArgs[i].grad will be nullptr.
     outArgs[i].grad->copyFrom(*in.outGrads[i]);
   }
   gradientMachine->backward();
@@ -225,6 +227,18 @@ TEST(Compare, concat_table) {
   compareNetwork(config_file_a, config_file_b);
 }
 
+#ifndef PADDLE_ONLY_CPU
+TEST(Compare, img_pool) {
+  std::string config_file_a = "./gserver/tests/img_pool_a.conf";
+  std::string config_file_b = "./gserver/tests/img_pool_b.conf";
+  bool useGpu = FLAGS_use_gpu;
+  FLAGS_use_gpu = true;
+  compareNetwork(config_file_a, config_file_b);
+  FLAGS_use_gpu = useGpu;
+}
+#endif
+
+
 P_DEFINE_string(config_file_a, "", "config of one network to compare");
 P_DEFINE_string(config_file_b, "", "config of another network to compare");
 TEST(Compare, network) {
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index e351bede724..a6ff2f3b35d 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -860,9 +860,11 @@ void GpuMatrix::convShrink(Matrix& expandFeat, int thisImgHeight,
 }
 
 void GpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH,
-                               size_t imgSizeW, size_t channels, size_t sizeX,
-                               int start, size_t stride, size_t outputH,
-                               size_t outputW) {
+                               size_t imgSizeW, size_t channels,
+                               size_t sizeX, size_t sizeY,
+                               size_t strideH, size_t strideW,
+                               size_t outputH, size_t outputW,
+                               size_t paddingH, size_t paddingW) {
   CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
 
   real* inputData = inputMat.getData();
@@ -874,14 +876,17 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH,
   CHECK(width_ == outputH * outputW * channels);
 
   hl_maxpool_forward(frameNum, inputData, channels, height, width,
-                     outputH, outputW, sizeX, stride, start, data_);
+                     outputH, outputW, sizeX, sizeY, strideH, strideW,
+                     paddingH, paddingW, data_);
 }
 
 void GpuMatrix::maxPoolBackward(Matrix& inputMat, size_t imgSizeH,
                                 size_t imgSizeW, Matrix& outGrad, Matrix& outV,
-                                size_t sizeX, int start, size_t stride,
+                                size_t sizeX, size_t sizeY,
+                                size_t strideH, size_t strideW,
                                 size_t outputH, size_t outputW,
-                                real scaleTargets, real scaleOutput) {
+                                real scaleTargets, real scaleOutput,
+                                size_t paddingH, size_t paddingW) {
   CHECK(inputMat.useGpu_ == true && outGrad.useGpu_ == true &&
         outV.useGpu_ == true) << "Matrix type are not equal";
 
@@ -899,15 +904,19 @@ void GpuMatrix::maxPoolBackward(Matrix& inputMat, size_t imgSizeH,
   CHECK(outGrad.getHeight() == outV.getHeight() &&
         outGrad.getWidth() == outV.getWidth());
 
+
   hl_maxpool_backward(frameNum, inputData, outData, outDiff, channels,
-                      height, width, outputH, outputW, sizeX, stride,
-                      start, data_, scaleTargets, scaleOutput);
+                      height, width, outputH, outputW, sizeX, sizeY,
+                      strideH, strideW, paddingH, paddingW,
+                      scaleTargets, scaleOutput, data_);
 }
 
 void GpuMatrix::avgPoolForward(Matrix& inputMat, size_t imgSizeH,
-                               size_t imgSizeW, size_t channels, size_t sizeX,
-                               int start, size_t stride, size_t outputH,
-                               size_t outputW) {
+                               size_t imgSizeW, size_t channels,
+                               size_t sizeX, size_t sizeY,
+                               size_t strideH, size_t strideW,
+                               size_t outputH, size_t outputW,
+                               size_t paddingH, size_t paddingW) {
   CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
 
   real* inputData = inputMat.getData();
@@ -919,13 +928,17 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat, size_t imgSizeH,
   CHECK(width_ == outputH * outputW * channels);
 
   hl_avgpool_forward(frameNum, inputData, channels, height, width,
- outputH, outputW, sizeX, stride, start, data_); + outputH, outputW, sizeX, sizeY, + strideH, strideW, + paddingH, paddingW, data_); } void GpuMatrix::avgPoolBackward(Matrix& outGrad, size_t imgSizeH, - size_t imgSizeW, size_t sizeX, int start, - size_t stride, size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput) { + size_t imgSizeW, size_t sizeX, size_t sizeY, + size_t strideH, size_t strideW, + size_t outputH, size_t outputW, + real scaleTargets, real scaleOutput, + size_t paddingH, size_t paddingW) { CHECK(outGrad.useGpu_ == true) << "Matrix type are not equal"; real* outDiff = outGrad.getData(); @@ -938,8 +951,10 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad, size_t imgSizeH, CHECK(outGrad.getWidth() == outputH * outputW * channels); hl_avgpool_backward(frameNum, outDiff, channels, height, width, - outputH, outputW, sizeX, stride, start, data_, - scaleTargets, scaleOutput); + outputH, outputW, sizeX, sizeY, + strideH, strideW, paddingH, paddingW, + scaleTargets, scaleOutput, + data_); } void GpuMatrix::crossMapNormalFwd(Matrix& input, size_t imgSizeH, @@ -1450,19 +1465,23 @@ void CpuMatrix::convShrink(Matrix& expandFeat, int thisImgHeight, } void CpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, - size_t imgSizeW, size_t channels, size_t sizeX, - int start, size_t stride, size_t outputH, - size_t outputW) { + size_t imgSizeW, size_t channels, + size_t sizeX, size_t sizeY, + size_t strideH, size_t strideW, + size_t outputH, size_t outputW, + size_t paddingH, size_t paddingW) { real* inputData = inputMat.getData(); real* outData = data_; size_t num = inputMat.getHeight(); size_t inWidth = imgSizeW; size_t inHeight = imgSizeH; CHECK(inHeight * inWidth == inputMat.getWidth() / channels); + CHECK_EQ(num, this->getHeight()); + CHECK_EQ(channels*outputH*outputW, this->getWidth()); /* initialize the data_ */ for (size_t i = 0; i < height_ * width_; i++) { - data_[i] = -FLT_MAX; + outData[i] = -(real)FLT_MAX; } /* pool max one by one */ @@ -1470,12 +1489,14 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, for (size_t c = 0; c < channels; ++c) { // channel by channel for (size_t ph = 0; ph < outputH; ++ph) { for (size_t pw = 0; pw < outputW; ++pw) { - size_t hstart = ph * stride + start; - size_t wstart = pw * stride + start; - size_t hend = std::min(hstart + sizeX, inHeight); - size_t wend = std::min(wstart + sizeX, inWidth); - for (size_t h = hstart; h < hend; ++h) { - for (size_t w = wstart; w < wend; ++w) { + int hstart = ph * strideH - paddingH; + int wstart = pw * strideW - paddingW; + int hend = std::min(hstart + sizeY, inHeight); + int wend = std::min(wstart + sizeX, inWidth); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { outData[ph * outputW + pw] = std::max(outData[ph * outputW + pw], inputData[h * inWidth + w]); } @@ -1491,9 +1512,10 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, void CpuMatrix::maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW, Matrix& outGrad, Matrix& outV, size_t sizeX, - int start, size_t stride, size_t outputH, - size_t outputW, real scaleTargets, - real scaleOutput) { + size_t sizeY, size_t strideH, size_t strideW, + size_t outputH, size_t outputW, + real scaleTargets, real scaleOutput, + size_t paddingH, size_t paddingW) { size_t num = image.getHeight(); size_t channels = size_t(width_ / imgSizeH / imgSizeW); CHECK(image.getWidth() == imgSizeH * imgSizeW * channels); @@ 
-1509,32 +1531,36 @@ void CpuMatrix::maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW,
   for (size_t c = 0; c < channels; ++c) {
     for (size_t ph = 0; ph < outputH; ++ph) {
       for (size_t pw = 0; pw < outputW; ++pw) {
-        size_t hstart = ph * stride + start;
-        size_t wstart = pw * stride + start;
-        size_t hend = std::min(hstart + sizeX, imgSizeH);
-        size_t wend = std::min(wstart + sizeX, imgSizeW);
-        for (size_t h = hstart; h < hend; ++h) {
-          for (size_t w = wstart; w < wend; ++w) {
+        int hstart = ph * strideH - paddingH;
+        int wstart = pw * strideW - paddingW;
+        int hend = std::min(hstart + sizeY, imgSizeH);
+        int wend = std::min(wstart + sizeX, imgSizeW);
+        hstart = std::max(hstart, 0);
+        wstart = std::max(wstart, 0);
+        for (int h = hstart; h < hend; ++h) {
+          for (int w = wstart; w < wend; ++w) {
             tgtGrad[h * imgSizeW + w] =
                 scaleTargets * tgtGrad[h * imgSizeW + w] +
                 scaleOutput * otGrad[ph * outputW + pw] *
-                (inData[h * imgSizeW + w] == otData[ph * outputH + pw]);
+                (inData[h * imgSizeW + w] == otData[ph * outputW + pw]);
           }
         }
       }
     }
     // offset
     inData += imgSizeH * imgSizeW;
-    otData += outputH * outputW;
     tgtGrad += imgSizeH * imgSizeW;
+    otData += outputH * outputW;
     otGrad += outputH * outputW;
   }
 }
 
 void CpuMatrix::avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW,
-                               size_t channels, size_t sizeX, int start,
-                               size_t stride, size_t outputH, size_t outputW) {
+                               size_t channels, size_t sizeX, size_t sizeY,
+                               size_t strideH, size_t strideW,
+                               size_t outputH, size_t outputW,
+                               size_t paddingH, size_t paddingW) {
   // The main loop
   size_t num = input.getHeight();
   size_t inHeight = imgSizeH;
@@ -1548,17 +1574,24 @@ void CpuMatrix::avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW,
   for (size_t c = 0; c < channels; ++c) {
     for (size_t ph = 0; ph < outputH; ++ph) {
       for (size_t pw = 0; pw < outputW; ++pw) {
-        size_t hstart = ph * stride + start;
-        size_t wstart = pw * stride + start;
-        size_t hend = std::min(hstart + sizeX, inHeight);
-        size_t wend = std::min(wstart + sizeX, inWidth);
+        int hstart = ph * strideH - paddingH;
+        int wstart = pw * strideW - paddingW;
+        int hend = std::min(hstart + sizeY, inHeight + paddingH);
+        int wend = std::min(wstart + sizeX, inWidth + paddingW);
+        int poolSize = (hend - hstart) * (wend - wstart);
+        hstart = std::max(hstart, 0);
+        wstart = std::max(wstart, 0);
+        hend = std::min(hend, static_cast<int>(inHeight));
+        wend = std::min(wend, static_cast<int>(inWidth));
+
+        CHECK(poolSize);
         tgtData[ph * outputW + pw] = 0;  // clear
-        for (size_t h = hstart; h < hend; ++h) {
-          for (size_t w = wstart; w < wend; ++w) {
+        for (int h = hstart; h < hend; ++h) {
+          for (int w = wstart; w < wend; ++w) {
             tgtData[ph * outputW + pw] += inData[h * inWidth + w];
           }
         }
-        tgtData[ph * outputW + pw] /= (hend - hstart) * (wend - wstart);
+        tgtData[ph * outputW + pw] /= poolSize;
       }
     }
     // compute offset
@@ -1569,9 +1602,11 @@ void CpuMatrix::avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW,
 }
 
 void CpuMatrix::avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW,
-                                size_t sizeX, int start, size_t stride,
+                                size_t sizeX, size_t sizeY,
+                                size_t strideH, size_t strideW,
                                 size_t outputH, size_t outputW,
-                                real scaleTargets, real scaleOutput) {
+                                real scaleTargets, real scaleOutput,
+                                size_t paddingH, size_t paddingW) {
   size_t num = input.getHeight();
   size_t channels = input.getWidth() / outputH / outputW;
   CHECK(imgSizeH * imgSizeW * channels == getWidth());
@@ -1582,14 +1617,20 @@ void CpuMatrix::avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW,
for (size_t c = 0; c < channels; ++c) {
     for (size_t ph = 0; ph < outputH; ++ph) {
       for (size_t pw = 0; pw < outputW; ++pw) {
-        size_t hstart = ph * stride + start;
-        size_t wstart = pw * stride + start;
-        size_t hend = std::min(hstart + sizeX, imgSizeH);
-        size_t wend = std::min(wstart + sizeX, imgSizeW);
-        size_t poolsize = (hend - hstart) * (wend - wstart);
-        for (size_t h = hstart; h < hend; ++h) {
-          for (size_t w = wstart; w < wend; ++w) {
-            outData[h * imgSizeW + w] += inData[ph * outputW + pw] / poolsize;
+        int hstart = ph * strideH - paddingH;
+        int wstart = pw * strideW - paddingW;
+        int hend = std::min(hstart + sizeY, imgSizeH + paddingH);
+        int wend = std::min(wstart + sizeX, imgSizeW + paddingW);
+        int poolSize = (hend - hstart) * (wend - wstart);
+        hstart = std::max(hstart, 0);
+        wstart = std::max(wstart, 0);
+        hend = std::min(hend, static_cast<int>(imgSizeH));
+        wend = std::min(wend, static_cast<int>(imgSizeW));
+        CHECK(poolSize);
+
+        for (int h = hstart; h < hend; ++h) {
+          for (int w = wstart; w < wend; ++w) {
+            outData[h * imgSizeW + w] += inData[ph * outputW + pw] / poolSize;
           }
         }
       }
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index cfb30797fcf..5c15c940128 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -742,31 +742,37 @@ public:
    */
   virtual void maxPoolForward(Matrix& inputMat, size_t imgSizeH,
                               size_t imgSizeW, size_t channels, size_t sizeX,
-                              int start_, size_t stride, size_t outputH,
-                              size_t outputW) {
+                              size_t sizeY, size_t strideH, size_t strideW,
+                              size_t outputH, size_t outputW,
+                              size_t paddingH, size_t paddingW) {
     LOG(FATAL) << "Not implemented";
   }
 
   /// Pooling backward operation.
   virtual void maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW,
                                Matrix& outGrad, Matrix& outV, size_t sizeX,
-                               int start, size_t stride, size_t outputH,
-                               size_t outputW, real scaleTargets,
-                               real scaleOutput) {
+                               size_t sizeY, size_t strideH, size_t strideW,
+                               size_t outputH, size_t outputW,
+                               real scaleTargets, real scaleOutput,
+                               size_t paddingH, size_t paddingW) {
     LOG(FATAL) << "Not implemented";
   }
 
   /// Pooling forward operation, calculate the average of sizeX elements.
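+  /// Note: padded positions contribute zero to the sum but are still counted
+  /// in the divisor; the window size is computed before clipping to the image.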
virtual void avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW,
-                              size_t channels, size_t sizeX, int start,
-                              size_t stride, size_t outputH, size_t outputW) {
+                              size_t channels, size_t sizeX, size_t sizeY,
+                              size_t strideH, size_t strideW,
+                              size_t outputH, size_t outputW,
+                              size_t paddingH, size_t paddingW) {
     LOG(FATAL) << "Not implemented";
   }
 
   virtual void avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW,
-                               size_t sizeX, int start, size_t stride,
+                               size_t sizeX, size_t sizeY,
+                               size_t strideH, size_t strideW,
                                size_t outputH, size_t outputW,
-                               real scaleTargets, real scaleOutput) {
+                               real scaleTargets, real scaleOutput,
+                               size_t paddingH, size_t paddingW) {
     LOG(FATAL) << "Not implemented";
   }
 
@@ -1131,21 +1137,30 @@ public:
                real alpha = 1.0f, real beta = 0.0f);
 
   void maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW,
-                      size_t channels, size_t sizeX, int start_, size_t stride,
-                      size_t outputH, size_t outputW);
+                      size_t channels, size_t sizeX, size_t sizeY,
+                      size_t strideH, size_t strideW,
+                      size_t outputH, size_t outputW,
+                      size_t paddingH, size_t paddingW);
 
   void maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW,
-                       Matrix& outGrad, Matrix& outV, size_t sizeX, int start,
-                       size_t stride, size_t outputH, size_t outputW,
-                       real scaleTargets, real scaleOutput);
+                       Matrix& outGrad, Matrix& outV, size_t sizeX,
+                       size_t sizeY, size_t strideH, size_t strideW,
+                       size_t outputH, size_t outputW,
+                       real scaleTargets, real scaleOutput,
+                       size_t paddingH, size_t paddingW);
 
   void avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW,
-                      size_t channels, size_t sizeX, int start, size_t stride,
-                      size_t outputH, size_t outputW);
+                      size_t channels, size_t sizeX, size_t sizeY,
+                      size_t strideH, size_t strideW,
+                      size_t outputH, size_t outputW,
+                      size_t paddingH, size_t paddingW);
 
   void avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW,
-                       size_t sizeX, int start, size_t stride, size_t outputH,
-                       size_t outputW, real scaleTargets, real scaleOutput);
+                       size_t sizeX, size_t sizeY,
+                       size_t strideH, size_t strideW,
+                       size_t outputH, size_t outputW,
+                       real scaleTargets, real scaleOutput,
+                       size_t paddingH, size_t paddingW);
 
   void crossMapNormalFwd(Matrix& input, size_t imgSizeH, size_t imgSizeW,
                          Matrix& denoms, size_t channels, size_t sizeX,
@@ -1242,21 +1257,31 @@ public:
                real alpha = 1.0f, real beta = 0.0f);
 
   void maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW,
-                      size_t channels, size_t sizeX, int start_, size_t stride,
-                      size_t outputH, size_t outputW);
+                      size_t channels, size_t sizeX, size_t sizeY,
+                      size_t strideH, size_t strideW,
+                      size_t outputH, size_t outputW,
+                      size_t paddingH, size_t paddingW);
 
   void maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW,
-                       Matrix& outGrad, Matrix& outV, size_t sizeX, int start,
-                       size_t stride, size_t outputH, size_t outputW,
-                       real scaleTargets, real scaleOutput);
+                       Matrix& outGrad, Matrix& outV,
+                       size_t sizeX, size_t sizeY,
+                       size_t strideH, size_t strideW,
+                       size_t outputH, size_t outputW,
+                       real scaleTargets, real scaleOutput,
+                       size_t paddingH, size_t paddingW);
 
   void avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW,
-                      size_t channels, size_t sizeX, int start, size_t stride,
-                      size_t outputH, size_t outputW);
+                      size_t channels, size_t sizeX, size_t sizeY,
+                      size_t strideH, size_t strideW,
+                      size_t outputH, size_t outputW,
+                      size_t paddingH, size_t paddingW);
 
   void avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW,
-                       size_t sizeX, int start,
size_t stride, size_t outputH, - size_t outputW, real scaleTargets, real scaleOutput); + size_t sizeX, size_t sizeY, + size_t strideH, size_t strideW, + size_t outputH, size_t outputW, + real scaleTargets, real scaleOutput, + size_t paddingH, size_t paddingW); void crossMapNormalFwd(Matrix& input, size_t imgSizeH, size_t imgSizeW, Matrix& denoms, size_t channels, size_t sizeX, diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index fe8eacc2efb..e1bda79a8ac 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -1846,6 +1846,159 @@ TEST(Matrix, classificationError) { } } +void testMaxPoolFwdBwd(int numSamples, int channels, + int imgSizeH, int imgSizeW, + int ksizeH, int ksizeW, + int strideH, int strideW, + int padH, int padW) { + int outH = 0, outW = 0; + outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1; + outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1; + + int inWidth = imgSizeH * imgSizeW * channels; + MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); + + int outWidth = channels * outH * outW; + MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); + + input->randomizeUniform(); + target->randomizeUniform(); + inputGpu->copyFrom(*input); + targetGpu->copyFrom(*target); + + target->maxPoolForward(*input, imgSizeH, imgSizeW, + channels, ksizeW, ksizeH, + strideH, strideW, outH, outW, padH, padW); + targetGpu->maxPoolForward(*inputGpu, imgSizeH, imgSizeW, + channels, ksizeW, ksizeH, + strideH, strideW, outH, outW, padH, padW); + MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); + targetCheck->copyFrom(*targetGpu); + checkMatrixEqual(target, targetCheck); + + MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); + MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth, + false, true); + + inputGrad->randomizeUniform(); + targetGrad->randomizeUniform(); + inputGpuGrad->copyFrom(*inputGrad); + targetGpuGrad->copyFrom(*targetGrad); + + inputGrad->maxPoolBackward(*input, imgSizeH, imgSizeW, + *targetGrad, *target, + ksizeW, ksizeH, + strideH, strideW, + outH, outW, 1.0, 1.0, padH, padW); + inputGpuGrad->maxPoolBackward(*inputGpu, imgSizeH, imgSizeW, + *targetGpuGrad, *targetGpu, + ksizeW, ksizeH, + strideH, strideW, + outH, outW, 1.0, 1.0, padH, padW); + MatrixPtr targetBwdCheck = CpuMatrix::create(numSamples, inWidth, + false, false); + targetBwdCheck->copyFrom(*inputGpuGrad); + checkMatrixEqual(inputGrad, targetBwdCheck); +} + +void testAvgPoolFwdBwd(int numSamples, int channels, + int imgSizeH, int imgSizeW, + int ksizeH, int ksizeW, + int strideH, int strideW, + int padH, int padW) { + int outH = 0, outW = 0; + outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1; + outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1; + + int inWidth = imgSizeH * imgSizeW * channels; + MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); + + int outWidth = channels * outH * outW; + MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, 
false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+
+  input->randomizeUniform();
+  target->randomizeUniform();
+  inputGpu->copyFrom(*input);
+  targetGpu->copyFrom(*target);
+
+  target->avgPoolForward(*input, imgSizeH, imgSizeW,
+                         channels, ksizeW, ksizeH,
+                         strideH, strideW, outH, outW, padH, padW);
+  targetGpu->avgPoolForward(*inputGpu, imgSizeH, imgSizeW,
+                            channels, ksizeW, ksizeH,
+                            strideH, strideW, outH, outW, padH, padW);
+  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
+  targetCheck->copyFrom(*targetGpu);
+  MatrixCheckErr(*target, *targetCheck);
+
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth,
+                                              false, true);
+
+  inputGrad->randomizeUniform();
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+
+  inputGrad->avgPoolBackward(*targetGrad, imgSizeH, imgSizeW,
+                             ksizeW, ksizeH,
+                             strideH, strideW,
+                             outH, outW, 1.0, 1.0, padH, padW);
+  inputGpuGrad->avgPoolBackward(*targetGpuGrad, imgSizeH, imgSizeW,
+                                ksizeW, ksizeH,
+                                strideH, strideW,
+                                outH, outW, 1.0, 1.0, padH, padW);
+  MatrixPtr targetBwdCheck = CpuMatrix::create(numSamples, inWidth,
+                                               false, false);
+  targetBwdCheck->copyFrom(*inputGpuGrad);
+  MatrixCheckErr(*inputGrad, *targetBwdCheck);
+}
+
+TEST(Matrix, PoolFwdBwd) {
+  for (auto numSamples : {5, 32}) {
+    for (auto channels : {1, 9, 32}) {
+      for (auto imgSizeH : {14, 28}) {
+        for (auto imgSizeW : {16, 30}) {
+          for (auto sizeX : {2, 5}) {
+            for (auto sizeY : {2, 5}) {
+              for (auto sH : {1, 2}) {
+                for (auto sW : {1, 2}) {
+                  for (auto pH : {0, (sizeY - 1)/2}) {
+                    for (auto pW : {0, (sizeX - 1)/2}) {
+                      VLOG(3) << " numSamples=" << numSamples
+                              << " channels=" << channels
+                              << " imgSizeH=" << imgSizeH
+                              << " imgSizeW=" << imgSizeW
+                              << " sizeX=" << sizeX
+                              << " sizeY=" << sizeY
+                              << " strideH=" << sH
+                              << " strideW=" << sW
+                              << " paddingH=" << pH
+                              << " paddingW=" << pW;
+                      testMaxPoolFwdBwd(numSamples, channels, imgSizeH,
+                                        imgSizeW, sizeX, sizeY, sH, sW, pH, pW);
+                      testAvgPoolFwdBwd(numSamples, channels, imgSizeH,
+                                        imgSizeW, sizeX, sizeY, sH, sW, pH, pW);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4
index b32f8b1ee90..25e36f9c4c1 100644
--- a/proto/ModelConfig.proto.m4
+++ b/proto/ModelConfig.proto.m4
@@ -88,7 +88,8 @@ message PoolConfig {
   required uint32 size_x = 3;
 
   // Tell the net where in the input image to start the pooling.
-  required uint32 start = 4;
+  // start is deprecated now.
+  optional uint32 start = 4;
 
   // Defines the stride size between successive pooling squares.
required uint32 stride = 5;
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 1f55298f24f..fb47fd0c6f0 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -961,10 +961,6 @@ def parse_pool(pool, input_layer_name, pool_conf):
                   "['max-projection', 'avg-projection', "
                   "'cudnn-max-pool', 'cudnn-avg-pool']" % pool.pool_type)
 
-    if pool.size_y or pool.stride_y or pool.img_width or pool.padding_y:
-        config_assert(pool.pool_type.startswith('cudnn'),
-                      "'size_y', 'stride_y' and 'img_width' and 'padding_y'"
-                      "can only be used for cudnn")
 
     pool_conf.channels = pool.channels
     pool_conf.size_x = pool.size_x
@@ -974,36 +970,25 @@ def parse_pool(pool, input_layer_name, pool_conf):
     pool_conf.stride_y = default(pool.stride_y, pool_conf.stride);
 
     img_pixels = g_layer_map[input_layer_name].size / pool.channels
+    # img_width may be removed in the future, since it can be
+    # calculated automatically.
     pool_conf.img_size = default(pool.img_width, int(img_pixels ** 0.5))
    pool_conf.img_size_y = img_pixels / pool_conf.img_size
    config_assert(pool_conf.img_size * pool_conf.img_size_y == img_pixels,
                  "Incorrect input image size %d for input image pixels %d"
                  % (pool_conf.img_size, img_pixels))
 
-    if pool.start is not None:
-        config_assert(pool.padding is None,
-                      'At most one of start and padding can be set.')
-        pool_conf.start = pool.start
-        pool_conf.padding = 0
-        pool_conf.output_x = int(math.ceil((pool_conf.img_size - \
-            pool_conf.start - pool_conf.size_x) / \
-            float(pool_conf.stride))) + 1
+    config_assert(not pool.start, "start is deprecated in pooling.")
 
-        pool_conf.output_y = int(math.ceil((pool_conf.img_size_y - \
-            pool_conf.start - pool_conf.size_y) / \
-            float(pool_conf.stride_y))) + 1
-    elif pool.padding is not None:
+    if pool.padding is not None:
         pool_conf.padding = pool.padding
         pool_conf.padding_y = default(pool.padding_y, pool_conf.padding)
-        pool_conf.start = 0
         pool_conf.output_x = int(math.ceil((pool_conf.img_size + \
             2*pool_conf.padding - pool_conf.size_x) / \
             float(pool_conf.stride))) + 1
         pool_conf.output_y = int(math.ceil((pool_conf.img_size_y + \
             2*pool_conf.padding_y - pool_conf.size_y) / \
             float(pool_conf.stride_y))) + 1
-    else:
-        raise ValueError('At least one of start and padding should be set.')
 
 def parse_image(image, input_layer_name, image_conf):
     image_conf.channels = image.channels
@@ -1603,7 +1588,7 @@ class PoolLayer(LayerBase):
             pool_conf = self.config.inputs[input_index].pool_conf
             print("output size for %s is %d*%d " % (
                 name, pool_conf.output_y, pool_conf.output_x))
-            self.set_layer_size((pool_conf.output_x ** 2) * pool_conf.channels)
+            self.set_layer_size((pool_conf.output_x * pool_conf.output_y) * pool_conf.channels)
 
 @config_layer('batch_norm')
 class BatchNormLayer(LayerBase):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 47db197f422..5e7e66a908e 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -210,7 +210,7 @@ DEVICE = 'device'
 
 def layer_support(*attrs):
-    attrs_list = list(attrs) 
+    attrs_list = list(attrs)
     attrs_list.append(DEVICE)
     def decorator(method):
         @functools.wraps(method)
@@ -1627,7 +1627,9 @@ def img_conv_layer(input, filter_size, num_filters,
 @layer_support()
 def img_pool_layer(input, pool_size, name=None,
                    num_channels=None, pool_type=None,
-                   stride=1, start=None, padding=0, layer_attr=None):
+                   stride=1, start=None, padding=0, layer_attr=None,
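+                   # rectangular pooling arguments: each *_y value defaults
+                   # to its horizontal counterpart when left as None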
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 47db197f422..5e7e66a908e 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -210,7 +210,7 @@ DEVICE = 'device'
 
 
 def layer_support(*attrs):
-    attrs_list = list(attrs)  
+    attrs_list = list(attrs)
     attrs_list.append(DEVICE)
     def decorator(method):
         @functools.wraps(method)
@@ -1627,7 +1627,9 @@ def img_conv_layer(input, filter_size, num_filters,
 @layer_support()
 def img_pool_layer(input, pool_size, name=None,
                    num_channels=None, pool_type=None,
-                   stride=1, start=None, padding=0, layer_attr=None):
+                   stride=1, start=None, padding=0, layer_attr=None,
+                   pool_size_y=None, stride_y=None, padding_y=None,
+                   img_width=None):
     """
     Image pooling Layer.
 
@@ -1635,25 +1637,34 @@ def img_pool_layer(input, pool_size, name=None,
 
     .. _pooling: http://ufldl.stanford.edu/tutorial/supervised/Pooling/
 
-    :param padding: pooling padding
+    :param padding: pooling padding width.
     :type padding: int
+    :param padding_y: pooling padding height. It's equal to padding by default.
+    :type padding_y: int|None
     :param name: name of pooling layer
     :type name: basestring.
     :param input: layer's input
     :type input: LayerOutput
-    :param pool_size: pooling size
+    :param pool_size: pooling window width
     :type pool_size: int
+    :param pool_size_y: pooling window height. It's equal to pool_size by default.
+    :type pool_size_y: int|None
     :param num_channels: number of input channel.
     :type num_channels: int
     :param pool_type: pooling type. MaxPooling or AveragePooling. Default is
                       MaxPooling.
     :type pool_type: BasePoolingType
-    :param stride: stride of pooling.
+    :param stride: stride width of pooling.
     :type stride: int
-    :param start: start position of pooling operation.
-    :type start: int
+    :param stride_y: stride height of pooling. It is equal to stride by default.
+    :type stride_y: int|None
+    :param start: start position of pooling operation. Note that it is deprecated now.
+    :type start: int|None
     :param layer_attr: Extra Layer attribute.
     :type layer_attr: ExtraLayerAttribute
+    :param img_width: the width of the input feature map. If it is None, the input
+                      feature map should be square.
+    :type img_width: int|None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -1666,17 +1677,29 @@ def img_pool_layer(input, pool_size, name=None,
     elif isinstance(pool_type, AvgPooling):
         pool_type.name = 'avg'
 
+    type_name = pool_type.name + '-projection' \
+        if (isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
+        else pool_type.name
+
+    pool_size_y = pool_size if pool_size_y is None else pool_size_y
+    stride_y = stride if stride_y is None else stride_y
+    padding_y = padding if padding_y is None else padding_y
+
     Layer(
         name=name,
         type=LayerType.POOL_LAYER,
         inputs=[Input(input.name,
                       pool=Pool(
-                          pool_type=''.join([pool_type.name, '-projection']),
+                          pool_type=type_name,
                           channels=num_channels,
                           size_x=pool_size,
                           start=start,
                           stride=stride,
-                          padding=padding
+                          padding=padding,
+                          size_y=pool_size_y,
+                          stride_y=stride_y,
+                          padding_y=padding_y,
+                          img_width=img_width
                       ))],
         **ExtraLayerAttribute.to_kwargs(layer_attr)
     )
@@ -2751,7 +2774,7 @@ def beam_search(step, input, bos_id, eos_id, beam_size,
     tmp = recurrent_group(step=__real_step__,
                           input=real_input, reverse=False,
                           name=name)
-    
+
     return tmp
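With the new keyword arguments above, non-square pooling can be configured from the helper API. A usage sketch, assuming a 224x200 (H x W) single-channel input; the layer name and all sizes here are illustrative, not taken from the patch:

from paddle.trainer_config_helpers import *

# img_width tells parse_pool the feature map width, so it does not
# fall back to assuming a square input (224 * 200 pixels here).
img = data_layer(name='image', size=224 * 200)

pool = img_pool_layer(input=img,
                      num_channels=1,
                      pool_size=3,     # window width
                      pool_size_y=5,   # window height, defaults to pool_size
                      stride=2,        # stride width
                      stride_y=1,      # stride height, defaults to stride
                      padding=1,       # padding width
                      padding_y=2,     # padding height, defaults to padding
                      img_width=200,
                      pool_type=MaxPooling())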
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index e59e93acbe3..ab4057d9d6c 100644
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -170,13 +170,13 @@ def simple_img_conv_pool(input, filter_size, num_filters, pool_size, name=None,
     :type shared_bias: bool
     :param conv_layer_attr: see img_conv_layer for details
     :type conv_layer_attr: ExtraLayerAttribute
-    :param pool_stride: see img_conv_layer for details
+    :param pool_stride: see img_pool_layer for details
     :type pool_stride: int
-    :param pool_start: see img_conv_layer for details
+    :param pool_start: see img_pool_layer for details. It is deprecated now.
     :type pool_start: int
-    :param pool_padding: see img_conv_layer for details
+    :param pool_padding: see img_pool_layer for details
     :type pool_padding: int
-    :param pool_layer_attr: see img_conv_layer for details
+    :param pool_layer_attr: see img_pool_layer for details
     :type pool_layer_attr: ExtraLayerAttribute
     :return: Layer's output
     :rtype: LayerOutput
@@ -243,7 +243,7 @@ def img_conv_bn_pool(input, filter_size, num_filters, pool_size, name=None,
     :param bn_layer_attr: ParameterAttribute.
     :param pool_stride: see img_pool_layer's document.
     :type pool_stride: int
-    :param pool_start: see img_pool_layer's document.
+    :param pool_start: see img_pool_layer's document. It is deprecated now.
     :type pool_start: int
     :param pool_padding: see img_pool_layer's document.
     :type pool_padding: int
@@ -555,7 +555,7 @@ def lstmemory_unit(input, name=None, size=None, param_attr=None,
     :type gate_act: BaseActivation
     :param state_act: lstm state activiation type.
     :type state_act: BaseActivation
-    :param mixed_bias_attr: bias parameter attribute of mixed layer.
+    :param mixed_bias_attr: bias parameter attribute of mixed layer. False means no bias, None means default bias.
     :type mixed_bias_attr: ParameterAttribute|False
     :param lstm_bias_attr: bias parameter attribute of lstm layer.
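The poolings.py hunk below adds two GPU-only pooling types. Because the type_name logic in img_pool_layer appends '-projection' only for plain MaxPooling and AvgPooling, these names pass through unchanged and match the 'cudnn-max-pool' / 'cudnn-avg-pool' branch that parse_pool accepts. A usage sketch (sizes are illustrative, and a GPU build is assumed):

pool = img_pool_layer(input=img,
                      num_channels=1,
                      pool_size=3,
                      stride=2,
                      pool_type=CudnnMaxPooling())  # resolves to 'cudnn-max-pool'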
+ """ + def __init__(self): + BasePoolingType.__init__(self, "cudnn-avg-pool") class AvgPooling(BasePoolingType): """ diff --git a/python/paddle/trainer_config_helpers/tests/configs/check.md5 b/python/paddle/trainer_config_helpers/tests/configs/check.md5 index 29928b6f7b4..359652f3d09 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/check.md5 +++ b/python/paddle/trainer_config_helpers/tests/configs/check.md5 @@ -1,4 +1,4 @@ -7e6919d17562516e9a1d9a88de1fb3b9 img_layers.protostr +86c0815275a9d5eb902e23c6a592f58a img_layers.protostr a5d9259ff1fd7ca23d0ef090052cb1f2 last_first_seq.protostr 9c038249ec8ff719753a746cdb04c026 layer_activations.protostr 5913f87b39cee3b2701fa158270aca26 projections.protostr diff --git a/python/paddle/trainer_config_helpers/tests/configs/img_layers.py b/python/paddle/trainer_config_helpers/tests/configs/img_layers.py index 6c8ba8be846..f33357c3906 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/img_layers.py +++ b/python/paddle/trainer_config_helpers/tests/configs/img_layers.py @@ -7,8 +7,10 @@ settings( img = data_layer(name='image', size=256*256) +# the parse_conv in config_parse.py is not strictly accurate when filter_size +# is not square. So here set square filter_size. img_conv = img_conv_layer(input=img, num_channels=1, num_filters=64, - filter_size=(32, 64), padding=(1, 0), stride=(1, 1), + filter_size=(32, 32), padding=(1, 1), stride=(1, 1), act=LinearActivation()) img_bn = batch_norm_layer(input=img_conv, act=ReluActivation()) @@ -17,4 +19,4 @@ img_norm = img_cmrnorm_layer(input=img_bn, size=32) img_pool = img_pool_layer(input=img_conv, pool_size=32, pool_type=MaxPooling()) -outputs(img_pool, img_norm) \ No newline at end of file +outputs(img_pool, img_norm) -- GitLab