diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index 84f1c843596d4c871959594595c577eeb916c6f9..6b56d9ec8d3daae96aaaa04ed79cb637331e2281 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -192,11 +192,10 @@ extern void hl_maxpool3D_forward(const int frameCnt,
                                  const int paddingH,
                                  const int paddingW,
                                  real* tgtData,
+                                 real* maxPoolIdxData,
                                  const int tgtStride);
 
 extern void hl_maxpool3D_backward(const int frameCnt,
-                                  const real* inputData,
-                                  const real* outData,
                                   const real* outGrad,
                                   const int channels,
                                   const int depth,
@@ -217,6 +216,7 @@ extern void hl_maxpool3D_backward(const int frameCnt,
                                   real scaleA,
                                   real scaleB,
                                   real* targetGrad,
+                                  real* maxPoolIdxData,
                                   const int outStride);
 
 extern void hl_avgpool3D_forward(const int frameCnt,
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index 6750f537bfbf60309eb10761676d712c6508dff3..a76dbf0b6578de0606702ad1af227fbf6e1cd62e 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -106,11 +106,10 @@ inline void hl_maxpool3D_forward(const int frameCnt,
                                  const int paddingH,
                                  const int paddingW,
                                  real* tgtData,
+                                 real* maxPoolIdxData,
                                  const int tgtStride) {}
 
 inline void hl_maxpool3D_backward(const int frameCnt,
-                                  const real* inputData,
-                                  const real* outData,
                                   const real* outGrad,
                                   const int channels,
                                   const int depth,
@@ -131,6 +130,7 @@ inline void hl_maxpool3D_backward(const int frameCnt,
                                   real scaleA,
                                   real scaleB,
                                   real* targetGrad,
+                                  real* maxPoolIdxData,
                                   const int outStride) {}
 
 inline void hl_avgpool3D_forward(const int frameCnt,
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
index 458c347728952566a87a075d496149ec2dbeb9f0..95440c9446384009789431013725085358c107c5 100644
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -366,10 +366,11 @@ __global__ void KeMaxPool3DForward(const int nthreads,
                                    const int strideD,
                                    const int strideH,
                                    const int strideW,
-                                   const int offsetD,
-                                   const int offsetH,
-                                   const int offsetW,
+                                   const int padD,
+                                   const int padH,
+                                   const int padW,
                                    real* tgtData,
+                                   real* maxPoolIdxData,
                                    const int tgtStride) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
        index += blockDim.x * gridDim.x) {
@@ -378,9 +379,9 @@ __global__ void KeMaxPool3DForward(const int nthreads,
     int pd = (index / pooledW / pooledH) % pooledD;
     int c = (index / pooledW / pooledH / pooledD) % channels;
     int frameNum = index / pooledW / pooledH / pooledD / channels;
-    int dstart = pd * strideD - offsetD;
-    int hstart = ph * strideH - offsetH;
-    int wstart = pw * strideW - offsetW;
+    int dstart = pd * strideD - padD;
+    int hstart = ph * strideH - padH;
+    int wstart = pw * strideW - padW;
     int dend = min(dstart + ksizeD, depth);
     int hend = min(hstart + ksizeH, height);
     int wend = min(wstart + ksizeW, width);
@@ -388,18 +389,22 @@ __global__ void KeMaxPool3DForward(const int nthreads,
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
     real maxval = -FLT_MAX;
+    int maxIdx = -1;
     inputData += (frameNum * channels + c) * depth * height * width;
     for (int d = dstart; d < dend; ++d) {
       for (int h = hstart; h < hend; ++h) {
         for (int w = wstart; w < wend; ++w) {
-          if (maxval < inputData[(d * height + h) * width + w])
+          if (maxval < inputData[(d * height + h) * width + w]) {
             maxval = inputData[(d * height + h) * width + w];
+            maxIdx = (d * height + h) * width + w;
+          }
         }
       }
     }
     int tgtIndex =
         index % (pooledW * pooledH * pooledD * channels) + frameNum * tgtStride;
     tgtData[tgtIndex] = maxval;
+    maxPoolIdxData[tgtIndex] = maxIdx;
   }
 }
 
@@ -418,10 +423,11 @@ void hl_maxpool3D_forward(const int frameCnt,
                           const int strideD,
                           const int strideH,
                           const int strideW,
-                          const int paddingD,
-                          const int paddingH,
-                          const int paddingW,
+                          const int padD,
+                          const int padH,
+                          const int padW,
                           real* tgtData,
+                          real* maxPoolIdxData,
                           const int tgtStride) {
   int num_kernels = pooledD * pooledH * pooledW * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
@@ -443,17 +449,16 @@ void hl_maxpool3D_forward(const int frameCnt,
                                                            strideD,
                                                            strideH,
                                                            strideW,
-                                                           paddingD,
-                                                           paddingH,
-                                                           paddingW,
+                                                           padD,
+                                                           padH,
+                                                           padW,
                                                            tgtData,
+                                                           maxPoolIdxData,
                                                            tgtStride);
   CHECK_SYNC("hl_maxpool3D_forward failed");
 }
 
 __global__ void KeMaxPool3DBackward(const int nthreads,
-                                    const real* inputData,
-                                    const real* outData,
                                     const real* outGrad,
                                     const int channels,
                                     const int depth,
@@ -474,33 +479,35 @@ __global__ void KeMaxPool3DBackward(const int nthreads,
                                     real scaleA,
                                     real scaleB,
                                     real* targetGrad,
+                                    real* maxPoolIdxData,
                                     const int outStride) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
        index += blockDim.x * gridDim.x) {
-    // find out the local index
-    // find out the local offset
-    int offsetW = index % width + padW;
-    int offsetH = (index / width) % height + padH;
-    int offsetD = (index / width / height) % depth + padD;
+    int offsetW = index % width;
+    int offsetH = (index / width) % height;
+    int offsetD = (index / width / height) % depth;
     int offsetC = (index / width / height / depth) % channels;
     int frameNum = index / width / height / depth / channels;
 
-    int pdstart = (offsetD < sizeZ) ? 0 : (offsetD - sizeZ) / strideD + 1;
-    int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1;
-    int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1;
-    int pdend = min(offsetD / strideD + 1, pooledD);
-    int phend = min(offsetH / strideH + 1, pooledH);
-    int pwend = min(offsetW / strideW + 1, pooledW);
+    int pdstart =
+        (offsetD + padD < sizeZ) ? 0 : (offsetD + padD - sizeZ) / strideD + 1;
+    int phstart =
+        (offsetH + padH < sizeY) ? 0 : (offsetH + padH - sizeY) / strideH + 1;
+    int pwstart =
+        (offsetW + padW < sizeX) ? 0 : (offsetW + padW - sizeX) / strideW + 1;
+    int pdend = min((offsetD + padD) / strideD + 1, pooledD);
+    int phend = min((offsetH + padH) / strideH + 1, pooledH);
+    int pwend = min((offsetW + padW) / strideW + 1, pooledW);
 
     real gradient = 0;
-    real input = inputData[index];
-
-    outData += ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW);
     outGrad += ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW);
+    maxPoolIdxData +=
+        ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW);
     for (int pd = pdstart; pd < pdend; ++pd) {
       for (int ph = phstart; ph < phend; ++ph) {
         for (int pw = pwstart; pw < pwend; ++pw) {
-          if (input == outData[(pd * pooledH + ph) * pooledW + pw])
+          if (((offsetD * height + offsetH) * width + offsetW) ==
+              maxPoolIdxData[(pd * pooledH + ph) * pooledW + pw])
             gradient += outGrad[(pd * pooledH + ph) * pooledW + pw];
         }
       }
@@ -510,8 +517,6 @@ __global__ void KeMaxPool3DBackward(const int nthreads,
 }
 
 void hl_maxpool3D_backward(const int frameCnt,
-                           const real* inputData,
-                           const real* outData,
                            const real* outGrad,
                            const int channels,
                            const int depth,
@@ -532,13 +537,12 @@ void hl_maxpool3D_backward(const int frameCnt,
                            real scaleA,
                            real scaleB,
                            real* targetGrad,
+                           real* maxPoolIdxData,
                            const int outStride) {
   int num_kernels = depth * height * width * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
 
   KeMaxPool3DBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                           inputData,
-                                                           outData,
                                                            outGrad,
                                                            channels,
                                                            depth,
@@ -559,6 +563,7 @@ void hl_maxpool3D_backward(const int frameCnt,
                                                            scaleA,
                                                            scaleB,
                                                            targetGrad,
+                                                           maxPoolIdxData,
                                                            outStride);
   CHECK_SYNC("hl_maxpool3D_backward");
 }
diff --git a/paddle/gserver/layers/Pool3DLayer.cpp b/paddle/gserver/layers/Pool3DLayer.cpp
index 40a913ebfc6681bae1ac25edf39776b6dd781e11..199f21adb1a5923b590e4f0e716fc67effb2a2d1 100644
--- a/paddle/gserver/layers/Pool3DLayer.cpp
+++ b/paddle/gserver/layers/Pool3DLayer.cpp
@@ -72,9 +72,10 @@ size_t Pool3DLayer::getSize() {
 void Pool3DLayer::forward(PassType passType) {
   Layer::forward(passType);
   const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
-  int batchSize = inMat->getHeight();
-  int outWidth = getSize();
+  size_t batchSize = inMat->getHeight();
+  size_t outWidth = getSize();
   resetOutput(batchSize, outWidth);
+  Matrix::resizeOrCreate(maxPoolIdx_, batchSize, outWidth, false, useGpu_);
   const MatrixPtr outMat = getOutputValue();
 
   if (poolType_ == "avg") {
@@ -97,6 +98,7 @@ void Pool3DLayer::forward(PassType passType) {
                              paddingW_);
   } else if (poolType_ == "max") {
     outMat->maxPool3DForward(*inMat,
+                             *maxPoolIdx_,
                              channels_,
                              imgSizeD_,
                              imgSizeH_,
@@ -149,9 +151,8 @@ void Pool3DLayer::backward(const UpdateCallback& callback) {
                                  1.0,
                                  1.0);
   } else if (poolType_ == "max") {
-    inGradMat->maxPool3DBackward(*inMat,
-                                 *outGradMat,
-                                 *outMat,
+    inGradMat->maxPool3DBackward(*outGradMat,
+                                 *maxPoolIdx_,
                                  imgSizeD_,
                                  imgSizeH_,
                                  imgSizeW_,
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 4f9216896c0ebd9357ce744f57057686bb6d2a32..54c2eae475047852c3303c4f9a626975a5f9b384 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -1191,6 +1191,7 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad,
 }
 
 void GpuMatrix::maxPool3DForward(Matrix& inputMat,
+                                 Matrix& maxPoolIdx,
                                  size_t channels,
                                  size_t imgSizeD,
                                  size_t imgSizeH,
@@ -1210,6 +1211,7 @@ void GpuMatrix::maxPool3DForward(Matrix& inputMat,
   CHECK(inputMat.useGpu_) << "Matrix type are not correct";
 
   real* inputData = inputMat.getData();
+  real* maxPoolIdxData = maxPoolIdx.getData();
   size_t num = inputMat.getHeight();
   size_t width = imgSizeW;
   size_t height = imgSizeH;
@@ -1237,12 +1239,12 @@ void GpuMatrix::maxPool3DForward(Matrix& inputMat,
                        paddingH,
                        paddingW,
                        getData(),
+                       maxPoolIdxData,
                        getStride());
 }
 
-void GpuMatrix::maxPool3DBackward(Matrix& inputMat,
-                                  Matrix& outGrad,
-                                  Matrix& outV,
+void GpuMatrix::maxPool3DBackward(Matrix& outGrad,
+                                  Matrix& maxPoolIdx,
                                   size_t imgSizeD,
                                   size_t imgSizeH,
                                   size_t imgSizeW,
@@ -1260,26 +1262,21 @@ void GpuMatrix::maxPool3DBackward(Matrix& inputMat,
                                   size_t paddingW,
                                   real scaleTargets,
                                   real scaleOutput) {
-  CHECK(inputMat.useGpu_ && outGrad.useGpu_ && outV.useGpu_)
-      << "Matrix type are not equal";
+  CHECK(outGrad.useGpu_ && maxPoolIdx.useGpu_) << "Matrix type are not equal";
 
-  real* inputData = inputMat.getData();
-  real* outData = outV.getData();
   real* outDiff = outGrad.getData();
-  size_t frameNum = inputMat.getHeight();
-  size_t channels = outV.getWidth() / outputD / outputH / outputW;
+  real* maxPoolIdxData = maxPoolIdx.getData();
+  size_t frameNum = getHeight();
+  size_t channels = outGrad.getWidth() / outputD / outputH / outputW;
   size_t width = imgSizeW;
   size_t height = imgSizeH;
   size_t depth = imgSizeD;
-  CHECK(depth * height * width * channels == inputMat.getWidth());
-  CHECK(height_ == inputMat.getHeight());
+  CHECK(depth * height * width * channels == getWidth());
   CHECK(width_ == depth * width * height * channels);
-  CHECK(outGrad.getHeight() == outV.getHeight() &&
-        outGrad.getWidth() == outV.getWidth());
+  CHECK(outGrad.getHeight() == maxPoolIdx.getHeight() &&
+        outGrad.getWidth() == maxPoolIdx.getWidth());
 
   hl_maxpool3D_backward(frameNum,
-                        inputData,
-                        outData,
                         outDiff,
                         channels,
                         depth,
@@ -1300,6 +1297,7 @@ void GpuMatrix::maxPool3DBackward(Matrix& inputMat,
                         scaleTargets,
                         scaleOutput,
                         getData(),
+                        maxPoolIdxData,
                         outGrad.getStride());
 }
 
@@ -2148,6 +2146,7 @@ void CpuMatrix::avgPoolBackward(Matrix& input,
 }
 
 void CpuMatrix::maxPool3DForward(Matrix& inputMat,
+                                 Matrix& maxPoolIdx,
                                  size_t channels,
                                  size_t imgSizeD,
                                  size_t imgSizeH,
@@ -2166,6 +2165,7 @@ void CpuMatrix::maxPool3DForward(Matrix& inputMat,
                                  size_t paddingW) {
   real* inputData = inputMat.getData();
   real* outData = getData();
+  real* maxPoolIdxData = maxPoolIdx.getData();
   size_t num = inputMat.getHeight();
   size_t inWidth = imgSizeW;
   size_t inHeight = imgSizeH;
@@ -2179,6 +2179,7 @@ void CpuMatrix::maxPool3DForward(Matrix& inputMat,
   for (size_t i = 0; i < height_; i++) {
     for (size_t j = 0; j < width_; j++) {
       outData[(i)*outStride + j] = -(real)FLT_MAX;
+      maxPoolIdxData[(i)*outStride + j] = -1;
     }
   }
 
@@ -2186,6 +2187,7 @@ void CpuMatrix::maxPool3DForward(Matrix& inputMat,
   for (size_t n = 0; n < num; ++n) {  // frame by frame
     if (!isContiguous()) {
       outData = getData() + n * outStride;
+      maxPoolIdxData = maxPoolIdx.getData() + n * outStride;
     }
     for (size_t c = 0; c < channels; ++c) {  // channel by channel
       for (size_t pd = 0; pd < outputD; ++pd) {
@@ -2200,6 +2202,7 @@ void CpuMatrix::maxPool3DForward(Matrix& inputMat,
             dstart = std::max(dstart, 0);
             hstart = std::max(hstart, 0);
             wstart = std::max(wstart, 0);
+            int maxIdx = -1;
             real maxOutData = outData[(pd * outputH + ph) * outputW + pw];
             for (int d = dstart; d < dend; ++d) {
               for (int h = hstart; h < hend; ++h) {
@@ -2207,24 +2210,26 @@ void CpuMatrix::maxPool3DForward(Matrix& inputMat,
                   if (maxOutData <
                       inputData[(d * inHeight + h) * inWidth + w]) {
                     maxOutData = inputData[(d * inHeight + h) * inWidth + w];
+                    maxIdx = (d * inHeight + h) * inWidth + w;
                   }
                 }
               }
             }
             outData[(pd * outputH + ph) * outputW + pw] = maxOutData;
+            maxPoolIdxData[(pd * outputH + ph) * outputW + pw] = maxIdx;
           }
         }
       }
       // compute offset
       inputData += inDepth * inHeight * inWidth;
       outData += outputD * outputH * outputW;
+      maxPoolIdxData += outputD * outputH * outputW;
     }
   }
 }
 
-void CpuMatrix::maxPool3DBackward(Matrix& image,
-                                  Matrix& outGrad,
-                                  Matrix& outV,
+void CpuMatrix::maxPool3DBackward(Matrix& outGrad,
+                                  Matrix& maxPoolIdx,
                                   size_t imgSizeD,
                                   size_t imgSizeH,
                                   size_t imgSizeW,
@@ -2242,59 +2247,38 @@ void CpuMatrix::maxPool3DBackward(Matrix& image,
                                   size_t paddingW,
                                   real scaleTargets,
                                   real scaleOutput) {
-  size_t num = image.getHeight();
+  size_t num = getHeight();
   size_t channels = size_t(width_ / imgSizeD / imgSizeH / imgSizeW);
-  CHECK(image.getWidth() == imgSizeD * imgSizeH * imgSizeW * channels);
-  CHECK(image.getHeight() == height_ && image.getWidth() == width_);
-  CHECK(outV.getHeight() == outGrad.getHeight() &&
-        outV.getWidth() == outGrad.getWidth());
+  CHECK(maxPoolIdx.getHeight() == outGrad.getHeight() &&
+        maxPoolIdx.getWidth() == outGrad.getWidth());
 
   real* tgtGrad = getData();
-  real* inData = image.getData();
-  real* otData = outV.getData();
   real* otGrad = outGrad.getData();
+  real* maxPoolIdxData = maxPoolIdx.getData();
 
-  size_t outStride = outV.getStride();
+  size_t outStride = outGrad.getStride();
   ;
 
   for (size_t n = 0; n < num; ++n) {
-    if (!outV.isContiguous()) {
-      otData = outV.getData() + n * outStride;
+    if (!outGrad.isContiguous()) {
       otGrad = outGrad.getData() + n * outStride;
+      maxPoolIdxData = maxPoolIdx.getData() + n * outStride;
     }
     for (size_t c = 0; c < channels; ++c) {
       for (size_t pd = 0; pd < outputD; ++pd) {
         for (size_t ph = 0; ph < outputH; ++ph) {
           for (size_t pw = 0; pw < outputW; ++pw) {
-            int dstart = pd * strideD - paddingD;
-            int hstart = ph * strideH - paddingH;
-            int wstart = pw * strideW - paddingW;
-            int dend = std::min(dstart + sizeZ, imgSizeD);
-            int hend = std::min(hstart + sizeY, imgSizeH);
-            int wend = std::min(wstart + sizeX, imgSizeW);
-            dstart = std::max(dstart, 0);
-            hstart = std::max(hstart, 0);
-            wstart = std::max(wstart, 0);
-            for (int d = dstart; d < dend; ++d) {
-              for (int h = hstart; h < hend; ++h) {
-                for (int w = wstart; w < wend; ++w) {
-                  tgtGrad[(d * imgSizeH + h) * imgSizeW + w] =
-                      scaleTargets *
-                          tgtGrad[(d * imgSizeH + h) * imgSizeW + w] +
-                      scaleOutput * otGrad[(pd * outputH + ph) * outputW + pw] *
-                          (inData[(d * imgSizeH + h) * imgSizeW + w] ==
-                           otData[(pd * outputH + ph) * outputW + pw]);
-                }
-              }
-            }
+            const size_t index = (pd * outputH + ph) * outputW + pw;
+            const size_t tgtIdx = static_cast<size_t>(maxPoolIdxData[index]);
+            tgtGrad[tgtIdx] =
+                scaleTargets * tgtGrad[tgtIdx] + scaleOutput * otGrad[index];
           }
         }
       }
       // offset
-      inData += imgSizeD * imgSizeH * imgSizeW;
       tgtGrad += imgSizeD * imgSizeH * imgSizeW;
-      otData += outputD * outputH * outputW;
       otGrad += outputD * outputH * outputW;
+      maxPoolIdxData += outputD * outputH * outputW;
     }
   }
 }
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index dec9702433cdcc3f4cc237ebf382960a166d5968..e674c1e9abef2e27efb81f894d101ddd573b656f 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -933,6 +933,7 @@ public:
    * in the sizeX of value
    */
   virtual void maxPool3DForward(Matrix& inputMat,
+                                Matrix& maxPoolIdx,
                                 size_t channels,
                                 size_t imgSizeD,
                                 size_t imgSizeH,
@@ -952,9 +953,8 @@ public:
     LOG(FATAL) << "Not implemeted";
   }
 
-  virtual void maxPool3DBackward(Matrix& image,
-                                 Matrix& outGrad,
-                                 Matrix& outV,
+  virtual void maxPool3DBackward(Matrix& outGrad,
+                                 Matrix& maxPoolIdx,
                                  size_t imgSizeD,
                                  size_t imgSizeH,
                                  size_t imgSizeW,
@@ -1436,6 +1436,7 @@ public:
                        size_t paddingW);
 
   void maxPool3DForward(Matrix& inputMat,
+                        Matrix& maxPoolIdx,
                         size_t channels,
                         size_t imgSizeD,
                         size_t imgSizeH,
@@ -1453,9 +1454,8 @@ public:
                         size_t paddingH,
                         size_t paddingW);
 
-  void maxPool3DBackward(Matrix& image,
-                         Matrix& outGrad,
-                         Matrix& outV,
+  void maxPool3DBackward(Matrix& outGrad,
+                         Matrix& maxPoolIdx,
                          size_t imgSizeD,
                          size_t imgSizeH,
                          size_t imgSizeW,
@@ -1671,6 +1671,7 @@ public:
                        size_t paddingW);
 
   void maxPool3DForward(Matrix& inputMat,
+                        Matrix& maxPoolIdx,
                         size_t channels,
                         size_t imgSizeD,
                         size_t imgSizeH,
@@ -1688,9 +1689,8 @@ public:
                         size_t paddingH,
                         size_t paddingW);
 
-  void maxPool3DBackward(Matrix& image,
-                         Matrix& outGrad,
-                         Matrix& outV,
+  void maxPool3DBackward(Matrix& outGrad,
+                         Matrix& maxPoolIdx,
                          size_t imgSizeD,
                          size_t imgSizeH,
                          size_t imgSizeW,
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 21ee8543cd2ea7c635491f5a5cf5ea35250f73be..d7ad6f18ac0aa83f5769994f8a2304ae0c70540d 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -1204,202 +1204,398 @@ TEST(Matrix, warpCTC) {
   }
 }
 
-void testMatrixPool3D(int depth, int height, int width) {
-  int channel = 3;
-  int filterX = 3, filterY = 4, filterZ = 5;
-  int strideX = 2, strideY = 2, strideZ = 2;
-  int padX = 1, padY = 1, padZ = 1;
-
-  MatrixPtr cpuImage =
-      std::make_shared<CpuMatrix>(1, channel * depth * height * width);
-  MatrixPtr gpuImage =
-      std::make_shared<GpuMatrix>(1, channel * depth * height * width);
-
-  int outD = outputSize(depth, filterZ, padZ, strideZ, true);
-  int outH = outputSize(height, filterY, padZ, strideY, true);
-  int outW = outputSize(width, filterX, padZ, strideX, true);
-
-  int colBufWidth = outD * outH * outW;
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(1, channel * colBufWidth);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(1, channel * colBufWidth);
-
-  cpuImage->randomizeUniform();
-  gpuImage->copyFrom(*cpuImage);
-  cpuOutput->maxPool3DForward(*cpuImage,
-                              channel,
-                              depth,
-                              height,
-                              width,
-                              outD,
-                              outH,
-                              outW,
-                              filterZ,
-                              filterY,
-                              filterX,
-                              strideZ,
-                              strideY,
-                              strideX,
-                              padZ,
-                              padY,
-                              padX);
-  gpuOutput->maxPool3DForward(*gpuImage,
-                              channel,
-                              depth,
-                              height,
-                              width,
-                              outD,
-                              outH,
-                              outW,
-                              filterZ,
-                              filterY,
-                              filterX,
-                              strideZ,
-                              strideY,
-                              strideX,
-                              padZ,
-                              padY,
-                              padX);
-  TensorCheckErr(*cpuOutput, *gpuOutput);
+void testMaxPool3DFwdBwd(int numSamples,
+                         int channels,
+                         int imgSizeD,
+                         int imgSizeH,
+                         int imgSizeW,
+                         int ksizeD,
+                         int ksizeH,
+                         int ksizeW,
+                         int strideD,
+                         int strideH,
+                         int strideW,
+                         int padD,
+                         int padH,
+                         int padW) {
+  int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true);
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
+
+  int inWidth = channels * imgSizeD * imgSizeH * imgSizeW;
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
 
-  cpuImage->randomizeUniform();
-  gpuImage->copyFrom(*cpuImage);
-  cpuOutput->avgPool3DForward(*cpuImage,
-                              channel,
-                              depth,
-                              height,
-                              width,
-                              outD,
-                              outH,
-                              outW,
-                              filterZ,
-                              filterY,
-                              filterX,
-                              strideZ,
-                              strideY,
-                              strideX,
-                              padZ,
-                              padY,
-                              padX);
-
-  gpuOutput->avgPool3DForward(*gpuImage,
-                              channel,
-                              depth,
-                              height,
-                              width,
-                              outD,
-                              outH,
-                              outW,
-                              filterZ,
-                              filterY,
-                              filterX,
-                              strideZ,
-                              strideY,
-                              strideX,
-                              padZ,
-                              padY,
-                              padX);
-  TensorCheckErr(*cpuOutput, *gpuOutput);
-  cpuImage->randomizeUniform();
-  gpuImage->copyFrom(*cpuImage);
-  cpuOutput->randomizeUniform();
-  gpuOutput->copyFrom(*cpuOutput);
-  cpuImage->avgPool3DBackward(*cpuOutput,
-                              depth,
-                              height,
-                              width,
-                              outD,
-                              outH,
-                              outW,
-                              filterZ,
-                              filterY,
-                              filterX,
-                              strideZ,
-                              strideY,
-                              strideX,
-                              padZ,
-                              padY,
-                              padX,
-                              1.0,
-                              1.0);
-
-  gpuImage->avgPool3DBackward(*gpuOutput,
-                              depth,
-                              height,
-                              width,
-                              outD,
-                              outH,
-                              outW,
-                              filterZ,
-                              filterY,
-                              filterX,
-                              strideZ,
-                              strideY,
-                              strideX,
-                              padZ,
-                              padY,
-                              padX,
-                              1.0,
-                              1.0);
-  TensorCheckErr(*cpuImage, *gpuImage);
-
-  cpuImage->randomizeUniform();
-  gpuImage->copyFrom(*cpuImage);
-  cpuOutput->randomizeUniform();
-  gpuOutput->copyFrom(*cpuOutput);
-  cpuImage->maxPool3DBackward(*cpuImage,
-                              *cpuOutput,
-                              *cpuOutput,
-                              depth,
-                              height,
-                              width,
+  int outWidth = channels * outD * outH * outW;
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+  MatrixPtr maxIdx = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr maxIdxGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+
+  input->randomizeUniform();
+  target->randomizeUniform();
+  inputGpu->copyFrom(*input);
+  targetGpu->copyFrom(*target);
+
+  target->maxPool3DForward(*input,
+                           *maxIdx,
+                           channels,
+                           imgSizeD,
+                           imgSizeH,
+                           imgSizeW,
+                           outD,
+                           outH,
+                           outW,
+                           ksizeD,
+                           ksizeH,
+                           ksizeW,
+                           strideD,
+                           strideH,
+                           strideW,
+                           padD,
+                           padH,
+                           padW);
+  targetGpu->maxPool3DForward(*inputGpu,
+                              *maxIdxGpu,
+                              channels,
+                              imgSizeD,
+                              imgSizeH,
+                              imgSizeW,
                               outD,
                               outH,
                               outW,
-                              filterZ,
-                              filterY,
-                              filterX,
-                              strideZ,
-                              strideY,
-                              strideX,
-                              padZ,
-                              padY,
-                              padX,
-                              1.0,
-                              1.0);
-
-  gpuImage->maxPool3DBackward(*gpuImage,
-                              *gpuOutput,
-                              *gpuOutput,
-                              depth,
-                              height,
-                              width,
+                              ksizeD,
+                              ksizeH,
+                              ksizeW,
+                              strideD,
+                              strideH,
+                              strideW,
+                              padD,
+                              padH,
+                              padW);
+  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
+  targetCheck->copyFrom(*targetGpu);
+  checkMatrixEqual(target, targetCheck);
+
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad =
+      GpuMatrix::create(numSamples, outWidth, false, true);
+
+  inputGrad->randomizeUniform();
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+
+  inputGrad->maxPool3DBackward(*targetGrad,
+                               *maxIdx,
+                               imgSizeD,
+                               imgSizeH,
+                               imgSizeW,
+                               outD,
+                               outH,
+                               outW,
+                               ksizeD,
+                               ksizeH,
+                               ksizeW,
+                               strideD,
+                               strideH,
+                               strideW,
+                               padD,
+                               padH,
+                               padW,
+                               1.0,
+                               1.0);
+  inputGpuGrad->maxPool3DBackward(*targetGpuGrad,
+                                  *maxIdxGpu,
+                                  imgSizeD,
+                                  imgSizeH,
+                                  imgSizeW,
+                                  outD,
+                                  outH,
+                                  outW,
+                                  ksizeD,
+                                  ksizeH,
+                                  ksizeW,
+                                  strideD,
+                                  strideH,
+                                  strideW,
+                                  padD,
+                                  padH,
+                                  padW,
+                                  1.0,
+                                  1.0);
+  MatrixPtr targetBwdCheck =
+      CpuMatrix::create(numSamples, inWidth, false, false);
+  targetBwdCheck->copyFrom(*inputGpuGrad);
+  checkMatrixEqual(inputGrad, targetBwdCheck);
+}
+
+void testAvgPool3DFwdBwd(int numSamples,
+                         int channels,
+                         int imgSizeD,
+                         int imgSizeH,
+                         int imgSizeW,
+                         int ksizeD,
+                         int ksizeH,
+                         int ksizeW,
+                         int strideD,
+                         int strideH,
+                         int strideW,
+                         int padD,
+                         int padH,
+                         int padW) {
+  int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true);
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
+
+  int inWidth = imgSizeD * imgSizeH * imgSizeW * channels;
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  int outWidth = channels * outD * outH * outW;
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+
+  input->randomizeUniform();
+  target->randomizeUniform();
+  inputGpu->copyFrom(*input);
+  targetGpu->copyFrom(*target);
+
+  target->avgPool3DForward(*input,
+                           channels,
+                           imgSizeD,
+                           imgSizeH,
+                           imgSizeW,
+                           outD,
+                           outH,
+                           outW,
+                           ksizeD,
+                           ksizeH,
+                           ksizeW,
+                           strideD,
+                           strideH,
+                           strideW,
+                           padD,
+                           padH,
+                           padW);
+
+  targetGpu->avgPool3DForward(*inputGpu,
+                              channels,
+                              imgSizeD,
+                              imgSizeH,
+                              imgSizeW,
                               outD,
                               outH,
                               outW,
-                              filterZ,
-                              filterY,
-                              filterX,
-                              strideZ,
-                              strideY,
-                              strideX,
-                              padZ,
-                              padY,
-                              padX,
-                              1.0,
-                              1.0);
-  TensorCheckErr(*cpuImage, *gpuImage);
+                              ksizeD,
+                              ksizeH,
+                              ksizeW,
+                              strideD,
+                              strideH,
+                              strideW,
+                              padD,
+                              padH,
+                              padW);
+
+  TensorCheckErr(*target, *targetGpu);
+
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad =
+      GpuMatrix::create(numSamples, outWidth, false, true);
+
+  inputGrad->randomizeUniform();
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+
+  inputGrad->avgPool3DBackward(*targetGrad,
+                               imgSizeD,
+                               imgSizeH,
+                               imgSizeW,
+                               outD,
+                               outH,
+                               outW,
+                               ksizeD,
+                               ksizeH,
+                               ksizeW,
+                               strideD,
+                               strideH,
+                               strideW,
+                               padD,
+                               padH,
+                               padW,
+                               1.0,
+                               1.0);
+
+  inputGpuGrad->avgPool3DBackward(*targetGpuGrad,
+                                  imgSizeD,
+                                  imgSizeH,
+                                  imgSizeW,
+                                  outD,
+                                  outH,
+                                  outW,
+                                  ksizeD,
+                                  ksizeH,
+                                  ksizeW,
+                                  strideD,
+                                  strideH,
+                                  strideW,
+                                  padD,
+                                  padH,
+                                  padW,
+                                  1.0,
+                                  1.0);
+  TensorCheckErr(*inputGrad, *inputGpuGrad);
 }
 
-TEST(Matrix, Pool3D) {
-  for (auto depth : {9, 16, 64}) {
-    for (auto height : {9, 11, 128}) {
-      for (auto width : {9, 32, 128}) {
-        VLOG(3) << "depth=" << depth << " height=" << height
-                << " width=" << width;
-        testMatrixPool3D(depth, height, width);
+// TODO(yi): I noticed many such blindly combinatorial tests in this
+// file.  They are no help to locate defects at all.
+TEST(Matrix, Pool3DFwdBwd) {
+  for (auto numSamples : {1, 3}) {
+    for (auto channels : {3}) {
+      for (auto imgSizeD : {9, 16}) {
+        for (auto imgSizeH : {9, 32}) {
+          for (auto imgSizeW : {9, 32}) {
+            for (auto sizeX : {3}) {
+              for (auto sizeY : {3}) {
+                for (auto sizeZ : {3}) {
+                  for (auto sD : {2}) {
+                    for (auto sH : {2}) {
+                      for (auto sW : {2}) {
+                        for (auto pD : {0, (sizeZ - 1) / 2}) {
+                          for (auto pH : {0, (sizeY - 1) / 2}) {
+                            for (auto pW : {0, (sizeX - 1) / 2}) {
+                              VLOG(3) << " numSamples=" << numSamples
+                                      << " channels=" << channels
+                                      << " imgSizeD=" << imgSizeD
+                                      << " imgSizeH=" << imgSizeH
+                                      << " imgSizeW=" << imgSizeW
+                                      << " sizeX=" << sizeX
+                                      << " sizeY=" << sizeY
+                                      << " sizeZ=" << sizeZ << " strideD=" << sD
+                                      << " strideH=" << sH << " strideW=" << sW
+                                      << " padingD=" << pD << " padingH=" << pH
+                                      << " padingW=" << pW;
+
+                              testMaxPool3DFwdBwd(numSamples,
+                                                  channels,
+                                                  imgSizeD,
+                                                  imgSizeH,
+                                                  imgSizeW,
+                                                  sizeX,
+                                                  sizeY,
+                                                  sizeZ,
+                                                  sD,
+                                                  sH,
+                                                  sW,
+                                                  pD,
+                                                  pH,
+                                                  pW);
+                              testAvgPool3DFwdBwd(numSamples,
+                                                  channels,
+                                                  imgSizeD,
+                                                  imgSizeH,
+                                                  imgSizeW,
+                                                  sizeX,
+                                                  sizeY,
+                                                  sizeZ,
+                                                  sD,
+                                                  sH,
+                                                  sW,
+                                                  pD,
+                                                  pH,
+                                                  pW);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
       }
     }
   }
+
+  //
+  //  for (auto numSamples : {1, 3}) {
+  //    for (auto channels : {1, 3}) {
+  //      for (auto imgSizeD : {9,16}) {
+  //      for (auto imgSizeH : {9, 32}) {
+  //        for (auto imgSizeW : {9, 32}) {
+  //          for (auto sizeX : {2, 3}) {
+  //            for (auto sizeY : {2, 3}) {
+  //            for (auto sizeZ : {2,3}){
+  //              for (auto sD : {1, 2}) {
+  //              for (auto sH : {1, 2}) {
+  //                for (auto sW : {1, 2}) {
+  //                  for (auto pD : {0, (sizeZ - 1) / 2}){
+  //                  for (auto pH : {0, (sizeY - 1) / 2}) {
+  //                    for (auto pW : {0, (sizeX - 1) / 2}) {
+  //                      VLOG(3) << " numSamples=" << numSamples
+  //                              << " channels=" << channels
+  //                              << " imgSizeD=" << imgSizeD
+  //                              << " imgSizeH=" << imgSizeH
+  //                              << " imgSizeW=" << imgSizeW
+  //                              << " sizeX=" << sizeX
+  //                              << " sizeY=" << sizeY
+  //                              << " sizeZ=" << sizeZ
+  //                              << " strideD=" << sD
+  //                              << " strideH=" << sH
+  //                              << " strideW=" << sW
+  //                              << " padingD=" << pD
+  //                              << " padingH=" << pH
+  //                              << " padingW=" << pW;
+  //
+  //                      testMaxPool3DFwdBwd(numSamples,
+  //                                        channels,
+  //                                        imgSizeD,
+  //                                        imgSizeH,
+  //                                        imgSizeW,
+  //                                        sizeX,
+  //                                        sizeY,
+  //                                        sizeZ,
+  //                                        sD,
+  //                                        sH,
+  //                                        sW,
+  //                                        pD,
+  //                                        pH,
+  //                                        pW);
+  //                      testAvgPool3DFwdBwd(numSamples,
+  //                                        channels,
+  //                                        imgSizeD,
+  //                                        imgSizeH,
+  //                                        imgSizeW,
+  //                                        sizeX,
+  //                                        sizeY,
+  //                                        sizeZ,
+  //                                        sD,
+  //                                        sH,
+  //                                        sW,
+  //                                        pD,
+  //                                        pH,
+  //                                        pW);
+  //                    }
+  //                  }
+  //                }
+  //              }
+  //            }
+  //            }
+  //          }
+  //        }
+  //      }
+  //      }
+  //    }
+  //    }
+  //  }
+  //  }
 }
 
 #endif