diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index 9f84db72da24b0e678520b077f9cba7ffc2d589a..e9687d0a58d946f3da065a3e5aa5a85be950ce76 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -173,6 +173,202 @@ extern void hl_avgpool_backward(const int frameCnt,
                                 real* backGrad,
                                 const int outStride);
 
+/**
+ * @brief   Maximum pool forward.
+ *
+ * @param[in]   frameCnt    batch size of input image.
+ * @param[in]   inputData   input data.
+ * @param[in]   channels    number of channel.
+ * @param[in]   depth      image depth.
+ * @param[in]   height      image height.
+ * @param[in]   width       image width.
+ * @param[in]   pooledD     output image depth.
+ * @param[in]   pooledH     output image height.
+ * @param[in]   pooledW     output image width.
+ * @param[in]   sizeZ       depth of pooling window.
+ * @param[in]   sizeY       height of pooling window.
+ * @param[in]   sizeX       width of pooling window.
+ * @param[in]   strideD     pooling stride depth.
+ * @param[in]   strideH     pooling stride height.
+ * @param[in]   strideW     pooling stride width.
+ * @param[in]   paddingD    padding depth.
+ * @param[in]   paddingH    padding height.
+ * @param[in]   paddingW    padding width.
+ * @param[out]  tgtData     output data.
+ * @param[in]   tgtStride   stride between output data samples.
+ *
+ */
+extern void hl_maxpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride);
+
+/**
+ * @brief   Maximum pool backward.
+ *
+ * @param[in]   frameCnt    batch size of input image.
+ * @param[in]   inputData   input data.
+ * @param[out]  outData     output data.
+ * @param[out]  outGrad     output grad data.
+ * @param[in]   channels    number of channel.
+ * @param[in]   depth       image depth.
+ * @param[in]   height      image height.
+ * @param[in]   width       image width.
+ * @param[in]   pooledD     output image depth.
+ * @param[in]   pooledH     output image height.
+ * @param[in]   pooledW     output image width.
+ * @param[in]   sizeZ       depth of pooling window.
+ * @param[in]   sizeY       height of pooling window.
+ * @param[in]   sizeX       width of pooling window.
+ * @param[in]   strideD     pooling stride depth.
+ * @param[in]   strideH     pooling stride height.
+ * @param[in]   strideW     pooling stride width.
+ * @param[in]   scaleA      scale.
+ * @param[in]   scaleB      scale.
+ * @param[in]   paddingD    padding depth.
+ * @param[in]   paddingH    padding height.
+ * @param[in]   paddingW    padding width.
+ * @param[out]  targetGrad  output grad.
+ * @param[in]   outStride   stride between output data samples.
+ *
+ */
+extern void hl_maxpool3D_backward(const int frameCnt,
+                                  const real* inputData,
+                                  const real* outData,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* targetGrad,
+                                  const int outStride);
+
+/**
+ * @brief   Averge pool forward.
+ *
+ * @param[in]   frameCnt    batch size of input image.
+ * @param[in]   inputData   input data.
+ * @param[in]   channels    number of channel.
+ * @param[in]   depth       image depth.
+ * @param[in]   height      image height.
+ * @param[in]   width       image width.
+ * @param[in]   pooledD     output image depth.
+ * @param[in]   pooledH     output image height.
+ * @param[in]   pooledW     output image width.
+ * @param[in]   sizeZ       depth of pooling window.
+ * @param[in]   sizeY       height of pooling window.
+ * @param[in]   sizeX       width of pooling window.
+ * @param[in]   strideD     pooling stride depth.
+ * @param[in]   strideH     pooling stride height.
+ * @param[in]   strideW     pooling stride width.
+ * @param[in]   paddingD    padding depth.
+ * @param[in]   paddingH    padding height.
+ * @param[in]   paddingW    padding width.
+ * @param[out]  tgtData     output data.
+ * @param[in]   tgtStride   stride between output data samples.
+ *
+ */
+extern void hl_avgpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride);
+
+/**
+ * @brief   Maximum pool backward.
+ *
+ * @param[in]   frameCnt    batch size of input image.
+ * @param[in]   outGrad     output grad data.
+ * @param[in]   channels    number of channel.
+ * @param[in]   depth      image depth.
+ * @param[in]   height      image height.
+ * @param[in]   width       image width.
+ * @param[in]   pooledD     output image depth.
+ * @param[in]   pooledH     output image height.
+ * @param[in]   pooledW     output image width.
+ * @param[in]   sizeZ       depth of pooling window.
+ * @param[in]   sizeY       height of pooling window.
+ * @param[in]   sizeX       width of pooling window.
+ * @param[in]   strideD     pooling stride depth.
+ * @param[in]   strideH     pooling stride height.
+ * @param[in]   strideW     pooling stride width.
+ * @param[in]   paddingD    padding depth.
+ * @param[in]   paddingH    padding height.
+ * @param[in]   paddingW    padding width.
+ * @param[in]   scaleA      scale.
+ * @param[in]   scaleB      scale.
+ * @param[out]  backGrad    output grad.
+ * @param[in]   outStride   stride between output data samples.
+ *
+ */
+extern void hl_avgpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  int paddingD,
+                                  int paddingH,
+                                  int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* backGrad,
+                                  const int outStride);
+
 /**
  * @brief   Bilinear interpolation forward.
  *
@@ -275,4 +471,4 @@ extern void hl_maxout_backward(real* inGrad,
                                size_t featLen,
                                size_t groups);
 
-#endif /* HL_CNN_H_ */
+#endif  // HL_CNN_H_
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index 2bbb9fa8dfd5eeac9d55aa67a28ebfbffa2acd46..28f61781be0b9f871bcf0db296add4838fc711cc 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -87,6 +87,96 @@ inline void hl_avgpool_backward(const int frameCnt,
                                 real* backGrad,
                                 const int outStride) {}
 
+inline void hl_maxpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride) {}
+
+inline void hl_maxpool3D_backward(const int frameCnt,
+                                  const real* inputData,
+                                  const real* outData,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* targetGrad,
+                                  const int outStride) {}
+
+inline void hl_avgpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride) {}
+
+inline void hl_avgpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  int paddingD,
+                                  int paddingH,
+                                  int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* backGrad,
+                                  const int outStride) {}
+
 inline void hl_bilinear_forward(const real* inData,
                                 const size_t inImgH,
                                 const size_t inImgW,
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
index aac19b1ea566ad69f1f7374e393676c8debd9883..458c347728952566a87a075d496149ec2dbeb9f0 100644
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -1,11 +1,8 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -353,6 +350,430 @@ void hl_avgpool_backward(const int frameCnt,
   CHECK_SYNC("hl_avgpool_backward failed");
 }
 
+/////////////////
+__global__ void KeMaxPool3DForward(const int nthreads,
+                                   const real* inputData,
+                                   const int channels,
+                                   const int depth,
+                                   const int height,
+                                   const int width,
+                                   const int pooledD,
+                                   const int pooledH,
+                                   const int pooledW,
+                                   const int ksizeD,
+                                   const int ksizeH,
+                                   const int ksizeW,
+                                   const int strideD,
+                                   const int strideH,
+                                   const int strideW,
+                                   const int offsetD,
+                                   const int offsetH,
+                                   const int offsetW,
+                                   real* tgtData,
+                                   const int tgtStride) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
+       index += blockDim.x * gridDim.x) {
+    int pw = index % pooledW;
+    int ph = (index / pooledW) % pooledH;
+    int pd = (index / pooledW / pooledH) % pooledD;
+    int c = (index / pooledW / pooledH / pooledD) % channels;
+    int frameNum = index / pooledW / pooledH / pooledD / channels;
+    int dstart = pd * strideD - offsetD;
+    int hstart = ph * strideH - offsetH;
+    int wstart = pw * strideW - offsetW;
+    int dend = min(dstart + ksizeD, depth);
+    int hend = min(hstart + ksizeH, height);
+    int wend = min(wstart + ksizeW, width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    real maxval = -FLT_MAX;
+    inputData += (frameNum * channels + c) * depth * height * width;
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          if (maxval < inputData[(d * height + h) * width + w])
+            maxval = inputData[(d * height + h) * width + w];
+        }
+      }
+    }
+    int tgtIndex =
+        index % (pooledW * pooledH * pooledD * channels) + frameNum * tgtStride;
+    tgtData[tgtIndex] = maxval;
+  }
+}
+
+void hl_maxpool3D_forward(const int frameCnt,
+                          const real* inputData,
+                          const int channels,
+                          const int depth,
+                          const int height,
+                          const int width,
+                          const int pooledD,
+                          const int pooledH,
+                          const int pooledW,
+                          const int sizeZ,
+                          const int sizeY,
+                          const int sizeX,
+                          const int strideD,
+                          const int strideH,
+                          const int strideW,
+                          const int paddingD,
+                          const int paddingH,
+                          const int paddingW,
+                          real* tgtData,
+                          const int tgtStride) {
+  int num_kernels = pooledD * pooledH * pooledW * channels * frameCnt;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+  dim3 threads(1024, 1);
+  dim3 grid(blocks, 1);
+
+  KeMaxPool3DForward<<<grid, threads, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                           inputData,
+                                                           channels,
+                                                           depth,
+                                                           height,
+                                                           width,
+                                                           pooledD,
+                                                           pooledH,
+                                                           pooledW,
+                                                           sizeZ,
+                                                           sizeY,
+                                                           sizeX,
+                                                           strideD,
+                                                           strideH,
+                                                           strideW,
+                                                           paddingD,
+                                                           paddingH,
+                                                           paddingW,
+                                                           tgtData,
+                                                           tgtStride);
+  CHECK_SYNC("hl_maxpool3D_forward failed");
+}
+
+__global__ void KeMaxPool3DBackward(const int nthreads,
+                                    const real* inputData,
+                                    const real* outData,
+                                    const real* outGrad,
+                                    const int channels,
+                                    const int depth,
+                                    const int height,
+                                    const int width,
+                                    const int pooledD,
+                                    const int pooledH,
+                                    const int pooledW,
+                                    const int sizeZ,
+                                    const int sizeY,
+                                    const int sizeX,
+                                    const int strideD,
+                                    const int strideH,
+                                    const int strideW,
+                                    const int padD,
+                                    const int padH,
+                                    const int padW,
+                                    real scaleA,
+                                    real scaleB,
+                                    real* targetGrad,
+                                    const int outStride) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
+       index += blockDim.x * gridDim.x) {
+    // find out the local index
+    // find out the local offset
+    int offsetW = index % width + padW;
+    int offsetH = (index / width) % height + padH;
+    int offsetD = (index / width / height) % depth + padD;
+    int offsetC = (index / width / height / depth) % channels;
+    int frameNum = index / width / height / depth / channels;
+
+    int pdstart = (offsetD < sizeZ) ? 0 : (offsetD - sizeZ) / strideD + 1;
+    int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1;
+    int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1;
+    int pdend = min(offsetD / strideD + 1, pooledD);
+    int phend = min(offsetH / strideH + 1, pooledH);
+    int pwend = min(offsetW / strideW + 1, pooledW);
+
+    real gradient = 0;
+    real input = inputData[index];
+
+    outData += ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW);
+    outGrad += ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW);
+    for (int pd = pdstart; pd < pdend; ++pd) {
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          if (input == outData[(pd * pooledH + ph) * pooledW + pw])
+            gradient += outGrad[(pd * pooledH + ph) * pooledW + pw];
+        }
+      }
+    }
+    targetGrad[index] = scaleA * gradient + scaleB * targetGrad[index];
+  }
+}
+
+void hl_maxpool3D_backward(const int frameCnt,
+                           const real* inputData,
+                           const real* outData,
+                           const real* outGrad,
+                           const int channels,
+                           const int depth,
+                           const int height,
+                           const int width,
+                           const int outputD,
+                           const int outputH,
+                           const int outputW,
+                           const int sizeZ,
+                           const int sizeY,
+                           const int sizeX,
+                           const int strideD,
+                           const int strideH,
+                           const int strideW,
+                           const int paddingD,
+                           const int paddingH,
+                           const int paddingW,
+                           real scaleA,
+                           real scaleB,
+                           real* targetGrad,
+                           const int outStride) {
+  int num_kernels = depth * height * width * channels * frameCnt;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+
+  KeMaxPool3DBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                           inputData,
+                                                           outData,
+                                                           outGrad,
+                                                           channels,
+                                                           depth,
+                                                           height,
+                                                           width,
+                                                           outputD,
+                                                           outputH,
+                                                           outputW,
+                                                           sizeZ,
+                                                           sizeY,
+                                                           sizeX,
+                                                           strideD,
+                                                           strideH,
+                                                           strideW,
+                                                           paddingD,
+                                                           paddingH,
+                                                           paddingW,
+                                                           scaleA,
+                                                           scaleB,
+                                                           targetGrad,
+                                                           outStride);
+  CHECK_SYNC("hl_maxpool3D_backward");
+}
+
+__global__ void KeAvgPool3DForward(const int nthreads,
+                                   const real* inputData,
+                                   const int channels,
+                                   const int depth,
+                                   const int height,
+                                   const int width,
+                                   const int pooledD,
+                                   const int pooledH,
+                                   const int pooledW,
+                                   const int sizeZ,
+                                   const int sizeY,
+                                   const int sizeX,
+                                   const int strideD,
+                                   const int strideH,
+                                   const int strideW,
+                                   const int padD,
+                                   const int padH,
+                                   const int padW,
+                                   real* tgtData,
+                                   const int tgtStride) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
+       index += blockDim.x * gridDim.x) {
+    int pw = index % pooledW;
+    int ph = (index / pooledW) % pooledH;
+    int pd = (index / pooledW / pooledH) % pooledD;
+    int c = (index / pooledW / pooledH / pooledD) % channels;
+    int frameNum = index / pooledW / pooledH / pooledD / channels;
+    int dstart = pd * strideD - padD;
+    int hstart = ph * strideH - padH;
+    int wstart = pw * strideW - padW;
+    int dend = min(dstart + sizeZ, depth + padD);
+    int hend = min(hstart + sizeY, height + padH);
+    int wend = min(wstart + sizeX, width + padW);
+    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    dend = min(dend, depth);
+    hend = min(hend, height);
+    wend = min(wend, width);
+
+    real aveval = 0;
+    inputData += (frameNum * channels + c) * depth * height * width;
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          aveval += inputData[(d * height + h) * width + w];
+        }
+      }
+    }
+    int tgtIndex =
+        index % (pooledW * pooledH * pooledD * channels) + frameNum * tgtStride;
+    tgtData[tgtIndex] = aveval / pool_size;
+  }
+}
+
+void hl_avgpool3D_forward(const int frameCnt,
+                          const real* inputData,
+                          const int channels,
+                          const int depth,
+                          const int height,
+                          const int width,
+                          const int pooledD,
+                          const int pooledH,
+                          const int pooledW,
+                          const int sizeZ,
+                          const int sizeY,
+                          const int sizeX,
+                          const int strideD,
+                          const int strideH,
+                          const int strideW,
+                          const int paddingD,
+                          const int paddingH,
+                          const int paddingW,
+                          real* tgtData,
+                          const int tgtStride) {
+  int num_kernels = pooledD * pooledH * pooledW * channels * frameCnt;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+  KeAvgPool3DForward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                          inputData,
+                                                          channels,
+                                                          depth,
+                                                          height,
+                                                          width,
+                                                          pooledD,
+                                                          pooledH,
+                                                          pooledW,
+                                                          sizeZ,
+                                                          sizeY,
+                                                          sizeX,
+                                                          strideD,
+                                                          strideH,
+                                                          strideW,
+                                                          paddingD,
+                                                          paddingH,
+                                                          paddingW,
+                                                          tgtData,
+                                                          tgtStride);
+  CHECK_SYNC("hl_avgpool3D_forward failed");
+}
+
+__global__ void KeAvgPool3DBackward(const int nthreads,
+                                    const real* outGrad,
+                                    const int channels,
+                                    const int depth,
+                                    const int height,
+                                    const int width,
+                                    const int pooledD,
+                                    const int pooledH,
+                                    const int pooledW,
+                                    const int sizeZ,
+                                    const int sizeY,
+                                    const int sizeX,
+                                    const int strideD,
+                                    const int strideH,
+                                    const int strideW,
+                                    const int padD,
+                                    const int padH,
+                                    const int padW,
+                                    real scaleA,
+                                    real scaleB,
+                                    real* tgtGrad,
+                                    const int outStride) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
+       index += blockDim.x * gridDim.x) {
+    int offsetW = index % width + padW;
+    int offsetH = (index / width) % height + padH;
+    int offsetD = (index / width / height) % depth + padD;
+    int offsetC = (index / width / height / depth) % channels;
+    int frameNum = index / width / height / depth / channels;
+
+    int pdstart = (offsetD < sizeZ) ? 0 : (offsetD - sizeZ) / strideD + 1;
+    int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1;
+    int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1;
+    int pdend = min(offsetD / strideD + 1, pooledD);
+    int phend = min(offsetH / strideH + 1, pooledH);
+    int pwend = min(offsetW / strideW + 1, pooledW);
+
+    real gradient = 0;
+    outGrad += (frameNum * channels + offsetC) * pooledD * pooledH * pooledW;
+
+    for (int pd = pdstart; pd < pdend; ++pd) {
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          // figure out the pooling size
+          int dstart = pd * strideD - padD;
+          int hstart = ph * strideH - padH;
+          int wstart = pw * strideW - padW;
+          int dend = min(dstart + sizeZ, depth + padD);
+          int hend = min(hstart + sizeY, height + padH);
+          int wend = min(wstart + sizeX, width + padW);
+          int poolsize = (dend - dstart) * (hend - hstart) * (wend - wstart);
+          gradient += outGrad[(pd * pooledH + ph) * pooledW + pw] / poolsize;
+        }
+      }
+    }
+    tgtGrad[index] = scaleA * gradient + scaleB * tgtGrad[index];
+  }
+}
+
+void hl_avgpool3D_backward(const int frameCnt,
+                           const real* outGrad,
+                           const int channels,
+                           const int depth,
+                           const int height,
+                           const int width,
+                           const int outputD,
+                           const int outputH,
+                           const int outputW,
+                           const int sizeZ,
+                           const int sizeY,
+                           const int sizeX,
+                           const int strideD,
+                           const int strideH,
+                           const int strideW,
+                           int paddingD,
+                           int paddingH,
+                           int paddingW,
+                           real scaleA,
+                           real scaleB,
+                           real* backGrad,
+                           const int outStride) {
+  int num_kernels = depth * height * width * channels * frameCnt;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+
+  KeAvgPool3DBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                           outGrad,
+                                                           channels,
+                                                           depth,
+                                                           height,
+                                                           width,
+                                                           outputD,
+                                                           outputH,
+                                                           outputW,
+                                                           sizeZ,
+                                                           sizeY,
+                                                           sizeX,
+                                                           strideD,
+                                                           strideH,
+                                                           strideW,
+                                                           paddingD,
+                                                           paddingH,
+                                                           paddingW,
+                                                           scaleA,
+                                                           scaleB,
+                                                           backGrad,
+                                                           outStride);
+  CHECK_SYNC("hl_avgpool3D_backward failed");
+}
+/////////////////
+
 __global__ void KeBilinearInterpFw(const real* in,
                                    const size_t inImgH,
                                    const size_t inImgW,
diff --git a/paddle/gserver/layers/Pool3DLayer.cpp b/paddle/gserver/layers/Pool3DLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fc6b9bdd2f95d92a8f99f1ada6b10e4a3b44d084
--- /dev/null
+++ b/paddle/gserver/layers/Pool3DLayer.cpp
@@ -0,0 +1,198 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Pool3DLayer.h"
+#include "PoolProjectionLayer.h"
+#include "paddle/utils/Logging.h"
+
+namespace paddle {
+
+REGISTER_LAYER(pool3d, Pool3DLayer);
+
+bool Pool3DLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  /* the size of inputs for pool-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1);
+
+  const PoolConfig& conf = config_.inputs(0).pool_conf();
+  poolType_ = conf.pool_type();
+  channels_ = conf.channels();
+
+  sizeX_ = conf.size_x();
+  sizeY_ = conf.size_y();
+  sizeZ_ = conf.size_z();
+
+  strideW_ = conf.stride();
+  strideH_ = conf.stride_y();
+  strideD_ = conf.stride_z();
+
+  imgSizeW_ = conf.img_size();
+  imgSizeH_ = conf.img_size_y();
+  imgSizeD_ = conf.img_size_z();
+
+  paddingW_ = conf.padding();
+  paddingH_ = conf.padding_y();
+  paddingD_ = conf.padding_z();
+
+  outputW_ = conf.output_x();
+  outputH_ = conf.output_y();
+  outputD_ = conf.output_z();
+
+  return true;
+}
+
+size_t Pool3DLayer::getSize() {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+
+  size_t layerSize = 0;
+  //  imgSizeD_ = inputLayers_[0]->getOutput().getFrameDepth();
+  //  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
+  //  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
+  if (imgSizeH_ == 0) {
+    //    imgSizeH_ = imgSizeY_;
+  }
+  if (imgSizeW_ == 0) {
+    //    imgSizeW_ = imgSize_;
+  }
+  outputD_ = outputSize(imgSizeD_,
+                        sizeZ_,
+                        paddingD_,
+                        strideD_,
+                        /* caffeMode */ false);
+  outputH_ = outputSize(imgSizeH_,
+                        sizeY_,
+                        paddingH_,
+                        strideH_,
+                        /* caffeMode */ false);
+  outputW_ = outputSize(imgSizeW_,
+                        sizeX_,
+                        paddingW_,
+                        strideW_,
+                        /* caffeMode */ false);
+
+  layerSize = outputD_ * outputH_ * outputW_ * channels_;
+  getOutput().setFrameHeight(outputH_);
+  getOutput().setFrameWidth(outputW_);
+  getOutput().setFrameDepth(outputD_);
+  return layerSize;
+}
+
+void Pool3DLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
+  int batchSize = inMat->getHeight();
+  int outWidth = getSize();
+  resetOutput(batchSize, outWidth);
+  const MatrixPtr outMat = getOutputValue();
+
+  if (poolType_ == "avg") {
+    outMat->avgPool3DForward(*inMat,
+                             imgSizeD_,
+                             imgSizeH_,
+                             imgSizeW_,
+                             channels_,
+                             sizeZ_,
+                             sizeY_,
+                             sizeX_,
+                             strideD_,
+                             strideH_,
+                             strideW_,
+                             outputD_,
+                             outputH_,
+                             outputW_,
+                             paddingD_,
+                             paddingH_,
+                             paddingW_);
+  } else if (poolType_ == "max") {
+    outMat->maxPool3DForward(*inMat,
+                             imgSizeD_,
+                             imgSizeH_,
+                             imgSizeW_,
+                             channels_,
+                             sizeZ_,
+                             sizeY_,
+                             sizeX_,
+                             strideD_,
+                             strideH_,
+                             strideW_,
+                             outputD_,
+                             outputH_,
+                             outputW_,
+                             paddingD_,
+                             paddingH_,
+                             paddingW_);
+  } else {
+    LOG(FATAL) << "Unknown pool type: " << poolType_;
+  }
+  forwardActivation();
+}
+
+void Pool3DLayer::backward(const UpdateCallback& callback) {
+  backwardActivation();
+
+  (void)callback;
+  if (NULL == getInputGrad(0)) return;
+  MatrixPtr inMat = inputLayers_[0]->getOutputValue();
+  MatrixPtr inGradMat = inputLayers_[0]->getOutputGrad();
+  MatrixPtr outMat = getOutputValue();
+  MatrixPtr outGradMat = getOutputGrad();
+
+  if (poolType_ == "avg") {
+    inGradMat->avgPool3DBackward(*outGradMat,
+                                 imgSizeD_,
+                                 imgSizeH_,
+                                 imgSizeW_,
+                                 sizeZ_,
+                                 sizeY_,
+                                 sizeZ_,
+                                 strideD_,
+                                 strideH_,
+                                 strideW_,
+                                 outputD_,
+                                 outputH_,
+                                 outputW_,
+                                 1,
+                                 1,
+                                 paddingD_,
+                                 paddingH_,
+                                 paddingW_);
+  } else if (poolType_ == "max") {
+    inGradMat->maxPool3DBackward(*inMat,
+                                 imgSizeD_,
+                                 imgSizeH_,
+                                 imgSizeW_,
+                                 *outGradMat,
+                                 *outMat,
+                                 sizeZ_,
+                                 sizeY_,
+                                 sizeZ_,
+                                 strideD_,
+                                 strideH_,
+                                 strideW_,
+                                 outputD_,
+                                 outputH_,
+                                 outputW_,
+                                 1,
+                                 1,
+                                 paddingD_,
+                                 paddingH_,
+                                 paddingW_);
+  } else {
+    LOG(FATAL) << "Unknown pool type: " << poolType_;
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/Pool3DLayer.h b/paddle/gserver/layers/Pool3DLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..afc65ac2b0f8203e46316bd48fd4bfe0936f089b
--- /dev/null
+++ b/paddle/gserver/layers/Pool3DLayer.h
@@ -0,0 +1,48 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Layer.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief Basic parent layer of pooling
+ * Pools the input within regions
+ */
+class Pool3DLayer : public Layer {
+public:
+  explicit Pool3DLayer(const LayerConfig& config) : Layer(config) {}
+  ~Pool3DLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+  size_t getSize();
+
+protected:
+  int channels_;
+  int sizeX_, sizeY_, sizeZ_;
+  int strideW_, strideH_, strideD_;
+  int paddingW_, paddingH_, paddingD_;
+  int imgSizeW_, imgSizeH_, imgSizeD_;
+  int outputW_, outputH_, outputD_;
+  std::string poolType_;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 0f312b6ca50bc1e6317251ba785f1c61a224b54e..43fb255ae0b302fba10e871b3399a8ae1543a0db 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1206,6 +1206,75 @@ TEST(Layer, PoolLayer) {
 #endif
 }
 
+void setPool3DConfig(TestConfig* config,
+                     PoolConfig* pool,
+                     const string& poolType) {
+  // filter size
+  const int NUM_FILTERS = 16;
+  const int FILTER_SIZE = 3;
+  const int FILTER_SIZE_Y = 3;
+  const int FILTER_SIZE_Z = 3;
+  const int CHANNELS = 16;
+
+  (*config).biasSize = 0;
+  (*config).layerConfig.set_type("pool3d");
+  (*config).layerConfig.set_num_filters(NUM_FILTERS);
+
+  int kw = FILTER_SIZE, kh = FILTER_SIZE_Y, kd = FILTER_SIZE_Z;
+  int pw = 0, ph = 0, pd = 0;
+  int sw = 2, sh = 2, sd = 2;
+
+  pool->set_pool_type(poolType);
+  pool->set_pool_type("avg");
+  pool->set_channels(CHANNELS);
+  pool->set_size_x(kw);
+  pool->set_size_y(kh);
+  pool->set_size_z(kd);
+  pool->set_padding(0);
+  pool->set_padding_y(0);
+  pool->set_padding_z(0);
+  pool->set_stride(sw);
+  pool->set_stride_y(sh);
+  pool->set_stride_z(sd);
+  pool->set_start(0);
+  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
+  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
+  int od = outputSize(pool->img_size_z(), kd, pd, sd, /* caffeMode */ false);
+  pool->set_output_x(ow);
+  pool->set_output_y(oh);
+  pool->set_output_z(od);
+}
+
+void testPool3DLayer(const string& poolType, bool trans, bool useGpu) {
+  TestConfig config;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 11664, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  PoolConfig* pool = input->mutable_pool_conf();
+
+  const int IMAGE_SIZE = 9;
+  const int IMAGE_SIZE_Y = 9;
+  const int IMAGE_SIZE_Z = 9;
+
+  pool->set_img_size(IMAGE_SIZE);
+  pool->set_img_size_y(IMAGE_SIZE_Y);
+  pool->set_img_size_z(IMAGE_SIZE_Z);
+
+  setPool3DConfig(&config, pool, poolType);
+  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
+                              pool->channels());
+
+  testLayerGrad(config, "pool3d", 100, trans, useGpu);
+}
+
+TEST(Layer, Pool3DLayer) {
+  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ false);
+  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ false);
+#ifndef PADDLE_ONLY_CPU
+  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ true);
+  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ true);
+#endif
+}
+
 void testSppLayer(const string& poolType,
                   const int pyramidHeight,
                   bool trans,
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 27f7d95b752d4a423bf99fa425b10b2816575d6a..e7f1489b8ba4fe672f238eb85c52713cd0fba241 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -1190,6 +1190,224 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad,
                       outGrad.getStride());
 }
 
+void GpuMatrix::maxPool3DForward(Matrix& inputMat,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t channels,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+  CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
+
+  real* inputData = inputMat.getData();
+  size_t num = inputMat.getHeight();
+  size_t width = imgSizeW;
+  size_t height = imgSizeH;
+  size_t depth = imgSizeD;
+  CHECK(depth * height * width * channels == inputMat.getWidth());
+  CHECK(height_ == inputMat.getHeight());
+  CHECK(width_ == outputD * outputH * outputW * channels);
+
+  hl_maxpool3D_forward(num,
+                       inputData,
+                       channels,
+                       depth,
+                       height,
+                       width,
+                       outputD,
+                       outputH,
+                       outputW,
+                       sizeZ,
+                       sizeY,
+                       sizeX,
+                       strideD,
+                       strideH,
+                       strideW,
+                       paddingD,
+                       paddingH,
+                       paddingW,
+                       data_,
+                       getStride());
+}
+
+void GpuMatrix::maxPool3DBackward(Matrix& inputMat,
+                                  size_t imgSizeD,
+                                  size_t imgSizeH,
+                                  size_t imgSizeW,
+                                  Matrix& outGrad,
+                                  Matrix& outV,
+                                  size_t sizeZ,
+                                  size_t sizeY,
+                                  size_t sizeX,
+                                  size_t strideD,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t outputD,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  real scaleTargets,
+                                  real scaleOutput,
+                                  size_t paddingD,
+                                  size_t paddingH,
+                                  size_t paddingW) {
+  CHECK(inputMat.useGpu_ == true && outGrad.useGpu_ == true &&
+        outV.useGpu_ == true)
+      << "Matrix type are not equal";
+
+  real* inputData = inputMat.getData();
+  real* outData = outV.getData();
+  real* outDiff = outGrad.getData();
+  size_t frameNum = inputMat.getHeight();
+  size_t channels = outV.getWidth() / outputD / outputH / outputW;
+  size_t width = imgSizeW;
+  size_t height = imgSizeH;
+  size_t depth = imgSizeD;
+  CHECK(depth * height * width * channels == inputMat.getWidth());
+  CHECK(height_ == inputMat.getHeight());
+  CHECK(width_ == depth * width * height * channels);
+  CHECK(outGrad.getHeight() == outV.getHeight() &&
+        outGrad.getWidth() == outV.getWidth());
+
+  hl_maxpool3D_backward(frameNum,
+                        inputData,
+                        outData,
+                        outDiff,
+                        channels,
+                        depth,
+                        height,
+                        width,
+                        outputD,
+                        outputH,
+                        outputW,
+                        sizeZ,
+                        sizeY,
+                        sizeX,
+                        strideD,
+                        strideH,
+                        strideW,
+                        paddingD,
+                        paddingH,
+                        paddingW,
+                        scaleTargets,
+                        scaleOutput,
+                        data_,
+                        outGrad.getStride());
+}
+
+void GpuMatrix::avgPool3DForward(Matrix& inputMat,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t channels,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+  CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
+
+  real* inputData = inputMat.getData();
+  size_t frameNum = inputMat.getHeight();
+  size_t height = imgSizeH;
+  size_t width = imgSizeW;
+  size_t depth = imgSizeD;
+  CHECK(depth * height * width * channels == inputMat.getWidth());
+  CHECK(height_ == inputMat.getHeight());
+  CHECK(width_ == outputD * outputH * outputW * channels);
+
+  hl_avgpool3D_forward(frameNum,
+                       inputData,
+                       channels,
+                       depth,
+                       height,
+                       width,
+                       outputD,
+                       outputH,
+                       outputW,
+                       sizeZ,
+                       sizeY,
+                       sizeX,
+                       strideD,
+                       strideH,
+                       strideW,
+                       paddingD,
+                       paddingH,
+                       paddingW,
+                       data_,
+                       getStride());
+}
+
+void GpuMatrix::avgPool3DBackward(Matrix& outGrad,
+                                  size_t imgSizeD,
+                                  size_t imgSizeH,
+                                  size_t imgSizeW,
+                                  size_t sizeZ,
+                                  size_t sizeY,
+                                  size_t sizeX,
+                                  size_t strideD,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t outputD,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  real scaleTargets,
+                                  real scaleOutput,
+                                  size_t paddingD,
+                                  size_t paddingH,
+                                  size_t paddingW) {
+  CHECK(outGrad.useGpu_ == true) << "Matrix type are not equal";
+
+  real* outDiff = outGrad.getData();
+  size_t frameNum = outGrad.getHeight();
+  size_t channels = outGrad.getWidth() / outputD / outputH / outputW;
+  size_t height = imgSizeH;
+  size_t width = imgSizeW;
+  size_t depth = imgSizeD;
+  CHECK(depth * height * width * channels == width_);
+  CHECK(height_ == outGrad.getHeight());
+  CHECK(outGrad.getWidth() == outputD * outputH * outputW * channels);
+
+  hl_avgpool3D_backward(frameNum,
+                        outDiff,
+                        channels,
+                        depth,
+                        height,
+                        width,
+                        outputD,
+                        outputH,
+                        outputW,
+                        sizeZ,
+                        sizeY,
+                        sizeX,
+                        strideD,
+                        strideH,
+                        strideW,
+                        paddingD,
+                        paddingH,
+                        paddingW,
+                        scaleTargets,
+                        scaleOutput,
+                        data_,
+                        outGrad.getStride());
+}
+
 void GpuMatrix::maxSequenceForward(Matrix& input,
                                    const IVector& sequence,
                                    IVector& index) {
@@ -1930,6 +2148,290 @@ void CpuMatrix::avgPoolBackward(Matrix& input,
   }
 }
 
+void CpuMatrix::maxPool3DForward(Matrix& inputMat,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t channels,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+  real* inputData = inputMat.getData();
+  real* outData = data_;
+  size_t num = inputMat.getHeight();
+  size_t inWidth = imgSizeW;
+  size_t inHeight = imgSizeH;
+  size_t inDepth = imgSizeD;
+  CHECK(inHeight * inWidth * inDepth == inputMat.getWidth() / channels);
+  CHECK_EQ(num, this->getHeight());
+  CHECK_EQ(channels * outputH * outputW * outputD, this->getWidth());
+  size_t outStride = getStride();
+
+  /* initialize the data_ */
+  for (size_t i = 0; i < height_; i++) {
+    for (size_t j = 0; j < width_; j++) {
+      outData[(i)*outStride + j] = -(real)FLT_MAX;
+    }
+  }
+
+  /* pool max one by one */
+  for (size_t n = 0; n < num; ++n) {  // frame by frame
+    if (!isContiguous()) {
+      outData = data_ + n * outStride;
+    }
+    for (size_t c = 0; c < channels; ++c) {  // channel by channel
+      for (size_t pd = 0; pd < outputD; ++pd) {
+        for (size_t ph = 0; ph < outputH; ++ph) {
+          for (size_t pw = 0; pw < outputW; ++pw) {
+            int dstart = pd * strideD - paddingD;
+            int hstart = ph * strideH - paddingH;
+            int wstart = pw * strideW - paddingW;
+            int dend = std::min(dstart + sizeZ, inDepth);
+            int hend = std::min(hstart + sizeY, inHeight);
+            int wend = std::min(wstart + sizeX, inWidth);
+            dstart = std::max(dstart, 0);
+            hstart = std::max(hstart, 0);
+            wstart = std::max(wstart, 0);
+            for (int d = dstart; d < dend; ++d) {
+              for (int h = hstart; h < hend; ++h) {
+                for (int w = wstart; w < wend; ++w) {
+                  outData[(pd * outputH + ph) * outputW + pw] =
+                      std::max(outData[(pd * outputH + ph) * outputW + pw],
+                               inputData[(d * inHeight + h) * inWidth + w]);
+                }
+              }
+            }
+          }
+        }
+      }
+      // compute offset
+      inputData += inDepth * inHeight * inWidth;
+      outData += outputD * outputH * outputW;
+    }
+  }
+}
+
+void CpuMatrix::maxPool3DBackward(Matrix& image,
+                                  size_t imgSizeD,
+                                  size_t imgSizeH,
+                                  size_t imgSizeW,
+                                  Matrix& outGrad,
+                                  Matrix& outV,
+                                  size_t sizeZ,
+                                  size_t sizeY,
+                                  size_t sizeX,
+                                  size_t strideD,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t outputD,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  real scaleTargets,
+                                  real scaleOutput,
+                                  size_t paddingD,
+                                  size_t paddingH,
+                                  size_t paddingW) {
+  size_t num = image.getHeight();
+  size_t channels = size_t(width_ / imgSizeD / imgSizeH / imgSizeW);
+  CHECK(image.getWidth() == imgSizeD * imgSizeH * imgSizeW * channels);
+  CHECK(image.getHeight() == height_ && image.getWidth() == width_);
+  CHECK(outV.getHeight() == outGrad.getHeight() &&
+        outV.getWidth() == outGrad.getWidth());
+
+  real* tgtGrad = data_;
+  real* inData = image.getData();
+  real* otData = outV.getData();
+  real* otGrad = outGrad.getData();
+
+  size_t outStride = outV.getStride();
+  real* origOutData = otData;
+  real* origOutGrad = otGrad;
+
+  for (size_t n = 0; n < num; ++n) {
+    if (!outV.isContiguous()) {
+      otData = origOutData + n * outStride;
+      otGrad = origOutGrad + n * outStride;
+    }
+    for (size_t c = 0; c < channels; ++c) {
+      for (size_t pd = 0; pd < outputD; ++pd) {
+        for (size_t ph = 0; ph < outputH; ++ph) {
+          for (size_t pw = 0; pw < outputW; ++pw) {
+            int dstart = pd * strideD - paddingD;
+            int hstart = ph * strideH - paddingH;
+            int wstart = pw * strideW - paddingW;
+            int dend = std::min(dstart + sizeZ, imgSizeD);
+            int hend = std::min(hstart + sizeY, imgSizeH);
+            int wend = std::min(wstart + sizeX, imgSizeW);
+            dstart = std::max(dstart, 0);
+            hstart = std::max(hstart, 0);
+            wstart = std::max(wstart, 0);
+            for (int d = 0; d < dend; ++d) {
+              for (int h = hstart; h < hend; ++h) {
+                for (int w = wstart; w < wend; ++w) {
+                  tgtGrad[(d * imgSizeH + h) * imgSizeW + w] =
+                      scaleTargets *
+                          tgtGrad[(d * imgSizeH + h) * imgSizeW + w] +
+                      scaleOutput * otGrad[(pd * outputH + ph) * outputW + pw] *
+                          (inData[(d * imgSizeH + h) * imgSizeW + w] ==
+                           otData[(pd * outputH + ph) * outputW + pw]);
+                }
+              }
+            }
+          }
+        }
+      }
+      // offset
+      inData += imgSizeD * imgSizeH * imgSizeW;
+      tgtGrad += imgSizeD * imgSizeH * imgSizeW;
+      otData += outputD * outputH * outputW;
+      otGrad += outputD * outputH * outputW;
+    }
+  }
+}
+
+void CpuMatrix::avgPool3DForward(Matrix& input,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t channels,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+  // The main loop
+  size_t num = input.getHeight();
+  size_t inDepth = imgSizeD;
+  size_t inHeight = imgSizeH;
+  size_t inWidth = imgSizeW;
+  CHECK(inDepth * inHeight * inWidth * channels == input.getWidth());
+  CHECK(outputD * outputH * outputW * channels * num == height_ * width_);
+  real* tgtData = data_;
+  real* inData = input.getData();
+
+  for (size_t n = 0; n < num; ++n) {
+    if (!isContiguous()) {
+      tgtData = data_ + n * getStride();
+    }
+    for (size_t c = 0; c < channels; ++c) {
+      for (size_t pd = 0; pd < outputD; ++pd) {
+        for (size_t ph = 0; ph < outputH; ++ph) {
+          for (size_t pw = 0; pw < outputW; ++pw) {
+            int dstart = pd * strideD - paddingD;
+            int hstart = ph * strideH - paddingH;
+            int wstart = pw * strideW - paddingW;
+            int dend = std::min(dstart + sizeZ, inDepth + paddingD);
+            int hend = std::min(hstart + sizeY, inHeight + paddingH);
+            int wend = std::min(wstart + sizeX, inWidth + paddingW);
+            int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart);
+            dstart = std::max(dstart, 0);
+            hstart = std::max(hstart, 0);
+            wstart = std::max(wstart, 0);
+            dend = std::min(dend, static_cast<int>(inDepth));
+            hend = std::min(hend, static_cast<int>(inHeight));
+            wend = std::min(wend, static_cast<int>(inWidth));
+
+            CHECK(poolSize);
+            tgtData[(pd * outputH + ph) * outputW + pw] = 0;  // clear
+            for (int d = dstart; d < dend; ++d) {
+              for (int h = hstart; h < hend; ++h) {
+                for (int w = wstart; w < wend; ++w) {
+                  tgtData[(pd * outputH + ph) * outputW + pw] +=
+                      inData[(d * inHeight + h) * inWidth + w];
+                }
+              }
+            }
+            tgtData[(pd * outputH + ph) * outputW + pw] /= poolSize;
+          }
+        }
+      }
+      // compute offset
+      inData += inDepth * inHeight * inWidth;
+      tgtData += outputD * outputH * outputW;
+    }
+  }
+}
+
+void CpuMatrix::avgPool3DBackward(Matrix& input,
+                                  size_t imgSizeD,
+                                  size_t imgSizeH,
+                                  size_t imgSizeW,
+                                  size_t sizeZ,
+                                  size_t sizeY,
+                                  size_t sizeX,
+                                  size_t strideD,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t outputD,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  real scaleTargets,
+                                  real scaleOutput,
+                                  size_t paddingD,
+                                  size_t paddingH,
+                                  size_t paddingW) {
+  size_t num = input.getHeight();
+  size_t channels = input.getWidth() / outputD / outputH / outputW;
+  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == getWidth());
+  real* inData = input.getData();
+  real* outData = getData();
+
+  for (size_t n = 0; n < num; ++n) {
+    if (!input.isContiguous()) {
+      inData = input.getData() + n * input.getStride();
+    }
+    for (size_t c = 0; c < channels; ++c) {
+      for (size_t pd = 0; pd < outputD; ++pd) {
+        for (size_t ph = 0; ph < outputH; ++ph) {
+          for (size_t pw = 0; pw < outputW; ++pw) {
+            int dstart = pd * strideD - paddingD;
+            int hstart = ph * strideH - paddingH;
+            int wstart = pw * strideW - paddingW;
+            int dend = std::min(dstart + sizeZ, imgSizeD + paddingD);
+            int hend = std::min(hstart + sizeY, imgSizeH + paddingH);
+            int wend = std::min(wstart + sizeX, imgSizeW + paddingW);
+            int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart);
+            dstart = std::max(dstart, 0);
+            hstart = std::max(hstart, 0);
+            wstart = std::max(wstart, 0);
+            dend = std::min(dend, static_cast<int>(imgSizeD));
+            hend = std::min(hend, static_cast<int>(imgSizeH));
+            wend = std::min(wend, static_cast<int>(imgSizeW));
+            CHECK(poolSize);
+            for (int d = dstart; d < dend; ++d) {
+              for (int h = hstart; h < hend; ++h) {
+                for (int w = wstart; w < wend; ++w) {
+                  outData[(d * imgSizeH + h) * imgSizeW + w] +=
+                      inData[(pd * outputH + ph) * outputW + pw] / poolSize;
+                }
+              }
+            }
+          }
+        }
+      }
+      // offset
+      outData += imgSizeD * imgSizeH * imgSizeW;
+      inData += outputD * outputH * outputW;
+    }
+  }
+}
+
 /**
  * Input: one or more sequences. Each sequence contains some instances.
  * Output: output size is the number of input sequences (NOT input instances).
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index bb802bbb2c75289a45d987b22ad41ce8b1e95c98..f1534c5ea0db38a2591cef490c22ef485f9fecf0 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -928,15 +928,102 @@ public:
                                size_t paddingW) {
     LOG(FATAL) << "Not implemeted";
   }
-
   /**
-   * Input: one or more sequences. Each sequence contains some instances.
-   *
-   * Output: output size is the number of input sequences (NOT input
-   * instances).
-   *
-   * output[i] is set to max_input[i].
+   * Pooling 3D forward operation, pick out the largest element
+   * in the sizeX of value
    */
+  virtual void maxPool3DForward(Matrix& inputMat,
+                                size_t imgSizeD,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t channels,
+                                size_t sizeZ,
+                                size_t sizeY,
+                                size_t sizeX,
+                                size_t strideD,
+                                size_t strideH,
+                                size_t strideW,
+                                size_t outputD,
+                                size_t outputH,
+                                size_t outputW,
+                                size_t paddingD,
+                                size_t paddingH,
+                                size_t paddingW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  virtual void maxPool3DBackward(Matrix& image,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 Matrix& outGrad,
+                                 Matrix& outV,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 real scaleTargets,
+                                 real scaleOutput,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  virtual void avgPool3DForward(Matrix& input,
+                                size_t imgSizeD,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t channels,
+                                size_t sizeZ,
+                                size_t sizeY,
+                                size_t sizeX,
+                                size_t strideD,
+                                size_t strideH,
+                                size_t strideW,
+                                size_t outputD,
+                                size_t outputH,
+                                size_t outputW,
+                                size_t paddingD,
+                                size_t paddingH,
+                                size_t paddingW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  virtual void avgPool3DBackward(Matrix& input,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 real scaleTargets,
+                                 real scaleOutput,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+ * Input: one or more sequences. Each sequence contains some instances.
+ *
+ * Output: output size is the number of input sequences (NOT input
+ * instances).
+ *
+ * output[i] is set to max_input[i].
+ */
   virtual void maxSequenceForward(Matrix& input,
                                   const IVector& sequence,
                                   IVector& index) {
@@ -1348,6 +1435,83 @@ public:
                        size_t paddingH,
                        size_t paddingW);
 
+  /////////////////////////
+  void maxPool3DForward(Matrix& inputMat,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t channels,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void maxPool3DBackward(Matrix& image,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         Matrix& outGrad,
+                         Matrix& outV,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         real scaleTargets,
+                         real scaleOutput,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW);
+
+  void avgPool3DForward(Matrix& input,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t channels,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void avgPool3DBackward(Matrix& input,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         real scaleTargets,
+                         real scaleOutput,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW);
+
   void maxSequenceForward(Matrix& input,
                           const IVector& sequence,
                           IVector& index);
@@ -1506,6 +1670,82 @@ public:
                        real scaleOutput,
                        size_t paddingH,
                        size_t paddingW);
+  //////////////////////
+  void maxPool3DForward(Matrix& inputMat,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t channels,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void maxPool3DBackward(Matrix& image,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         Matrix& outGrad,
+                         Matrix& outV,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         real scaleTargets,
+                         real scaleOutput,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW);
+
+  void avgPool3DForward(Matrix& input,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t channels,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void avgPool3DBackward(Matrix& input,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         real scaleTargets,
+                         real scaleOutput,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW);
 
   void maxSequenceForward(Matrix& input,
                           const IVector& sequence,
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index d77478f345df97b37b214b5978f51ce47c1d791c..7a961d2751f1d6ea3f7421185bd127f1fbf8ee76 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -18,6 +18,7 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 #include "TensorCheck.h"
+#include "paddle/math/MathUtils.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
 #include "paddle/testing/TestUtil.h"
@@ -1203,4 +1204,207 @@ TEST(Matrix, warpCTC) {
   }
 }
 
+/////
+void testMatrixPool3D(int depth, int height, int width) {
+  int channel = 3;
+  int filterX = 3, filterY = 4, filterZ = 5;
+  int strideX = 2, strideY = 2, strideZ = 2;
+  int padX = 1, padY = 1, padZ = 1;
+
+  MatrixPtr cpuImage =
+      std::make_shared<CpuMatrix>(1, channel * depth * height * width);
+  MatrixPtr gpuImage =
+      std::make_shared<GpuMatrix>(1, channel * depth * height * width);
+
+  int outD = outputSize(depth, filterZ, padZ, strideZ, true);
+  int outH = outputSize(height, filterY, padZ, strideY, true);
+  int outW = outputSize(width, filterX, padZ, strideX, true);
+
+  int colBufWidth = outD * outH * outW;
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(1, channel * colBufWidth);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(1, channel * colBufWidth);
+
+  cpuImage->randomizeUniform();
+  gpuImage->copyFrom(*cpuImage);
+  // std::cout << "test maxPool3DForward...\n";
+  cpuOutput->maxPool3DForward(*cpuImage,
+                              depth,
+                              height,
+                              width,
+                              channel,
+                              filterZ,
+                              filterY,
+                              filterX,
+                              strideZ,
+                              strideY,
+                              strideX,
+                              outD,
+                              outH,
+                              outW,
+                              padZ,
+                              padY,
+                              padX);
+  gpuOutput->maxPool3DForward(*gpuImage,
+                              depth,
+                              height,
+                              width,
+                              channel,
+                              filterZ,
+                              filterY,
+                              filterX,
+                              strideZ,
+                              strideY,
+                              strideX,
+                              outD,
+                              outH,
+                              outW,
+                              padZ,
+                              padY,
+                              padX);
+  TensorCheckErr(*cpuOutput, *gpuOutput);
+
+  cpuImage->randomizeUniform();
+  gpuImage->copyFrom(*cpuImage);
+  // std::cout << "test avgPool3DForward...\n";
+  cpuOutput->avgPool3DForward(*cpuImage,
+                              depth,
+                              height,
+                              width,
+                              channel,
+                              filterZ,
+                              filterY,
+                              filterX,
+                              strideZ,
+                              strideY,
+                              strideX,
+                              outD,
+                              outH,
+                              outW,
+                              padZ,
+                              padY,
+                              padX);
+
+  gpuOutput->avgPool3DForward(*gpuImage,
+                              depth,
+                              height,
+                              width,
+                              channel,
+                              filterZ,
+                              filterY,
+                              filterX,
+                              strideZ,
+                              strideY,
+                              strideX,
+                              outD,
+                              outH,
+                              outW,
+                              padZ,
+                              padY,
+                              padX);
+  TensorCheckErr(*cpuOutput, *gpuOutput);
+  cpuImage->randomizeUniform();
+  gpuImage->copyFrom(*cpuImage);
+  cpuOutput->randomizeUniform();
+  gpuOutput->copyFrom(*cpuOutput);
+  // std::cout << "test avgPool3DBackward...\n";
+  cpuImage->avgPool3DBackward(*cpuOutput,
+                              depth,
+                              height,
+                              width,
+                              filterZ,
+                              filterY,
+                              filterX,
+                              strideZ,
+                              strideY,
+                              strideX,
+                              outD,
+                              outH,
+                              outW,
+                              1,
+                              1,
+                              padZ,
+                              padY,
+                              padX);
+
+  gpuImage->avgPool3DBackward(*gpuOutput,
+                              depth,
+                              height,
+                              width,
+                              filterZ,
+                              filterY,
+                              filterX,
+                              strideZ,
+                              strideY,
+                              strideX,
+                              outD,
+                              outH,
+                              outW,
+                              1,
+                              1,
+                              padZ,
+                              padY,
+                              padX);
+  TensorCheckErr(*cpuImage, *gpuImage);
+
+  cpuImage->randomizeUniform();
+  gpuImage->copyFrom(*cpuImage);
+  cpuOutput->randomizeUniform();
+  gpuOutput->copyFrom(*cpuOutput);
+  // std::cout << "test maxPool3DBackward...\n";
+  cpuImage->maxPool3DBackward(*cpuImage,
+                              depth,
+                              height,
+                              width,
+                              *cpuOutput,
+                              *cpuOutput,
+                              filterZ,
+                              filterY,
+                              filterX,
+                              strideZ,
+                              strideY,
+                              strideX,
+                              outD,
+                              outH,
+                              outW,
+                              1,
+                              1,
+                              padZ,
+                              padY,
+                              padX);
+
+  gpuImage->maxPool3DBackward(*gpuImage,
+                              depth,
+                              height,
+                              width,
+                              *gpuOutput,
+                              *gpuOutput,
+                              filterZ,
+                              filterY,
+                              filterX,
+                              strideZ,
+                              strideY,
+                              strideX,
+                              outD,
+                              outH,
+                              outW,
+                              1,
+                              1,
+                              padZ,
+                              padY,
+                              padX);
+  TensorCheckErr(*cpuImage, *gpuImage);
+}
+
+TEST(Matrix, Pool3D) {
+  for (auto depth : {9, 16, 64, 128}) {
+    for (auto height : {9, 11, 128, 256}) {
+      for (auto width : {9, 32, 128}) {
+        VLOG(3) << "depth=" << depth << " height=" << height
+                << " width=" << width;
+        testMatrixPool3D(depth, height, width);
+      }
+    }
+  }
+}
+
 #endif
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 0547ac93cd183afbcede41d280c6b4b16ed7dab1..77fd0c5890b45add6538348a81066d84ef9c1703 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -186,6 +186,7 @@ void Argument::resizeAndCopyFrom(const Argument& src,
   resizeAndCopy(strs, src.strs, useGpu, stream);
   frameWidth = src.frameWidth;
   frameHeight = src.frameHeight;
+  frameDepth = src.frameDepth;
 }
 
 int32_t Argument::resizeAndCopyFrom(const Argument& src,
@@ -206,6 +207,7 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src,
   dataId = src.dataId;
   frameWidth = src.frameWidth;
   frameHeight = src.frameHeight;
+  frameDepth = src.frameDepth;
 
   if (!src.sequenceStartPositions) {
     // non-sequence input, copy samples directly
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index d8d7a4398f99a2794c5d25528a7d582f5ed629ba..ba3ad2fd4d99242995fad66a16ddb521e6ce9e9e 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -1,11 +1,8 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -35,6 +32,7 @@ struct Argument {
         strs(nullptr),
         frameHeight(0),
         frameWidth(0),
+        frameDepth(0),
         sequenceStartPositions(nullptr),
         subSequenceStartPositions(nullptr),
         cpuSequenceDims(nullptr),
@@ -64,6 +62,7 @@ struct Argument {
     allCount = argument.allCount;
     frameHeight = argument.frameHeight;
     frameWidth = argument.frameWidth;
+    frameDepth = argument.frameDepth;
     dataId = argument.dataId;
   }
 
@@ -76,6 +75,7 @@ struct Argument {
   // A dataBatch includes batchSize frames, one frame maybe not only vector
   size_t frameHeight;
   size_t frameWidth;
+  size_t frameDepth;
 
   // If NULL, each position is treated independently.
   // Otherwise, its size should be #NumberOfSequences + 1.
@@ -136,8 +136,10 @@ struct Argument {
   }
   size_t getFrameHeight() const { return frameHeight; }
   size_t getFrameWidth() const { return frameWidth; }
+  size_t getFrameDepth() const { return frameDepth; }
   void setFrameHeight(size_t h) { frameHeight = h; }
   void setFrameWidth(size_t w) { frameWidth = w; }
+  void setFrameDepth(size_t d) { frameDepth = d; }
 
   int64_t getNumSequences() const {
     return sequenceStartPositions ? sequenceStartPositions->getSize() - 1
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index 4f3d5bf3f6cb96c97285f40e3a3d100c2af47ad5..42cf10e9d3f91303da6c9c6dd8351336b599abf0 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -82,6 +82,12 @@ message ConvConfig {
 
   // if not set, use img_size
   optional uint32 img_size_y = 14;
+
+  optional uint32 filter_size_z = 15 [ default = 1 ];
+  optional uint32 padding_z = 16 [ default = 1 ];
+  optional uint32 stride_z = 17 [ default = 1 ];
+  optional uint32 output_z = 18 [ default = 1 ];
+  optional uint32 img_size_z = 19 [ default = 1 ];
 }
 
 message PoolConfig {
@@ -124,6 +130,12 @@ message PoolConfig {
 
   // if not set, use padding
   optional uint32 padding_y = 13;
+
+  optional uint32 size_z = 14 [ default = 1 ];
+  optional uint32 stride_z = 15 [ default = 1 ];
+  optional uint32 output_z = 16 [ default = 1 ];
+  optional uint32 img_size_z = 17 [ default = 1 ];
+  optional uint32 padding_z = 18 [ default = 1 ];
 }
 
 message SppConfig {