Add3DPooling

2377d719 · chengduoZH · 0f3a3e98 · 2377d719 · 2377d719 · 2377d719
12 changed file
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -173,6 +173,202 @@ extern void hl_avgpool_backward(const int frameCnt,
                                real* backGrad,
                                const int outStride);
+/**
+ * @brief   Maximum pool forward.
+ *
+ * @param[in]   frameCnt    batch size of input image.
+ * @param[in]   inputData   input data.
+ * @param[in]   channels    number of channel.
+ * @param[in]   depth       image depth.
+ * @param[in]   height      image height.
+ * @param[in]   width       image width.
+ * @param[in]   pooledD     output image depth.
+ * @param[in]   pooledH     output image height.
+ * @param[in]   pooledW     output image width.
+ * @param[in]   sizeZ       depth of pooling window.
+ * @param[in]   sizeY       height of pooling window.
+ * @param[in]   sizeX       width of pooling window.
+ * @param[in]   strideD     pooling stride depth.
+ * @param[in]   strideH     pooling stride height.
+ * @param[in]   strideW     pooling stride width.
+ * @param[in]   paddingD    padding depth.
+ * @param[in]   paddingH    padding height.
+ * @param[in]   paddingW    padding width.
+ * @param[out]  tgtData     output data.
+ * @param[in]   tgtStride   stride between output data samples.
+ *
+ */
+extern void hl_maxpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride);
+/**
+ * @brief   Maximum pool backward.
+ *
+ * @param[in]   frameCnt    batch size of input image.
+ * @param[in]   inputData   input data.
+ * @param[in]   outData     output data of the forward pass.
+ * @param[in]   outGrad     gradient with respect to the output data.
+ * @param[in]   channels    number of channel.
+ * @param[in]   depth       image depth.
+ * @param[in]   height      image height.
+ * @param[in]   width       image width.
+ * @param[in]   pooledD     output image depth.
+ * @param[in]   pooledH     output image height.
+ * @param[in]   pooledW     output image width.
+ * @param[in]   sizeZ       depth of pooling window.
+ * @param[in]   sizeY       height of pooling window.
+ * @param[in]   sizeX       width of pooling window.
+ * @param[in]   strideD     pooling stride depth.
+ * @param[in]   strideH     pooling stride height.
+ * @param[in]   strideW     pooling stride width.
+ * @param[in]   scaleA      scale.
+ * @param[in]   scaleB      scale.
+ * @param[in]   paddingD    padding depth.
+ * @param[in]   paddingH    padding height.
+ * @param[in]   paddingW    padding width.
+ * @param[out]  targetGrad  output grad.
+ * @param[in]   outStride   stride between output data samples.
+ *
+ */
+extern void hl_maxpool3D_backward(const int frameCnt,
+                                  const real* inputData,
+                                  const real* outData,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* targetGrad,
+                                  const int outStride);
+/**
+ * @brief   Average pool forward.
+ *
+ * @param[in]   frameCnt    batch size of input image.
+ * @param[in]   inputData   input data.
+ * @param[in]   channels    number of channel.
+ * @param[in]   depth       image depth.
+ * @param[in]   height      image height.
+ * @param[in]   width       image width.
+ * @param[in]   pooledD     output image depth.
+ * @param[in]   pooledH     output image height.
+ * @param[in]   pooledW     output image width.
+ * @param[in]   sizeZ       depth of pooling window.
+ * @param[in]   sizeY       height of pooling window.
+ * @param[in]   sizeX       width of pooling window.
+ * @param[in]   strideD     pooling stride depth.
+ * @param[in]   strideH     pooling stride height.
+ * @param[in]   strideW     pooling stride width.
+ * @param[in]   paddingD    padding depth.
+ * @param[in]   paddingH    padding height.
+ * @param[in]   paddingW    padding width.
+ * @param[out]  tgtData     output data.
+ * @param[in]   tgtStride   stride between output data samples.
+ *
+ */
+extern void hl_avgpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride);
+/**
+ * @brief   Average pool backward.
+ *
+ * @param[in]   frameCnt    batch size of input image.
+ * @param[in]   outGrad     output grad data.
+ * @param[in]   channels    number of channel.
+ * @param[in]   depth       image depth.
+ * @param[in]   height      image height.
+ * @param[in]   width       image width.
+ * @param[in]   pooledD     output image depth.
+ * @param[in]   pooledH     output image height.
+ * @param[in]   pooledW     output image width.
+ * @param[in]   sizeZ       depth of pooling window.
+ * @param[in]   sizeY       height of pooling window.
+ * @param[in]   sizeX       width of pooling window.
+ * @param[in]   strideD     pooling stride depth.
+ * @param[in]   strideH     pooling stride height.
+ * @param[in]   strideW     pooling stride width.
+ * @param[in]   paddingD    padding depth.
+ * @param[in]   paddingH    padding height.
+ * @param[in]   paddingW    padding width.
+ * @param[in]   scaleA      scale.
+ * @param[in]   scaleB      scale.
+ * @param[out]  backGrad    output grad.
+ * @param[in]   outStride   stride between output data samples.
+ *
+ */
+extern void hl_avgpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  int paddingD,
+                                  int paddingH,
+                                  int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* backGrad,
+                                  const int outStride);
 /**
 * @brief   Bilinear interpolation forward.
 *
@@ -275,4 +471,4 @@ extern void hl_maxout_backward(real* inGrad,
                               size_t featLen,
                               size_t groups);
-#endif /* HL_CNN_H_ */
+#endif  // HL_CNN_H_
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -87,6 +87,96 @@ inline void hl_avgpool_backward(const int frameCnt,
                                real* backGrad,
                                const int outStride) {}
+inline void hl_maxpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride) {}
+inline void hl_maxpool3D_backward(const int frameCnt,
+                                  const real* inputData,
+                                  const real* outData,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* targetGrad,
+                                  const int outStride) {}
+inline void hl_avgpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride) {}
+inline void hl_avgpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  int paddingD,
+                                  int paddingH,
+                                  int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* backGrad,
+                                  const int outStride) {}
 inline void hl_bilinear_forward(const real* inData,
                                const size_t inImgH,
                                const size_t inImgW,

--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
--- a/paddle/gserver/layers/Pool3DLayer.cpp
+++ b/paddle/gserver/layers/Pool3DLayer.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "Pool3DLayer.h"
+#include "PoolProjectionLayer.h"
+#include "paddle/utils/Logging.h"
+namespace paddle {
+REGISTER_LAYER(pool3d, Pool3DLayer);
+bool Pool3DLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  // A 3D pooling layer is configured with exactly one input.
+  CHECK_EQ(config_.inputs_size(), 1);
+  const PoolConfig& poolConf = config_.inputs(0).pool_conf();
+
+  poolType_ = poolConf.pool_type();
+  channels_ = poolConf.channels();
+
+  // Width (X) axis settings.
+  imgSizeW_ = poolConf.img_size();
+  sizeX_ = poolConf.size_x();
+  strideW_ = poolConf.stride();
+  paddingW_ = poolConf.padding();
+  outputW_ = poolConf.output_x();
+
+  // Height (Y) axis settings.
+  imgSizeH_ = poolConf.img_size_y();
+  sizeY_ = poolConf.size_y();
+  strideH_ = poolConf.stride_y();
+  paddingH_ = poolConf.padding_y();
+  outputH_ = poolConf.output_y();
+
+  // Depth (Z) axis settings.
+  imgSizeD_ = poolConf.img_size_z();
+  sizeZ_ = poolConf.size_z();
+  strideD_ = poolConf.stride_z();
+  paddingD_ = poolConf.padding_z();
+  outputD_ = poolConf.output_z();
+
+  return true;
+}
+// Recomputes the output depth/height/width from the configured image size,
+// window size, padding and stride (non-caffe mode), stores the frame
+// dimensions on the output argument and returns the per-sample output size
+// (outputD * outputH * outputW * channels).
+size_t Pool3DLayer::getSize() {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+
+  // The image dimensions used here are the ones read from the pool config in
+  // init(); they are not refreshed from the input layer's frame size.
+  outputD_ = outputSize(
+      imgSizeD_, sizeZ_, paddingD_, strideD_, /* caffeMode */ false);
+  outputH_ = outputSize(
+      imgSizeH_, sizeY_, paddingH_, strideH_, /* caffeMode */ false);
+  outputW_ = outputSize(
+      imgSizeW_, sizeX_, paddingW_, strideW_, /* caffeMode */ false);
+
+  getOutput().setFrameHeight(outputH_);
+  getOutput().setFrameWidth(outputW_);
+  getOutput().setFrameDepth(outputD_);
+
+  // Widen the first factor so the product is evaluated in size_t rather than
+  // in int, where a large volume could overflow before the conversion.
+  return static_cast<size_t>(outputD_) * outputH_ * outputW_ * channels_;
+}
+void Pool3DLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  // One row of the input matrix per sample; getSize() also refreshes the
+  // output dimensions used by the pooling calls below.
+  const MatrixPtr& input = inputLayers_[0]->getOutputValue();
+  const int numSamples = input->getHeight();
+  const int sampleSize = getSize();
+  resetOutput(numSamples, sampleSize);
+
+  const MatrixPtr output = getOutputValue();
+  if (poolType_ == "max") {
+    output->maxPool3DForward(*input,
+                             imgSizeD_,
+                             imgSizeH_,
+                             imgSizeW_,
+                             channels_,
+                             sizeZ_,
+                             sizeY_,
+                             sizeX_,
+                             strideD_,
+                             strideH_,
+                             strideW_,
+                             outputD_,
+                             outputH_,
+                             outputW_,
+                             paddingD_,
+                             paddingH_,
+                             paddingW_);
+  } else if (poolType_ == "avg") {
+    output->avgPool3DForward(*input,
+                             imgSizeD_,
+                             imgSizeH_,
+                             imgSizeW_,
+                             channels_,
+                             sizeZ_,
+                             sizeY_,
+                             sizeX_,
+                             strideD_,
+                             strideH_,
+                             strideW_,
+                             outputD_,
+                             outputH_,
+                             outputW_,
+                             paddingD_,
+                             paddingH_,
+                             paddingW_);
+  } else {
+    LOG(FATAL) << "Unknown pool type: " << poolType_;
+  }
+
+  forwardActivation();
+}
+// Propagates the output gradient back to the input layer using the pooling
+// type selected in the config ("avg" or "max"). Both scale factors passed to
+// the matrix routines are 1.
+void Pool3DLayer::backward(const UpdateCallback& callback) {
+  backwardActivation();
+  (void)callback;  // the callback is not used by this layer
+
+  // Nothing to propagate when the input has no gradient buffer.
+  if (NULL == getInputGrad(0)) return;
+
+  MatrixPtr inMat = inputLayers_[0]->getOutputValue();
+  MatrixPtr inGradMat = inputLayers_[0]->getOutputGrad();
+  MatrixPtr outMat = getOutputValue();
+  MatrixPtr outGradMat = getOutputGrad();
+
+  // The window arguments must be passed in (sizeZ, sizeY, sizeX) order, as
+  // declared by Matrix::avgPool3DBackward / Matrix::maxPool3DBackward.
+  if (poolType_ == "avg") {
+    inGradMat->avgPool3DBackward(*outGradMat,
+                                 imgSizeD_,
+                                 imgSizeH_,
+                                 imgSizeW_,
+                                 sizeZ_,
+                                 sizeY_,
+                                 sizeX_,
+                                 strideD_,
+                                 strideH_,
+                                 strideW_,
+                                 outputD_,
+                                 outputH_,
+                                 outputW_,
+                                 1,
+                                 1,
+                                 paddingD_,
+                                 paddingH_,
+                                 paddingW_);
+  } else if (poolType_ == "max") {
+    inGradMat->maxPool3DBackward(*inMat,
+                                 imgSizeD_,
+                                 imgSizeH_,
+                                 imgSizeW_,
+                                 *outGradMat,
+                                 *outMat,
+                                 sizeZ_,
+                                 sizeY_,
+                                 sizeX_,
+                                 strideD_,
+                                 strideH_,
+                                 strideW_,
+                                 outputD_,
+                                 outputH_,
+                                 outputW_,
+                                 1,
+                                 1,
+                                 paddingD_,
+                                 paddingH_,
+                                 paddingW_);
+  } else {
+    LOG(FATAL) << "Unknown pool type: " << poolType_;
+  }
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/Pool3DLayer.h
+++ b/paddle/gserver/layers/Pool3DLayer.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <vector>
+#include "Layer.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+namespace paddle {
+/**
+ * @brief 3D pooling layer.
+ * Pools its single input over depth, height and width windows; the pooling
+ * type ("avg" or "max") is read from the layer's pool config.
+ */
+class Pool3DLayer : public Layer {
+public:
+  explicit Pool3DLayer(const LayerConfig& config) : Layer(config) {}
+  ~Pool3DLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+
+  // Recomputes the output dimensions and returns the per-sample output size.
+  size_t getSize();
+
+protected:
+  // Number of channels in the pooled image.
+  int channels_;
+  // Pooling window extent along width (X), height (Y) and depth (Z).
+  int sizeX_;
+  int sizeY_;
+  int sizeZ_;
+  // Pooling stride along width, height and depth.
+  int strideW_;
+  int strideH_;
+  int strideD_;
+  // Padding along width, height and depth.
+  int paddingW_;
+  int paddingH_;
+  int paddingD_;
+  // Input image extent along width, height and depth.
+  int imgSizeW_;
+  int imgSizeH_;
+  int imgSizeD_;
+  // Output extent along width, height and depth.
+  int outputW_;
+  int outputH_;
+  int outputD_;
+  // Pooling type name from the config; "avg" and "max" are handled.
+  std::string poolType_;
+};
+}  // namespace paddle
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1206,6 +1206,75 @@ TEST(Layer, PoolLayer) {
 #endif
 }
+// Fills the layer and pooling settings shared by the pool3d gradient tests:
+// a 3x3x3 window, stride 2 and zero padding over 16 channels, using the
+// requested pooling type. The image size must already be set on `pool`,
+// because the output sizes are derived from it here.
+void setPool3DConfig(TestConfig* config,
+                     PoolConfig* pool,
+                     const string& poolType) {
+  // filter size
+  const int NUM_FILTERS = 16;
+  const int FILTER_SIZE = 3;
+  const int FILTER_SIZE_Y = 3;
+  const int FILTER_SIZE_Z = 3;
+  const int CHANNELS = 16;
+
+  config->biasSize = 0;
+  config->layerConfig.set_type("pool3d");
+  config->layerConfig.set_num_filters(NUM_FILTERS);
+
+  int kw = FILTER_SIZE, kh = FILTER_SIZE_Y, kd = FILTER_SIZE_Z;
+  int pw = 0, ph = 0, pd = 0;
+  int sw = 2, sh = 2, sd = 2;
+
+  // Use the caller's pooling type; it must not be overwritten afterwards or
+  // the "max" test would silently exercise average pooling instead.
+  pool->set_pool_type(poolType);
+  pool->set_channels(CHANNELS);
+  pool->set_size_x(kw);
+  pool->set_size_y(kh);
+  pool->set_size_z(kd);
+  // Set padding from the same values used for the output size computation.
+  pool->set_padding(pw);
+  pool->set_padding_y(ph);
+  pool->set_padding_z(pd);
+  pool->set_stride(sw);
+  pool->set_stride_y(sh);
+  pool->set_stride_z(sd);
+  pool->set_start(0);
+
+  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
+  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
+  int od = outputSize(pool->img_size_z(), kd, pd, sd, /* caffeMode */ false);
+  pool->set_output_x(ow);
+  pool->set_output_y(oh);
+  pool->set_output_z(od);
+}
+// Runs the gradient check for a pool3d layer of the given pooling type on a
+// 9x9x9 input with 16 channels.
+void testPool3DLayer(const string& poolType, bool trans, bool useGpu) {
+  TestConfig config;
+  // 11664 = 9 * 9 * 9 (image volume) * 16 (channels set by setPool3DConfig).
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 11664, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  PoolConfig* pool = input->mutable_pool_conf();
+
+  const int IMAGE_SIZE = 9;
+  const int IMAGE_SIZE_Y = 9;
+  const int IMAGE_SIZE_Z = 9;
+  pool->set_img_size(IMAGE_SIZE);
+  pool->set_img_size_y(IMAGE_SIZE_Y);
+  pool->set_img_size_z(IMAGE_SIZE_Z);
+
+  setPool3DConfig(&config, pool, poolType);
+  // The layer output covers the whole pooled volume, so the depth extent
+  // must be part of the configured size as well.
+  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
+                              pool->output_z() * pool->channels());
+
+  testLayerGrad(config, "pool3d", 100, trans, useGpu);
+}
+TEST(Layer, Pool3DLayer) {
+  const char* const poolTypes[] = {"avg", "max"};
+
+  // CPU runs for every pooling type.
+  for (const char* type : poolTypes) {
+    testPool3DLayer(type, /* trans= */ false, /* useGpu= */ false);
+  }
+#ifndef PADDLE_ONLY_CPU
+  // GPU runs, only when the build is not CPU-only.
+  for (const char* type : poolTypes) {
+    testPool3DLayer(type, /* trans= */ false, /* useGpu= */ true);
+  }
+#endif
+}
 void testSppLayer(const string& poolType,
                  const int pyramidHeight,
                  bool trans,

--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -928,15 +928,102 @@ public:
                               size_t paddingW) {
    LOG(FATAL) << "Not implemeted";
  }
  /**
-   * Input: one or more sequences. Each sequence contains some instances.
+   * Pooling 3D forward operation, pick out the largest element
-   *
+   * within each sizeZ x sizeY x sizeX pooling window.
-   * Output: output size is the number of input sequences (NOT input
-   * instances).
-   *
-   * output[i] is set to max_input[i].
   */
+  virtual void maxPool3DForward(Matrix& inputMat,
+                                size_t imgSizeD,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t channels,
+                                size_t sizeZ,
+                                size_t sizeY,
+                                size_t sizeX,
+                                size_t strideD,
+                                size_t strideH,
+                                size_t strideW,
+                                size_t outputD,
+                                size_t outputH,
+                                size_t outputW,
+                                size_t paddingD,
+                                size_t paddingH,
+                                size_t paddingW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+  virtual void maxPool3DBackward(Matrix& image,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 Matrix& outGrad,
+                                 Matrix& outV,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 real scaleTargets,
+                                 real scaleOutput,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+  virtual void avgPool3DForward(Matrix& input,
+                                size_t imgSizeD,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t channels,
+                                size_t sizeZ,
+                                size_t sizeY,
+                                size_t sizeX,
+                                size_t strideD,
+                                size_t strideH,
+                                size_t strideW,
+                                size_t outputD,
+                                size_t outputH,
+                                size_t outputW,
+                                size_t paddingD,
+                                size_t paddingH,
+                                size_t paddingW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+  virtual void avgPool3DBackward(Matrix& input,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 real scaleTargets,
+                                 real scaleOutput,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+  /**
+   * Input: one or more sequences. Each sequence contains some instances.
+   *
+   * Output: output size is the number of input sequences (NOT input
+   * instances).
+   *
+   * output[i] is set to max_input[i].
+   */
  virtual void maxSequenceForward(Matrix& input,
                                  const IVector& sequence,
                                  IVector& index) {
@@ -1348,6 +1435,83 @@ public:
                       size_t paddingH,
                       size_t paddingW);
+  /////////////////////////
+  void maxPool3DForward(Matrix& inputMat,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t channels,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+  void maxPool3DBackward(Matrix& image,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         Matrix& outGrad,
+                         Matrix& outV,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         real scaleTargets,
+                         real scaleOutput,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW);
+  void avgPool3DForward(Matrix& input,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t channels,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+  void avgPool3DBackward(Matrix& input,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         real scaleTargets,
+                         real scaleOutput,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW);
  void maxSequenceForward(Matrix& input,
                          const IVector& sequence,
                          IVector& index);
@@ -1506,6 +1670,82 @@ public:
                       real scaleOutput,
                       size_t paddingH,
                       size_t paddingW);
+  // 3D max pooling, forward pass.
+  void maxPool3DForward(Matrix& inputMat,  // matrix holding the input image data
+                        size_t imgSizeD,  // input image depth
+                        size_t imgSizeH,  // input image height
+                        size_t imgSizeW,  // input image width
+                        size_t channels,  // number of channels
+                        size_t sizeZ,  // pooling window depth
+                        size_t sizeY,  // pooling window height
+                        size_t sizeX,  // pooling window width
+                        size_t strideD,  // pooling stride along depth
+                        size_t strideH,  // pooling stride along height
+                        size_t strideW,  // pooling stride along width
+                        size_t outputD,  // output depth
+                        size_t outputH,  // output height
+                        size_t outputW,  // output width
+                        size_t paddingD,  // padding along depth
+                        size_t paddingH,  // padding along height
+                        size_t paddingW);  // padding along width
+  void maxPool3DBackward(Matrix& image,  // 3D max pooling, backward pass; presumably the forward input - TODO confirm
+                         size_t imgSizeD,  // image depth
+                         size_t imgSizeH,  // image height
+                         size_t imgSizeW,  // image width
+                         Matrix& outGrad,  // presumably the gradient w.r.t. the pooled output - TODO confirm
+                         Matrix& outV,  // presumably the pooled output values - TODO confirm
+                         size_t sizeZ,  // pooling window depth
+                         size_t sizeY,  // pooling window height
+                         size_t sizeX,  // pooling window width
+                         size_t strideD,  // pooling stride along depth
+                         size_t strideH,  // pooling stride along height
+                         size_t strideW,  // pooling stride along width
+                         size_t outputD,  // pooled output depth
+                         size_t outputH,  // pooled output height
+                         size_t outputW,  // pooled output width
+                         real scaleTargets,  // presumably scales the existing target values - TODO confirm
+                         real scaleOutput,  // presumably scales the propagated gradient - TODO confirm
+                         size_t paddingD,  // padding along depth
+                         size_t paddingH,  // padding along height
+                         size_t paddingW);  // padding along width
+  void avgPool3DForward(Matrix& input,  // 3D average pooling, forward pass
+                        size_t imgSizeD,  // input image depth
+                        size_t imgSizeH,  // input image height
+                        size_t imgSizeW,  // input image width
+                        size_t channels,  // number of channels
+                        size_t sizeZ,  // pooling window depth
+                        size_t sizeY,  // pooling window height
+                        size_t sizeX,  // pooling window width
+                        size_t strideD,  // pooling stride along depth
+                        size_t strideH,  // pooling stride along height
+                        size_t strideW,  // pooling stride along width
+                        size_t outputD,  // output depth
+                        size_t outputH,  // output height
+                        size_t outputW,  // output width
+                        size_t paddingD,  // padding along depth
+                        size_t paddingH,  // padding along height
+                        size_t paddingW);  // padding along width
+  void avgPool3DBackward(Matrix& input,  // 3D average pooling, backward pass
+                         size_t imgSizeD,  // image depth
+                         size_t imgSizeH,  // image height
+                         size_t imgSizeW,  // image width
+                         size_t sizeZ,  // pooling window depth
+                         size_t sizeY,  // pooling window height
+                         size_t sizeX,  // pooling window width
+                         size_t strideD,  // pooling stride along depth
+                         size_t strideH,  // pooling stride along height
+                         size_t strideW,  // pooling stride along width
+                         size_t outputD,  // pooled output depth
+                         size_t outputH,  // pooled output height
+                         size_t outputW,  // pooled output width
+                         real scaleTargets,  // presumably scales the existing target values - TODO confirm
+                         real scaleOutput,  // presumably scales the propagated gradient - TODO confirm
+                         size_t paddingD,  // padding along depth
+                         size_t paddingH,  // padding along height
+                         size_t paddingW);  // padding along width
  void maxSequenceForward(Matrix& input,
                          const IVector& sequence,

--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include "TensorCheck.h"
+#include "paddle/math/MathUtils.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
 #include "paddle/testing/TestUtil.h"
@@ -1203,4 +1204,207 @@ TEST(Matrix, warpCTC) {
  }
 }
+// Runs the 3D max/avg pooling forward and backward passes on a CpuMatrix and
+// on a GpuMatrix holding identical data, and checks after each pass that the
+// two results agree (TensorCheckErr).
+//
+// depth/height/width: extent of the single 3-channel input volume under test.
+void testMatrixPool3D(int depth, int height, int width) {
+  int channel = 3;
+  int filterX = 3, filterY = 4, filterZ = 5;
+  int strideX = 2, strideY = 2, strideZ = 2;
+  int padX = 1, padY = 1, padZ = 1;
+  MatrixPtr cpuImage =
+      std::make_shared<CpuMatrix>(1, channel * depth * height * width);
+  MatrixPtr gpuImage =
+      std::make_shared<GpuMatrix>(1, channel * depth * height * width);
+  // Each axis uses its own padding, so the output sizes stay correct even if
+  // padX/padY/padZ are ever given different values.
+  int outD = outputSize(depth, filterZ, padZ, strideZ, true);
+  int outH = outputSize(height, filterY, padY, strideY, true);
+  int outW = outputSize(width, filterX, padX, strideX, true);
+  int colBufWidth = outD * outH * outW;
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(1, channel * colBufWidth);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(1, channel * colBufWidth);
+  // Same random input on both devices.
+  cpuImage->randomizeUniform();
+  gpuImage->copyFrom(*cpuImage);
+  // maxPool3DForward: CPU vs GPU.
+  cpuOutput->maxPool3DForward(*cpuImage,
+                              depth,
+                              height,
+                              width,
+                              channel,
+                              filterZ,
+                              filterY,
+                              filterX,
+                              strideZ,
+                              strideY,
+                              strideX,
+                              outD,
+                              outH,
+                              outW,
+                              padZ,
+                              padY,
+                              padX);
+  gpuOutput->maxPool3DForward(*gpuImage,
+                              depth,
+                              height,
+                              width,
+                              channel,
+                              filterZ,
+                              filterY,
+                              filterX,
+                              strideZ,
+                              strideY,
+                              strideX,
+                              outD,
+                              outH,
+                              outW,
+                              padZ,
+                              padY,
+                              padX);
+  TensorCheckErr(*cpuOutput, *gpuOutput);
+  // Fresh random input for the next pass.
+  cpuImage->randomizeUniform();
+  gpuImage->copyFrom(*cpuImage);
+  // avgPool3DForward: CPU vs GPU.
+  cpuOutput->avgPool3DForward(*cpuImage,
+                              depth,
+                              height,
+                              width,
+                              channel,
+                              filterZ,
+                              filterY,
+                              filterX,
+                              strideZ,
+                              strideY,
+                              strideX,
+                              outD,
+                              outH,
+                              outW,
+                              padZ,
+                              padY,
+                              padX);
+  gpuOutput->avgPool3DForward(*gpuImage,
+                              depth,
+                              height,
+                              width,
+                              channel,
+                              filterZ,
+                              filterY,
+                              filterX,
+                              strideZ,
+                              strideY,
+                              strideX,
+                              outD,
+                              outH,
+                              outW,
+                              padZ,
+                              padY,
+                              padX);
+  TensorCheckErr(*cpuOutput, *gpuOutput);
+  // Backward passes: randomize both the image-sized and output-sized buffers.
+  cpuImage->randomizeUniform();
+  gpuImage->copyFrom(*cpuImage);
+  cpuOutput->randomizeUniform();
+  gpuOutput->copyFrom(*cpuOutput);
+  // avgPool3DBackward: the result lands in the image-sized matrices.
+  cpuImage->avgPool3DBackward(*cpuOutput,
+                              depth,
+                              height,
+                              width,
+                              filterZ,
+                              filterY,
+                              filterX,
+                              strideZ,
+                              strideY,
+                              strideX,
+                              outD,
+                              outH,
+                              outW,
+                              1,  // scaleTargets
+                              1,  // scaleOutput
+                              padZ,
+                              padY,
+                              padX);
+  gpuImage->avgPool3DBackward(*gpuOutput,
+                              depth,
+                              height,
+                              width,
+                              filterZ,
+                              filterY,
+                              filterX,
+                              strideZ,
+                              strideY,
+                              strideX,
+                              outD,
+                              outH,
+                              outW,
+                              1,  // scaleTargets
+                              1,  // scaleOutput
+                              padZ,
+                              padY,
+                              padX);
+  TensorCheckErr(*cpuImage, *gpuImage);
+  cpuImage->randomizeUniform();
+  gpuImage->copyFrom(*cpuImage);
+  cpuOutput->randomizeUniform();
+  gpuOutput->copyFrom(*cpuOutput);
+  // maxPool3DBackward: the image matrix is passed as `image` and is also the
+  // object the call is made on; the output matrix serves as both outGrad and
+  // outV.
+  cpuImage->maxPool3DBackward(*cpuImage,
+                              depth,
+                              height,
+                              width,
+                              *cpuOutput,
+                              *cpuOutput,
+                              filterZ,
+                              filterY,
+                              filterX,
+                              strideZ,
+                              strideY,
+                              strideX,
+                              outD,
+                              outH,
+                              outW,
+                              1,  // scaleTargets
+                              1,  // scaleOutput
+                              padZ,
+                              padY,
+                              padX);
+  gpuImage->maxPool3DBackward(*gpuImage,
+                              depth,
+                              height,
+                              width,
+                              *gpuOutput,
+                              *gpuOutput,
+                              filterZ,
+                              filterY,
+                              filterX,
+                              strideZ,
+                              strideY,
+                              strideX,
+                              outD,
+                              outH,
+                              outW,
+                              1,  // scaleTargets
+                              1,  // scaleOutput
+                              padZ,
+                              padY,
+                              padX);
+  TensorCheckErr(*cpuImage, *gpuImage);
+}
+TEST(Matrix, Pool3D) {
+  // Sweep every depth x height x width combination through the CPU-vs-GPU
+  // 3D pooling comparison.
+  const int depths[] = {9, 16, 64, 128};
+  const int heights[] = {9, 11, 128, 256};
+  const int widths[] = {9, 32, 128};
+  for (int d : depths) {
+    for (int h : heights) {
+      for (int w : widths) {
+        VLOG(3) << "depth=" << d << " height=" << h << " width=" << w;
+        testMatrixPool3D(d, h, w);
+      }
+    }
+  }
+}
 #endif
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -186,6 +186,7 @@ void Argument::resizeAndCopyFrom(const Argument& src,
  resizeAndCopy(strs, src.strs, useGpu, stream);
  frameWidth = src.frameWidth;
  frameHeight = src.frameHeight;
+  frameDepth = src.frameDepth;
 }
 int32_t Argument::resizeAndCopyFrom(const Argument& src,
@@ -206,6 +207,7 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src,
  dataId = src.dataId;
  frameWidth = src.frameWidth;
  frameHeight = src.frameHeight;
+  frameDepth = src.frameDepth;
  if (!src.sequenceStartPositions) {
    // non-sequence input, copy samples directly

--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -35,6 +32,7 @@ struct Argument {
        strs(nullptr),
        frameHeight(0),
        frameWidth(0),
+        frameDepth(0),
        sequenceStartPositions(nullptr),
        subSequenceStartPositions(nullptr),
        cpuSequenceDims(nullptr),
@@ -64,6 +62,7 @@ struct Argument {
    allCount = argument.allCount;
    frameHeight = argument.frameHeight;
    frameWidth = argument.frameWidth;
+    frameDepth = argument.frameDepth;
    dataId = argument.dataId;
  }
@@ -76,6 +75,7 @@ struct Argument {
  // A dataBatch includes batchSize frames, one frame maybe not only vector
  size_t frameHeight;
  size_t frameWidth;
+  size_t frameDepth;
  // If NULL, each position is treated independently.
  // Otherwise, its size should be #NumberOfSequences + 1.
@@ -136,8 +136,10 @@ struct Argument {
  }
  size_t getFrameHeight() const { return frameHeight; }
  size_t getFrameWidth() const { return frameWidth; }
+  size_t getFrameDepth() const { return frameDepth; }  // returns the stored frameDepth
  void setFrameHeight(size_t h) { frameHeight = h; }
  void setFrameWidth(size_t w) { frameWidth = w; }
+  void setFrameDepth(size_t d) { frameDepth = d; }  // overwrites frameDepth with d
  int64_t getNumSequences() const {
    return sequenceStartPositions ? sequenceStartPositions->getSize() - 1

--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -82,6 +82,12 @@ message ConvConfig {
  // if not set, use img_size
  optional uint32 img_size_y = 14;
+  optional uint32 filter_size_z = 15 [ default = 1 ];  // z-axis fields; each defaults to 1 when unset
+  optional uint32 padding_z = 16 [ default = 1 ];  // NOTE(review): padding defaults to 1, not 0 - confirm this is intended
+  optional uint32 stride_z = 17 [ default = 1 ];
+  optional uint32 output_z = 18 [ default = 1 ];
+  optional uint32 img_size_z = 19 [ default = 1 ];
 }
 message PoolConfig {
@@ -124,6 +130,12 @@ message PoolConfig {
  // if not set, use padding
  optional uint32 padding_y = 13;
+  optional uint32 size_z = 14 [ default = 1 ];  // z-axis fields; each defaults to 1 when unset
+  optional uint32 stride_z = 15 [ default = 1 ];
+  optional uint32 output_z = 16 [ default = 1 ];
+  optional uint32 img_size_z = 17 [ default = 1 ];
+  optional uint32 padding_z = 18 [ default = 1 ];  // NOTE(review): padding defaults to 1, not 0 - confirm this is intended
 }
 message SppConfig {