Merge branch 'develop' into build_ios

d1f5f498 · Liu Yiqun · fb93a8be · c1feb27f · d1f5f498 · d1f5f498
73 changed file
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -26,6 +26,7 @@ IF(NOT ${CBLAS_FOUND})
        CACHE FILEPATH "openblas library." FORCE)

    SET(OPENBLAS_CC "${CMAKE_C_COMPILER}")
+
    IF(CMAKE_CROSSCOMPILING)
        SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
        GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY)
@@ -52,11 +53,14 @@ IF(NOT ${CBLAS_FOUND})
            ENDIF()
        ELSEIF(RPI)
            # use hardfp
-            SET(OPENBLAS_COMMIT "v0.2.19")
+            SET(OPENBLAS_COMMIT "v0.2.20")
            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0)
        ENDIF()
    ELSE()
-        SET(OPENBLAS_COMMIT "v0.2.19")
+        IF(APPLE)
+            SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
+        ENDIF()
+        SET(OPENBLAS_COMMIT "v0.2.20")
        SET(OPTIONAL_ARGS "")
        IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
            SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)

--- a/doc/design/functions_operators_layers.md
+++ b/doc/design/functions_operators_layers.md
+# Design Doc: Functions, Operators, and Layers
+
+In a DL system, we can compose one or more fine grained operators into a coarse grained one.  For example, the FC layer can be composed of a multiplication operator and an add operator.
+
+Historically, some fine grained operations are known as operators, and some coarse level ones are known as layers.  But we need a well-defined separation.
+
+In general, operators are those very fine grained operations, e.g., mul and add. In the implementation, we can write them as C++ functions:
+
+```c++
+template <typename T> T add(T x, T y) { return x + y; }
+template <typename T> T mul(T x, T y) { return x * y; }
+```
+
+Then we can wrap them into operators which are C++ classes and can be created from Python bindings by name.  A C macro can do this. For example, the following macro invocation
+
+```c++
+#define MAKE_FUNCTION_OPERATOR(mul);
+```
+
+generates
+
+```c++
+template <typename T> class mulOp : public OperatorBase {...};
+REGISTER_OP(mulOp<float32>, "mul");
+```
+
+so that in Python we can create operator mul by:
+
+```python
+X1 = Var()
+X2 = Var()
+Y = Var()
+paddle.cpp.create_operator("mul", input=[X1, X2], output=Y)
+```
+
+Also, at the same time, we can compose a coarse level C++ operator class by composing functions `mul` and `add`:
+
+```c++
+template <typename T>
+class FCOp : public OperatorBase {
+ public:
+  void Run(...) {
+    add(mul(Input<T>("X"), Input<T>("W")), Input<T>("b");
+  }
+};
+REGISTER_OP(FCOp, "fc");
+```
+
+We need to support such composition in Python as well.  To do so, we need a higher level Python wrapping of operator creation than `paddle.cpp.create_operator`.  This higher level operator API should be compatible with the layer API.
+
+Let's explain using an example.  Suppose that we are going to compose the FC using mul and add in Python, we'd like to have Python functions `mul` and `add` defined in module `operator`:
+
+```python
+def operator.mul(X1, X2):
+    O = Var()
+    paddle.cpp.create_operator("mul", input={X1, Y1], output=O)
+    return O
+
+def operator.add(X1, X2):
+    O = Var()
+    paddle.cpp.create_operator("add", input={X1, X2], output=O)
+    return O
+```
+
+Above code snippets are automatically generated.  Given them, users can define
+
+```python
+def layer.fc(X):
+    W = Var()
+    b = Var()
+    return operator.add(operator.mul(X, W), b)
+```
+
+If we don't have `operator.mul` and `operator.add`, the definiton of `layer.fc` would be complicated:
+
+```python
+def layer.fc(X):
+    W = Var()
+    b = Var()
+    O1 = Var()
+    paddle.cpp.create_operator("mul", input=[X, W], output=O1)
+    O2 = Var()
+    paddle.cpp.create_operator("add", input=[O1, b], output=O2)
+    return O2
+```
+
+We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`.  So we have the following concepts in above illustrative example:
+
+```
+| C++ functions/functors | mul          | add          |             |          |
+| C++ operator class     | mulOp        | addOp        | FCOp        |          |
+| Python binding         | operator.mul | operator.add | operator.fc |          |
+| Python function        |              |              |             | layer.fc |
+```
+
+This is how we differentiate layer and operators in PaddlePaddle:
+
+- those defined in C++ and have a lightweighted Python wrapper in module `operators` are operators; whereas
+- those who don't have C++ implementations but a Python implementation that compose C++ operators are known as layers.
--- a/doc/design/if_else_op.md
+++ b/doc/design/if_else_op.md
+IfOp should have only one branch. An IfOp operator takes a `cond` variable whose value must be a vector of N boolean elements. Its return value has M (M<=N) instances, each corresponds to a true element in `cond`.
+
+```python
+import paddle as pd
+
+x = var()
+y = var()
+cond = var()
+
+b = pd.create_ifop(inputs=[x], output_num=1)
+with b.true_block():
+    x = b.inputs(0)
+    z = operator.add(x, y)
+    b.set_output(0, operator.softmax(z))
+
+out = b(cond)
+```
+
+If we want the output still has N instances, we can use IfElseOp with a default value, whose minibatch size must be N:
+
+```python
+import paddle as pd
+
+x = var()
+y = var()
+cond = var()
+default_value = var()
+b = pd.create_ifelseop(inputs=[x], output_num=1)
+with b.true_block():
+    x = b.inputs(0)
+    z = operator.add(x, y)
+    b.set_output(0, operator.softmax(z))
+
+with b.false_block():
+    x = b.inputs(0)
+    z = layer.fc(x)
+    b.set_output(0, operator.softmax(z))
+
+out = b(cond)
+```
+
+If only true_block is set in an IfElseOp, we can have a default value for false as:
+```python
+import paddle as pd
+
+x = var()
+y = var()
+cond = var()
+default_value = var()
+b = pd.create_ifelseop(inputs=[x], output_num=1, default_value)
+
+with b.true_block():
+    x = b.inputs(0)
+    z = operator.add(x, y)
+    b.set_output(0, operator.softmax(z))
+
+out = b(cond)
+```
+where default_value is a list of vars for `cond` == False.
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -178,13 +178,13 @@ class MulKernel : public framework::OpKernel {

 ```c++
 namespace ops = paddle::operators;
-REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpGrad);
 REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(mul_grad,
              ops::MulGradKernel<paddle::platform::CPUPlace, float>);
 ```
    
-  - `REGISTER_OP` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulOpGrad`，类型名为`mul_grad`，
+  - `REGISTER_OP` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，并且注册`ops::MulOpGrad`为其反向Op。
  - `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op。
  - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulKernel`类。


--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -173,6 +173,96 @@ extern void hl_avgpool_backward(const int frameCnt,
                                real* backGrad,
                                const int outStride);

+extern void hl_maxpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 real* maxPoolIdxData,
+                                 const int tgtStride);
+
+extern void hl_maxpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* targetGrad,
+                                  real* maxPoolIdxData,
+                                  const int outStride);
+
+extern void hl_avgpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride);
+
+extern void hl_avgpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  int paddingD,
+                                  int paddingH,
+                                  int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* backGrad,
+                                  const int outStride);
+
 /**
 * @brief   Bilinear interpolation forward.
 *
@@ -275,4 +365,4 @@ extern void hl_maxout_backward(real* inGrad,
                               size_t featLen,
                               size_t groups);

-#endif /* HL_CNN_H_ */
+#endif  // HL_CNN_H_
--- a/paddle/cuda/include/hl_matrix.h
+++ b/paddle/cuda/include/hl_matrix.h
@@ -224,4 +224,80 @@ extern void hl_matrix_collect_shared_bias(real* B_d,
 extern void hl_matrix_rotate(
    real* mat, real* matRot, int dimM, int dimN, bool clockWise);

+/**
+ * @brief  Matrix vol2Col: Convert 3D volume into col matrix
+ *
+ * @param[in]   matSrc     input matrix.
+ * @param[in]   channel    channel of matSrc.
+ * @param[in]   depth      depth of matSrc.
+ * @param[in]   height     height of matSrc.
+ * @param[in]   width      width of matSrc.
+ * @param[in]   filterD    depth of filter.
+ * @param[in]   filterH    height of filter.
+ * @param[in]   filterW    width of filter.
+ * @param[in]   strideD    stride in the depth.
+ * @param[in]   strideH    stride in the height.
+ * @param[in]   strideW    stride in the width.
+ * @param[in]   paddingD   padding in the depth.
+ * @param[in]   paddingH   padding in the height.
+ * @param[in]   paddingW   padding in the width.
+ * @param[out]   dataDst     output matrix.
+ *
+ */
+extern void hl_matrix_vol2Col(const real* dataSrc,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              real* dataDst);
+
+/**
+ * @brief  Matrix col2Vol: Convert col matrix into 3D volume
+ *
+ * @param[out]  matDst     output matrix.
+ * @param[in]   channel    channel of matDst.
+ * @param[in]   depth      depth of matDst.
+ * @param[in]   height     height of matDst.
+ * @param[in]   width      width of matDst.
+ * @param[in]   filterD    depth of filter.
+ * @param[in]   filterH    height of filter.
+ * @param[in]   filterW    width of filter.
+ * @param[in]   strideD    stride in the depth.
+ * @param[in]   strideH    stride in the height.
+ * @param[in]   strideW    stride in the width.
+ * @param[in]   paddingD   padding in the depth.
+ * @param[in]   paddingH   padding in the height.
+ * @param[in]   paddingW   padding in the width.
+ * @param[in]   matSrc     input matrix.
+ * @param[in]   beta       input
+ * @param[in]   alpha      input
+ *
+ */
+extern void hl_matrix_col2Vol(real* dataDst,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              const real* dataSrc,
+                              real alpha,
+                              real beta);
+
 #endif /* HL_MATRIX_H_ */
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -87,6 +87,96 @@ inline void hl_avgpool_backward(const int frameCnt,
                                real* backGrad,
                                const int outStride) {}

+inline void hl_maxpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 real* maxPoolIdxData,
+                                 const int tgtStride) {}
+
+inline void hl_maxpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* targetGrad,
+                                  real* maxPoolIdxData,
+                                  const int outStride) {}
+
+inline void hl_avgpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride) {}
+
+inline void hl_avgpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* backGrad,
+                                  const int outStride) {}
+
 inline void hl_bilinear_forward(const real* inData,
                                const size_t inImgH,
                                const size_t inImgW,

--- a/paddle/cuda/include/stub/hl_matrix_stub.h
+++ b/paddle/cuda/include/stub/hl_matrix_stub.h
@@ -99,4 +99,38 @@ inline void hl_matrix_collect_shared_bias(real* B_d,
 inline void hl_matrix_rotate(
    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {}

+inline void hl_matrix_vol2Col(const real* dataSrc,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              real* dataDst) {}
+
+inline void hl_matrix_col2Vol(real* dataDst,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              const real* dataSrc,
+                              real alpha,
+                              real beta) {}
+
 #endif  // HL_MATRIX_STUB_H_
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@@ -592,3 +592,204 @@ void hl_matrix_rotate(
      mat, matRot, dimM, dimN, clockWise);
  CHECK_SYNC("hl_matrix_rotate failed");
 }
+
+__global__ void keMatrixVol2Col(int num_kernels,
+                                const real* dataSrc,
+                                real* dataDst,
+                                int depth,
+                                int height,
+                                int width,
+                                int filterD,
+                                int filterH,
+                                int filterW,
+                                int strideD,
+                                int strideH,
+                                int strideW,
+                                int paddingD,
+                                int paddingH,
+                                int paddingW,
+                                int depth_col,
+                                int height_col,
+                                int width_col) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
+       index += blockDim.x * gridDim.x) {
+    int w_out = index % width_col;
+    int h_out = (index / width_col) % height_col;
+    int d_out = (index / width_col / height_col) % depth_col;
+    int channel_in = index / width_col / height_col / depth_col;
+    int channel_out = channel_in * filterD * filterH * filterW;
+    int w_in = w_out * strideW - paddingW;
+    int h_in = h_out * strideH - paddingH;
+    int d_in = d_out * strideD - paddingD;
+
+    dataDst +=
+        ((channel_out * depth_col + d_out) * height_col + h_out) * width_col +
+        w_out;
+    dataSrc += ((channel_in * depth + d_in) * height + h_in) * width + w_in;
+    for (int k = 0; k < filterD; ++k) {
+      for (int i = 0; i < filterH; ++i) {
+        for (int j = 0; j < filterW; ++j) {
+          int d = d_in + k;
+          int h = h_in + i;
+          int w = w_in + j;
+          *dataDst = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 &&
+                      w < width)
+                         ? dataSrc[(k * height + i) * width + j]
+                         : 0;
+          dataDst += depth_col * height_col * width_col;
+        }
+      }
+    }
+  }
+}
+
+void hl_matrix_vol2Col(const real* dataSrc,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW,
+                       real* dataDst) {
+  int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
+  int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
+  int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
+  int num_kernels = channels * depth_col * height_col * width_col;
+
+  const int threads = 512;
+  const int blocks = DIVUP(num_kernels, threads);
+
+  keMatrixVol2Col<<<blocks, threads, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                          dataSrc,
+                                                          dataDst,
+                                                          depth,
+                                                          height,
+                                                          width,
+                                                          filterD,
+                                                          filterH,
+                                                          filterW,
+                                                          strideD,
+                                                          strideH,
+                                                          strideW,
+                                                          paddingD,
+                                                          paddingH,
+                                                          paddingW,
+                                                          depth_col,
+                                                          height_col,
+                                                          width_col);
+  CHECK_SYNC("hl_matrix_vol2Col failed");
+}
+
+__global__ void keMatrixCol2Vol(int num_kernels,
+                                real* dataDst,
+                                const real* dataSrc,
+                                int depth,
+                                int height,
+                                int width,
+                                int filterD,
+                                int filterH,
+                                int filterW,
+                                int strideD,
+                                int strideH,
+                                int strideW,
+                                int paddingD,
+                                int paddingH,
+                                int paddingW,
+                                int depth_col,
+                                int height_col,
+                                int width_col,
+                                real alpha,
+                                real beta) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
+       index += blockDim.x * gridDim.x) {
+    real srcVal = 0;
+    real dstVal = dataDst[index];
+    int w = index % width + paddingW;
+    int h = (index / width) % height + paddingH;
+    int d = (index / width / height) % depth + paddingD;
+    int c = index / width / height / depth;
+    // compute the start and end of the output
+    int w_col_start = (w < filterW) ? 0 : (w - filterW) / strideW + 1;
+    int w_col_end = min(w / strideW + 1, width_col);
+    int h_col_start = (h < filterH) ? 0 : (h - filterH) / strideH + 1;
+    int h_col_end = min(h / strideH + 1, height_col);
+    int d_col_start = (d < filterD) ? 0 : (d - filterD) / strideD + 1;
+    int d_col_end = min(d / strideD + 1, depth_col);
+
+    int offset = (c * filterD * filterW * filterH + d * filterW * filterH +
+                  h * filterW + w) *
+                 depth_col * height_col * width_col;
+
+    int coeff_d_col =
+        (1 - strideD * filterW * filterH * depth_col) * height_col * width_col;
+    int coeff_h_col =
+        (1 - strideH * filterW * depth_col * height_col) * width_col;
+    int coeff_w_col = (1 - strideW * depth_col * height_col * width_col);
+
+    for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          srcVal += dataSrc[offset + d_col * coeff_d_col + h_col * coeff_h_col +
+                            w_col * coeff_w_col];
+        }
+      }
+    }
+    dataDst[index] = alpha * srcVal + beta * dstVal;
+  }
+}
+
+void hl_matrix_col2Vol(real* dataDst,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW,
+                       const real* dataSrc,
+                       real alpha,
+                       real beta) {
+  int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
+  int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
+  int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
+  int num_kernels = channels * depth * height * width;
+
+  const int threads = 512;
+  const int blocks = DIVUP(num_kernels, threads);
+
+  keMatrixCol2Vol<<<blocks, threads, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                          dataDst,
+                                                          dataSrc,
+                                                          depth,
+                                                          height,
+                                                          width,
+                                                          filterD,
+                                                          filterH,
+                                                          filterW,
+                                                          strideD,
+                                                          strideH,
+                                                          strideW,
+                                                          paddingD,
+                                                          paddingH,
+                                                          paddingW,
+                                                          depth_col,
+                                                          height_col,
+                                                          width_col,
+                                                          alpha,
+                                                          beta);
+
+  CHECK_SYNC("hl_matrix_col2Vol failed");
+}
--- a/paddle/framework/backward.md
+++ b/paddle/framework/backward.md
@@ -18,7 +18,7 @@ A backward network is built up with several backward operators. Backward operato
 For example, we have got a `mul_op`, and we can register it's information and corresponding backward operator by the following macro:

 ```cpp
-REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
+REGISTER_OP(mul, MulOp, MulOpMaker, MulOpGrad);
 ```

 `mul` is the operator's type. `MulOp` and `MulOpMaker` are the operator class and the operator maker class respectively.

--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -127,8 +127,8 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker {
 public:
  FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("x", "x");
-    AddOutput("out", "out");
+    AddInput("Src", "x");
+    AddOutput("Dst", "out");
    AddComment("");
  }
 };
@@ -138,7 +138,7 @@ class AddOpMaker : public OpProtoAndCheckerMaker {
  AddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "x").AsDuplicable();
-    AddOutput("Y", "y");
+    AddOutput("Out", "out");
    AddComment("");
  }
 };
@@ -148,16 +148,14 @@ class AddOpMaker : public OpProtoAndCheckerMaker {
 namespace f = paddle::framework;
 namespace ops = paddle::operators;
 using EnforceNotMet = paddle::platform::EnforceNotMet;
-REGISTER_OP(rowwise_add, f::NOP, f::RowWiseAddOpMaker, rowwise_add_grad,
-            f::NOP);
-REGISTER_OP(mul, f::NOP, f::MulOpMaker, mul_grad, f::NOP);
-REGISTER_OP(sigmoid, f::NOP, f::SigmoidOpMaker, sigmoid_grad, f::NOP);
+REGISTER_OP(rowwise_add, f::NOP, f::RowWiseAddOpMaker, f::NOP);
+REGISTER_OP(mul, f::NOP, f::MulOpMaker, f::NOP);
+REGISTER_OP(sigmoid, f::NOP, f::SigmoidOpMaker, f::NOP);
 REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NOP, f::NoGradOpMaker);
 REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NOP, f::FillZeroOpMaker);
-REGISTER_OP(add, f::NOP, f::AddOpMaker, add_grad, f::NOP);
+REGISTER_OP(add, f::NOP, f::AddOpMaker, f::NOP);
 REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker);
-REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, many_output_op_grad,
-            f::NOP);
+REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, f::NOP);

 TEST(Backward, simple_op_grad) {
  auto fwd = f::OpRegistry::CreateOp(

--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
@@ -54,8 +54,8 @@ TEST(GradOpBuilder, AddTwo) {
  EXPECT_EQ(grad_add_op->Output(f::GradVarName("Y")), f::GradVarName("y"));
 }

-REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker, mult_io_grad, f::NOP);
-REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker, io_ignored_grad, f::NOP);
+REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker, f::NOP);
+REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker, f::NOP);

 TEST(GradOpBuilder, MutiInOut) {
  std::shared_ptr<f::OperatorBase> test_op(f::OpRegistry::CreateOp(

--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -19,25 +19,24 @@
 namespace paddle {
 namespace framework {

-LODTensor::LOD LODTensor::LOD::SliceLevels(size_t level_begin,
-                                           size_t level_end) const {
+LOD SliceLevels(const LOD& in, size_t level_begin, size_t level_end) {
  LOD new_lod;
  new_lod.reserve(level_end - level_begin);
  for (size_t i = level_begin; i < level_end; i++) {
-    new_lod.emplace_back(at(i));
+    new_lod.emplace_back(in.at(i));
  }
  return new_lod;
 }

-LODTensor::LOD LODTensor::LOD::SliceInLevel(size_t level, size_t elem_begin,
-                                            size_t elem_end) const {
+LOD SliceInLevel(const LOD& in, size_t level, size_t elem_begin,
+                 size_t elem_end) {
  // slice the lod.
  LOD new_lod;
-  new_lod.reserve(size() - level);
-  auto start = this->at(level)[elem_begin];
-  auto end = this->at(level)[elem_end];
+  new_lod.reserve(in.size() - level);
+  auto start = in.at(level)[elem_begin];
+  auto end = in.at(level)[elem_end];

-  for (auto it = this->begin() + level; it != this->end(); it++) {
+  for (auto it = in.begin() + level; it != in.end(); it++) {
    auto it_begin = std::find(it->begin(), it->end(), start);
    auto it_end = std::find(it_begin, it->end(), end);
    PADDLE_ENFORCE(it_begin != it->end(), "error in parsing lod info");
@@ -49,11 +48,11 @@ LODTensor::LOD LODTensor::LOD::SliceInLevel(size_t level, size_t elem_begin,
                   [start](int v) { return v - start; });
    PADDLE_ENFORCE_EQ(new_lod.back().front(), 0, "error in slice LOD");
  }
-  PADDLE_ENFORCE_LE(new_lod.size(), this->size());
+  PADDLE_ENFORCE_LE(new_lod.size(), in.size());
  return new_lod;
 }

-bool operator==(const LODTensor::LOD& a, const LODTensor::LOD& b) {
+bool operator==(const LOD& a, const LOD& b) {
  if (a.size() != b.size()) {
    return false;
  }
@@ -70,9 +69,27 @@ bool operator==(const LODTensor::LOD& a, const LODTensor::LOD& b) {
      }
    }
  }
-
  return true;
 }

+void LODTensor::SliceLevels(size_t level_begin, size_t level_end) {
+  auto new_lod = framework::SliceLevels(lod_, level_begin, level_end);
+  lod_ = new_lod;
+}
+
+void LODTensor::SliceInLevel(size_t level, size_t elem_begin, size_t elem_end) {
+  PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
+                 NumLevels());
+  PADDLE_ENFORCE(elem_begin < NumElements(level),
+                 "element begin [%d] out of range [%d]", elem_begin,
+                 NumElements(level));
+  PADDLE_ENFORCE(elem_end < NumElements(level) + 1,
+                 "element end [%d] out of range [%d]", elem_end,
+                 NumElements(level));
+
+  auto new_lod = framework::SliceInLevel(lod_, level, elem_begin, elem_end);
+  lod_ = new_lod;
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -15,7 +15,7 @@
 #pragma once

 #include <memory>
-#if !defined(PADDLE_ONLY_CPU)
+#ifndef PADDLE_ONLY_CPU
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #endif
@@ -27,33 +27,39 @@
 namespace paddle {
 namespace framework {

+#ifdef PADDLE_ONLY_CPU
+template <typename T>
+using Vector = std::vector<T>;
+#else
+template <typename T>
+using Vector = thrust::host_vector<T>;
+#endif
+
+using LOD = std::vector<Vector<size_t>>;
+
+LOD SliceLevels(const LOD& in, size_t level_begin, size_t level_end);
+
+LOD SliceInLevel(const LOD& in, size_t level, size_t elem_begin,
+                 size_t elem_end);
+
+bool operator==(const LOD& a, const LOD& b);
+
 /*
 * LODTensor (Level of details Tensor)
 * see https://en.wikipedia.org/wiki/Level_of_details for reference.
 */
-class LODTensor : public Tensor {
+class LODTensor {
 public:
-// Level save offsets of each unit.
-#ifdef PADDLE_ONLY_CPU
-  template <typename T>
-  using Vector = std::vector<T>;
-#else
-  template <typename T>
-  using Vector = thrust::host_vector<T>;
-#endif
-  // LoD stores offsets of each level of units, the largest units level first,
-  // then the smaller units level. Each Level stores the offsets of units in
-  // Tesor.
-  class LOD : public std::vector<Vector<size_t>> {
-   public:
-    LOD SliceLevels(size_t level_begin, size_t level_end) const;
-    LOD SliceInLevel(size_t level, size_t elem_begin, size_t elem_end) const;
-  };
-
  LODTensor() {}
-  explicit LODTensor(const LOD &lod) : lod_(lod) {}
+  LODTensor(const LOD& lod, Tensor* t) : lod_(lod), tensor_(t) {}
+
+  void set_lod(const LOD& lod) { lod_ = lod; }

-  virtual Tensor *Clone() const { return new LODTensor(lod_); }
+  void set_tensor(Tensor* tensor) { tensor_ = tensor; }
+
+  Tensor& tensor() { return *tensor_; }
+
+  LOD lod() { return lod_; }

  /*
   * Get a element from LOD.
@@ -79,71 +85,23 @@ class LODTensor : public Tensor {
    PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
                   NumLevels());
    // the last offset is the end of last element
-    return lod_[level].size() - 1;
+    return (lod_)[level].size() - 1;
  }

  /*
-   * Slice of levels[level_begin:level_end], with tensor shared.
+   * Slice of levels[level_begin:level_end]
   */
-  template <typename T>
-  LODTensor SliceLevels(size_t level_begin, size_t level_end) const;
+  void SliceLevels(size_t level_begin, size_t level_end);

  /*
-   * Slice of elements of a level, [elem_begin: elem_end], with tensor shared.
+   * Slice of elements of a level, [elem_begin: elem_end]
   * @note: low performance in slice lod_.
   */
-  template <typename T>
-  LODTensor SliceInLevel(size_t level, size_t elem_begin,
-                         size_t elem_end) const;
-
-  /*
-   * Copy other's lod_'s content, free to mutate.
-   */
-  void CopyLOD(const LODTensor &other) { lod_ = other.lod_; }
-  /*
-   * Determine whether LODTensor has a valid LOD info.
-   */
-  const LOD &lod() const { return lod_; }
-  LOD *mutable_lod() { return &lod_; }
-
-  virtual ~LODTensor() {}
+  void SliceInLevel(size_t level, size_t elem_begin, size_t elem_end);

 private:
  LOD lod_;
+  Tensor* tensor_;  // not owned
 };
-
-bool operator==(const LODTensor::LOD &a, const LODTensor::LOD &b);
-
-template <typename T>
-LODTensor LODTensor::SliceLevels(size_t level_begin, size_t level_end) const {
-  auto new_lod = lod_.SliceLevels(level_begin, level_end);
-  // slice levels just need to update LOD info, each level will contains the
-  // whole tensor_, so no need to modify tensor_.
-  LODTensor new_tensor(new_lod);
-  new_tensor.ShareDataWith<T>(*this);
-  return new_tensor;
-}
-
-template <typename T>
-LODTensor LODTensor::SliceInLevel(size_t level, size_t elem_begin,
-                                  size_t elem_end) const {
-  PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
-                 NumLevels());
-  PADDLE_ENFORCE(elem_begin < NumElements(level),
-                 "element begin [%d] out of range [%d]", elem_begin,
-                 NumElements(level));
-  PADDLE_ENFORCE(elem_end < NumElements(level) + 1,
-                 "element end [%d] out of range [%d]", elem_end,
-                 NumElements(level));
-
-  auto new_lod = lod_.SliceInLevel(level, elem_begin, elem_end);
-
-  // slice elements just need to update LOD info, because offsets are not
-  // changed, so the original tensor_ can be reused.
-  LODTensor new_tensor(new_lod);
-  new_tensor.ShareDataWith<T>(*this);
-  return new_tensor;
-}
-
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/lod_tensor.md
+++ b/paddle/framework/lod_tensor.md
+# Design Doc: LoD (Level-of-Detail) Tensor
+
+PaddlePaddle's RNN doesn't require that all instances have the same length.  To do so, we introduce an extension to Tensor, namely, LoD Tensor.
+
+## Challenge of Variable-length Inputs
+
+People usually represent a mini-batch by a Tensor.  For example, a mini-batch of 32 images, each of size 32x32, is a 10x32x32 Tensor.  So a transformation, T, of all images can be a matrix multiplication of the 32x32xO-dimensional tensor T and the 10x32x32 Tensor.
+
+Another example is that each mini-batch contains 32 sentences, where each word is a D-dimensional one-hot vector.  If all sentences have the same length L, we can represent this mini-batch by a 32xLxD tensor.  However, in most cases, sentences have variable lengths, and we will need an index data structure to record these variable lengths.
+
+## LoD as a Solution
+
+### Mini-Batch of variable-length sentenses
+
+Let's imagine a mini-batch of 3 variable lengths sentences, containing 3, 1, and 2 words respectively.  We can represent it by a (3+1+2)xD tensor plus some index information:
+
+```
+   3
+3   1 2
+||| | ||
+```
+
+Each `|` represents a D-dimensional word vectors.  The number 3 on top indicate 3 sentences, and numbers 3, 1, and 2 on the second level represent the number of words in each sentence.
+
+### Mini-Batch of variable-length videos
+
+This approach generalizes to the case where elements are not words, but higher dimensional objects, like images.  Suppose that a mini-batch contains videos of the same frame size 640x480.  If a mini-batch contains 3 videos of 3, 1, and 2 frames respectively.  The underlying tensor is of size (3+1+2)x640x480.  The index information illustrates as:
+
+```
+     3
+3     1  2
+口口口 口 口口
+```
+
+where each `口` represents an image.
+
+### Mini-Batch of fixed-size images
+
+Let's get back to a typical example, image classification, where each mini-batch has M fixed-sized images.  The LoD Tensor representation is
+
+```
+     M
+1 1 1 1     1
+口口口口 ... 口
+```
+
+The many 1's on the second level seem duplicated.  For this particular case of 2 levels and the second level always have length 1, we can ignore the LoD index.
+
+### Design and summarization
+
+In summary, as long as that the essential elements (words  or images) have the same size, we can represent mini-batches by a LoD Tensor:
+
+- The underlying tensor has size LxD1xD2x..., where D1xD2... is the size of the essential elements, and
+- the first dimension size L has an additon property -- a LoD index as a nested vector:
+
+  ```c++
+  typedef std::vector<std::vector> > LoD;
+  ```
+
+- The LoD index can is not necessary when there are only two levels and all elements of the second level have length 1.
+
+## Slicing of LoD Tensor
+
+Consider that we have a network with three levels of RNN: the top level one handles articles, the second level one handles sentences, and the basic level one handles words.  This network requires that mini-batches represented by 4 level LoD Tensor, for example,
+
+```
+         3
+3           1  2
+3   2  4    1  2  3
+||| || |||| |  || |||
+```
+
+To allow each level of RNN to handle its input, we define **the slicing of a LoD Tensor is defined as getting the j-th sequence on level i, or the <i,j>-slice**
+
+For example, the <2,1>-slice of above slice is
+
+```
+2
+||
+```
+
+and the <1,2>-slice of above example is
+
+```
+2
+2  3
+|| |||
+```
+
+Let's go on slicing this slice.  Its <1,1>-slice is
+
+```
+3
+|||
+```
+
+### The General Slicing Algorithm
+
+The algorithm, with over-simplified data structure, is defined as
+
+```c++
+typedef vector<vector<int> > LoD;
+
+struct LoDTensor {
+  LoD lod_;
+  float* tensor_;
+};
+
+LoDTensor Slice(const LoDTensor& lodt, int level, int sequence) {
+
+}
+```
+
+### Slicing the Top Level
+
+Please be aware that an RNN operator only slices the top level of a LoD Tensor to get the step inputs.
+
+```c++
+LoDTensor Slice(const LoDTensor& lodt, int sequence) {
+
+}
+```
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -24,13 +24,12 @@ namespace framework {
 class LODTensorTester : public ::testing::Test {
 public:
  virtual void SetUp() override {
-    lod_tensor.reset(new LODTensor);
    // tensor's batch_size: 30
    // 3 levels
    // 0 10 20
    // 0 5 10 15 20
    // 0 2 5 7 10 12 15 20
-    LODTensor::LOD lod;
+    LOD lod;
    lod.push_back(std::vector<size_t>{0, 10, 20});
    lod.push_back(std::vector<size_t>{0, 5, 10, 15, 20});
    lod.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20});
@@ -41,75 +40,65 @@ class LODTensorTester : public ::testing::Test {
    // malloc memory
    tensor.mutable_data<float>(place);

-    lod_tensor.reset(new LODTensor(lod));
-    lod_tensor->Resize({20 /*batch size*/, 128 /*dim*/});
-
-    lod_tensor->ShareDataWith<float>(tensor);
-    // lod_tensor->ShareDataWith<Tensor>(tensor);
+    lod_tensor.set_lod(lod);
+    lod_tensor.set_tensor(&tensor);
  }

 protected:
-  std::unique_ptr<LODTensor> lod_tensor;
  platform::CPUPlace place;
  Tensor tensor;
+  LODTensor lod_tensor;
 };

-TEST_F(LODTensorTester, NumLevels) { ASSERT_EQ(lod_tensor->NumLevels(), 3UL); }
+TEST_F(LODTensorTester, NumLevels) { ASSERT_EQ(lod_tensor.NumLevels(), 3UL); }

 TEST_F(LODTensorTester, NumElements) {
-  ASSERT_EQ(lod_tensor->NumElements(0), 2UL);
-  ASSERT_EQ(lod_tensor->NumElements(1), 4UL);
-  ASSERT_EQ(lod_tensor->NumElements(2), 8UL);
+  ASSERT_EQ(lod_tensor.NumElements(0), 2UL);
+  ASSERT_EQ(lod_tensor.NumElements(1), 4UL);
+  ASSERT_EQ(lod_tensor.NumElements(2), 8UL);
 }

 TEST_F(LODTensorTester, SliceLevels) {
  // slice 1 level
  for (size_t level = 0; level < 3UL; ++level) {
-    auto new_lod_tensor = lod_tensor->SliceLevels<float>(level, level + 1);
+    LODTensor new_lod_tensor = lod_tensor;
+    new_lod_tensor.SliceLevels(level, level + 1);
    ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
-    ASSERT_EQ(new_lod_tensor.NumElements(0UL), lod_tensor->NumElements(level));
-    // ASSERT_EQ(new_lod_tensor, *lod_tensor);
+    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor.NumElements(level));
+    ASSERT_EQ(new_lod_tensor.tensor().data<float>(),
+              lod_tensor.tensor().data<float>());
  }
  // slice 2 level
  for (size_t level = 0; level < 2UL; ++level) {
-    auto new_lod_tensor = lod_tensor->SliceLevels<float>(level, level + 2);
+    LODTensor new_lod_tensor = lod_tensor;
+    new_lod_tensor.SliceLevels(level, level + 2);
    ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
-    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor->NumElements(level));
-    ASSERT_EQ(new_lod_tensor.NumElements(1),
-              lod_tensor->NumElements(level + 1));
-    ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor->data<float>());
+    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor.NumElements(level));
+    ASSERT_EQ(new_lod_tensor.NumElements(1), lod_tensor.NumElements(level + 1));
+    ASSERT_EQ(new_lod_tensor.tensor().data<float>(),
+              lod_tensor.tensor().data<float>());
  }
 }

 TEST_F(LODTensorTester, SliceInLevel) {
  size_t level = 0;
-  auto new_lod_tensor = lod_tensor->SliceInLevel<float>(level, 0, 2);
+  LODTensor new_lod_tensor = lod_tensor;
+  new_lod_tensor.SliceInLevel(level, 0, 2);
  EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL);
  EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL);
  EXPECT_EQ(new_lod_tensor.NumElements(1), 4UL);
  EXPECT_EQ(new_lod_tensor.NumElements(2), 8UL);
-  ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor->data<float>());
+  ASSERT_EQ(new_lod_tensor.tensor().data<float>(),
+            lod_tensor.tensor().data<float>());

  level = 1;
-  new_lod_tensor = lod_tensor->SliceInLevel<float>(level, 0, 2);
+  new_lod_tensor = lod_tensor;
+  new_lod_tensor.SliceInLevel(level, 0, 2);
  ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
-  ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor->data<float>());
-}
-
-TEST_F(LODTensorTester, ShareLOD) {
-  LODTensor new_lod_tensor;
-  new_lod_tensor.CopyLOD(*lod_tensor);
-  ASSERT_EQ(new_lod_tensor.lod(), lod_tensor->lod());
-}
-
-TEST_F(LODTensorTester, CopyLOD) {
-  LODTensor new_lod_tensor;
-  new_lod_tensor.CopyLOD(*lod_tensor);
-  bool equals = std::equal(lod_tensor->lod().begin(), lod_tensor->lod().end(),
-                           new_lod_tensor.lod().begin());
-  ASSERT_TRUE(equals);
+  ASSERT_EQ(new_lod_tensor.tensor().data<float>(),
+            lod_tensor.tensor().data<float>());
 }

 }  // namespace framework

--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
@@ -80,9 +80,19 @@ class OpInfoMap {
  }

  const OpInfo& Get(const std::string& type) const {
+    auto op_info_ptr = GetNullable(type);
+    PADDLE_ENFORCE_NOT_NULL(op_info_ptr, "Operator %s has not been registered",
+                            type);
+    return *op_info_ptr;
+  }
+
+  const OpInfo* GetNullable(const std::string& type) const {
    auto it = map_.find(type);
-    PADDLE_ENFORCE(it != map_.end(), "Operator %s are not found", type);
-    return it->second;
+    if (it == map_.end()) {
+      return nullptr;
+    } else {
+      return &it->second;
+    }
  }

  template <typename Callback>

--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -33,8 +33,7 @@ namespace framework {
 class OpRegistry {
 public:
  template <typename OpType, typename ProtoMakerType, typename GradOpType>
-  static void RegisterOp(const std::string& op_type,
-                         const std::string& grad_op_type) {
+  static void RegisterOp(const std::string& op_type) {
    PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type),
                   "'%s' is registered more than once.", op_type);
    OpInfo op_info;
@@ -43,9 +42,9 @@ class OpRegistry {
        const VariableNameMap& outputs, const AttributeMap& attrs) {
      return new OpType(type, inputs, outputs, attrs);
    };
-    op_info.grad_op_type_ = grad_op_type;
    if (std::type_index(typeid(ProtoMakerType)) !=
        std::type_index(typeid(NOPMaker))) {
+      op_info.grad_op_type_ = op_type + "_grad";
      op_info.proto_ = new OpProto;
      op_info.checker_ = new OpAttrChecker;
      auto maker = ProtoMakerType(op_info.proto_, op_info.checker_);
@@ -55,15 +54,14 @@ class OpRegistry {
          op_info.proto_->IsInitialized(),
          "Fail to initialize %s's OpProto, because %s is not initialized",
          op_type, op_info.proto_->InitializationErrorString());
+      // register gradient op
+      RegisterOp<GradOpType, NOPMaker, NOP>(op_info.grad_op_type_);
    } else {
+      op_info.grad_op_type_ = "";
      op_info.proto_ = nullptr;
      op_info.checker_ = nullptr;
    }
    OpInfoMap::Instance().Insert(op_type, op_info);
-    // register gradient op
-    if (!grad_op_type.empty()) {
-      RegisterOp<GradOpType, NOPMaker, NOP>(grad_op_type, "");
-    }
  }

  static std::unique_ptr<OperatorBase> CreateOp(const std::string& type,
@@ -92,10 +90,8 @@ class Registrar {
 template <typename OpType, typename ProtoMakerType, typename GradOpType>
 class OpRegistrar : public Registrar {
 public:
-  explicit OpRegistrar(const char* op_type) { OpRegistrar(op_type, ""); }
-  OpRegistrar(const char* op_type, const char* grad_op_type) {
-    OpRegistry::RegisterOp<OpType, ProtoMakerType, GradOpType>(op_type,
-                                                               grad_op_type);
+  explicit OpRegistrar(const char* op_type) {
+    OpRegistry::RegisterOp<OpType, ProtoMakerType, GradOpType>(op_type);
  }
 };

@@ -121,8 +117,7 @@ class OpKernelRegistrar : public Registrar {
 /**
 * Macro to register Operator.
 */
-#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type,          \
-                    grad_op_class)                                            \
+#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_class)         \
  STATIC_ASSERT_GLOBAL_NAMESPACE(                                             \
      __reg_op__##op_type, "REGISTER_OP must be called in global namespace"); \
  class _OpClass_##op_type##_ : public op_class {                             \
@@ -137,14 +132,14 @@ class OpKernelRegistrar : public Registrar {
  };                                                                          \
  static ::paddle::framework::OpRegistrar<                                    \
      _OpClass_##op_type##_, op_maker_class, _OpGradClass_##op_type##_>       \
-      __op_registrar_##op_type##__(#op_type, #grad_op_type);                  \
+      __op_registrar_##op_type##__(#op_type);                                 \
  int TouchOpRegistrar_##op_type() {                                          \
    __op_registrar_##op_type##__.Touch();                                     \
    return 0;                                                                 \
  }

 #define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
-  REGISTER_OP(op_type, op_class, op_maker_class, , ::paddle::framework::NOP)
+  REGISTER_OP(op_type, op_class, op_maker_class, ::paddle::framework::NOP)

 /**
 * Macro to register OperatorKernel.

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -33,12 +33,12 @@ ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
 }
 #endif

-const std::string& OperatorBase::Input(const std::string& name) const {
+std::string OperatorBase::Input(const std::string& name) const {
  auto& ins = Inputs(name);
-  PADDLE_ENFORCE_EQ(ins.size(), 1UL,
+  PADDLE_ENFORCE_LE(ins.size(), 1UL,
                    "Op %s input %s should contain only one variable", type_,
                    name);
-  return ins[0];
+  return ins.empty() ? kEmptyVarName : ins[0];
 }

 const std::vector<std::string>& OperatorBase::Inputs(
@@ -49,12 +49,12 @@ const std::vector<std::string>& OperatorBase::Inputs(
  return it->second;
 }

-const std::string& OperatorBase::Output(const std::string& name) const {
+std::string OperatorBase::Output(const std::string& name) const {
  auto& outs = Outputs(name);
-  PADDLE_ENFORCE_EQ(outs.size(), 1UL,
+  PADDLE_ENFORCE_LE(outs.size(), 1UL,
                    "Op %s output %s should contain only one variable", type_,
                    name);
-  return outs[0];
+  return outs.empty() ? kEmptyVarName : outs[0];
 }

 const std::vector<std::string>& OperatorBase::Outputs(
@@ -119,16 +119,8 @@ OperatorBase::OperatorBase(const std::string& type,
                           const VariableNameMap& outputs,
                           const AttributeMap& attrs)
    : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) {
-  static std::atomic<size_t> gUniqId(0UL);
-  for (auto& output : outputs_) {
-    for (auto& output_name : output.second) {
-      if (output_name == kTempVarName) {
-        output_name += type_;
-        output_name += "@";
-        output_name += std::to_string(gUniqId.fetch_add(1));
-      }
-    }
-  }
+  GenerateTemporaryNames();
+  CheckAllInputOutputSet();
 }

 std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
@@ -156,6 +148,35 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
  return ret_val;
 }

+void OperatorBase::CheckAllInputOutputSet() const {
+  auto& info_map = OpInfoMap::Instance();
+  auto* op_info = info_map.GetNullable(Type());
+  if (op_info == nullptr || op_info->proto_ == nullptr) return;
+
+  for (auto& in : op_info->Proto().inputs()) {
+    PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(),
+                   "Type %s's input %s is not set", Type(), in.name());
+  }
+
+  for (auto& out : op_info->Proto().outputs()) {
+    PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(),
+                   "Type %s's output %s is not set", Type(), out.name());
+  }
+}
+
+void OperatorBase::GenerateTemporaryNames() {
+  static std::atomic<size_t> gUniqId(0UL);
+  for (auto& output : outputs_) {
+    for (auto& output_name : output.second) {
+      if (output_name == kTempVarName) {
+        output_name += type_;
+        output_name += "@";
+        output_name += std::to_string(gUniqId.fetch_add(1));
+      }
+    }
+  }
+}
+
 void OpProtoAndCheckerMaker::Validate() {
  validated_ = true;
  CheckNoDuplicatedInOutAttrs();

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -95,12 +95,12 @@ class OperatorBase {
  const VariableNameMap& Inputs() const { return inputs_; }
  const VariableNameMap& Outputs() const { return outputs_; }
  //! Get a input with argument's name described in `op_proto`
-  const std::string& Input(const std::string& name) const;
+  std::string Input(const std::string& name) const;
  //! Get a input which has multiple variables.
  const std::vector<std::string>& Inputs(const std::string& name) const;

  //! Get a output with argument's name described in `op_proto`
-  const std::string& Output(const std::string& name) const;
+  std::string Output(const std::string& name) const;
  //! Get an output which has multiple variables.
  //! TODO add a vector_view to prevent memory copy.
  const std::vector<std::string>& Outputs(const std::string& name) const;
@@ -127,6 +127,10 @@ class OperatorBase {
  // IG (Inputs Gradients)
  VariableNameMap outputs_;
  AttributeMap attrs_;
+
+ private:
+  void GenerateTemporaryNames();
+  void CheckAllInputOutputSet() const;
 };

 // Macro for define a clone method.
@@ -238,11 +242,13 @@ class InferShapeContext {
  }

  const Variable* InputVar(const std::string& name) const {
-    return scope_.FindVar(op_.Input(name));
+    auto ipt = op_.Input(name);
+    return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
  }

  Variable* OutputVar(const std::string& name) const {
-    return scope_.FindVar(op_.Output(name));
+    auto opt = op_.Output(name);
+    return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt);
  }

  const std::vector<const Variable*> MultiInputVar(
@@ -250,9 +256,11 @@ class InferShapeContext {
    auto names = op_.Inputs(name);
    std::vector<const Variable*> res;
    res.reserve(names.size());
-    std::transform(
-        names.begin(), names.end(), std::back_inserter(res),
-        [this](const std::string& name) { return scope_.FindVar(name); });
+    std::transform(names.begin(), names.end(), std::back_inserter(res),
+                   [this](const std::string& name) {
+                     return name == kEmptyVarName ? nullptr
+                                                  : scope_.FindVar(name);
+                   });
    return res;
  }

@@ -260,24 +268,24 @@ class InferShapeContext {
    auto names = op_.Outputs(name);
    std::vector<const Variable*> res;
    res.reserve(names.size());
-    std::transform(
-        names.begin(), names.end(), std::back_inserter(res),
-        [this](const std::string& name) { return scope_.FindVar(name); });
+    std::transform(names.begin(), names.end(), std::back_inserter(res),
+                   [this](const std::string& name) {
+                     return name == kEmptyVarName ? nullptr
+                                                  : scope_.FindVar(name);
+                   });
    return res;
  }

  template <typename T>
  const T* Input(const std::string& name) const {
    auto* var = InputVar(name);
-    PADDLE_ENFORCE_NOT_NULL(var, "Input(%s) should not be nullptr", name);
-    return &var->Get<T>();
+    return var == nullptr ? nullptr : &var->Get<T>();
  }

  template <typename T>
  T* Output(const std::string& name) const {
    auto var = OutputVar(name);
-    PADDLE_ENFORCE_NOT_NULL(var, "Output(%s) should not be nullptr", name);
-    return var->GetMutable<T>();
+    return var == nullptr ? nullptr : var->GetMutable<T>();
  }

  template <typename T>
@@ -288,10 +296,7 @@ class InferShapeContext {
    std::transform(names.begin(), names.end(), std::back_inserter(res),
                   [&](const std::string& sub_name) {
                     auto var = scope_.FindVar(sub_name);
-                     PADDLE_ENFORCE_NOT_NULL(
-                         var, "MultiInput(%s:%s) should not be nullptr", name,
-                         sub_name);
-                     return &var->Get<T>();
+                     return var == nullptr ? nullptr : &var->Get<T>();
                   });
    return res;
  }
@@ -304,10 +309,7 @@ class InferShapeContext {
    std::transform(names.begin(), names.end(), std::back_inserter(res),
                   [&](const std::string& sub_name) {
                     auto var = scope_.FindVar(sub_name);
-                     PADDLE_ENFORCE_NOT_NULL(
-                         var, "MultiOutput(%s:%s) should not be nullptr.", name,
-                         sub_name);
-                     return var->GetMutable<T>();
+                     return var == nullptr ? nullptr : var->GetMutable<T>();
                   });
    return res;
  }

--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -117,6 +117,8 @@ inline void Tensor::CopyFrom(const Tensor& src,
    memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
                 boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
  }
+  PADDLE_ENFORCE(cudaStreamSynchronize(0),
+                 "cudaStreamSynchronize failed in Tensor CopyFrom");

 #endif
 }

--- a/paddle/gserver/layers/Conv3DLayer.cpp
+++ b/paddle/gserver/layers/Conv3DLayer.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Conv3DLayer.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(conv3d, Conv3DLayer);
+
+bool Conv3DLayer::init(const LayerMap &layerMap,
+                       const ParameterMap &parameterMap) {
+  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
+  int index = 0;
+  for (auto &inputConfig : config_.inputs()) {
+    const ConvConfig &conf = inputConfig.conv_conf();
+    M_.push_back(numFilters_ / conf.groups());
+    K_.push_back(filterPixels_[index] * filterChannels_[index]);
+
+    // create a new weight
+    size_t height, width;
+    width = filterPixels_[index] * filterChannels_[index];
+    height = numFilters_;
+    CHECK_EQ(parameters_[index]->getSize(), width * height);
+    Weight *w = new Weight(height, width, parameters_[index]);
+    weights_.emplace_back(w);
+    ++index;
+  }
+  if (biasParameter_.get()) {
+    if (sharedBiases_) {
+      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(1, numFilters_, biasParameter_));
+    } else {
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+    }
+  }
+  return true;
+}
+
+size_t Conv3DLayer::getSize() {
+  CHECK_NE(inputLayers_.size(), 0UL);
+  outputH_.clear();
+  outputW_.clear();
+  outputD_.clear();
+  N_.clear();
+  size_t layerSize = 0;
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    outputW_.push_back(outputSize(
+        imgSizeW_[i], filterSize_[i], padding_[i], stride_[i], true));
+    outputH_.push_back(outputSize(
+        imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
+    outputD_.push_back(outputSize(
+        imgSizeD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
+
+    N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
+    CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize);
+    layerSize += N_[i] * numFilters_;
+  }
+  getOutput().setFrameHeight(outputH_[0]);
+  getOutput().setFrameWidth(outputW_[0]);
+  getOutput().setFrameDepth(outputD_[0]);
+  return layerSize;
+}
+
+void Conv3DLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  int outWidth = getSize();
+  resetOutput(batchSize, outWidth);
+
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    REGISTER_TIMER_INFO("FwdConv3D", getName().c_str());
+    const MatrixPtr &inMat = getInputValue(i);
+    const MatrixPtr &outMat = getOutputValue();
+    int M = M_[i];
+    int N = N_[i];
+    int K = K_[i];
+    Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+    MatrixPtr wMat = weights_[i]->getW();
+    for (int n = 0; n < batchSize; ++n) {
+      colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
+                       channels_[i],
+                       imgSizeD_[i],
+                       imgSizeH_[i],
+                       imgSizeW_[i],
+                       filterSizeZ_[i],
+                       filterSizeY_[i],
+                       filterSize_[i],
+                       strideZ_[i],
+                       strideY_[i],
+                       stride_[i],
+                       paddingZ_[i],
+                       paddingY_[i],
+                       padding_[i]);
+
+      real *outData = outMat->getData() + n * outMat->getStride();
+      MatrixPtr outMatSub =
+          Matrix::create(outData, groups_[i] * M, N, false, useGpu_);
+      for (int g = 0; g < groups_[i]; g++) {
+        MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
+        MatrixPtr in = colBuf_->subMatrix(g * K, K);
+        MatrixPtr out = outMatSub->subMatrix(g * M, M);
+        out->mul(*wMatSub, *in, 1.0, 1.0);
+      }
+    }
+  }
+  if (nullptr != this->biasParameter_) {
+    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
+    this->addBias();
+  }
+  forwardActivation();
+}
+
+void Conv3DLayer::backward(const UpdateCallback &callback) {
+  backwardActivation();
+
+  if (biases_ && biases_->getWGrad()) {
+    bpropBiases();
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    REGISTER_TIMER_INFO("BwdConv3D", getName().c_str());
+    if (weights_[i]->getWGrad()) {
+      bpropWeights(i);
+    }
+    if (getInputGrad(i)) {
+      bpropData(i);
+    }
+    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+    weights_[i]->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+void Conv3DLayer::bpropWeights(int i) {
+  int M = M_[i];
+  int N = N_[i];
+  int K = K_[i];
+  const MatrixPtr &inMat = getInputValue(i);
+  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+  MatrixPtr wGradMat = weights_[i]->getWGrad();
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  for (int n = 0; n < batchSize; ++n) {
+    colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
+                     channels_[i],
+                     imgSizeD_[i],
+                     imgSizeH_[i],
+                     imgSizeW_[i],
+                     filterSizeZ_[i],
+                     filterSizeY_[i],
+                     filterSize_[i],
+                     strideZ_[i],
+                     strideY_[i],
+                     stride_[i],
+                     paddingZ_[i],
+                     paddingY_[i],
+                     padding_[i]);
+
+    real *outGradData =
+        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
+    MatrixPtr outGradSub =
+        Matrix::create(outGradData, groups_[i] * M, N, false, useGpu_);
+    for (int g = 0; g < groups_[i]; ++g) {
+      MatrixPtr inMatSub = colBuf_->subMatrix(g * K, K);
+      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
+      MatrixPtr wGradSub = wGradMat->subMatrix(g * M, M);
+      wGradSub->mul(*outG, *(inMatSub->getTranspose()), 1.0, 1.0);
+    }
+  }
+}
+
+void Conv3DLayer::bpropData(int i) {
+  int M = M_[i];
+  int N = N_[i];
+  int K = K_[i];
+  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+  MatrixPtr wMat = weights_[i]->getW();
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  for (int n = 0; n < batchSize; ++n) {
+    real *outGradData =
+        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
+    real *preGradData =
+        getInputGrad(i)->getData() + n * getInputGrad(i)->getStride();
+    MatrixPtr outGradSub =
+        Matrix::create(outGradData, M * groups_[i], N, false, useGpu_);
+    for (int g = 0; g < groups_[i]; ++g) {
+      MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
+      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
+      MatrixPtr inGradMatSub = colBuf_->subMatrix(g * K, K);
+      inGradMatSub->mul(*(wMatSub->getTranspose()), *outG, 1.0, 0.0);
+    }
+    colBuf_->col2Vol(preGradData,
+                     channels_[i],
+                     imgSizeD_[i],
+                     imgSizeH_[i],
+                     imgSizeW_[i],
+                     filterSizeZ_[i],
+                     filterSizeY_[i],
+                     filterSize_[i],
+                     strideZ_[i],
+                     strideY_[i],
+                     stride_[i],
+                     paddingZ_[i],
+                     paddingY_[i],
+                     padding_[i],
+                     1.0,
+                     1.0);
+  }
+}
+
+void Conv3DLayer::bpropBiases() {
+  MatrixPtr outGradMat = getOutputGrad();
+  if (this->sharedBiases_) {
+    biases_->getWGrad()->collectSharedBias(*outGradMat, 1.0f);
+  } else {
+    biases_->getWGrad()->collectBias(*outGradMat, 1.0f);
+  }
+}
+
+void Conv3DLayer::addBias() {
+  MatrixPtr outMat = getOutputValue();
+  if (this->sharedBiases_) {
+    outMat->addSharedBias(*(biases_->getW()), 1.0f);
+  } else {
+    outMat->addBias(*(biases_->getW()), 1.0f);
+  }
+}
+
+}  // namespace paddle
--- a/paddle/gserver/layers/Conv3DLayer.h
+++ b/paddle/gserver/layers/Conv3DLayer.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "ConvBaseLayer.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of convolution layer.
+ * This layer expands input and use matrix multiplication to
+ * calculate convolution operation.
+ */
+class Conv3DLayer : public ConvBaseLayer {
+public:
+  explicit Conv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
+  ~Conv3DLayer() {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+  void addBias();
+  void backward(const UpdateCallback& callback);
+  void bpropBiases();
+  void bpropData(int i);
+  void bpropWeights(int i);
+  size_t getSize();
+
+protected:
+  // Figure out the dimensions for individual gemms.
+  IntV M_;  /// numFilters_ / filter_group_;
+  IntV N_;  /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_
+  IntV K_;  /// outputD_ * outputH_ * outputW_
+  MatrixPtr colBuf_;
+};
+
+}  // namespace paddle
--- a/paddle/gserver/layers/ConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ConvBaseLayer.cpp
@@ -38,7 +38,6 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
    strideY_.push_back(conf.stride_y());
    dilationY_.push_back(conf.dilation_y());
    filterSizeY_.push_back(conf.filter_size_y());
-    filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
    channels_.push_back(conf.channels());
    imgSizeH_.push_back(conf.has_img_size_y() ? conf.img_size_y()
                                              : conf.img_size());
@@ -47,31 +46,20 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
    filterChannels_.push_back(conf.filter_channels());
    outputH_.push_back(conf.has_output_y() ? conf.output_y() : conf.output_x());
    outputW_.push_back(conf.output_x());
+
+    paddingZ_.push_back(conf.padding_z());
+    strideZ_.push_back(conf.stride_z());
+    filterSizeZ_.push_back(conf.filter_size_z());
+    imgSizeD_.push_back(conf.img_size_z());
+    outputD_.push_back(conf.output_z());
+    filterPixels_.push_back(filterSize_.back() * filterSizeY_.back() *
+                            filterSizeZ_.back());
  }

  CHECK(inputLayers_.size() == parameters_.size());
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    size_t height, width;
-    height = filterPixels_[i] * filterChannels_[i];
-    width = (!isDeconv_) ? numFilters_ : channels_[i];
-
-    // create a new weight
-    CHECK_EQ(parameters_[i]->getSize(), width * height);
-    Weight* w = new Weight(height, width, parameters_[i]);
-    weights_.emplace_back(w);
-  }

-  /* initialize the biases_ */
-  if (biasParameter_.get()) {
-    if (sharedBiases_) {
-      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
-    } else {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
-    }
-  }
+  // create new weights_ in derived class
+  // create new biases_ in derived class

  // default caffe model
  caffeMode_ = true;

--- a/paddle/gserver/layers/ConvBaseLayer.h
+++ b/paddle/gserver/layers/ConvBaseLayer.h
@@ -62,6 +62,13 @@ protected:
  IntV outputH_;
  /// The spatial dimensions of output feature map width.
  IntV outputW_;
+
+  IntV outputD_;
+  IntV imgSizeD_;
+  IntV filterSizeZ_;
+  IntV strideZ_;
+  IntV paddingZ_;
+
  /// Group size, refer to grouped convolution in
  /// Alex Krizhevsky's paper: when group=2, the first half of the
  /// filters are only connected to the first half of the input channels,

--- a/paddle/gserver/layers/CrossEntropyOverBeam.cpp
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.cpp
@@ -223,7 +223,7 @@ void CrossEntropyOverBeam::checkInputs() {
                                << inputLayers_[i * 3]->getName()
                                << " should be a nested sequence";
      CHECK_EQ(getInputValue(i * 3 + 1)->getWidth(), beamSize_);
-      CHECK_EQ(scores.getNumSequences(), batchSize_);
+      CHECK_EQ(batchSize_, static_cast<size_t>(scores.getNumSequences()));
      CHECK_EQ(scores.getNumSubSequences(), selCandidates.getBatchSize());
    } else {
      CHECK(scores.hasSeq()) << "input " << i << " "
@@ -231,10 +231,10 @@ void CrossEntropyOverBeam::checkInputs() {
                             << " should be a sequence";
      batchSize_ = scores.getNumSequences();
      beamSize_ = getInputValue(i * 3 + 1)->getWidth();
-      CHECK_EQ(batchSize_, selCandidates.getBatchSize());
+      CHECK_EQ(batchSize_, static_cast<size_t>(selCandidates.getBatchSize()));
    }
    CHECK_EQ(1U, scores.value->getWidth());
-    CHECK_EQ(batchSize_, goldSeq.getBatchSize());
+    CHECK_EQ(batchSize_, static_cast<size_t>(goldSeq.getBatchSize()));
  }
 }

@@ -377,8 +377,8 @@ void CrossEntropyOverBeam::forward(PassType passType) {

  MatrixPtr outputValue = getOutputValue();
  for (size_t i = 0; i < batchSize_; ++i) {
-    beamCosts_[i].setData(
-        std::move(std::make_shared<BeamExpansion>(beamPerSeq_[i])), beamSize_);
+    BeamExpansionPtr ptr = std::make_shared<BeamExpansion>(beamPerSeq_[i]);
+    beamCosts_[i].setData(std::move(ptr), beamSize_);
    outputValue->getData()[i] = beamCosts_[i].forward();
  }
 }

--- a/paddle/gserver/layers/CudnnConvBaseLayer.cpp
+++ b/paddle/gserver/layers/CudnnConvBaseLayer.cpp
@@ -46,8 +46,26 @@ bool CudnnConvBaseLayer::init(const LayerMap &layerMap,
    projConf_.emplace_back(conf);
    projections_.emplace_back(
        Projection::create(*projConf_[i], parameters_[i], useGpu_));
+
+    // create a new weight
+    size_t height, width;
+    height = filterPixels_[i] * filterChannels_[i];
+    width = (!isDeconv_) ? numFilters_ : channels_[i];
+    CHECK_EQ(parameters_[i]->getSize(), width * height);
+    Weight *w = new Weight(height, width, parameters_[i]);
+    weights_.emplace_back(w);
  }

+  if (biasParameter_.get()) {
+    if (sharedBiases_) {
+      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
+    } else {
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
+    }
+  }
  if (biases_.get() && sharedBiases_) {
    hl_create_tensor_descriptor(&biasDesc_);
    hl_create_tensor_descriptor(&outputDesc_);

--- a/paddle/gserver/layers/DeConv3DLayer.cpp
+++ b/paddle/gserver/layers/DeConv3DLayer.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DeConv3DLayer.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(deconv3d, DeConv3DLayer);
+
+bool DeConv3DLayer::init(const LayerMap &layerMap,
+                         const ParameterMap &parameterMap) {
+  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
+  // for Deconv, the dimension of Kernel is
+  // channel * output * depth * height * weigth
+  // Matrix storage format: (output * depth * height * weigth) x  channel
+  for (int index = 0; index < config_.inputs().size(); ++index) {
+    M_.push_back(filterChannels_[index]);
+    K_.push_back(filterPixels_[index] * (numFilters_ / groups_[index]));
+
+    // create a new weight
+    size_t height, width;
+    height = filterPixels_[index] * numFilters_;
+    width = filterChannels_[index];
+    CHECK_EQ(parameters_[index]->getSize(), width * height);
+    Weight *w = new Weight(height, width, parameters_[index]);
+    weights_.emplace_back(w);
+  }
+  if (biasParameter_.get()) {
+    if (sharedBiases_) {
+      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(1, numFilters_, biasParameter_));
+    } else {
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+    }
+  }
+  return true;
+}
+
+size_t DeConv3DLayer::getSize() {
+  CHECK_NE(inputLayers_.size(), 0UL);
+  outputH_.clear();
+  outputW_.clear();
+  outputD_.clear();
+  N_.clear();
+  NOut_.clear();
+  size_t layerSize = 0;
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    outputW_.push_back(
+        imageSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i], true));
+    outputH_.push_back(imageSize(
+        imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
+    outputD_.push_back(imageSize(
+        imgSizeD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
+    NOut_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
+    N_.push_back(imgSizeD_[i] * imgSizeH_[i] * imgSizeW_[i]);
+    CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize);
+    layerSize += NOut_[i] * numFilters_;
+  }
+  getOutput().setFrameHeight(outputH_[0]);
+  getOutput().setFrameWidth(outputW_[0]);
+  getOutput().setFrameDepth(outputD_[0]);
+  return layerSize;
+}
+
+void DeConv3DLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  int outWidth = getSize();
+  resetOutput(batchSize, outWidth);
+  const MatrixPtr outMat = getOutputValue();
+
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str());
+    const MatrixPtr &inMat = getInputValue(i);
+    int M = M_[i];
+    int N = N_[i];
+    int K = K_[i];
+    MatrixPtr wMat = weights_[i]->getW();
+    Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+    for (int n = 0; n < batchSize; ++n) {
+      real *inData = inMat->getData() + n * inMat->getStride();
+      for (int g = 0; g < groups_[i]; ++g) {
+        MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_);
+        MatrixPtr wMatSub = wMat->subMatrix(g * K, K);
+        MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K);
+        colBufDataSub->mul(*wMatSub, *inMatSub, 1.0, 0.0);
+        inData += M * N;
+      }
+      colBuf_->col2Vol(outMat->getData() + n * outMat->getStride(),
+                       numFilters_,
+                       outputD_[i],
+                       outputH_[i],
+                       outputW_[i],
+                       filterSizeZ_[i],
+                       filterSizeY_[i],
+                       filterSize_[i],
+                       strideZ_[i],
+                       strideY_[i],
+                       stride_[i],
+                       paddingZ_[i],
+                       paddingY_[i],
+                       padding_[i],
+                       1.0,
+                       1.0);
+    }
+  }
+  if (nullptr != this->biasParameter_) {
+    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
+    this->addBias();
+  }
+  forwardActivation();
+}
+
+void DeConv3DLayer::backward(const UpdateCallback &callback) {
+  backwardActivation();
+  int batchSize = getOutputGrad()->getHeight();
+  if (biases_ && biases_->getWGrad()) {
+    bpropBiases();
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    if (weights_[i]->getWGrad() || this->needGradient_) {
+      int M = M_[i];
+      int N = N_[i];
+      int K = K_[i];
+      REGISTER_TIMER_INFO("BwdDeConv3D", getName().c_str());
+      Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+      const MatrixPtr &inMat = getInputValue(i);
+      for (int n = 0; n < batchSize; ++n) {
+        colBuf_->vol2Col(
+            getOutputGrad()->getData() + n * getOutputGrad()->getStride(),
+            numFilters_,
+            outputD_[i],
+            outputH_[i],
+            outputW_[i],
+            filterSizeZ_[i],
+            filterSizeY_[i],
+            filterSize_[i],
+            strideZ_[i],
+            strideY_[i],
+            stride_[i],
+            paddingZ_[i],
+            paddingY_[i],
+            padding_[i]);
+        if (weights_[i]->getWGrad()) {
+          real *inData = inMat->getData() + n * inMat->getStride();
+          for (int g = 0; g < groups_[i]; ++g) {
+            MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K);
+            MatrixPtr wGradMatSub =
+                weights_[i]->getWGrad()->subMatrix(g * K, K);
+            MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_);
+            wGradMatSub->mul(
+                *colBufDataSub, *(inMatSub->getTranspose()), 1.0, 1.0);
+            inData += M * N;
+          }
+        }
+        if (getInputGrad(i)) {
+          real *preGrad =
+              getInputGrad(i)->getData() + n * getInputGrad(i)->getStride();
+          for (int g = 0; g < groups_[i]; ++g) {
+            MatrixPtr w = weights_[i]->getW()->subMatrix(g * K, K);
+            MatrixPtr outGradMat = colBuf_->subMatrix(g * K, K);
+            MatrixPtr inGradMatSub =
+                Matrix::create(preGrad, M, N, false, useGpu_);
+            inGradMatSub->mul(*(w->getTranspose()), *outGradMat, 1.0, 1.0);
+            preGrad += M * N;
+          }
+        }
+      }
+      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+      weights_[i]->getParameterPtr()->incUpdate(callback);
+    }
+  }
+}
+void DeConv3DLayer::bpropWeights(int i) {}
+void DeConv3DLayer::bpropData(int i) {}
+
+void DeConv3DLayer::bpropBiases() {
+  const MatrixPtr &outGradMat = getOutputGrad();
+
+  if (this->sharedBiases_) {
+    biases_->getWGrad()->collectSharedBias(*outGradMat, 1.0f);
+  } else {
+    biases_->getWGrad()->collectBias(*outGradMat, 1.0f);
+  }
+}
+
+void DeConv3DLayer::addBias() {
+  MatrixPtr outMat = getOutputValue();
+  if (this->sharedBiases_) {
+    outMat->addSharedBias(*(biases_->getW()), 1.0f);
+  } else {
+    outMat->addBias(*(biases_->getW()), 1.0f);
+  }
+}
+
+}  // namespace paddle
--- a/paddle/gserver/layers/DeConv3DLayer.h
+++ b/paddle/gserver/layers/DeConv3DLayer.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "ConvBaseLayer.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of deconvolution3D layer.
+ * This layer expands input and use matrix multiplication to
+ * calculate deconvolution3D operation.
+ */
+class DeConv3DLayer : public ConvBaseLayer {
+public:
+  explicit DeConv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
+  ~DeConv3DLayer() {}
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+  void addBias();
+  void backward(const UpdateCallback& callback);
+  void bpropBiases();
+  void bpropData(int i);
+  void bpropWeights(int i);
+  size_t getSize();
+
+protected:
+  // Figure out the dimensions for individual gemms.
+  IntV M_;  /// numFilters_ / filter_group_;
+  IntV N_;  /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_
+  IntV K_;  /// outputD_ * outputH_ * outputW_
+  IntV NOut_;
+  MatrixPtr colBuf_;
+};
+
+}  // namespace paddle
--- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
@@ -22,12 +22,31 @@ bool ExpandConvBaseLayer::init(const LayerMap &layerMap,
  /* Initialize the basic convolutional parent class */
  ConvBaseLayer::init(layerMap, parameterMap);

+  int index = 0;
  for (auto &inputConfig : config_.inputs()) {
    const ConvConfig &conf = inputConfig.conv_conf();
    /* Consistent caffe mode for multiple input */
    caffeMode_ = conf.caffe_mode();
-  }

+    // create a new weight
+    size_t height, width;
+    height = filterPixels_[index] * filterChannels_[index];
+    width = (!isDeconv_) ? numFilters_ : channels_[index];
+    CHECK_EQ(parameters_[index]->getSize(), width * height);
+    Weight *w = new Weight(height, width, parameters_[index]);
+    weights_.emplace_back(w);
+    index++;
+  }
+  if (biasParameter_.get()) {
+    if (sharedBiases_) {
+      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
+    } else {
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
+    }
+  }
  getOutputSize();

  return true;

--- a/paddle/gserver/layers/Pool3DLayer.cpp
+++ b/paddle/gserver/layers/Pool3DLayer.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Pool3DLayer.h"
+#include "PoolProjectionLayer.h"
+#include "paddle/utils/Logging.h"
+
+namespace paddle {
+
+REGISTER_LAYER(pool3d, Pool3DLayer);
+
+bool Pool3DLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  /* the size of inputs for pool-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1);
+
+  const PoolConfig& conf = config_.inputs(0).pool_conf();
+  poolType_ = conf.pool_type();
+  channels_ = conf.channels();
+
+  sizeX_ = conf.size_x();
+  sizeY_ = conf.size_y();
+  sizeZ_ = conf.size_z();
+
+  strideW_ = conf.stride();
+  strideH_ = conf.stride_y();
+  strideD_ = conf.stride_z();
+
+  imgSizeW_ = conf.img_size();
+  imgSizeH_ = conf.img_size_y();
+  imgSizeD_ = conf.img_size_z();
+
+  paddingW_ = conf.padding();
+  paddingH_ = conf.padding_y();
+  paddingD_ = conf.padding_z();
+
+  outputW_ = conf.output_x();
+  outputH_ = conf.output_y();
+  outputD_ = conf.output_z();
+
+  return true;
+}
+
+size_t Pool3DLayer::getSize() {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+
+  size_t layerSize = 0;
+  outputD_ = outputSize(imgSizeD_, sizeZ_, paddingD_, strideD_, false);
+  outputH_ = outputSize(imgSizeH_, sizeY_, paddingH_, strideH_, false);
+  outputW_ = outputSize(imgSizeW_, sizeX_, paddingW_, strideW_, false);
+
+  layerSize = outputD_ * outputH_ * outputW_ * channels_;
+  getOutput().setFrameHeight(outputH_);
+  getOutput().setFrameWidth(outputW_);
+  getOutput().setFrameDepth(outputD_);
+  return layerSize;
+}
+
+void Pool3DLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
+  size_t batchSize = inMat->getHeight();
+  size_t outWidth = getSize();
+  resetOutput(batchSize, outWidth);
+  Matrix::resizeOrCreate(maxPoolIdx_, batchSize, outWidth, false, useGpu_);
+  const MatrixPtr outMat = getOutputValue();
+
+  if (poolType_ == "avg") {
+    outMat->avgPool3DForward(*inMat,
+                             channels_,
+                             imgSizeD_,
+                             imgSizeH_,
+                             imgSizeW_,
+                             outputD_,
+                             outputH_,
+                             outputW_,
+                             sizeZ_,
+                             sizeY_,
+                             sizeX_,
+                             strideD_,
+                             strideH_,
+                             strideW_,
+                             paddingD_,
+                             paddingH_,
+                             paddingW_);
+  } else if (poolType_ == "max") {
+    outMat->maxPool3DForward(*inMat,
+                             *maxPoolIdx_,
+                             channels_,
+                             imgSizeD_,
+                             imgSizeH_,
+                             imgSizeW_,
+                             outputD_,
+                             outputH_,
+                             outputW_,
+                             sizeZ_,
+                             sizeY_,
+                             sizeX_,
+                             strideD_,
+                             strideH_,
+                             strideW_,
+                             paddingD_,
+                             paddingH_,
+                             paddingW_);
+  } else {
+    LOG(FATAL) << "Unknown pool type: " << poolType_;
+  }
+  forwardActivation();
+}
+
+void Pool3DLayer::backward(const UpdateCallback& callback) {
+  backwardActivation();
+
+  (void)callback;
+  if (NULL == getInputGrad(0)) return;
+  MatrixPtr inMat = inputLayers_[0]->getOutputValue();
+  MatrixPtr inGradMat = inputLayers_[0]->getOutputGrad();
+  MatrixPtr outMat = getOutputValue();
+  MatrixPtr outGradMat = getOutputGrad();
+
+  if (poolType_ == "avg") {
+    inGradMat->avgPool3DBackward(*outGradMat,
+                                 imgSizeD_,
+                                 imgSizeH_,
+                                 imgSizeW_,
+                                 outputD_,
+                                 outputH_,
+                                 outputW_,
+                                 sizeZ_,
+                                 sizeY_,
+                                 sizeZ_,
+                                 strideD_,
+                                 strideH_,
+                                 strideW_,
+                                 paddingD_,
+                                 paddingH_,
+                                 paddingW_,
+                                 1.0,
+                                 1.0);
+  } else if (poolType_ == "max") {
+    inGradMat->maxPool3DBackward(*outGradMat,
+                                 *maxPoolIdx_,
+                                 imgSizeD_,
+                                 imgSizeH_,
+                                 imgSizeW_,
+                                 outputD_,
+                                 outputH_,
+                                 outputW_,
+                                 sizeZ_,
+                                 sizeY_,
+                                 sizeZ_,
+                                 strideD_,
+                                 strideH_,
+                                 strideW_,
+                                 paddingD_,
+                                 paddingH_,
+                                 paddingW_,
+                                 1.0,
+                                 1.0);
+  } else {
+    LOG(FATAL) << "Unknown pool type: " << poolType_;
+  }
+}
+
+}  // namespace paddle
--- a/paddle/gserver/layers/Pool3DLayer.h
+++ b/paddle/gserver/layers/Pool3DLayer.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Layer.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief Basic parent layer of pooling
+ * Pools the input within regions
+ */
+class Pool3DLayer : public Layer {
+public:
+  explicit Pool3DLayer(const LayerConfig& config) : Layer(config) {}
+  ~Pool3DLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+  size_t getSize();
+
+protected:
+  int channels_;
+  int sizeX_, sizeY_, sizeZ_;
+  int strideW_, strideH_, strideD_;
+  int paddingW_, paddingH_, paddingD_;
+  int imgSizeW_, imgSizeH_, imgSizeD_;
+  int outputW_, outputH_, outputD_;
+  std::string poolType_;
+  MatrixPtr maxPoolIdx_;
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/PrintLayer.cpp
+++ b/paddle/gserver/layers/PrintLayer.cpp
@@ -48,7 +48,16 @@ public:
                 << inputLayers_.size() << ") at " << getName();
    }
    s << format.substr(pos);
-    LOG(INFO) << s.str();
+
+    const std::string delimiter("\n");
+    std::string content = s.str();
+    std::string::size_type foundPos = 0;
+    std::string::size_type prevPos = 0;
+    while ((foundPos = content.find(delimiter, prevPos)) != std::string::npos) {
+      LOG(INFO) << content.substr(prevPos, foundPos - prevPos);
+      prevPos = foundPos + delimiter.size();
+    }
+    LOG(INFO) << content.substr(prevPos);
  }

  void backward(const UpdateCallback& callback) override {}

--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1246,6 +1246,75 @@ TEST(Layer, PoolLayer) {
 #endif
 }

+void setPool3DConfig(TestConfig* config,
+                     PoolConfig* pool,
+                     const string& poolType) {
+  // filter size
+  const int NUM_FILTERS = 16;
+  const int FILTER_SIZE = 3;
+  const int FILTER_SIZE_Y = 3;
+  const int FILTER_SIZE_Z = 3;
+  const int CHANNELS = 16;
+
+  (*config).biasSize = 0;
+  (*config).layerConfig.set_type("pool3d");
+  (*config).layerConfig.set_num_filters(NUM_FILTERS);
+
+  int kw = FILTER_SIZE, kh = FILTER_SIZE_Y, kd = FILTER_SIZE_Z;
+  int pw = 0, ph = 0, pd = 0;
+  int sw = 2, sh = 2, sd = 2;
+
+  pool->set_pool_type(poolType);
+  pool->set_pool_type("avg");
+  pool->set_channels(CHANNELS);
+  pool->set_size_x(kw);
+  pool->set_size_y(kh);
+  pool->set_size_z(kd);
+  pool->set_padding(0);
+  pool->set_padding_y(0);
+  pool->set_padding_z(0);
+  pool->set_stride(sw);
+  pool->set_stride_y(sh);
+  pool->set_stride_z(sd);
+  pool->set_start(0);
+  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
+  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
+  int od = outputSize(pool->img_size_z(), kd, pd, sd, /* caffeMode */ false);
+  pool->set_output_x(ow);
+  pool->set_output_y(oh);
+  pool->set_output_z(od);
+}
+
+void testPool3DLayer(const string& poolType, bool trans, bool useGpu) {
+  TestConfig config;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 11664, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  PoolConfig* pool = input->mutable_pool_conf();
+
+  const int IMAGE_SIZE = 9;
+  const int IMAGE_SIZE_Y = 9;
+  const int IMAGE_SIZE_Z = 9;
+
+  pool->set_img_size(IMAGE_SIZE);
+  pool->set_img_size_y(IMAGE_SIZE_Y);
+  pool->set_img_size_z(IMAGE_SIZE_Z);
+
+  setPool3DConfig(&config, pool, poolType);
+  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
+                              pool->channels());
+
+  testLayerGrad(config, "pool3d", 100, trans, useGpu);
+}
+
+TEST(Layer, Pool3DLayer) {
+  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ false);
+  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ false);
+#ifndef PADDLE_ONLY_CPU
+  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ true);
+  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ true);
+#endif
+}
+
 void testSppLayer(const string& poolType,
                  const int pyramidHeight,
                  bool trans,
@@ -2047,6 +2116,159 @@ TEST(Layer, RowL2NormLayer) {
  }
 }

+void test3DConvLayer(const string& type, bool trans, bool useGpu) {
+  // filter size
+  const int NUM_FILTERS = 6;
+  // const int CHANNELS = 3;
+  const int FILTER_SIZE = 3;
+  const int FILTER_SIZE_Y = 3;
+  const int FILTER_SIZE_Z = 3;
+
+  // input image
+  const int CHANNELS = 3;
+  const int IMAGE_SIZE = 9;
+  const int IMAGE_SIZE_Y = 9;
+  const int IMAGE_SIZE_Z = 9;
+
+  TestConfig config;
+  config.biasSize = NUM_FILTERS;
+  config.layerConfig.set_type(type);
+  config.layerConfig.set_num_filters(NUM_FILTERS);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  // Setting up conv3D-trans layer
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+
+  conv->set_channels(CHANNELS);
+  conv->set_filter_size(FILTER_SIZE);
+  conv->set_filter_size_y(FILTER_SIZE_Y);
+  conv->set_filter_size_z(FILTER_SIZE_Z);
+  conv->set_padding(0);
+  conv->set_padding_y(0);
+  conv->set_padding_z(0);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_stride_z(2);
+  conv->set_img_size(IMAGE_SIZE);
+  conv->set_img_size_y(IMAGE_SIZE_Y);
+  conv->set_img_size_z(IMAGE_SIZE_Z);
+  conv->set_output_x(outputSize(conv->img_size(),
+                                conv->filter_size(),
+                                conv->padding(),
+                                conv->stride(),
+                                /*  caffeMode */ true));
+  conv->set_output_y(outputSize(conv->img_size_y(),
+                                conv->filter_size_y(),
+                                conv->padding_y(),
+                                conv->stride_y(),
+                                /*  caffeMode */ true));
+  conv->set_output_z(outputSize(conv->img_size_z(),
+                                conv->filter_size_z(),
+                                conv->padding_z(),
+                                conv->stride_z(),
+                                /*  caffeMode */ true));
+
+  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
+                              conv->output_z() * NUM_FILTERS);
+  conv->set_groups(1);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  config.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z,
+       conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z *
+           NUM_FILTERS});
+
+  testLayerGrad(config, "conv3D", 10, trans, useGpu);
+  // Use small batch_size and useWeight=true to test biasGrad
+  testLayerGrad(config, "conv3D", 2, trans, useGpu, true, 0.02);
+}
+
+TEST(Layer, test3DConvLayer) {
+  test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ false);
+#ifndef PADDLE_ONLY_CPU
+  test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ true);
+#endif
+}
+
+void test3DDeConvLayer(const string& type, bool trans, bool useGpu) {
+  // filter size
+  const int NUM_FILTERS = 6;
+  // const int CHANNELS = 3;
+  const int FILTER_SIZE = 3;
+  const int FILTER_SIZE_Y = 3;
+  const int FILTER_SIZE_Z = 3;
+
+  // input image
+  const int CHANNELS = 3;
+  const int IMAGE_SIZE = 4;
+  const int IMAGE_SIZE_Y = 6;
+  const int IMAGE_SIZE_Z = 6;
+
+  // Setting up conv-trans layer
+  TestConfig config;
+  config.biasSize = NUM_FILTERS;
+  config.layerConfig.set_type("deconv3d");
+  config.layerConfig.set_num_filters(NUM_FILTERS);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+
+  conv->set_channels(CHANNELS);
+  conv->set_filter_size(FILTER_SIZE);
+  conv->set_filter_size_y(FILTER_SIZE_Y);
+  conv->set_filter_size_z(FILTER_SIZE_Z);
+  conv->set_padding(0);
+  conv->set_padding_y(0);
+  conv->set_padding_z(0);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_stride_z(2);
+  conv->set_img_size(IMAGE_SIZE);
+  conv->set_img_size_y(IMAGE_SIZE_Y);
+  conv->set_img_size_z(IMAGE_SIZE_Z);
+  conv->set_output_x(imageSize(conv->img_size(),
+                               conv->filter_size(),
+                               conv->padding(),
+                               conv->stride(),
+                               true));
+  conv->set_output_y(imageSize(conv->img_size_y(),
+                               conv->filter_size_y(),
+                               conv->padding_y(),
+                               conv->stride_y(),
+                               true));
+  conv->set_output_z(imageSize(conv->img_size_z(),
+                               conv->filter_size_z(),
+                               conv->padding_z(),
+                               conv->stride_z(),
+                               true));
+  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
+                              conv->output_z() * NUM_FILTERS);
+  conv->set_groups(1);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  config.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z,
+       conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z *
+           NUM_FILTERS});
+
+  testLayerGrad(config, "deconv3D", 10, trans, useGpu);
+  // Use small batch_size and useWeight=true to test biasGrad
+  testLayerGrad(config, "deconv3D", 2, trans, useGpu, true, 0.02);
+}
+
+TEST(Layer, test3DDeConvLayer) {
+  test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ false);
+#ifndef PADDLE_ONLY_CPU
+  test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ true);
+#endif
+}
+
 TEST(Layer, ScaleShiftLayer) {
  const size_t batchSize = 16;
  const size_t size = 32;

--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -928,15 +928,102 @@ public:
                               size_t paddingW) {
    LOG(FATAL) << "Not implemeted";
  }
-
  /**
-   * Input: one or more sequences. Each sequence contains some instances.
-   *
-   * Output: output size is the number of input sequences (NOT input
-   * instances).
-   *
-   * output[i] is set to max_input[i].
+   * Pooling 3D forward operation, pick out the largest element
+   * in the sizeX of value
   */
+  virtual void maxPool3DForward(Matrix& inputMat,
+                                Matrix& maxPoolIdx,
+                                size_t channels,
+                                size_t imgSizeD,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t outputD,
+                                size_t outputH,
+                                size_t outputW,
+                                size_t sizeZ,
+                                size_t sizeY,
+                                size_t sizeX,
+                                size_t strideD,
+                                size_t strideH,
+                                size_t strideW,
+                                size_t paddingD,
+                                size_t paddingH,
+                                size_t paddingW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  virtual void maxPool3DBackward(Matrix& outGrad,
+                                 Matrix& maxPoolIdx,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW,
+                                 real scaleTargets,
+                                 real scaleOutput) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  virtual void avgPool3DForward(Matrix& input,
+                                size_t channels,
+                                size_t imgSizeD,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t outputD,
+                                size_t outputH,
+                                size_t outputW,
+                                size_t sizeZ,
+                                size_t sizeY,
+                                size_t sizeX,
+                                size_t strideD,
+                                size_t strideH,
+                                size_t strideW,
+                                size_t paddingD,
+                                size_t paddingH,
+                                size_t paddingW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  virtual void avgPool3DBackward(Matrix& input,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW,
+                                 real scaleTargets,
+                                 real scaleOutput) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+ * Input: one or more sequences. Each sequence contains some instances.
+ *
+ * Output: output size is the number of input sequences (NOT input
+ * instances).
+ *
+ * output[i] is set to max_input[i].
+ */
  virtual void maxSequenceForward(Matrix& input,
                                  const IVector& sequence,
                                  IVector& index) {
@@ -1039,6 +1126,42 @@ public:
    LOG(FATAL) << "Not implemented";
  }

+  virtual void vol2Col(real* data,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  virtual void col2Vol(real* trg,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW,
+                       real alpha,
+                       real beta) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
  virtual void bilinearForward(const Matrix& in,
                               const size_t inImgH,
                               const size_t inImgW,
@@ -1348,6 +1471,82 @@ public:
                       size_t paddingH,
                       size_t paddingW);

+  void maxPool3DForward(Matrix& inputMat,
+                        Matrix& maxPoolIdx,
+                        size_t channels,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void maxPool3DBackward(Matrix& outGrad,
+                         Matrix& maxPoolIdx,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW,
+                         real scaleTargets,
+                         real scaleOutput);
+
+  void avgPool3DForward(Matrix& input,
+                        size_t channels,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void avgPool3DBackward(Matrix& input,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW,
+                         real scaleTargets,
+                         real scaleOutput);
+
  void maxSequenceForward(Matrix& input,
                          const IVector& sequence,
                          IVector& index);
@@ -1374,6 +1573,38 @@ public:
                        const real ratioH,
                        const real ratioW);

+  void vol2Col(real* data,
+               int channels,
+               int depth,
+               int height,
+               int width,
+               int filterD,
+               int filterH,
+               int filterW,
+               int strideD,
+               int strideH,
+               int strideW,
+               int paddingD,
+               int paddingH,
+               int paddingW);
+
+  void col2Vol(real* trg,
+               int channels,
+               int depth,
+               int height,
+               int width,
+               int filterD,
+               int filterH,
+               int filterW,
+               int strideD,
+               int strideH,
+               int strideW,
+               int paddingD,
+               int paddingH,
+               int paddingW,
+               real alpha,
+               real beta);
+
  void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);

  void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
@@ -1507,6 +1738,82 @@ public:
                       size_t paddingH,
                       size_t paddingW);

+  void maxPool3DForward(Matrix& inputMat,
+                        Matrix& maxPoolIdx,
+                        size_t channels,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void maxPool3DBackward(Matrix& outGrad,
+                         Matrix& maxPoolIdx,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW,
+                         real scaleTargets,
+                         real scaleOutput);
+
+  void avgPool3DForward(Matrix& input,
+                        size_t channels,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void avgPool3DBackward(Matrix& input,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW,
+                         real scaleTargets,
+                         real scaleOutput);
+
  void maxSequenceForward(Matrix& input,
                          const IVector& sequence,
                          IVector& index);
@@ -1715,6 +2022,38 @@ public:
                        const real ratioH,
                        const real ratioW);

+  void vol2Col(real* data,
+               int channels,
+               int depth,
+               int height,
+               int width,
+               int filterD,
+               int filterH,
+               int filterW,
+               int strideD,
+               int strideH,
+               int strideW,
+               int paddingD,
+               int paddingH,
+               int paddingW);
+
+  void col2Vol(real* trg,
+               int channels,
+               int depth,
+               int height,
+               int width,
+               int filterD,
+               int filterH,
+               int filterW,
+               int strideD,
+               int strideH,
+               int strideW,
+               int paddingD,
+               int paddingH,
+               int paddingW,
+               real alpha,
+               real beta);
+
  template <typename ExpressionType>
  void operator=(const ExpressionType& expr) {
    TensorCpuApply<real>(*this, expr);

--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -18,6 +18,7 @@ limitations under the License. */

 #include <gtest/gtest.h>
 #include "TensorCheck.h"
+#include "paddle/math/MathUtils.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
 #include "paddle/testing/TestUtil.h"
@@ -1203,4 +1204,497 @@ TEST(Matrix, warpCTC) {
  }
 }

+void testMaxPool3DFwdBwd(int numSamples,
+                         int channels,
+                         int imgSizeD,
+                         int imgSizeH,
+                         int imgSizeW,
+                         int ksizeD,
+                         int ksizeH,
+                         int ksizeW,
+                         int strideD,
+                         int strideH,
+                         int strideW,
+                         int padD,
+                         int padH,
+                         int padW) {
+  int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true);
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
+
+  int inWidth = channels * imgSizeD * imgSizeH * imgSizeW;
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  int outWidth = channels * outD * outH * outW;
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+  MatrixPtr maxIdx = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr maxIdxGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+
+  input->randomizeUniform();
+  target->randomizeUniform();
+  inputGpu->copyFrom(*input);
+  targetGpu->copyFrom(*target);
+
+  target->maxPool3DForward(*input,
+                           *maxIdx,
+                           channels,
+                           imgSizeD,
+                           imgSizeH,
+                           imgSizeW,
+                           outD,
+                           outH,
+                           outW,
+                           ksizeD,
+                           ksizeH,
+                           ksizeW,
+                           strideD,
+                           strideH,
+                           strideW,
+                           padD,
+                           padH,
+                           padW);
+  targetGpu->maxPool3DForward(*inputGpu,
+                              *maxIdxGpu,
+                              channels,
+                              imgSizeD,
+                              imgSizeH,
+                              imgSizeW,
+                              outD,
+                              outH,
+                              outW,
+                              ksizeD,
+                              ksizeH,
+                              ksizeW,
+                              strideD,
+                              strideH,
+                              strideW,
+                              padD,
+                              padH,
+                              padW);
+  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
+  targetCheck->copyFrom(*targetGpu);
+  checkMatrixEqual(target, targetCheck);
+
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad =
+      GpuMatrix::create(numSamples, outWidth, false, true);
+
+  inputGrad->randomizeUniform();
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+
+  inputGrad->maxPool3DBackward(*targetGrad,
+                               *maxIdx,
+                               imgSizeD,
+                               imgSizeH,
+                               imgSizeW,
+                               outD,
+                               outH,
+                               outW,
+                               ksizeD,
+                               ksizeH,
+                               ksizeW,
+                               strideD,
+                               strideH,
+                               strideW,
+                               padD,
+                               padH,
+                               padW,
+                               1.0,
+                               1.0);
+  inputGpuGrad->maxPool3DBackward(*targetGpuGrad,
+                                  *maxIdxGpu,
+                                  imgSizeD,
+                                  imgSizeH,
+                                  imgSizeW,
+                                  outD,
+                                  outH,
+                                  outW,
+                                  ksizeD,
+                                  ksizeH,
+                                  ksizeW,
+                                  strideD,
+                                  strideH,
+                                  strideW,
+                                  padD,
+                                  padH,
+                                  padW,
+                                  1.0,
+                                  1.0);
+  MatrixPtr targetBwdCheck =
+      CpuMatrix::create(numSamples, inWidth, false, false);
+  targetBwdCheck->copyFrom(*inputGpuGrad);
+  checkMatrixEqual(inputGrad, targetBwdCheck);
+}
+
+void testAvgPool3DFwdBwd(int numSamples,
+                         int channels,
+                         int imgSizeD,
+                         int imgSizeH,
+                         int imgSizeW,
+                         int ksizeD,
+                         int ksizeH,
+                         int ksizeW,
+                         int strideD,
+                         int strideH,
+                         int strideW,
+                         int padD,
+                         int padH,
+                         int padW) {
+  int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true);
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
+
+  int inWidth = imgSizeD * imgSizeH * imgSizeW * channels;
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  int outWidth = channels * outD * outH * outW;
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+
+  input->randomizeUniform();
+  target->randomizeUniform();
+  inputGpu->copyFrom(*input);
+  targetGpu->copyFrom(*target);
+
+  target->avgPool3DForward(*input,
+                           channels,
+                           imgSizeD,
+                           imgSizeH,
+                           imgSizeW,
+                           outD,
+                           outH,
+                           outW,
+                           ksizeD,
+                           ksizeH,
+                           ksizeW,
+                           strideD,
+                           strideH,
+                           strideW,
+                           padD,
+                           padH,
+                           padW);
+
+  targetGpu->avgPool3DForward(*inputGpu,
+                              channels,
+                              imgSizeD,
+                              imgSizeH,
+                              imgSizeW,
+                              outD,
+                              outH,
+                              outW,
+                              ksizeD,
+                              ksizeH,
+                              ksizeW,
+                              strideD,
+                              strideH,
+                              strideW,
+                              padD,
+                              padH,
+                              padW);
+
+  TensorCheckErr(*target, *targetGpu);
+
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad =
+      GpuMatrix::create(numSamples, outWidth, false, true);
+
+  inputGrad->randomizeUniform();
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+
+  inputGrad->avgPool3DBackward(*targetGrad,
+                               imgSizeD,
+                               imgSizeH,
+                               imgSizeW,
+                               outD,
+                               outH,
+                               outW,
+                               ksizeD,
+                               ksizeH,
+                               ksizeW,
+                               strideD,
+                               strideH,
+                               strideW,
+                               padD,
+                               padH,
+                               padW,
+                               1.0,
+                               1.0);
+
+  inputGpuGrad->avgPool3DBackward(*targetGpuGrad,
+                                  imgSizeD,
+                                  imgSizeH,
+                                  imgSizeW,
+                                  outD,
+                                  outH,
+                                  outW,
+                                  ksizeD,
+                                  ksizeH,
+                                  ksizeW,
+                                  strideD,
+                                  strideH,
+                                  strideW,
+                                  padD,
+                                  padH,
+                                  padW,
+                                  1.0,
+                                  1.0);
+  TensorCheckErr(*inputGrad, *inputGpuGrad);
+}
+
+// TODO(yi): I noticed many such blindly combinatorial tests in this
+// file.  They are no help to locate defects at all.
+TEST(Matrix, Pool3DFwdBwd) {
+  for (auto numSamples : {1, 3}) {
+    for (auto channels : {3}) {
+      for (auto imgSizeD : {9, 16}) {
+        for (auto imgSizeH : {9, 32}) {
+          for (auto imgSizeW : {9, 32}) {
+            for (auto sizeX : {3}) {
+              for (auto sizeY : {3}) {
+                for (auto sizeZ : {3}) {
+                  for (auto sD : {2}) {
+                    for (auto sH : {2}) {
+                      for (auto sW : {2}) {
+                        for (auto pD : {0, (sizeZ - 1) / 2}) {
+                          for (auto pH : {0, (sizeY - 1) / 2}) {
+                            for (auto pW : {0, (sizeX - 1) / 2}) {
+                              VLOG(3) << " numSamples=" << numSamples
+                                      << " channels=" << channels
+                                      << " imgSizeD=" << imgSizeD
+                                      << " imgSizeH=" << imgSizeH
+                                      << " imgSizeW=" << imgSizeW
+                                      << " sizeX=" << sizeX
+                                      << " sizeY=" << sizeY
+                                      << " sizeZ=" << sizeZ << " strideD=" << sD
+                                      << " strideH=" << sH << " strideW=" << sW
+                                      << " padingD=" << pD << " padingH=" << pH
+                                      << " padingW=" << pW;
+
+                              testMaxPool3DFwdBwd(numSamples,
+                                                  channels,
+                                                  imgSizeD,
+                                                  imgSizeH,
+                                                  imgSizeW,
+                                                  sizeX,
+                                                  sizeY,
+                                                  sizeZ,
+                                                  sD,
+                                                  sH,
+                                                  sW,
+                                                  pD,
+                                                  pH,
+                                                  pW);
+                              testAvgPool3DFwdBwd(numSamples,
+                                                  channels,
+                                                  imgSizeD,
+                                                  imgSizeH,
+                                                  imgSizeW,
+                                                  sizeX,
+                                                  sizeY,
+                                                  sizeZ,
+                                                  sD,
+                                                  sH,
+                                                  sW,
+                                                  pD,
+                                                  pH,
+                                                  pW);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  //  for (auto numSamples : {1, 3}) {
+  //    for (auto channels : {1, 3}) {
+  //      for (auto imgSizeD : {9,16}) {
+  //      for (auto imgSizeH : {9, 32}) {
+  //        for (auto imgSizeW : {9, 32}) {
+  //          for (auto sizeX : {2, 3}) {
+  //            for (auto sizeY : {2, 3}) {
+  //            for (auto sizeZ : {2,3}){
+  //              for (auto sD : {1, 2}) {
+  //              for (auto sH : {1, 2}) {
+  //                for (auto sW : {1, 2}) {
+  //                  for (auto pD : {0, (sizeZ - 1) / 2}){
+  //                  for (auto pH : {0, (sizeY - 1) / 2}) {
+  //                    for (auto pW : {0, (sizeX - 1) / 2}) {
+  //                      VLOG(3) << " numSamples=" << numSamples
+  //                              << " channels=" << channels
+  //                              << " imgSizeD=" << imgSizeD
+  //                              << " imgSizeH=" << imgSizeH
+  //                              << " imgSizeW=" << imgSizeW
+  //                              << " sizeX=" << sizeX
+  //                              << " sizeY=" << sizeY
+  //                              << " sizeZ=" << sizeZ
+  //                              << " strideD=" << sD
+  //                              << " strideH=" << sH
+  //                              << " strideW=" << sW
+  //                              << " padingD=" << pD
+  //                              << " padingH=" << pH
+  //                              << " padingW=" << pW;
+  //
+  //                      testMaxPool3DFwdBwd(numSamples,
+  //                                        channels,
+  //                                        imgSizeD,
+  //                                        imgSizeH,
+  //                                        imgSizeW,
+  //                                        sizeX,
+  //                                        sizeY,
+  //                                        sizeZ,
+  //                                        sD,
+  //                                        sH,
+  //                                        sW,
+  //                                        pD,
+  //                                        pH,
+  //                                        pW);
+  //                      testAvgPool3DFwdBwd(numSamples,
+  //                                        channels,
+  //                                        imgSizeD,
+  //                                        imgSizeH,
+  //                                        imgSizeW,
+  //                                        sizeX,
+  //                                        sizeY,
+  //                                        sizeZ,
+  //                                        sD,
+  //                                        sH,
+  //                                        sW,
+  //                                        pD,
+  //                                        pH,
+  //                                        pW);
+  //                    }
+  //                  }
+  //                }
+  //              }
+  //            }
+  //            }
+  //          }
+  //        }
+  //      }
+  //      }
+  //    }
+  //    }
+  //  }
+  //  }
+}
+
+void testMatrixCol2Vol(int depth, int height, int width) {
+  int channel = 3;
+  int filterX = 3, filterY = 4, filterZ = 5;
+  int strideX = 2, strideY = 2, strideZ = 2;
+  int padX = 1, padY = 1, padZ = 1;
+
+  MatrixPtr cpuImage =
+      std::make_shared<CpuMatrix>(channel, depth * height * width);
+  MatrixPtr gpuImage =
+      std::make_shared<GpuMatrix>(channel, depth * height * width);
+  cpuImage->randomizeUniform();
+  gpuImage->copyFrom(*cpuImage);
+
+  int outD = outputSize(depth, filterZ, padZ, strideZ, true);
+  int outH = outputSize(height, filterY, padY, strideY, true);
+  int outW = outputSize(width, filterX, padX, strideX, true);
+
+  int colBufHeight = channel * filterZ * filterY * filterX;
+  int colBufWidth = outD * outH * outW;
+  MatrixPtr cpuColBuf = std::make_shared<CpuMatrix>(colBufHeight, colBufWidth);
+  MatrixPtr gpuColBuf = std::make_shared<GpuMatrix>(colBufHeight, colBufWidth);
+  cpuColBuf->vol2Col(cpuImage->getData(),
+                     channel,
+                     depth,
+                     height,
+                     width,
+                     filterZ,
+                     filterY,
+                     filterX,
+                     strideZ,
+                     strideY,
+                     strideX,
+                     padZ,
+                     padY,
+                     padX);
+  gpuColBuf->vol2Col(gpuImage->getData(),
+                     channel,
+                     depth,
+                     height,
+                     width,
+                     filterZ,
+                     filterY,
+                     filterX,
+                     strideZ,
+                     strideY,
+                     strideX,
+                     padZ,
+                     padY,
+                     padX);
+  TensorCheckEqual(*cpuColBuf, *gpuColBuf);
+
+  cpuColBuf->randomizeUniform();
+  gpuColBuf->copyFrom(*cpuColBuf);
+  cpuColBuf->col2Vol(cpuImage->getData(),
+                     channel,
+                     depth,
+                     height,
+                     width,
+                     filterZ,
+                     filterY,
+                     filterX,
+                     strideZ,
+                     strideY,
+                     strideX,
+                     padZ,
+                     padY,
+                     padX,
+                     1.0,
+                     1.0);
+  gpuColBuf->col2Vol(gpuImage->getData(),
+                     channel,
+                     depth,
+                     height,
+                     width,
+                     filterZ,
+                     filterY,
+                     filterX,
+                     strideZ,
+                     strideY,
+                     strideX,
+                     padZ,
+                     padY,
+                     padX,
+                     1.0,
+                     1.0);
+  TensorCheckErr(*cpuImage, *gpuImage);
+}
+
+TEST(Matrix, col2Vol) {
+  for (auto depth : {9, 16, 64}) {
+    for (auto height : {9, 11, 128}) {
+      for (auto width : {9, 32, 128}) {
+        VLOG(3) << "depth=" << depth << " height=" << height
+                << " width=" << width;
+        testMatrixCol2Vol(depth, height, width);
+      }
+    }
+  }
+}
+
 #endif
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
+file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
+string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
 function(op_library TARGET)
    # op_library is a function to create op library. The interface is same as
    # cc_library. But it handle split GPU/CPU code and link some common library
    # for ops.
+    set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
    set(cc_srcs)
    set(cu_srcs)
    set(op_common_deps operator op_registry)
@@ -43,33 +46,26 @@ endfunction()

 add_subdirectory(math)

-cc_test(gather_test SRCS gather_test.cc DEPS tensor)
-op_library(gather_op SRCS gather_op.cc gather_op.cu)
-
-cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
-op_library(scatter_op SRCS scatter_op.cc scatter_op.cu)
-
-cc_library(net_op SRCS net_op.cc DEPS op_registry)
-cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
-
-op_library(add_op SRCS add_op.cc add_op.cu)
-
-op_library(mean_op SRCS mean_op.cc mean_op.cu)
+list(REMOVE_ITEM GENERAL_OPS
+     net_op
+     minus_op
+     mul_op
+     recurrent_op
+     scale_op)

+op_library(net_op SRCS net_op.cc)
+op_library(minus_op SRCS minus_op.cc minus_op.cu DEPS scale_op)
 op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function)
-op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc)
+op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc 
+  DEPS framework_proto tensor operator net_op)
+op_library(scale_op SRCS scale_op.cc scale_op.cu DEPS net_op)

-op_library(sigmoid_op SRCS sigmoid_op.cc sigmoid_op.cu)
-op_library(softmax_op SRCS softmax_op.cc softmax_op.cu)
-op_library(gaussian_random_op SRCS gaussian_random_op.cc gaussian_random_op.cu)
-op_library(cross_entropy_op SRCS cross_entropy_op.cc cross_entropy_op.cu)
-op_library(fill_zeros_like_op SRCS fill_zeros_like_op.cc fill_zeros_like_op.cu)
+foreach(src ${GENERAL_OPS})
+    op_library(${src} SRCS ${src}.cc ${src}.cu)
+endforeach()

-op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)
+set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")

-op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
-    DEPS framework_proto tensor op_registry operator net_op)
-op_library(uniform_random_op SRCS uniform_random_op.cc uniform_random_op.cu)
-op_library(lookup_table_op SRCS lookup_table_op.cc lookup_table_op.cu)
-op_library(scale_op SRCS scale_op.cc scale_op.cu DEPS net_op)
-op_library(minus_op SRCS minus_op.cc minus_op.cu DEPS scale_op)
+cc_test(gather_test SRCS gather_test.cc DEPS tensor)
+cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
+cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
@@ -57,7 +57,7 @@ class AddOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(add_two, ops::AddOp, ops::AddOpMaker, add_two_grad, ops::AddOpGrad);
+REGISTER_OP(add_two, ops::AddOp, ops::AddOpMaker, ops::AddOpGrad);

 REGISTER_OP_CPU_KERNEL(add_two,
                       ops::AddKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -67,8 +67,7 @@ OnehotCrossEntropy Operator.

 namespace ops = paddle::operators;
 REGISTER_OP(onehot_cross_entropy, ops::OnehotCrossEntropyOp,
-            ops::OnehotCrossEntropyOpMaker, onehot_cross_entropy_grad,
-            ops::OnehotCrossEntropyGradientOp);
+            ops::OnehotCrossEntropyOpMaker, ops::OnehotCrossEntropyGradientOp);
 REGISTER_OP_CPU_KERNEL(onehot_cross_entropy,
                       ops::OnehotCrossEntropyOpKernel<float>);
 REGISTER_OP_CPU_KERNEL(onehot_cross_entropy_grad,

--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -63,8 +63,7 @@ Out = X[Index]
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad,
-            ops::GatherGradOp);
+REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, ops::GatherGradOp);
 REGISTER_OP_CPU_KERNEL(gather,
                       ops::GatherOpKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(

--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -66,7 +66,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {

 namespace ops = paddle::operators;
 REGISTER_OP(lookup_table, ops::LookupTableOp, ops::LookupTableOpMaker,
-            lookup_table_grad, ops::LookupTableOpGrad);
+            ops::LookupTableOpGrad);

 REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel<float>);
 REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel<float>);
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -54,7 +54,7 @@ class MeanGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(mean, ops::MeanOp, ops::MeanOpMaker, mean_grad, ops::MeanGradOp);
+REGISTER_OP(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradOp);
 REGISTER_OP_CPU_KERNEL(mean,
                       ops::MeanKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(mean_grad,

--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -81,7 +81,6 @@ class MinusGradOp : public NetOp {
 USE_OP(scale);
 USE_OP_ITSELF(identity);
 namespace ops = paddle::operators;
-REGISTER_OP(minus, ops::MinusOp, ops::MinusOpMaker, minus_grad,
-            ops::MinusGradOp<float>);
+REGISTER_OP(minus, ops::MinusOp, ops::MinusOpMaker, ops::MinusGradOp<float>);
 REGISTER_OP_CPU_KERNEL(minus,
                       ops::MinusKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -84,7 +84,7 @@ class MulOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpGrad);
 REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(mul_grad,
                       ops::MulGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -74,7 +74,7 @@ class RowwiseAddGradOp : public framework::OperatorWithKernel {

 namespace ops = paddle::operators;
 REGISTER_OP(rowwise_add, ops::RowwiseAddOp, ops::RowwiseAddOpMaker,
-            rowwise_add_grad, ops::RowwiseAddGradOp);
+            ops::RowwiseAddGradOp);
 REGISTER_OP_CPU_KERNEL(
    rowwise_add, ops::RowwiseAddKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(

--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -97,7 +97,7 @@ class IdentityOp : public NetOp {

 namespace ops = paddle::operators;

-REGISTER_OP(scale, ops::ScaleOp, ops::ScaleOpMaker<float>, scale_grad,
+REGISTER_OP(scale, ops::ScaleOp, ops::ScaleOpMaker<float>,
            ops::ScaleGradOp<float>);
 REGISTER_OP_CPU_KERNEL(scale,
                       ops::ScaleKernel<paddle::platform::CPUPlace, float>);

--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
@@ -77,8 +77,7 @@ Out[Index] = Ref[Index] + Updates
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad,
-            ops::ScatterGradOp);
+REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, ops::ScatterGradOp);
 REGISTER_OP_CPU_KERNEL(scatter,
                       ops::ScatterOpKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(

--- a/paddle/operators/sigmoid_op.cc
+++ b/paddle/operators/sigmoid_op.cc
@@ -53,8 +53,7 @@ class SigmoidOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker, sigmoid_grad,
-            ops::SigmoidOpGrad);
+REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker, ops::SigmoidOpGrad);
 REGISTER_OP_CPU_KERNEL(sigmoid,
                       ops::SigmoidKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(

--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -62,8 +62,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {

 namespace ops = paddle::operators;

-REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, softmax_grad,
-            ops::SoftmaxOpGrad);
+REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, ops::SoftmaxOpGrad);
 REGISTER_OP_CPU_KERNEL(softmax,
                       ops::SoftmaxKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(

--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -186,6 +186,7 @@ void Argument::resizeAndCopyFrom(const Argument& src,
  resizeAndCopy(strs, src.strs, useGpu, stream);
  frameWidth = src.frameWidth;
  frameHeight = src.frameHeight;
+  frameDepth = src.frameDepth;
 }

 int32_t Argument::resizeAndCopyFrom(const Argument& src,
@@ -206,6 +207,7 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src,
  dataId = src.dataId;
  frameWidth = src.frameWidth;
  frameHeight = src.frameHeight;
+  frameDepth = src.frameDepth;

  if (!src.sequenceStartPositions) {
    // non-sequence input, copy samples directly

--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
    http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -35,6 +32,7 @@ struct Argument {
        strs(nullptr),
        frameHeight(0),
        frameWidth(0),
+        frameDepth(0),
        sequenceStartPositions(nullptr),
        subSequenceStartPositions(nullptr),
        cpuSequenceDims(nullptr),
@@ -64,6 +62,7 @@ struct Argument {
    allCount = argument.allCount;
    frameHeight = argument.frameHeight;
    frameWidth = argument.frameWidth;
+    frameDepth = argument.frameDepth;
    dataId = argument.dataId;
  }

@@ -76,6 +75,7 @@ struct Argument {
  // A dataBatch includes batchSize frames, one frame maybe not only vector
  size_t frameHeight;
  size_t frameWidth;
+  size_t frameDepth;

  // If NULL, each position is treated independently.
  // Otherwise, its size should be #NumberOfSequences + 1.
@@ -136,8 +136,10 @@ struct Argument {
  }
  size_t getFrameHeight() const { return frameHeight; }
  size_t getFrameWidth() const { return frameWidth; }
+  size_t getFrameDepth() const { return frameDepth; }
  void setFrameHeight(size_t h) { frameHeight = h; }
  void setFrameWidth(size_t w) { frameWidth = w; }
+  void setFrameDepth(size_t d) { frameDepth = d; }

  int64_t getNumSequences() const {
    return sequenceStartPositions ? sequenceStartPositions->getSize() - 1

--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
@@ -2,21 +2,5 @@ if(WITH_PYTHON)
 cc_library(paddle_pybind SHARED
    SRCS pybind.cc
    DEPS pybind python backward
-    sgd_op
-    gather_op
-    scatter_op
-    add_op
-    mul_op
-    rowwise_add_op
-    sigmoid_op
-    softmax_op
-    mean_op
-    cross_entropy_op
-    recurrent_op
-    uniform_random_op
-    gaussian_random_op
-    fill_zeros_like_op
-    lookup_table_op
-    scale_op
-    minus_op)
+    ${GLOB_OP_LIB})
 endif(WITH_PYTHON)
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -85,6 +85,12 @@ message ConvConfig {

  optional uint32 dilation = 15 [ default = 1 ];
  optional uint32 dilation_y = 16 [ default = 1 ];
+
+  optional uint32 filter_size_z = 17 [ default = 1 ];
+  optional uint32 padding_z = 18 [ default = 1 ];
+  optional uint32 stride_z = 19 [ default = 1 ];
+  optional uint32 output_z = 20 [ default = 1 ];
+  optional uint32 img_size_z = 21 [ default = 1 ];
 }

 message PoolConfig {
@@ -127,6 +133,12 @@ message PoolConfig {

  // if not set, use padding
  optional uint32 padding_y = 13;
+
+  optional uint32 size_z = 14 [ default = 1 ];
+  optional uint32 stride_z = 15 [ default = 1 ];
+  optional uint32 output_z = 16 [ default = 1 ];
+  optional uint32 img_size_z = 17 [ default = 1 ];
+  optional uint32 padding_z = 18 [ default = 1 ];
 }

 message SppConfig {
@@ -502,6 +514,8 @@ message LayerConfig {

  // for HuberRegressionLoss
  optional double delta = 57 [ default = 1.0 ];
+
+  optional uint64 depth = 58 [ default = 1 ];
 }

 message EvaluatorConfig {

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
--- a/python/paddle/trainer/recurrent_units.py
+++ b/python/paddle/trainer/recurrent_units.py
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -8,7 +8,8 @@ test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
 test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
 test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
 test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
-test_kmax_seq_socre_layer test_seq_select_layers test_scale_shift_layer
-test_seq_slice_layer test_cross_entropy_over_beam)
+test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
+test_seq_slice_layer test_cross_entropy_over_beam test_pooling3D_layer
+test_conv3d_layer test_deconv3d_layer)

 export whole_configs=(test_split_datasource)
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_conv3d_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_conv3d_layer.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_deconv3d_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_deconv3d_layer.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pooling3D_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pooling3D_layer.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
@@ -4,6 +4,6 @@ from paddle.trainer_config_helpers import *

 data = data_layer(name="input_seq", size=128)
 scores = fc_layer(input=data, size=1, act=ExpActivation())
-kmax_seq_id = kmax_sequence_score_layer(input=scores, beam_size=5)
+kmax_seq_id = kmax_seq_score_layer(input=scores, beam_size=5)

 outputs(kmax_seq_id)
--- a/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_seq_select_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_select_layers.py
--- a/python/paddle/trainer_config_helpers/tests/layers_test.py
+++ b/python/paddle/trainer_config_helpers/tests/layers_test.py
@@ -17,3 +17,4 @@ from paddle.trainer.config_parser import parse_config_and_serialize
 if __name__ == '__main__':
    parse_config_and_serialize(
        'trainer_config_helpers/tests/layers_test_config.py', '')
+# layers_test_config.py
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -78,6 +78,8 @@ def init(**kwargs):

    if 'use_gpu' in kwargs:
        cp.g_command_config_args['use_gpu'] = kwargs['use_gpu']
+    if 'use_mkldnn' in kwargs:
+        cp.g_command_config_args['use_mkldnn'] = kwargs['use_mkldnn']
    assert 'parallel_nn' not in kwargs, ("currently 'parallel_nn' is not "
                                         "supported in v2 APIs.")