diff --git a/Dockerfile b/Dockerfile
index 98f61ba586a681e53b435d592c8e43b1cc964139..136db772cc6a24b8084120fa6bab666bc1eda78e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,13 +10,11 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
 ARG WITH_GPU
 ARG WITH_AVX
 ARG WITH_DOC
-ARG WITH_STYLE_CHECK
 
 ENV WOBOQ OFF
-ENV WITH_GPU=${WITH_GPU:-OFF}
+ENV WITH_GPU=${WITH_GPU:-ON}
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV WITH_DOC=${WITH_DOC:-OFF}
-ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 
 ENV HOME /root
 # Add bash enhancements
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 25c6b4ef52d3f8ebff1572ae8d348be7c577c08c..9686df00219001769d074ee815d9cc8db0258496 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -51,7 +51,7 @@ ExternalProject_Add(
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS             ${MKLDNN_DEPENDS}
     GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "v0.9"
+    GIT_TAG             "v0.10"
     PREFIX              ${MKLDNN_SOURCES_DIR}
     UPDATE_COMMAND      ""
     CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index e9fd3d4bedc983ae7c544cf289dc841cf22f9de4..74f3279831357c21038df133df0f5a432a6dfd20 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)
 
 SET(MKLML_PROJECT       "extern_mklml")
 SET(MKLML_VER           "mklml_lnx_2018.0.20170720")
-SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
+SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")
@@ -54,7 +54,8 @@ ExternalProject_Add(
     ${EXTERNAL_PROJECT_LOG_ARGS}
     PREFIX                ${MKLML_SOURCE_DIR}
     DOWNLOAD_DIR          ${MKLML_DOWNLOAD_DIR}
-    DOWNLOAD_COMMAND      wget --no-check-certificate -qO- ${MKLML_URL} | tar xz -C ${MKLML_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND      wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz 
+                          && tar zxf ${MKLML_VER}.tgz
     DOWNLOAD_NO_PROGRESS  1
     UPDATE_COMMAND        ""
     CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT}
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 0eeccbf7d8a1df17351c8914df6dabf005802787..0002a470d90f722e3f9106ca56d70e6bf2cea339 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -25,7 +25,12 @@ IF(NOT ${CBLAS_FOUND})
         "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
         CACHE FILEPATH "openblas library." FORCE)
 
-    SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs)
+    IF(APPLE)
+        SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
+        SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
+    ELSE()
+        SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs)
+    ENDIF()
 
     IF(CMAKE_CROSSCOMPILING)
         IF(ANDROID)
@@ -40,11 +45,11 @@ IF(NOT ${CBLAS_FOUND})
             SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=${TARGET} ARM_SOFTFP_ABI=1 USE_THREAD=0)
         ELSEIF(RPI)
             # use hardfp
-            SET(OPENBLAS_COMMIT "v0.2.19")
+            SET(OPENBLAS_COMMIT "v0.2.20")
             SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0)
         ENDIF()
     ELSE()
-        SET(OPENBLAS_COMMIT "v0.2.19")
+        SET(OPENBLAS_COMMIT "v0.2.20")
         SET(OPTIONAL_ARGS "")
         IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
             SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
diff --git a/doc/about/index_cn.md b/doc/about/index_cn.md
deleted file mode 100644
index 3bf030004d4de8c6f3cb773c6e78c09f40878c5f..0000000000000000000000000000000000000000
--- a/doc/about/index_cn.md
+++ /dev/null
@@ -1,11 +0,0 @@
-关于PaddlePaddle
-================
-
-PaddlePaddle是一个最早由百度科学家和工程师共同研发的并行分布式深度学习平台，兼备易用性、高效性、灵活性和可扩展性，目前已被百度内部多个产品线广泛使用。
-PaddlePaddle目前已经开放源码, 但是远未完善，我们希望能在这个基础上不断的改进、扩展和延伸。
-同时我们希望广大开发者积极提供反馈和贡献源代码，建立一个活跃的开源社区。
-
-致谢
---------
-
-在此，特别感谢PaddlePaddle的[所有贡献者](https://github.com/PaddlePaddle/Paddle/graphs/contributors)。
diff --git a/doc/about/index_en.rst b/doc/about/index_en.rst
deleted file mode 100644
index 065c430cdea802ed3c9f487cd00255b85a5598a5..0000000000000000000000000000000000000000
--- a/doc/about/index_en.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-ABOUT
-=======
-
-PaddlPaddle is an easy-to-use, efficient, flexible and scalable deep learning platform,
-which is originally developed by Baidu scientists and engineers for the purpose of applying deep learning to many products at Baidu.
-
-PaddlePaddle is now open source but far from complete, which is intended to be built upon, improved, scaled, and extended.
-We hope to build an active open source community both by providing feedback and by actively contributing to the source code.
-
-
-Credits
---------
-
-We owe many thanks to `all contributors and developers <https://github.com/PaddlePaddle/Paddle/graphs/contributors>`_ of PaddlePaddle!
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index 2273c8e8698c1e2d1b02f8b2fbbf5a6e26cccf71..1329b77bb44f52c66a703740715b890c47234e72 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -419,9 +419,14 @@ multi_binary_label_cross_entropy_cost
 ..  autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
     :noindex:
 
-huber_cost
-----------
-..  autoclass:: paddle.v2.layer.huber_cost
+huber_regression_cost
+-------------------------
+..  autoclass:: paddle.v2.layer.huber_regression_cost
+    :noindex:
+
+huber_classification_cost
+-------------------------
+..  autoclass:: paddle.v2.layer.huber_classification_cost
     :noindex:
 
 lambda_cost
diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/getstarted/build_and_install/index_cn.rst
index a24df6c518fad84a48061ecb34ee46cb312a4995..dd9923697ab85825557aa89a08870bece7c76673 100644
--- a/doc/getstarted/build_and_install/index_cn.rst
+++ b/doc/getstarted/build_and_install/index_cn.rst
@@ -6,14 +6,12 @@
 安装流程
 ++++++++
 
-PaddlePaddle提供数个预编译的二进制来进行安装，包括Docker镜像，ubuntu的deb安装包等。我们推荐使用Docker镜像来部署环境，同时欢迎贡献更多的安装包。
+PaddlePaddle提供Docker镜像来部署环境。
 
 .. toctree::
    :maxdepth: 1
    
    docker_install_cn.rst 
-   ubuntu_install_cn.rst
-
 
 
 编译流程
diff --git a/doc/getstarted/build_and_install/index_en.rst b/doc/getstarted/build_and_install/index_en.rst
index 1bfd4f75c0b9b82d61d28a30f03181f7be159f24..8a53588e0439df8f4d5fd529b7a20262c67d4e58 100644
--- a/doc/getstarted/build_and_install/index_en.rst
+++ b/doc/getstarted/build_and_install/index_en.rst
@@ -8,14 +8,13 @@ Install PaddlePaddle
     :maxdepth: 1
 
     docker_install_en.rst
-    ubuntu_install_en.rst
 
 Build from Source
 -----------------
 
 ..  warning::
 
-    Please use :code:`deb` package or :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code.
+    Please use :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code.
 
 ..  toctree::
     :maxdepth: 1
diff --git a/doc/getstarted/build_and_install/ubuntu_install_cn.rst b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
deleted file mode 100644
index 9e39ccb00f5d5655c30148900a3d76a22aacfc01..0000000000000000000000000000000000000000
--- a/doc/getstarted/build_and_install/ubuntu_install_cn.rst
+++ /dev/null
@@ -1,71 +0,0 @@
-Ubuntu部署PaddlePaddle
-===================================
-
-PaddlePaddle提供了ubuntu 14.04 deb安装包。
-
-安装
-------
-
-安装包的下载地址是\: https://github.com/PaddlePaddle/Paddle/releases
-
-它包含四个版本\:
-
-* cpu版本: 支持主流x86处理器平台, 使用了avx指令集。
-
-* cpu-noavx版本：支持主流x86处理器平台，没有使用avx指令集。
-
-* gpu版本：支持主流x86处理器平台，支持nvidia cuda平台，使用了avx指令集。
-
-* gpu-noavx版本：支持主流x86处理器平台，支持nvidia cuda平台，没有使用avx指令集。
-
-下载完相关安装包后，执行:
-
-..  code-block:: shell
-
-    sudo apt-get install gdebi
-    gdebi paddle-*-cpu.deb
-
-或者:
-
-..  code-block:: shell
-
-    dpkg -i paddle-*-cpu.deb
-    apt-get install -f
-
-
-在 :code:`dpkg -i` 的时候如果报一些依赖未找到的错误是正常的，
-在 :code:`apt-get install -f` 里会继续安装 PaddlePaddle。
-
-安装完成后，可以使用命令 :code:`paddle version` 查看安装后的paddle 版本:
-
-..  code-block:: shell
-
-    PaddlePaddle 0.8.0b1, compiled with
-        with_avx: ON
-        with_gpu: OFF
-        with_double: OFF
-        with_python: ON
-        with_rdma: OFF
-        with_timer: OFF
-        with_predict_sdk:
-
-
-可能遇到的问题
---------------
-
-libcudart.so/libcudnn.so找不到
-++++++++++++++++++++++++++++++
-
-安装完成后，运行 :code:`paddle train` 报错\:
-
-..  code-block:: shell
-
-      0831 12:36:04.151525  1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH.
-
-原因是未设置cuda运行时环境变量。 如果使用GPU版本的PaddlePaddle，请安装CUDA 7.5 和CUDNN 5到本地环境中，并设置：
-
-..  code-block:: shell
-
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib:$LD_LIBRARY_PATH
-    export PATH=/usr/local/cuda/bin:$PATH
-
diff --git a/doc/getstarted/build_and_install/ubuntu_install_en.rst b/doc/getstarted/build_and_install/ubuntu_install_en.rst
deleted file mode 100644
index ea8042085bf458be96e71017d229d88ad867695b..0000000000000000000000000000000000000000
--- a/doc/getstarted/build_and_install/ubuntu_install_en.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-Debian Package installation guide
-=================================
-
-PaddlePaddle supports :code:`deb` pacakge. The installation of this :code:`deb` package is tested in ubuntu 14.04, but it should be support other debian based linux, too.
-
-There are four versions of debian package, :code:`cpu`, :code:`gpu`, :code:`cpu-noavx`, :code:`gpu-noavx`. And :code:`noavx` version is used to support CPU which does not contain :code:`AVX` instructions. The download url of :code:`deb` package is \: https://github.com/baidu/Paddle/releases/
-
-
-After downloading PaddlePaddle deb packages, you can use :code:`gdebi` install.
-
-..	code-block:: bash
-
-	gdebi paddle-*.deb
-
-If :code:`gdebi` is not installed, you can use :code:`sudo apt-get install gdebi` to install it.
-
-Or you can use following commands to install PaddlePaddle.
-
-..	code-block:: bash
-
-	dpkg -i paddle-*.deb
-	apt-get install -f
-
-And if you use GPU version deb package, you need to install CUDA toolkit and cuDNN, and set related environment variables(such as LD_LIBRARY_PATH) first. It is normal when `dpkg -i` get errors. `apt-get install -f` will continue install paddle, and install dependences. 
-
diff --git a/doc/howto/dev/build_cn.md b/doc/howto/dev/build_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..0b911f7b7509da4a147c65954acb7e7c38f489da
--- /dev/null
+++ b/doc/howto/dev/build_cn.md
@@ -0,0 +1,124 @@
+# 编译PaddlePaddle和运行单元测试
+
+## 需要的软硬件
+
+为了开发PaddlePaddle，我们需要
+
+1. 一台电脑，可以装的是 Linux, BSD, Windows 或者 MacOS 操作系统，以及
+1. Docker。
+
+不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要，因为我们会把所有编译工具都安装进一个 Docker image 里。
+
+## 总体流程
+
+1. 获取源码
+
+   ```bash
+   git clone https://github.com/paddlepaddle/paddle
+   ```
+
+2. 安装开发工具到 Docker image 里
+
+   ```bash
+   cd paddle; docker build -t paddle:dev .
+   ```
+
+   请注意这个命令结尾处的 `.`；它表示 `docker build` 应该读取当前目录下的 [`Dockerfile`文件](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile)，按照其内容创建一个名为 `paddle:dev` 的 Docker image，并且把各种开发工具安装进去。
+
+3. 编译
+
+   以下命令启动一个 Docker container 来执行 `paddle:dev` 这个 Docker image，同时把当前目录（源码树根目录）映射为 container 里的 `/paddle` 目录，并且运行 `Dockerfile` 描述的默认入口程序 [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `cmake` 和 `make` 来编译 `/paddle` 里的源码，结果输出到 `/paddle/build`，也就是本地的源码树根目录里的 `build` 子目录。
+
+   ```bash
+   docker run --rm -v $PWD:/paddle paddle:dev
+   ```
+
+   上述命令编译出一个 CUDA-enabled 版本。如果我们只需要编译一个只支持 CPU 的版本，可以用
+
+   ```bash
+   docker run --rm -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
+   ```
+
+4. 运行单元测试
+
+   用本机的第一个 GPU 来运行包括 GPU 单元测试在内的所有单元测试：
+
+   ```bash
+   NV_GPU=0 nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+
+   如果编译的时候我们用了 `WITH_GPU=OFF` 选项，那么编译过程只会产生 CPU-based 单元测试，那么我们也就不需要 nvidia-docker 来运行单元测试了。我们只需要：
+
+   ```bash
+   docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+
+   有时候我们只想运行一个特定的单元测试，比如 `memory_test`，我们可以
+
+   ```bash
+   nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
+   ```
+
+5. 清理
+
+   有时候我们会希望清理掉已经下载的第三方依赖以及已经编译的二进制文件。此时只需要：
+
+   ```bash
+   rm -rf build
+   ```
+
+## 为什么要 Docker 呀？
+
+- 什么是 Docker?
+
+  如果您没有听说 Docker，可以把它想象为一个类似 virtualenv 的系统，但是虚拟的不仅仅是 Python 的运行环境。
+
+- Docker 还是虚拟机？
+
+  有人用虚拟机来类比 Docker。需要强调的是：Docker 不会虚拟任何硬件，Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的，性能和把编译工具安装在本机运行一样。
+
+- 为什么用 Docker?
+
+  把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题，其他人可以复现问题以便帮助。
+
+  另外，对于习惯使用Windows和MacOS的开发者来说，使用Docker就不用配置交叉编译环境了。
+
+- 我可以选择不用Docker吗？
+
+  当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式，把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程，是因为这个流程比其他方法都更简便。
+
+- 学习 Docker 有多难？
+
+  理解 Docker 并不难，大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
+
+- 我可以用 IDE 吗？
+
+  当然可以，因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码，我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
+
+  很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
+
+  ```emacs
+  (global-set-key "\C-cc" 'compile)
+  (setq compile-command
+   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+  ```
+
+  就可以按 `Ctrl-C` 和 `c` 键来启动编译了。
+
+- 可以并行编译吗？
+
+  是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
+
+## 可能碰到的问题
+
+- Docker 需要 sudo
+
+  如果用自己的电脑开发，自然也就有管理员权限（sudo）了。如果用公用的电脑开发，需要请管理员安装和配置好 Docker。此外，PaddlePaddle 项目正在努力支持其他不需要 sudo 的容器技术，比如 rkt。
+
+- 在 Windows/MacOS 上编译很慢
+
+  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)。
+
+- 磁盘不够
+
+  本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。
diff --git a/doc/howto/dev/build_en.md b/doc/howto/dev/build_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..d0048e3714a5861a503736879d6c0870e5906c95
--- /dev/null
+++ b/doc/howto/dev/build_en.md
@@ -0,0 +1,124 @@
+# Build PaddlePaddle from Source Code and Run Unit Test
+
+## What Developers Need
+
+To contribute to PaddlePaddle, you need
+
+1. A computer running Linux, BSD, Windows, or MacOS, and
+1. Docker.
+
+Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image.  We run all the tools by running this image.
+
+## General Process
+
+1. Retrieve source code.
+
+   ```bash
+   git clone https://github.com/paddlepaddle/paddle
+   ```
+
+2. Install build tools into a Docker image.
+
+   ```bash
+   cd paddle; docker build -t paddle:dev .
+   ```
+
+   Please be aware of the `.` at the end of the command, which refers to the current directory containing the [`./Dockerfile` file](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile).  `docker build` follows instructions in this file to create a Docker image named `paddle:dev`, and installs building tools into it.
+
+3. Build from source.
+
+   The following command starts a Docker container that executes the Docker image `paddle:dev`, mapping the current directory to `/paddle/` in the container, and runs the default entry-point [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh) as specified in the Dockerfile.  `build.sh` invokes `cmake` and `make` to build PaddlePaddle source code, which had been mapped to `/paddle`, and writes outputs to `/paddle/build`, which maps to `build` in the current source directory on the computer.
+
+   ```bash
+   docker run --rm -v $PWD:/paddle paddle:dev
+   ```
+
+   The above command builds a CUDA-enabled version.  If we want to build a CPU-only version, we can type
+
+   ```bash
+   docker run --rm -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
+   ```
+
+4. Run unit tests.
+
+   To run all unit tests using the first GPU of a node:
+
+   ```bash
+   NV_GPU=0 nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+
+   If we used `WITH_GPU=OFF` at build time, it generates only CPU-based unit tests, and we don't need nvidia-docker to run them.  We can just run
+
+   ```bash
+   docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+
+   Sometimes we want to run a specific unit test, say `memory_test`, we can run
+
+   ```bash
+   nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
+   ```
+
+5. Clean Build.
+
+   Sometimes, we might want to clean all third-party dependencies and built binaries.  To do so, just
+
+   ```bash
+   rm -rf build
+   ```
+
+## Docker, Or Not?
+
+- What is Docker?
+
+  If you haven't heard of it, consider it something like Python's virtualenv.
+
+- Docker or virtual machine?
+
+  Some people compare Docker with VMs, but Docker doesn't virtualize any hardware or run a guest OS, which means there is no compromise on performance.
+
+- Why Docker?
+
+  Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help.
+
+  Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want.
+
+- Can I choose not to use Docker?
+
+  Sure, you don't have to install build tools into a Docker image; instead, you can install them in your local computer.  This document exists because Docker would make the development way easier.
+
+- How difficult is it to learn Docker?
+
+  It takes you ten minutes to read [an introductory article](https://docs.docker.com/get-started) and saves you more than an hour of installing and configuring all the required build tools, especially when new versions of PaddlePaddle require some new tools.  Not to mention the time saved when other people try to reproduce the issue you have.
+
+- Can I use my favorite IDE?
+
+  Yes, of course.  The source code resides on your local computer, and you can edit it using whatever editor you like.
+
+  Many PaddlePaddle developers are using Emacs.  They add the following few lines into their `~/.emacs` configure file:
+
+  ```emacs
+  (global-set-key "\C-cc" 'compile)
+  (setq compile-command
+   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+  ```
+
+  so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
+
+- Does Docker do parallel building?
+
+  Our building Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to start as many processes as the number of your CPU cores.
+
+## Some Gotchas
+
+- Docker requires sudo
+
+  An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly.  If you use a shared computer for development, please ask the administrator to install and configure Docker.  We will do our best to support rkt, another container technology that doesn't require sudo.
+
+- Docker on Windows/MacOS builds slowly
+
+  On Windows and MacOS, Docker containers run in a Linux VM.  You might want to give this VM some more memory and CPUs to make the build efficient.  Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details.
+
+- Not enough disk space
+
+  The examples in this article use the `--rm` option with the `docker run` command.  This option ensures that stopped containers are not left on the hard disk.  We can use `docker ps -a` to list all containers, including stopped ones.  Sometimes `docker build` generates some intermediate dangling images, which also take disk space.  To clean them, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/).
diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..7f8da2da5a0d42ff065265c5d173d0e6167dc08a
--- /dev/null
+++ b/doc/howto/dev/new_op_cn.md
@@ -0,0 +1,324 @@
+# 如何写新的Operator
+
+ - [概念简介](#概念简介)
+ - [实现C++类](#实现C++类)
+   - [定义ProtoMaker类](#定义ProtoMaker类)
+   - [定义Operator类](#定义Operator类)
+   - [定义OpKernel类](#定义OpKernel类)
+   - [注册Operator](#注册Operator)
+   - [编译](#编译)
+ - [绑定Python](#绑定Python)
+ - [实现单元测试](#实现单元测试)
+   - [前向Operator单测](#前向Operator单测)
+   - [反向Operator单测](#反向Operator单测)
+   - [编译和执行](#编译和执行)
+
+
+## 概念简介
+
+简单介绍需要用到基类，详细介绍请参考设计文档。
+
+- `framework::OperatorBase`: Operator(简写，Op)基类。
+- `framework::OpKernel`: Op计算函数的基类，称作Kernel。
+- `framework::OperatorWithKernel`：继承自OperatorBase，Op有计算函数，称作有Kernel。
+- `class OpProtoAndCheckerMaker`：描述该Op的输入、输出、属性、注释,主要用于Python API接口生成
+
+依据是否包含kernel，将Op分为两种：包含Kernel的Op和不包含kernel的Op，前者Op的定义继承自`OperatorWithKernel`，后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写，简单总结Op需要包含的内容如下：
+
+  
+ 内容            | 定义位置         
+--------------  | :----------------------  
+OpProtoMaker定义  | `.cc`文件，Backward Op不需要定义OpProtoMaker
+Op定义           | `.cc`文件 
+Kernel实现       | CPU、GPU共享Kernel在`.h`文件，否则，CPU可以在`.cc`文件，GPU可在`.cu`文件。 
+注册Op           | Op注册在`.cc`文件；Kernel注册CPU在`.cc`文件，GPU在`.cu`文件
+     
+     
+下面以矩阵乘操作，即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
+
+
+## 实现C++类
+
+
+### 1. 定义ProtoMaker类
+
+矩阵乘的公式：$Out = X * Y$, 可见该计算由两个输入，一个输出组成。首先定义`ProtoMaker`来描述该Op的输入、输出及注释：
+    
+```
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of mul op");
+    AddInput("Y", "The second input of mul op");
+    AddOutput("Out", "The output of mul op");
+    AddComment(R"DOC(
+Two Element Mul Operator.
+The equation is: Out = X * Y
+)DOC");
+  }
+};
+```
+   
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`，构造函数包括2个参数：
+
+   - `framework::OpProto` ： 前者存储Op的输入输出和参数属性，将用于Python API接口的生成。
+   - `framework::OpAttrChecker` ：后者用于检查参数属性的合法性。
+   
+构造函数里通过`AddInput`添加输入参数，通过`AddOutput`添加输出参数，通过`AddComment`添加该Op的注释，这些函数会将对应内容添加到`OpProto`中。
+
+在`MulOp`中添加两个输入`X`和`Y`，添加了一个输出`Out`，并解释了各自含义，该命名尽可能的规范。
+
+   
+再举个[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)的例子：
+   
+```
+template <typename AttrType>
+class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input tensor of scale operator.").NotInGradient();
+    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
+    AddComment(R"DOC(Scale operator
+The equation is: Out = scale*X
+)DOC");
+    AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
+  }
+};
+```
+ 
+ 在这个例子里，两处不同：
+ 
+  - `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中。
+  - `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。
+   
+
+### 2. 定义Operator类
+
+
+```c++
+class MulOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto dim0 = ctx.Input<Tensor>("X")->dims();
+    auto dim1 = ctx.Input<Tensor>("Y")->dims();
+    PADDLE_ENFORCE_EQ(dim0.size(), 2,
+                      "input X(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("X"));
+    PADDLE_ENFORCE_EQ(dim1.size(), 2,
+                      "input Y(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("Y"));
+    PADDLE_ENFORCE_EQ(
+        dim0[1], dim1[0],
+        "First matrix's width must be equal with second matrix's height.");
+    ctx.Output<Tensor>("Out")->Resize({dim0[0], dim1[1]});
+  }
+};
+```
+
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员：
+	 
+```c++
+using framework::OperatorWithKernel::OperatorWithKernel;
+```
+
+这句表示使用基类`OperatorWithKernel`的构造函数，也可写成：
+   
+```c++
+MulOp(const std::string &type, const framework::VariableNameMap &inputs,
+      const framework::VariableNameMap &outputs,
+      const framework::AttributeMap &attrs)
+  : OperatorWithKernel(type, inputs, outputs, attrs) {}
+```	
+	
+还需要重写`InferShape`接口。`InferShape`为const函数，不能修改Op的成员变量，参数为`const framework::InferShapeContext &ctx`，通过该参数可获取到输入输出以及属性。它的功能是：
+
+  - 1). 做检查， 尽早报错：检查输入数据维度、类型等是否合法。
+  - 2). 设置输出Tensor的形状。
+
+通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中，和要讲到的注册函数一起放在`.cc`中
+
+### 3. 定义OpKernel类
+
+```C++
+template <typename Place, typename T>
+class MulKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<Tensor>("X");
+    auto* Y = context.Input<Tensor>("Y");
+    auto* Z = context.Output<Tensor>("Out");
+    Z->mutable_data<T>(context.GetPlace());
+    auto* device_context =
+        const_cast<platform::DeviceContext*>(context.device_context_);
+    math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
+  }
+};
+```
+
+`MulKernel`继承自`framework::OpKernel`，带有模板参数:
+
+  - `typename  Place`: 表示设备类型，不同设备(CPU、GPU)共享同一个Kernel时，需加该模板参数，不共享则不加，一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
+  
+ - `typename T` : 表示数据类型，如`float`, `double`等。
+   
+`MulKernel`需要重写`Compute`接口，该接口参数为`const framework::ExecutionContext& context`, `ExecutionContext`相比`InferShapeContext`增加了设备类型，同样可获取到输入输出和属性参数，`Compute`函数里写具体实现。
+   
+注意，不同设备(CPU、GPU)共享一个Op定义，是否共享同一个`OpKernel`，取决于`Compute`调用的函数是否支持不同设备。`MulOp`的CPU、GPU实现共享同一个`Kernel`，`OpKernel`不共享的例子可以参考[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
+   
+到此前向Op实现完成，需要在`.cc`文件中注册该op和kernel。反向Op类的定义和Kernel定义与前向Op类似，这里不再重复。但注意，反向Op没有`ProtoMaker`。
+   
+### 4. 注册Operator
+
+在`.cc`文件中注册前向、反向Op类，注册CPU Kernel。
+
+```c++
+namespace ops = paddle::operators;
+REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mul_grad,
+              ops::MulGradKernel<paddle::platform::CPUPlace, float>);
+```
+    
+  - `REGISTER_OP` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，同时注册`ops::MulOpGrad`，类型名为`mul_grad`。
+  - `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op。
+  - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulGradKernel`类。
+
+在 `.cu`文件中注册GPU Kernel。
+   
+```c++
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(mul_grad,
+                       ops::MulGradKernel<paddle::platform::GPUPlace, float>);
+```
+
+### 5. 编译
+
+在[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt)文件中添加编译。
+   
+```
+op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function)
+```
+   
+下面命令可以编译：
+   
+```
+make mul_op
+```
+
+## 绑定Python
+
+- 绑定Python 
+ 
+    在 [`paddle/pybind/pybind.cc 
+`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc)文件中添加该类：
+
+    ```
+    USE_OP(mul);
+    ```
+    如果只实现了CPU版本，则使用`USE_CPU_ONLY_OP`:
+    
+    ```
+    USE_CPU_ONLY_OP(gather);
+    ```
+    
+    使用`USE_OP`告知编译器需要链接该Op的目标文件，具体解释参考[代码注释](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h#L81)。
+    
+    
+ - 生成库
+
+   在 [`paddle/pybind/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/CMakeLists.txt)文件添加类到`DEPS`中，使得该Op可以链接到生成的lib库中。
+   
+   ```
+   if(WITH_PYTHON)
+     cc_library(paddle_pybind SHARED
+     SRCS pybind.cc
+     DEPS pybind python backward
+     mul_op
+     minus_op)
+   endif(WITH_PYTHON)
+   ```
+
+## 实现单元测试
+
+单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍[`MulOp`的单测](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。
+
+### 前向Operator单测
+
+前向Op单测继承自`unittest.TestCase`，并定义元类`__metaclass__ = OpTestMeta`，具体单测流程在`OpTestMeta`里完成。需在`setUp`函数定义输入输出和属性参数，以及Python对比的输出值。
+
+```
+import unittest
+import numpy as np
+from gradient_checker import GradientChecker, create_op
+from op_test_util import OpTestMeta
+
+class TestMulOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "mul"
+        self.inputs = {
+            'X': np.random.random((32, 84)).astype("float32"),
+            'Y': np.random.random((84, 100)).astype("float32")
+        }
+        self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+```
+   首先需要`import`必要的包,下面详细解释其他值：
+   
+   - `self.type = "mul" ` : 定义类型，和注册的类型一致。
+   - `self.inputs` : 定义输入，类型为Numpy.array，并初始化。
+   - `self.outputs` : 定义输出，并得到Python计算结果。
+
+ 
+### 反向Operator单测
+
+反向Op单测继承自`GradientChecker`，而`GradientChecker`继承自`unittest.TestCase`，所以反向单测函数需要`test_`开头。
+
+ ```
+ class MulGradOpTest(GradientChecker):
+    def test_mul(self):
+        op = create_op("mul")
+        inputs = {
+            'X': np.random.random((32, 84)).astype("float32"),
+            'Y': np.random.random((84, 100)).astype("float32")
+        }
+        self.compare_grad(op, inputs)      
+        # mul op will enlarge the relative error
+        self.check_grad(
+            op, inputs, set(["X", "Y"]), "Out", max_relative_error=0.5)
+ ```
+
+   - 调用`create_op("mul")`创建反向Op对应的前向Op。
+   - 定义输入`inputs`。
+   - 调用`compare_grad`函数对比CPU、GPU计算结果。
+   - 调用`check_grad`检查梯度稳定性，这里采用数值法检测梯度正确性。
+      - 第一个参数`op` : 前向op。
+      - 第二个参数`inputs` : 输入词典，词典的Key和`ProtoMaker`定义保持一致。
+      - 第三个参数`set(["X", "Y"])` : 指定对输入变量`X`、`Y`做梯度检测。
+      - 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`
+
+
+### 编译和执行 
+
+单测完成之后，在[`python/paddle/v2/framework/tests/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/CMakeLists.txt)里添加编译：
+
+```
+py_test(test_mul_op SRCS test_mul_op.py)
+```
+
+编译时需要打开`WITH_TESTING`, 即 `cmake paddle_dir -DWITH_TESTING=ON`，编译成功之后执行单测命令为：
+
+```
+make test ARGS="-R test_mul_op -V"
+```
+或者:
+
+```
+ctest -R test_mul_op
+```
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index 26449a6365843b526b3ac3111b337d2f17524c9d..0608aa30968b0e8474eef330e4d2cc63c9def97d 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -19,6 +19,7 @@
 ..  toctree::
   :maxdepth: 1
 
+  dev/build_cn.md
   dev/write_docs_cn.rst
   dev/contribute_to_paddle_cn.md
 
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index 1fbfcd260b912078f00ed5b720ed607db725c4e2..1b6034be4edffd2cbc822018b733b9a3836ea84a 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -18,6 +18,7 @@ Development
 ..  toctree::
   :maxdepth: 1
 
+  dev/build_en.rst
   dev/new_layer_en.rst
   dev/contribute_to_paddle_en.md
 
diff --git a/doc/index_en.rst b/doc/index_en.rst
index 168c7667c61da677905585d6c4b5037ce80b3765..64684b8b9b27e245c6b32ea28809d3bbce22fab9 100644
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -7,4 +7,3 @@ PaddlePaddle Documentation
   getstarted/index_en.rst
   howto/index_en.rst
   api/index_en.rst
-  about/index_en.rst
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index 9f84db72da24b0e678520b077f9cba7ffc2d589a..6b56d9ec8d3daae96aaaa04ed79cb637331e2281 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -173,6 +173,96 @@ extern void hl_avgpool_backward(const int frameCnt,
                                 real* backGrad,
                                 const int outStride);
 
+extern void hl_maxpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 real* maxPoolIdxData,
+                                 const int tgtStride);
+
+extern void hl_maxpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* targetGrad,
+                                  real* maxPoolIdxData,
+                                  const int outStride);
+
+extern void hl_avgpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride);
+
+extern void hl_avgpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* backGrad,
+                                  const int outStride);
+
 /**
  * @brief   Bilinear interpolation forward.
  *
@@ -275,4 +365,4 @@ extern void hl_maxout_backward(real* inGrad,
                                size_t featLen,
                                size_t groups);
 
-#endif /* HL_CNN_H_ */
+#endif  // HL_CNN_H_
diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h
index eb454c59c1e58cf2b4817b4cb3230b9d75e320ac..c7f25109972195fb56b9e96c4b68d952363e6338 100644
--- a/paddle/cuda/include/hl_matrix.h
+++ b/paddle/cuda/include/hl_matrix.h
@@ -224,4 +224,80 @@ extern void hl_matrix_collect_shared_bias(real* B_d,
 extern void hl_matrix_rotate(
     real* mat, real* matRot, int dimM, int dimN, bool clockWise);
 
+/**
+ * @brief  Matrix vol2Col: Convert 3D volume into col matrix
+ *
+ * @param[in]   dataSrc    input matrix.
+ * @param[in]   channels   channels of dataSrc.
+ * @param[in]   depth      depth of dataSrc.
+ * @param[in]   height     height of dataSrc.
+ * @param[in]   width      width of dataSrc.
+ * @param[in]   filterD    depth of filter.
+ * @param[in]   filterH    height of filter.
+ * @param[in]   filterW    width of filter.
+ * @param[in]   strideD    stride in the depth.
+ * @param[in]   strideH    stride in the height.
+ * @param[in]   strideW    stride in the width.
+ * @param[in]   paddingD   padding in the depth.
+ * @param[in]   paddingH   padding in the height.
+ * @param[in]   paddingW   padding in the width.
+ * @param[out]  dataDst    output matrix.
+ *
+ */
+extern void hl_matrix_vol2Col(const real* dataSrc,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              real* dataDst);
+
+/**
+ * @brief  Matrix col2Vol: Convert col matrix into 3D volume
+ *
+ * @param[out]  dataDst    output matrix.
+ * @param[in]   channels   channels of dataDst.
+ * @param[in]   depth      depth of dataDst.
+ * @param[in]   height     height of dataDst.
+ * @param[in]   width      width of dataDst.
+ * @param[in]   filterD    depth of filter.
+ * @param[in]   filterH    height of filter.
+ * @param[in]   filterW    width of filter.
+ * @param[in]   strideD    stride in the depth.
+ * @param[in]   strideH    stride in the height.
+ * @param[in]   strideW    stride in the width.
+ * @param[in]   paddingD   padding in the depth.
+ * @param[in]   paddingH   padding in the height.
+ * @param[in]   paddingW   padding in the width.
+ * @param[in]   dataSrc    input matrix.
+ * @param[in]   alpha      scalar input.
+ * @param[in]   beta       scalar input.
+ *
+ */
+extern void hl_matrix_col2Vol(real* dataDst,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              const real* dataSrc,
+                              real alpha,
+                              real beta);
+
 #endif /* HL_MATRIX_H_ */
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index 2bbb9fa8dfd5eeac9d55aa67a28ebfbffa2acd46..a76dbf0b6578de0606702ad1af227fbf6e1cd62e 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -87,6 +87,96 @@ inline void hl_avgpool_backward(const int frameCnt,
                                 real* backGrad,
                                 const int outStride) {}
 
+inline void hl_maxpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 real* maxPoolIdxData,
+                                 const int tgtStride) {}
+
+inline void hl_maxpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* targetGrad,
+                                  real* maxPoolIdxData,
+                                  const int outStride) {}
+
+inline void hl_avgpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride) {}
+
+inline void hl_avgpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* backGrad,
+                                  const int outStride) {}
+
 inline void hl_bilinear_forward(const real* inData,
                                 const size_t inImgH,
                                 const size_t inImgW,
diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h
index 127cb7e27983e8ff2c1ff6ef5108b5f8c5bd6ca5..6ac332945c8f09fef23f35680ba5bb1d9ba9f4fd 100644
--- a/paddle/cuda/include/stub/hl_matrix_stub.h
+++ b/paddle/cuda/include/stub/hl_matrix_stub.h
@@ -99,4 +99,38 @@ inline void hl_matrix_collect_shared_bias(real* B_d,
 inline void hl_matrix_rotate(
     real* mat, real* matRot, int dimM, int dimN, bool clockWise) {}
 
+inline void hl_matrix_vol2Col(const real* dataSrc,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              real* dataDst) {}
+
+inline void hl_matrix_col2Vol(real* dataDst,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              const real* dataSrc,
+                              real alpha,
+                              real beta) {}
+
 #endif  // HL_MATRIX_STUB_H_
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
index aac19b1ea566ad69f1f7374e393676c8debd9883..9ba3d142617537c0160f6dccb86ddca43ada15a5 100644
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -353,6 +353,433 @@ void hl_avgpool_backward(const int frameCnt,
   CHECK_SYNC("hl_avgpool_backward failed");
 }
 
+__global__ void KeMaxPool3DForward(const int nthreads,
+                                   const real* inputData,
+                                   const int channels,
+                                   const int depth,
+                                   const int height,
+                                   const int width,
+                                   const int pooledD,
+                                   const int pooledH,
+                                   const int pooledW,
+                                   const int ksizeD,
+                                   const int ksizeH,
+                                   const int ksizeW,
+                                   const int strideD,
+                                   const int strideH,
+                                   const int strideW,
+                                   const int padD,
+                                   const int padH,
+                                   const int padW,
+                                   real* tgtData,
+                                   real* maxPoolIdxData,
+                                   const int tgtStride) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
+       index += blockDim.x * gridDim.x) {
+    int pw = index % pooledW;
+    int ph = (index / pooledW) % pooledH;
+    int pd = (index / pooledW / pooledH) % pooledD;
+    int c = (index / pooledW / pooledH / pooledD) % channels;
+    int frameNum = index / pooledW / pooledH / pooledD / channels;
+    // Pooling window [start, end) clipped to the input volume.
+    int dend = min(pd * strideD - padD + ksizeD, depth);
+    int hend = min(ph * strideH - padH + ksizeH, height);
+    int wend = min(pw * strideW - padW + ksizeW, width);
+    int dstart = max(pd * strideD - padD, 0);
+    int hstart = max(ph * strideH - padH, 0);
+    int wstart = max(pw * strideW - padW, 0);
+    real maxval = -FLT_MAX;
+    int maxIdx = -1;
+    // Use a per-iteration offset instead of advancing inputData, so a thread
+    // that runs this loop more than once does not accumulate offsets.
+    const int base = (frameNum * channels + c) * depth * height * width;
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          if (maxval < inputData[base + (d * height + h) * width + w]) {
+            maxval = inputData[base + (d * height + h) * width + w];
+            maxIdx = (d * height + h) * width + w;
+          }
+        }
+      }
+    }
+    int tgtIndex =
+        index % (pooledW * pooledH * pooledD * channels) + frameNum * tgtStride;
+    tgtData[tgtIndex] = maxval;
+    maxPoolIdxData[tgtIndex] = maxIdx;
+  }
+}
+
+void hl_maxpool3D_forward(const int frameCnt,
+                          const real* inputData,
+                          const int channels,
+                          const int depth,
+                          const int height,
+                          const int width,
+                          const int pooledD,
+                          const int pooledH,
+                          const int pooledW,
+                          const int sizeZ,
+                          const int sizeY,
+                          const int sizeX,
+                          const int strideD,
+                          const int strideH,
+                          const int strideW,
+                          const int padD,
+                          const int padH,
+                          const int padW,
+                          real* tgtData,
+                          real* maxPoolIdxData,
+                          const int tgtStride) {
+  int num_kernels = pooledD * pooledH * pooledW * channels * frameCnt;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+  dim3 threads(1024, 1);
+  dim3 grid(blocks, 1);
+
+  KeMaxPool3DForward<<<grid, threads, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                           inputData,
+                                                           channels,
+                                                           depth,
+                                                           height,
+                                                           width,
+                                                           pooledD,
+                                                           pooledH,
+                                                           pooledW,
+                                                           sizeZ,
+                                                           sizeY,
+                                                           sizeX,
+                                                           strideD,
+                                                           strideH,
+                                                           strideW,
+                                                           padD,
+                                                           padH,
+                                                           padW,
+                                                           tgtData,
+                                                           maxPoolIdxData,
+                                                           tgtStride);
+  CHECK_SYNC("hl_maxpool3D_forward failed");
+}
+
+__global__ void KeMaxPool3DBackward(const int nthreads,
+                                    const real* outGrad,
+                                    const int channels,
+                                    const int depth,
+                                    const int height,
+                                    const int width,
+                                    const int pooledD,
+                                    const int pooledH,
+                                    const int pooledW,
+                                    const int sizeZ,
+                                    const int sizeY,
+                                    const int sizeX,
+                                    const int strideD,
+                                    const int strideH,
+                                    const int strideW,
+                                    const int padD,
+                                    const int padH,
+                                    const int padW,
+                                    real scaleA,
+                                    real scaleB,
+                                    real* targetGrad,
+                                    real* maxPoolIdxData,
+                                    const int outStride) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
+       index += blockDim.x * gridDim.x) {
+    int offsetW = index % width;
+    int offsetH = (index / width) % height;
+    int offsetD = (index / width / height) % depth;
+    int offsetC = (index / width / height / depth) % channels;
+    int frameNum = index / width / height / depth / channels;
+    // Pooled-output range [p*start, p*end) to scan for this input element.
+    int pdstart =
+        (offsetD + padD < sizeZ) ? 0 : (offsetD + padD - sizeZ) / strideD + 1;
+    int phstart =
+        (offsetH + padH < sizeY) ? 0 : (offsetH + padH - sizeY) / strideH + 1;
+    int pwstart =
+        (offsetW + padW < sizeX) ? 0 : (offsetW + padW - sizeX) / strideW + 1;
+    int pdend = min((offsetD + padD) / strideD + 1, pooledD);
+    int phend = min((offsetH + padH) / strideH + 1, pooledH);
+    int pwend = min((offsetW + padW) / strideW + 1, pooledW);
+    const int inIdx = (offsetD * height + offsetH) * width + offsetW;
+    real gradient = 0;
+    // Local dense offset (arguments are not advanced); outStride is unused.
+    const int base =
+        (frameNum * channels + offsetC) * pooledD * pooledH * pooledW;
+    for (int pd = pdstart; pd < pdend; ++pd) {
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          // Only the output whose recorded max index is this element counts.
+          const int p = base + (pd * pooledH + ph) * pooledW + pw;
+          if (inIdx == maxPoolIdxData[p]) gradient += outGrad[p];
+        }
+      }
+    }
+    targetGrad[index] = scaleA * gradient + scaleB * targetGrad[index];
+  }
+}
+
+void hl_maxpool3D_backward(const int frameCnt,
+                           const real* outGrad,
+                           const int channels,
+                           const int depth,
+                           const int height,
+                           const int width,
+                           const int outputD,
+                           const int outputH,
+                           const int outputW,
+                           const int sizeZ,
+                           const int sizeY,
+                           const int sizeX,
+                           const int strideD,
+                           const int strideH,
+                           const int strideW,
+                           const int paddingD,
+                           const int paddingH,
+                           const int paddingW,
+                           real scaleA,
+                           real scaleB,
+                           real* targetGrad,
+                           real* maxPoolIdxData,
+                           const int outStride) {
+  int num_kernels = depth * height * width * channels * frameCnt;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+  // One thread per input-gradient element, 1024 threads per block.
+  KeMaxPool3DBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                           outGrad,
+                                                           channels,
+                                                           depth,
+                                                           height,
+                                                           width,
+                                                           outputD,
+                                                           outputH,
+                                                           outputW,
+                                                           sizeZ,
+                                                           sizeY,
+                                                           sizeX,
+                                                           strideD,
+                                                           strideH,
+                                                           strideW,
+                                                           paddingD,
+                                                           paddingH,
+                                                           paddingW,
+                                                           scaleA,
+                                                           scaleB,
+                                                           targetGrad,
+                                                           maxPoolIdxData,
+                                                           outStride);
+  CHECK_SYNC("hl_maxpool3D_backward failed");
+}
+
+__global__ void KeAvgPool3DForward(const int nthreads,
+                                   const real* inputData,
+                                   const int channels,
+                                   const int depth,
+                                   const int height,
+                                   const int width,
+                                   const int pooledD,
+                                   const int pooledH,
+                                   const int pooledW,
+                                   const int sizeZ,
+                                   const int sizeY,
+                                   const int sizeX,
+                                   const int strideD,
+                                   const int strideH,
+                                   const int strideW,
+                                   const int padD,
+                                   const int padH,
+                                   const int padW,
+                                   real* tgtData,
+                                   const int tgtStride) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
+       index += blockDim.x * gridDim.x) {
+    int pw = index % pooledW;
+    int ph = (index / pooledW) % pooledH;
+    int pd = (index / pooledW / pooledH) % pooledD;
+    int c = (index / pooledW / pooledH / pooledD) % channels;
+    int frameNum = index / pooledW / pooledH / pooledD / channels;
+    int dstart = pd * strideD - padD;
+    int hstart = ph * strideH - padH;
+    int wstart = pw * strideW - padW;
+    int dend = min(dstart + sizeZ, depth + padD);
+    int hend = min(hstart + sizeY, height + padH);
+    int wend = min(wstart + sizeX, width + padW);
+    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    dend = min(dend, depth);
+    hend = min(hend, height);
+    wend = min(wend, width);
+    real aveval = 0;
+    // Per-iteration offset; the inputData argument itself is not advanced.
+    const int base = (frameNum * channels + c) * depth * height * width;
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          aveval += inputData[base + (d * height + h) * width + w];
+        }
+      }
+    }
+    int tgtIndex =
+        index % (pooledW * pooledH * pooledD * channels) + frameNum * tgtStride;
+    tgtData[tgtIndex] = aveval / pool_size;
+  }
+}
+
+void hl_avgpool3D_forward(const int frameCnt,
+                          const real* inputData,
+                          const int channels,
+                          const int depth,
+                          const int height,
+                          const int width,
+                          const int pooledD,
+                          const int pooledH,
+                          const int pooledW,
+                          const int sizeZ,
+                          const int sizeY,
+                          const int sizeX,
+                          const int strideD,
+                          const int strideH,
+                          const int strideW,
+                          const int paddingD,
+                          const int paddingH,
+                          const int paddingW,
+                          real* tgtData,
+                          const int tgtStride) {
+  int num_kernels = pooledD * pooledH * pooledW * channels * frameCnt;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+  KeAvgPool3DForward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                          inputData,
+                                                          channels,
+                                                          depth,
+                                                          height,
+                                                          width,
+                                                          pooledD,
+                                                          pooledH,
+                                                          pooledW,
+                                                          sizeZ,
+                                                          sizeY,
+                                                          sizeX,
+                                                          strideD,
+                                                          strideH,
+                                                          strideW,
+                                                          paddingD,
+                                                          paddingH,
+                                                          paddingW,
+                                                          tgtData,
+                                                          tgtStride);
+  CHECK_SYNC("hl_avgpool3D_forward failed");
+}
+
+// 3D avg-pool backward, per element: tgtGrad = scaleA*grad + scaleB*tgtGrad
+__global__ void KeAvgPool3DBackward(const int nthreads,
+                                    const real* outGrad,
+                                    const int channels,
+                                    const int depth,
+                                    const int height,
+                                    const int width,
+                                    const int pooledD,
+                                    const int pooledH,
+                                    const int pooledW,
+                                    const int sizeZ,
+                                    const int sizeY,
+                                    const int sizeX,
+                                    const int strideD,
+                                    const int strideH,
+                                    const int strideW,
+                                    const int padD,
+                                    const int padH,
+                                    const int padW,
+                                    real scaleA, real scaleB,
+                                    real* tgtGrad,
+                                    const int outStride) {  // not read below
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
+       index += blockDim.x * gridDim.x) {
+    int offsetW = index % width + padW;
+    int offsetH = (index / width) % height + padH;
+    int offsetD = (index / width / height) % depth + padD;
+    int offsetC = (index / width / height / depth) % channels;
+    int frameNum = index / width / height / depth / channels;
+    // pooled cells [p*start, p*end) whose window contains this input element
+    int pdstart = (offsetD < sizeZ) ? 0 : (offsetD - sizeZ) / strideD + 1;
+    int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1;
+    int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1;
+    int pdend = min(offsetD / strideD + 1, pooledD);
+    int phend = min(offsetH / strideH + 1, pooledH);
+    int pwend = min(offsetW / strideW + 1, pooledW);
+    // loop-local base: bumping outGrad itself would compound per iteration
+    real gradient = 0;
+    const real* planeGrad =
+        outGrad + (frameNum * channels + offsetC) * pooledD * pooledH * pooledW;
+    for (int pd = pdstart; pd < pdend; ++pd) {
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          // figure out the pooling size
+          int dstart = pd * strideD - padD;
+          int hstart = ph * strideH - padH;
+          int wstart = pw * strideW - padW;
+          int dend = min(dstart + sizeZ, depth + padD);
+          int hend = min(hstart + sizeY, height + padH);
+          int wend = min(wstart + sizeX, width + padW);
+          int poolsize = (dend - dstart) * (hend - hstart) * (wend - wstart);
+          gradient += planeGrad[(pd * pooledH + ph) * pooledW + pw] / poolsize;
+        }
+      }
+    }
+    tgtGrad[index] = scaleA * gradient + scaleB * tgtGrad[index];
+  }
+}
+
+void hl_avgpool3D_backward(const int frameCnt,
+                           const real* outGrad,
+                           const int channels,
+                           const int depth,
+                           const int height,
+                           const int width,
+                           const int outputD,
+                           const int outputH,
+                           const int outputW,
+                           const int sizeZ,
+                           const int sizeY,
+                           const int sizeX,
+                           const int strideD,
+                           const int strideH,
+                           const int strideW,
+                           int paddingD,
+                           int paddingH,
+                           int paddingW,
+                           real scaleA,
+                           real scaleB,
+                           real* backGrad,
+                           const int outStride) {
+  // One kernel index per element of backGrad, in blocks of 1024 threads.
+  const int total = depth * height * width * channels * frameCnt;
+  const int gridSz = (total + 1024 - 1) / 1024;
+  KeAvgPool3DBackward<<<gridSz, 1024, 0, STREAM_DEFAULT>>>(total,
+                                                           outGrad,
+                                                           channels,
+                                                           depth,
+                                                           height,
+                                                           width,
+                                                           outputD,
+                                                           outputH,
+                                                           outputW,
+                                                           sizeZ,
+                                                           sizeY,
+                                                           sizeX,
+                                                           strideD,
+                                                           strideH,
+                                                           strideW,
+                                                           paddingD,
+                                                           paddingH,
+                                                           paddingW,
+                                                           scaleA,
+                                                           scaleB,
+                                                           backGrad,
+                                                           outStride);
+  CHECK_SYNC("hl_avgpool3D_backward failed");
+}
+
 __global__ void KeBilinearInterpFw(const real* in,
                                    const size_t inImgH,
                                    const size_t inImgW,
diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu
index 39272456c394adc0509e60cf5972df832f7b3424..b41a3a1e06db7b2566acef19ce430645f79d486d 100644
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@@ -592,3 +592,204 @@ void hl_matrix_rotate(
       mat, matRot, dimM, dimN, clockWise);
   CHECK_SYNC("hl_matrix_rotate failed");
 }
+
+// vol2col: each index writes the column entries of one output position.
+__global__ void keMatrixVol2Col(int num_kernels,
+                                const real* dataSrc,
+                                real* dataDst,
+                                int depth,
+                                int height,
+                                int width,
+                                int filterD,
+                                int filterH,
+                                int filterW,
+                                int strideD,
+                                int strideH,
+                                int strideW,
+                                int paddingD,
+                                int paddingH,
+                                int paddingW,
+                                int depth_col,
+                                int height_col, int width_col) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
+       index += blockDim.x * gridDim.x) {
+    int w_out = index % width_col;
+    int h_out = (index / width_col) % height_col;
+    int d_out = (index / width_col / height_col) % depth_col;
+    int channel_in = index / width_col / height_col / depth_col;
+    int channel_out = channel_in * filterD * filterH * filterW;
+    int w_in = w_out * strideW - paddingW;
+    int h_in = h_out * strideH - paddingH;
+    int d_in = d_out * strideD - paddingD;
+    // loop-local cursors: bumping dataDst/dataSrc would compound per iteration
+    real* dst = dataDst + w_out +
+        ((channel_out * depth_col + d_out) * height_col + h_out) * width_col;
+    const real* src =
+        dataSrc + ((channel_in * depth + d_in) * height + h_in) * width + w_in;
+    for (int k = 0; k < filterD; ++k) {
+      for (int i = 0; i < filterH; ++i) {
+        for (int j = 0; j < filterW; ++j) {
+          int d = d_in + k;
+          int h = h_in + i;
+          int w = w_in + j;
+          *dst = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 &&
+                  w < width)
+                     ? src[(k * height + i) * width + j]
+                     : 0;
+          dst += depth_col * height_col * width_col;
+        }
+      }
+    }
+  }
+}
+
+void hl_matrix_vol2Col(const real* dataSrc,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW,
+                       real* dataDst) {
+  // Column-buffer extent along each axis; one kernel index per position.
+  const int outD = (depth + 2 * paddingD - filterD) / strideD + 1;
+  const int outH = (height + 2 * paddingH - filterH) / strideH + 1;
+  const int outW = (width + 2 * paddingW - filterW) / strideW + 1;
+  const int total = channels * outD * outH * outW;
+  const int blockSz = 512;
+  const int gridSz = DIVUP(total, blockSz);
+
+  keMatrixVol2Col<<<gridSz, blockSz, 0, STREAM_DEFAULT>>>(total,
+                                                          dataSrc,
+                                                          dataDst,
+                                                          depth,
+                                                          height,
+                                                          width,
+                                                          filterD,
+                                                          filterH,
+                                                          filterW,
+                                                          strideD,
+                                                          strideH,
+                                                          strideW,
+                                                          paddingD,
+                                                          paddingH,
+                                                          paddingW,
+                                                          outD,
+                                                          outH,
+                                                          outW);
+  CHECK_SYNC("hl_matrix_vol2Col failed");
+}
+
+__global__ void keMatrixCol2Vol(int num_kernels,
+                                real* dataDst,
+                                const real* dataSrc,
+                                int depth,
+                                int height,
+                                int width,
+                                int filterD,
+                                int filterH,
+                                int filterW,
+                                int strideD,
+                                int strideH,
+                                int strideW,
+                                int paddingD,
+                                int paddingH,
+                                int paddingW,
+                                int depth_col,
+                                int height_col,
+                                int width_col,
+                                real alpha,
+                                real beta) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
+       index += blockDim.x * gridDim.x) {  // one index per dataDst element
+    real srcVal = 0;               // sum of the dataSrc entries gathered below
+    real dstVal = dataDst[index];  // previous value, weighted by beta
+    int w = index % width + paddingW;
+    int h = (index / width) % height + paddingH;
+    int d = (index / width / height) % depth + paddingD;
+    int c = index / width / height / depth;
+    // half-open ranges of column positions whose window covers (d, h, w)
+    int w_col_start = (w < filterW) ? 0 : (w - filterW) / strideW + 1;
+    int w_col_end = min(w / strideW + 1, width_col);
+    int h_col_start = (h < filterH) ? 0 : (h - filterH) / strideH + 1;
+    int h_col_end = min(h / strideH + 1, height_col);
+    int d_col_start = (d < filterD) ? 0 : (d - filterD) / strideD + 1;
+    int d_col_end = min(d / strideD + 1, depth_col);
+    // base dataSrc index for (c, d, h, w); the coeff_* terms adjust it below
+    int offset = (c * filterD * filterW * filterH + d * filterW * filterH +
+                  h * filterW + w) *
+                 depth_col * height_col * width_col;
+    // index change when d_col / h_col / w_col advances by one
+    int coeff_d_col =
+        (1 - strideD * filterW * filterH * depth_col) * height_col * width_col;
+    int coeff_h_col =
+        (1 - strideH * filterW * depth_col * height_col) * width_col;
+    int coeff_w_col = (1 - strideW * depth_col * height_col * width_col);
+    // accumulate dataSrc over every column position in the ranges above
+    for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          srcVal += dataSrc[offset + d_col * coeff_d_col + h_col * coeff_h_col +
+                            w_col * coeff_w_col];
+        }
+      }
+    }
+    dataDst[index] = alpha * srcVal + beta * dstVal;
+  }
+}
+
+void hl_matrix_col2Vol(real* dataDst,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW,
+                       const real* dataSrc,
+                       real alpha,
+                       real beta) {
+  // Column-buffer extents; one kernel index per element of the volume.
+  const int outD = (depth + 2 * paddingD - filterD) / strideD + 1;
+  const int outH = (height + 2 * paddingH - filterH) / strideH + 1;
+  const int outW = (width + 2 * paddingW - filterW) / strideW + 1;
+  const int total = channels * depth * height * width;
+  const int blockSz = 512;
+  const int gridSz = DIVUP(total, blockSz);
+
+  keMatrixCol2Vol<<<gridSz, blockSz, 0, STREAM_DEFAULT>>>(total,
+                                                          dataDst,
+                                                          dataSrc,
+                                                          depth,
+                                                          height,
+                                                          width,
+                                                          filterD,
+                                                          filterH,
+                                                          filterW,
+                                                          strideD,
+                                                          strideH,
+                                                          strideW,
+                                                          paddingD,
+                                                          paddingH,
+                                                          paddingW,
+                                                          outD,
+                                                          outH,
+                                                          outW,
+                                                          alpha,
+                                                          beta);
+
+  CHECK_SYNC("hl_matrix_col2Vol failed");
+}
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index bfda18724cc8ed23a40e0626ff07a290d26aa9d2..6b4c612cd8d9263258e3987914c44002e7bca92c 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -124,6 +124,9 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
     std::list<Pos> insert_position;
     for (auto& dup_output_op : dup_output_ops) {
       const std::string& name = dup_output_op.first;
+      // duplicate @Empty@ don't need to be added
+      if (name == kEmptyVarName) continue;
+
       auto& dup_op = dup_output_op.second;
       // no duplicate output
       if (dup_op.size() == 1) continue;
@@ -209,7 +212,7 @@ std::unique_ptr<OperatorBase> Backward(
     const OperatorBase& forwardOp,
     const std::unordered_set<std::string>& no_grad_vars) {
   std::unordered_set<std::string> no_grad_names;
-  no_grad_names.reserve(no_grad_vars.size());
+  no_grad_names.reserve(no_grad_vars.size() + 1);
 
   no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
 
diff --git a/paddle/framework/backward.md b/paddle/framework/backward.md
index 74c001b06a9e7b2279abf998604f2acf1b1168e4..8aa6728a95bc464ab8884986f0cec6c817d3303b 100644
--- a/paddle/framework/backward.md
+++ b/paddle/framework/backward.md
@@ -1,38 +1,82 @@
-## Operator/expression 's Backward
+# Operator/expression 's Backward
 
-### Motivation
+## Motivation
 
-In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the fundmental gradient operators/expressions together with chain rule . Every forward network need a backward network to construct the full computation lineage, the operator/ expression's Backward feature will generate the backward pass respect to forward pass.
+In a neural network, the backpropagation algorithm follows the chain rule, so we need to compound the fundamental gradient operators/expressions together with the chain rule. Every forward network needs a backward network to construct the full computation graph; the operator/expression's backward pass is generated with respect to the forward pass.
+  
+## Backward Operator Registry
 
-### Implement : gradient operator registry
+A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs and output gradients and then calculate its input gradients.
 
-|                        | forward operator | backward operator                |
-| ---------------------- | ---------------- | -------------------------------- |
-| **Operator::inputs_**  | Inputs           | Inputs, Outputs, OutputGradients |
-| **Operator::outputs_** | Outputs          | InputGradients                   |
+|                        | forward operator | backward operator                |
+| ---------------------- | ---------------- | -------------------------------- |
+| **Operator::inputs_**  | Inputs           | Inputs, Outputs, OutputGradients |
+| **Operator::outputs_** | Outputs          | InputGradients                   |
 
-Inputs/Outputs means the input/output of the operator,  InputGradients/OutputGradients is the gradient respect to forward opeartor. Forward operator and Backward operator are isomorphic, save their corresponding needs into member attribute.
+ In most cases, there is a one-to-one correspondence between forward and backward operators. These correspondences are recorded by a global hash map(`OpInfoMap`). To follow the philosophy of minimum core and make operators pluggable, the registry mechanism is introduced.
 
-We use a global hash map record the gradient operators available, follow the philosophy  of minimum core, make operator pluggable unit. Each gradient is an operator and it needs to regist itself. 
+For example, given a `mul_op`, we can register its information and its corresponding backward operator with the following macro:
 
-grad_op_builder(fengjiayi)
+```cpp
+REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
+```
 
-### Implement : Backward network
+`mul` is the operator's type. `MulOp` and `MulOpMaker` are the operator class and the operator maker class respectively.
+
+`mul_grad` is the type of backward operator, and `MulOpGrad` is its class name.
+
+## Backward Operator Creating
+
+Given a certain forward operator, we can get its corresponding backward operator by calling:
+
+```cpp
+OperatorBase* bwd_op = BuildGradOp(const OperatorBase* fwd_op);
+``` 
+
+The function `BuildGradOp` will sequentially execute following processes:
+
+1. Get the `type_` of given forward operator, and then get the corresponding backward operator's type by looking up the `OpInfoMap`.
+
+2. Build two maps named `inputs` and `outputs` to temporarily store the backward operator's inputs and outputs. Copy the forward operator's `inputs_` and `outputs_` to map `inputs`, except those that are not necessary for gradient computing.
+
+3. Add the forward inputs' gradient variables into map `outputs`, and add the forward outputs' gradient variables into map `inputs`.
+
+4. Build the backward operator with `inputs`, `outputs` and the forward operator's attributes.
+
+## Backward Network Building
+
+A backward network is a series of backward operators. The main idea of building a backward network is to create the backward operators in reverse order and put them together.
+
+In our design, the network itself is also a kind of operator, so the operators contained in a large network may themselves be smaller networks.
 
 given a forward network, it generates the backward network. We only care about the Gradients—`OutputGradients`,`InputGradients`.
 
-1. bla bla bla (yuyang)
+1. Op 
+
+   When the input forward network is an Op, return its gradient operator immediately.
 
 2. NetOp 
 
-   when the input forward network is a NetOp, it need to call the sub NetOp/Operators backward function recursively and ensure them done. During the process, we need to collect the `OutputGradients` name.
+   When the input forward network is a NetOp, it needs to call the backward functions of its sub NetOps/Operators recursively. During the process, we need to collect the `OutputGradients` names according to the forward NetOp.
+
+   **shared variable**. As illustrated in the pictures, the `Output` `Gradient`s of two operators will overwrite their shared input variable.
+
+   <p align="center">
+   <img src="./images/duplicate_op.png" width="70%" ><br/>
+
+   1. shared variable in two operators. 
+
+   </p>
+
+   Sharing a variable between operators, or using the same input variable in multiple operators, leads to duplicate gradient variables. As the demo above shows, we need to rename the gradient names recursively and add a generic add operator to replace the overwriting links.
+
+   <p align="center">
+   <img src="images/duplicate_op2.png" width="90%" ><br/>
 
-   We share variable in the same scope, as a result, duplicate operator `OutputGradients` will overwirte then duplicate variable.  
+   2. replace shared variable gradient with `Add` Operator
 
-   ![./images/duplicate_op]()
+   </p>
 
-    Share variable between operators or same input variable used in multiple operators lead to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively, and add a generic add operator instead. 
 
-![./images/duplicate_op2]()
 
-​	Then collect the sub graph OutputGradients/InputGradients as the NetOp's and return it.
+​	Then collect the sub graph `OutputGradients`/`InputGradients` as the NetOp's and return it.
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index 2b178907747b3911292b070b65160a24c120b726..71eac4a10b34c3010a2758120c25754af58f669d 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -19,25 +19,24 @@
 namespace paddle {
 namespace framework {
 
-LODTensor::LOD LODTensor::LOD::SliceLevels(size_t level_begin,
-                                           size_t level_end) const {
+LOD SliceLevels(const LOD& in, size_t level_begin, size_t level_end) {
   LOD new_lod;
   new_lod.reserve(level_end - level_begin);
   for (size_t i = level_begin; i < level_end; i++) {
-    new_lod.emplace_back(at(i));
+    new_lod.emplace_back(in.at(i));
   }
   return new_lod;
 }
 
-LODTensor::LOD LODTensor::LOD::SliceInLevel(size_t level, size_t elem_begin,
-                                            size_t elem_end) const {
+LOD SliceInLevel(const LOD& in, size_t level, size_t elem_begin,
+                 size_t elem_end) {
   // slice the lod.
   LOD new_lod;
-  new_lod.reserve(size() - level);
-  auto start = this->at(level)[elem_begin];
-  auto end = this->at(level)[elem_end];
+  new_lod.reserve(in.size() - level);
+  auto start = in.at(level)[elem_begin];
+  auto end = in.at(level)[elem_end];
 
-  for (auto it = this->begin() + level; it != this->end(); it++) {
+  for (auto it = in.begin() + level; it != in.end(); it++) {
     auto it_begin = std::find(it->begin(), it->end(), start);
     auto it_end = std::find(it_begin, it->end(), end);
     PADDLE_ENFORCE(it_begin != it->end(), "error in parsing lod info");
@@ -49,11 +48,11 @@ LODTensor::LOD LODTensor::LOD::SliceInLevel(size_t level, size_t elem_begin,
                    [start](int v) { return v - start; });
     PADDLE_ENFORCE_EQ(new_lod.back().front(), 0, "error in slice LOD");
   }
-  PADDLE_ENFORCE_LE(new_lod.size(), this->size());
+  PADDLE_ENFORCE_LE(new_lod.size(), in.size());
   return new_lod;
 }
 
-bool operator==(const LODTensor::LOD& a, const LODTensor::LOD& b) {
+bool operator==(const LOD& a, const LOD& b) {
   if (a.size() != b.size()) {
     return false;
   }
@@ -70,9 +69,27 @@ bool operator==(const LODTensor::LOD& a, const LODTensor::LOD& b) {
       }
     }
   }
-
   return true;
 }
 
+void LODTensor::SliceLevels(size_t level_begin, size_t level_end) {
+  // Keep only levels [level_begin, level_end) of lod_; tensor_ is untouched.
+  lod_ = framework::SliceLevels(lod_, level_begin, level_end);
+}
+
+void LODTensor::SliceInLevel(size_t level, size_t elem_begin, size_t elem_end) {
+  // Reject an out-of-range level or element bound before touching lod_.
+  PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
+                 NumLevels());
+  PADDLE_ENFORCE(elem_begin < NumElements(level),
+                 "element begin [%d] out of range [%d]", elem_begin,
+                 NumElements(level));
+  PADDLE_ENFORCE(elem_end < NumElements(level) + 1,
+                 "element end [%d] out of range [%d]", elem_end,
+                 NumElements(level));
+  // Bounds are valid: replace lod_ with the sliced copy; tensor_ is untouched.
+  lod_ = framework::SliceInLevel(lod_, level, elem_begin, elem_end);
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 9e27aec38d336db8a4f0adbed098d299aa741356..9e6b6b4aca41ed464292b56bf6f2d27514f874f7 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -15,7 +15,7 @@
 #pragma once
 
 #include <memory>
-#if !defined(PADDLE_ONLY_CPU)
+#ifndef PADDLE_ONLY_CPU
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #endif
@@ -27,33 +27,39 @@
 namespace paddle {
 namespace framework {
 
+#ifdef PADDLE_ONLY_CPU
+template <typename T>
+using Vector = std::vector<T>;
+#else
+template <typename T>
+using Vector = thrust::host_vector<T>;
+#endif
+
+using LOD = std::vector<Vector<size_t>>;
+
+LOD SliceLevels(const LOD& in, size_t level_begin, size_t level_end);
+
+LOD SliceInLevel(const LOD& in, size_t level, size_t elem_begin,
+                 size_t elem_end);
+
+bool operator==(const LOD& a, const LOD& b);
+
 /*
  * LODTensor (Level of details Tensor)
  * see https://en.wikipedia.org/wiki/Level_of_details for reference.
  */
-class LODTensor : public Tensor {
+class LODTensor {
  public:
-// Level save offsets of each unit.
-#ifdef PADDLE_ONLY_CPU
-  template <typename T>
-  using Vector = std::vector<T>;
-#else
-  template <typename T>
-  using Vector = thrust::host_vector<T>;
-#endif
-  // LoD stores offsets of each level of units, the largest units level first,
-  // then the smaller units level. Each Level stores the offsets of units in
-  // Tesor.
-  class LOD : public std::vector<Vector<size_t>> {
-   public:
-    LOD SliceLevels(size_t level_begin, size_t level_end) const;
-    LOD SliceInLevel(size_t level, size_t elem_begin, size_t elem_end) const;
-  };
-
   LODTensor() {}
-  explicit LODTensor(const LOD &lod) : lod_(lod) {}
+  LODTensor(const LOD& lod, Tensor* t) : lod_(lod), tensor_(t) {}
+
+  void set_lod(const LOD& lod) { lod_ = lod; }
 
-  virtual Tensor *Clone() const { return new LODTensor(lod_); }
+  void set_tensor(Tensor* tensor) { tensor_ = tensor; }
+
+  Tensor& tensor() { return *tensor_; }
+
+  LOD lod() { return lod_; }
 
   /*
    * Get a element from LOD.
@@ -79,71 +85,23 @@ class LODTensor : public Tensor {
     PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
                    NumLevels());
     // the last offset is the end of last element
-    return lod_[level].size() - 1;
+    return (lod_)[level].size() - 1;
   }
 
   /*
-   * Slice of levels[level_begin:level_end], with tensor shared.
+   * Slice of levels[level_begin:level_end]
    */
-  template <typename T>
-  LODTensor SliceLevels(size_t level_begin, size_t level_end) const;
+  void SliceLevels(size_t level_begin, size_t level_end);
 
   /*
-   * Slice of elements of a level, [elem_begin: elem_end], with tensor shared.
+   * Slice of elements of a level, [elem_begin: elem_end]
    * @note: low performance in slice lod_.
    */
-  template <typename T>
-  LODTensor SliceInLevel(size_t level, size_t elem_begin,
-                         size_t elem_end) const;
-
-  /*
-   * Copy other's lod_'s content, free to mutate.
-   */
-  void CopyLOD(const LODTensor &other) { lod_ = other.lod_; }
-  /*
-   * Determine whether LODTensor has a valid LOD info.
-   */
-  const LOD &lod() const { return lod_; }
-  LOD *mutable_lod() { return &lod_; }
-
-  virtual ~LODTensor() {}
+  void SliceInLevel(size_t level, size_t elem_begin, size_t elem_end);
 
  private:
   LOD lod_;
+  Tensor* tensor_;  // not owned
 };
-
-bool operator==(const LODTensor::LOD &a, const LODTensor::LOD &b);
-
-template <typename T>
-LODTensor LODTensor::SliceLevels(size_t level_begin, size_t level_end) const {
-  auto new_lod = lod_.SliceLevels(level_begin, level_end);
-  // slice levels just need to update LOD info, each level will contains the
-  // whole tensor_, so no need to modify tensor_.
-  LODTensor new_tensor(new_lod);
-  new_tensor.ShareDataWith<T>(*this);
-  return new_tensor;
-}
-
-template <typename T>
-LODTensor LODTensor::SliceInLevel(size_t level, size_t elem_begin,
-                                  size_t elem_end) const {
-  PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
-                 NumLevels());
-  PADDLE_ENFORCE(elem_begin < NumElements(level),
-                 "element begin [%d] out of range [%d]", elem_begin,
-                 NumElements(level));
-  PADDLE_ENFORCE(elem_end < NumElements(level) + 1,
-                 "element end [%d] out of range [%d]", elem_end,
-                 NumElements(level));
-
-  auto new_lod = lod_.SliceInLevel(level, elem_begin, elem_end);
-
-  // slice elements just need to update LOD info, because offsets are not
-  // changed, so the original tensor_ can be reused.
-  LODTensor new_tensor(new_lod);
-  new_tensor.ShareDataWith<T>(*this);
-  return new_tensor;
-}
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
index 2881136ced6ef957a192e303e529b9b2867b3dda..9a351605edb5013bdab2c6193bdd9ce401acc937 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -24,13 +24,12 @@ namespace framework {
 class LODTensorTester : public ::testing::Test {
  public:
   virtual void SetUp() override {
-    lod_tensor.reset(new LODTensor);
     // tensor's batch_size: 30
     // 3 levels
     // 0 10 20
     // 0 5 10 15 20
     // 0 2 5 7 10 12 15 20
-    LODTensor::LOD lod;
+    LOD lod;
     lod.push_back(std::vector<size_t>{0, 10, 20});
     lod.push_back(std::vector<size_t>{0, 5, 10, 15, 20});
     lod.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20});
@@ -41,75 +40,65 @@ class LODTensorTester : public ::testing::Test {
     // malloc memory
     tensor.mutable_data<float>(place);
 
-    lod_tensor.reset(new LODTensor(lod));
-    lod_tensor->Resize({20 /*batch size*/, 128 /*dim*/});
-
-    lod_tensor->ShareDataWith<float>(tensor);
-    // lod_tensor->ShareDataWith<Tensor>(tensor);
+    lod_tensor.set_lod(lod);
+    lod_tensor.set_tensor(&tensor);
   }
 
  protected:
-  std::unique_ptr<LODTensor> lod_tensor;
   platform::CPUPlace place;
   Tensor tensor;
+  LODTensor lod_tensor;
 };
 
-TEST_F(LODTensorTester, NumLevels) { ASSERT_EQ(lod_tensor->NumLevels(), 3UL); }
+TEST_F(LODTensorTester, NumLevels) { ASSERT_EQ(lod_tensor.NumLevels(), 3UL); }
 
 TEST_F(LODTensorTester, NumElements) {
-  ASSERT_EQ(lod_tensor->NumElements(0), 2UL);
-  ASSERT_EQ(lod_tensor->NumElements(1), 4UL);
-  ASSERT_EQ(lod_tensor->NumElements(2), 8UL);
+  ASSERT_EQ(lod_tensor.NumElements(0), 2UL);
+  ASSERT_EQ(lod_tensor.NumElements(1), 4UL);
+  ASSERT_EQ(lod_tensor.NumElements(2), 8UL);
 }
 
 TEST_F(LODTensorTester, SliceLevels) {
   // slice 1 level
   for (size_t level = 0; level < 3UL; ++level) {
-    auto new_lod_tensor = lod_tensor->SliceLevels<float>(level, level + 1);
+    LODTensor new_lod_tensor = lod_tensor;
+    new_lod_tensor.SliceLevels(level, level + 1);
     ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
-    ASSERT_EQ(new_lod_tensor.NumElements(0UL), lod_tensor->NumElements(level));
-    // ASSERT_EQ(new_lod_tensor, *lod_tensor);
+    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor.NumElements(level));
+    ASSERT_EQ(new_lod_tensor.tensor().data<float>(),
+              lod_tensor.tensor().data<float>());
   }
   // slice 2 level
   for (size_t level = 0; level < 2UL; ++level) {
-    auto new_lod_tensor = lod_tensor->SliceLevels<float>(level, level + 2);
+    LODTensor new_lod_tensor = lod_tensor;
+    new_lod_tensor.SliceLevels(level, level + 2);
     ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
-    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor->NumElements(level));
-    ASSERT_EQ(new_lod_tensor.NumElements(1),
-              lod_tensor->NumElements(level + 1));
-    ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor->data<float>());
+    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor.NumElements(level));
+    ASSERT_EQ(new_lod_tensor.NumElements(1), lod_tensor.NumElements(level + 1));
+    ASSERT_EQ(new_lod_tensor.tensor().data<float>(),
+              lod_tensor.tensor().data<float>());
   }
 }
 
 TEST_F(LODTensorTester, SliceInLevel) {
   size_t level = 0;
-  auto new_lod_tensor = lod_tensor->SliceInLevel<float>(level, 0, 2);
+  LODTensor new_lod_tensor = lod_tensor;
+  new_lod_tensor.SliceInLevel(level, 0, 2);
   EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL);
   EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL);
   EXPECT_EQ(new_lod_tensor.NumElements(1), 4UL);
   EXPECT_EQ(new_lod_tensor.NumElements(2), 8UL);
-  ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor->data<float>());
+  ASSERT_EQ(new_lod_tensor.tensor().data<float>(),
+            lod_tensor.tensor().data<float>());
 
   level = 1;
-  new_lod_tensor = lod_tensor->SliceInLevel<float>(level, 0, 2);
+  new_lod_tensor = lod_tensor;
+  new_lod_tensor.SliceInLevel(level, 0, 2);
   ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
   ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
   ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
-  ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor->data<float>());
-}
-
-TEST_F(LODTensorTester, ShareLOD) {
-  LODTensor new_lod_tensor;
-  new_lod_tensor.CopyLOD(*lod_tensor);
-  ASSERT_EQ(new_lod_tensor.lod(), lod_tensor->lod());
-}
-
-TEST_F(LODTensorTester, CopyLOD) {
-  LODTensor new_lod_tensor;
-  new_lod_tensor.CopyLOD(*lod_tensor);
-  bool equals = std::equal(lod_tensor->lod().begin(), lod_tensor->lod().end(),
-                           new_lod_tensor.lod().begin());
-  ASSERT_TRUE(equals);
+  ASSERT_EQ(new_lod_tensor.tensor().data<float>(),
+            lod_tensor.tensor().data<float>());
 }
 
 }  // namespace framework
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index 7d7263b899afb7a2128548f264065a8013b6f0c9..7893e233b776425a61d9e3edd43d944a27743188 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -117,6 +117,8 @@ inline void Tensor::CopyFrom(const Tensor& src,
     memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
                  boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
   }
+  PADDLE_ENFORCE(cudaStreamSynchronize(0),
+                 "cudaStreamSynchronize failed in Tensor CopyFrom");
 
 #endif
 }
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index c572a9d433bc16e6733b8fc9367970bef28e699a..f43f15e5cacb70b625d7791e1e02ce7780286200 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -21,6 +21,8 @@ if(USE_NNPACK)
   endif()
 endif()
 
+list(APPEND cpp_files neon/NeonDepthwiseConv.cpp)
+
 add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
 add_dependencies(paddle_function ${external_project_dependencies})
 add_dependencies(paddle_function paddle_proto)
@@ -42,11 +44,11 @@ if(WITH_GPU)
     add_simple_unittest(RowConvOpTest)
     add_simple_unittest(BlockExpandOpTest)
     add_simple_unittest(CropOpTest)
-    add_simple_unittest(DepthwiseConvOpTest)
 endif()
 
 add_simple_unittest(Im2ColTest)
 add_simple_unittest(GemmConvOpTest)
+add_simple_unittest(DepthwiseConvOpTest)
 endif()
 
 add_style_check_target(paddle_function ${h_files})
diff --git a/paddle/function/DepthwiseConvOpTest.cpp b/paddle/function/DepthwiseConvOpTest.cpp
index f44ae0c342e9536366e2b537694cee81fcb1a6ed..d8e8c889d5c23bf9b2b5fd0b0393395883188fd8 100644
--- a/paddle/function/DepthwiseConvOpTest.cpp
+++ b/paddle/function/DepthwiseConvOpTest.cpp
@@ -34,4 +34,13 @@ TEST(DepthwiseConv, BackwardFilter) {
 }
 #endif
 
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+TEST(DepthwiseConv, Forward) {
+  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
+      "GemmConv-CPU", "NeonDepthwiseConv-CPU", forward);
+}
+
+#endif
+
 }  // namespace paddle
diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
index 48e2e32f9256fb49c67ba25e9b5a47d72499758b..9b91e223a6a28586b11fe7ed4a44421e029a67bb 100644
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "TensorShape.h"
 #include "TensorType.h"
+#include "neon/neon_util.h"
 
 namespace paddle {
 
@@ -93,4 +94,95 @@ public:
                   int paddingWidth);
 };
 
+template <class T>
+struct Padding {
+  static void run(const T* src,
+                  T* dest,
+                  int channels,
+                  int inputHeight,
+                  int inputWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    const int destWidth = inputWidth + 2 * paddingWidth;
+    for (int c = 0; c < channels; c++) {
+      if (paddingHeight > 0) {
+        memset(dest, 0, destWidth * paddingHeight * sizeof(T));
+        dest += destWidth * paddingHeight;
+      }
+
+      for (int i = 0; i < inputHeight; i++) {
+        // padding head
+        for (int j = 0; j < paddingWidth; j++) {
+          *dest++ = T(0);
+        }
+
+        memcpy(dest, src, inputWidth * sizeof(T));
+        dest += inputWidth;
+        src += inputWidth;
+
+        // padding tail
+        for (int j = 0; j < paddingWidth; j++) {
+          *dest++ = T(0);
+        }
+      }
+
+      if (paddingHeight > 0) {
+        memset(dest, 0, destWidth * paddingHeight * sizeof(T));
+        dest += destWidth * paddingHeight;
+      }
+    }
+  }
+};
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+template <>
+struct Padding<float> {
+  static void run(const float* src,
+                  float* dest,
+                  int channels,
+                  int inputHeight,
+                  int inputWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    const int destWidth = inputWidth + 2 * paddingWidth;
+    for (int c = 0; c < channels; c++) {
+      if (paddingHeight > 0) {
+        memset(dest, 0, destWidth * paddingHeight * sizeof(float));
+        dest += destWidth * paddingHeight;
+      }
+
+      for (int i = 0; i < inputHeight; i++) {
+        // padding head
+        for (int j = 0; j < paddingWidth; j++) {
+          *dest++ = float(0);
+        }
+
+        int step = inputWidth >> 2;
+        int remain = inputWidth & 3;
+        for (int s = 0; s < step; s++) {
+          float32x4_t s0 = vld1q_f32(src);
+          vst1q_f32(dest, s0);
+          src += 4;
+          dest += 4;
+        }
+        for (int r = 0; r < remain; r++) {
+          *dest++ = *src++;
+        }
+
+        // padding tail
+        for (int j = 0; j < paddingWidth; j++) {
+          *dest++ = float(0);
+        }
+      }
+
+      if (paddingHeight > 0) {
+        memset(dest, 0, destWidth * paddingHeight * sizeof(float));
+        dest += destWidth * paddingHeight;
+      }
+    }
+  }
+};
+
+#endif
+
 }  // namespace paddle
diff --git a/paddle/function/neon/NeonDepthwiseConv.cpp b/paddle/function/neon/NeonDepthwiseConv.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f09e98587d1681d29a79a9cb0303c2d4356c6935
--- /dev/null
+++ b/paddle/function/neon/NeonDepthwiseConv.cpp
@@ -0,0 +1,577 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "neon_util.h"
+#include "paddle/function/ConvOp.h"
+#include "paddle/function/Im2Col.h"
+
+namespace paddle {
+
+namespace neon {
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+template <int filterSize, int stride>
+struct DepthwiseConvKernel {};
+
+// Sums r0 * k0 + r1 * k1 + r2 * k2 lane by lane, then adds the four lanes.
+inline float32_t conv3x3(float32x4_t r0,
+                         float32x4_t r1,
+                         float32x4_t r2,
+                         float32x4_t k0,
+                         float32x4_t k1,
+                         float32x4_t k2) {
+  float32x4_t acc = vmulq_f32(r0, k0);
+  acc = vmlaq_f32(acc, r1, k1);
+  acc = vmlaq_f32(acc, r2, k2);
+  return vaddvq_f32(acc);
+}
+
+// Sums r0 * k0 + ... + r3 * k3 lane by lane, then adds the four lanes.
+inline float32_t conv4x4(float32x4_t r0,
+                         float32x4_t r1,
+                         float32x4_t r2,
+                         float32x4_t r3,
+                         float32x4_t k0,
+                         float32x4_t k1,
+                         float32x4_t k2,
+                         float32x4_t k3) {
+  float32x4_t acc = vmulq_f32(r0, k0);
+  acc = vmlaq_f32(acc, r1, k1);
+  acc = vmlaq_f32(acc, r2, k2);
+  acc = vmlaq_f32(acc, r3, k3);
+  return vaddvq_f32(acc);
+}
+
+/**
+ * Each step calculates four elements of the output.
+ * First step:
+ *   R0[0, 1, 2, 3...] * K[0][0]
+ *   R0[1, 2, 3, 4...] * K[0][1]
+ *   R0[2, 3, 4, 5...] * K[0][2]
+ *   R1[0, 1, 2, 3...] * K[1][0]
+ *   R1[1, 2, 3, 4...] * K[1][1]
+ *   R1[2, 3, 4, 5...] * K[1][2]
+ *   R2[0, 1, 2, 3...] * K[2][0]
+ *   R2[1, 2, 3, 4...] * K[2][1]
+ * + R2[2, 3, 4, 5...] * K[2][2]
+ * ------------------------------
+ *     Output[0, 1, 2, 3]
+ */
+template <>
+struct DepthwiseConvKernel<3, 1> {
+  static void run(const float* inputData,
+                  const float* filterData,
+                  int inputHeight,
+                  int inputWidth,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int filterMultiplier,
+                  float* outputData) {
+    const int steps = outputWidth >> 2;
+    const int remain = outputWidth & 3;
+    for (int c = 0; c < outputChannels; c++, filterData += 9) {
+      // Load the filters
+      float32x4_t k[3];
+      k[0] = vld1q_f32(filterData);
+      k[1] = vld1q_f32(filterData + 3);
+      k[2] = vld1q_f32(filterData + 6);
+      k[0] = vsetq_lane_f32(0.f, k[0], 3);
+      k[1] = vsetq_lane_f32(0.f, k[1], 3);
+      k[2] = vsetq_lane_f32(0.f, k[2], 3);
+
+      const float* r0 =
+          inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
+      const float* r1 = r0 + inputWidth;
+      const float* r2 = r0 + inputWidth * 2;
+      float32x4_t input[3][3];
+      for (int h = 0; h < outputHeight; h++) {
+        for (int s = 0; s < steps; s++) {
+          // Load the inputs
+          float32x4_t tmp;
+          input[0][0] = vld1q_f32(r0);
+          tmp = vld1q_f32(r0 + 4);
+          input[0][1] = vextq_f32(input[0][0], tmp, 1);
+          input[0][2] = vextq_f32(input[0][0], tmp, 2);
+          input[1][0] = vld1q_f32(r1);
+          tmp = vld1q_f32(r1 + 4);
+          input[1][1] = vextq_f32(input[1][0], tmp, 1);
+          input[1][2] = vextq_f32(input[1][0], tmp, 2);
+          input[2][0] = vld1q_f32(r2);
+          tmp = vld1q_f32(r2 + 4);
+          input[2][1] = vextq_f32(input[2][0], tmp, 1);
+          input[2][2] = vextq_f32(input[2][0], tmp, 2);
+
+          float32x4_t tmp1 = vdupq_n_f32(0.f);
+          float32x4_t tmp2 = vdupq_n_f32(0.f);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
+          tmp1 = vaddq_f32(tmp1, tmp2);
+
+          vst1q_f32(outputData, tmp1);
+          r0 += 4;
+          r1 += 4;
+          r2 += 4;
+          outputData += 4;
+        }
+
+        for (int r = 0; r < remain; r++) {
+          float32x4_t i0 = vld1q_f32(r0);
+          float32x4_t i1 = vld1q_f32(r1);
+          float32x4_t i2 = vld1q_f32(r2);
+          *outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]);
+          r0++;
+          r1++;
+          r2++;
+          outputData++;
+        }
+
+        r0 += 2;
+        r1 += 2;
+        r2 += 2;
+      }
+    }
+  }
+};
+
+/**
+ * Each step calculates four elements of the output.
+ * First step:
+ *   R0[0, 2, 4, 6...] * K[0][0]
+ *   R0[1, 3, 5, 7...] * K[0][1]
+ *   R0[2, 4, 6, 8...] * K[0][2]
+ *   R1[0, 2, 4, 6...] * K[1][0]
+ *   R1[1, 3, 5, 7...] * K[1][1]
+ *   R1[2, 4, 6, 8...] * K[1][2]
+ *   R2[0, 2, 4, 6...] * K[2][0]
+ *   R2[1, 3, 5, 7...] * K[2][1]
+ * + R2[2, 4, 6, 8...] * K[2][2]
+ * ------------------------------
+ *     Output[0, 1, 2, 3]
+ */
+template <>
+struct DepthwiseConvKernel<3, 2> {
+  static void run(const float* inputData,
+                  const float* filterData,
+                  int inputHeight,
+                  int inputWidth,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int filterMultiplier,
+                  float* outputData) {
+    const int steps = outputWidth >> 2;
+    const int remain = outputWidth & 3;
+    for (int c = 0; c < outputChannels; c++, filterData += 9) {
+      // Load the filters
+      float32x4_t k[3];
+      k[0] = vld1q_f32(filterData);
+      k[1] = vld1q_f32(filterData + 3);
+      k[2] = vld1q_f32(filterData + 6);
+      k[0] = vsetq_lane_f32(0.f, k[0], 3);
+      k[1] = vsetq_lane_f32(0.f, k[1], 3);
+      k[2] = vsetq_lane_f32(0.f, k[2], 3);
+
+      const float* start =
+          inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
+      float32x4_t input[3][3];
+      for (int h = 0; h < outputHeight; h++) {
+        const float* r0 = start + 2 * h * inputWidth;
+        const float* r1 = start + (2 * h + 1) * inputWidth;
+        const float* r2 = start + (2 * h + 2) * inputWidth;
+        for (int s = 0; s < steps; s++) {
+          // Load the inputs
+          float32x4_t data1;
+          float32x4x2_t data2;
+
+          data2 = vld2q_f32(r0);
+          input[0][0] = data2.val[0];
+          input[0][1] = data2.val[1];
+          data1 = vld1q_f32(r0 + 8);
+          input[0][2] = vextq_f32(data2.val[0], data1, 1);
+
+          data2 = vld2q_f32(r1);
+          input[1][0] = data2.val[0];
+          input[1][1] = data2.val[1];
+          data1 = vld1q_f32(r1 + 8);
+          input[1][2] = vextq_f32(data2.val[0], data1, 1);
+
+          data2 = vld2q_f32(r2);
+          input[2][0] = data2.val[0];
+          input[2][1] = data2.val[1];
+          data1 = vld1q_f32(r2 + 8);
+          input[2][2] = vextq_f32(data2.val[0], data1, 1);
+
+          float32x4_t tmp1 = vdupq_n_f32(0.f);
+          float32x4_t tmp2 = vdupq_n_f32(0.f);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
+          tmp1 = vaddq_f32(tmp1, tmp2);
+
+          vst1q_f32(outputData, tmp1);
+          r0 += 8;
+          r1 += 8;
+          r2 += 8;
+          outputData += 4;
+        }
+
+        for (int r = 0; r < remain; r++) {
+          float32x4_t i0 = vld1q_f32(r0);
+          float32x4_t i1 = vld1q_f32(r1);
+          float32x4_t i2 = vld1q_f32(r2);
+          *outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]);
+          r0 += 2;
+          r1 += 2;
+          r2 += 2;
+          outputData++;
+        }
+      }
+    }
+  }
+};
+
+/**
+ * 4x4 filter, stride 1: each step calculates four elements of the output.
+ */
+template <>
+struct DepthwiseConvKernel<4, 1> {
+  static void run(const float* inputData,
+                  const float* filterData,
+                  int inputHeight,
+                  int inputWidth,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int filterMultiplier,
+                  float* outputData) {
+    const int steps = outputWidth >> 2;
+    const int remain = outputWidth & 3;
+    for (int c = 0; c < outputChannels; c++, filterData += 16) {
+      // Load the filters
+      float32x4_t k[4];
+      k[0] = vld1q_f32(filterData);
+      k[1] = vld1q_f32(filterData + 4);
+      k[2] = vld1q_f32(filterData + 8);
+      k[3] = vld1q_f32(filterData + 12);
+
+      const float* r0 =
+          inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
+      const float* r1 = r0 + inputWidth;
+      const float* r2 = r0 + inputWidth * 2;
+      const float* r3 = r0 + inputWidth * 3;
+      float32x4_t input[4][4];
+      for (int h = 0; h < outputHeight; h++) {
+        for (int s = 0; s < steps; s++) {
+          // Load the inputs
+          float32x4_t tmp;
+          input[0][0] = vld1q_f32(r0);
+          tmp = vld1q_f32(r0 + 4);
+          input[0][1] = vextq_f32(input[0][0], tmp, 1);
+          input[0][2] = vextq_f32(input[0][0], tmp, 2);
+          input[0][3] = vextq_f32(input[0][0], tmp, 3);
+
+          input[1][0] = vld1q_f32(r1);
+          tmp = vld1q_f32(r1 + 4);
+          input[1][1] = vextq_f32(input[1][0], tmp, 1);
+          input[1][2] = vextq_f32(input[1][0], tmp, 2);
+          input[1][3] = vextq_f32(input[1][0], tmp, 3);
+
+          input[2][0] = vld1q_f32(r2);
+          tmp = vld1q_f32(r2 + 4);
+          input[2][1] = vextq_f32(input[2][0], tmp, 1);
+          input[2][2] = vextq_f32(input[2][0], tmp, 2);
+          input[2][3] = vextq_f32(input[2][0], tmp, 3);
+
+          input[3][0] = vld1q_f32(r3);
+          tmp = vld1q_f32(r3 + 4);
+          input[3][1] = vextq_f32(input[3][0], tmp, 1);
+          input[3][2] = vextq_f32(input[3][0], tmp, 2);
+          input[3][3] = vextq_f32(input[3][0], tmp, 3);
+
+          float32x4_t tmp1 = vdupq_n_f32(0.f);
+          float32x4_t tmp2 = vdupq_n_f32(0.f);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
+          tmp1 = vaddq_f32(tmp1, tmp2);
+
+          vst1q_f32(outputData, tmp1);
+          r0 += 4;
+          r1 += 4;
+          r2 += 4;
+          r3 += 4;
+          outputData += 4;
+        }
+
+        for (int r = 0; r < remain; r++) {
+          float32x4_t i0 = vld1q_f32(r0);
+          float32x4_t i1 = vld1q_f32(r1);
+          float32x4_t i2 = vld1q_f32(r2);
+          float32x4_t i3 = vld1q_f32(r3);
+          *outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]);
+          r0++;
+          r1++;
+          r2++;
+          r3++;
+          outputData++;
+        }
+
+        r0 += 3;
+        r1 += 3;
+        r2 += 3;
+        r3 += 3;
+      }
+    }
+  }
+};
+
+/**
+ * 4x4 filter, stride 2: each step calculates four elements of the output.
+ */
+template <>
+struct DepthwiseConvKernel<4, 2> {
+  static void run(const float* inputData,
+                  const float* filterData,
+                  int inputHeight,
+                  int inputWidth,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int filterMultiplier,
+                  float* outputData) {
+    const int steps = outputWidth >> 2;
+    const int remain = outputWidth & 3;
+    for (int c = 0; c < outputChannels; c++, filterData += 16) {
+      // Load the filters
+      float32x4_t k[4];
+      k[0] = vld1q_f32(filterData);
+      k[1] = vld1q_f32(filterData + 4);
+      k[2] = vld1q_f32(filterData + 8);
+      k[3] = vld1q_f32(filterData + 12);
+
+      const float* start =
+          inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
+      float32x4_t input[4][4];
+      for (int h = 0; h < outputHeight; h++) {
+        const float* r0 = start + 2 * h * inputWidth;
+        const float* r1 = start + (2 * h + 1) * inputWidth;
+        const float* r2 = start + (2 * h + 2) * inputWidth;
+        const float* r3 = start + (2 * h + 3) * inputWidth;
+        for (int s = 0; s < steps; s++) {
+          // Load the inputs
+          float32x4x2_t data1;
+          float32x4x2_t data2;
+
+          data1 = vld2q_f32(r0);
+          data2 = vld2q_f32(r0 + 8);
+          input[0][0] = data1.val[0];
+          input[0][1] = data1.val[1];
+          input[0][2] = vextq_f32(data1.val[0], data2.val[0], 1);
+          input[0][3] = vextq_f32(data1.val[1], data2.val[1], 1);
+
+          data1 = vld2q_f32(r1);
+          data2 = vld2q_f32(r1 + 8);
+          input[1][0] = data1.val[0];
+          input[1][1] = data1.val[1];
+          input[1][2] = vextq_f32(data1.val[0], data2.val[0], 1);
+          input[1][3] = vextq_f32(data1.val[1], data2.val[1], 1);
+
+          data1 = vld2q_f32(r2);
+          data2 = vld2q_f32(r2 + 8);
+          input[2][0] = data1.val[0];
+          input[2][1] = data1.val[1];
+          input[2][2] = vextq_f32(data1.val[0], data2.val[0], 1);
+          input[2][3] = vextq_f32(data1.val[1], data2.val[1], 1);
+
+          data1 = vld2q_f32(r3);
+          data2 = vld2q_f32(r3 + 8);
+          input[3][0] = data1.val[0];
+          input[3][1] = data1.val[1];
+          input[3][2] = vextq_f32(data1.val[0], data2.val[0], 1);
+          input[3][3] = vextq_f32(data1.val[1], data2.val[1], 1);
+
+          float32x4_t tmp1 = vdupq_n_f32(0.f);
+          float32x4_t tmp2 = vdupq_n_f32(0.f);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
+          tmp1 = vaddq_f32(tmp1, tmp2);
+
+          vst1q_f32(outputData, tmp1);
+          r0 += 8;
+          r1 += 8;
+          r2 += 8;
+          r3 += 8;
+          outputData += 4;
+        }
+
+        for (int r = 0; r < remain; r++) {
+          float32x4_t i0 = vld1q_f32(r0);
+          float32x4_t i1 = vld1q_f32(r1);
+          float32x4_t i2 = vld1q_f32(r2);
+          float32x4_t i3 = vld1q_f32(r3);
+          *outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]);
+          r0 += 2;
+          r1 += 2;
+          r2 += 2;
+          r3 += 2;
+          outputData++;
+        }
+      }
+    }
+  }
+};
+
+template <DeviceType Device>
+class NeonDepthwiseConvFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+    size_t filterMultiplier = outputChannels / groups_;
+    CHECK_EQ(inputChannels, groups_);
+
+    // only support strideH() == strideW() and filterHeight == filterWidth.
+    CHECK_EQ(strideH(), strideW());
+    CHECK_EQ(filterHeight, filterWidth);
+
+    float* inputData = inputs[0].data<float>();
+    float* filterData = inputs[1].data<float>();
+    float* outputData = outputs[0].data<float>();
+
+    // padding the input
+    float* inputPadding = inputData;
+    if (paddingH() > 0 || paddingW() > 0) {
+      int newSize = batchSize * inputChannels * (inputHeight + 2 * paddingH()) *
+                    (inputWidth + 2 * paddingW());
+      resizeBuffer<Device>(newSize);
+      inputPadding = reinterpret_cast<float*>(memory_->getBuf());
+      Padding<float>::run(inputData,
+                          inputPadding,
+                          batchSize * inputChannels,
+                          inputHeight,
+                          inputWidth,
+                          paddingH(),
+                          paddingW());
+
+      // height and width of padding data
+      inputHeight += 2 * paddingH();
+      inputWidth += 2 * paddingW();
+    }
+
+    std::function<void(
+        const float*, const float*, int, int, int, int, int, int, float*)>
+        DepthWiseConv;
+
+    if (filterWidth == 3 && strideW() == 1) {
+      DepthWiseConv = DepthwiseConvKernel<3, 1>::run;
+    } else if (filterWidth == 3 && strideW() == 2) {
+      DepthWiseConv = DepthwiseConvKernel<3, 2>::run;
+    } else if (filterWidth == 4 && strideW() == 1) {
+      DepthWiseConv = DepthwiseConvKernel<4, 1>::run;
+    } else if (filterWidth == 4 && strideW() == 2) {
+      DepthWiseConv = DepthwiseConvKernel<4, 2>::run;
+    } else {
+      LOG(FATAL) << "Not supported";
+    }
+
+    for (size_t i = 0; i < batchSize; i++) {
+      DepthWiseConv(inputPadding,
+                    filterData,
+                    inputHeight,
+                    inputWidth,
+                    outputChannels,
+                    outputHeight,
+                    outputWidth,
+                    filterMultiplier,
+                    outputData);
+      inputPadding += inputChannels * inputHeight * inputWidth;
+      outputData += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+
+REGISTER_TYPED_FUNC(NeonDepthwiseConv, CPU, NeonDepthwiseConvFunction);
+
+#endif
+
+}  // namespace neon
+}  // namespace paddle
diff --git a/paddle/function/neon/neon_util.h b/paddle/function/neon/neon_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..56b3febe2d27bb4fbf57e49079b3ad071d556914
--- /dev/null
+++ b/paddle/function/neon/neon_util.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+#include <arm_neon.h>
+
+namespace paddle {
+
+namespace neon {
+
+inline float32x4_t vld1q_f32_aligned(const float* p) {
+  return vld1q_f32(
+      (const float*)__builtin_assume_aligned(p, sizeof(float32x4_t)));
+}
+
+#ifndef __aarch64__
+inline float32_t vaddvq_f32(float32x4_t a) {
+  float32x2_t v = vadd_f32(vget_high_f32(a), vget_low_f32(a));
+  return vget_lane_f32(vpadd_f32(v, v), 0);
+}
+
+// Fallback for the AArch64-only vmlaq_laneq_f32 intrinsic: a + b * v[lane].
+// This has to be a macro rather than an inline function, because
+// vgetq_lane_f32 only accepts a compile-time constant lane index and a
+// function parameter is not one (clang rejects it outright).
+#define vmlaq_laneq_f32(a, b, v, lane) \
+  vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane))
+#endif
+
+}  // namespace neon
+}  // namespace paddle
+
+#endif
diff --git a/paddle/gserver/layers/Conv3DLayer.cpp b/paddle/gserver/layers/Conv3DLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7cc9937cce37cbbc4640fbb88312841c23b757c0
--- /dev/null
+++ b/paddle/gserver/layers/Conv3DLayer.cpp
@@ -0,0 +1,244 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Conv3DLayer.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(conv3d, Conv3DLayer);
+
+bool Conv3DLayer::init(const LayerMap &layerMap,
+                       const ParameterMap &parameterMap) {
+  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
+  int index = 0;
+  for (auto &inputConfig : config_.inputs()) {
+    const ConvConfig &conf = inputConfig.conv_conf();
+    M_.push_back(numFilters_ / conf.groups());
+    K_.push_back(filterPixels_[index] * filterChannels_[index]);
+
+    // create a new weight
+    size_t height, width;
+    width = filterPixels_[index] * filterChannels_[index];
+    height = numFilters_;
+    CHECK_EQ(parameters_[index]->getSize(), width * height);
+    Weight *w = new Weight(height, width, parameters_[index]);
+    weights_.emplace_back(w);
+    ++index;
+  }
+  if (biasParameter_.get()) {
+    if (sharedBiases_) {
+      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(1, numFilters_, biasParameter_));
+    } else {
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+    }
+  }
+  return true;
+}
+
+size_t Conv3DLayer::getSize() {
+  CHECK_NE(inputLayers_.size(), 0UL);
+  outputH_.clear();
+  outputW_.clear();
+  outputD_.clear();
+  N_.clear();
+  size_t layerSize = 0;
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    outputW_.push_back(outputSize(
+        imgSizeW_[i], filterSize_[i], padding_[i], stride_[i], true));
+    outputH_.push_back(outputSize(
+        imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
+    outputD_.push_back(outputSize(
+        imgSizeD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
+    // forward() accumulates every input into the same output frame.
+    N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
+    CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize);
+    layerSize = N_[i] * size_t(numFilters_);  // shared by all inputs
+  }
+  getOutput().setFrameHeight(outputH_[0]);
+  getOutput().setFrameWidth(outputW_[0]);
+  getOutput().setFrameDepth(outputD_[0]);
+  return layerSize;
+}
+
+void Conv3DLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  int outWidth = getSize();
+  resetOutput(batchSize, outWidth);
+
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    REGISTER_TIMER_INFO("FwdConv3D", getName().c_str());
+    const MatrixPtr &inMat = getInputValue(i);
+    const MatrixPtr &outMat = getOutputValue();
+    int M = M_[i];
+    int N = N_[i];
+    int K = K_[i];
+    Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+    MatrixPtr wMat = weights_[i]->getW();
+    for (int n = 0; n < batchSize; ++n) {
+      colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
+                       channels_[i],
+                       imgSizeD_[i],
+                       imgSizeH_[i],
+                       imgSizeW_[i],
+                       filterSizeZ_[i],
+                       filterSizeY_[i],
+                       filterSize_[i],
+                       strideZ_[i],
+                       strideY_[i],
+                       stride_[i],
+                       paddingZ_[i],
+                       paddingY_[i],
+                       padding_[i]);
+
+      real *outData = outMat->getData() + n * outMat->getStride();
+      MatrixPtr outMatSub =
+          Matrix::create(outData, groups_[i] * M, N, false, useGpu_);
+      for (int g = 0; g < groups_[i]; g++) {
+        MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
+        MatrixPtr in = colBuf_->subMatrix(g * K, K);
+        MatrixPtr out = outMatSub->subMatrix(g * M, M);
+        out->mul(*wMatSub, *in, 1.0, 1.0);
+      }
+    }
+  }
+  if (nullptr != this->biasParameter_) {
+    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
+    this->addBias();
+  }
+  forwardActivation();
+}
+
+void Conv3DLayer::backward(const UpdateCallback &callback) {
+  backwardActivation();
+
+  if (biases_ && biases_->getWGrad()) {
+    bpropBiases();
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    REGISTER_TIMER_INFO("BwdConv3D", getName().c_str());
+    if (weights_[i]->getWGrad()) {
+      bpropWeights(i);
+    }
+    if (getInputGrad(i)) {
+      bpropData(i);
+    }
+    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+    weights_[i]->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+void Conv3DLayer::bpropWeights(int i) {
+  int M = M_[i];
+  int N = N_[i];
+  int K = K_[i];
+  const MatrixPtr &inMat = getInputValue(i);
+  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+  MatrixPtr wGradMat = weights_[i]->getWGrad();
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  for (int n = 0; n < batchSize; ++n) {
+    colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
+                     channels_[i],
+                     imgSizeD_[i],
+                     imgSizeH_[i],
+                     imgSizeW_[i],
+                     filterSizeZ_[i],
+                     filterSizeY_[i],
+                     filterSize_[i],
+                     strideZ_[i],
+                     strideY_[i],
+                     stride_[i],
+                     paddingZ_[i],
+                     paddingY_[i],
+                     padding_[i]);
+
+    real *outGradData =
+        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
+    MatrixPtr outGradSub =
+        Matrix::create(outGradData, groups_[i] * M, N, false, useGpu_);
+    for (int g = 0; g < groups_[i]; ++g) {
+      MatrixPtr inMatSub = colBuf_->subMatrix(g * K, K);
+      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
+      MatrixPtr wGradSub = wGradMat->subMatrix(g * M, M);
+      wGradSub->mul(*outG, *(inMatSub->getTranspose()), 1.0, 1.0);
+    }
+  }
+}
+
+void Conv3DLayer::bpropData(int i) {
+  int M = M_[i];
+  int N = N_[i];
+  int K = K_[i];
+  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+  MatrixPtr wMat = weights_[i]->getW();
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  for (int n = 0; n < batchSize; ++n) {
+    real *outGradData =
+        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
+    real *preGradData =
+        getInputGrad(i)->getData() + n * getInputGrad(i)->getStride();
+    MatrixPtr outGradSub =
+        Matrix::create(outGradData, M * groups_[i], N, false, useGpu_);
+    for (int g = 0; g < groups_[i]; ++g) {
+      MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
+      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
+      MatrixPtr inGradMatSub = colBuf_->subMatrix(g * K, K);
+      inGradMatSub->mul(*(wMatSub->getTranspose()), *outG, 1.0, 0.0);
+    }
+    colBuf_->col2Vol(preGradData,
+                     channels_[i],
+                     imgSizeD_[i],
+                     imgSizeH_[i],
+                     imgSizeW_[i],
+                     filterSizeZ_[i],
+                     filterSizeY_[i],
+                     filterSize_[i],
+                     strideZ_[i],
+                     strideY_[i],
+                     stride_[i],
+                     paddingZ_[i],
+                     paddingY_[i],
+                     padding_[i],
+                     1.0,
+                     1.0);
+  }
+}
+
+void Conv3DLayer::bpropBiases() {
+  MatrixPtr outGradMat = getOutputGrad();
+  if (this->sharedBiases_) {
+    biases_->getWGrad()->collectSharedBias(*outGradMat, 1.0f);
+  } else {
+    biases_->getWGrad()->collectBias(*outGradMat, 1.0f);
+  }
+}
+
+void Conv3DLayer::addBias() {
+  MatrixPtr outMat = getOutputValue();
+  if (this->sharedBiases_) {
+    outMat->addSharedBias(*(biases_->getW()), 1.0f);
+  } else {
+    outMat->addBias(*(biases_->getW()), 1.0f);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/Conv3DLayer.h b/paddle/gserver/layers/Conv3DLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..b622508d0ce1b0938c44f5c7f1371a34c86b2c1d
--- /dev/null
+++ b/paddle/gserver/layers/Conv3DLayer.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "ConvBaseLayer.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of convolution layer.
+ * This layer expands input and use matrix multiplication to
+ * calculate convolution operation.
+ */
+class Conv3DLayer : public ConvBaseLayer {
+public:
+  explicit Conv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
+  ~Conv3DLayer() {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+  void addBias();
+  void backward(const UpdateCallback& callback);
+  void bpropBiases();
+  void bpropData(int i);
+  void bpropWeights(int i);
+  size_t getSize();
+
+protected:
+  // Per-input dimensions of the individual gemms.
+  IntV M_;  /// numFilters_ / groups (rows of one group's weight block)
+  IntV N_;  /// outputD_ * outputH_ * outputW_
+  IntV K_;  /// filterChannels_ * filterSizeZ_ * filterSizeY_ * filterSize_
+  MatrixPtr colBuf_;  /// vol2Col / col2Vol buffer, (K * groups) x N
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvBaseLayer.cpp b/paddle/gserver/layers/ConvBaseLayer.cpp
index a5328ef8343e1050352fc48530e041fb6ce12a8b..b848ab6bdd44f8fe81cbbf63b35a321599fd93fe 100644
--- a/paddle/gserver/layers/ConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ConvBaseLayer.cpp
@@ -38,7 +38,6 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
     strideY_.push_back(conf.stride_y());
     dilationY_.push_back(conf.dilation_y());
     filterSizeY_.push_back(conf.filter_size_y());
-    filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
     channels_.push_back(conf.channels());
     imgSizeH_.push_back(conf.has_img_size_y() ? conf.img_size_y()
                                               : conf.img_size());
@@ -47,31 +46,20 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
     filterChannels_.push_back(conf.filter_channels());
     outputH_.push_back(conf.has_output_y() ? conf.output_y() : conf.output_x());
     outputW_.push_back(conf.output_x());
+
+    paddingZ_.push_back(conf.padding_z());
+    strideZ_.push_back(conf.stride_z());
+    filterSizeZ_.push_back(conf.filter_size_z());
+    imgSizeD_.push_back(conf.img_size_z());
+    outputD_.push_back(conf.output_z());
+    filterPixels_.push_back(filterSize_.back() * filterSizeY_.back() *
+                            filterSizeZ_.back());
   }
 
   CHECK(inputLayers_.size() == parameters_.size());
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    size_t height, width;
-    height = filterPixels_[i] * filterChannels_[i];
-    width = (!isDeconv_) ? numFilters_ : channels_[i];
-
-    // create a new weight
-    CHECK_EQ(parameters_[i]->getSize(), width * height);
-    Weight* w = new Weight(height, width, parameters_[i]);
-    weights_.emplace_back(w);
-  }
 
-  /* initialize the biases_ */
-  if (biasParameter_.get()) {
-    if (sharedBiases_) {
-      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
-    } else {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
-    }
-  }
+  // create new weights_ in derived class
+  // create new biases_ in derived class
 
   // default caffe model
   caffeMode_ = true;
diff --git a/paddle/gserver/layers/ConvBaseLayer.h b/paddle/gserver/layers/ConvBaseLayer.h
index 223bce8e296d748c8e17eb105aa67e8a1c1219b6..ccd170d9d85f573dff7340c26b2038c17a548471 100644
--- a/paddle/gserver/layers/ConvBaseLayer.h
+++ b/paddle/gserver/layers/ConvBaseLayer.h
@@ -62,6 +62,13 @@ protected:
   IntV outputH_;
   /// The spatial dimensions of output feature map width.
   IntV outputW_;
+
+  IntV outputD_;
+  IntV imgSizeD_;
+  IntV filterSizeZ_;
+  IntV strideZ_;
+  IntV paddingZ_;
+
   /// Group size, refer to grouped convolution in
   /// Alex Krizhevsky's paper: when group=2, the first half of the
   /// filters are only connected to the first half of the input channels,
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index 6bfdea3c6e3f7cb80b620564f8229d954d773f04..ce071323ff585d28c9eaf80fec9be2394be526d1 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -572,13 +572,8 @@ void MultiBinaryLabelCrossEntropy::backwardImp(Matrix& output,
   }
 }
 
-//
-// Huber loss for robust 2-classes classification
-//
-REGISTER_LAYER(huber, HuberTwoClass);
-
-bool HuberTwoClass::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
+bool HuberCost::init(const LayerMap& layerMap,
+                     const ParameterMap& parameterMap) {
   CostLayer::init(layerMap, parameterMap);
   if (useGpu_) {
     tmpCpuInput_.reserve(inputLayers_.size());
@@ -589,7 +584,7 @@ bool HuberTwoClass::init(const LayerMap& layerMap,
   return true;
 }
 
-void HuberTwoClass::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
+void HuberCost::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
   if (useGpu_) {
     for (size_t i = 0; i < inputLayers_.size(); i++) {
       tmpCpuInput_[i].resizeAndCopyFrom(
@@ -597,13 +592,87 @@ void HuberTwoClass::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
     }
     hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
-  forwardImpIn(output, label, cost);
 }
 
-void HuberTwoClass::forwardImpIn(Matrix& output,
-                                 Argument& label,
-                                 Matrix& target) {
+//
+// Huber loss for robust regression.
+//
+REGISTER_LAYER(huber_regression, HuberRegressionLoss);
+
+bool HuberRegressionLoss::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  if (!HuberCost::init(layerMap, parameterMap)) return false;
+  delta_ = config_.delta();  // threshold between quadratic and linear loss
+  return true;
+}
+
+void HuberRegressionLoss::forwardImp(Matrix& output,
+                                     Argument& label,
+                                     Matrix& target) {
+  HuberCost::forwardImp(output, label, target);
+  size_t numSamples = target.getHeight();
+  size_t dim = output.getWidth();
+  CHECK(label.value);
+  CHECK_EQ((*label.value).getHeight(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(dim, (*label.value).getWidth());
+  CHECK_EQ(target.getWidth(), (size_t)1);
+  // On GPU, read the copies HuberCost::forwardImp placed in tmpCpuInput_.
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  real* lbl =
+      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
+  std::vector<real> cost(numSamples, 0);
+  for (size_t i = 0; i < numSamples; ++i) {
+    for (size_t j = 0; j < dim; ++j) {
+      size_t index = i * dim + j;  // size_t: no narrowing to int
+      real a = std::abs(lbl[index] - out[index]);
+      if (a <= delta_)
+        cost[i] += a * a / 2;  // quadratic region
+      else
+        cost[i] += delta_ * (a - delta_ / 2);  // linear region
+    }
+  }
+  target.copyFrom(cost.data(), numSamples);
+}
+
+void HuberRegressionLoss::backwardImp(Matrix& output,
+                                      Argument& label,
+                                      Matrix& outputG) {
+  size_t numSamples = output.getHeight();
+  size_t dim = output.getWidth();
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  real* lbl =
+      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
+  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
+  for (size_t i = 0; i < numSamples; ++i) {
+    for (size_t j = 0; j < dim; ++j) {
+      size_t index = i * dim + j;  // size_t: no narrowing to int
+      real a = lbl[index] - out[index];
+      if (std::abs(a) <= delta_)
+        grad[index] += -a;  // d/d(out) of 0.5 * (lbl - out)^2
+      else
+        grad[index] += a > 0 ? -delta_ : delta_;  // slope clipped to delta_
+    }
+  }
+  if (useGpu_) outputG.copyFrom(grad, numSamples * dim);
+}
+
+//
+// Huber loss for robust 2-classes classification
+//
+REGISTER_LAYER(huber_classification, HuberTwoClassification);
+
+bool HuberTwoClassification::init(const LayerMap& layerMap,
+                                  const ParameterMap& parameterMap) {
+  return HuberCost::init(layerMap, parameterMap);
+}
+
+void HuberTwoClassification::forwardImp(Matrix& output,
+                                        Argument& label,
+                                        Matrix& target) {
+  HuberCost::forwardImp(output, label, target);
   size_t numSamples = target.getHeight();
+  CHECK(label.ids);
   CHECK_EQ((*label.ids).getSize(), numSamples);
   CHECK_EQ(output.getHeight(), numSamples);
   CHECK_EQ(output.getWidth(), (size_t)1);
@@ -611,47 +680,35 @@ void HuberTwoClass::forwardImpIn(Matrix& output,
 
   real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
   int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
-  std::vector<real> cost(numSamples);
+  std::vector<real> cost(numSamples, 0);
   for (size_t i = 0; i < numSamples; ++i) {
     int y = 2 * lbl[i] - 1;
-    if (out[i] * y < -1)
-      cost[i] = -4 * out[i] * y;
-    else if (out[i] * y < 1)
-      cost[i] = (1 - out[i] * y) * (1 - out[i] * y);
-    else
-      cost[i] = 0;
+    real a = out[i] * y;
+    if (a < -1)
+      cost[i] = -4 * a;
+    else if (a < 1)
+      cost[i] = (1 - a) * (1 - a);
   }
   target.copyFrom(cost.data(), numSamples);
 }
 
-void HuberTwoClass::backwardImp(Matrix& outputValue,
-                                Argument& label,
-                                Matrix& outputGrad) {
-  if (useGpu_) {
-    backwardImpIn(
-        *tmpCpuInput_[0].value, tmpCpuInput_[1], *tmpCpuInput_[0].grad);
-    outputGrad.copyFrom(*tmpCpuInput_[0].grad);
-  } else {
-    backwardImpIn(outputValue, label, outputGrad);
-  }
-}
-
-void HuberTwoClass::backwardImpIn(Matrix& output,
-                                  Argument& label,
-                                  Matrix& outputG) {
+void HuberTwoClassification::backwardImp(Matrix& output,
+                                         Argument& label,
+                                         Matrix& outputG) {
   size_t numSamples = output.getHeight();
-  real* out = output.getData();
-  real* grad = outputG.getData();
-  int* lbl = (*label.ids).getData();
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
+  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
   for (size_t i = 0; i < numSamples; ++i) {
     int y = 2 * lbl[i] - 1;
-    if (y * out[i] < -1)
+    real a = out[i] * y;
+    if (a < -1)
       grad[i] += -4 * y;
-    else if (y * out[i] < 1)
-      grad[i] += -2 * (1 - y * out[i]) * y;
+    else if (a < 1)
+      grad[i] += -2 * (1 - a) * y;
   }
+  if (useGpu_) outputG.copyFrom(grad, numSamples);
 }
-
 /**
  * This cost layer compute the sum of its input as loss.
  * \f[
diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h
index 14c0b33ec1a628521ae2d694dda8da553c29fd38..0f655b48eea051c41ce17c0a41189b26188cc866 100644
--- a/paddle/gserver/layers/CostLayer.h
+++ b/paddle/gserver/layers/CostLayer.h
@@ -304,37 +304,70 @@ public:
                    Matrix& outputGrad) override;
 };
 
-/**
- * Huber loss for robust 2-classes classification.
- *
- * For label={0, 1}, let y=2*label-1. Given output f, the loss is:
- * \f[
- * Loss =
- * \left\{\begin{matrix}
- *  4 * y * f     &   \textit{if}  \ \  y* f < -1 \\
- *  (1 - y * f)^2 &  \textit{if}   \ \  -1 < y * f < 1  \\
- *  0             &                    \textit{otherwise}
- * \end{matrix}\right.
- * \f]
+/*
+ * A base layer for HuberRegressionLoss and HuberTwoClassification.
  */
-class HuberTwoClass : public CostLayer {
+class HuberCost : public CostLayer {
+public:
   std::vector<Argument> tmpCpuInput_;
 
-public:
-  explicit HuberTwoClass(const LayerConfig& config) : CostLayer(config) {}
+  explicit HuberCost(const LayerConfig& config) : CostLayer(config) {}
 
   bool init(const LayerMap& layerMap,
             const ParameterMap& parameterMap) override;
 
   void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
-  void forwardImpIn(Matrix& output, Argument& label, Matrix& cost);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override {}
+};
+
+/**
+ * Huber loss for robust regression.
+ *
+ * Given output f(x), label y and delta, the loss is:
+ * Loss = 0.5 * (y - f)^2, if abs(y - f) <= delta \\
+ * Loss = delta * abs(y - f) - 0.5 * delta^2, otherwise
+ */
+class HuberRegressionLoss : public HuberCost {
+public:
+  explicit HuberRegressionLoss(const LayerConfig& config) : HuberCost(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
   void backwardImp(Matrix& outputValue,
                    Argument& label,
                    Matrix& outputGrad) override;
 
-  void backwardImpIn(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+protected:
+  real delta_;
+};
+
+/**
+ * Huber loss for robust 2-classes classification.
+ *
+ * For label={0, 1}, let y=2*label-1. Given output f(x), the loss is:
+ * Loss = -4 * y * f, if y * f < -1 \\
+ * Loss = (1 - y * f)^2, if -1 <= y * f < 1  \\
+ * Loss = 0, otherwise
+ */
+class HuberTwoClassification : public HuberCost {
+public:
+  explicit HuberTwoClassification(const LayerConfig& config)
+      : HuberCost(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
+
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 };
 
 typedef std::shared_ptr<CostLayer> CostLayerPtr;
diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.cpp b/paddle/gserver/layers/CrossEntropyOverBeam.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..578bdbbe72120abccc63ed13d11e1dec65d41e44
--- /dev/null
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.cpp
@@ -0,0 +1,393 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CrossEntropyOverBeam.h"
+
+namespace paddle {
+
+void CostForOneSequence::calValidExpandStep() {
+  validExpansionCount_ = 0;
+  goldAsExtraPath_ = true;
+
+  for (size_t i = 0; i < beams_->expansionCount; ++i) {
+    real gold = static_cast<real>(beams_->gold[i]);
+    if (i) {
+      real* start = beams_->candidateIds[i - 1]->getData();
+      goldRowIds_[i] = std::count_if(
+          start,
+          start + goldRowIds_[i - 1] * beamSize_ + goldColIds_[i - 1],
+          [](const real& val) { return val != -1.; });
+    } else {
+      goldRowIds_[i] = 0;
+    }
+
+    real* start =
+        beams_->candidateIds[i]->getData() + goldRowIds_[i] * beamSize_;
+    real* findEnd = std::find(start, start + beamSize_, gold);
+    validExpansionCount_++;
+
+    if (start + beamSize_ == findEnd) return;
+    goldColIds_[i] = findEnd - start;
+  }
+  if (goldColIds_[beams_->expansionCount - 1] != -1) goldAsExtraPath_ = false;
+}
+
+size_t CostForOneSequence::initLastExpansion() {
+  int beamId = validExpansionCount_ - 1;
+  const MatrixPtr candidates = beams_->candidateIds[beamId];
+  size_t height = candidates->getHeight();
+
+  /* initialization the last expansion. */
+  size_t pathCount = std::count_if(candidates->getData(),
+                                   candidates->getData() + height * beamSize_,
+                                   [](const real& val) { return val != -1; });
+  /*
+   * if the gold sequence falls off the beam during search, add the gold
+   * sequence as the last path into the all expanded candidates.
+   */
+  if (goldAsExtraPath_) goldIdsInFinalExpansion_ = pathCount++;
+
+  pathRowIdsInEachBeam_.clear();
+  pathRowIdsInEachBeam_.resize(validExpansionCount_,
+                               std::vector<int>(pathCount, 0));
+  parentIdsInBeam_.clear();
+  parentIdsInBeam_.resize(pathCount, 0);
+
+  if (goldAsExtraPath_) {
+    /* add gold sequence into the total expansion. */
+    pathRowIdsInEachBeam_[beamId].back() =
+        beams_->gold[beamId] +
+        getSeqStartPos(beamId, goldRowIds_[validExpansionCount_ - 1]);
+    parentIdsInBeam_.back() = goldRowIds_[validExpansionCount_ - 1];
+  } else {
+    size_t goldOffset = goldRowIds_[beamId] * beamSize_ + goldColIds_[beamId];
+    goldIdsInFinalExpansion_ =
+        std::count_if(candidates->getData(),
+                      candidates->getData() + goldOffset,
+                      [](const real& val) { return val != -1.; });
+  }
+
+  /*
+   * TODO(caoying): fix this, store the indices of selected candidate
+   * paths into Argument.ids
+   */
+  real* ids = candidates->getData();
+  size_t curIdx = 0;
+  for (size_t i = 0; i < height; ++i) {
+    int basePos = getSeqStartPos(beamId, i);
+    for (size_t j = 0; j < beamSize_; ++j) {
+      int id = ids[i * beamSize_ + j];
+      if (id == -1) continue;
+      pathRowIdsInEachBeam_[beamId][curIdx] = id + basePos;
+      parentIdsInBeam_[curIdx++] = i;
+    }
+  }
+  return pathCount;
+}
+
+void CostForOneSequence::constructTotalExpansion() {
+  /*
+   * construct the entire expanded beam by beginning with the last search
+   * in which gold falls off the beam.
+   */
+  size_t totalPathCount = initLastExpansion();
+
+  for (int beamId = validExpansionCount_ - 2; beamId >= 0; --beamId) {
+    const MatrixPtr candidates = beams_->candidateIds[beamId];
+    real* ids = candidates->getData();
+
+    int lastParentIdInBeam = -1;
+    int basePos = -1;
+    for (size_t i = 0;
+         i < (goldAsExtraPath_ ? totalPathCount - 1 : totalPathCount);
+         ++i) {
+      int id = ids[parentIdsInBeam_[i]];
+      int parentRowId = std::div(parentIdsInBeam_[i], beamSize_).quot;
+      if (parentIdsInBeam_[i] != lastParentIdInBeam)
+        basePos = getSeqStartPos(beamId, parentRowId);
+
+      pathRowIdsInEachBeam_[beamId][i] = id + basePos;
+      lastParentIdInBeam = parentIdsInBeam_[i];
+      parentIdsInBeam_[i] = parentRowId;
+
+      if (goldAsExtraPath_)
+        pathRowIdsInEachBeam_[beamId][totalPathCount - 1] =
+            beams_->gold[beamId] + getSeqStartPos(beamId, goldRowIds_[beamId]);
+    }
+  }
+}
+
+real CostForOneSequence::globallyNormalizedScore() {
+  expandedPathScores_.resize(validExpansionCount_);
+
+  Matrix::resizeOrCreate(
+      softmaxOut_, 1, pathRowIdsInEachBeam_[0].size(), false, false);
+  softmaxOut_->zeroMem();
+  MatrixPtr tmp = Matrix::create(
+      softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false);
+
+  for (size_t i = 0; i < validExpansionCount_; ++i) {
+    Matrix::resizeOrCreate(expandedPathScores_[i],
+                           pathRowIdsInEachBeam_[i].size(),
+                           1,
+                           false,
+                           false);
+    expandedPathScores_[i]->zeroMem();
+
+    IVectorPtr rowIds = IVector::create(pathRowIdsInEachBeam_[i].data(),
+                                        pathRowIdsInEachBeam_[i].size(),
+                                        false);
+    expandedPathScores_[i]->selectRows(*(beams_->scores[i]), *rowIds);
+    tmp->add(*expandedPathScores_[i]);
+  }
+
+  softmaxOut_->softmax(*softmaxOut_);
+  return -std::log(softmaxOut_->getData()[goldIdsInFinalExpansion_]);
+}
+
+real CostForOneSequence::forward() {
+  calValidExpandStep();
+  constructTotalExpansion();
+  return globallyNormalizedScore();
+}
+
+void CostForOneSequence::backward() {
+  /*
+   * when softmax layer is the output layer, and it is combined with
+   * cross-entropy as cost, the derivative with regard to softmax's input
+   * is simply:
+   *
+   * grad_i = softmax_out_i - target_i,
+   *
+   * and here hard label is used.
+   */
+  softmaxOut_->getData()[goldIdsInFinalExpansion_] -= 1.;
+
+  MatrixPtr tmp = Matrix::create(
+      softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false);
+
+  for (size_t i = 0; i < validExpansionCount_; ++i) {
+    IVectorPtr rowIds = IVector::create(pathRowIdsInEachBeam_[i].data(),
+                                        pathRowIdsInEachBeam_[i].size(),
+                                        false);
+    /*
+      beams_->scoreGrad[i] has been initialized outside this class; this
+      class only keeps a pointer to the original input gradients, so the
+      memory does not need to be allocated or initialized here.
+    */
+    tmp->addToRows(*beams_->scoreGrad[i], *rowIds);
+  }
+}
+
+REGISTER_LAYER(cross_entropy_over_beam, CrossEntropyOverBeam);
+
+bool CrossEntropyOverBeam::init(const LayerMap& layerMap,
+                                const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(0U, inputLayers_.size() % 3) << "Error input number.";
+
+  beamExpanCount_ = inputLayers_.size() / 3;
+
+  candidateScores_.resize(beamExpanCount_);
+  candidateScoreGrad_.resize(beamExpanCount_);
+
+  candidateInBeam_.resize(beamExpanCount_);
+  goldSequence_.resize(beamExpanCount_);
+  gradToInputs_.resize(beamExpanCount_);
+
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void CrossEntropyOverBeam::checkInputs() {
+  batchSize_ = 0;
+  for (size_t i = 0; i < beamExpanCount_; ++i) {
+    const Argument& scores = getInput(i * 3);
+    const Argument& selCandidates = getInput(i * 3 + 1);
+    const Argument& goldSeq = getInput(i * 3 + 2);
+    // Expansion 0 fixes batchSize_/beamSize_; later ones must agree with it.
+    if (i) {
+      CHECK(scores.hasSubseq()) << "input " << i << " "
+                                << inputLayers_[i * 3]->getName()
+                                << " should be a nested sequence";
+      CHECK_EQ(getInputValue(i * 3 + 1)->getWidth(), beamSize_);
+      CHECK_EQ(batchSize_, static_cast<size_t>(scores.getNumSequences()));
+      CHECK_EQ(scores.getNumSubSequences(), selCandidates.getBatchSize());
+    } else {
+      CHECK(scores.hasSeq()) << "input " << i << " "
+                             << inputLayers_[i * 3]->getName()
+                             << " should be a sequence";
+      batchSize_ = scores.getNumSequences();
+      beamSize_ = getInputValue(i * 3 + 1)->getWidth();
+      CHECK_EQ(batchSize_, static_cast<size_t>(selCandidates.getBatchSize()));
+    }
+    CHECK_EQ(1U, scores.value->getWidth());  // one score per candidate row
+    CHECK_EQ(batchSize_, static_cast<size_t>(goldSeq.getBatchSize()));
+  }
+}
+
+void CrossEntropyOverBeam::copyInputsToCpu() {
+  auto copyValue = [](const MatrixPtr& src, MatrixPtr& trg) {
+    if (dynamic_cast<GpuMatrix*>(src.get())) {
+      Matrix::resizeOrCreate(
+          trg, src->getHeight(), src->getWidth(), false, false);
+      trg->copyFrom(*src);
+    } else {
+      trg = src;  // not a GpuMatrix: share the pointer, no data copy
+    }
+  };
+
+  auto copyIds = [](const IVectorPtr& src, IVectorPtr& trg) {
+    if (dynamic_cast<GpuIVector*>(src.get())) {
+      IVector::resizeOrCreate(trg, src->getSize(), false);
+      trg->copyFrom(*src);
+    } else {
+      trg = src;  // not a GpuIVector: share the pointer, no data copy
+    }
+  };
+  // beamSplitPos_[j][i]: number of beams of expansion i in sequences 0..j.
+  beamSplitPos_.clear();
+  beamSplitPos_.resize(batchSize_, std::vector<int>(beamExpanCount_, 0));
+  for (size_t i = 0; i < beamExpanCount_; ++i) {
+    copyValue(getInputValue(i * 3), candidateScores_[i]);
+    copyValue(getInputValue(i * 3 + 1), candidateInBeam_[i]);
+    copyIds(getInput(i * 3 + 2).ids, goldSequence_[i]);
+
+    if (i) {
+      ICpuGpuVectorPtr seqInfo = getInput(i * 3).sequenceStartPositions;
+      const int* seqStarts = seqInfo->getMutableData(false);
+      ICpuGpuVectorPtr subSeqInfo = getInput(i * 3).subSequenceStartPositions;
+      const int* subSeqStarts = subSeqInfo->getMutableData(false);
+      // Walk the sub-sequences; a shared start marks the next sequence.
+      size_t seqId = 1;
+      for (size_t subSeqId = 0; subSeqId < subSeqInfo->getSize() - 1;
+           ++subSeqId) {
+        CHECK_LT(seqId, seqInfo->getSize());
+        if (subSeqStarts[subSeqId] == seqStarts[seqId]) {
+          beamSplitPos_[seqId][i] = beamSplitPos_[seqId - 1][i];
+          seqId++;
+        }
+        beamSplitPos_[seqId - 1][i]++;
+      }
+    } else {
+      for (size_t j = 0; j < batchSize_; ++j) beamSplitPos_[j][i] = j + 1;
+    }
+  }
+}
+
+void CrossEntropyOverBeam::splitBatchBeams() {
+  beamCosts_.resize(batchSize_);
+  beamPerSeq_.resize(batchSize_, BeamExpansion(beamExpanCount_));
+  // Slice every batch-level input into per-sequence pieces.
+  for (size_t i = 0; i < beamExpanCount_; ++i) {
+    int* seqStarts =
+        getInput(i * 3).sequenceStartPositions->getMutableData(false);
+
+    int* subSeqStarts = nullptr;
+    int maxLen = 0;
+    if (i) {
+      subSeqStarts =
+          getInput(i * 3).subSequenceStartPositions->getMutableData(false);
+      maxLen = getInput(i * 3).subSequenceStartPositions->getSize() - 1;
+    } else {
+      maxLen = getInput(i * 3).sequenceStartPositions->getSize() - 1;
+    }
+
+    for (size_t j = 0; j < batchSize_; ++j) {
+      beamPerSeq_[j].scores[i] =
+          Matrix::create(candidateScores_[i]->getData() + seqStarts[j],
+                         seqStarts[j + 1] - seqStarts[j],
+                         1,
+                         false,
+                         false);
+      beamPerSeq_[j].scoreGrad[i] =
+          Matrix::create(candidateScoreGrad_[i]->getData() + seqStarts[j],
+                         seqStarts[j + 1] - seqStarts[j],
+                         1,
+                         false,
+                         false);
+      // Rows [offset, offset + height) of expansion i belong to sequence j.
+      int offset = j ? beamSplitPos_[j - 1][i] : 0;
+      int height = beamSplitPos_[j][i] - offset;
+      CHECK_GE(maxLen, offset + height);
+      beamPerSeq_[j].seqInfo[i] = IVector::create(
+          (i ? subSeqStarts : seqStarts) + offset, height + 1, false);
+
+      beamPerSeq_[j].candidateIds[i] =
+          Matrix::create(candidateInBeam_[i]->getData() + offset * beamSize_,
+                         height,
+                         beamSize_,
+                         false,
+                         false);
+      beamPerSeq_[j].gold[i] = goldSequence_[i]->getData()[j];
+
+      CHECK_LE(beamPerSeq_[j].gold[i], seqStarts[j + 1] - seqStarts[j]);
+    }
+  }
+}
+
+void CrossEntropyOverBeam::resizeOutput() {
+  Matrix::resizeOrCreate(output_.value, batchSize_, 1, false, false);
+  output_.value->zeroMem();
+  // Give every expansion a zeroed gradient matrix that is not a GpuMatrix.
+  for (size_t i = 0; i < beamExpanCount_; ++i) {
+    MatrixPtr inGrad = getInputGrad(i * 3);
+    if (dynamic_cast<GpuMatrix*>(inGrad.get())) {
+      Matrix::resizeOrCreate(candidateScoreGrad_[i],
+                             inGrad->getHeight(),
+                             inGrad->getWidth(),
+                             false,
+                             false);
+    } else {
+      candidateScoreGrad_[i] = std::move(inGrad);  // share the input gradient
+    }
+    candidateScoreGrad_[i]->zeroMem();  // NOTE(review): zeroes a shared grad too
+  }
+}
+
+void CrossEntropyOverBeam::copyGradToGpu(size_t copyCount) {
+  // Copy the gradients of the first copyCount expansions back to the inputs
+  // whose gradient is a GpuMatrix; copyCount == 0 copies nothing.
+  for (size_t i = 0; i < beamExpanCount_ && i < copyCount; ++i) {
+    if (dynamic_cast<GpuMatrix*>(getInputGrad(i * 3).get()))
+      getInputGrad(i * 3)->copyFrom(*candidateScoreGrad_[i]);
+  }
+}
+
+void CrossEntropyOverBeam::forward(PassType passType) {
+  Layer::forward(passType);
+  // Validate the inputs, then make them available as non-GPU matrices.
+  checkInputs();
+  copyInputsToCpu();
+
+  resizeOutput();
+  splitBatchBeams();
+  // The cost of sequence i is written to row i of the output.
+  MatrixPtr outputValue = getOutputValue();
+  for (size_t i = 0; i < batchSize_; ++i) {
+    BeamExpansionPtr ptr = std::make_shared<BeamExpansion>(beamPerSeq_[i]);
+    beamCosts_[i].setData(std::move(ptr), beamSize_);
+    outputValue->getData()[i] = beamCosts_[i].forward();
+  }
+}
+
+void CrossEntropyOverBeam::backward(const UpdateCallback& callback) {
+  for (size_t i = 0; i < batchSize_; ++i) {  // `callback` is not used here
+    beamCosts_[i].backward();
+    copyGradToGpu(beamCosts_[i].getValidExpansionCount());  // GPU grads only
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.h b/paddle/gserver/layers/CrossEntropyOverBeam.h
new file mode 100644
index 0000000000000000000000000000000000000000..5643556f43370912a730d9895658d8944f50dced
--- /dev/null
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.h
@@ -0,0 +1,135 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "CrossEntropyOverBeam.h"
+#include "Layer.h"
+
+namespace paddle {
+
+/* Holds the beams of every search step (expansion) of one single sequence;
+   entry i of each vector below belongs to expansion i. */
+struct BeamExpansion {
+  std::vector<MatrixPtr> scores;
+  std::vector<IVectorPtr> seqInfo;
+
+  std::vector<MatrixPtr> candidateIds;
+  std::vector<int> gold;
+
+  std::vector<MatrixPtr> scoreGrad;
+
+  size_t expansionCount;
+
+  /* Every per-expansion container gets n default-constructed entries. */
+  explicit BeamExpansion(int n)
+      : scores(n),
+        seqInfo(n),
+        candidateIds(n),
+        gold(n),
+        scoreGrad(n),
+        expansionCount(n) {}
+};
+typedef std::shared_ptr<BeamExpansion> BeamExpansionPtr;
+
+class CostForOneSequence {
+public:
+  CostForOneSequence()
+      : beamSize_(0), validExpansionCount_(0), goldAsExtraPath_(false) {}
+  void setData(const BeamExpansionPtr bPtr, size_t beamSize) {
+    beams_ = bPtr;
+    beamSize_ = beamSize;
+    // Drop the per-expansion state left over from the previous beams.
+    expandedPathScores_.clear();
+    expandedPathScores_.resize(beams_->expansionCount);
+
+    goldRowIds_.clear();
+    goldRowIds_.resize(beams_->expansionCount, 0);
+    goldColIds_.clear();
+    goldColIds_.resize(beams_->expansionCount, -1);
+  }
+  size_t getValidExpansionCount() const { return validExpansionCount_; }
+  // forward() returns the cost of the sequence handed over by setData().
+  real forward();
+  void backward();
+
+private:
+  void calValidExpandStep();
+  void constructTotalExpansion();
+  size_t initLastExpansion();
+  real globallyNormalizedScore();
+  // Start of row rowId in expansion beamId, relative to the first start.
+  int getSeqStartPos(size_t beamId, size_t rowId) {
+    CHECK_GT(beams_->seqInfo[beamId]->getSize() - 1, rowId);
+    int* starts = beams_->seqInfo[beamId]->getData();
+    return starts[rowId] - starts[0];
+  }
+
+  size_t beamSize_;
+  size_t validExpansionCount_;
+  bool goldAsExtraPath_;
+  std::vector<int> goldRowIds_;
+  std::vector<int> goldColIds_;
+
+  BeamExpansionPtr beams_;
+  std::vector<std::vector<int>> pathRowIdsInEachBeam_;
+  std::vector<int> parentIdsInBeam_;
+  size_t goldIdsInFinalExpansion_ = 0;
+
+  std::vector<MatrixPtr> expandedPathScores_;
+
+  MatrixPtr softmaxOut_;
+};
+
+class CrossEntropyOverBeam : public Layer {
+public:
+  explicit CrossEntropyOverBeam(const LayerConfig& config) : Layer(config) {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+
+private:
+  void checkInputs();
+  void copyInputsToCpu();
+  void resizeOutput();
+  void copyGradToGpu(size_t copyCount);
+  void splitBatchBeams();
+
+  size_t beamExpanCount_;  // number of beam expansions: input count / 3
+  size_t batchSize_;       // number of sequences in the current batch
+  size_t beamSize_;        // width of the selected-candidates input
+
+  /*
+   * The process of constructing beams is not GPU friendly, so this layer
+   * currently runs on CPU only. Any input that lives in GPU memory is copied
+   * to CPU memory first.
+   */
+  std::vector<MatrixPtr> candidateScores_;
+  std::vector<MatrixPtr> candidateScoreGrad_;
+  std::vector<MatrixPtr> candidateInBeam_;
+  std::vector<MatrixPtr> gradToInputs_;
+  std::vector<IVectorPtr> goldSequence_;
+  std::vector<std::vector<int>> beamSplitPos_;
+
+  /*
+   * The entire batch of beams is split into one beam expansion per sequence,
+   * and the result is stored in this member.
+   */
+  std::vector<BeamExpansion> beamPerSeq_;
+  /* beamCosts_ is used to propagate error in one sequence. */
+  std::vector<CostForOneSequence> beamCosts_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/CudnnConvBaseLayer.cpp b/paddle/gserver/layers/CudnnConvBaseLayer.cpp
index c056bbe4d1d354751d4f85f8d0743cf30486c087..9e954615cddf2566ea336d1c947985fd916e8cc4 100644
--- a/paddle/gserver/layers/CudnnConvBaseLayer.cpp
+++ b/paddle/gserver/layers/CudnnConvBaseLayer.cpp
@@ -46,8 +46,26 @@ bool CudnnConvBaseLayer::init(const LayerMap &layerMap,
     projConf_.emplace_back(conf);
     projections_.emplace_back(
         Projection::create(*projConf_[i], parameters_[i], useGpu_));
+
+    // create a new weight
+    size_t height, width;
+    height = filterPixels_[i] * filterChannels_[i];
+    width = (!isDeconv_) ? numFilters_ : channels_[i];
+    CHECK_EQ(parameters_[i]->getSize(), width * height);
+    Weight *w = new Weight(height, width, parameters_[i]);
+    weights_.emplace_back(w);
   }
 
+  if (biasParameter_.get()) {
+    if (sharedBiases_) {
+      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
+    } else {
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
+    }
+  }
   if (biases_.get() && sharedBiases_) {
     hl_create_tensor_descriptor(&biasDesc_);
     hl_create_tensor_descriptor(&outputDesc_);
diff --git a/paddle/gserver/layers/DeConv3DLayer.cpp b/paddle/gserver/layers/DeConv3DLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7d5c772c89d260264a59f4cc4439bb8a44c605a4
--- /dev/null
+++ b/paddle/gserver/layers/DeConv3DLayer.cpp
@@ -0,0 +1,212 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DeConv3DLayer.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(deconv3d, DeConv3DLayer);
+
+bool DeConv3DLayer::init(const LayerMap &layerMap,
+                         const ParameterMap &parameterMap) {
+  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
+  // For deconvolution, the dimension of the kernel is
+  // channel * output * depth * height * width.
+  // Matrix storage format: (output * depth * height * width) x channel
+  for (int index = 0; index < config_.inputs().size(); ++index) {
+    M_.push_back(filterChannels_[index]);
+    K_.push_back(filterPixels_[index] * (numFilters_ / groups_[index]));
+
+    // create a new weight
+    size_t height, width;
+    height = filterPixels_[index] * numFilters_;
+    width = filterChannels_[index];
+    CHECK_EQ(parameters_[index]->getSize(), width * height);
+    Weight *w = new Weight(height, width, parameters_[index]);
+    weights_.emplace_back(w);
+  }
+  if (biasParameter_.get()) {
+    if (sharedBiases_) {  // shared: numFilters_ bias values in total
+      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(1, numFilters_, biasParameter_));
+    } else {  // not shared: getSize() bias values in total
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+    }
+  }
+  return true;
+}
+
+size_t DeConv3DLayer::getSize() {
+  CHECK_NE(inputLayers_.size(), 0UL);
+  outputH_.clear();
+  outputW_.clear();
+  outputD_.clear();
+  N_.clear();
+  NOut_.clear();
+  size_t layerSize = 0;  // NOTE(review): accumulated over all inputs below
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    outputW_.push_back(
+        imageSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i], true));
+    outputH_.push_back(imageSize(
+        imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
+    outputD_.push_back(imageSize(
+        imgSizeD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
+    NOut_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);  // output volume
+    N_.push_back(imgSizeD_[i] * imgSizeH_[i] * imgSizeW_[i]);  // input volume
+    CHECK(layerSize == 0 || NOut_[i] * size_t(numFilters_) == layerSize);
+    layerSize += NOut_[i] * numFilters_;
+  }
+  getOutput().setFrameHeight(outputH_[0]);
+  getOutput().setFrameWidth(outputW_[0]);
+  getOutput().setFrameDepth(outputD_[0]);
+  return layerSize;
+}
+
+void DeConv3DLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  int outWidth = getSize();  // also refreshes N_, NOut_ and the output dims
+  resetOutput(batchSize, outWidth);
+  const MatrixPtr outMat = getOutputValue();
+  // Per input and sample: colBuf_ = W * input (group by group), then col2Vol.
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str());
+    const MatrixPtr &inMat = getInputValue(i);
+    int M = M_[i];
+    int N = N_[i];
+    int K = K_[i];
+    MatrixPtr wMat = weights_[i]->getW();
+    Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+    for (int n = 0; n < batchSize; ++n) {
+      real *inData = inMat->getData() + n * inMat->getStride();
+      for (int g = 0; g < groups_[i]; ++g) {
+        MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_);
+        MatrixPtr wMatSub = wMat->subMatrix(g * K, K);
+        MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K);
+        colBufDataSub->mul(*wMatSub, *inMatSub, 1.0, 0.0);
+        inData += M * N;  // advance to the input slice of the next group
+      }
+      colBuf_->col2Vol(outMat->getData() + n * outMat->getStride(),
+                       numFilters_,
+                       outputD_[i],
+                       outputH_[i],
+                       outputW_[i],
+                       filterSizeZ_[i],
+                       filterSizeY_[i],
+                       filterSize_[i],
+                       strideZ_[i],
+                       strideY_[i],
+                       stride_[i],
+                       paddingZ_[i],
+                       paddingY_[i],
+                       padding_[i],
+                       1.0,
+                       1.0);
+    }
+  }
+  if (nullptr != this->biasParameter_) {
+    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
+    this->addBias();
+  }
+  forwardActivation();
+}
+
+void DeConv3DLayer::backward(const UpdateCallback &callback) {
+  backwardActivation();
+  int batchSize = getOutputGrad()->getHeight();
+  if (biases_ && biases_->getWGrad()) {  // bias gradient
+    bpropBiases();
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    if (weights_[i]->getWGrad() || this->needGradient_) {
+      int M = M_[i];
+      int N = N_[i];
+      int K = K_[i];
+      REGISTER_TIMER_INFO("BwdDeConv3D", getName().c_str());
+      Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+      const MatrixPtr &inMat = getInputValue(i);
+      for (int n = 0; n < batchSize; ++n) {
+        colBuf_->vol2Col(  // fed with the output gradient of sample n
+            getOutputGrad()->getData() + n * getOutputGrad()->getStride(),
+            numFilters_,
+            outputD_[i],
+            outputH_[i],
+            outputW_[i],
+            filterSizeZ_[i],
+            filterSizeY_[i],
+            filterSize_[i],
+            strideZ_[i],
+            strideY_[i],
+            stride_[i],
+            paddingZ_[i],
+            paddingY_[i],
+            padding_[i]);
+        if (weights_[i]->getWGrad()) {  // weight gradient, group by group
+          real *inData = inMat->getData() + n * inMat->getStride();
+          for (int g = 0; g < groups_[i]; ++g) {
+            MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K);
+            MatrixPtr wGradMatSub =
+                weights_[i]->getWGrad()->subMatrix(g * K, K);
+            MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_);
+            wGradMatSub->mul(
+                *colBufDataSub, *(inMatSub->getTranspose()), 1.0, 1.0);
+            inData += M * N;
+          }
+        }
+        if (getInputGrad(i)) {  // input gradient, group by group
+          real *preGrad =
+              getInputGrad(i)->getData() + n * getInputGrad(i)->getStride();
+          for (int g = 0; g < groups_[i]; ++g) {
+            MatrixPtr w = weights_[i]->getW()->subMatrix(g * K, K);
+            MatrixPtr outGradMat = colBuf_->subMatrix(g * K, K);
+            MatrixPtr inGradMatSub =
+                Matrix::create(preGrad, M, N, false, useGpu_);
+            inGradMatSub->mul(*(w->getTranspose()), *outGradMat, 1.0, 1.0);
+            preGrad += M * N;
+          }
+        }
+      }
+      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+      weights_[i]->getParameterPtr()->incUpdate(callback);
+    }
+  }
+}
+void DeConv3DLayer::bpropWeights(int i) {}  // no-op: done inside backward()
+void DeConv3DLayer::bpropData(int i) {}     // no-op: done inside backward()
+
+void DeConv3DLayer::bpropBiases() {
+  const MatrixPtr &outGradMat = getOutputGrad();
+  // Collect the bias gradient from the output gradient.
+  if (this->sharedBiases_) {
+    biases_->getWGrad()->collectSharedBias(*outGradMat, 1.0f);
+  } else {
+    biases_->getWGrad()->collectBias(*outGradMat, 1.0f);
+  }
+}
+
+void DeConv3DLayer::addBias() {
+  MatrixPtr outMat = getOutputValue();  // the bias is added to the output
+  if (this->sharedBiases_) {
+    outMat->addSharedBias(*(biases_->getW()), 1.0f);
+  } else {
+    outMat->addBias(*(biases_->getW()), 1.0f);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/DeConv3DLayer.h b/paddle/gserver/layers/DeConv3DLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..a2a3d3f8273ed77065224c27df6f711f09f34bbc
--- /dev/null
+++ b/paddle/gserver/layers/DeConv3DLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "ConvBaseLayer.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief A 3D deconvolution layer derived from ConvBaseLayer.
+ * It multiplies the filter with the input and uses col2Vol to write the
+ * product into the output volume.
+ */
+class DeConv3DLayer : public ConvBaseLayer {
+public:
+  explicit DeConv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
+  ~DeConv3DLayer() {}
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+  void addBias();
+  void backward(const UpdateCallback& callback);
+  void bpropBiases();
+  void bpropData(int i);
+  void bpropWeights(int i);
+  size_t getSize();
+
+protected:
+  // Figure out the dimensions for individual gemms.
+  IntV M_;  /// filterChannels_
+  IntV N_;  /// imgSizeD_ * imgSizeH_ * imgSizeW_ (input volume)
+  IntV K_;  /// filterPixels_ * (numFilters_ / groups_)
+  IntV NOut_;  /// outputD_ * outputH_ * outputW_ (output volume)
+  MatrixPtr colBuf_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.cpp b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
index 77736e78f9349c0393e1e53ac700817a70893e53..2b7bef0a757d7c706be3815c539b036b094596cf 100644
--- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
@@ -22,12 +22,31 @@ bool ExpandConvBaseLayer::init(const LayerMap &layerMap,
   /* Initialize the basic convolutional parent class */
   ConvBaseLayer::init(layerMap, parameterMap);
 
+  int index = 0;
   for (auto &inputConfig : config_.inputs()) {
     const ConvConfig &conf = inputConfig.conv_conf();
     /* Consistent caffe mode for multiple input */
     caffeMode_ = conf.caffe_mode();
-  }
 
+    // create a new weight
+    size_t height, width;
+    height = filterPixels_[index] * filterChannels_[index];
+    width = (!isDeconv_) ? numFilters_ : channels_[index];
+    CHECK_EQ(parameters_[index]->getSize(), width * height);
+    Weight *w = new Weight(height, width, parameters_[index]);
+    weights_.emplace_back(w);
+    index++;
+  }
+  if (biasParameter_.get()) {
+    if (sharedBiases_) {
+      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
+    } else {
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
+    }
+  }
   getOutputSize();
 
   return true;
diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp
index 0ece2799318ea5ecc91f97f71289d4d07246dcaa..20de475fc3f6b6f3c05ac26bea8363daff0cf110 100644
--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@@ -29,6 +29,10 @@ namespace paddle {
 REGISTER_LAYER(exconv, ExpandConvLayer);
 REGISTER_LAYER(exconvt, ExpandConvLayer);
 
+inline bool isDepthwiseConv(int channels, int groups) {
+  return channels == groups;  // depthwise: as many groups as channels
+}
+
 bool ExpandConvLayer::init(const LayerMap &layerMap,
                            const ParameterMap &parameterMap) {
   /* Initialize the basic convolutional parent class */
@@ -47,14 +51,27 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
     std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
     std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
 
-    if (useGpu_ && (size_t)groups_[i] == (size_t)channels_[i] && !isDeconv_) {
+    // Convolution Layer uses the GemmConv function by default.
+    convType = "GemmConv";
+    convGradInputType = "GemmConvGradInput";
+    convGradFilterType = "GemmConvGradFilter";
+
+    // If depth wise convolution and useGpu == true
+    if (useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
       convType = "DepthwiseConv";
       convGradInputType = "DepthwiseConvGradInput";
       convGradFilterType = "DepthwiseConvGradFilter";
-    } else {
-      convType = "GemmConv";
-      convGradInputType = "GemmConvGradInput";
-      convGradFilterType = "GemmConvGradFilter";
+    }
+
+    // If depth wise convolution and useGpu == false and ARM-NEON
+    if (!useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+      if ((filterSize_[i] == filterSizeY_[i]) &&
+          (filterSize_[i] == 3 || filterSize_[i] == 4) &&
+          (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2)) {
+        convType = "NeonDepthwiseConv";
+      }
+#endif
     }
 
     if (FLAGS_use_nnpack && !isDeconv_) {
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index d5621412caee843e24a0d0c9b7096402765738c7..2bc20eee6c452d0943dbf43b17ebe77976c97489 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -41,7 +41,7 @@ namespace paddle {
 Layer::Layer(const LayerConfig& config, bool useGpu)
     : config_(config),
       useGpu_(useGpu),
-      deviceId_(-1),
+      deviceId_(CPU_DEVICE),
       needSequenceInfo_(true) {}
 
 bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
index 0ed482889d0cea884db3759620088575c5b10201..edef36194aabdb9c122ec3423deb036169a34d7c 100644
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -59,7 +59,12 @@ protected:
   LayerConfig config_;
   /// whether to use GPU
   bool useGpu_;
-  /// Device Id. CPU is -1, and GPU is 0, 1, 2 ...
+  /// Paddle device ID, MKLDNN is -2, CPU is -1
+  enum PADDLE_DEVICE_ID {
+    MKLDNN_DEVICE = -2,
+    CPU_DEVICE = -1,
+  };
+  /// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ...
   int deviceId_;
   /// Input layers
   std::vector<LayerPtr> inputLayers_;
@@ -77,6 +82,7 @@ protected:
   Argument output_;
   /// Several outputs stored on different devices, used in 'parallel_nn' case,
   /// and record them by deviceId_.
+  /// Also used in 'use_mkldnn' case.
   std::vector<Argument> outputOtherDevice_;
   /// If there are several outputs, map them by each name.
   std::map<std::string, Argument*> outputMap_;
@@ -172,6 +178,13 @@ protected:
     return inputLayer.getOutput(deviceId_);
   }
 
+  /**
+   * Get the argument of input layer with deviceId.
+   */
+  const Argument& getInput(size_t inputIndex, int deviceId) const {
+    return inputLayers_[inputIndex]->getOutput(deviceId);
+  }
+
   /**
    * Get the forward-input value.
    */
@@ -186,6 +199,13 @@ protected:
     return inputLayer.getOutput(deviceId_).value;
   }
 
+  /**
+   * Get the forward-input value with deviceId.
+   */
+  const MatrixPtr& getInputValue(int inputIndex, int deviceId) {
+    return inputLayers_[inputIndex]->getOutput(deviceId).value;
+  }
+
   /**
    * Get the forward-input grad.
    */
@@ -200,6 +220,13 @@ protected:
     return inputLayer.getOutput(deviceId_).grad;
   }
 
+  /**
+   * Get the forward-input grad with deviceId.
+   */
+  const MatrixPtr& getInputGrad(int inputIndex, int deviceId) {
+    return inputLayers_[inputIndex]->getOutput(deviceId).grad;
+  }
+
   /**
    * Get the forward-input label.
    */
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index d201fac65e0459050304195140e1aae081468f43..8318c8c519a4cec1610eadd28320ee5ce0b4147d 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -61,43 +61,42 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
     return;
   }
 
-  // TODO(TJ): dst format should get from wgtVal_
-  int dstFmt = PARAM_FORMAT_MKLDNN_OI;
-  int srcFmt = weight_->getParameterPtr()->getHeaderFormat();
-  if (srcFmt == dstFmt) {
-    return;
-  }
-
-  // The weight_ is transposed from initial paddle weight
-  MatrixPtr paddleWgt = Matrix::create(
-      weight_->getW()->getData(), iLayerSize_, oc_, false, false);
-
-  // TODO(TJ): remove this print when do not need differ weights
-  std::ostringstream ostr;
-  paddleWgt->print(ostr);
-  VLOG(MKLDNN_ALL) << "Initial Weight from paddle: " << std::endl << ostr.str();
-
-  // The mkldnn weight is transposed from initial paddle matrix
-  MatrixPtr paddleWgtT;
-  paddleWgt->transpose(paddleWgtT, true);
-  weight_->getW()->copyFrom(*paddleWgtT);
-  weight_->getParameterPtr()->setHeaderFormat(dstFmt);
+  CHECK(wgtVal_) << "should have been initialized";
+  bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
+  auto targetDim = wgtVal_->getDims();
+  auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
+  wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
   hasInitedWgt_ = true;
 }
 
 void MKLDNNFcLayer::convertWeightsToPaddle() {
-  MatrixPtr dnnWgt = weight_->getW();
-  MatrixPtr paddleWgt;
-  dnnWgt->transpose(paddleWgt, true);
-
-  // copy paddle weight and override on weight_
-  MatrixPtr dnnWgtT = Matrix::create(
-      dnnWgt->getData(), dnnWgt->getWidth(), dnnWgt->getHeight(), false, false);
-  dnnWgtT->copyFrom(*paddleWgt);
+  CHECK(wgtVal_) << "should have been initialized";
+  bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
+  auto targetDim = wgtVal_->getDims();
+  auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
+  wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
+}
+
+void MKLDNNFcLayer::convertOutputToOtherDevice() {
+  copyOutputInfoToOtherDevice();
+  // Find the outputs kept for the CPU device and hand the result to them.
+  int cnt = 0;
+  for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+    if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
+      // The FC output value needs no conversion for the CPU device,
+      // so just share the pointer.
+      outputOtherDevice_[i].value = output_.value;
+      ++cnt;
+    }
+  }
+
+  if (cnt > 1) {
+    LOG(WARNING) << "should not have more than one CPU devie";
+  }
+}
 
 void MKLDNNFcLayer::reshape() {
-  const Argument& input = getInput(0);
+  const Argument& input = getInput(0, getPrev(0)->getDeviceId());
   int batchSize = input.getBatchSize();
   if (bs_ == batchSize) {
     return;
@@ -111,10 +110,6 @@ void MKLDNNFcLayer::reshape() {
   if (iw_ == 0) {
     iw_ = 1;
   }
-  hasSpatial_ = true;
-  if (ih_ == 1 && iw_ == 1) {
-    hasSpatial_ = false;
-  }
   CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
   ic_ = iLayerSize_ / (ih_ * iw_);
   CHECK_EQ(size_t(ic_ * ih_ * iw_), iLayerSize_) << "not divisible";
@@ -135,37 +130,53 @@ void MKLDNNFcLayer::reshape() {
 
 void MKLDNNFcLayer::resetFwd() {
   bool hasBias = biases_ && biases_->getW();
-  real* iData = getInputValue(0)->getData();
-  real* oData = getOutputValue()->getData();
-  real* wData = weight_->getW()->getData();
-  real* bData = hasBias ? biases_->getW()->getData() : NULL;
-
-  // TODO(TJ): below create should be covered in MkldnnMatrix
-  // create memory desc
-  memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw)
-                                 : createMD({bs_, ic_}, format::nc);
-  memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
-                                 : createMD({oc_, ic_}, format::oi);
-  memory::desc bMD = bData != NULL ? createMD({oc_}, format::x)
-                                   : createMD({}, format::format_undef);
-  memory::desc oMD = createMD({bs_, oc_}, format::nc);
-
-  // create memory primitive desc and memory self
-  inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData));
-  wgtVal_.reset(new memory(memory::primitive_desc(wMD, engine_), wData));
-  outVal_.reset(new memory(memory::primitive_desc(oMD, engine_), oData));
+  const MatrixPtr& wgt = weight_->getW();
+  const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr;
+  const MatrixPtr& out = output_.value;
+
+  if (inputIsOnlyMKLDNN()) {  // previous layer is MKLDNN: reuse its matrix
+    const MatrixPtr& in = getInputValue(0);
+    inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in);
+    CHECK(inVal_) << "Input should be MKLDNNMatrix";
+  } else {  // previous layer is on CPU: wrap its value with nchw format
+    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
+    const MatrixPtr& in = getInputValue(0, CPU_DEVICE);
+    inVal_ = MKLDNNMatrix::create(
+        in, memory::dims{bs_, ic_, ih_, iw_}, format::nchw, engine_);
+  }
+  inVal_->downSpatial();  // NOTE(review): presumably folds 1x1 h/w - confirm
+  wgtVal_ = MKLDNNMatrix::create(
+      wgt, memory::dims{oc_, ic_, ih_, iw_}, format::oihw, engine_);
+  wgtVal_->downSpatial();
+  biasVal_ =
+      hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr;
+  outVal_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_);
+
+  // expose the MKLDNNMatrix built above as this layer's output value
+  output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
+  if (!outputIsOnlyMKLDNN()) {
+    convertOutputToOtherDevice();
+  }
 
+  // build the forward primitive (fc_fwd) from the memory descriptors above
   prop_kind pk = prop_kind::forward;
-  fc_fwd::desc fwdDesc = bData != NULL ? fc_fwd::desc(pk, iMD, wMD, bMD, oMD)
-                                       : fc_fwd::desc(pk, iMD, wMD, oMD);
+  fc_fwd::desc fwdDesc = hasBias ? fc_fwd::desc(pk,
+                                                inVal_->getMemoryDesc(),
+                                                wgtVal_->getMemoryDesc(),
+                                                biasVal_->getMemoryDesc(),
+                                                outVal_->getMemoryDesc())
+                                 : fc_fwd::desc(pk,
+                                                inVal_->getMemoryDesc(),
+                                                wgtVal_->getMemoryDesc(),
+                                                outVal_->getMemoryDesc());
   fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
-
-  if (bData != NULL) {
-    biasVal_.reset(new memory(memory::primitive_desc(bMD, engine_), bData));
+  if (hasBias) {
     fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_));
   } else {
     fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_));
   }
+  printValueFormatFlow();  // logs the inVal_ >>> outVal_ formats
+
   pipelineFwd_.clear();
   pipelineFwd_.push_back(*fwd_);
 }
@@ -175,45 +186,46 @@ void MKLDNNFcLayer::resetBwd() {
     return;
   }
   needResetBwd_ = false;
-
   bool hasBias = biases_ && biases_->getWGrad();
-  real* iData = getInputValue(0)->getData();
-  real* iDiff = getInputGrad(0) != nullptr ? getInputGrad(0)->getData() : NULL;
-  real* oDiff = getOutputGrad()->getData();
-  real* wDiff = weight_->getWGrad()->getData();
-  real* bDiff = hasBias ? biases_->getWGrad()->getData() : NULL;
 
   /// backward weight
-  // create memory desc for backward memory
-  memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw)
-                                 : createMD({bs_, ic_}, format::nc);
-  memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
-                                 : createMD({oc_, ic_}, format::oi);
-  memory::desc oMD = createMD({bs_, oc_}, format::nc);
-  memory::desc bMD = bDiff != NULL ? createMD({oc_}, format::x)
-                                   : createMD({}, format::format_undef);
-
-  if (inVal_) {
-    // update data
-    inVal_->set_data_handle(iData);
-  } else {
-    inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData));
-  }
-
-  // create memory primitive desc and memory self
-  wgtGrad_.reset(new memory(memory::primitive_desc(wMD, engine_), wDiff));
-  outGrad_.reset(new memory(memory::primitive_desc(oMD, engine_), oDiff));
-
-  fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, iMD, wMD, oMD);
+  CHECK(inVal_) << "Should have input value";
+  const MatrixPtr& wgt = weight_->getWGrad();
+  const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr;
+
+  // TODO(TJ): merge outgrad
+  int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
+  // for MKLDNN device:
+  // can not directly cast outputgrad to mkldnnmatrix,
+  // since each layer can not write the inputgrad to mkldnn inputgrad.
+  // So just create from matrix with outputvalue format.
+  // for CPU device:
+  // fc do not need to convert from cpu device since output is always nc format
+  // only need create from cpu device
+  const MatrixPtr& out = getOutput(device).grad;
+  outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());
+  wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPrimitiveDesc());
+  biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPrimitiveDesc())
+                      : nullptr;
+
+  // create memory primitive desc
+  fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward,
+                                      inVal_->getMemoryDesc(),
+                                      wgtGrad_->getMemoryDesc(),
+                                      outGrad_->getMemoryDesc());
   fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
-  fc_bwdWgt::desc bwdWgtDesc = bDiff != NULL
-                                   ? fc_bwdWgt::desc(iMD, wMD, bMD, oMD)
-                                   : fc_bwdWgt::desc(iMD, wMD, oMD);
+  fc_bwdWgt::desc bwdWgtDesc = hasBias
+                                   ? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
+                                                     wgtGrad_->getMemoryDesc(),
+                                                     biasGrad_->getMemoryDesc(),
+                                                     outGrad_->getMemoryDesc())
+                                   : fc_bwdWgt::desc(inVal_->getMemoryDesc(),
+                                                     wgtGrad_->getMemoryDesc(),
+                                                     outGrad_->getMemoryDesc());
   fc_bwdWgt::primitive_desc bwdWgtPD =
       fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD);
 
-  if (bDiff != NULL) {
-    biasGrad_.reset(new memory(memory::primitive_desc(bMD, engine_), bDiff));
+  if (hasBias) {
     bwdWgt_.reset(
         new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_, *biasGrad_));
   } else {
@@ -223,15 +235,26 @@ void MKLDNNFcLayer::resetBwd() {
   pipelineBwd_.push_back(*bwdWgt_);
 
   /// backward data
-  if (iDiff == NULL) {
+  device = inputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
+  const MatrixPtr& in = getInputGrad(0, device);
+  if (in == nullptr) {
     return;
   }
-  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(iMD, wMD, oMD);
+  if (getInput(0, device).getAllCount() > 1) {
+    // TODO(TJ): use outputMaps_ ways when merge outgrad done
+  } else {
+    inGrad_ = MKLDNNMatrix::create(in, inVal_->getPrimitiveDesc());
+  }
+
+  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(inVal_->getMemoryDesc(),
+                                                  wgtGrad_->getMemoryDesc(),
+                                                  outGrad_->getMemoryDesc());
   fc_bwdData::primitive_desc bwdDataPD =
       fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
-  inGrad_.reset(new memory(memory::primitive_desc(iMD, engine_), iDiff));
+
   CHECK(wgtVal_) << "Should have weight memory";
   bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_));
+  printGradFormatFlow();
   pipelineBwd_.push_back(*bwdData_);
 }
 
@@ -241,11 +264,7 @@ void MKLDNNFcLayer::forward(PassType passType) {
 
   {
     REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
-
-    // update input data
-    // since it might be changed if this is after data layer
-    real* iData = getInputValue(0)->getData();
-    inVal_->set_data_handle(iData);
+    syncInputValue();
 
     // just submit forward pipeline
     stream_->submit(pipelineFwd_);
@@ -267,10 +286,7 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
     REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
     resetBwd();
 
-    // update diff
-    real* oDiff = getOutputGrad()->getData();
-    outGrad_->set_data_handle(oDiff);
-
+    syncOutputGrad();
     // just sumbmit backward pipeline
     stream_->submit(pipelineBwd_);
   }
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h
index 7954852a23f81d36d5fb0ae6a19768f419886fb1..e138a6faf181c412949218458e7ecf800a0d6a07 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -32,16 +32,13 @@ protected:
   // if has already init the weight
   bool hasInitedWgt_;
 
-  // if input layer has image size info (ih>1 && iw>1)
-  bool hasSpatial_;
-
   // fc weight and bias
   std::unique_ptr<Weight> weight_;
   std::unique_ptr<Weight> biases_;
 
 public:
   explicit MKLDNNFcLayer(const LayerConfig& config)
-      : MKLDNNLayer(config), hasInitedWgt_(false), hasSpatial_(true) {}
+      : MKLDNNLayer(config), hasInitedWgt_(false) {}
 
   ~MKLDNNFcLayer() {}
 
@@ -75,6 +72,8 @@ protected:
    * only would be called when needed
    */
   void resetBwd();
+
+  void convertOutputToOtherDevice() override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 63e29f447eede5ff9df8715bc9140b64ab7f7d17..b983b833d510b823c5d4cff0b9390173e4cefc89 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -18,9 +18,9 @@ limitations under the License. */
 #include "Layer.h"
 #include "MKLDNNBase.h"
 #include "mkldnn.hpp"
+#include "paddle/math/MKLDNNMatrix.h"
 
 DECLARE_bool(use_mkldnn);
-DECLARE_bool(use_mkldnn_wgt);
 
 namespace paddle {
 
@@ -52,15 +52,15 @@ protected:
   std::vector<mkldnn::primitive> pipelineFwd_;
   std::vector<mkldnn::primitive> pipelineBwd_;
 
-  // TODO(TJ): change below memory as MKLDNNMatrixPtr type
-  std::shared_ptr<mkldnn::memory> inVal_;
-  std::shared_ptr<mkldnn::memory> inGrad_;
-  std::shared_ptr<mkldnn::memory> outVal_;
-  std::shared_ptr<mkldnn::memory> outGrad_;
-  std::shared_ptr<mkldnn::memory> wgtVal_;
-  std::shared_ptr<mkldnn::memory> wgtGrad_;
-  std::shared_ptr<mkldnn::memory> biasVal_;
-  std::shared_ptr<mkldnn::memory> biasGrad_;
+  // MKLDNNMatrixPtr
+  MKLDNNMatrixPtr inVal_;
+  MKLDNNMatrixPtr inGrad_;
+  MKLDNNMatrixPtr outVal_;
+  MKLDNNMatrixPtr outGrad_;
+  MKLDNNMatrixPtr wgtVal_;
+  MKLDNNMatrixPtr wgtGrad_;
+  MKLDNNMatrixPtr biasVal_;
+  MKLDNNMatrixPtr biasGrad_;
 
 public:
   explicit MKLDNNLayer(const LayerConfig& config)
@@ -83,17 +83,21 @@ public:
 
   virtual bool init(const LayerMap& layerMap,
                     const ParameterMap& parameterMap) {
+    CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn. "
+                            << "Please set WITH_MKLDNN=ON "
+                            << "and set use_mkldnn=True";
+    CHECK(!useGpu_) << "Do not support GPU yet";
+
+    // the device id must be in place before Layer::init runs
+    setDevice(MKLDNN_DEVICE);
+    // move the weight/bias parameters of this layer to the MKLDNN device too
+    setParamsDevice(MKLDNN_DEVICE, parameterMap);
     if (!Layer::init(layerMap, parameterMap)) {
       return false;
     }
 
-    CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
-                            << "Please set WITH_MKLDNN=ON "
-                            << "and set use_mkldnn=True";
     stream_.reset(new MKLDNNStream());
     engine_ = CPUEngine::Instance().getEngine();
-
-    // TODO(TJ): deivecId
     return true;
   }
 
@@ -109,6 +113,12 @@ public:
    */
   virtual void convertWeightsToPaddle() {}
 
+  /**
+   * convert MKLDNN output to other device.
+   * only support CPU device yet
+   */
+  virtual void convertOutputToOtherDevice() {}
+
   /**
    * print info about sizes
    */
@@ -118,14 +128,124 @@ public:
                        << ", oh: " << oh_ << ", ow: " << ow_;
   }
 
-  // TODO(TJ): move to MkldnnMatrix
-  // create memory desc
-  inline mkldnn::memory::desc createMD(
-      mkldnn::memory::dims dims,
-      mkldnn::memory::format fmt,
-      mkldnn::memory::data_type type = mkldnn::memory::data_type::f32) {
-    // TODO(TJ): isFmtSuppoted(fmt)
-    return mkldnn::memory::desc(dims, type, fmt);
+  /**
+   * Log (via VLOG) the value formats, input >>> output, when both are set
+   */
+  virtual void printValueFormatFlow() {
+    if (inVal_ && outVal_) {
+      VLOG(MKLDNN_FMTS) << "value format flow --- " << inVal_->getFormat()
+                        << " >>> " << outVal_->getFormat();
+    }
+  }
+
+  /**
+   * Log (via VLOG) the grad formats, input <<< output, when both are set
+   */
+  virtual void printGradFormatFlow() {
+    if (inGrad_ && outGrad_) {
+      VLOG(MKLDNN_FMTS) << "grad format flow --- " << inGrad_->getFormat()
+                        << " <<< " << outGrad_->getFormat();
+    }
+  }
+
+protected:
+  /**
+   * Mirror the frame size and sequence information of output_ onto every
+   * entry of outputOtherDevice_.
+   * @note: Layer::copyOutputToOtherDevice is not used here because only the
+   *        base info is copied; the data value is left untouched
+   */
+  void copyOutputInfoToOtherDevice() {
+    for (auto& other : outputOtherDevice_) {
+      // frame size
+      other.setFrameHeight(output_.getFrameHeight());
+      other.setFrameWidth(output_.getFrameWidth());
+      // sequence information
+      other.sequenceStartPositions = output_.sequenceStartPositions;
+      other.subSequenceStartPositions = output_.subSequenceStartPositions;
+      other.cpuSequenceDims = output_.cpuSequenceDims;
+    }
+  }
+
+  /**
+   * Whether the index-th previous layer runs on the MKLDNN device.
+   * A previous layer on any device other than MKLDNN or CPU fails the CHECK.
+   */
+  bool inputIsOnlyMKLDNN(int index = 0) {
+    const int prevDevice = getPrev(index)->getDeviceId();
+    const bool fromMKLDNN = (prevDevice == MKLDNN_DEVICE);
+    if (!fromMKLDNN) {
+      // GPU is not supported yet, so the only
+      // acceptable alternative producer is a CPU layer
+      CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
+    }
+    return fromMKLDNN;
+  }
+
+  /**
+   * True when outputOtherDevice_ holds no entry at all.
+   * Every entry that does exist must be on the CPU device (CHECKed).
+   */
+  bool outputIsOnlyMKLDNN() {
+    for (const auto& other : outputOtherDevice_) {
+      CHECK_EQ(other.deviceId, CPU_DEVICE)
+          << "Only support other device is CPU yet";
+    }
+    return outputOtherDevice_.size() == 0;
+  }
+
+  /**
+   * Pass the current CPU-device input data pointer to inVal_
+   */
+  void syncInputValue() {
+    if (!inputIsOnlyMKLDNN()) {
+      // The input comes from a CPU layer: its buffer might have been
+      // changed since the last pass (e.g. right after a data layer),
+      // so hand the current pointer to inVal_.
+      real* cpuData = getInputValue(0, CPU_DEVICE)->getData();
+      inVal_->updateData(cpuData);
+    }
+  }
+
+  /**
+   * Pass the current CPU-device output grad pointer to outGrad_
+   */
+  void syncOutputGrad() {
+    if (!outputIsOnlyMKLDNN()) {
+      // Some consumer lives on the CPU device, so take the gradient
+      // pointer from the CPU-device output; it is re-read on every
+      // call rather than cached.
+      real* cpuDiff = getOutput(CPU_DEVICE).grad->getData();
+      outGrad_->updateData(cpuDiff);
+    }
+  }
+
+  /**
+   * Overwrite deviceId_ of this layer with the given id.
+   */
+  void setDevice(int id) { deviceId_ = id; }
+
+  /**
+   * Call setDevice(id) on each parameter named by this layer's config.
+   */
+  void setParamsDevice(int id, const ParameterMap& parameterMap) {
+    for (const auto& input : config_.inputs()) {
+      if (!input.has_input_parameter_name()) continue;
+      const std::string& paramName = input.input_parameter_name();
+      ParameterPtr param;
+      CHECK(mapGet(paramName, parameterMap, &param))
+          << "Cannot find input parameter " << paramName << " for layer "
+          << getName();
+      param->setDevice(id);
+    }
+    // the bias parameter is optional and configured on the layer itself
+    if (config_.has_bias_parameter_name()) {
+      const std::string& biasName = config_.bias_parameter_name();
+      ParameterPtr biasParam;
+      CHECK(mapGet(biasName, parameterMap, &biasParam))
+          << "Cannot find bias parameter " << biasName << " for layer "
+          << getName();
+      biasParam->setDevice(id);
+    }
   }
 };
 
diff --git a/paddle/gserver/layers/Pool3DLayer.cpp b/paddle/gserver/layers/Pool3DLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..199f21adb1a5923b590e4f0e716fc67effb2a2d1
--- /dev/null
+++ b/paddle/gserver/layers/Pool3DLayer.cpp
@@ -0,0 +1,178 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Pool3DLayer.h"
+#include "PoolProjectionLayer.h"
+#include "paddle/utils/Logging.h"
+
+namespace paddle {
+
+REGISTER_LAYER(pool3d, Pool3DLayer);
+
+bool Pool3DLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  if (!Layer::init(layerMap, parameterMap)) return false;
+
+  /* the size of inputs for pool-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1);
+
+  const PoolConfig& conf = config_.inputs(0).pool_conf();
+  poolType_ = conf.pool_type();
+  channels_ = conf.channels();
+  // pooling size along x (width), y (height) and z (depth)
+  sizeX_ = conf.size_x();
+  sizeY_ = conf.size_y();
+  sizeZ_ = conf.size_z();
+  // stride along width, height and depth
+  strideW_ = conf.stride();
+  strideH_ = conf.stride_y();
+  strideD_ = conf.stride_z();
+  // input extent along width, height and depth
+  imgSizeW_ = conf.img_size();
+  imgSizeH_ = conf.img_size_y();
+  imgSizeD_ = conf.img_size_z();
+  // padding along width, height and depth
+  paddingW_ = conf.padding();
+  paddingH_ = conf.padding_y();
+  paddingD_ = conf.padding_z();
+  // configured output extent; getSize() recomputes these three values
+  outputW_ = conf.output_x();
+  outputH_ = conf.output_y();
+  outputD_ = conf.output_z();
+
+  return true;
+}
+
+size_t Pool3DLayer::getSize() {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  // refresh the cached output extent along depth, height and width
+  outputD_ = outputSize(imgSizeD_, sizeZ_, paddingD_, strideD_, false);
+  outputH_ = outputSize(imgSizeH_, sizeY_, paddingH_, strideH_, false);
+  outputW_ = outputSize(imgSizeW_, sizeX_, paddingW_, strideW_, false);
+
+  // publish the frame shape on the output argument
+  auto& out = getOutput();
+  out.setFrameHeight(outputH_);
+  out.setFrameWidth(outputW_);
+  out.setFrameDepth(outputD_);
+  return outputD_ * outputH_ * outputW_ * channels_;
+}
+
+void Pool3DLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
+  size_t batchSize = inMat->getHeight();
+  size_t outWidth = getSize();  // also refreshes outputD_/outputH_/outputW_
+  resetOutput(batchSize, outWidth);
+  Matrix::resizeOrCreate(maxPoolIdx_, batchSize, outWidth, false, useGpu_);
+  const MatrixPtr outMat = getOutputValue();
+
+  if (poolType_ == "avg") {
+    outMat->avgPool3DForward(*inMat,
+                             channels_,
+                             imgSizeD_,  // input extent: depth, height, width
+                             imgSizeH_,
+                             imgSizeW_,
+                             outputD_,  // output extent: depth, height, width
+                             outputH_,
+                             outputW_,
+                             sizeZ_,  // pooling size in the same z/y/x order
+                             sizeY_,
+                             sizeX_,
+                             strideD_,
+                             strideH_,
+                             strideW_,
+                             paddingD_,
+                             paddingH_,
+                             paddingW_);
+  } else if (poolType_ == "max") {
+    outMat->maxPool3DForward(*inMat,
+                             *maxPoolIdx_,  // passed again to maxPool3DBackward
+                             channels_,
+                             imgSizeD_,
+                             imgSizeH_,
+                             imgSizeW_,
+                             outputD_,
+                             outputH_,
+                             outputW_,
+                             sizeZ_,
+                             sizeY_,
+                             sizeX_,
+                             strideD_,
+                             strideH_,
+                             strideW_,
+                             paddingD_,
+                             paddingH_,
+                             paddingW_);
+  } else {
+    LOG(FATAL) << "Unknown pool type: " << poolType_;
+  }
+  forwardActivation();
+}
+
+void Pool3DLayer::backward(const UpdateCallback& callback) {
+  backwardActivation();
+
+  (void)callback;  // not used by this layer
+  if (NULL == getInputGrad(0)) return;  // no input gradient to write into
+  MatrixPtr inMat = inputLayers_[0]->getOutputValue();
+  MatrixPtr inGradMat = inputLayers_[0]->getOutputGrad();
+  MatrixPtr outMat = getOutputValue();
+  MatrixPtr outGradMat = getOutputGrad();
+
+  if (poolType_ == "avg") {
+    inGradMat->avgPool3DBackward(*outGradMat,
+                                 imgSizeD_,
+                                 imgSizeH_,
+                                 imgSizeW_,
+                                 outputD_,
+                                 outputH_,
+                                 outputW_,
+                                 sizeZ_,
+                                 sizeY_,
+                                 sizeX_,  // width size, as in forward()
+                                 strideD_,
+                                 strideH_,
+                                 strideW_,
+                                 paddingD_,
+                                 paddingH_,
+                                 paddingW_,
+                                 1.0,
+                                 1.0);
+  } else if (poolType_ == "max") {
+    inGradMat->maxPool3DBackward(*outGradMat,
+                                 *maxPoolIdx_,  // filled by maxPool3DForward
+                                 imgSizeD_,
+                                 imgSizeH_,
+                                 imgSizeW_,
+                                 outputD_,
+                                 outputH_,
+                                 outputW_,
+                                 sizeZ_,
+                                 sizeY_,
+                                 sizeX_,  // width size, as in forward()
+                                 strideD_,
+                                 strideH_,
+                                 strideW_,
+                                 paddingD_,
+                                 paddingH_,
+                                 paddingW_,
+                                 1.0,
+                                 1.0);
+  } else {
+    LOG(FATAL) << "Unknown pool type: " << poolType_;
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/Pool3DLayer.h b/paddle/gserver/layers/Pool3DLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..8329a02f571bf3b5422134c756c248f77fd517b1
--- /dev/null
+++ b/paddle/gserver/layers/Pool3DLayer.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Layer.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief 3D pooling layer, registered under the type name "pool3d".
+ * Pools the input within regions; forward() handles "avg" and "max".
+ */
+class Pool3DLayer : public Layer {
+public:
+  explicit Pool3DLayer(const LayerConfig& config) : Layer(config) {}
+  ~Pool3DLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+  size_t getSize();  // recomputes outputD_/H_/W_ and returns the layer size
+
+protected:
+  int channels_;  // pool_conf channels
+  int sizeX_, sizeY_, sizeZ_;  // pool_conf size_x / size_y / size_z
+  int strideW_, strideH_, strideD_;  // pool_conf stride / stride_y / stride_z
+  int paddingW_, paddingH_, paddingD_;  // padding / padding_y / padding_z
+  int imgSizeW_, imgSizeH_, imgSizeD_;  // img_size / img_size_y / img_size_z
+  int outputW_, outputH_, outputD_;  // output extent, refreshed by getSize()
+  std::string poolType_;  // pool_conf pool_type: "avg" or "max"
+  MatrixPtr maxPoolIdx_;  // index matrix used by the "max" pooling calls
+};
+}  // namespace paddle
diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp
index 0a1e17b9aa57b373f0df6e079341729539f4e193..e83ae34bbe7d31b9bb7c16bc3fa84db7bd4e33d2 100644
--- a/paddle/gserver/layers/PrintLayer.cpp
+++ b/paddle/gserver/layers/PrintLayer.cpp
@@ -48,7 +48,16 @@ public:
                  << inputLayers_.size() << ") at " << getName();
     }
     s << format.substr(pos);
-    LOG(INFO) << s.str();
+
+    const std::string delimiter("\n");
+    std::string content = s.str();
+    std::string::size_type foundPos = 0;
+    std::string::size_type prevPos = 0;
+    while ((foundPos = content.find(delimiter, prevPos)) != std::string::npos) {
+      LOG(INFO) << content.substr(prevPos, foundPos - prevPos);
+      prevPos = foundPos + delimiter.size();
+    }
+    LOG(INFO) << content.substr(prevPos);
   }
 
   void backward(const UpdateCallback& callback) override {}
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 346c01ced648e47a5516c810e1e975a3a5ed2394..de9b8e63dfc4291f8f42ca8c57cb5eb6baed8d8e 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -34,6 +34,13 @@ add_unittest_without_exec(test_CRFLayerGrad
 add_test(NAME test_CRFLayerGrad
     COMMAND test_CRFLayerGrad)
 
+################ test_CrossEntropyOverBeam ####################
+add_unittest_without_exec(test_CrossEntropyOverBeam
+    test_CrossEntropyOverBeamGrad.cpp
+    LayerGradUtil.cpp)
+add_test(NAME test_CrossEntropyOverBeam  # runs the target declared above
+    COMMAND test_CrossEntropyOverBeam)
+
 ################ test_SeqSliceLayerGrad ####################
 add_unittest_without_exec(test_SeqSliceLayerGrad
     test_SeqSliceLayerGrad.cpp
diff --git a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..538d18cdc3d262df0ddb031d9e6b38a3fea57606
--- /dev/null
+++ b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
@@ -0,0 +1,353 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <random>
+#include <sstream>
+
+#include <gtest/gtest.h>
+#include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/trainer/Trainer.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+const size_t MAX_SEQ_NUM = 23;
+const size_t MAX_SEQ_LEN = 50;
+const size_t MAX_BEAM_SIZE = 27;
+
+const size_t SEED = (size_t)(time(NULL));
+
+struct SingleBeamExpansion {
+  vector<int> seqStartPos;
+  vector<int> subSeqStartPos;
+  vector<real> candidateScores;
+
+  // TODO(caoying): store this into Argument.ids
+  vector<real> selectedIndices;
+
+  vector<int> groundTruth;
+  vector<size_t> inBeam;
+  vector<int> rowIdxInBeam;
+  vector<int> colIdxInBeam;
+
+  void resetGroundTruth(size_t n) {
+    groundTruth.clear();
+    groundTruth.resize(n, -1);
+
+    inBeam.clear();
+    inBeam.resize(n, 0);
+
+    rowIdxInBeam.clear();
+    rowIdxInBeam.resize(n, -1);
+
+    colIdxInBeam.clear();
+    colIdxInBeam.resize(n, -1);
+  }
+};
+
+inline float randFloat() {
+  return static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+}
+
+void genRand(real* numbers, size_t n) {  // fill numbers[0..n) from U[0, 1)
+  default_random_engine generator(rand());  // differs per call, SEED-driven
+  uniform_real_distribution<real> distribution(0.0, 1.0);
+  for (size_t i = 0; i < n; ++i) numbers[i] = distribution(generator);
+}
+
+vector<real> randSampling(real range, int n) {  // n distinct ids in [0, range)
+  CHECK_GE(range, n);
+  vector<real> num(range);
+  iota(begin(num), end(num), 0.);  // candidates 0, 1, ..., range - 1
+  if (range == n) return num;      // nothing to drop: keep every candidate
+
+  random_shuffle(begin(num), end(num));
+  num.resize(n);               // keep the first n of the shuffled candidates
+  sort(begin(num), end(num));  // return the sample in ascending order
+  return num;
+}
+
+void genCandidateScores(bool hasSubseq,
+                        size_t beamSize,
+                        SingleBeamExpansion& prevBeam,
+                        SingleBeamExpansion& curBeam) {
+  vector<int>& seqStartPos = curBeam.seqStartPos;
+  seqStartPos.resize(1, 0);
+  vector<int>& subSeqStartPos = curBeam.subSeqStartPos;
+  subSeqStartPos.resize(1, 0);
+
+  srand(SEED);
+  if (prevBeam.selectedIndices.size()) {
+    if (prevBeam.subSeqStartPos.size() > 1) {
+      int seqIdx = 1;
+      // samples in previous beam are nested sequences.
+      for (size_t i = 1; i < prevBeam.subSeqStartPos.size(); ++i) {
+        for (size_t j = 0; j < beamSize; ++j) {
+          if (prevBeam.selectedIndices[(i - 1) * beamSize + j] == -1.) break;
+          subSeqStartPos.push_back(1 + (rand() % MAX_SEQ_LEN) +
+                                   subSeqStartPos.back());
+        }
+        if (prevBeam.seqStartPos[seqIdx] == prevBeam.subSeqStartPos[i]) {
+          seqStartPos.push_back(subSeqStartPos.back());
+          seqIdx++;
+        }
+      }
+    } else {
+      for (size_t i = 0; i <= prevBeam.selectedIndices.size(); ++i) {
+        if (i && i % beamSize == 0) {
+          seqStartPos.push_back(subSeqStartPos.back());
+          if (i == prevBeam.selectedIndices.size()) break;
+        }
+        if (prevBeam.selectedIndices[i] == -1.) continue;
+        subSeqStartPos.push_back(subSeqStartPos.back() +
+                                 (1 + (rand() % MAX_SEQ_LEN)));
+      }
+    }
+  } else {
+    // the first beam expansion
+    int seqNum = 1 + (rand() % MAX_SEQ_NUM);
+    for (int i = 0; i < seqNum; ++i) {
+      if (hasSubseq) {
+        for (size_t j = 0; j < 1 + (rand() % MAX_SEQ_NUM); ++j)
+          subSeqStartPos.push_back(subSeqStartPos.back() +
+                                   (1 + (rand() % MAX_SEQ_LEN)));
+        seqStartPos.push_back(subSeqStartPos.back());
+      } else {
+        seqStartPos.push_back(seqStartPos.back() +
+                              (1 + (rand() % MAX_SEQ_LEN)));
+      }
+    }
+  }
+
+  size_t totalSeqNum = hasSubseq ? subSeqStartPos.back() : seqStartPos.back();
+  curBeam.candidateScores.resize(totalSeqNum, 0.);
+  genRand(curBeam.candidateScores.data(), totalSeqNum);
+}
+
+void genSelectedIndices(size_t beamSize,
+                        vector<int>& seqStartPos,
+                        vector<real>& selectedIndices) {
+  size_t selectedIdsCount = beamSize * (seqStartPos.size() - 1);
+  selectedIndices.resize(selectedIdsCount, -1.);
+
+  for (size_t i = 0; i < seqStartPos.size() - 1; ++i) {
+    int seqLen = seqStartPos[i + 1] - seqStartPos[i];
+    int n = min(seqLen, static_cast<int>(beamSize));
+    vector<real> ids = randSampling(seqLen, n);
+    memcpy(selectedIndices.data() + i * beamSize,
+           ids.data(),
+           sizeof(real) * ids.size());
+  }
+}
+
+void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,
+                    size_t beamSize) {
+  SingleBeamExpansion& beam = beamExpansions[1];
+  size_t seqNum = beam.seqStartPos.size() - 1;
+  for (size_t i = 2; i < beamExpansions.size(); ++i)
+    CHECK_EQ(seqNum, beamExpansions[i].seqStartPos.size() - 1);
+
+  srand(SEED);
+
+  // initialize the first beam.
+  beam.resetGroundTruth(seqNum);
+  for (size_t i = 0; i < seqNum; ++i) {
+    if (randFloat() > 0.5) {
+      /*
+       * With probability 0.5, force the randomly generated label to fall in
+       * the beam. Otherwise, when the sequence is relatively long and the
+       * beam size is relatively small, the gold sequence tends to fall off
+       * the beam in the very first search.
+       */
+      real* begPos = beam.selectedIndices.data() + i * beamSize;
+      beam.colIdxInBeam[i] =
+          rand() % count_if(begPos, begPos + beamSize, [](const real& val) {
+            return val != -1.;
+          });
+      beam.groundTruth[i] =
+          beam.selectedIndices[i * beamSize + beam.colIdxInBeam[i]];
+      beam.inBeam[i] = 1;
+    } else {
+      int label = rand() % (beam.seqStartPos[i + 1] - beam.seqStartPos[i]);
+      beam.groundTruth[i] = label;
+
+      real* begPos = beam.selectedIndices.data() + i * beamSize;
+      real* endPos = begPos + beamSize;
+      real* lblPos = find(begPos, endPos, real(label));
+      if (lblPos != endPos) {
+        beam.inBeam[i] = 1;
+        beam.colIdxInBeam[i] = lblPos - begPos;
+      }
+    }
+    beam.rowIdxInBeam[i] = i;
+  }
+
+  // iterate over each beam expansions
+  for (size_t i = 2; i < beamExpansions.size(); ++i) {
+    SingleBeamExpansion& curBeam = beamExpansions[i];
+    SingleBeamExpansion& prevBeam = beamExpansions[i - 1];
+    curBeam.resetGroundTruth(seqNum);
+
+    // iterate over each sequence
+    for (size_t j = 0; j < seqNum; ++j) {
+      if (!prevBeam.inBeam[j]) continue;
+
+      // gold sequence falls in the beam in previous search.
+      real* begPos = prevBeam.selectedIndices.data();
+      int offset =
+          prevBeam.rowIdxInBeam[j] * beamSize + prevBeam.colIdxInBeam[j];
+      curBeam.rowIdxInBeam[j] = count_if(
+          begPos, begPos + offset, [](const real& val) { return val != -1.; });
+
+      if (randFloat() > 0.5) {
+        // with probability 0.5, force the label to fall in the beam.
+
+        real* start =
+            curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
+        int n = rand() % count_if(start, start + beamSize, [](const real& val) {
+                  return val != -1.;
+                });
+        curBeam.colIdxInBeam[j] = n;
+        curBeam.groundTruth[j] = *(start + n);
+        curBeam.inBeam[j] = 1;
+      } else {
+        CHECK_LE(curBeam.rowIdxInBeam[j] + 1,
+                 curBeam.subSeqStartPos.size() - 1);
+        int start = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j]];
+        int end = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j] + 1];
+        CHECK_GT(size_t(end), size_t(start));
+        int label = rand() % (end - start);
+
+        curBeam.groundTruth[j] = label;
+        real* findBeg =
+            curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
+        real* lblPos =
+            find(findBeg, findBeg + beamSize, static_cast<real>(label));
+        if (lblPos != (findBeg + beamSize)) {
+          curBeam.inBeam[j] = 1;
+          curBeam.colIdxInBeam[j] = lblPos - findBeg;
+        }
+      }
+    }
+  }
+}
+
+void genOneBeam(size_t beamSize,
+                bool hasSubseq,
+                SingleBeamExpansion& prevBeam,
+                SingleBeamExpansion& curBeam) {
+  genCandidateScores(hasSubseq, beamSize, prevBeam, curBeam);
+  genSelectedIndices(beamSize,
+                     hasSubseq ? curBeam.subSeqStartPos : curBeam.seqStartPos,
+                     curBeam.selectedIndices);
+}
+
+void genRandomBeamExpansion(size_t expansionCount,
+                            size_t beamSize,
+                            vector<SingleBeamExpansion>& beamExpansions) {
+  beamExpansions.clear();
+  beamExpansions.resize(expansionCount + 1);
+
+  // beamExpansions[0] is reserved.
+  for (size_t i = 1; i <= expansionCount; ++i)
+    genOneBeam(beamSize, bool(i - 1), beamExpansions[i - 1], beamExpansions[i]);
+  genGroundTruth(beamExpansions, beamSize);
+}
+
+void testCrossEntropyOverBeam(bool useGpu,
+                              size_t beamSize,
+                              vector<SingleBeamExpansion>& beams) {
+  TestConfig config;
+  config.layerConfig.set_type("cross_entropy_over_beam");
+  // every expansion beams[i], i >= 1, feeds three inputs to the cost layer.
+  size_t seqNum = 0;
+  for (size_t i = 1; i < beams.size(); ++i) {
+    const SingleBeamExpansion& beam = beams[i];
+    // create scores for all the candidates
+    MatrixPtr candidateScorePtr =
+        Matrix::create(beam.candidateScores.size(), 1, false, false);
+    candidateScorePtr->copyFrom(beam.candidateScores.data(),
+                                beam.candidateScores.size());
+
+    ostringstream paramName;
+    paramName << "candidate_scores_" << i;
+
+    if (beam.subSeqStartPos.size() > 1) {
+      seqNum = beam.subSeqStartPos.size() - 1;
+      config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                  paramName.str(),
+                                  candidateScorePtr,
+                                  beam.seqStartPos,
+                                  beam.subSeqStartPos});
+    } else {
+      seqNum = beam.seqStartPos.size() - 1;
+      config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                  paramName.str(),
+                                  candidateScorePtr,
+                                  beam.seqStartPos});
+    }
+    config.layerConfig.add_inputs();
+
+    // create indices for the selected candidates
+    MatrixPtr selectedCandidates =
+        Matrix::create(seqNum, beamSize, false, false);
+    selectedCandidates->copyFrom(beam.selectedIndices.data(),
+                                 beam.selectedIndices.size());
+    paramName.str("");  // clear() only resets error flags, not the text
+    paramName << "selected_candidates_" << i;
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, paramName.str(), selectedCandidates});
+    config.layerConfig.add_inputs();
+
+    // create the ground truth
+    paramName.str("");  // start the label name from an empty buffer as well
+    paramName << "label_" << i;
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, paramName.str(), beam.groundTruth});
+    config.layerConfig.add_inputs();
+  }
+
+  testLayerGrad(
+      config, "cross_entropy_over_beam", seqNum, false, useGpu, false);
+}
+
+TEST(Layer, CrossEntropyOverBeam) {
+  LOG(INFO) << "SEED = " << SEED;
+  const size_t beamSize = 1 + rand() % MAX_BEAM_SIZE;
+  LOG(INFO) << "beamSize = " << beamSize;
+
+  // TODO(caoying): test with random beam expansions.
+  const size_t expansionCount = 3;
+  vector<SingleBeamExpansion> beams;
+  genRandomBeamExpansion(expansionCount, beamSize, beams);
+
+  for (bool useGpu : {false, true})
+    testCrossEntropyOverBeam(useGpu, beamSize, beams);
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  hl_start();
+  hl_init(FLAGS_gpu_id);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(SEED);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 9946f7666498e27a3149816c67ff4c9a9f3bb02a..a831ffbc73fbd6ad42fa31b2d6d583718474e59b 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -850,9 +850,27 @@ TEST(Layer, square_error_weighted) {
   }
 }
 
+TEST(Layer, huber_regression_loss) {
+  TestConfig config;
+  config.layerConfig.set_type("huber_regression");
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    for (auto delta : {1, 3, 5}) {
+      config.layerConfig.set_delta(delta);
+      testLayerGrad(config, "huber_regression", 100, /* trans */ false, useGpu);
+    }
+  }
+}
+
 TEST(Layer, huber_two_class) {
   TestConfig config;
-  config.layerConfig.set_type("huber");
+  config.layerConfig.set_type("huber_classification");
   config.biasSize = 0;
 
   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
@@ -861,7 +879,7 @@ TEST(Layer, huber_two_class) {
   config.layerConfig.add_inputs();
 
   for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "huber", 100, /* trans */ false, useGpu);
+    testLayerGrad(config, "huber_two_class", 100, /* trans */ false, useGpu);
   }
 }
 
@@ -1228,6 +1246,75 @@ TEST(Layer, PoolLayer) {
 #endif
 }
 
+void setPool3DConfig(TestConfig* config,
+                     PoolConfig* pool,
+                     const string& poolType) {
+  // Fills `pool` for a pool3d layer; img_size{,_y,_z} must already be set.
+  const int NUM_FILTERS = 16;
+  const int FILTER_SIZE = 3;
+  const int FILTER_SIZE_Y = 3;
+  const int FILTER_SIZE_Z = 3;
+  const int CHANNELS = 16;
+
+  (*config).biasSize = 0;
+  (*config).layerConfig.set_type("pool3d");
+  (*config).layerConfig.set_num_filters(NUM_FILTERS);
+  // kernel (k*), padding (p*) and stride (s*) along width, height and depth
+  int kw = FILTER_SIZE, kh = FILTER_SIZE_Y, kd = FILTER_SIZE_Z;
+  int pw = 0, ph = 0, pd = 0;
+  int sw = 2, sh = 2, sd = 2;
+
+  // Honor the caller's pool type so that "max" is tested as well as "avg".
+  pool->set_pool_type(poolType);
+  pool->set_channels(CHANNELS);
+  pool->set_size_x(kw);
+  pool->set_size_y(kh);
+  pool->set_size_z(kd);
+  pool->set_padding(0);
+  pool->set_padding_y(0);
+  pool->set_padding_z(0);
+  pool->set_stride(sw);
+  pool->set_stride_y(sh);
+  pool->set_stride_z(sd);
+  pool->set_start(0);
+  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
+  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
+  int od = outputSize(pool->img_size_z(), kd, pd, sd, /* caffeMode */ false);
+  pool->set_output_x(ow);
+  pool->set_output_y(oh);
+  pool->set_output_z(od);
+}
+
+void testPool3DLayer(const string& poolType, bool trans, bool useGpu) {
+  TestConfig config;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 11664, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  PoolConfig* pool = input->mutable_pool_conf();
+  // 9x9x9 input volume; 11664 above = 16 channels * 9 * 9 * 9
+  const int IMAGE_SIZE = 9;
+  const int IMAGE_SIZE_Y = 9;
+  const int IMAGE_SIZE_Z = 9;
+
+  pool->set_img_size(IMAGE_SIZE);
+  pool->set_img_size_y(IMAGE_SIZE_Y);
+  pool->set_img_size_z(IMAGE_SIZE_Z);
+  // fill the remaining pool3d settings, then size the full 3D output volume
+  setPool3DConfig(&config, pool, poolType);
+  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
+                              pool->output_z() * pool->channels());
+
+  testLayerGrad(config, "pool3d", 100, trans, useGpu);
+}
+
+TEST(Layer, Pool3DLayer) {
+  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ false);
+  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ false);
+#ifndef PADDLE_ONLY_CPU
+  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ true);
+  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ true);
+#endif
+}
+
 void testSppLayer(const string& poolType,
                   const int pyramidHeight,
                   bool trans,
@@ -2029,6 +2116,159 @@ TEST(Layer, RowL2NormLayer) {
   }
 }
 
+void test3DConvLayer(const string& type, bool trans, bool useGpu) {
+  // filter size
+  const int NUM_FILTERS = 6;
+  // const int CHANNELS = 3;
+  const int FILTER_SIZE = 3;
+  const int FILTER_SIZE_Y = 3;
+  const int FILTER_SIZE_Z = 3;
+
+  // input image
+  const int CHANNELS = 3;
+  const int IMAGE_SIZE = 9;
+  const int IMAGE_SIZE_Y = 9;
+  const int IMAGE_SIZE_Z = 9;
+
+  TestConfig config;
+  config.biasSize = NUM_FILTERS;
+  config.layerConfig.set_type(type);
+  config.layerConfig.set_num_filters(NUM_FILTERS);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  // Setting up conv3D layer
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+
+  conv->set_channels(CHANNELS);
+  conv->set_filter_size(FILTER_SIZE);
+  conv->set_filter_size_y(FILTER_SIZE_Y);
+  conv->set_filter_size_z(FILTER_SIZE_Z);
+  conv->set_padding(0);
+  conv->set_padding_y(0);
+  conv->set_padding_z(0);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_stride_z(2);
+  conv->set_img_size(IMAGE_SIZE);
+  conv->set_img_size_y(IMAGE_SIZE_Y);
+  conv->set_img_size_z(IMAGE_SIZE_Z);
+  conv->set_output_x(outputSize(conv->img_size(),
+                                conv->filter_size(),
+                                conv->padding(),
+                                conv->stride(),
+                                /*  caffeMode */ true));
+  conv->set_output_y(outputSize(conv->img_size_y(),
+                                conv->filter_size_y(),
+                                conv->padding_y(),
+                                conv->stride_y(),
+                                /*  caffeMode */ true));
+  conv->set_output_z(outputSize(conv->img_size_z(),
+                                conv->filter_size_z(),
+                                conv->padding_z(),
+                                conv->stride_z(),
+                                /*  caffeMode */ true));
+
+  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
+                              conv->output_z() * NUM_FILTERS);
+  conv->set_groups(1);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  config.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z,
+       conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z *
+           NUM_FILTERS});
+
+  testLayerGrad(config, "conv3D", 10, trans, useGpu);
+  // Use small batch_size and useWeight=true to test biasGrad
+  testLayerGrad(config, "conv3D", 2, trans, useGpu, true, 0.02);
+}
+
+TEST(Layer, test3DConvLayer) {
+  test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ false);
+#ifndef PADDLE_ONLY_CPU
+  test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ true);
+#endif
+}
+
+void test3DDeConvLayer(const string& type, bool trans, bool useGpu) {
+  // filter size
+  const int NUM_FILTERS = 6;
+  // (the channel count is declared with the input image below)
+  const int FILTER_SIZE = 3;
+  const int FILTER_SIZE_Y = 3;
+  const int FILTER_SIZE_Z = 3;
+
+  // input image
+  const int CHANNELS = 3;
+  const int IMAGE_SIZE = 4;
+  const int IMAGE_SIZE_Y = 6;
+  const int IMAGE_SIZE_Z = 6;
+
+  // Setting up the conv-trans layer whose registered type is `type`
+  TestConfig config;
+  config.biasSize = NUM_FILTERS;
+  config.layerConfig.set_type(type);
+  config.layerConfig.set_num_filters(NUM_FILTERS);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+
+  conv->set_channels(CHANNELS);
+  conv->set_filter_size(FILTER_SIZE);
+  conv->set_filter_size_y(FILTER_SIZE_Y);
+  conv->set_filter_size_z(FILTER_SIZE_Z);
+  conv->set_padding(0);
+  conv->set_padding_y(0);
+  conv->set_padding_z(0);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_stride_z(2);
+  conv->set_img_size(IMAGE_SIZE);
+  conv->set_img_size_y(IMAGE_SIZE_Y);
+  conv->set_img_size_z(IMAGE_SIZE_Z);
+  conv->set_output_x(imageSize(conv->img_size(),
+                               conv->filter_size(),
+                               conv->padding(),
+                               conv->stride(),
+                               true));
+  conv->set_output_y(imageSize(conv->img_size_y(),
+                               conv->filter_size_y(),
+                               conv->padding_y(),
+                               conv->stride_y(),
+                               true));
+  conv->set_output_z(imageSize(conv->img_size_z(),
+                               conv->filter_size_z(),
+                               conv->padding_z(),
+                               conv->stride_z(),
+                               true));
+  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
+                              conv->output_z() * NUM_FILTERS);
+  conv->set_groups(1);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  config.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z,
+       conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z *
+           NUM_FILTERS});
+
+  testLayerGrad(config, "deconv3D", 10, trans, useGpu);
+  // Use small batch_size and useWeight=true to test biasGrad
+  testLayerGrad(config, "deconv3D", 2, trans, useGpu, true, 0.02);
+}
+
+TEST(Layer, test3DDeConvLayer) {
+  test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ false);
+#ifndef PADDLE_ONLY_CPU
+  test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ true);
+#endif
+}
+
 TEST(Layer, ScaleShiftLayer) {
   const size_t batchSize = 16;
   const size_t size = 32;
diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h
index 666a8b8368e3e2ebc522902c176d7491d2920d2a..94ef561f066a127496e2849a419835e175c526d7 100644
--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
@@ -48,7 +48,13 @@ public:
    */
   virtual void* alloc(size_t size) {
     void* ptr;
+#ifdef PADDLE_USE_MKLDNN
+    // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
+    // memory alignment
+    CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0);
+#else
     CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0);
+#endif
     CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
     return ptr;
   }
diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt
index bf28092e82b778dc904c5a2e271f76261cf5f6b6..68b5296228cd733dc3cb7ca0f762e0a69187dbff 100644
--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -14,6 +14,17 @@
 #
 file(GLOB MATH_HEADERS . *.h)
 file(GLOB MATH_SOURCES . *.cpp)
+
+if(NOT WITH_MKLDNN)
+    set(DNN_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.h")
+    set(DNN_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.cpp")
+    list(REMOVE_ITEM MATH_HEADERS "${DNN_HEADER}")
+    list(REMOVE_ITEM MATH_SOURCES "${DNN_SOURCE}")
+    message(STATUS "Skip compiling with MKLDNNMatrix")
+else()
+    message(STATUS "Compile with MKLDNNMatrix")
+endif()
+
 set(MATH_SOURCES
     "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu"
     "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu"
diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0a355e2644cce572ce90ecf5c9d2a5b7b395bc61
--- /dev/null
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -0,0 +1,144 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNMatrix.h"
+
+using namespace mkldnn;  // NOLINT
+
+namespace paddle {
+
+MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
+  memory::desc md = pd.desc();
+  size_t ndims = md.data.ndims;
+  int* dims = md.data.dims;
+  CHECK(ndims > 0) << "Input dims should not be empty";
+  size_t cnts = 1;
+  for (size_t i = 0; i < ndims; ++i) {
+    cnts *= dims[i];
+  }
+
+  if (m == nullptr) {
+    size_t height = dims[0];
+    size_t width = cnts / dims[0];
+    m = Matrix::create(height, width, false, false);
+  }
+
+  CHECK(m) << " Matrix should not be empty";
+  CpuMatrixPtr cpuMatrix = std::dynamic_pointer_cast<CpuMatrix>(m);
+  CHECK(cpuMatrix) << "Only support create from CPU matrix yet";
+
+  CHECK_EQ(cnts, m->getElementCnt()) << "Count size does not match";
+  return std::make_shared<MKLDNNMatrix>(
+      m->getData(), m->getHeight(), m->getWidth(), pd);
+}
+
+MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
+                                     memory::dims dims,
+                                     memory::format fmt,
+                                     engine& eg,
+                                     mkldnn::memory::data_type dtype) {
+  return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg));
+}
+
+void MKLDNNMatrix::reorderDataFrom(const MKLDNNMatrixPtr& m,
+                                   memory::format srcFmt,
+                                   memory::dims targetDim) {
+  memory::format dstFmt = getFormat();  // this matrix is the destination
+  if (srcFmt == dstFmt) {
+    return;  // layouts already agree, nothing is reordered
+  }
+  CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
+  reorderOnce(m->getData(), getData(), srcFmt, dstFmt, targetDim);  // m->this
+}
+
+void MKLDNNMatrix::reorderDataTo(const MKLDNNMatrixPtr& m,
+                                 memory::format dstFmt,
+                                 memory::dims targetDim) {
+  memory::format srcFmt = getFormat();
+  if (srcFmt == dstFmt) {
+    return;
+  }
+  CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
+  reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
+}
+
+void MKLDNNMatrix::reorderOnce(void* srcData,
+                               void* dstData,
+                               memory::format srcFmt,
+                               memory::format dstFmt,
+                               memory::dims dm) {
+  CHECK(srcData);
+  CHECK(dstData);
+  MatrixPtr tmpSrc;
+  if (dstData == srcData) {
+    // inplace data
+    size_t sz = 1;
+    for (size_t i = 0; i < dm.size(); ++i) {
+      sz *= dm[i];
+    }
+    tmpSrc = Matrix::create(sz, 1, false, false);
+    tmpSrc->copyFrom((real*)srcData, sz);
+    srcData = tmpSrc->getData();
+  }
+
+  auto dtype = this->getDtype();
+  auto srcMD = memory::desc(dm, dtype, srcFmt);
+  auto dstMD = memory::desc(dm, dtype, dstFmt);
+
+  auto eg = this->getEngine();
+  auto src = memory(memory::primitive_desc(srcMD, eg), srcData);
+  auto dst = memory(memory::primitive_desc(dstMD, eg), dstData);
+
+  auto r = reorder(src, dst);
+  stream(stream::kind::eager).submit({r}).wait();
+}
+
+void MKLDNNMatrix::downSpatial() {
+  int fmt = getFormat();
+  if (!(fmt == memory::format::nchw || fmt == memory::format::oihw)) {
+    // only support nchw and oihw yet, later can support more like nhwc, ihwo
+    return;
+  }
+
+  // TODO(TJ): change H(height) and W(width) if support nhwc or more
+  const int H = 2, W = 3;
+  memory::dims srcDims = getDims();
+  if (srcDims[H] != 1 || srcDims[W] != 1) {
+    // can not down spatial
+    return;
+  }
+
+  memory::dims dstDims = memory::dims{srcDims[0], srcDims[1]};
+  memory::format dstFmt;
+  switch (fmt) {
+    case memory::format::nchw:
+      dstFmt = memory::format::nc;
+      break;
+    case memory::format::oihw:
+      dstFmt = memory::format::oi;
+      break;
+    default:
+      LOG(FATAL) << "unsupported format";
+  }
+  memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
+  memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
+  mkldnn_primitive_t result;
+  mkldnn::error::wrap_c_api(
+      mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
+      "could not create a memory primitive");
+  reset(result);
+  set_data_handle(getData());
+}
+
+}  // namespace paddle
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..e50f698b495713e6f15ab7a12a7ee7487662040f
--- /dev/null
+++ b/paddle/math/MKLDNNMatrix.h
@@ -0,0 +1,148 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Matrix.h"
+#include "mkldnn.hpp"
+#include "paddle/parameter/Parameter.h"
+
+namespace paddle {
+
+class MKLDNNMatrix;
+typedef std::shared_ptr<MKLDNNMatrix> MKLDNNMatrixPtr;
+
+/**
+ * @brief MKLDNN Matrix.
+ *
+ */
+class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
+public:
+  MKLDNNMatrix(real* data,
+               size_t height,
+               size_t width,
+               mkldnn::memory::primitive_desc pd)
+      : CpuMatrix(data, height, width, false), mkldnn::memory(pd, data) {}
+
+  ~MKLDNNMatrix() {}
+
+  /**
+   * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc
+   */
+  static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc pd);
+
+  /**
+   * Create MKLDNNMatrix from a MatrixPtr and memory details info
+   */
+  static MKLDNNMatrixPtr create(
+      MatrixPtr m,
+      mkldnn::memory::dims dims,
+      mkldnn::memory::format fmt,
+      mkldnn::engine& eg,
+      mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
+
+public:
+  /**
+   * Reorder this MKLDNNMatrix from other format.
+   * Support inplace reorder.
+   * @note: this function would only reorder the data layout.
+   *        will NOT change this original dim or format info
+   */
+  void reorderDataFrom(const MKLDNNMatrixPtr& m,
+                       memory::format srcFmt,
+                       memory::dims targetDim);
+
+  /**
+   * Reorder this MKLDNNMatrix to other format.
+   * Support inplace reorder.
+   * @note: this function would only reorder the data layout.
+   *        will NOT change the dst dim or format info
+   */
+  void reorderDataTo(const MKLDNNMatrixPtr& m,
+                     memory::format dstFmt,
+                     memory::dims targetDim);
+
+  /**
+   * Dimensionality reduction.
+   * Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1
+   */
+  void downSpatial();
+
+  /**
+   * Update the memory data handle.
+   * Caution: This will not check the buffer size of the data;
+   *          that must be guaranteed by the caller.
+   */
+  void updateData(void* data) { set_data_handle(data); }
+
+  /**
+   * Get primitive descriptor.
+   */
+  mkldnn::memory::primitive_desc getPrimitiveDesc() {
+    return this->get_primitive_desc();
+  }
+
+  /**
+   * Get memory descriptor.
+   */
+  mkldnn::memory::desc getMemoryDesc() { return getPrimitiveDesc().desc(); }
+
+  /**
+   * Get dimensions.
+   */
+  mkldnn::memory::dims getDims() {
+    mkldnn::memory::desc md = getMemoryDesc();
+    const int* src = md.data.dims;
+    int ndims = md.data.ndims;
+    mkldnn::memory::dims dst;
+    dst.resize(ndims);
+    for (int i = 0; i < ndims; ++i) {
+      dst[i] = src[i];
+    }
+    return dst;
+  }
+
+  /**
+   * Get format.
+   */
+  mkldnn::memory::format getFormat() {
+    return (mkldnn::memory::format)(getMemoryDesc().data.format);
+  }
+
+  /**
+   * Get memory data type.
+   */
+  mkldnn::memory::data_type getDtype() {
+    return (mkldnn::memory::data_type)(getMemoryDesc().data.data_type);
+  }
+
+  /**
+   * Get engine.
+   */
+  mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); }
+
+protected:
+  /**
+   * Do reorder once.
+   * Can support inplace.
+   */
+  void reorderOnce(void* srcData,
+                   void* dstData,
+                   memory::format srcFmt,
+                   memory::format dstFmt,
+                   memory::dims dm);
+};
+
+}  // namespace paddle
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 27f7d95b752d4a423bf99fa425b10b2816575d6a..8bc42571f7c141aa31e18d0504b95b2ed4f0da77 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -1190,6 +1190,221 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad,
                       outGrad.getStride());
 }
 
+void GpuMatrix::maxPool3DForward(Matrix& inputMat,
+                                 Matrix& maxPoolIdx,
+                                 size_t channels,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+  CHECK(inputMat.useGpu_ && maxPoolIdx.useGpu_)
+      << "Matrix type are not correct";
+  real* inputData = inputMat.getData();
+  real* maxPoolIdxData = maxPoolIdx.getData();
+  size_t num = inputMat.getHeight();  // number of rows (frames) to process
+  size_t width = imgSizeW;
+  size_t height = imgSizeH;
+  size_t depth = imgSizeD;
+  CHECK(depth * height * width * channels == inputMat.getWidth());
+  CHECK(height_ == inputMat.getHeight());
+  CHECK(width_ == outputD * outputH * outputW * channels);
+  // 3D max pooling on the GPU: the kernel gets *this's and maxPoolIdx's data.
+  hl_maxpool3D_forward(num,
+                       inputData,
+                       channels,
+                       depth,
+                       height,
+                       width,
+                       outputD,
+                       outputH,
+                       outputW,
+                       sizeZ,
+                       sizeY,
+                       sizeX,
+                       strideD,
+                       strideH,
+                       strideW,
+                       paddingD,
+                       paddingH,
+                       paddingW,
+                       getData(),
+                       maxPoolIdxData,
+                       getStride());
+}
+
+void GpuMatrix::maxPool3DBackward(Matrix& outGrad,
+                                  Matrix& maxPoolIdx,
+                                  size_t imgSizeD,
+                                  size_t imgSizeH,
+                                  size_t imgSizeW,
+                                  size_t outputD,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  size_t sizeZ,
+                                  size_t sizeY,
+                                  size_t sizeX,
+                                  size_t strideD,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t paddingD,
+                                  size_t paddingH,
+                                  size_t paddingW,
+                                  real scaleTargets,
+                                  real scaleOutput) {
+  CHECK(outGrad.useGpu_ && maxPoolIdx.useGpu_) << "Matrix type are not equal";
+  // 3D max-pooling backward on the GPU; *this's buffer is passed as the target.
+  real* outDiff = outGrad.getData();
+  real* maxPoolIdxData = maxPoolIdx.getData();
+  size_t frameNum = getHeight();
+  size_t channels = outGrad.getWidth() / outputD / outputH / outputW;
+  size_t width = imgSizeW;
+  size_t height = imgSizeH;
+  size_t depth = imgSizeD;
+  CHECK(depth * height * width * channels == getWidth());
+  CHECK(height_ == outGrad.getHeight());  // frameNum rows of outGrad are used
+  CHECK(outGrad.getHeight() == maxPoolIdx.getHeight() &&
+        outGrad.getWidth() == maxPoolIdx.getWidth());
+  // The stride handed to the kernel is outGrad's; maxPoolIdx has its shape.
+  hl_maxpool3D_backward(frameNum,
+                        outDiff,
+                        channels,
+                        depth,
+                        height,
+                        width,
+                        outputD,
+                        outputH,
+                        outputW,
+                        sizeZ,
+                        sizeY,
+                        sizeX,
+                        strideD,
+                        strideH,
+                        strideW,
+                        paddingD,
+                        paddingH,
+                        paddingW,
+                        scaleTargets,
+                        scaleOutput,
+                        getData(),
+                        maxPoolIdxData,
+                        outGrad.getStride());
+}
+
+void GpuMatrix::avgPool3DForward(Matrix& inputMat,
+                                 size_t channels,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+  CHECK(inputMat.useGpu_) << "Matrix type are not equal";
+  // 3D average pooling on the GPU: validates shapes, then calls the hl kernel.
+  real* inputData = inputMat.getData();
+  size_t frameNum = inputMat.getHeight();
+  size_t height = imgSizeH;
+  size_t width = imgSizeW;
+  size_t depth = imgSizeD;
+  CHECK(depth * height * width * channels == inputMat.getWidth());
+  CHECK(height_ == inputMat.getHeight());
+  CHECK(width_ == outputD * outputH * outputW * channels);
+  // *this's data pointer and stride are passed as the last two arguments.
+  hl_avgpool3D_forward(frameNum,
+                       inputData,
+                       channels,
+                       depth,
+                       height,
+                       width,
+                       outputD,
+                       outputH,
+                       outputW,
+                       sizeZ,
+                       sizeY,
+                       sizeX,
+                       strideD,
+                       strideH,
+                       strideW,
+                       paddingD,
+                       paddingH,
+                       paddingW,
+                       getData(),
+                       getStride());
+}
+
+void GpuMatrix::avgPool3DBackward(Matrix& outGrad,
+                                  size_t imgSizeD,
+                                  size_t imgSizeH,
+                                  size_t imgSizeW,
+                                  size_t outputD,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  size_t sizeZ,
+                                  size_t sizeY,
+                                  size_t sizeX,
+                                  size_t strideD,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t paddingD,
+                                  size_t paddingH,
+                                  size_t paddingW,
+                                  real scaleTargets,
+                                  real scaleOutput) {
+  CHECK(outGrad.useGpu_) << "Matrix type are not equal";
+  // 3D average-pooling backward on the GPU; channels is derived from outGrad.
+  real* outDiff = outGrad.getData();
+  size_t frameNum = outGrad.getHeight();
+  size_t channels = outGrad.getWidth() / outputD / outputH / outputW;
+  size_t height = imgSizeH;
+  size_t width = imgSizeW;
+  size_t depth = imgSizeD;
+  CHECK(depth * height * width * channels == width_);
+  CHECK(height_ == outGrad.getHeight());
+  CHECK(outGrad.getWidth() == outputD * outputH * outputW * channels);
+  // *this's buffer is passed to the kernel together with outGrad's stride.
+  hl_avgpool3D_backward(frameNum,
+                        outDiff,
+                        channels,
+                        depth,
+                        height,
+                        width,
+                        outputD,
+                        outputH,
+                        outputW,
+                        sizeZ,
+                        sizeY,
+                        sizeX,
+                        strideD,
+                        strideH,
+                        strideW,
+                        paddingD,
+                        paddingH,
+                        paddingW,
+                        scaleTargets,
+                        scaleOutput,
+                        getData(),
+                        outGrad.getStride());
+}
+
 void GpuMatrix::maxSequenceForward(Matrix& input,
                                    const IVector& sequence,
                                    IVector& index) {
@@ -1389,6 +1604,72 @@ void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
       output_d, grad_d, mat_d, height_, width_);
 }
 
+void GpuMatrix::vol2Col(real* dataSrc,  // forwarded to hl_matrix_vol2Col
+                        int channels,
+                        int depth,
+                        int height,
+                        int width,
+                        int filterD,
+                        int filterH,
+                        int filterW,
+                        int strideD,
+                        int strideH,
+                        int strideW,
+                        int paddingD,
+                        int paddingH,
+                        int paddingW) {
+  hl_matrix_vol2Col(dataSrc,
+                    channels,
+                    depth,
+                    height,
+                    width,
+                    filterD,
+                    filterH,
+                    filterW,
+                    strideD,
+                    strideH,
+                    strideW,
+                    paddingD,
+                    paddingH,
+                    paddingW,
+                    getData());  // *this's buffer (written in CpuMatrix::vol2Col)
+}
+
+void GpuMatrix::col2Vol(real* dataDst,  // forwarded to hl_matrix_col2Vol
+                        int channels,
+                        int depth,
+                        int height,
+                        int width,
+                        int filterD,
+                        int filterH,
+                        int filterW,
+                        int strideD,
+                        int strideH,
+                        int strideW,
+                        int paddingD,
+                        int paddingH,
+                        int paddingW,
+                        real alpha,
+                        real beta) {
+  hl_matrix_col2Vol(dataDst,
+                    channels,
+                    depth,
+                    height,
+                    width,
+                    filterD,
+                    filterH,
+                    filterW,
+                    strideD,
+                    strideH,
+                    strideW,
+                    paddingD,
+                    paddingH,
+                    paddingW,
+                    getData(),  // *this's buffer (read in CpuMatrix::col2Vol)
+                    alpha,
+                    beta);
+}
+
 /**
  * CpuMatrix
  */
@@ -1930,6 +2211,276 @@ void CpuMatrix::avgPoolBackward(Matrix& input,
   }
 }
 
+void CpuMatrix::maxPool3DForward(Matrix& inputMat,
+                                 Matrix& maxPoolIdx,
+                                 size_t channels,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+  real* inputData = inputMat.getData();
+  real* outData = getData();  // *this receives the per-window maxima
+  real* maxPoolIdxData = maxPoolIdx.getData();  // receives the argmax offsets
+  size_t num = inputMat.getHeight();
+  size_t inWidth = imgSizeW;
+  size_t inHeight = imgSizeH;
+  size_t inDepth = imgSizeD;
+  CHECK(inHeight * inWidth * inDepth == inputMat.getWidth() / channels);
+  CHECK_EQ(num, this->getHeight());
+  CHECK_EQ(channels * outputH * outputW * outputD, this->getWidth());
+  size_t outStride = getStride();
+  // NOTE(review): maxPoolIdx is addressed with outStride as well -- confirm.
+  /* initialize outputs to -FLT_MAX and indices to -1 */
+  for (size_t i = 0; i < height_; i++) {
+    for (size_t j = 0; j < width_; j++) {
+      outData[(i)*outStride + j] = -(real)FLT_MAX;
+      maxPoolIdxData[(i)*outStride + j] = -1;
+    }
+  }
+  // The stored index is the flat (d, h, w) offset inside one channel volume.
+  /* pool max one by one */
+  for (size_t n = 0; n < num; ++n) {  // frame by frame
+    if (!isContiguous()) {  // rows are outStride apart: re-base per frame
+      outData = getData() + n * outStride;
+      maxPoolIdxData = maxPoolIdx.getData() + n * outStride;
+    }
+    for (size_t c = 0; c < channels; ++c) {  // channel by channel
+      for (size_t pd = 0; pd < outputD; ++pd) {
+        for (size_t ph = 0; ph < outputH; ++ph) {
+          for (size_t pw = 0; pw < outputW; ++pw) {
+            int dstart = pd * strideD - paddingD;  // negative inside padding
+            int hstart = ph * strideH - paddingH;
+            int wstart = pw * strideW - paddingW;
+            int dend = std::min(dstart + sizeZ, inDepth);  // clip to the input
+            int hend = std::min(hstart + sizeY, inHeight);
+            int wend = std::min(wstart + sizeX, inWidth);
+            dstart = std::max(dstart, 0);
+            hstart = std::max(hstart, 0);
+            wstart = std::max(wstart, 0);
+            int maxIdx = -1;  // stays -1 if nothing exceeds the current value
+            real maxOutData = outData[(pd * outputH + ph) * outputW + pw];
+            for (int d = dstart; d < dend; ++d) {
+              for (int h = hstart; h < hend; ++h) {
+                for (int w = wstart; w < wend; ++w) {
+                  if (maxOutData <
+                      inputData[(d * inHeight + h) * inWidth + w]) {
+                    maxOutData = inputData[(d * inHeight + h) * inWidth + w];
+                    maxIdx = (d * inHeight + h) * inWidth + w;
+                  }
+                }
+              }
+            }
+            outData[(pd * outputH + ph) * outputW + pw] = maxOutData;
+            maxPoolIdxData[(pd * outputH + ph) * outputW + pw] = maxIdx;
+          }
+        }
+      }
+      // advance all three pointers to the next channel
+      inputData += inDepth * inHeight * inWidth;
+      outData += outputD * outputH * outputW;
+      maxPoolIdxData += outputD * outputH * outputW;
+    }
+  }
+}
+
+void CpuMatrix::maxPool3DBackward(Matrix& outGrad,
+                                  Matrix& maxPoolIdx,
+                                  size_t imgSizeD,
+                                  size_t imgSizeH,
+                                  size_t imgSizeW,
+                                  size_t outputD,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  size_t sizeZ,
+                                  size_t sizeY,
+                                  size_t sizeX,
+                                  size_t strideD,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t paddingD,
+                                  size_t paddingH,
+                                  size_t paddingW,
+                                  real scaleTargets,
+                                  real scaleOutput) {
+  size_t num = getHeight();
+  size_t channels = size_t(width_ / imgSizeD / imgSizeH / imgSizeW);
+  CHECK(maxPoolIdx.getHeight() == outGrad.getHeight() &&
+        maxPoolIdx.getWidth() == outGrad.getWidth());
+  // Scatter each outGrad value to the offset recorded in maxPoolIdx.
+  real* tgtGrad = getData();
+  real* otGrad = outGrad.getData();
+  real* maxPoolIdxData = maxPoolIdx.getData();
+  size_t outStride = outGrad.getStride();
+  // NOTE(review): *this is walked contiguously; only outGrad's stride is used.
+  for (size_t n = 0; n < num; ++n) {
+    if (!outGrad.isContiguous()) {
+      otGrad = outGrad.getData() + n * outStride;
+      maxPoolIdxData = maxPoolIdx.getData() + n * outStride;
+    }
+    for (size_t c = 0; c < channels; ++c) {
+      for (size_t pd = 0; pd < outputD; ++pd) {
+        for (size_t ph = 0; ph < outputH; ++ph) {
+          for (size_t pw = 0; pw < outputW; ++pw) {
+            const size_t index = (pd * outputH + ph) * outputW + pw;
+            if (maxPoolIdxData[index] < 0) continue;  // -1: no argmax stored
+            real& grad = tgtGrad[static_cast<size_t>(maxPoolIdxData[index])];
+            grad = scaleTargets * grad + scaleOutput * otGrad[index];
+          }
+        }
+      }
+      // advance all three pointers to the next channel
+      tgtGrad += imgSizeD * imgSizeH * imgSizeW;
+      otGrad += outputD * outputH * outputW;
+      maxPoolIdxData += outputD * outputH * outputW;
+    }
+  }
+}
+
+void CpuMatrix::avgPool3DForward(Matrix& input,
+                                 size_t channels,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+  // 3D average pooling: *this gets each window's sum divided by poolSize.
+  size_t num = input.getHeight();
+  size_t inDepth = imgSizeD;
+  size_t inHeight = imgSizeH;
+  size_t inWidth = imgSizeW;
+  CHECK(inDepth * inHeight * inWidth * channels == input.getWidth());
+  CHECK(outputD * outputH * outputW * channels * num == height_ * width_);
+  real* tgtData = getData();
+  real* inData = input.getData();
+  // Frames are rows; channels within a frame are walked by pointer increments.
+  for (size_t n = 0; n < num; ++n) {
+    if (!isContiguous()) {  // rows are getStride() apart: re-base per frame
+      tgtData = data_ + n * getStride();
+    }
+    for (size_t c = 0; c < channels; ++c) {
+      for (size_t pd = 0; pd < outputD; ++pd) {
+        for (size_t ph = 0; ph < outputH; ++ph) {
+          for (size_t pw = 0; pw < outputW; ++pw) {
+            int dstart = pd * strideD - paddingD;
+            int hstart = ph * strideH - paddingH;
+            int wstart = pw * strideW - paddingW;
+            int dend = std::min(dstart + sizeZ, inDepth + paddingD);
+            int hend = std::min(hstart + sizeY, inHeight + paddingH);
+            int wend = std::min(wstart + sizeX, inWidth + paddingW);
+            int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart);
+            dstart = std::max(dstart, 0);
+            hstart = std::max(hstart, 0);
+            wstart = std::max(wstart, 0);
+            dend = std::min(dend, static_cast<int>(inDepth));
+            hend = std::min(hend, static_cast<int>(inHeight));
+            wend = std::min(wend, static_cast<int>(inWidth));
+            // poolSize was taken before clipping, so it counts padded cells.
+            CHECK(poolSize);
+            tgtData[(pd * outputH + ph) * outputW + pw] = 0;  // clear
+            for (int d = dstart; d < dend; ++d) {
+              for (int h = hstart; h < hend; ++h) {
+                for (int w = wstart; w < wend; ++w) {
+                  tgtData[(pd * outputH + ph) * outputW + pw] +=
+                      inData[(d * inHeight + h) * inWidth + w];
+                }
+              }
+            }
+            tgtData[(pd * outputH + ph) * outputW + pw] /= poolSize;
+          }
+        }
+      }
+      // advance both pointers to the next channel
+      inData += inDepth * inHeight * inWidth;
+      tgtData += outputD * outputH * outputW;
+    }
+  }
+}
+
+void CpuMatrix::avgPool3DBackward(Matrix& input,
+                                  size_t imgSizeD,
+                                  size_t imgSizeH,
+                                  size_t imgSizeW,
+                                  size_t outputD,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  size_t sizeZ,
+                                  size_t sizeY,
+                                  size_t sizeX,
+                                  size_t strideD,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t paddingD,
+                                  size_t paddingH,
+                                  size_t paddingW,
+                                  real scaleTargets,
+                                  real scaleOutput) {
+  size_t num = input.getHeight();
+  size_t channels = input.getWidth() / outputD / outputH / outputW;
+  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == getWidth());
+  real* inData = input.getData();  // pooled-side values being distributed
+  real* outData = getData();       // *this is accumulated into (+=)
+  // NOTE(review): scaleTargets and scaleOutput are not used in this body.
+  for (size_t n = 0; n < num; ++n) {
+    if (!input.isContiguous()) {
+      inData = input.getData() + n * input.getStride();
+    }
+    for (size_t c = 0; c < channels; ++c) {
+      for (size_t pd = 0; pd < outputD; ++pd) {
+        for (size_t ph = 0; ph < outputH; ++ph) {
+          for (size_t pw = 0; pw < outputW; ++pw) {
+            int dstart = pd * strideD - paddingD;
+            int hstart = ph * strideH - paddingH;
+            int wstart = pw * strideW - paddingW;
+            int dend = std::min(dstart + sizeZ, imgSizeD + paddingD);
+            int hend = std::min(hstart + sizeY, imgSizeH + paddingH);
+            int wend = std::min(wstart + sizeX, imgSizeW + paddingW);
+            int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart);
+            dstart = std::max(dstart, 0);
+            hstart = std::max(hstart, 0);
+            wstart = std::max(wstart, 0);
+            dend = std::min(dend, static_cast<int>(imgSizeD));
+            hend = std::min(hend, static_cast<int>(imgSizeH));
+            wend = std::min(wend, static_cast<int>(imgSizeW));
+            CHECK(poolSize);  // computed before clipping: counts padded cells
+            for (int d = dstart; d < dend; ++d) {
+              for (int h = hstart; h < hend; ++h) {
+                for (int w = wstart; w < wend; ++w) {
+                  outData[(d * imgSizeH + h) * imgSizeW + w] +=
+                      inData[(pd * outputH + ph) * outputW + pw] / poolSize;
+                }
+              }
+            }
+          }
+        }
+      }
+      // advance both pointers to the next channel
+      outData += imgSizeD * imgSizeH * imgSizeW;
+      inData += outputD * outputH * outputW;
+    }
+  }
+}
+
 /**
  * Input: one or more sequences. Each sequence contains some instances.
  * Output: output size is the number of input sequences (NOT input instances).
@@ -3975,6 +4526,95 @@ void CpuMatrix::bilinearBackward(const Matrix& out,
   }
 }
 
+void CpuMatrix::vol2Col(real* data,
+                        int channels,
+                        int depth,
+                        int height,
+                        int width,
+                        int filterD,
+                        int filterH,
+                        int filterW,
+                        int strideD,
+                        int strideH,
+                        int strideW,
+                        int paddingD,
+                        int paddingH,
+                        int paddingW) {
+  real* outData = getData();  // *this receives the unrolled columns
+  int outHeight = (height + 2 * paddingH - filterH) / strideH + 1;
+  int outWidth = (width + 2 * paddingW - filterW) / strideW + 1;
+  int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1;
+  // Each c below encodes (cIn, dOffset, hOffset, wOffset), wOffset fastest.
+  int channelsCol = channels * filterD * filterH * filterW;
+  for (int c = 0; c < channelsCol; ++c) {
+    int wOffset = c % filterW;
+    int hOffset = (c / filterW) % filterH;
+    int dOffset = (c / filterW / filterH) % filterD;
+    int cIn = c / filterW / filterH / filterD;
+    for (int d = 0; d < outDepth; ++d) {
+      for (int h = 0; h < outHeight; ++h) {
+        for (int w = 0; w < outWidth; ++w) {
+          int dPad = d * strideD - paddingD + dOffset;
+          int hPad = h * strideH - paddingH + hOffset;
+          int wPad = w * strideW - paddingW + wOffset;
+          // Positions that fall outside the input volume are written as 0.
+          if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width &&
+              dPad >= 0 && dPad < depth)
+            outData[((c * outDepth + d) * outHeight + h) * outWidth + w] =
+                data[((cIn * depth + dPad) * height + hPad) * width + wPad];
+          else
+            outData[((c * outDepth + d) * outHeight + h) * outWidth + w] = 0;
+        }
+      }
+    }
+  }
+}
+
+void CpuMatrix::col2Vol(real* trg,
+                        int channels,
+                        int depth,
+                        int height,
+                        int width,
+                        int filterD,
+                        int filterH,
+                        int filterW,
+                        int strideD,
+                        int strideH,
+                        int strideW,
+                        int paddingD,
+                        int paddingH,
+                        int paddingW,
+                        real alpha,
+                        real beta) {
+  real* src = getData();  // *this is the column buffer being folded into trg
+  int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1;
+  int outHeight = (height + 2 * paddingH - filterH) / strideH + 1;
+  int outWidth = (width + 2 * paddingW - filterW) / strideW + 1;
+  int channelsCol = channels * filterD * filterH * filterW;
+  for (int c = 0; c < channelsCol; ++c) {  // c = (cIm, dOff, hOff, wOff)
+    int wOffset = c % filterW;
+    int hOffset = (c / filterW) % filterH;
+    int dOffset = (c / filterW / filterH) % filterD;
+    int cIm = c / filterW / filterH / filterD;
+    for (int d = 0; d < outDepth; ++d) {
+      for (int h = 0; h < outHeight; ++h) {
+        for (int w = 0; w < outWidth; ++w) {
+          int dPad = d * strideD - paddingD + dOffset;  // position in trg
+          int hPad = h * strideH - paddingH + hOffset;
+          int wPad = w * strideW - paddingW + wOffset;
+          if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width &&
+              dPad >= 0 && dPad < depth)  // skip positions in the padding
+            trg[((cIm * depth + dPad) * height + hPad) * width + wPad] =
+                alpha *
+                    src[((c * outDepth + d) * outHeight + h) * outWidth + w] +
+                beta *
+                    trg[((cIm * depth + dPad) * height + hPad) * width + wPad];
+        }
+      }
+    }
+  }
+}
+
 ////////////////////////////////////////////////////////////////
 //               functions executed via cpu                   //
 ////////////////////////////////////////////////////////////////
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index bb802bbb2c75289a45d987b22ad41ce8b1e95c98..431d4e071072317c8fdfdc4f0d13e7cd4e3d062b 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -928,15 +928,102 @@ public:
                                size_t paddingW) {
     LOG(FATAL) << "Not implemeted";
   }
-
   /**
-   * Input: one or more sequences. Each sequence contains some instances.
-   *
-   * Output: output size is the number of input sequences (NOT input
-   * instances).
-   *
-   * output[i] is set to max_input[i].
+   * 3D max-pooling forward operation: pick out the largest element
+   * in each sizeZ x sizeY x sizeX pooling window.
    */
+  virtual void maxPool3DForward(Matrix& inputMat,
+                                Matrix& maxPoolIdx,
+                                size_t channels,
+                                size_t imgSizeD,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t outputD,
+                                size_t outputH,
+                                size_t outputW,
+                                size_t sizeZ,
+                                size_t sizeY,
+                                size_t sizeX,
+                                size_t strideD,
+                                size_t strideH,
+                                size_t strideW,
+                                size_t paddingD,
+                                size_t paddingH,
+                                size_t paddingW) {
+    LOG(FATAL) << "Not implemeted";  // base-class stub: always fatal
+  }
+
+  virtual void maxPool3DBackward(Matrix& outGrad,  // 3D max-pooling backward
+                                 Matrix& maxPoolIdx,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW,
+                                 real scaleTargets,
+                                 real scaleOutput) {
+    LOG(FATAL) << "Not implemeted";  // base-class stub: always fatal
+  }
+
+  virtual void avgPool3DForward(Matrix& input,  // 3D average-pooling forward
+                                size_t channels,
+                                size_t imgSizeD,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t outputD,
+                                size_t outputH,
+                                size_t outputW,
+                                size_t sizeZ,
+                                size_t sizeY,
+                                size_t sizeX,
+                                size_t strideD,
+                                size_t strideH,
+                                size_t strideW,
+                                size_t paddingD,
+                                size_t paddingH,
+                                size_t paddingW) {
+    LOG(FATAL) << "Not implemeted";  // base-class stub: always fatal
+  }
+
+  virtual void avgPool3DBackward(Matrix& input,  // 3D average-pooling backward
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW,
+                                 real scaleTargets,
+                                 real scaleOutput) {
+    LOG(FATAL) << "Not implemeted";  // base-class stub: always fatal
+  }
+
+  /**
+   * Input: one or more sequences. Each sequence contains some instances.
+   *
+   * Output: output size is the number of input sequences (NOT input
+   * instances).
+   *
+   * output[i] is set to max_input[i].
+   */
   virtual void maxSequenceForward(Matrix& input,
                                   const IVector& sequence,
                                   IVector& index) {
@@ -1039,6 +1126,42 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
+  virtual void vol2Col(real* data,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW) {
+    LOG(FATAL) << "Not implemented";  // hit only if not overridden
+  }
+
+  virtual void col2Vol(real* trg,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW,
+                       real alpha,
+                       real beta) {
+    LOG(FATAL) << "Not implemented";  // hit only if not overridden
+  }
+
   virtual void bilinearForward(const Matrix& in,
                                const size_t inImgH,
                                const size_t inImgW,
@@ -1348,6 +1471,82 @@ public:
                        size_t paddingH,
                        size_t paddingW);
 
+  void maxPool3DForward(Matrix& inputMat,
+                        Matrix& maxPoolIdx,
+                        size_t channels,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void maxPool3DBackward(Matrix& outGrad,
+                         Matrix& maxPoolIdx,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW,
+                         real scaleTargets,
+                         real scaleOutput);
+
+  void avgPool3DForward(Matrix& input,
+                        size_t channels,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void avgPool3DBackward(Matrix& input,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW,
+                         real scaleTargets,
+                         real scaleOutput);
+
   void maxSequenceForward(Matrix& input,
                           const IVector& sequence,
                           IVector& index);
@@ -1374,6 +1573,38 @@ public:
                         const real ratioH,
                         const real ratioW);
 
+  void vol2Col(real* data,
+               int channels,
+               int depth,
+               int height,
+               int width,
+               int filterD,
+               int filterH,
+               int filterW,
+               int strideD,
+               int strideH,
+               int strideW,
+               int paddingD,
+               int paddingH,
+               int paddingW);
+
+  void col2Vol(real* trg,
+               int channels,
+               int depth,
+               int height,
+               int width,
+               int filterD,
+               int filterH,
+               int filterW,
+               int strideD,
+               int strideH,
+               int strideW,
+               int paddingD,
+               int paddingH,
+               int paddingW,
+               real alpha,
+               real beta);
+
   void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);
 
   void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
@@ -1507,6 +1738,82 @@ public:
                        size_t paddingH,
                        size_t paddingW);
 
+  void maxPool3DForward(Matrix& inputMat,
+                        Matrix& maxPoolIdx,
+                        size_t channels,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void maxPool3DBackward(Matrix& outGrad,
+                         Matrix& maxPoolIdx,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW,
+                         real scaleTargets,
+                         real scaleOutput);
+
+  void avgPool3DForward(Matrix& input,
+                        size_t channels,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void avgPool3DBackward(Matrix& input,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW,
+                         real scaleTargets,
+                         real scaleOutput);
+
   void maxSequenceForward(Matrix& input,
                           const IVector& sequence,
                           IVector& index);
@@ -1715,6 +2022,38 @@ public:
                         const real ratioH,
                         const real ratioW);
 
+  void vol2Col(real* data,
+               int channels,
+               int depth,
+               int height,
+               int width,
+               int filterD,
+               int filterH,
+               int filterW,
+               int strideD,
+               int strideH,
+               int strideW,
+               int paddingD,
+               int paddingH,
+               int paddingW);
+
+  void col2Vol(real* trg,
+               int channels,
+               int depth,
+               int height,
+               int width,
+               int filterD,
+               int filterH,
+               int filterW,
+               int strideD,
+               int strideH,
+               int strideW,
+               int paddingD,
+               int paddingH,
+               int paddingW,
+               real alpha,
+               real beta);
+
   template <typename ExpressionType>
   void operator=(const ExpressionType& expr) {
     TensorCpuApply<real>(*this, expr);
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index d77478f345df97b37b214b5978f51ce47c1d791c..103f06acc57d7a23f019f5e713f6cacf2179e9e0 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -18,6 +18,7 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 #include "TensorCheck.h"
+#include "paddle/math/MathUtils.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
 #include "paddle/testing/TestUtil.h"
@@ -1203,4 +1204,497 @@ TEST(Matrix, warpCTC) {
   }
 }
 
+void testMaxPool3DFwdBwd(int numSamples,  // CPU vs GPU max-pool3D parity
+                         int channels,
+                         int imgSizeD,
+                         int imgSizeH,
+                         int imgSizeW,
+                         int ksizeD,
+                         int ksizeH,
+                         int ksizeW,
+                         int strideD,
+                         int strideH,
+                         int strideW,
+                         int padD,
+                         int padH,
+                         int padW) {
+  int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true);
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
+
+  int inWidth = channels * imgSizeD * imgSizeH * imgSizeW;
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  int outWidth = channels * outD * outH * outW;
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+  MatrixPtr maxIdx = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr maxIdxGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+
+  input->randomizeUniform();  // random CPU data, mirrored to GPU below
+  target->randomizeUniform();
+  inputGpu->copyFrom(*input);
+  targetGpu->copyFrom(*target);
+
+  target->maxPool3DForward(*input,
+                           *maxIdx,
+                           channels,
+                           imgSizeD,
+                           imgSizeH,
+                           imgSizeW,
+                           outD,
+                           outH,
+                           outW,
+                           ksizeD,
+                           ksizeH,
+                           ksizeW,
+                           strideD,
+                           strideH,
+                           strideW,
+                           padD,
+                           padH,
+                           padW);
+  targetGpu->maxPool3DForward(*inputGpu,
+                              *maxIdxGpu,
+                              channels,
+                              imgSizeD,
+                              imgSizeH,
+                              imgSizeW,
+                              outD,
+                              outH,
+                              outW,
+                              ksizeD,
+                              ksizeH,
+                              ksizeW,
+                              strideD,
+                              strideH,
+                              strideW,
+                              padD,
+                              padH,
+                              padW);
+  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
+  targetCheck->copyFrom(*targetGpu);  // GPU forward result into a CPU matrix
+  checkMatrixEqual(target, targetCheck);  // forward outputs must agree
+
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad =
+      GpuMatrix::create(numSamples, outWidth, false, true);
+
+  inputGrad->randomizeUniform();  // random grads, mirrored to GPU below
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+
+  inputGrad->maxPool3DBackward(*targetGrad,
+                               *maxIdx,
+                               imgSizeD,
+                               imgSizeH,
+                               imgSizeW,
+                               outD,
+                               outH,
+                               outW,
+                               ksizeD,
+                               ksizeH,
+                               ksizeW,
+                               strideD,
+                               strideH,
+                               strideW,
+                               padD,
+                               padH,
+                               padW,
+                               1.0,   // scaleTargets
+                               1.0);  // scaleOutput
+  inputGpuGrad->maxPool3DBackward(*targetGpuGrad,
+                                  *maxIdxGpu,
+                                  imgSizeD,
+                                  imgSizeH,
+                                  imgSizeW,
+                                  outD,
+                                  outH,
+                                  outW,
+                                  ksizeD,
+                                  ksizeH,
+                                  ksizeW,
+                                  strideD,
+                                  strideH,
+                                  strideW,
+                                  padD,
+                                  padH,
+                                  padW,
+                                  1.0,   // scaleTargets
+                                  1.0);  // scaleOutput
+  MatrixPtr targetBwdCheck =
+      CpuMatrix::create(numSamples, inWidth, false, false);
+  targetBwdCheck->copyFrom(*inputGpuGrad);  // GPU input grad into CPU matrix
+  checkMatrixEqual(inputGrad, targetBwdCheck);  // input grads must agree
+}
+
+void testAvgPool3DFwdBwd(int numSamples,  // CPU vs GPU avg-pool3D parity
+                         int channels,
+                         int imgSizeD,
+                         int imgSizeH,
+                         int imgSizeW,
+                         int ksizeD,
+                         int ksizeH,
+                         int ksizeW,
+                         int strideD,
+                         int strideH,
+                         int strideW,
+                         int padD,
+                         int padH,
+                         int padW) {
+  int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true);
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
+
+  int inWidth = imgSizeD * imgSizeH * imgSizeW * channels;
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  int outWidth = channels * outD * outH * outW;
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+
+  input->randomizeUniform();  // random CPU data, mirrored to GPU below
+  target->randomizeUniform();
+  inputGpu->copyFrom(*input);
+  targetGpu->copyFrom(*target);
+
+  target->avgPool3DForward(*input,
+                           channels,
+                           imgSizeD,
+                           imgSizeH,
+                           imgSizeW,
+                           outD,
+                           outH,
+                           outW,
+                           ksizeD,
+                           ksizeH,
+                           ksizeW,
+                           strideD,
+                           strideH,
+                           strideW,
+                           padD,
+                           padH,
+                           padW);
+
+  targetGpu->avgPool3DForward(*inputGpu,
+                              channels,
+                              imgSizeD,
+                              imgSizeH,
+                              imgSizeW,
+                              outD,
+                              outH,
+                              outW,
+                              ksizeD,
+                              ksizeH,
+                              ksizeW,
+                              strideD,
+                              strideH,
+                              strideW,
+                              padD,
+                              padH,
+                              padW);
+
+  TensorCheckErr(*target, *targetGpu);  // compare CPU and GPU forward output
+
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad =
+      GpuMatrix::create(numSamples, outWidth, false, true);
+
+  inputGrad->randomizeUniform();  // random grads, mirrored to GPU below
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+
+  inputGrad->avgPool3DBackward(*targetGrad,
+                               imgSizeD,
+                               imgSizeH,
+                               imgSizeW,
+                               outD,
+                               outH,
+                               outW,
+                               ksizeD,
+                               ksizeH,
+                               ksizeW,
+                               strideD,
+                               strideH,
+                               strideW,
+                               padD,
+                               padH,
+                               padW,
+                               1.0,   // scaleTargets
+                               1.0);  // scaleOutput
+
+  inputGpuGrad->avgPool3DBackward(*targetGpuGrad,
+                                  imgSizeD,
+                                  imgSizeH,
+                                  imgSizeW,
+                                  outD,
+                                  outH,
+                                  outW,
+                                  ksizeD,
+                                  ksizeH,
+                                  ksizeW,
+                                  strideD,
+                                  strideH,
+                                  strideW,
+                                  padD,
+                                  padH,
+                                  padW,
+                                  1.0,   // scaleTargets
+                                  1.0);  // scaleOutput
+  TensorCheckErr(*inputGrad, *inputGpuGrad);  // compare CPU and GPU grads
+}
+
+// TODO(yi): I noticed many such blindly combinatorial tests in this
+// file.  They are no help to locate defects at all.
+TEST(Matrix, Pool3DFwdBwd) {  // CPU/GPU parity sweep for 3D max/avg pool
+  for (auto numSamples : {1, 3}) {
+    for (auto channels : {3}) {
+      for (auto imgSizeD : {9, 16}) {
+        for (auto imgSizeH : {9, 32}) {
+          for (auto imgSizeW : {9, 32}) {
+            for (auto sizeX : {3}) {
+              for (auto sizeY : {3}) {
+                for (auto sizeZ : {3}) {
+                  for (auto sD : {2}) {
+                    for (auto sH : {2}) {
+                      for (auto sW : {2}) {
+                        for (auto pD : {0, (sizeZ - 1) / 2}) {
+                          for (auto pH : {0, (sizeY - 1) / 2}) {
+                            for (auto pW : {0, (sizeX - 1) / 2}) {
+                              VLOG(3) << " numSamples=" << numSamples
+                                      << " channels=" << channels
+                                      << " imgSizeD=" << imgSizeD
+                                      << " imgSizeH=" << imgSizeH
+                                      << " imgSizeW=" << imgSizeW
+                                      << " sizeX=" << sizeX
+                                      << " sizeY=" << sizeY
+                                      << " sizeZ=" << sizeZ << " strideD=" << sD
+                                      << " strideH=" << sH << " strideW=" << sW
+                                      << " padingD=" << pD << " padingH=" << pH
+                                      << " padingW=" << pW;
+
+                              testMaxPool3DFwdBwd(numSamples,
+                                                  channels,
+                                                  imgSizeD,
+                                                  imgSizeH,
+                                                  imgSizeW,
+                                                  sizeZ,  // ksizeD
+                                                  sizeY,  // ksizeH
+                                                  sizeX,  // ksizeW
+                                                  sD,
+                                                  sH,
+                                                  sW,
+                                                  pD,
+                                                  pH,
+                                                  pW);
+                              testAvgPool3DFwdBwd(numSamples,
+                                                  channels,
+                                                  imgSizeD,
+                                                  imgSizeH,
+                                                  imgSizeW,
+                                                  sizeZ,  // ksizeD
+                                                  sizeY,  // ksizeH
+                                                  sizeX,  // ksizeW
+                                                  sD,
+                                                  sH,
+                                                  sW,
+                                                  pD,
+                                                  pH,
+                                                  pW);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  //  for (auto numSamples : {1, 3}) {
+  //    for (auto channels : {1, 3}) {
+  //      for (auto imgSizeD : {9,16}) {
+  //      for (auto imgSizeH : {9, 32}) {
+  //        for (auto imgSizeW : {9, 32}) {
+  //          for (auto sizeX : {2, 3}) {
+  //            for (auto sizeY : {2, 3}) {
+  //            for (auto sizeZ : {2,3}){
+  //              for (auto sD : {1, 2}) {
+  //              for (auto sH : {1, 2}) {
+  //                for (auto sW : {1, 2}) {
+  //                  for (auto pD : {0, (sizeZ - 1) / 2}){
+  //                  for (auto pH : {0, (sizeY - 1) / 2}) {
+  //                    for (auto pW : {0, (sizeX - 1) / 2}) {
+  //                      VLOG(3) << " numSamples=" << numSamples
+  //                              << " channels=" << channels
+  //                              << " imgSizeD=" << imgSizeD
+  //                              << " imgSizeH=" << imgSizeH
+  //                              << " imgSizeW=" << imgSizeW
+  //                              << " sizeX=" << sizeX
+  //                              << " sizeY=" << sizeY
+  //                              << " sizeZ=" << sizeZ
+  //                              << " strideD=" << sD
+  //                              << " strideH=" << sH
+  //                              << " strideW=" << sW
+  //                              << " padingD=" << pD
+  //                              << " padingH=" << pH
+  //                              << " padingW=" << pW;
+  //
+  //                      testMaxPool3DFwdBwd(numSamples,
+  //                                        channels,
+  //                                        imgSizeD,
+  //                                        imgSizeH,
+  //                                        imgSizeW,
+  //                                        sizeZ,
+  //                                        sizeY,
+  //                                        sizeX,
+  //                                        sD,
+  //                                        sH,
+  //                                        sW,
+  //                                        pD,
+  //                                        pH,
+  //                                        pW);
+  //                      testAvgPool3DFwdBwd(numSamples,
+  //                                        channels,
+  //                                        imgSizeD,
+  //                                        imgSizeH,
+  //                                        imgSizeW,
+  //                                        sizeZ,
+  //                                        sizeY,
+  //                                        sizeX,
+  //                                        sD,
+  //                                        sH,
+  //                                        sW,
+  //                                        pD,
+  //                                        pH,
+  //                                        pW);
+  //                    }
+  //                  }
+  //                }
+  //              }
+  //            }
+  //            }
+  //          }
+  //        }
+  //      }
+  //      }
+  //    }
+  //    }
+  //  }
+  //  }
+}
+
+void testMatrixCol2Vol(int depth, int height, int width) {  // CPU vs GPU
+  int channel = 3;
+  int filterX = 3, filterY = 4, filterZ = 5;
+  int strideX = 2, strideY = 2, strideZ = 2;
+  int padX = 1, padY = 1, padZ = 1;
+
+  MatrixPtr cpuImage =
+      std::make_shared<CpuMatrix>(channel, depth * height * width);
+  MatrixPtr gpuImage =
+      std::make_shared<GpuMatrix>(channel, depth * height * width);
+  cpuImage->randomizeUniform();
+  gpuImage->copyFrom(*cpuImage);  // both devices start from the same image
+
+  int outD = outputSize(depth, filterZ, padZ, strideZ, true);
+  int outH = outputSize(height, filterY, padY, strideY, true);
+  int outW = outputSize(width, filterX, padX, strideX, true);
+
+  int colBufHeight = channel * filterZ * filterY * filterX;
+  int colBufWidth = outD * outH * outW;
+  MatrixPtr cpuColBuf = std::make_shared<CpuMatrix>(colBufHeight, colBufWidth);
+  MatrixPtr gpuColBuf = std::make_shared<GpuMatrix>(colBufHeight, colBufWidth);
+  cpuColBuf->vol2Col(cpuImage->getData(),
+                     channel,
+                     depth,
+                     height,
+                     width,
+                     filterZ,
+                     filterY,
+                     filterX,
+                     strideZ,
+                     strideY,
+                     strideX,
+                     padZ,
+                     padY,
+                     padX);
+  gpuColBuf->vol2Col(gpuImage->getData(),
+                     channel,
+                     depth,
+                     height,
+                     width,
+                     filterZ,
+                     filterY,
+                     filterX,
+                     strideZ,
+                     strideY,
+                     strideX,
+                     padZ,
+                     padY,
+                     padX);
+  TensorCheckEqual(*cpuColBuf, *gpuColBuf);  // vol2Col results must agree
+
+  cpuColBuf->randomizeUniform();  // new column data for the col2Vol pass
+  gpuColBuf->copyFrom(*cpuColBuf);
+  cpuColBuf->col2Vol(cpuImage->getData(),
+                     channel,
+                     depth,
+                     height,
+                     width,
+                     filterZ,
+                     filterY,
+                     filterX,
+                     strideZ,
+                     strideY,
+                     strideX,
+                     padZ,
+                     padY,
+                     padX,
+                     1.0,   // alpha
+                     1.0);  // beta
+  gpuColBuf->col2Vol(gpuImage->getData(),
+                     channel,
+                     depth,
+                     height,
+                     width,
+                     filterZ,
+                     filterY,
+                     filterX,
+                     strideZ,
+                     strideY,
+                     strideX,
+                     padZ,
+                     padY,
+                     padX,
+                     1.0,   // alpha
+                     1.0);  // beta
+  TensorCheckErr(*cpuImage, *gpuImage);  // col2Vol results are compared
+}
+
+TEST(Matrix, col2Vol) {  // runs testMatrixCol2Vol over a grid of volumes
+  for (auto depth : {9, 16, 64}) {
+    for (auto height : {9, 11, 128}) {
+      for (auto width : {9, 32, 128}) {
+        VLOG(3) << "depth=" << depth << " height=" << height
+                << " width=" << width;
+        testMatrixCol2Vol(depth, height, width);
+      }
+    }
+  }
+}
+
 #endif
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index f0fd12f1b5276d033ea086c60c80616fb1be7585..e5efcccb0e219a1c9df888cfec7f8902806676d4 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -1,7 +1,10 @@
+file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
+string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")  # drop ".cc" from names
 function(op_library TARGET)
     # op_library is a function to create op library. The interface is same as
     # cc_library. But it handle split GPU/CPU code and link some common library
     # for ops.
+    set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
     set(cc_srcs)
     set(cu_srcs)
     set(op_common_deps operator op_registry)
@@ -43,33 +46,26 @@ endfunction()
 
 add_subdirectory(math)
 
-cc_test(gather_test SRCS gather_test.cc DEPS tensor)
-op_library(gather_op SRCS gather_op.cc gather_op.cu)
-
-cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
-op_library(scatter_op SRCS scatter_op.cc scatter_op.cu)
-
-cc_library(net_op SRCS net_op.cc DEPS op_registry)
-cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
-
-op_library(add_op SRCS add_op.cc add_op.cu)
-
-op_library(mean_op SRCS mean_op.cc mean_op.cu)
+list(REMOVE_ITEM GENERAL_OPS
+     net_op
+     minus_op
+     mul_op
+     recurrent_op
+     scale_op)  # these ops get explicit op_library() calls below
 
+op_library(net_op SRCS net_op.cc)
+op_library(minus_op SRCS minus_op.cc minus_op.cu DEPS scale_op)
 op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function)
-op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc)
+op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
+  DEPS framework_proto tensor operator net_op)
+op_library(scale_op SRCS scale_op.cc scale_op.cu DEPS net_op)
 
-op_library(sigmoid_op SRCS sigmoid_op.cc sigmoid_op.cu)
-op_library(softmax_op SRCS softmax_op.cc softmax_op.cu)
-op_library(gaussian_random_op SRCS gaussian_random_op.cc gaussian_random_op.cu)
-op_library(cross_entropy_op SRCS cross_entropy_op.cc cross_entropy_op.cu)
-op_library(fill_zeros_like_op SRCS fill_zeros_like_op.cc fill_zeros_like_op.cu)
+foreach(src ${GENERAL_OPS})  # NOTE(review): assumes each op has .cc and .cu
+    op_library(${src} SRCS ${src}.cc ${src}.cu)
+endforeach()
 
-op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)
+set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 
-op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
-    DEPS framework_proto tensor op_registry operator net_op)
-op_library(uniform_random_op SRCS uniform_random_op.cc uniform_random_op.cu)
-op_library(lookup_table_op SRCS lookup_table_op.cc lookup_table_op.cu)
-op_library(scale_op SRCS scale_op.cc scale_op.cu DEPS net_op)
-op_library(minus_op SRCS minus_op.cc minus_op.cu DEPS scale_op)
+cc_test(gather_test SRCS gather_test.cc DEPS tensor)
+cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
+cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
diff --git a/paddle/operators/net_op.cc b/paddle/operators/net_op.cc
index 44d925f0b0cc5ff20d52e548816f118c2027343a..78b5e2767842312722fac3509e843a05fe194559 100644
--- a/paddle/operators/net_op.cc
+++ b/paddle/operators/net_op.cc
@@ -31,10 +31,13 @@ void NetOp::CompleteAddOp(bool calc) {
   for (auto& op : ops_) {
     for (auto& ipt : op->Inputs()) {
       for (auto& var_name : ipt.second) {
-        if (!Contains(output_set, var_name)) {  // Not other op's output
-          input_set.insert(var_name);
-        } else {
+        // If input variable has been in output set, then it will be
+        // added into intermediate_outputs_. Otherwise, it will be
+        // added into input set.
+        if (Contains(output_set, var_name)) {
           intermediate_outputs_.insert(var_name);
+        } else {
+          input_set.insert(var_name);
         }
       }
     }
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 2b945de18a4cdc3712ac7e282494ed7d3ecc600d..8dbef0b22e7b2f14c62586f86e686356b6e9c68e 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -186,6 +186,7 @@ void Argument::resizeAndCopyFrom(const Argument& src,
   resizeAndCopy(strs, src.strs, useGpu, stream);
   frameWidth = src.frameWidth;
   frameHeight = src.frameHeight;
+  frameDepth = src.frameDepth;
 }
 
 int32_t Argument::resizeAndCopyFrom(const Argument& src,
@@ -206,6 +207,7 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src,
   dataId = src.dataId;
   frameWidth = src.frameWidth;
   frameHeight = src.frameHeight;
+  frameDepth = src.frameDepth;
 
   if (!src.sequenceStartPositions) {
     // non-sequence input, copy samples directly
@@ -677,6 +679,7 @@ void Argument::reorganizeSeqInfo(
     const ICpuGpuVectorPtr subSeqStartPos,
     std::vector<std::vector<int>>& reorganizedSeqInfo) {
   CHECK(seqStartPos);
+  reorganizedSeqInfo.clear();
 
   int seqNum = seqStartPos->getSize() - 1;
   int* seqStarts = seqStartPos->getMutableData(false);
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index 38797a76f55c311070192bd307103143d67cabca..7b59199dded5b3f5d030e389d8bfcac1668fd127 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -1,11 +1,8 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -35,6 +32,7 @@ struct Argument {
         strs(nullptr),
         frameHeight(0),
         frameWidth(0),
+        frameDepth(0),
         sequenceStartPositions(nullptr),
         subSequenceStartPositions(nullptr),
         cpuSequenceDims(nullptr),
@@ -64,6 +62,7 @@ struct Argument {
     allCount = argument.allCount;
     frameHeight = argument.frameHeight;
     frameWidth = argument.frameWidth;
+    frameDepth = argument.frameDepth;
     dataId = argument.dataId;
   }
 
@@ -76,6 +75,7 @@ struct Argument {
   // A dataBatch includes batchSize frames, one frame maybe not only vector
   size_t frameHeight;
   size_t frameWidth;
+  size_t frameDepth;
 
   // If NULL, each position is treated independently.
   // Otherwise, its size should be #NumberOfSequences + 1.
@@ -136,8 +136,10 @@ struct Argument {
   }
   size_t getFrameHeight() const { return frameHeight; }
   size_t getFrameWidth() const { return frameWidth; }
+  size_t getFrameDepth() const { return frameDepth; }
   void setFrameHeight(size_t h) { frameHeight = h; }
   void setFrameWidth(size_t w) { frameWidth = w; }
+  void setFrameDepth(size_t d) { frameDepth = d; }
 
   int64_t getNumSequences() const {
     return sequenceStartPositions ? sequenceStartPositions->getSize() - 1
diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h
index 321f4275d8e68d7d3fbbc19acf0afacf689474e5..04f12efaac15a21ef54ae71074b6d474e2b66c04 100644
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -281,7 +281,11 @@ public:
   /**
    * @brief Set the format in header.
    */
-  void setHeaderFormat(int32_t fmt) { headerFormat_ = fmt; }
+  void setHeaderFormat(int32_t fmt) {
+    CHECK(isHeaderFormatSupported(fmt)) << "Unsupported format version: "
+                                        << fmt;
+    headerFormat_ = fmt;
+  }
 
   /**
    * @brief  Parameter Update Hook.
diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp
index 8616fd2d5aef666f16533fe062f3f40a7a2b202d..4203f2616456244df616ee2109436ab7caef9741 100644
--- a/paddle/pserver/LightNetwork.cpp
+++ b/paddle/pserver/LightNetwork.cpp
@@ -22,7 +22,6 @@ limitations under the License. */
 
 #include <arpa/inet.h>
 #include <net/if.h>
-#include <net/if_arp.h>
 #include <sys/ioctl.h>
 #include <sstream>
 
diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt
index 37e186a408ff5f560b5878e3e51ea81ca5810bc7..00030050700bfb2cee224124d090b0027d456ba0 100644
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
@@ -2,21 +2,5 @@ if(WITH_PYTHON)
 cc_library(paddle_pybind SHARED
     SRCS pybind.cc
     DEPS pybind python backward
-    sgd_op
-    gather_op
-    scatter_op
-    add_op
-    mul_op
-    rowwise_add_op
-    sigmoid_op
-    softmax_op
-    mean_op
-    cross_entropy_op
-    recurrent_op
-    uniform_random_op
-    gaussian_random_op
-    fill_zeros_like_op
-    lookup_table_op
-    scale_op
-    minus_op)
+    ${GLOB_OP_LIB})
 endif(WITH_PYTHON)
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 2941662f349baf57d1fe8188e88ce21d5de07750..17986420220fec173bbf3ecff240d4c504f8adbd 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -38,7 +38,7 @@ Configuring cmake in /paddle/build ...
       -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
       -DCUDNN_ROOT=/usr/
       -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
-      -DWITH_TESTING=${WITH_TESTING:-OFF}
+      -DWITH_TESTING=${WITH_TESTING:-ON}
       -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
 ========================================
 EOF
@@ -56,19 +56,18 @@ cmake .. \
       -DWITH_C_API=${WITH_C_API:-OFF} \
       -DWITH_PYTHON=${WITH_PYTHON:-ON} \
       -DCUDNN_ROOT=/usr/ \
-      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} \
-      -DWITH_TESTING=${WITH_TESTING:-OFF} \
+      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
+      -DWITH_TESTING=${WITH_TESTING:-ON} \
       -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
 
 cat <<EOF
 ============================================
 Building in /paddle/build ...
-   Build unit tests: ${WITH_TESTING:-OFF}
 ============================================
 EOF
 make -j `nproc`
 
-if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
+if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
 cat <<EOF
 ========================================
 Running unit tests ...
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index 1ea1e052596524f5baa0a55f601c4fa928acd8af..4ddf023780c704cb10c51ee9e5d7cb63420f9d73 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -85,6 +85,12 @@ message ConvConfig {
 
   optional uint32 dilation = 15 [ default = 1 ];
   optional uint32 dilation_y = 16 [ default = 1 ];
+
+  optional uint32 filter_size_z = 17 [ default = 1 ];
+  optional uint32 padding_z = 18 [ default = 1 ];
+  optional uint32 stride_z = 19 [ default = 1 ];
+  optional uint32 output_z = 20 [ default = 1 ];
+  optional uint32 img_size_z = 21 [ default = 1 ];
 }
 
 message PoolConfig {
@@ -127,6 +133,12 @@ message PoolConfig {
 
   // if not set, use padding
   optional uint32 padding_y = 13;
+
+  optional uint32 size_z = 14 [ default = 1 ];
+  optional uint32 stride_z = 15 [ default = 1 ];
+  optional uint32 output_z = 16 [ default = 1 ];
+  optional uint32 img_size_z = 17 [ default = 1 ];
+  optional uint32 padding_z = 18 [ default = 1 ];
 }
 
 message SppConfig {
@@ -499,6 +511,11 @@ message LayerConfig {
   optional int32 axis = 54 [ default = 2 ];
   repeated uint32 offset = 55;
   repeated uint32 shape = 56;
+
+  // for HuberRegressionLoss
+  optional double delta = 57 [ default = 1.0 ];
+
+  optional uint64 depth = 58 [ default = 1 ];
 }
 
 message EvaluatorConfig {
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index efc76764662b3832dbacc6c8a3c2bca4ccbe4cd8..152a56190c1ffddbf9590ed8f71308ceb88403f4 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -886,6 +886,36 @@ class Conv(Cfg):
             config_assert(output_x <= 0)
 
 
+# please refer to the comments in proto/ModelConfig.proto
+@config_class
+class Conv3D(Cfg):
+    def __init__(self,
+                 filter_size,
+                 channels,
+                 padding=None,
+                 stride=None,
+                 groups=None,
+                 filter_channels=None,
+                 output_x=None,
+                 img_size=None,
+                 caffe_mode=True,
+                 filter_size_y=None,
+                 padding_y=None,
+                 stride_y=None,
+                 filter_size_z=None,
+                 padding_z=None,
+                 stride_z=None):
+        self.add_keys(locals())
+        self.filter_size_y = filter_size_y if filter_size_y else filter_size
+        self.filter_size_z = filter_size_z if filter_size_z else filter_size
+        self.padding_y = padding_y if padding_y else padding
+        self.padding_z = padding_z if padding_z else padding
+        self.stride_y = stride_y if stride_y else stride
+        self.stride_z = stride_z if stride_z else stride
+        if output_x is not None:
+            config_assert(output_x <= 0)
+
+
 @config_class
 class BilinearInterp(Cfg):
     def __init__(self, out_size_x=None, out_size_y=None, channels=None):
@@ -908,6 +938,31 @@ class Pool(Cfg):
         self.add_keys(locals())
 
 
+@config_class
+class Pool3d(Cfg):
+    def __init__(
+            self,
+            pool_type,
+            channels,
+            size_x,
+            size_y=None,
+            size_z=None,
+            start=None,
+            stride=None,  # 1 by defalut in protobuf
+            stride_y=None,
+            stride_z=None,
+            padding=None,  # 0 by defalut in protobuf
+            padding_y=None,
+            padding_z=None):
+        self.add_keys(locals())
+        self.filter_size_y = size_y if size_y else size_x
+        self.filter_size_z = size_z if size_z else size_x
+        self.padding_y = padding_y if padding_y else padding
+        self.padding_z = padding_z if padding_z else padding
+        self.stride_y = stride_y if stride_y else stride
+        self.stride_z = stride_z if stride_z else stride
+
+
 @config_class
 class SpatialPyramidPool(Cfg):
     def __init__(self, pool_type, pyramid_height, channels):
@@ -1172,6 +1227,20 @@ def get_img_size(input_layer_name, channels):
     return img_size, img_size_y
 
 
+def get_img3d_size(input_layer_name, channels):  # -> (width, height, depth)
+    input = g_layer_map[input_layer_name]
+    img_pixels = input.size / channels  # elements per channel
+    img_size = input.width
+    img_size_y = input.height
+    img_size_z = input.depth
+
+    config_assert(  # the three extents must account for every element
+        img_size * img_size_y * img_size_z == img_pixels,
+        "Input layer %s: Incorrect input image size %d * %d * %d for input image pixels %d"
+        % (input_layer_name, img_size, img_size_y, img_size_z, img_pixels))
+    return img_size, img_size_y, img_size_z
+
+
 def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
     parse_image(bilinear, input_layer_name, bilinear_conf.image_conf)
     bilinear_conf.out_size_x = bilinear.out_size_x
@@ -1209,6 +1278,45 @@ def parse_pool(pool, input_layer_name, pool_conf, ceil_mode):
                                          pool_conf.stride_y, not ceil_mode)
 
 
+def parse_pool3d(pool, input_layer_name, pool_conf, ceil_mode):
+    pool_conf.pool_type = pool.pool_type
+    config_assert(pool.pool_type in ['max-projection', 'avg-projection'],
+                  "pool-type %s is not in "
+                  "['max-projection', 'avg-projection']" % pool.pool_type)
+
+    pool_conf.channels = pool.channels
+
+    pool_conf.size_x = pool.size_x
+    pool_conf.stride = pool.stride
+    pool_conf.padding = pool.padding
+
+    pool_conf.size_y = default(pool.size_y, pool_conf.size_x)
+    pool_conf.size_z = default(pool.size_z, pool_conf.size_x)
+    pool_conf.stride_y = default(pool.stride_y, pool_conf.stride)
+    pool_conf.stride_z = default(pool.stride_z, pool_conf.stride)
+    pool_conf.padding_y = default(pool.padding_y, pool_conf.padding)
+    pool_conf.padding_z = default(pool.padding_z, pool_conf.padding)
+
+    pool_conf.img_size, pool_conf.img_size_y, pool_conf.img_size_z = \
+        get_img3d_size(input_layer_name, pool.channels)
+
+    config_assert(not pool.start, "start is deprecated in pooling.")
+
+    if pool.padding is not None:
+        pool_conf.padding = pool.padding
+    pool_conf.padding_y = default(pool.padding_y, pool_conf.padding)
+    pool_conf.padding_z = default(pool.padding_z, pool_conf.padding)
+    pool_conf.output_x = cnn_output_size(pool_conf.img_size, pool_conf.size_x,
+                                         pool_conf.padding, pool_conf.stride,
+                                         not ceil_mode)
+    pool_conf.output_y = cnn_output_size(pool_conf.img_size_y, pool_conf.size_y,
+                                         pool_conf.padding_y,
+                                         pool_conf.stride_y, not ceil_mode)
+    pool_conf.output_z = cnn_output_size(pool_conf.img_size_z, pool_conf.size_z,
+                                         pool_conf.padding_z,
+                                         pool_conf.stride_z, not ceil_mode)
+
+
 def parse_spp(spp, input_layer_name, spp_conf):
     parse_image(spp, input_layer_name, spp_conf.image_conf)
     spp_conf.pool_type = spp.pool_type
@@ -1282,6 +1390,50 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
             conv_conf.stride_y, conv_conf.caffe_mode)
 
 
+# caffe_mode: compute the output size using floor instead of ceil,
+#             which is consistent with the convention of Caffe and cuDNN.
+def parse_conv3d(conv, input_layer_name, conv_conf, num_filters, trans=False):
+    conv_conf.filter_size = conv.filter_size
+    conv_conf.filter_size_y = conv.filter_size_y
+    conv_conf.filter_size_z = conv.filter_size_z
+    conv_conf.channels = conv.channels
+    conv_conf.padding = conv.padding
+    conv_conf.padding_y = conv.padding_y
+    conv_conf.padding_z = conv.padding_z
+    conv_conf.stride = conv.stride
+    conv_conf.stride_y = conv.stride_y
+    conv_conf.stride_z = conv.stride_z
+    conv_conf.groups = conv.groups
+    conv_conf.caffe_mode = conv.caffe_mode
+
+    if not trans:  # input layer gives img_size*; output_* is derived
+        conv_conf.filter_channels = conv.channels / conv.groups
+        conv_conf.img_size, conv_conf.img_size_y, conv_conf.img_size_z = \
+            get_img3d_size(input_layer_name, conv.channels)
+        conv_conf.output_x = cnn_output_size(
+            conv_conf.img_size, conv_conf.filter_size, conv_conf.padding,
+            conv_conf.stride, conv_conf.caffe_mode)
+        conv_conf.output_y = cnn_output_size(
+            conv_conf.img_size_y, conv_conf.filter_size_y, conv_conf.padding_y,
+            conv_conf.stride_y, conv_conf.caffe_mode)
+        conv_conf.output_z = cnn_output_size(
+            conv_conf.img_size_z, conv_conf.filter_size_z, conv_conf.padding_z,
+            conv_conf.stride_z, conv_conf.caffe_mode)
+    else:  # input layer gives output_*; img_size* is derived
+        conv_conf.filter_channels = num_filters / conv.groups
+        conv_conf.output_x, conv_conf.output_y, conv_conf.output_z = \
+            get_img3d_size(input_layer_name, conv.channels)
+        conv_conf.img_size = cnn_image_size(
+            conv_conf.output_x, conv_conf.filter_size, conv_conf.padding,
+            conv_conf.stride, conv_conf.caffe_mode)
+        conv_conf.img_size_y = cnn_image_size(
+            conv_conf.output_y, conv_conf.filter_size_y, conv_conf.padding_y,
+            conv_conf.stride_y, conv_conf.caffe_mode)
+        conv_conf.img_size_z = cnn_image_size(
+            conv_conf.output_z, conv_conf.filter_size_z, conv_conf.padding_z,
+            conv_conf.stride_z, conv_conf.caffe_mode)
+
+
 def parse_block_expand(block_expand, input_layer_name, block_expand_conf):
     block_expand_conf.channels = block_expand.channels
     block_expand_conf.stride_x = block_expand.stride_x
@@ -1585,6 +1737,9 @@ class LayerBase(object):
         self.config.height = height
         self.config.width = width
 
+    def set_layer_depth(self, depth):
+        self.config.depth = depth
+
     def set_cnn_layer(self,
                       input_layer_name,
                       height,
@@ -1607,6 +1762,21 @@ class MultiClassCrossEntropySelfNormCostLayer(LayerBase):
         self.config.softmax_selfnorm_alpha = softmax_selfnorm_alpha
 
 
+@config_layer('cross_entropy_over_beam')
+class CrossEntropyOverBeamLayer(LayerBase):
+    def __init__(self, name, inputs, **xargs):
+        config_assert(len(inputs) % 3 == 0, "Error input number.")
+        super(CrossEntropyOverBeamLayer, self).__init__(
+            name, 'cross_entropy_over_beam', 0, inputs, **xargs)
+        input_num = len(inputs) / 3  # number of input triples
+        for i in range(input_num):
+            input_layer = self.get_input_layer(i * 3)  # first of each triple
+            config_assert(input_layer.size == 1, (
+                "Inputs for this layer are made up of "
+                "several triples, in which the first one is scores over "
+                "all candidate paths, whose size should be equal to 1."))
+
+
 @config_layer('fc')
 class FCLayer(LayerBase):
     layer_type = 'fc'
@@ -1788,11 +1958,19 @@ class DetectionOutputLayer(LayerBase):
 
 @config_layer('data')
 class DataLayer(LayerBase):
-    def __init__(self, name, size, height=None, width=None, device=None):
+    def __init__(self,
+                 name,
+                 size,
+                 depth=None,
+                 height=None,
+                 width=None,
+                 device=None):
         super(DataLayer, self).__init__(
             name, 'data', size, inputs=[], device=device)
         if height and width:
             self.set_layer_height_width(height, width)
+        if depth:
+            self.set_layer_depth(depth)
 
 
 '''
@@ -1907,7 +2085,7 @@ class ConvLayerBase(LayerBase):
 
     def calc_parameter_size(self, conv_conf):
         return self.config.num_filters * conv_conf.filter_channels \
-                    * (conv_conf.filter_size * conv_conf.filter_size_y)
+               * (conv_conf.filter_size * conv_conf.filter_size_y)
 
 
 @config_layer('exconv')
@@ -1991,6 +2169,87 @@ class ConvTransLayer(ConvTransLayerBase):
     layer_type = 'cudnn_convt'
 
 
+@config_layer('conv_3d')
+class Conv3DLayerBase(LayerBase):
+    def __init__(self,
+                 name,
+                 inputs=[],
+                 bias=True,
+                 num_filters=None,
+                 shared_biases=True,
+                 **xargs):
+        super(Conv3DLayerBase, self).__init__(
+            name, self.layer_type, 0, inputs=inputs, **xargs)
+
+        if num_filters is not None:
+            self.config.num_filters = num_filters
+
+        # record the layer type; layer_type is defined by the subclasses
+        self.config.type = self.layer_type
+
+        trans = False  # True only for deconv3d (transposed convolution)
+        if self.config.type == "deconv3d":
+            trans = True
+
+        if shared_biases is not None:
+            self.config.shared_biases = shared_biases
+
+        for input_index in xrange(len(self.inputs)):
+            input_layer = self.get_input_layer(input_index)
+            conv_conf = self.config.inputs[input_index].conv_conf
+            parse_conv3d(
+                self.inputs[input_index].conv,
+                input_layer.name,
+                conv_conf,
+                num_filters,
+                trans=trans
+            )  # for z-axis pad:0, stride:1, filter_size:1, img_size:1
+            psize = self.calc_parameter_size(conv_conf)
+            self.create_input_parameter(input_index, psize)
+            if trans:  # deconv: the layer output has the conv image size
+                self.set_cnn_layer(name, conv_conf.img_size_z,
+                                   conv_conf.img_size_y, conv_conf.img_size,
+                                   self.config.num_filters)
+            else:
+                self.set_cnn_layer(name, conv_conf.output_z, conv_conf.output_y,
+                                   conv_conf.output_x, self.config.num_filters)
+
+        psize = self.config.size  # bias size: one per output element ...
+        if shared_biases:
+            psize = self.config.num_filters  # ... or one per filter if shared
+        self.create_bias_parameter(bias, psize, [psize, 1])
+
+    def calc_parameter_size(self, conv_conf):  # weight count for one input
+        return self.config.num_filters * conv_conf.filter_channels \
+               * (conv_conf.filter_size * conv_conf.filter_size_y \
+                  * conv_conf.filter_size_z)
+
+    def set_cnn_layer(self,
+                      input_layer_name,
+                      depth,
+                      height,
+                      width,
+                      channels,
+                      is_print=True):
+        size = depth * height * width * channels  # total output elements
+        self.set_layer_size(size)
+        self.set_layer_height_width(height, width)
+        self.set_layer_depth(depth)
+        if is_print:
+            print("output for %s: c = %d, d = %d, h = %d, w = %d, size = %d" %
+                  (input_layer_name, channels, depth, height, width, size))
+
+
+@config_layer('conv3d')
+class Conv3DLayer(Conv3DLayerBase):
+    layer_type = 'conv3d'
+
+
+@config_layer('deconv3d')
+class Conv3DLayer(Conv3DLayerBase):
+    layer_type = 'deconv3d'
+
+
 @config_layer('norm')
 class NormLayer(LayerBase):
     def __init__(self, name, inputs, **xargs):
@@ -2020,6 +2279,35 @@ class PoolLayer(LayerBase):
                                pool_conf.channels)
 
 
+@config_layer('pool3d')
+class Pool3DLayer(LayerBase):
+    def __init__(self, name, inputs, ceil_mode=True, **xargs):
+        super(Pool3DLayer, self).__init__(
+            name, 'pool3d', 0, inputs=inputs, **xargs)
+        for input_index in xrange(len(self.inputs)):
+            input_layer = self.get_input_layer(input_index)
+            pool_conf = self.config.inputs[input_index].pool_conf
+            parse_pool3d(self.inputs[input_index].pool, input_layer.name,
+                         pool_conf, ceil_mode)
+            self.set_cnn_layer(name, pool_conf.output_z, pool_conf.output_y,
+                               pool_conf.output_x, pool_conf.channels)
+
+    def set_cnn_layer(self,
+                      input_layer_name,
+                      depth,
+                      height,
+                      width,
+                      channels,
+                      is_print=True):
+        size = depth * height * width * channels  # total output elements
+        self.set_layer_size(size)
+        self.set_layer_height_width(height, width)
+        self.set_layer_depth(depth)
+        if is_print:
+            print("output for %s: c = %d, d = %d, h = %d, w = %d, size = %d" %
+                  (input_layer_name, channels, depth, height, width, size))
+
+
 @config_layer('spp')
 class SpatialPyramidPoolLayer(LayerBase):
     def __init__(self, name, inputs, **xargs):
@@ -2268,13 +2556,14 @@ def define_cost(class_name, cost_type):
 
 
 define_cost('MultiClassCrossEntropy', 'multi-class-cross-entropy')
+define_cost('CrossEntropyOverBeamCostLayer', 'cross_entropy_over_beam')
 define_cost('RankingCost', 'rank-cost')
 define_cost('AucValidation', 'auc-validation')
 define_cost('PnpairValidation', 'pnpair-validation')
 define_cost('SumOfSquaresCostLayer', 'square_error')
 define_cost('MultiBinaryLabelCrossEntropy', 'multi_binary_label_cross_entropy')
 define_cost('SoftBinaryClassCrossEntropy', 'soft_binary_class_cross_entropy')
-define_cost('HuberTwoClass', 'huber')
+define_cost('HuberTwoClassification', 'huber_classification')
 define_cost('SumCost', 'sum_cost')
 define_cost('SmoothL1Cost', 'smooth_l1')
 
@@ -2336,6 +2625,17 @@ class LambdaCost(LayerBase):
         self.config.max_sort_size = max_sort_size
 
 
+@config_layer('huber_regression')
+class HuberRegressionLoss(LayerBase):
+    def __init__(self, name, inputs, delta=1., coeff=1., device=None):
+        super(HuberRegressionLoss, self).__init__(
+            name, 'huber_regression', 1, inputs=inputs, device=device)
+        config_assert(
+            len(self.inputs) == 2, 'HuberRegression must have 2 inputs')
+        self.config.delta = delta  # stored in LayerConfig.delta
+        self.config.coeff = coeff
+
+
 @config_layer('nce')
 class NCELayer(LayerBase):
     def __init__(self,
diff --git a/python/paddle/trainer/recurrent_units.py b/python/paddle/trainer/recurrent_units.py
old mode 100755
new mode 100644
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
old mode 100755
new mode 100644
index 862265f2cdeef1da5623bfe618008030caa98636..2bd274fad2ab7eed0902ffe944c6e0670f963233
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import functools
 import collections
 import inspect
@@ -106,11 +105,14 @@ __all__ = [
     'nce_layer',
     'cross_entropy_with_selfnorm',
     'cross_entropy',
+    'BeamInput',
+    'cross_entropy_over_beam',
     'multi_binary_label_cross_entropy',
     'sum_cost',
     'rank_cost',
     'lambda_cost',
-    'huber_cost',
+    'huber_regression_cost',
+    'huber_classification_cost',
     'block_expand_layer',
     'maxout_layer',
     'out_prod_layer',
@@ -135,8 +137,10 @@ __all__ = [
     'clip_layer',
     'slice_projection',
     'seq_slice_layer',
-    'kmax_sequence_score_layer',
+    'kmax_seq_score_layer',
+    'img_pool3d_layer',
     'scale_shift_layer',
+    'img_conv3d_layer',
 ]
 
 
@@ -165,6 +169,7 @@ class LayerType(object):
     EXCONVTRANS_LAYER = 'exconvt'
     CUDNNCONV_LAYER = 'cudnn_conv'
     POOL_LAYER = 'pool'
+    POOL3D_LAYER = 'pool3d'
     BATCH_NORM_LAYER = 'batch_norm'
     NORM_LAYER = 'norm'
     SUM_TO_ONE_NORM_LAYER = 'sum_to_one_norm'
@@ -218,11 +223,16 @@ class LayerType(object):
     CRF_DECODING_LAYER = 'crf_decoding'
     NCE_LAYER = 'nce'
 
+    CONV3D_LAYER = 'conv3d'
+    DECONV3D_LAYER = 'deconv3d'
+
     RANK_COST = 'rank-cost'
     LAMBDA_COST = 'lambda_cost'
-    HUBER = 'huber'
+    HUBER_REGRESSION = 'huber_regression'
+    HUBER_CLASSIFICATION = 'huber_classification'
     CROSS_ENTROPY = 'multi-class-cross-entropy'
     CROSS_ENTROPY_WITH_SELFNORM = 'multi_class_cross_entropy_with_selfnorm'
+    CROSS_ENTROPY_OVER_BEAM = 'cross_entropy_over_beam'
     SOFT_BIN_CLASS_CROSS_ENTROPY = 'soft_binary_class_cross_entropy'
     MULTI_BIN_LABEL_CROSS_ENTROPY = 'multi_binary_label_cross_entropy'
     SUM_COST = 'sum_cost'
@@ -892,7 +902,8 @@ def mixed_layer(size=0,
 
 
 @layer_support()
-def data_layer(name, size, height=None, width=None, layer_attr=None):
+def data_layer(name, size, depth=None, height=None, width=None,
+               layer_attr=None):
     """
     Define DataLayer For NeuralNetwork.
 
@@ -919,15 +930,18 @@ def data_layer(name, size, height=None, width=None, layer_attr=None):
         type=LayerType.DATA,
         name=name,
         size=size,
+        depth=depth,
         height=height,
         width=width,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
 
+    if depth is None:
+        depth = 1
     num_filters = None
     if height is not None and width is not None:
-        num_filters = size / (width * height)
-        assert num_filters * width * height == size, \
-            "size=%s width=%s height=%s" % (size, width, height)
+        num_filters = size / (width * height * depth)
+        assert num_filters * width * height * depth == size, \
+                "size=%s width=%s height=%s depth=%s"  % (size, width, height, depth)
 
     return LayerOutput(name, LayerType.DATA, size=size, num_filters=num_filters)
 
@@ -2651,6 +2665,146 @@ def img_pool_layer(input,
         size=l.config.size)
 
 
+@wrap_name_default("pool3d")
+@layer_support()
+def img_pool3d_layer(input,
+                     pool_size,
+                     name=None,
+                     num_channels=None,
+                     pool_type=None,
+                     stride=1,
+                     padding=0,
+                     layer_attr=None,
+                     pool_size_y=None,
+                     stride_y=None,
+                     padding_y=None,
+                     pool_size_z=None,
+                     stride_z=None,
+                     padding_z=None,
+                     ceil_mode=True):
+    """
+    3D image pooling layer.
+
+    For details of pooling layers, please refer to ufldl's pooling_ .
+
+    .. _pooling: http://ufldl.stanford.edu/tutorial/supervised/Pooling/
+
+    - ceil_mode=True:
+
+    ..  math::
+
+        w = 1 + int(ceil((input\_width + 2 * padding - pool\_size) / float(stride)))
+        h = 1 + int(ceil((input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)))
+        d = 1 + int(ceil((input\_depth + 2 * padding\_z - pool\_size\_z) / float(stride\_z)))
+
+    - ceil_mode=False:
+
+    ..  math::
+
+        w = 1 + int(floor((input\_width + 2 * padding - pool\_size) / float(stride)))
+        h = 1 + int(floor((input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)))
+        d = 1 + int(floor((input\_depth + 2 * padding\_z - pool\_size\_z) / float(stride\_z)))
+
+    The example usage is:
+
+    ..  code-block:: python
+
+        maxpool = img_pool3d_layer(input=conv,
+                                   pool_size=3,
+                                   num_channels=8,
+                                   stride=1,
+                                   padding=1,
+                                   pool_type=MaxPooling())
+
+    :param padding: pooling padding along x, or a sequence of three (x, y, z).
+    :type padding: int|tuple|list
+    :param name: name of pooling layer
+    :type name: basestring.
+    :param input: layer's input
+    :type input: LayerOutput
+    :param pool_size: pooling window along x, or a sequence of three (x, y, z).
+    :type pool_size: int|tuple|list
+    :param num_channels: number of input channels. Taken from input if None.
+    :type num_channels: int
+    :param pool_type: pooling type. MaxPooling or AvgPooling. Default is
+                      MaxPooling.
+    :type pool_type: BasePoolingType
+    :param stride: pooling stride along x, or a sequence of three (x, y, z).
+    :type stride: int|tuple|list
+    :param layer_attr: Extra Layer attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :param ceil_mode: Whether to use ceil mode to calculate the output size.
+                      Default is True. If set to False, floor is used instead.
+
+    :type ceil_mode: bool
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    if num_channels is None:
+        assert input.num_filters is not None
+        num_channels = input.num_filters
+
+    if pool_type is None:
+        pool_type = MaxPooling()
+    elif isinstance(pool_type, AvgPooling):
+        pool_type.name = 'avg'
+
+    if isinstance(pool_type, (AvgPooling, MaxPooling)):
+        type_name = pool_type.name + '-projection'
+    else:
+        type_name = pool_type.name
+
+    if isinstance(pool_size, collections.Sequence):
+        assert len(pool_size) == 3
+        pool_size, pool_size_y, pool_size_z = pool_size
+    else:
+        pool_size_y = pool_size
+        pool_size_z = pool_size
+
+    if isinstance(stride, collections.Sequence):
+        assert len(stride) == 3
+        stride, stride_y, stride_z = stride
+    else:
+        stride_y = stride
+        stride_z = stride
+
+    if isinstance(padding, collections.Sequence):
+        assert len(padding) == 3
+        padding, padding_y, padding_z = padding
+    else:
+        padding_y = padding
+        padding_z = padding
+
+    l = Layer(
+        name=name,
+        type=LayerType.POOL3D_LAYER,
+        inputs=[
+            Input(
+                input.name,
+                pool=Pool3d(
+                    pool_type=type_name,
+                    channels=num_channels,
+                    size_x=pool_size,
+                    start=None,
+                    stride=stride,
+                    padding=padding,
+                    size_y=pool_size_y,
+                    stride_y=stride_y,
+                    padding_y=padding_y,
+                    size_z=pool_size_z,
+                    stride_z=stride_z,
+                    padding_z=padding_z))
+        ],
+        ceil_mode=ceil_mode,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name,
+        LayerType.POOL3D_LAYER,
+        parents=[input],
+        num_filters=num_channels,
+        size=l.config.size)
+
+
 @wrap_name_default("spp")
 @layer_support()
 def spp_layer(input,
@@ -4069,8 +4223,12 @@ def __cost_input__(input, label, weight=None):
     """
     inputs and parents for cost layers.
     """
-    ipts = [Input(input.name), Input(label.name)]
-    parents = [input, label]
+    if isinstance(input, LayerOutput):
+        input = [input]
+    if isinstance(label, LayerOutput):
+        label = [label]
+    ipts = [Input(ipt.name) for ipt in (input + label)]
+    parents = [ipt for ipt in (input + label)]
     if weight is not None:
         assert weight.size == 1
         ipts.append(Input(weight.name))
@@ -5057,17 +5215,6 @@ def warp_ctc_layer(input,
     building process, PaddlePaddle will clone the source codes, build and
     install it to :code:`third_party/install/warpctc` directory.
 
-    To use warp_ctc layer, you need to specify the path of :code:`libwarpctc.so`,
-    using following methods:
-
-    1. Set it in :code:`paddle.init` (python api) or :code:`paddle_init` (c api),
-    such as :code:`paddle.init(use_gpu=True,
-    warpctc_dir=your_paddle_source_dir/third_party/install/warpctc/lib)`.
-
-    2. Set environment variable LD_LIBRARY_PATH on Linux or DYLD_LIBRARY_PATH
-    on Mac OS. For instance, :code:`export
-    LD_LIBRARY_PATH=your_paddle_source_dir/third_party/install/warpctc/lib:$LD_LIBRARY_PATH`.
-
     More details of CTC can be found by referring to `Connectionist Temporal
     Classification: Labelling Unsegmented Sequence Data with Recurrent
     Neural Networks <http://machinelearning.wustl.edu/mlpapers/paper_files/
@@ -5644,16 +5791,77 @@ def sum_cost(input, name=None, layer_attr=None):
 
 @wrap_name_default()
 @layer_support()
-def huber_cost(input, label, name=None, coeff=1.0, layer_attr=None):
+def huber_regression_cost(input,
+                          label,
+                          name=None,
+                          delta=1.0,
+                          coeff=1.0,
+                          layer_attr=None):
     """
-    A loss layer for huber loss.
+    In statistics, the Huber loss is a loss function used in robust regression, 
+    that is less sensitive to outliers in data than the squared error loss. 
+    Given a prediction f(x), a label y and :math:`\delta`, the loss function 
+    is defined as:
+
+    .. math::
+       loss = 0.5*\left ( y-f(x) \right )^2, \left | y-f(x) \right |\leq \delta
+       loss = \delta \left | y-f(x) \right |-0.5\delta ^2, otherwise
 
     The example usage is:
 
     .. code-block:: python
 
-       cost = huber_cost(input=input_layer,
-                         label=label_layer)
+       cost = huber_regression_cost(input=input_layer, label=label_layer)
+
+    :param input: The first input layer.
+    :type input: LayerOutput.
+    :param label: The input label.
+    :type label: LayerOutput.
+    :param name: The name of this layer. It is not necessary.
+    :type name: None|basestring.
+    :param delta: The threshold at which the loss switches from quadratic to linear.
+    :type delta: float.
+    :param coeff: The coefficient affects the gradient in the backward.
+    :type coeff: float.
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :return: LayerOutput object.
+    :rtype: LayerOutput.
+    """
+    assert isinstance(input, LayerOutput)
+    Layer(
+        name=name,
+        type=LayerType.HUBER_REGRESSION,
+        inputs=[input.name, label.name],
+        delta=delta,
+        coeff=coeff,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.HUBER_REGRESSION, parents=[input, label], size=1)
+
+
+@wrap_name_default()
+@layer_support()
+def huber_classification_cost(input,
+                              label,
+                              name=None,
+                              coeff=1.0,
+                              layer_attr=None):
+    """
+    For classification purposes, a variant of the Huber loss called modified Huber 
+    is sometimes used. Given a prediction f(x) (a real-valued classifier score) and 
+    a true binary class label :math:`y\in \left \{-1, 1 \right \}`, the modified Huber 
+    loss is defined as:
+
+    .. math::
+       loss = \max \left ( 0, 1-yf(x) \right )^2, yf(x)\geq -1
+       loss = -4yf(x), \text{otherwise}
+
+    The example usage is:
+
+    .. code-block:: python
+
+       cost = huber_classification_cost(input=input_layer, label=label_layer)
 
     :param input: The first input layer.
     :type input: LayerOutput.
@@ -5673,11 +5881,12 @@ def huber_cost(input, label, name=None, coeff=1.0, layer_attr=None):
         assert input.size == 1
     Layer(
         name=name,
-        type=LayerType.HUBER,
+        type=LayerType.HUBER_CLASSIFICATION,
         inputs=[input.name, label.name],
         coeff=coeff,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(name, LayerType.HUBER, parents=[input, label], size=1)
+    return LayerOutput(
+        name, LayerType.HUBER_CLASSIFICATION, parents=[input, label], size=1)
 
 
 @wrap_name_default()
@@ -5713,10 +5922,10 @@ def multi_binary_label_cross_entropy(input,
 
     if input.activation is None or \
             not isinstance(input.activation, SigmoidActivation):
-        logger.log(
-            logging.WARN,
-            "%s is not recommend for multi_binary_label_cross_entropy's activation, "
-            "maybe the sigmoid is better" % repr(input.activation))
+        logger.log(logging.WARN,
+                   ("%s is not a recommended activation for "
+                    "multi_binary_label_cross_entropy, sigmoid is better") %
+                   repr(input.activation))
 
     Layer(
         name=name,
@@ -5731,6 +5940,113 @@ def multi_binary_label_cross_entropy(input,
         size=1)
 
 
+class BeamInput(object):
+    """
+    Define the input for cross_entropy_over_beam layer.
+
+    A beam is made up of a triple: the first one is scores over all
+    candidates (a layer whose size must be 1); the second one is indices of
+    top k selected candidates; the third one is the index of ground truth,
+    which is also called gold. All three must be LayerOutput objects.
+    """
+
+    def __init__(self, candidate_scores, selected_candidates, gold):
+        assert isinstance(candidate_scores, LayerOutput)
+        self.candidate_scores = candidate_scores
+        assert candidate_scores.size == 1
+
+        assert isinstance(selected_candidates, LayerOutput)
+        self.selected_candidates = selected_candidates
+
+        assert isinstance(gold, LayerOutput)
+        self.gold = gold
+
+
+@wrap_name_default()
+@layer_support()
+def cross_entropy_over_beam(input, name=None):
+    """
+    This layer is used in learning to search models, which is to solve complex
+    joint prediction problems based on learning to search through a
+    problem-defined search space.
+
+    Specifically, the learning to search process for this layer begins with
+    searching a target sequence from a nested sequence. In the first search
+    step, top beam size sequences with highest scores, indices of these top k
+    sequences in the original nested sequence, and the ground truth (also
+    called gold) altogether (a triple) make up the first beam.
+
+    Then, several special positions, for example, start and end positions
+    that define meaningful segments are searched. In these searches, top k
+    positions with highest scores are selected, and then sequences, starting
+    from the selected starts till ends of the sequences (or a fixed position)
+    are taken to search next.
+
+    We call the possible top k results returned in one search the beam. This
+    search process can be repeated for pre-defined turns and leads to several
+    beam expansions.
+
+    Finally, the layer cross_entropy_over_beam takes all the beam expansions
+    which contain several candidate targets found along the multi-step search.
+    cross_entropy_over_beam calculates cross entropy over the expanded beams,
+    using all the candidates in the beam as the normalization factor.
+
+    Note that, if gold falls off the beam at search step t, then the cost is
+    calculated over the beam at step t.
+
+    This cost layer always works together with kmax_seq_score_layer,
+    sub_nested_seq_layer, and sequence_slice_layer to trim the input to form a
+    sub-search space.
+
+
+    The example usage is:
+
+    .. code-block:: python
+
+       cost = cross_entropy_over_beam(input=[
+           BeamInput(
+               candidate_scores=beam1_candidates,
+               selected_candidates=beam1_topk,
+               gold=gold1),
+           BeamInput(
+               candidate_scores=beam2_candidates,
+               selected_candidates=beam2_topk,
+               gold=gold2),
+       ])
+
+
+    :param input: input beams for this layer; one BeamInput or a list of them.
+    :type input: BeamInput|list
+    :param name: The name of this layer.
+    :type name: basestring
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    if isinstance(input, BeamInput):
+        input = [input]
+    else:
+        assert isinstance(input, list), (
+            'input for cross_entropy_over_beam should be a python list '
+            'of BeamInput object.')
+        for ipt in input:
+            assert isinstance(ipt, BeamInput), (
+                'input for cross_entropy_over_beam '
+                'should be a BeamInput object.')
+
+    ipts = []
+    parents = []
+    for beam in input:
+        parents += [beam.candidate_scores, beam.selected_candidates, beam.gold]
+        ipts += [
+            beam.candidate_scores.name, beam.selected_candidates.name,
+            beam.gold.name
+        ]
+
+    Layer(name=name, type=LayerType.CROSS_ENTROPY_OVER_BEAM, inputs=ipts)
+    return LayerOutput(
+        name, LayerType.CROSS_ENTROPY_OVER_BEAM, parents=parents, size=1)
+
+
 @wrap_name_default()
 @layer_support()
 def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
@@ -6281,14 +6597,14 @@ def seq_slice_layer(input, starts, ends, name=None):
 
 @wrap_name_default()
 @layer_support()
-def kmax_sequence_score_layer(input, name=None, beam_size=1):
+def kmax_seq_score_layer(input, name=None, beam_size=1):
     """
     This layer accepts one input which are scores over a sequence or a nested
     sequence, and returns indices of beam_size sequences with highest scores.
 
     .. code-block:: python
 
-        kmax_indices = kmax_sequence_score_layer(input=input_layer, beam_size)
+        kmax_indices = kmax_seq_score_layer(input=input_layer, beam_size)
 
 
     :param name: The Layer Name.
@@ -6301,10 +6617,10 @@ def kmax_sequence_score_layer(input, name=None, beam_size=1):
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
-    assert isinstance(input, LayerOutput), ("kmax_sequence_score_layer "
+    assert isinstance(input, LayerOutput), ("kmax_seq_score_layer "
                                             "accepts only one input.")
     assert input.size == 1, (
-        "input of kmax_sequence_score_layer is a score"
+        "input of kmax_seq_score_layer is a score "
         "over a sequence or a nested sequence, so its width must be 1.")
 
     Layer(
@@ -6317,6 +6633,149 @@ def kmax_sequence_score_layer(input, name=None, beam_size=1):
         name, LayerType.KMAX_SEQ_SCORE, parents=[input], size=input.size)
 
 
+@wrap_name_default("conv3d")
+@wrap_param_attr_default()
+@wrap_bias_attr_default()
+@wrap_act_default(act=ReluActivation())
+@layer_support(DROPOUT)
+def img_conv3d_layer(input,
+                     filter_size,
+                     num_filters,
+                     name=None,
+                     num_channels=None,
+                     act=None,
+                     groups=1,
+                     stride=1,
+                     padding=0,
+                     bias_attr=None,
+                     param_attr=None,
+                     shared_biases=True,
+                     layer_attr=None,
+                     trans=False,
+                     layer_type=None):
+    """
+    3D convolution layer, or 3D transposed convolution layer when trans=True.
+
+    The example usage is:
+
+    ..  code-block:: python
+
+        conv = img_conv3d_layer(input=data, filter_size=1,
+                                num_channels=8, num_filters=16,
+                                stride=1, bias_attr=False,
+                                act=ReluActivation())
+
+    :param name: Layer name.
+    :type name: basestring
+    :param input: Layer Input.
+    :type input: LayerOutput
+    :param filter_size: The x dimension of a filter kernel, or three (x, y, z).
+    :type filter_size: int|tuple|list
+    :param num_filters: Each filter group's number of filters.
+    :param act: Activation type. Default is ReluActivation.
+    :type act: BaseActivation
+    :param groups: Group size of filters.
+    :type groups: int
+    :param stride: The x dimension of the stride. Or input a sequence of three
+                   values, one per dimension (x, y, z).
+    :type stride: int|tuple|list
+    :param padding: The x dimension of the padding. Or input a sequence of
+                    three values, one per dimension (x, y, z).
+    :type padding: int|tuple|list
+    :param bias_attr: Convolution bias attribute. None means default bias.
+                      False means no bias.
+    :type bias_attr: ParameterAttribute|False
+    :param num_channels: number of input channels. If None will be set
+                        automatically from previous output.
+    :type num_channels: int
+    :param param_attr: Convolution param attribute. None means default attribute
+    :type param_attr: ParameterAttribute
+    :param shared_biases: Whether biases are shared between filters or not.
+    :type shared_biases: bool
+    :param layer_attr: Layer Extra Attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :param trans: true if it is a convTransLayer, false if it is a convLayer
+    :type trans: bool
+    :param layer_type: specify the layer_type, default is None. If trans=True,
+                       layer_type has to be "deconv3d"; if trans=False it is
+                       used without validation. When None, "deconv3d" or
+                       "conv3d" is chosen according to trans.
+    :type layer_type: String
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    if num_channels is None:
+        assert input.num_filters is not None
+        num_channels = input.num_filters
+
+    if isinstance(filter_size, collections.Sequence):
+        assert len(filter_size) == 3
+        filter_size, filter_size_y, filter_size_z = filter_size
+    else:
+        filter_size_y = filter_size
+        filter_size_z = filter_size
+
+    if isinstance(stride, collections.Sequence):
+        assert len(stride) == 3
+        stride, stride_y, stride_z = stride
+    else:
+        stride_y = stride
+        stride_z = stride
+
+    if isinstance(padding, collections.Sequence):
+        assert len(padding) == 3
+        padding, padding_y, padding_z = padding
+    else:
+        padding_y = padding
+        padding_z = padding
+
+    if param_attr.attr.get('initial_smart'):
+        # special init for conv layers (uses only the x filter size).
+        init_w = (2.0 / (filter_size**2 * num_channels))**0.5
+        param_attr.attr["initial_mean"] = 0.0
+        param_attr.attr["initial_std"] = init_w
+        param_attr.attr["initial_strategy"] = 0
+        param_attr.attr["initial_smart"] = False
+
+    if layer_type:
+        if trans:
+            assert layer_type in ["deconv3d"]
+        lt = layer_type
+    else:
+        lt = LayerType.DECONV3D_LAYER if trans else LayerType.CONV3D_LAYER
+
+    l = Layer(
+        name=name,
+        inputs=Input(
+            input.name,
+            conv=Conv3D(
+                filter_size=filter_size,
+                padding=padding,
+                stride=stride,
+                channels=num_channels,
+                groups=groups,
+                filter_size_y=filter_size_y,
+                padding_y=padding_y,
+                stride_y=stride_y,
+                filter_size_z=filter_size_z,
+                padding_z=padding_z,
+                stride_z=stride_z),
+            **param_attr.attr),
+        active_type=act.name,
+        num_filters=num_filters,
+        bias=ParamAttr.to_bias(bias_attr),
+        shared_biases=shared_biases,
+        type=lt,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name,
+        lt,
+        parents=[input],
+        activation=act,
+        num_filters=num_filters,
+        size=l.config.size)
+
+
 @wrap_name_default("scale_shift")
 @wrap_param_attr_default()
 @wrap_bias_attr_default()
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
old mode 100755
new mode 100644
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 1ca5c8a07ebb7a7d842445bbe75cc3bf7bfb295a..df872a90ff388f0d96cef44763dbd076bc768ab9 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -8,7 +8,8 @@ test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
 test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
 test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
 test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
-test_kmax_seq_socre_layer test_seq_select_layers test_scale_shift_layer
-test_seq_slice_layer)
+test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
+test_seq_slice_layer test_cross_entropy_over_beam test_pooling3D_layer
+test_conv3d_layer test_deconv3d_layer)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_conv3d_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_conv3d_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..9fe2bc29d3cd06231b67102e28f7a49c28306958
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_conv3d_layer.protostr
@@ -0,0 +1,132 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 36288
+  active_type: ""
+  height: 48
+  width: 42
+  depth: 6
+}
+layers {
+  name: "conv3d_1"
+  type: "conv3d"
+  size: 24192
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "_conv3d_1.w0"
+    conv_conf {
+      filter_size: 3
+      channels: 3
+      stride: 2
+      padding: 1
+      groups: 1
+      filter_channels: 3
+      output_x: 21
+      img_size: 42
+      caffe_mode: true
+      filter_size_y: 3
+      padding_y: 1
+      stride_y: 2
+      output_y: 24
+      img_size_y: 48
+      filter_size_z: 3
+      padding_z: 1
+      stride_z: 2
+      output_z: 3
+      img_size_z: 6
+    }
+  }
+  bias_parameter_name: "_conv3d_1.wbias"
+  num_filters: 16
+  shared_biases: true
+  height: 24
+  width: 21
+  depth: 3
+}
+layers {
+  name: "conv3d_2"
+  type: "conv3d"
+  size: 24192
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "_conv3d_2.w0"
+    conv_conf {
+      filter_size: 3
+      channels: 3
+      stride: 2
+      padding: 1
+      groups: 1
+      filter_channels: 3
+      output_x: 21
+      img_size: 42
+      caffe_mode: true
+      filter_size_y: 3
+      padding_y: 1
+      stride_y: 2
+      output_y: 24
+      img_size_y: 48
+      filter_size_z: 3
+      padding_z: 1
+      stride_z: 2
+      output_z: 3
+      img_size_z: 6
+    }
+  }
+  bias_parameter_name: "_conv3d_2.wbias"
+  num_filters: 16
+  shared_biases: true
+  height: 24
+  width: 21
+  depth: 3
+}
+parameters {
+  name: "_conv3d_1.w0"
+  size: 1296
+  initial_mean: 0.0
+  initial_std: 0.272165526976
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "_conv3d_1.wbias"
+  size: 16
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 16
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "_conv3d_2.w0"
+  size: 1296
+  initial_mean: 0.0
+  initial_std: 0.272165526976
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "_conv3d_2.wbias"
+  size: 16
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 16
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+output_layer_names: "conv3d_2"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "conv3d_1"
+  layer_names: "conv3d_2"
+  input_layer_names: "data"
+  output_layer_names: "conv3d_2"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
index 05847344be60b4de42a7dd709914fd3da524d1ae..55ab464ddf88f55bfb7b93ec0a189d4e53633468 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
@@ -167,6 +167,20 @@ layers {
   softmax_selfnorm_alpha: 0.1
   coeff: 1.0
 }
+layers {
+  name: "__huber_regression_cost_0__"
+  type: "huber_regression"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+  }
+  inputs {
+    input_layer_name: "labels"
+  }
+  coeff: 1.0
+  delta: 1.0
+}
 layers {
   name: "huber_probs"
   type: "data"
@@ -180,8 +194,8 @@ layers {
   active_type: ""
 }
 layers {
-  name: "__huber_cost_0__"
-  type: "huber"
+  name: "__huber_classification_cost_0__"
+  type: "huber_classification"
   size: 1
   active_type: ""
   inputs {
@@ -300,7 +314,8 @@ output_layer_names: "__rank_cost_0__"
 output_layer_names: "__lambda_cost_0__"
 output_layer_names: "__cross_entropy_0__"
 output_layer_names: "__cross_entropy_with_selfnorm_0__"
-output_layer_names: "__huber_cost_0__"
+output_layer_names: "__huber_regression_cost_0__"
+output_layer_names: "__huber_classification_cost_0__"
 output_layer_names: "__multi_binary_label_cross_entropy_0__"
 output_layer_names: "__sum_cost_0__"
 output_layer_names: "__nce_layer_0__"
@@ -324,9 +339,10 @@ sub_models {
   layer_names: "__lambda_cost_0__"
   layer_names: "__cross_entropy_0__"
   layer_names: "__cross_entropy_with_selfnorm_0__"
+  layer_names: "__huber_regression_cost_0__"
   layer_names: "huber_probs"
   layer_names: "huber_label"
-  layer_names: "__huber_cost_0__"
+  layer_names: "__huber_classification_cost_0__"
   layer_names: "__multi_binary_label_cross_entropy_0__"
   layer_names: "__sum_cost_0__"
   layer_names: "__nce_layer_0__"
@@ -349,7 +365,8 @@ sub_models {
   output_layer_names: "__lambda_cost_0__"
   output_layer_names: "__cross_entropy_0__"
   output_layer_names: "__cross_entropy_with_selfnorm_0__"
-  output_layer_names: "__huber_cost_0__"
+  output_layer_names: "__huber_regression_cost_0__"
+  output_layer_names: "__huber_classification_cost_0__"
   output_layer_names: "__multi_binary_label_cross_entropy_0__"
   output_layer_names: "__sum_cost_0__"
   output_layer_names: "__nce_layer_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..a602569697e91b11b8d421ac359c2e523a00fa98
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
@@ -0,0 +1,207 @@
+type: "nn"
+layers {
+  name: "sentence_states"
+  type: "data"
+  size: 32
+  active_type: ""
+}
+layers {
+  name: "sentence_scores"
+  type: "data"
+  size: 1
+  active_type: ""
+}
+layers {
+  name: "__kmax_seq_score_layer_0__"
+  type: "kmax_seq_score"
+  active_type: ""
+  inputs {
+    input_layer_name: "sentence_scores"
+  }
+  beam_size: 5
+}
+layers {
+  name: "__sub_nested_seq_layer_0__"
+  type: "sub_nested_seq"
+  size: 32
+  active_type: ""
+  inputs {
+    input_layer_name: "sentence_states"
+  }
+  inputs {
+    input_layer_name: "__kmax_seq_score_layer_0__"
+  }
+}
+layers {
+  name: "__fc_layer_0__"
+  type: "fc"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "__sub_nested_seq_layer_0__"
+    input_parameter_name: "___fc_layer_0__.w0"
+  }
+  bias_parameter_name: "___fc_layer_0__.wbias"
+}
+layers {
+  name: "__kmax_seq_score_layer_1__"
+  type: "kmax_seq_score"
+  active_type: ""
+  inputs {
+    input_layer_name: "sentence_scores"
+  }
+  beam_size: 5
+}
+layers {
+  name: "__seq_slice_layer_0__"
+  type: "seq_slice"
+  size: 32
+  active_type: ""
+  inputs {
+    input_layer_name: "__sub_nested_seq_layer_0__"
+  }
+  inputs {
+    input_layer_name: "__kmax_seq_score_layer_1__"
+  }
+  select_first: true
+}
+layers {
+  name: "__fc_layer_1__"
+  type: "fc"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "__seq_slice_layer_0__"
+    input_parameter_name: "___fc_layer_1__.w0"
+  }
+  bias_parameter_name: "___fc_layer_1__.wbias"
+}
+layers {
+  name: "__kmax_seq_score_layer_2__"
+  type: "kmax_seq_score"
+  active_type: ""
+  inputs {
+    input_layer_name: "__fc_layer_1__"
+  }
+  beam_size: 5
+}
+layers {
+  name: "sentences_ids"
+  type: "data"
+  size: 1
+  active_type: ""
+}
+layers {
+  name: "start_ids"
+  type: "data"
+  size: 1
+  active_type: ""
+}
+layers {
+  name: "end_ids"
+  type: "data"
+  size: 1
+  active_type: ""
+}
+layers {
+  name: "__cross_entropy_over_beam_0__"
+  type: "cross_entropy_over_beam"
+  active_type: ""
+  inputs {
+    input_layer_name: "sentence_scores"
+  }
+  inputs {
+    input_layer_name: "__kmax_seq_score_layer_0__"
+  }
+  inputs {
+    input_layer_name: "sentences_ids"
+  }
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+  }
+  inputs {
+    input_layer_name: "__kmax_seq_score_layer_1__"
+  }
+  inputs {
+    input_layer_name: "start_ids"
+  }
+  inputs {
+    input_layer_name: "__fc_layer_1__"
+  }
+  inputs {
+    input_layer_name: "__kmax_seq_score_layer_2__"
+  }
+  inputs {
+    input_layer_name: "end_ids"
+  }
+}
+parameters {
+  name: "___fc_layer_0__.w0"
+  size: 32
+  initial_mean: 0.0
+  initial_std: 0.176776695297
+  dims: 32
+  dims: 1
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___fc_layer_0__.wbias"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___fc_layer_1__.w0"
+  size: 32
+  initial_mean: 0.0
+  initial_std: 0.176776695297
+  dims: 32
+  dims: 1
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___fc_layer_1__.wbias"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "sentence_scores"
+input_layer_names: "sentences_ids"
+input_layer_names: "sentence_states"
+input_layer_names: "start_ids"
+input_layer_names: "end_ids"
+output_layer_names: "__cross_entropy_over_beam_0__"
+sub_models {
+  name: "root"
+  layer_names: "sentence_states"
+  layer_names: "sentence_scores"
+  layer_names: "__kmax_seq_score_layer_0__"
+  layer_names: "__sub_nested_seq_layer_0__"
+  layer_names: "__fc_layer_0__"
+  layer_names: "__kmax_seq_score_layer_1__"
+  layer_names: "__seq_slice_layer_0__"
+  layer_names: "__fc_layer_1__"
+  layer_names: "__kmax_seq_score_layer_2__"
+  layer_names: "sentences_ids"
+  layer_names: "start_ids"
+  layer_names: "end_ids"
+  layer_names: "__cross_entropy_over_beam_0__"
+  input_layer_names: "sentence_scores"
+  input_layer_names: "sentences_ids"
+  input_layer_names: "sentence_states"
+  input_layer_names: "start_ids"
+  input_layer_names: "end_ids"
+  output_layer_names: "__cross_entropy_over_beam_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_deconv3d_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_deconv3d_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..7bf409731cbf8d5d98341b03c7c09d91fa8328d9
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_deconv3d_layer.protostr
@@ -0,0 +1,132 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 36288
+  active_type: ""
+  height: 48
+  width: 42
+  depth: 6
+}
+layers {
+  name: "deconv3d_1"
+  type: "deconv3d"
+  size: 1387760
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "_deconv3d_1.w0"
+    conv_conf {
+      filter_size: 3
+      channels: 3
+      stride: 2
+      padding: 1
+      groups: 1
+      filter_channels: 16
+      output_x: 42
+      img_size: 83
+      caffe_mode: true
+      filter_size_y: 3
+      padding_y: 1
+      stride_y: 2
+      output_y: 48
+      img_size_y: 95
+      filter_size_z: 3
+      padding_z: 1
+      stride_z: 2
+      output_z: 6
+      img_size_z: 11
+    }
+  }
+  bias_parameter_name: "_deconv3d_1.wbias"
+  num_filters: 16
+  shared_biases: true
+  height: 95
+  width: 83
+  depth: 11
+}
+layers {
+  name: "deconv3d_2"
+  type: "deconv3d"
+  size: 1387760
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "_deconv3d_2.w0"
+    conv_conf {
+      filter_size: 3
+      channels: 3
+      stride: 2
+      padding: 1
+      groups: 1
+      filter_channels: 16
+      output_x: 42
+      img_size: 83
+      caffe_mode: true
+      filter_size_y: 3
+      padding_y: 1
+      stride_y: 2
+      output_y: 48
+      img_size_y: 95
+      filter_size_z: 3
+      padding_z: 1
+      stride_z: 2
+      output_z: 6
+      img_size_z: 11
+    }
+  }
+  bias_parameter_name: "_deconv3d_2.wbias"
+  num_filters: 16
+  shared_biases: true
+  height: 95
+  width: 83
+  depth: 11
+}
+parameters {
+  name: "_deconv3d_1.w0"
+  size: 6912
+  initial_mean: 0.0
+  initial_std: 0.272165526976
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "_deconv3d_1.wbias"
+  size: 16
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 16
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "_deconv3d_2.w0"
+  size: 6912
+  initial_mean: 0.0
+  initial_std: 0.272165526976
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "_deconv3d_2.wbias"
+  size: 16
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 16
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+output_layer_names: "deconv3d_2"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "deconv3d_1"
+  layer_names: "deconv3d_2"
+  input_layer_names: "data"
+  output_layer_names: "deconv3d_2"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
index 3d32220bfbf5f4c67f88303cb9773ecfa484da4b..f93d368c8687573db80106b9cc4defa56a881e46 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
@@ -17,7 +17,7 @@ layers {
   bias_parameter_name: "___fc_layer_0__.wbias"
 }
 layers {
-  name: "__kmax_sequence_score_layer_0__"
+  name: "__kmax_seq_score_layer_0__"
   type: "kmax_seq_score"
   active_type: ""
   inputs {
@@ -46,14 +46,14 @@ parameters {
   initial_smart: false
 }
 input_layer_names: "input_seq"
-output_layer_names: "__kmax_sequence_score_layer_0__"
+output_layer_names: "__kmax_seq_score_layer_0__"
 sub_models {
   name: "root"
   layer_names: "input_seq"
   layer_names: "__fc_layer_0__"
-  layer_names: "__kmax_sequence_score_layer_0__"
+  layer_names: "__kmax_seq_score_layer_0__"
   input_layer_names: "input_seq"
-  output_layer_names: "__kmax_sequence_score_layer_0__"
+  output_layer_names: "__kmax_seq_score_layer_0__"
   is_recurrent_layer_group: false
 }
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pooling3D_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pooling3D_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..8eb98593f6f692a445cf5088e101e9da3763b41d
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pooling3D_layer.protostr
@@ -0,0 +1,123 @@
+type: "nn"
+layers {
+  name: "data_2d"
+  type: "data"
+  size: 6000
+  active_type: ""
+  height: 20
+  width: 10
+}
+layers {
+  name: "pool___2d"
+  type: "pool"
+  size: 840
+  active_type: ""
+  inputs {
+    input_layer_name: "data_2d"
+    pool_conf {
+      pool_type: "avg-projection"
+      channels: 30
+      size_x: 5
+      stride: 3
+      output_x: 4
+      img_size: 10
+      padding: 1
+      size_y: 5
+      stride_y: 3
+      output_y: 7
+      img_size_y: 20
+      padding_y: 1
+    }
+  }
+  height: 7
+  width: 4
+}
+layers {
+  name: "data_3d_1"
+  type: "data"
+  size: 60000
+  active_type: ""
+  height: 20
+  width: 10
+  depth: 10
+}
+layers {
+  name: "pool_3d_1"
+  type: "pool3d"
+  size: 3360
+  active_type: ""
+  inputs {
+    input_layer_name: "data_3d_1"
+    pool_conf {
+      pool_type: "avg-projection"
+      channels: 30
+      size_x: 5
+      stride: 3
+      output_x: 4
+      img_size: 10
+      padding: 1
+      size_y: 5
+      stride_y: 3
+      output_y: 7
+      img_size_y: 20
+      padding_y: 1
+      size_z: 5
+      stride_z: 3
+      output_z: 4
+      img_size_z: 10
+      padding_z: 1
+    }
+  }
+  height: 7
+  width: 4
+  depth: 4
+}
+layers {
+  name: "pool_3d_2"
+  type: "pool3d"
+  size: 3360
+  active_type: ""
+  inputs {
+    input_layer_name: "data_3d_1"
+    pool_conf {
+      pool_type: "max-projection"
+      channels: 30
+      size_x: 5
+      stride: 3
+      output_x: 4
+      img_size: 10
+      padding: 1
+      size_y: 5
+      stride_y: 3
+      output_y: 7
+      img_size_y: 20
+      padding_y: 1
+      size_z: 5
+      stride_z: 3
+      output_z: 4
+      img_size_z: 10
+      padding_z: 1
+    }
+  }
+  height: 7
+  width: 4
+  depth: 4
+}
+input_layer_names: "data_2d"
+output_layer_names: "pool___2d"
+output_layer_names: "pool_3d_1"
+output_layer_names: "pool_3d_2"
+sub_models {
+  name: "root"
+  layer_names: "data_2d"
+  layer_names: "pool___2d"
+  layer_names: "data_3d_1"
+  layer_names: "pool_3d_1"
+  layer_names: "pool_3d_2"
+  input_layer_names: "data_2d"
+  output_layer_names: "pool___2d"
+  output_layer_names: "pool_3d_1"
+  output_layer_names: "pool_3d_2"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa0a2c0d5fe19b6c414acd708bb6e82d9fb6568f
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py
@@ -0,0 +1,49 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+num_channels = 3
+filter_size = 3
+filter_size_y = 3
+filter_size_z = 3
+stride = 2
+stride_y = 2
+stride_z = 2
+padding = 1
+padding_y = 1
+padding_z = 1
+groups = 1
+
+data = data_layer(
+    name='data', size=12096 * num_channels, height=48, width=42, depth=6)
+# first
+conv3d_1 = img_conv3d_layer(
+    input=data,
+    name='conv3d_1',
+    num_filters=16,
+    num_channels=num_channels,
+    filter_size=filter_size,
+    stride=stride,
+    padding=padding,
+    groups=groups,
+    bias_attr=True,
+    shared_biases=True,
+    trans=False,
+    layer_type="conv3d",
+    act=LinearActivation())
+# second
+conv3d_2 = img_conv3d_layer(
+    input=data,
+    name='conv3d_2',
+    num_filters=16,
+    num_channels=num_channels,
+    filter_size=[filter_size, filter_size_y, filter_size_z],
+    stride=[stride, stride_y, stride_z],
+    padding=[padding, padding_y, padding_z],
+    groups=groups,
+    bias_attr=True,
+    shared_biases=True,
+    trans=False,
+    layer_type="conv3d",
+    act=LinearActivation())
+outputs(conv3d_2)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
index d2a3b702a1d7b650947b344e4719098f68d4dd73..7ce375c708af7b0b7ae1d700dedbdb6a4ce16c7f 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
@@ -33,7 +33,9 @@ outputs(
         input=probs, label=xe_label),
     cross_entropy_with_selfnorm(
         input=probs, label=xe_label),
-    huber_cost(
+    huber_regression_cost(
+        input=seq_in, label=labels),
+    huber_classification_cost(
         input=data_layer(
             name='huber_probs', size=1),
         label=data_layer(
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py b/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a5bdf1181dc4538418a8b89b41a1ff713e423c8
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+#coding=utf-8
+
+from paddle.trainer_config_helpers import *
+beam_size = 5
+
+# the first beam expansion.
+sentence_states = data_layer(name="sentence_states", size=32)
+sentence_scores = data_layer(name="sentence_scores", size=1)
+topk_sentence_ids = kmax_seq_score_layer(
+    input=sentence_scores, beam_size=beam_size)
+
+# the second beam expansion.
+topk_sen = sub_nested_seq_layer(
+    input=sentence_states, selected_indices=topk_sentence_ids)
+start_pos_scores = fc_layer(input=topk_sen, size=1, act=LinearActivation())
+topk_start_pos_ids = kmax_seq_score_layer(
+    input=sentence_scores, beam_size=beam_size)
+
+# the final beam expansion.
+topk_start_spans = seq_slice_layer(
+    input=topk_sen, starts=topk_start_pos_ids, ends=None)
+end_pos_scores = fc_layer(
+    input=topk_start_spans, size=1, act=LinearActivation())
+topk_end_pos_ids = kmax_seq_score_layer(
+    input=end_pos_scores, beam_size=beam_size)
+
+# define the cost
+sentence_idx = data_layer(name="sentences_ids", size=1)
+start_idx = data_layer(name="start_ids", size=1)
+end_idx = data_layer(name="end_ids", size=1)
+cost = cross_entropy_over_beam(input=[
+    BeamInput(
+        candidate_scores=sentence_scores,
+        selected_candidates=topk_sentence_ids,
+        gold=sentence_idx), BeamInput(
+            candidate_scores=start_pos_scores,
+            selected_candidates=topk_start_pos_ids,
+            gold=start_idx), BeamInput(
+                candidate_scores=end_pos_scores,
+                selected_candidates=topk_end_pos_ids,
+                gold=end_idx)
+])
+
+outputs(cost)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..a113279fc17b49ad01b8860b61180af0f35694fb
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py
@@ -0,0 +1,50 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+num_channels = 3
+filter_size = 3
+filter_size_y = 3
+filter_size_z = 3
+stride = 2
+stride_y = 2
+stride_z = 2
+padding = 1
+padding_y = 1
+padding_z = 1
+groups = 1
+
+data = data_layer(
+    name='data', size=12096 * num_channels, height=48, width=42, depth=6)
+
+# first
+deconv3d_1 = img_conv3d_layer(
+    input=data,
+    name='deconv3d_1',
+    num_filters=16,
+    num_channels=num_channels,
+    filter_size=filter_size,
+    stride=stride,
+    padding=padding,
+    groups=groups,
+    bias_attr=True,
+    shared_biases=True,
+    trans=True,
+    layer_type="deconv3d",
+    act=LinearActivation())
+# second
+deconv3d_2 = img_conv3d_layer(
+    input=data,
+    name='deconv3d_2',
+    num_filters=16,
+    num_channels=num_channels,
+    filter_size=[filter_size, filter_size_y, filter_size_z],
+    stride=[stride, stride_y, stride_z],
+    padding=[padding, padding_y, padding_z],
+    groups=groups,
+    bias_attr=True,
+    shared_biases=True,
+    trans=True,
+    layer_type="deconv3d",
+    act=LinearActivation())
+outputs(deconv3d_2)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
index 48d0cd55da2481743de66ea95190c0856e7ddc39..171da10f75dae03eed7e110d0efd07d6a18e1ecf 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
@@ -4,6 +4,6 @@ from paddle.trainer_config_helpers import *
 
 data = data_layer(name="input_seq", size=128)
 scores = fc_layer(input=data, size=1, act=ExpActivation())
-kmax_seq_id = kmax_sequence_score_layer(input=scores, beam_size=5)
+kmax_seq_id = kmax_seq_score_layer(input=scores, beam_size=5)
 
 outputs(kmax_seq_id)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dbb921d41986e711d5b8b31caab1f8b6bdc47b8
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py
@@ -0,0 +1,38 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=100, learning_rate=1e-5)
+
+data_2d = data_layer(name='data_2d', size=6000, height=20, width=10)
+
+pool_2d = img_pool_layer(
+    name="pool___2d",
+    input=data_2d,
+    num_channels=30,
+    pool_size=5,
+    stride=3,
+    padding=1,
+    pool_type=AvgPooling())
+outputs(pool_2d)
+
+data_3d = data_layer(
+    name='data_3d_1', size=60000, depth=10, height=20, width=10)
+
+pool_3d_1 = img_pool3d_layer(
+    name="pool_3d_1",
+    input=data_3d,
+    num_channels=30,
+    pool_size=5,
+    stride=3,
+    padding=1,
+    pool_type=AvgPooling())
+outputs(pool_3d_1)
+
+pool_3d_2 = img_pool3d_layer(
+    name="pool_3d_2",
+    input=data_3d,
+    num_channels=30,
+    pool_size=[5, 5, 5],
+    stride=[3, 3, 3],
+    padding=[1, 1, 1],
+    pool_type=MaxPooling())
+outputs(pool_3d_2)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_select_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_sub_nested_seq_select_layer.py
similarity index 100%
rename from python/paddle/trainer_config_helpers/tests/configs/test_seq_select_layers.py
rename to python/paddle/trainer_config_helpers/tests/configs/test_sub_nested_seq_select_layer.py
diff --git a/python/paddle/trainer_config_helpers/tests/layers_test.py b/python/paddle/trainer_config_helpers/tests/layers_test.py
index 05902ea293df5a3e9c10f6700930ca6a343603c2..b3dd8f8fc784754e749240e1b895b11ef6aba438 100644
--- a/python/paddle/trainer_config_helpers/tests/layers_test.py
+++ b/python/paddle/trainer_config_helpers/tests/layers_test.py
@@ -17,3 +17,4 @@ from paddle.trainer.config_parser import parse_config_and_serialize
 if __name__ == '__main__':
     parse_config_and_serialize(
         'trainer_config_helpers/tests/layers_test_config.py', '')
+# layers_test_config.py
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
index 5bea980611904b37a4a5d4e2cbbee13503a61ff0..1c8d8f4b2f626bea5d9a44d01de7c2c9c45dc2fb 100644
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -78,6 +78,8 @@ def init(**kwargs):
 
     if 'use_gpu' in kwargs:
         cp.g_command_config_args['use_gpu'] = kwargs['use_gpu']
+    if 'use_mkldnn' in kwargs:
+        cp.g_command_config_args['use_mkldnn'] = kwargs['use_mkldnn']
     assert 'parallel_nn' not in kwargs, ("currently 'parallel_nn' is not "
                                          "supported in v2 APIs.")
 
diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py
index 9a7a7fbf5e63d4e433576f8e980c41c72fa26cab..518f828bacd60e7cb8375b22c6c3296f9bfeb5ea 100644
--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
@@ -268,7 +268,7 @@ class GradientChecker(unittest.TestCase):
         :param input_vars: numpy value of input variable. The following
             computation will use these variables.
         :param inputs_to_check: inputs var names that should check gradient.
-        :param output_name: output name that used to
+        :param output_name: the output variable name of the forward network.
         :param max_relative_error: The relative tolerance parameter.
         :param no_grad_set: used when create backward ops
         :param only_cpu: only compute and check gradient on cpu kernel.
diff --git a/python/paddle/v2/framework/tests/mnist.py b/python/paddle/v2/framework/tests/mnist.py
index 9a0b109850e92c66e69f74c5cd0853a09b5551a1..a68f302f9c344bf6d63e8d9b48836d69338c3d0b 100644
--- a/python/paddle/v2/framework/tests/mnist.py
+++ b/python/paddle/v2/framework/tests/mnist.py
@@ -181,7 +181,7 @@ images = data_layer(name='pixel', dims=[BATCH_SIZE, 784])
 labels = data_layer(name='label', dims=[BATCH_SIZE])
 fc1 = fc_layer(net=forward_net, input=images, size=100, act="sigmoid")
 fc2 = fc_layer(net=forward_net, input=fc1, size=100, act="sigmoid")
-predict = fc_layer(net=forward_net, input=fc2, size=100, act="softmax")
+predict = fc_layer(net=forward_net, input=fc2, size=10, act="softmax")
 cost = cross_entropy_layer(net=forward_net, input=predict, label=labels)
 
 init_net.complete_add_op(True)
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
index 4dcc3ab57e7e6dfbe040ac61025e55b9e48b4415..8acea6155c588f2e8e5ad009cd8f0a0c09afb92b 100644
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
@@ -70,7 +70,7 @@ class Inference(object):
                 item = [each_result[each_field] for each_field in field]
                 yield item
 
-    def infer(self, input, field='value', **kwargs):
+    def infer(self, input, field='value', flatten_result=True, **kwargs):
         """
         Infer a data by model.
         :param input: input data batch. Should be python iterable object.
@@ -83,7 +83,10 @@ class Inference(object):
                 retv = [[] for i in xrange(len(result))]
             for i, item in enumerate(result):
                 retv[i].append(item)
-        retv = [numpy.concatenate(out) for out in retv]
+
+        if flatten_result:
+            retv = [numpy.concatenate(out) for out in retv]
+
         if len(retv) == 1:
             return retv[0]
         else:
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
index b8af5abaeada49a3e8951c21c9065aaf4d1ab851..4cfd91882e2d5f0098d27b8897359152ddd94dda 100644
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
@@ -14,6 +14,7 @@
 
 import numpy as np
 from paddle.proto.ParameterConfig_pb2 import ParameterConfig
+from collections import OrderedDict
 import paddle.trainer.config_parser as cp
 import struct
 import tarfile
@@ -42,9 +43,25 @@ def create(layers):
 
 class Parameters(object):
     """
-    Parameters is a dictionary contains Paddle's parameter. The key of
-    Parameters is the name of parameter. The value of Parameters is a plain
-    :code:`numpy.ndarry` .
+    `Parameters` manages all the learnable parameters in a neural network.
+    It stores parameters' information in an OrderedDict. The key is
+    the name of a parameter, and the value is a parameter's configuration (in
+    protobuf format), such as initialization mean and std, its size, whether it
+    is a static parameter, and so on.
+
+    :param __param_conf__: store the configurations of learnable parameters in
+        the network in an OrderedDict. Parameter is added one by one into the
+        dict by following their created order in the network: parameters of
+        the previous layers in a network are created first. You can visit the
+        parameters from bottom to top by iterating over this dict.
+    :type __param_conf__: OrderedDict
+    :param __gradient_machines__: all of the parameters in a neural network are
+        appended to a PaddlePaddle gradient machine, which is used internally to
+        copy parameter values between the C++ and Python ends.
+    :type __gradient_machines__: list
+    :param __tmp_params__: a dict to store dummy parameters if no
+        __gradient_machines__ is appended to `Parameters`.
+    :type __tmp_params__: dict
 
     Basically usage is
 
@@ -62,7 +79,7 @@ class Parameters(object):
     """
 
     def __init__(self):
-        self.__param_conf__ = dict()
+        self.__param_conf__ = OrderedDict()
         self.__gradient_machines__ = []
         self.__tmp_params__ = dict()
 
@@ -231,6 +248,9 @@ class Parameters(object):
         :rtype: np.ndarray
         """
         import py_paddle.swig_paddle as api
+        if self.__param_conf__[key].is_static:
+            return np.zeros(self.__param_conf__[key].size, dtype=np.float32)
+
         return self.__getter_inner(key, api.PARAMETER_GRADIENT)
 
     def set(self, parameter_name, value):
@@ -250,7 +270,7 @@ class Parameters(object):
         append gradient machine to parameters. This method is used internally in
         Trainer.train.
 
-        :param gradient_machine: Paddle C++ GradientMachine object.
+        :param gradient_machine: PaddlePaddle C++ GradientMachine object.
         :type gradient_machine: api.GradientMachine
         :return:
         """
diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py
index f2097e195f41637977e71f65f36dad005d3e7941..783a0ca85dc61b9f00ac8126e03788884dfb44cb 100644
--- a/python/paddle/v2/tests/test_layer.py
+++ b/python/paddle/v2/tests/test_layer.py
@@ -141,12 +141,13 @@ class CostLayerTest(unittest.TestCase):
         cost8 = layer.rank_cost(left=score, right=score, label=score)
         cost9 = layer.lambda_cost(input=inference, score=score)
         cost10 = layer.sum_cost(input=inference)
-        cost11 = layer.huber_cost(input=score, label=label)
+        cost11 = layer.huber_regression_cost(input=score, label=label)
+        cost12 = layer.huber_classification_cost(input=score, label=label)
 
         print layer.parse_network([cost1, cost2])
         print layer.parse_network([cost3, cost4])
         print layer.parse_network([cost5, cost6])
-        print layer.parse_network([cost7, cost8, cost9, cost10, cost11])
+        print layer.parse_network([cost7, cost8, cost9, cost10, cost11, cost12])
 
         crf = layer.crf(input=inference, label=label)
         crf_decoding = layer.crf_decoding(input=inference, size=3)