Merge branch 'build_android_clang' of https://github.com/Xreki/Paddle into build_android_clang

1e76feea · Liu Yiqun · 8a4fad42 · 8b15ac82 · 1e76feea · 1e76feea
164 changed file
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -22,7 +22,7 @@
    -   id: clang-format-with-version-check
        name: clang-format
        description: Format files with ClangFormat.
-        entry: ./.clang_format.hook -i
+        entry: bash ./.clang_format.hook -i
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang

--- a/Dockerfile
+++ b/Dockerfile
@@ -10,13 +10,11 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
 ARG WITH_GPU
 ARG WITH_AVX
 ARG WITH_DOC
-ARG WITH_STYLE_CHECK
 ENV WOBOQ OFF
-ENV WITH_GPU=${WITH_GPU:-OFF}
+ENV WITH_GPU=${WITH_GPU:-ON}
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV WITH_DOC=${WITH_DOC:-OFF}
-ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 ENV HOME /root
 # Add bash enhancements

--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -47,8 +47,8 @@ RUN mkdir /opt/android-ndk-tmp && \
    wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
    unzip -q android-ndk-r14b-linux-x86_64.zip && \
    mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
-    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=${ANDROID_ARM_STANDALONE_TOOLCHAIN} && \
+    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-23 --install-dir=${ANDROID_ARM_STANDALONE_TOOLCHAIN} && \
-    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm64 --platform=android-21 --install-dir=${ANDROID_ARM64_STANDALONE_TOOLCHAIN} && \
+    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm64 --platform=android-23 --install-dir=${ANDROID_ARM64_STANDALONE_TOOLCHAIN} && \
    rm -rf /opt/android-ndk-tmp && \
    rm -rf ${ANDROID_NDK_HOME}

--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -51,7 +51,7 @@ ExternalProject_Add(
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLDNN_DEPENDS}
    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "v0.9"
+    GIT_TAG             "v0.10"
    PREFIX              ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}

--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)
 SET(MKLML_PROJECT       "extern_mklml")
 SET(MKLML_VER           "mklml_lnx_2018.0.20170720")
-SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
+SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")
@@ -54,7 +54,8 @@ ExternalProject_Add(
    ${EXTERNAL_PROJECT_LOG_ARGS}
    PREFIX                ${MKLML_SOURCE_DIR}
    DOWNLOAD_DIR          ${MKLML_DOWNLOAD_DIR}
-    DOWNLOAD_COMMAND      wget --no-check-certificate -qO- ${MKLML_URL} | tar xz -C ${MKLML_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND      wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz 
+                          && tar zxf ${MKLML_VER}.tgz
    DOWNLOAD_NO_PROGRESS  1
    UPDATE_COMMAND        ""
    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT}

--- a/doc/about/index_cn.md
+++ b/doc/about/index_cn.md
-关于PaddlePaddle
-================
-PaddlePaddle是一个最早由百度科学家和工程师共同研发的并行分布式深度学习平台，兼备易用性、高效性、灵活性和可扩展性，目前已被百度内部多个产品线广泛使用。
-PaddlePaddle目前已经开放源码, 但是远未完善，我们希望能在这个基础上不断的改进、扩展和延伸。
-同时我们希望广大开发者积极提供反馈和贡献源代码，建立一个活跃的开源社区。
-致谢
--------
-在此，特别感谢PaddlePaddle的[所有贡献者](https://github.com/PaddlePaddle/Paddle/graphs/contributors)。
--- a/doc/about/index_en.rst
+++ b/doc/about/index_en.rst
-ABOUT
-=======
-PaddlPaddle is an easy-to-use, efficient, flexible and scalable deep learning platform,
-which is originally developed by Baidu scientists and engineers for the purpose of applying deep learning to many products at Baidu.
-PaddlePaddle is now open source but far from complete, which is intended to be built upon, improved, scaled, and extended.
-We hope to build an active open source community both by providing feedback and by actively contributing to the source code.
-Credits
--------
-We owe many thanks to `all contributors and developers <https://github.com/PaddlePaddle/Paddle/graphs/contributors>`_ of PaddlePaddle!
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -257,6 +257,11 @@ seq_concat
 ..  autoclass:: paddle.v2.layer.seq_concat
    :noindex:
+seq_slice
+---------
+..  autoclass:: paddle.v2.layer.seq_slice
+    :noindex:
 kmax_sequence_score
 -------------------
 ..  autoclass:: paddle.v2.layer.kmax_sequence_score
@@ -414,9 +419,14 @@ multi_binary_label_cross_entropy_cost
 ..  autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
    :noindex:
-huber_cost
+huber_regression_cost
----------
+-------------------------
-..  autoclass:: paddle.v2.layer.huber_cost
+..  autoclass:: paddle.v2.layer.huber_regression_cost
+    :noindex:
+huber_classification_cost
+-------------------------
+..  autoclass:: paddle.v2.layer.huber_classification_cost
    :noindex:
 lambda_cost

--- a/doc/design/cluster_train/README.md
+++ b/doc/design/cluster_train/README.md
@@ -54,17 +54,18 @@ The life cycle of a single task is illustrated below:
 <img src="src/paddle-task-states.png"/>
 1. When a new pass of training starts, all tasks will be placed in the todo queue.
-1. The master server will dispatch few tasks to each trainer at a time, puts them in the pending queue and waits for completion.
+1. Upon trainer requests for new task, the master server will dispatch a task from todo queue to it, put the task in the pending queue and wait for completion.
-1. The trainer will work on its tasks and tell the master server once a task is completed. The master server will dispatch a new task to that trainer.
+1. The trainer will work on its task and tell the master server once the task is completed and ask for new task. The master server will dispatch a new task to that trainer.
-1. If a task timeout. the master server will move it back to the todo queue. The timeout count will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it will be discarded.
+1. If a task fails for any reason in trainer, or takes longer than a specific period of time,  the master server will move the task back to the todo queue. The timeout count for that task will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, then it will be discarded.
 1. The master server will move completed task to the done queue. When the todo queue is empty, the master server will start a new pass by moving all tasks in the done queue to todo queue and reset the timeout counter of all tasks to zero.
 ### Trainer Process
 The trainer process will:
- Receive tasks from the master.
+- Request tasks from the master.
- Work on the tasks: calculate and upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers.
+- Work on the tasks
+- Upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers.
 ### Parameter Server Process
@@ -119,8 +120,8 @@ When the master is started by the Kubernetes, it executes the following steps at
 1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations.
 1. Recovers the task queues from etcd if they already exist, otherwise, the master will create them.
-1. Watches the trainer prefix keys `/trainer/` on etcd to find the live trainers.
+1. Write its ip address to */master/addr* so that trainers can discover it.
-1. Starts dispatching the tasks to the trainers, and updates task queue using an etcd transaction to ensure lock is held during the update.
+1. Listens to trainers' request of task, dispatch one upon request, and updates task queue using an etcd transaction to ensure lock is held during the update.
 When the master server process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in few minutes.
@@ -128,13 +129,11 @@ When the master server process is dead for any reason, Kubernetes will restart i
 When the trainer is started by the Kubernetes, it executes the following steps at startup:
-1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count.
+1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count */ps_desired*.
-1. Generates a unique ID, and sets key `/trainer/<unique ID>` with its contact address as value. The key will be deleted when the lease expires, so the master will be aware of the trainer being online and offline.
+1. Finds and watches */master/addr* to get master's address.
-1. Waits for tasks from the master to start training.
+1. Requests for tasks from the master to start training.
-If trainer's etcd lease expires, it will try set key `/trainer/<unique ID>` again so that the master server can discover the trainer again.
+When a trainer fails, Kuberentes would try to restart it. The recovered trainer would fetch tasks from master and go on training.
-When a trainer fails, Kuberentes would try to restart it. The recovered trainer would fetch tasks from the TODO queue and go on training.
 ### Parameter Server Process

--- a/doc/design/cluster_train/src/paddle-etcd.graffle
+++ b/doc/design/cluster_train/src/paddle-etcd.graffle
--- a/doc/design/cluster_train/src/paddle-etcd.png
+++ b/doc/design/cluster_train/src/paddle-etcd.png
--- a/doc/getstarted/build_and_install/index_cn.rst
+++ b/doc/getstarted/build_and_install/index_cn.rst
@@ -6,14 +6,12 @@
 安装流程
 ++++++++
-PaddlePaddle提供数个预编译的二进制来进行安装，包括Docker镜像，ubuntu的deb安装包等。我们推荐使用Docker镜像来部署环境，同时欢迎贡献更多的安装包。
+PaddlePaddle提供Docker镜像来部署环境。
 .. toctree::
   :maxdepth: 1
   docker_install_cn.rst 
-   ubuntu_install_cn.rst
 编译流程

--- a/doc/getstarted/build_and_install/index_en.rst
+++ b/doc/getstarted/build_and_install/index_en.rst
@@ -8,14 +8,13 @@ Install PaddlePaddle
    :maxdepth: 1
    docker_install_en.rst
-    ubuntu_install_en.rst
 Build from Source
 -----------------
 ..  warning::
-    Please use :code:`deb` package or :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code.
+    Please use :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code.
 ..  toctree::
    :maxdepth: 1

--- a/doc/getstarted/build_and_install/ubuntu_install_cn.rst
+++ b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
-Ubuntu部署PaddlePaddle
-===================================
-PaddlePaddle提供了ubuntu 14.04 deb安装包。
-安装
------
-安装包的下载地址是\: https://github.com/PaddlePaddle/Paddle/releases
-它包含四个版本\:
-* cpu版本: 支持主流x86处理器平台, 使用了avx指令集。
-* cpu-noavx版本：支持主流x86处理器平台，没有使用avx指令集。
-* gpu版本：支持主流x86处理器平台，支持nvidia cuda平台，使用了avx指令集。
-* gpu-noavx版本：支持主流x86处理器平台，支持nvidia cuda平台，没有使用avx指令集。
-下载完相关安装包后，执行:
-..  code-block:: shell
-    sudo apt-get install gdebi
-    gdebi paddle-*-cpu.deb
-或者:
-..  code-block:: shell
-    dpkg -i paddle-*-cpu.deb
-    apt-get install -f
-在 :code:`dpkg -i` 的时候如果报一些依赖未找到的错误是正常的，
-在 :code:`apt-get install -f` 里会继续安装 PaddlePaddle。
-安装完成后，可以使用命令 :code:`paddle version` 查看安装后的paddle 版本:
-..  code-block:: shell
-    PaddlePaddle 0.8.0b1, compiled with
-        with_avx: ON
-        with_gpu: OFF
-        with_double: OFF
-        with_python: ON
-        with_rdma: OFF
-        with_timer: OFF
-        with_predict_sdk:
-可能遇到的问题
--------------
-libcudart.so/libcudnn.so找不到
-++++++++++++++++++++++++++++++
-安装完成后，运行 :code:`paddle train` 报错\:
-..  code-block:: shell
-      0831 12:36:04.151525  1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH.
-原因是未设置cuda运行时环境变量。 如果使用GPU版本的PaddlePaddle，请安装CUDA 7.5 和CUDNN 5到本地环境中，并设置：
-..  code-block:: shell
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib:$LD_LIBRARY_PATH
-    export PATH=/usr/local/cuda/bin:$PATH
--- a/doc/getstarted/build_and_install/ubuntu_install_en.rst
+++ b/doc/getstarted/build_and_install/ubuntu_install_en.rst
-Debian Package installation guide
-=================================
-PaddlePaddle supports :code:`deb` pacakge. The installation of this :code:`deb` package is tested in ubuntu 14.04, but it should be support other debian based linux, too.
-There are four versions of debian package, :code:`cpu`, :code:`gpu`, :code:`cpu-noavx`, :code:`gpu-noavx`. And :code:`noavx` version is used to support CPU which does not contain :code:`AVX` instructions. The download url of :code:`deb` package is \: https://github.com/baidu/Paddle/releases/
-After downloading PaddlePaddle deb packages, you can use :code:`gdebi` install.
-..	code-block:: bash
-	gdebi paddle-*.deb
-If :code:`gdebi` is not installed, you can use :code:`sudo apt-get install gdebi` to install it.
-Or you can use following commands to install PaddlePaddle.
-..	code-block:: bash
-	dpkg -i paddle-*.deb
-	apt-get install -f
-And if you use GPU version deb package, you need to install CUDA toolkit and cuDNN, and set related environment variables(such as LD_LIBRARY_PATH) first. It is normal when `dpkg -i` get errors. `apt-get install -f` will continue install paddle, and install dependences. 
--- a/doc/howto/dev/build_cn.md
+++ b/doc/howto/dev/build_cn.md
+# 编译PaddlePaddle和运行单元测试
+## 需要的软硬件
+为了开发PaddlePaddle，我们需要
+1. 一台电脑，可以装的是 Linux, BSD, Windows 或者 MacOS 操作系统，以及
+1. Docker。
+不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要，因为我们会把所有编译工具都安装进一个 Docker image 里。
+## 总体流程
+1. 获取源码
+   ```bash
+   git clone https://github.com/paddlepaddle/paddle
+   ```
+2. 安装开发工具到 Docker image 里
+   ```bash
+   cd paddle; docker build -t paddle:dev .
+   ```
+   请注意这个命令结尾处的 `.`；它表示 `docker build` 应该读取当前目录下的 [`Dockerfile`文件](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile)，按照其内容创建一个名为 `paddle:dev` 的 Docker image，并且把各种开发工具安装进去。
+3. 编译
+   以下命令启动一个 Docker container 来执行 `paddle:dev` 这个 Docker image，同时把当前目录（源码树根目录）映射为 container 里的 `/paddle` 目录，并且运行 `Dockerfile` 描述的默认入口程序 [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `cmake` 和 `make` 来编译 `/paddle` 里的源码，结果输出到 `/paddle/build`，也就是本地的源码树根目录里的 `build` 子目录。
+   ```bash
+   docker run --rm -v $PWD:/paddle paddle:dev
+   ```
+   上述命令编译出一个 CUDA-enabled 版本。如果我们只需要编译一个只支持 CPU 的版本，可以用
+   ```bash
+   docker run --rm -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
+   ```
+4. 运行单元测试
+   用本机的第一个 GPU 来运行包括 GPU 单元测试在内的所有单元测试：
+   ```bash
+   NV_GPU=0 nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+   如果编译的时候我们用了 `WITH_GPU=OFF` 选项，那么编译过程只会产生 CPU-based 单元测试，那么我们也就不需要 nvidia-docker 来运行单元测试了。我们只需要：
+   ```bash
+   docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+   有时候我们只想运行一个特定的单元测试，比如 `memory_test`，我们可以
+   ```bash
+   nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
+   ```
+5. 清理
+   有时候我们会希望清理掉已经下载的第三方依赖以及已经编译的二进制文件。此时只需要：
+   ```bash
+   rm -rf build
+   ```
+## 为什么要 Docker 呀？
+- 什么是 Docker?
+  如果您没有听说 Docker，可以把它想象为一个类似 virtualenv 的系统，但是虚拟的不仅仅是 Python 的运行环境。
+- Docker 还是虚拟机？
+  有人用虚拟机来类比 Docker。需要强调的是：Docker 不会虚拟任何硬件，Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的，性能和把编译工具安装在本机运行一样。
+- 为什么用 Docker?
+  把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题，其他人可以复现问题以便帮助。
+  另外，对于习惯使用Windows和MacOS的开发者来说，使用Docker就不用配置交叉编译环境了。
+- 我可以选择不用Docker吗？
+  当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式，把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程，是因为这个流程比其他方法都更简便。
+- 学习 Docker 有多难？
+  理解 Docker 并不难，大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
+- 我可以用 IDE 吗？
+  当然可以，因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码，我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
+  很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
+  ```emacs
+  (global-set-key "\C-cc" 'compile)
+  (setq compile-command
+   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+  ```
+  就可以按 `Ctrl-C` 和 `c` 键来启动编译了。
+- 可以并行编译吗？
+  是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
+## 可能碰到的问题
+- Docker 需要 sudo
+  如果用自己的电脑开发，自然也就有管理员权限（sudo）了。如果用公用的电脑开发，需要请管理员安装和配置好 Docker。此外，PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术，比如 rkt。
+- 在 Windows/MacOS 上编译很慢
+  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)。
+- 磁盘不够
+  本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。
--- a/doc/howto/dev/build_en.md
+++ b/doc/howto/dev/build_en.md
+# Build PaddlePaddle from Source Code and Run Unit Test
+## What Developers Need
+To contribute to PaddlePaddle, you need
+1. A computer -- Linux, BSD, Windows, MacOS, and
+1. Docker.
+Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image.  We run all the tools by running this image.
+## General Process
+1. Retrieve source code.
+   ```bash
+   git clone https://github.com/paddlepaddle/paddle
+   ```
+2. Install build tools into a Docker image.
+   ```bash
+   cd paddle; docker build -t paddle:dev .
+   ```
+   Please be aware of the `.` at the end of the command, which refers to the [`./Dockerfile` file](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile).  `docker build` follows instructions in this file to create a Docker image named `paddle:dev`, and installs building tools into it.
+3. Build from source.
+   This following command starts a Docker container that executes the Docker image `paddle:dev`, mapping the current directory to `/paddle/` in the container, and runs the default entry-point [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh) as specified in the Dockefile.  `build.sh` invokes `cmake` and `make` to build PaddlePaddle source code, which had been mapped to `/paddle`, and writes outputs to `/paddle/build`, which maps to `build` in the current source directory on the computer.
+   ```bash
+   docker run -v $PWD:/paddle paddle:dev
+   ```
+   Above command builds a CUDA-enabled version.  If we want to build a CPU-only version, we can type
+   ```bash
+   docker run -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
+   ```
+4. Run unit tests.
+   To run all unit tests using the first GPU of a node:
+   ```bash
+   NV_GPU=0 nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+   If we used `WITH_GPU=OFF` at build time, it generates only CPU-based unit tests, and we don't need nvidia-docker to run them.  We can just run
+   ```bash
+   docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+   Sometimes we want to run a specific unit test, say `memory_test`, we can run
+   ```bash
+   nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
+   ```
+5. Clean Build.
+   Sometimes, we might want to clean all thirt-party dependents and built binaries.  To do so, just
+   ```bash
+   rm -rf build
+   ```
+## Docker, Or Not?
+- What is Docker?
+  If you haven't heard of it, consider it something like Python's virtualenv.
+- Docker or virtual machine?
+  Some people compare Docker with VMs, but Docker doesn't virtualize any hardware nor running a guest OS, which means there is no compromise on the performance.
+- Why Docker?
+  Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help.
+  Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want.
+- Can I choose not to use Docker?
+  Sure, you don't have to install build tools into a Docker image; instead, you can install them in your local computer.  This document exists because Docker would make the development way easier.
+- How difficult is it to learn Docker?
+    It takes you ten minutes to read [an introductory article](https://docs.docker.com/get-started) and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools.  Not even to mention the time saved when other people trying to reproduce the issue you have.
+- Can I use my favorite IDE?
+  Yes, of course.  The source code resides on your local computer, and you can edit it using whatever editor you like.
+  Many PaddlePaddle developers are using Emacs.  They add the following few lines into their `~/.emacs` configure file:
+  ```emacs
+  (global-set-key "\C-cc" 'compile)
+  (setq compile-command
+   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+  ```
+  so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
+- Does Docker do parallel building?
+  Our building Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores.
+## Some Gotchas
+- Docker requires sudo
+  An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly.  If you use a shared computer for development, please ask the administrator to install and configure Docker.  We will do our best to support rkt, another container technology that doesn't require sudo.
+- Docker on Windows/MacOS builds slowly
+  On Windows and MacOS, Docker containers run in a Linux VM.  You might want to give this VM some more memory and CPUs so to make the building efficient.  Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details.
+- Not enough disk space
+  Examples in this article uses option `--rm` with the `docker run` command.  This option ensures that stopped containers do not exist on hard disks.  We can use `docker ps -a` to list all containers, including stopped.  Sometimes `docker build` generates some intermediate dangling images, which also take disk space.  To clean them, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/).
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
+# 如何写新的Operator
+ - [概念简介](#概念简介)
+ - [实现C++类](#实现C++类)
+   - [定义ProtoMaker类](#定义ProtoMaker类)
+   - [定义Operator类](#定义Operator类)
+   - [定义OpKernel类](#定义OpKernel类)
+   - [注册Operator](#注册Operator)
+   - [编译](#编译)
+ - [绑定Python](#绑定Python)
+ - [实现单元测试](#实现单元测试)
+   - [前向Operator单测](#前向Operator单测)
+   - [反向Operator单测](#反向Operator单测)
+   - [编译和执行](#编译和执行)
+## 概念简介
+简单介绍需要用到基类，详细介绍请参考设计文档。
+- `framework::OperatorBase`: Operator(简写，Op)基类。
+- `framework::OpKernel`: Op计算函数的基类，称作Kernel。
+- `framework::OperatorWithKernel`：继承自OperatorBase，Op有计算函数，称作有Kernel。
+- `class OpProtoAndCheckerMaker`：描述该Op的输入、输出、属性、注释,主要用于Python API接口生成
+依据是否包含kernel，将Op分为两种：包含Kernel的Op和不包含kernel的Op，前者Op的定义继承自`OperatorBase`，后者继承自`OperatorWithKernel`。本教程主要介绍带Kernel的Op如何写，简单总结Op需要包含的内容如下：
+ 内容            | 定义位置         
+--------------  | :----------------------  
+OpProtoMake定义  | `.cc`文件，Backward Op不需要定义OpProtoMake
+Op定义           | `.cc`文件 
+Kernel实现       | CPU、GPU共享Kernel在`.h`文件，否则，CPU可以在`.cc`文件，GPU可在`.cu`文件。 
+注册Op           | Op注册在`.cc`文件；Kernel注册CPU在`.cc`文件，GPU在`.cu`文件
+下面以矩阵乘操作，即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
+## 实现C++类
+### 1. 定义ProtoMaker类
+矩阵乘的公式：$Out = X * Y$, 可见该计算由两个输入，一个输出组成。首先定义`ProtoMaker`来描述该Op的输入、输出及注释：
+```
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of mul op");
+    AddInput("Y", "The second input of mul op");
+    AddOutput("Out", "The output of mul op");
+    AddComment(R"DOC(
+Two Element Mul Operator.
+The equation is: Out = X * Y
+)DOC");
+  }
+};
+```
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`，构造函数包括2个：
+   - `framework::OpProto` ： 前者存储Op的输入输出和参数属性，将用于Python API接口的生成。
+   - `framework::OpAttrChecker` ：后者用于检查参数属性的合法性。
+构造函数里通过`AddInput`添加输入参数，通过`AddOutput`添加输出参数，通过`AddComment`添加该Op的注释，这些函数会将对应内容添加到`OpProto`中。
+在`MulOp`中添加两个输入`X`和`Y`，添加了一个输出`Out`，并解释了各自含义，该命名尽可能的规范。
+再举个[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)的例子：
+```
+template <typename AttrType>
+class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input tensor of scale operator.").NotInGradient();
+    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
+    AddComment(R"DOC(Scale operator
+The equation is: Out = scale*X
+)DOC");
+    AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
+  }
+};
+```
+ 在这个例子里，两处不同：
+  - `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中。
+  - `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。
+### 2. 定义Operator类
+```c++
+class MulOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto dim0 = ctx.Input<Tensor>("X")->dims();
+    auto dim1 = ctx.Input<Tensor>("Y")->dims();
+    PADDLE_ENFORCE_EQ(dim0.size(), 2,
+                      "input X(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("X"));
+    PADDLE_ENFORCE_EQ(dim1.size(), 2,
+                      "input Y(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("Y"));
+    PADDLE_ENFORCE_EQ(
+        dim0[1], dim1[0],
+        "First matrix's width must be equal with second matrix's height.");
+    ctx.Output<Tensor>("Out")->Resize({dim0[0], dim1[1]});
+  }
+};
+```
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员：
+```c++
+using framework::OperatorWithKernel::OperatorWithKernel;
+```
+这句表示使用基类`OperatorWithKernel`的构造函数，也可写成：
+```c++
+MulOp(const std::string &type, const framework::VariableNameMap &inputs,
+      const framework::VariableNameMap &outputs,
+      const framework::AttributeMap &attrs)
+  : OperatorWithKernel(type, inputs, outputs, attrs) {}
+```	
+还需要重写`InferShape`接口。`InferShape`为const函数，不能修改Op的成员变量，参数为`const framework::InferShapeContext &ctx`，通过该参数可获取到输入输出以及属性。它的功能是：
+  - 1). 做检查， 尽早报错：检查输入数据维度、类型等是否合法。
+  - 2). 设置输出Tensor的形状。
+通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中，和要讲到的注册函数一起放在`.cc`中
+### 3. 定义OpKernel类
+```C++
+template <typename Place, typename T>
+class MulKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<Tensor>("X");
+    auto* Y = context.Input<Tensor>("Y");
+    auto* Z = context.Output<Tensor>("Out");
+    Z->mutable_data<T>(context.GetPlace());
+    auto* device_context =
+        const_cast<platform::DeviceContext*>(context.device_context_);
+    math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
+  }
+};
+```
+`MulKernel`继承自`framework::OpKernel`，带有模板参数:
+  - `typename  Place`: 表示设备类型，不同设备(CPU、GPU)共享同一个Kernel时，需加该模板参数，不共享则不加，一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
+ - `typename T` : 表示数据类型，如`float`, `double`等。
+`MulKernel`需要重写`Compute`接口，该接口参数为`const framework::ExecutionContext& context`, `ExecutionContext`相比`InferShapeContext`增加了设备类型，同样可获取到输入输出和属性参数，`Compute`函数里写具体实现时。
+注意，不同设备(CPU、GPU)共享一个Op定义，是否则共享同一个`OpKernel`，取决于`Compute`调用的函数是否支持不同设备。`MulOp`的CPU、GPU实现共享同一个`Kernel`，`OpKernel`不共享的例子可以参考[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 
+到此前向Op实现完成，需要在`.cc`文件中注册该op和kernel。反向Op类的定义和Kernel定义与前向Op类似，这里不再重复。但注意，反向Op没有`ProtoMaker`。
+### 4. 注册Operator
+在`.cc`文件中注册前向、反向Op类，注册CPU Kernel。
+```c++
+namespace ops = paddle::operators;
+REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mul_grad,
+              ops::MulGradKernel<paddle::platform::CPUPlace, float>);
+```
+  - `REGISTER_OP` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulOpGrad`，类型名为`mul_grad`，
+  - `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op。
+  - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulKernel`类。
+在 `.cu`文件中注册GPU Kernel。
+```c++
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(mul_grad,
+                       ops::MulGradKernel<paddle::platform::GPUPlace, float>);
+```
+### 5. 编译
+在[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt)文件中添加编译。
+```
+op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function)
+```
+下面命令可以编译：
+```
+make mul_op
+```
+## 绑定Python
+- 绑定Python 
+    在 [`paddle/pybind/pybind.cc 
+`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc)文件中添加该类：
+    ```
+    USE_OP(mul);
+    ```
+    如果只实现了CPU版本，则使用`USE_CPU_ONLY_OP`:
+    ```
+    USE_CPU_ONLY_OP(gather);
+    ```
+    使用`USE_OP`告知编译器需要链接该Op的目标文件，具体解释参考[代码注释](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h#L81)。
+ - 生成库
+   在 [`paddle/pybind/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/CMakeLists.txt)文件添加类到`DEPS`中，使得该Op可以链接到生成的lib库中。
+   ```
+   if(WITH_PYTHON)
+     cc_library(paddle_pybind SHARED
+     SRCS pybind.cc
+     DEPS pybind python backward
+     mul_op
+     minus_op)
+   endif(WITH_PYTHON)
+   ```
+## 实现单元测试
+单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单测](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。
+### 前向Operator单测
+前向Op单测继承自`unittest.TestCase`，并定义元类`__metaclass__ = OpTestMeta`，具体单测流程在`OpTestMeta`里完成。需在`setUp`函数定义输入输出和属性参数，以及Python对比的输出值。
+```
+import unittest
+import numpy as np
+from gradient_checker import GradientChecker, create_op
+from op_test_util import OpTestMeta
+class TestMulOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+    def setUp(self):
+        self.type = "mul"
+        self.inputs = {
+            'X': np.random.random((32, 84)).astype("float32"),
+            'Y': np.random.random((84, 100)).astype("float32")
+        }
+        self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+```
+   首先需要`import`必要的包,下面详细解释其他值：
+   - `self.type = "mul" ` : 定义类型，和注册的类型一致。
+   - `self.inputs` : 定义输入，类型为Numpy.array，并初始化。
+   - `self.outputs` : 定义输出，并得到Python结算结果。
+### 反向Operator单测
+反向Op单测继承自`GradientChecker`，而`GradientChecker`集成自`unittest.TestCase`，所以反向单测函数需要`test_`开头。
+ ```
+ class MulGradOpTest(GradientChecker):
+    def test_mul(self):
+        op = create_op("mul")
+        inputs = {
+            'X': np.random.random((32, 84)).astype("float32"),
+            'Y': np.random.random((84, 100)).astype("float32")
+        }
+        self.compare_grad(op, inputs)      
+        # mul op will enlarge the relative error
+        self.check_grad(
+            op, inputs, set(["X", "Y"]), "Out", max_relative_error=0.5)
+ ```
+   - 调用`create_op("mul")`创建反向Op对应的前向Op。
+   - 定义输入`inputs`。
+   - 调用`compare_grad`函数对比CPU、GPU计算结果。
+   - 调用`check_grad`检查梯度稳定性，这里采用数值法检测梯度正确性。
+      - 第一个参数`op` : 前向op。
+      - 第二个参数`inputs` : 输入词典，词典的Key和`ProtoMaker`定义保持一致。
+      - 第三个参数`set(["X", "Y"])` : 指定对输入变量`X`、`Y`做梯度检测。
+      - 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`
+### 编译和执行 
+单测完成之后，在[`python/paddle/v2/framework/tests/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/CMakeLists.txt)里添加编译：
+```
+py_test(test_mul_op SRCS test_mul_op.py)
+```
+编译时需要打开`WITH_TESTING`, 即 `cmake paddle_dir -DWITH_TESTING=ON`，编译成功之后执行单测命令为：
+```
+make test ARGS="-R test_mul_op -V"
+```
+或者:
+```
+ctest -R test_mul_op
+```
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -19,6 +19,7 @@
 ..  toctree::
  :maxdepth: 1
+  dev/build_cn.rst
  dev/write_docs_cn.rst
  dev/contribute_to_paddle_cn.md

--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -18,6 +18,7 @@ Development
 ..  toctree::
  :maxdepth: 1
+  dev/build_en.rst
  dev/new_layer_en.rst
  dev/contribute_to_paddle_en.md

--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -7,4 +7,3 @@ PaddlePaddle Documentation
  getstarted/index_en.rst
  howto/index_en.rst
  api/index_en.rst
-  about/index_en.rst
--- a/go/master/client.go
+++ b/go/master/client.go
@@ -63,13 +63,24 @@ func WithAddr(addr string) func(c *Client) error {
 // WithEtcd sets the client to use etcd for master discovery.
 func WithEtcd(endpoints []string, timeout time.Duration) func(*Client) error {
 	return func(c *Client) error {
-		cli, err := clientv3.New(clientv3.Config{
+		var cli *clientv3.Client
-			Endpoints:   endpoints,
+		f := func() error {
-			DialTimeout: timeout,
+			var err error
-		})
+			cli, err = clientv3.New(clientv3.Config{
-		if err != nil {
+				Endpoints:   endpoints,
+				DialTimeout: timeout,
+			})
 			return err
 		}
+		for {
+			err := f()
+			if err != nil {
+				log.Warningln(err)
+			} else {
+				break
+			}
+			time.Sleep(time.Second)
+		}
 		ch := make(chan string, 1)
 		a, err := GetKey(cli, DefaultAddrPath, timeout)
@@ -101,9 +112,6 @@ func NewClient(opts ...func(*Client) error) (*Client, error) {
 		}
 	}
 	c.ch = make(chan record, c.bufSize)
-	// FIXME: connection is created asyncrosly in monitorMaster go routine,
-	//        ensure the connection is ready for use before calling c.addClient.
-	time.Sleep(time.Second)
 	return c, nil
 }

--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -15,6 +15,7 @@ if(Boost_FOUND)
  add_subdirectory(platform)
  add_subdirectory(framework)
  add_subdirectory(operators)
+  add_subdirectory(pybind)
 endif()
 if(WITH_C_API)

--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -53,7 +53,10 @@ add_custom_target(paddle_capi_whole ALL
 set_target_properties(paddle_capi_whole
  PROPERTIES IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library})
+set(LINK_FLAGS " -Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/export.sym -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/export.map")
+# TODO: merge mkl into paddle_capi_shared
 add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
+set_target_properties(paddle_capi_shared	PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
 target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
 link_paddle_exe(paddle_capi_shared)

--- a/paddle/capi/export.map
+++ b/paddle/capi/export.map
+{
+	global:
+		paddle_*;
+	local:
+		*;
+};
--- a/paddle/capi/export.sym
+++ b/paddle/capi/export.sym
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -173,6 +173,96 @@ extern void hl_avgpool_backward(const int frameCnt,
                                real* backGrad,
                                const int outStride);
+extern void hl_maxpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 real* maxPoolIdxData,
+                                 const int tgtStride);
+extern void hl_maxpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* targetGrad,
+                                  real* maxPoolIdxData,
+                                  const int outStride);
+extern void hl_avgpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride);
+extern void hl_avgpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  int paddingD,
+                                  int paddingH,
+                                  int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* backGrad,
+                                  const int outStride);
 /**
 * @brief   Bilinear interpolation forward.
 *
@@ -275,4 +365,4 @@ extern void hl_maxout_backward(real* inGrad,
                               size_t featLen,
                               size_t groups);
-#endif /* HL_CNN_H_ */
+#endif  // HL_CNN_H_
--- a/paddle/cuda/include/hl_cpu_gru.cuh
+++ b/paddle/cuda/include/hl_cpu_gru.cuh
@@ -18,14 +18,6 @@ limitations under the License. */
 #ifndef __NVCC__
-#include "paddle/math/MathFunctions.h"
-// #ifndef PADDLE_TYPE_DOUBLE
-// #define     CBLAS_GEMM     paddle::gemm<float>
-// #else
-// #define     CBLAS_GEMM     paddle::gemm<double>
-// #endif
 template<class OpResetOutput>
 void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
                                       real *gateValue,
@@ -210,51 +202,6 @@ inline void forward_final_output(OpFinalOutput opFinalOutput,
  }
 }
-template<class OpResetOutput, class OpFinalOutput>
-void hl_cpu_gru_forward(OpResetOutput opResetOutput,
-                        OpFinalOutput opFinalOutput,
-                        hl_gru_value value,
-                        int frameSize,
-                        int batchSize,
-                        hl_activation_mode_t active_node,
-                        hl_activation_mode_t active_gate) {
-  if (value.prevOutValue) {
-//     CBLAS_GEMM(CblasNoTrans,
-//                CblasNoTrans,
-//                batchSize,
-//                2 * frameSize,
-//                frameSize,
-//                1,
-//                value.prevOutValue,
-//                frameSize,
-//                value.gateWeight,
-//                frameSize * 2,
-//                1,
-//                value.gateValue,
-//                frameSize * 3);
-  }
-  forward_reset_output(opResetOutput, value, frameSize, batchSize, active_gate);
-  if (value.prevOutValue) {
-//    CBLAS_GEMM(CblasNoTrans,
-//               CblasNoTrans,
-//               batchSize,
-//               frameSize,
-//               frameSize,
-//               1,
-//               value.resetOutputValue,
-//               frameSize,
-//               value.stateWeight,
-//               frameSize,
-//               1,
-//               value.gateValue + frameSize * 2,
-//               frameSize * 3);
-  }
-  forward_final_output(opFinalOutput, value, frameSize, batchSize, active_node);
-}
 template<class OpStateGrad>
 void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad,
                                      real *gateValue,
@@ -525,86 +472,6 @@ inline void backward_reset_grad(OpResetGrad opResetGrad,
  }
 }
-template<class OpStateGrad, class OpResetGrad>
-void hl_cpu_gru_backward(OpStateGrad opStateGrad,
-                         OpResetGrad opResetGrad,
-                         hl_gru_value value,
-                         hl_gru_grad  grad,
-                         int frameSize,
-                         int batchSize,
-                         hl_activation_mode_t active_node,
-                         hl_activation_mode_t active_gate) {
-  backward_state_grad(opStateGrad, value, grad,
-    frameSize, batchSize, active_node);
-  if (value.prevOutValue && grad.prevOutGrad) {
-//     CBLAS_GEMM(CblasNoTrans,
-//                CblasTrans,
-//                batchSize,
-//                frameSize,
-//                frameSize,
-//                1,
-//                grad.gateGrad + frameSize * 2,
-//                frameSize * 3,
-//                value.stateWeight,
-//                frameSize,
-//                0,
-//                grad.resetOutputGrad,
-//                frameSize);
-    if (grad.stateWeightGrad) {
-//       CBLAS_GEMM(CblasTrans,
-//                  CblasNoTrans,
-//                  frameSize,
-//                  frameSize,
-//                  batchSize,
-//                  1,
-//                  value.resetOutputValue,
-//                  frameSize,
-//                  grad.gateGrad + frameSize * 2,
-//                  frameSize * 3,
-//                  1,
-//                  grad.stateWeightGrad,
-//                  frameSize);
-    }
-  }
-  backward_reset_grad(opResetGrad, value, grad,
-    frameSize, batchSize, active_gate);
-  if (grad.prevOutGrad && value.prevOutValue) {
-//     CBLAS_GEMM(CblasNoTrans,
-//                CblasTrans,
-//                batchSize,
-//                frameSize,
-//                frameSize * 2,
-//                1,
-//                grad.gateGrad,
-//                frameSize * 3,
-//                value.gateWeight,
-//                frameSize * 2,
-//                1,
-//                grad.prevOutGrad,
-//                frameSize);
-    if (grad.gateWeightGrad) {
-//       CBLAS_GEMM(CblasTrans,
-//                  CblasNoTrans,
-//                  frameSize,
-//                  frameSize * 2,
-//                  batchSize,
-//                  1,
-//                  value.prevOutValue,
-//                  frameSize,
-//                  grad.gateGrad,
-//                  frameSize * 3,
-//                  1,
-//                  grad.gateWeightGrad,
-//                  frameSize * 2);
-    }
-  }
-}
 #endif
 #endif  // HL_CPU_GRU_CUH_
--- a/paddle/cuda/include/hl_cuda_cudnn.h
+++ b/paddle/cuda/include/hl_cuda_cudnn.h
@@ -214,7 +214,8 @@ extern void hl_conv_workspace(hl_tensor_descriptor input,
                              int* convBwdDataAlgo,
                              size_t* bwdDataLimitBytes,
                              int* convBwdFilterAlgo,
-                              size_t* bwdFilterLimitBytes);
+                              size_t* bwdFilterLimitBytes,
+                              bool useDilation);
 /**
 * @brief   destroy filter descriptor.
@@ -242,7 +243,9 @@ extern void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                             int padding_height,
                                             int padding_width,
                                             int stride_height,
-                                             int stride_width);
+                                             int stride_width,
+                                             int dilation_h = 1,
+                                             int dilation_w = 1);
 /**
 * @brief   reset convolution descriptor.
@@ -262,7 +265,9 @@ extern void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                            int padding_height,
                                            int padding_width,
                                            int stride_height,
-                                            int stride_width);
+                                            int stride_width,
+                                            int dilation_h = 1,
+                                            int dilation_w = 1);
 /**
 * @brief   destroy convolution descriptor.

--- a/paddle/cuda/include/hl_matrix.h
+++ b/paddle/cuda/include/hl_matrix.h
@@ -224,4 +224,80 @@ extern void hl_matrix_collect_shared_bias(real* B_d,
 extern void hl_matrix_rotate(
    real* mat, real* matRot, int dimM, int dimN, bool clockWise);
+/**
+ * @brief  Matrix vol2Col: Convert 3D volume into col matrix
+ *
+ * @param[in]   matSrc     input matrix.
+ * @param[in]   channel    channel of matSrc.
+ * @param[in]   depth      depth of matSrc.
+ * @param[in]   height     height of matSrc.
+ * @param[in]   width      width of matSrc.
+ * @param[in]   filterD    depth of filter.
+ * @param[in]   filterH    height of filter.
+ * @param[in]   filterW    width of filter.
+ * @param[in]   strideD    stride in the depth.
+ * @param[in]   strideH    stride in the height.
+ * @param[in]   strideW    stride in the width.
+ * @param[in]   paddingD   padding in the depth.
+ * @param[in]   paddingH   padding in the height.
+ * @param[in]   paddingW   padding in the width.
+ * @param[out]   dataDst     output matrix.
+ *
+ */
+extern void hl_matrix_vol2Col(const real* dataSrc,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              real* dataDst);
+/**
+ * @brief  Matrix col2Vol: Convert col matrix into 3D volume
+ *
+ * @param[out]  matDst     output matrix.
+ * @param[in]   channel    channel of matDst.
+ * @param[in]   depth      depth of matDst.
+ * @param[in]   height     height of matDst.
+ * @param[in]   width      width of matDst.
+ * @param[in]   filterD    depth of filter.
+ * @param[in]   filterH    height of filter.
+ * @param[in]   filterW    width of filter.
+ * @param[in]   strideD    stride in the depth.
+ * @param[in]   strideH    stride in the height.
+ * @param[in]   strideW    stride in the width.
+ * @param[in]   paddingD   padding in the depth.
+ * @param[in]   paddingH   padding in the height.
+ * @param[in]   paddingW   padding in the width.
+ * @param[in]   matSrc     input matrix.
+ * @param[in]   beta       input
+ * @param[in]   alpha      input
+ *
+ */
+extern void hl_matrix_col2Vol(real* dataDst,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              const real* dataSrc,
+                              real alpha,
+                              real beta);
 #endif /* HL_MATRIX_H_ */
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -87,6 +87,96 @@ inline void hl_avgpool_backward(const int frameCnt,
                                real* backGrad,
                                const int outStride) {}
+inline void hl_maxpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 real* maxPoolIdxData,
+                                 const int tgtStride) {}
+inline void hl_maxpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* targetGrad,
+                                  real* maxPoolIdxData,
+                                  const int outStride) {}
+inline void hl_avgpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride) {}
+inline void hl_avgpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* backGrad,
+                                  const int outStride) {}
 inline void hl_bilinear_forward(const real* inData,
                                const size_t inImgH,
                                const size_t inImgW,

--- a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
@@ -78,7 +78,9 @@ inline void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                             int padding_height,
                                             int padding_width,
                                             int stride_height,
-                                             int stride_width) {}
+                                             int stride_width,
+                                             int dilation_h,
+                                             int dilation_w) {}
 inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                            hl_tensor_descriptor image,
@@ -86,7 +88,9 @@ inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                            int padding_height,
                                            int padding_width,
                                            int stride_height,
-                                            int stride_width) {}
+                                            int stride_width,
+                                            int dilation_h,
+                                            int dilation_w) {}
 inline void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {}
@@ -99,7 +103,8 @@ inline void hl_conv_workspace(hl_tensor_descriptor input,
                              int* convBwdDataAlgo,
                              size_t* bwdDataLimitBytes,
                              int* convBwdFilterAlgo,
-                              size_t* bwdFilterLimitBytes) {}
+                              size_t* bwdFilterLimitBytes,
+                              bool useDilation) {}
 inline void hl_convolution_forward(hl_tensor_descriptor input,
                                   real* input_data,

--- a/paddle/cuda/include/stub/hl_matrix_stub.h
+++ b/paddle/cuda/include/stub/hl_matrix_stub.h
@@ -99,4 +99,38 @@ inline void hl_matrix_collect_shared_bias(real* B_d,
 inline void hl_matrix_rotate(
    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {}
+inline void hl_matrix_vol2Col(const real* dataSrc,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              real* dataDst) {}
+inline void hl_matrix_col2Vol(real* dataDst,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              const real* dataSrc,
+                              real alpha,
+                              real beta) {}
 #endif  // HL_MATRIX_STUB_H_
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -201,7 +201,8 @@ void hl_conv_workspace(hl_tensor_descriptor input,
                       int* convBwdDataAlgo,
                       size_t* bwdDataLimitBytes,
                       int* convBwdFilterAlgo,
-                       size_t* bwdFilterLimitBytes) {
+                       size_t* bwdFilterLimitBytes,
+                       bool useDilation) {
 #if CUDNN_VERSION >= 4000
  CHECK_NOTNULL(input);
@@ -213,21 +214,60 @@ void hl_conv_workspace(hl_tensor_descriptor input,
  size_t memoryLimitBytes =
      (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
+  // For dilation
+  int algo = 0;
  // cudnn convolution forward configuration
  cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
  cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
  cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
  cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+  // cudnn convolution backward data configuration
+  cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
+  cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnConvolutionDescriptor_t bwd_data_conv_desc =
+      GET_CONVOLUTION_DESCRIPTOR(conv);
+  // cudnn convolution backward filter configuration
+  cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
+      GET_CONVOLUTION_DESCRIPTOR(conv);
+  cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
-  CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
+  if (useDilation) {
-      t_resource.cudnn_handle,
+    convFwdAlgo = &algo;
-      fwd_src_desc,
+    convBwdDataAlgo = &algo;
-      fwd_filter_desc,
+    convBwdFilterAlgo = &algo;
-      fwd_conv_desc,
+  } else {
-      fwd_dest_desc,
+    CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
-      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+        t_resource.cudnn_handle,
-      memoryLimitBytes,
+        fwd_src_desc,
-      reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
+        fwd_filter_desc,
+        fwd_conv_desc,
+        fwd_dest_desc,
+        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+        memoryLimitBytes,
+        reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
+    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+        t_resource.cudnn_handle,
+        bwd_data_filter_desc,
+        bwd_data_diff_desc,
+        bwd_data_conv_desc,
+        bwd_data_grad_desc,
+        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+        memoryLimitBytes,
+        reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
+    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+        t_resource.cudnn_handle,
+        bwd_filter_src_desc,
+        bwd_filter_diff_desc,
+        bwd_filter_conv_desc,
+        bwd_filter_grad_desc,
+        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+        memoryLimitBytes,
+        reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
+  }
  CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
      t_resource.cudnn_handle,
@@ -238,23 +278,6 @@ void hl_conv_workspace(hl_tensor_descriptor input,
      static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo),
      fwdLimitBytes));
-  // cudnn convolution backward data configuration
-  cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
-  cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnConvolutionDescriptor_t bwd_data_conv_desc =
-      GET_CONVOLUTION_DESCRIPTOR(conv);
-  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-      t_resource.cudnn_handle,
-      bwd_data_filter_desc,
-      bwd_data_diff_desc,
-      bwd_data_conv_desc,
-      bwd_data_grad_desc,
-      CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-      memoryLimitBytes,
-      reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
      t_resource.cudnn_handle,
      bwd_data_filter_desc,
@@ -264,23 +287,6 @@ void hl_conv_workspace(hl_tensor_descriptor input,
      static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
      bwdDataLimitBytes));
-  // cudnn convolution backward filter configuration
-  cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
-      GET_CONVOLUTION_DESCRIPTOR(conv);
-  cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
-  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-      t_resource.cudnn_handle,
-      bwd_filter_src_desc,
-      bwd_filter_diff_desc,
-      bwd_filter_conv_desc,
-      bwd_filter_grad_desc,
-      CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-      memoryLimitBytes,
-      reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
      t_resource.cudnn_handle,
      bwd_filter_src_desc,
@@ -603,7 +609,9 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                      int padding_height,
                                      int padding_width,
                                      int stride_height,
-                                      int stride_width) {
+                                      int stride_width,
+                                      int dilation_h,
+                                      int dilation_w) {
  CHECK_NOTNULL(conv);
  cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc(
@@ -625,18 +633,24 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                                       padding_width,
                                                       stride_height,
                                                       stride_width,
-                                                       1,
+                                                       dilation_h,
-                                                       1,
+                                                       dilation_w,
                                                       mode,
                                                       data_type));
 #else
+  if (dilation_h > 1 || dilation_w > 1) {
+    LOG(FATAL)
+        << "Current cuDNN version does't support for dilation convolution. "
+        << "The dilation convolution requires cuDNN >= v6.0.";
+  }
  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
                                                       padding_height,
                                                       padding_width,
                                                       stride_height,
                                                       stride_width,
-                                                       1,
+                                                       dilation_h,
-                                                       1,
+                                                       dilation_w,
                                                       mode));
 #endif
@@ -659,7 +673,9 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                     int padding_height,
                                     int padding_width,
                                     int stride_height,
-                                     int stride_width) {
+                                     int stride_width,
+                                     int dilation_h,
+                                     int dilation_w) {
  CHECK_NOTNULL(conv);
  CHECK_NOTNULL(image);
  CHECK_NOTNULL(filter);
@@ -678,8 +694,8 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                                       padding_width,
                                                       stride_height,
                                                       stride_width,
-                                                       1,
+                                                       dilation_h,
-                                                       1,
+                                                       dilation_w,
                                                       mode,
                                                       data_type));
 #else
@@ -688,8 +704,8 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                                       padding_width,
                                                       stride_height,
                                                       stride_width,
-                                                       1,
+                                                       dilation_h,
-                                                       1,
+                                                       dilation_w,
                                                       mode));
 #endif

--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@@ -592,3 +592,204 @@ void hl_matrix_rotate(
      mat, matRot, dimM, dimN, clockWise);
  CHECK_SYNC("hl_matrix_rotate failed");
 }
+__global__ void keMatrixVol2Col(int num_kernels,
+                                const real* dataSrc,
+                                real* dataDst,
+                                int depth,
+                                int height,
+                                int width,
+                                int filterD,
+                                int filterH,
+                                int filterW,
+                                int strideD,
+                                int strideH,
+                                int strideW,
+                                int paddingD,
+                                int paddingH,
+                                int paddingW,
+                                int depth_col,
+                                int height_col,
+                                int width_col) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
+       index += blockDim.x * gridDim.x) {
+    int w_out = index % width_col;
+    int h_out = (index / width_col) % height_col;
+    int d_out = (index / width_col / height_col) % depth_col;
+    int channel_in = index / width_col / height_col / depth_col;
+    int channel_out = channel_in * filterD * filterH * filterW;
+    int w_in = w_out * strideW - paddingW;
+    int h_in = h_out * strideH - paddingH;
+    int d_in = d_out * strideD - paddingD;
+    dataDst +=
+        ((channel_out * depth_col + d_out) * height_col + h_out) * width_col +
+        w_out;
+    dataSrc += ((channel_in * depth + d_in) * height + h_in) * width + w_in;
+    for (int k = 0; k < filterD; ++k) {
+      for (int i = 0; i < filterH; ++i) {
+        for (int j = 0; j < filterW; ++j) {
+          int d = d_in + k;
+          int h = h_in + i;
+          int w = w_in + j;
+          *dataDst = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 &&
+                      w < width)
+                         ? dataSrc[(k * height + i) * width + j]
+                         : 0;
+          dataDst += depth_col * height_col * width_col;
+        }
+      }
+    }
+  }
+}
+void hl_matrix_vol2Col(const real* dataSrc,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW,
+                       real* dataDst) {
+  int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
+  int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
+  int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
+  int num_kernels = channels * depth_col * height_col * width_col;
+  const int threads = 512;
+  const int blocks = DIVUP(num_kernels, threads);
+  keMatrixVol2Col<<<blocks, threads, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                          dataSrc,
+                                                          dataDst,
+                                                          depth,
+                                                          height,
+                                                          width,
+                                                          filterD,
+                                                          filterH,
+                                                          filterW,
+                                                          strideD,
+                                                          strideH,
+                                                          strideW,
+                                                          paddingD,
+                                                          paddingH,
+                                                          paddingW,
+                                                          depth_col,
+                                                          height_col,
+                                                          width_col);
+  CHECK_SYNC("hl_matrix_vol2Col failed");
+}
+__global__ void keMatrixCol2Vol(int num_kernels,
+                                real* dataDst,
+                                const real* dataSrc,
+                                int depth,
+                                int height,
+                                int width,
+                                int filterD,
+                                int filterH,
+                                int filterW,
+                                int strideD,
+                                int strideH,
+                                int strideW,
+                                int paddingD,
+                                int paddingH,
+                                int paddingW,
+                                int depth_col,
+                                int height_col,
+                                int width_col,
+                                real alpha,
+                                real beta) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
+       index += blockDim.x * gridDim.x) {
+    real srcVal = 0;
+    real dstVal = dataDst[index];
+    int w = index % width + paddingW;
+    int h = (index / width) % height + paddingH;
+    int d = (index / width / height) % depth + paddingD;
+    int c = index / width / height / depth;
+    // compute the start and end of the output
+    int w_col_start = (w < filterW) ? 0 : (w - filterW) / strideW + 1;
+    int w_col_end = min(w / strideW + 1, width_col);
+    int h_col_start = (h < filterH) ? 0 : (h - filterH) / strideH + 1;
+    int h_col_end = min(h / strideH + 1, height_col);
+    int d_col_start = (d < filterD) ? 0 : (d - filterD) / strideD + 1;
+    int d_col_end = min(d / strideD + 1, depth_col);
+    int offset = (c * filterD * filterW * filterH + d * filterW * filterH +
+                  h * filterW + w) *
+                 depth_col * height_col * width_col;
+    int coeff_d_col =
+        (1 - strideD * filterW * filterH * depth_col) * height_col * width_col;
+    int coeff_h_col =
+        (1 - strideH * filterW * depth_col * height_col) * width_col;
+    int coeff_w_col = (1 - strideW * depth_col * height_col * width_col);
+    for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          srcVal += dataSrc[offset + d_col * coeff_d_col + h_col * coeff_h_col +
+                            w_col * coeff_w_col];
+        }
+      }
+    }
+    dataDst[index] = alpha * srcVal + beta * dstVal;
+  }
+}
+void hl_matrix_col2Vol(real* dataDst,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW,
+                       const real* dataSrc,
+                       real alpha,
+                       real beta) {
+  int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
+  int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
+  int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
+  int num_kernels = channels * depth * height * width;
+  const int threads = 512;
+  const int blocks = DIVUP(num_kernels, threads);
+  keMatrixCol2Vol<<<blocks, threads, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                          dataDst,
+                                                          dataSrc,
+                                                          depth,
+                                                          height,
+                                                          width,
+                                                          filterD,
+                                                          filterH,
+                                                          filterW,
+                                                          strideD,
+                                                          strideH,
+                                                          strideW,
+                                                          paddingD,
+                                                          paddingH,
+                                                          paddingW,
+                                                          depth_col,
+                                                          height_col,
+                                                          width_col,
+                                                          alpha,
+                                                          beta);
+  CHECK_SYNC("hl_matrix_col2Vol failed");
+}
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -18,8 +18,8 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
 proto_library(framework_proto SRCS framework.proto)
 cc_library(attribute SRCS attribute.cc DEPS framework_proto)
+cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
-cc_library(operator SRCS operator.cc DEPS framework_proto device_context tensor scope attribute)
+cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
 cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator)
@@ -39,21 +39,3 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
-if(WITH_PYTHON)
-cc_library(paddle_pybind SHARED
-    SRCS pybind.cc
-    DEPS pybind python backward
-    sgd_op
-    add_op
-    mul_op
-    rowwise_add_op
-    sigmoid_op
-    softmax_op
-    mean_op
-    cross_entropy_op
-    recurrent_op
-    uniform_random_op
-    gaussian_random_op
-    fill_zeros_like_op)
-endif(WITH_PYTHON)
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -124,6 +124,9 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
    std::list<Pos> insert_position;
    for (auto& dup_output_op : dup_output_ops) {
      const std::string& name = dup_output_op.first;
+      // duplicate @Empty@ don't need to be added
+      if (name == kEmptyVarName) continue;
      auto& dup_op = dup_output_op.second;
      // no duplicate output
      if (dup_op.size() == 1) continue;
@@ -209,7 +212,7 @@ std::unique_ptr<OperatorBase> Backward(
    const OperatorBase& forwardOp,
    const std::unordered_set<std::string>& no_grad_vars) {
  std::unordered_set<std::string> no_grad_names;
-  no_grad_names.reserve(no_grad_vars.size());
+  no_grad_names.reserve(no_grad_vars.size() + 1);
  no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);

--- a/paddle/framework/backward.md
+++ b/paddle/framework/backward.md
-## Operator/expression 's Backward
+# Operator/expression 's Backward
-### Motivation
+## Motivation
-In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the fundmental gradient operators/expressions together with chain rule . Every forward network need a backward network to construct the full computation lineage, the operator/ expression's Backward feature will generate the backward pass respect to forward pass.
+In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the fundmental gradient operators/expressions together with chain rule . Every forward network need a backward network to construct the full computation graph, the operator/expression's backward pass will be generated respect to forward pass.
+## Backward Operator Registry
-### Implement : gradient operator registry
+A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs and output gradients and then calculate its input gradients.
-|                        | forward operator | backward operator                |
+|                        | forward operator | backward operator 
-| ---------------------- | ---------------- | -------------------------------- |
+| ---------------------- | ---------------- |------------------------- |		
-| **Operator::inputs_**  | Inputs           | Inputs, Outputs, OutputGradients |
+| **Operator::inputs_**  | Inputs       | Inputs, Outputs, OutputGradients |	
-| **Operator::outputs_** | Outputs          | InputGradients                   |
+| **Operator::outputs_** | Outputs          | InputGradients            |
-Inputs/Outputs means the input/output of the operator,  InputGradients/OutputGradients is the gradient respect to forward opeartor. Forward operator and Backward operator are isomorphic, save their corresponding needs into member attribute.
+ In most cases, there is a one-to-one correspondence between forward and backward operators. These correspondences are recorded by a global hash map(`OpInfoMap`). To follow the philosophy of minimum core and make operators pluggable, the registry mechanism is introduced.
-We use a global hash map record the gradient operators available, follow the philosophy  of minimum core, make operator pluggable unit. Each gradient is an operator and it needs to regist itself. 
+For example, we have got a `mul_op`, and we can register it's information and corresponding backward operator by the following macro:
-grad_op_builder(fengjiayi)
+```cpp
+REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
+```
-### Implement : Backward network
+`mul` is the operator's type. `MulOp` and `MulOpMaker` are the operator class and the operator maker class respectively.
+`mul_grad` is the type of backward operator, and `MulOpGrad` is its class name.
+## Backward Opeartor Creating
+Given a certain forward operator, we can get its corresponding backward opeartor by calling:
+```cpp
+OperatorBase* bwd_op = BuildGradOp(const OperatorBase* fwd_op);
+``` 
+The function `BuildGradOp` will sequentially execute following processes:
+1. Get the `type_` of given forward operator, and then get the corresponding backward operator's type by looking up the `OpInfoMap`.
+2. Build two maps named `inputs` and `outputs` to temporary storage backward operator's inputs and outputs. Copy forward operator's `inputs_` and `outputs_` to map `inputs`, except these are not necessary for gradient computing.
+3. Add forward inputs' gradient variables into map `output`, adding forward outputs' gradient variables into map `input`.
+4. Building backward operator with `inputs`, `outputs` and forward operator's attributes.
+## Backward Network Building
+A backward network is a series of backward operators. The main idea of building a backward network is creating backward operators in the inverted sequence and put them together.
+In our design, the network itself is also a kind of operator. So the operators contained by a big network may be some small network. 
 given a forward network, it generates the backward network. We only care about the Gradients—`OutputGradients`,`InputGradients`.
-1. bla bla bla (yuyang)
+1. Op 
+   when the input forward network is a Op, return its gradient Operator Immediately.
 2. NetOp 
-   when the input forward network is a NetOp, it need to call the sub NetOp/Operators backward function recursively and ensure them done. During the process, we need to collect the `OutputGradients` name.
+   when the input forward network is a NetOp, it need to call the sub NetOp/Operators backward function recursively. During the process, we need to collect the `OutputGradients` name according to forward NetOp.
+   **shared variable**. As illustrated in the pictures, two operator's `Output` `Gradient` will overwirte their shared input variable.  
+   <p align="center">
+   <img src="./images/duplicate_op.png" width="70%" ><br/>
+   1. shared variable in two operators. 
+   </p>
+   Share variable between operators or same input variable used in multiple operators lead to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively, and add a generic add operator replace the overwirte links. 
+   <p align="center">
+   <img src="images/duplicate_op2.png" width="90%" ><br/>
-   We share variable in the same scope, as a result, duplicate operator `OutputGradients` will overwirte then duplicate variable.  
+   2. replace shared variable gradient with `Add` Operator
-   ![./images/duplicate_op]()
+   </p>
-    Share variable between operators or same input variable used in multiple operators lead to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively, and add a generic add operator instead. 
-![./images/duplicate_op2]()
-	Then collect the sub graph OutputGradients/InputGradients as the NetOp's and return it.
+	Then collect the sub graph `OutputGradients`/`InputGradients` as the NetOp's and return it.
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -72,8 +72,8 @@ class NoGradOpMaker : public OpProtoAndCheckerMaker {
 class FcOp : public operators::NetOp {
 public:
-  FcOp(const std::string &type, const VarNameMap &inputs,
+  FcOp(const std::string &type, const VariableNameMap &inputs,
-       const VarNameMap &outputs, const AttributeMap &attrs)
+       const VariableNameMap &outputs, const AttributeMap &attrs)
      : NetOp(type, inputs, outputs, attrs) {
    AppendOp(OpRegistry::CreateOp("mul",
                                  {{"X", {Input("X")}}, {"Y", {Input("W")}}},

--- a/paddle/framework/grad_op_builder.cc
+++ b/paddle/framework/grad_op_builder.cc
@@ -20,13 +20,13 @@ namespace framework {
 enum class OpArgType { IN, OUT };
 static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type,
-                       bool is_grad, OperatorBase::VarNameMap* vars) {
+                       bool is_grad, VariableNameMap* vars) {
  const auto& src_inout =
      src_type == OpArgType::IN ? src_op->Inputs() : src_op->Outputs();
  auto& dst_inout = *vars;
-  const OpProto* proto = OpRegistry::op_info_map().at(src_op->Type()).proto_;
+  auto& proto = OpInfoMap::Instance().Get(src_op->Type()).Proto();
  const auto& src_arg_list =
-      src_type == OpArgType::IN ? proto->inputs() : proto->outputs();
+      src_type == OpArgType::IN ? proto.inputs() : proto.outputs();
  for (const auto& arg : src_arg_list) {
    if (arg.not_in_gradient() && !is_grad) continue;
    const std::string src_name = arg.name();
@@ -40,26 +40,18 @@ static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type,
 }
 OperatorBase* BuildGradOp(const OperatorBase* op) {
-  auto it = OpRegistry::op_info_map().find(op->Type());
+  auto& info = OpInfoMap::Instance().Get(op->Type());
-  PADDLE_ENFORCE(it != OpRegistry::op_info_map().end(),
+  PADDLE_ENFORCE(info.HasGradientOp());
-                 "'%s' has not been registered.", op->Type());
-  PADDLE_ENFORCE(it->second.proto_ != nullptr, "'%s' has no OpProto.",
-                 op->Type());
-  std::string grad_op_type = it->second.grad_op_type_;
-  PADDLE_ENFORCE(!grad_op_type.empty(), "'%s' has no gradient operator.",
-                 op->Type());
-  OperatorBase::VarNameMap inputs;
+  VariableNameMap inputs;
-  OperatorBase::VarNameMap outputs;
+  VariableNameMap outputs;
  TransOpArg(op, OpArgType::IN, false, &inputs);   // I
  TransOpArg(op, OpArgType::OUT, false, &inputs);  // O
  TransOpArg(op, OpArgType::OUT, true, &inputs);   // OG
  TransOpArg(op, OpArgType::IN, true, &outputs);   // IG
-  it = OpRegistry::op_info_map().find(grad_op_type);
+  auto& grad_info = OpInfoMap::Instance().Get(info.grad_op_type_);
-  PADDLE_ENFORCE(it != OpRegistry::op_info_map().end(),
+  return grad_info.Creator()(info.grad_op_type_, inputs, outputs, op->Attrs());
-                 "'%s' has not been registered.", grad_op_type);
-  return it->second.creator_(grad_op_type, inputs, outputs, op->Attrs());
 }
 }  // namespace framework

--- a/paddle/framework/op_info.cc
+++ b/paddle/framework/op_info.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/framework/op_info.h"
+namespace paddle {
+namespace framework {
+static OpInfoMap* g_op_info_map = nullptr;
+OpInfoMap& OpInfoMap::Instance() {
+  if (g_op_info_map == nullptr) {
+    g_op_info_map = new OpInfoMap();
+  }
+  return *g_op_info_map;
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <functional>
+#include <map>
+#include <string>
+#include <unordered_map>
+#include "paddle/framework/attribute.h"
+namespace paddle {
+namespace framework {
+class OperatorBase;
+using VariableNameMap = std::map<std::string, std::vector<std::string>>;
+using OpCreator = std::function<OperatorBase*(
+    const std::string& /*type*/, const VariableNameMap& /*inputs*/,
+    const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
+struct OpInfo {
+  OpCreator creator_;
+  std::string grad_op_type_;
+  OpProto* proto_;
+  OpAttrChecker* checker_;
+  bool HasOpProtoAndChecker() const {
+    return proto_ != nullptr && checker_ != nullptr;
+  }
+  const OpProto& Proto() const {
+    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered");
+    PADDLE_ENFORCE(proto_->IsInitialized(),
+                   "Operator Proto must be initialized in op info");
+    return *proto_;
+  }
+  const OpAttrChecker& Checker() const {
+    PADDLE_ENFORCE_NOT_NULL(checker_,
+                            "Operator Checker has not been registered");
+    return *checker_;
+  }
+  const OpCreator& Creator() const {
+    PADDLE_ENFORCE_NOT_NULL(creator_,
+                            "Operator Creator has not been registered");
+    return creator_;
+  }
+  bool HasGradientOp() const { return !grad_op_type_.empty(); }
+};
+class OpInfoMap {
+ public:
+  static OpInfoMap& Instance();
+  OpInfoMap(const OpInfoMap& o) = delete;
+  OpInfoMap(OpInfoMap&& o) = delete;
+  OpInfoMap& operator=(const OpInfoMap& o) = delete;
+  OpInfoMap& operator=(OpInfoMap&& o) = delete;
+  bool Has(const std::string& op_type) const {
+    return map_.find(op_type) != map_.end();
+  }
+  void Insert(const std::string& type, const OpInfo& info) {
+    PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
+    map_.insert({type, info});
+  }
+  const OpInfo& Get(const std::string& type) const {
+    auto it = map_.find(type);
+    PADDLE_ENFORCE(it != map_.end(), "Operator %s are not found", type);
+    return it->second;
+  }
+  template <typename Callback>
+  void IterAllInfo(Callback callback) {
+    for (auto& it : map_) {
+      callback(it.first, it.second);
+    }
+  }
+ private:
+  OpInfoMap() = default;
+  std::unordered_map<std::string, const OpInfo> map_;
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
@@ -19,32 +19,18 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const std::string& type,
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
-                                                   const VarNameMap& inputs,
+    const std::string& type, const VariableNameMap& inputs,
-                                                   const VarNameMap& outputs,
+    const VariableNameMap& outputs, AttributeMap attrs) {
-                                                   AttributeMap attrs) {
+  auto& info = OpInfoMap::Instance().Get(type);
-  auto it = op_info_map().find(type);
+  info.Checker().Check(attrs);
-  PADDLE_ENFORCE(it != op_info_map().end(),
+  auto op = info.Creator()(type, inputs, outputs, attrs);
-                 "Operator '%s' has not been registered.", type);
-  it->second.checker_->Check(attrs);
-  auto op = it->second.creator_(type, inputs, outputs, attrs);
  return std::unique_ptr<OperatorBase>(op);
 }
-std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
+static VariableNameMap ConvertOpDescVarsToVarNameMap(
-  VarNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
-  VarNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
-  AttributeMap attrs;
-  for (auto& attr : op_desc.attrs()) {
-    attrs[attr.name()] = GetAttrValue(attr);
-  }
-  return CreateOp(op_desc.type(), inputs, outputs, attrs);
-}
-OperatorBase::VarNameMap OpRegistry::ConvertOpDescVarsToVarNameMap(
    const google::protobuf::RepeatedPtrField<OpDesc::Var>& op_desc_vars) {
-  VarNameMap ret_val;
+  VariableNameMap ret_val;
  for (auto& var : op_desc_vars) {
    auto& var_names = ret_val[var.parameter()];
    auto& var_names_in_proto = var.arguments();
@@ -55,6 +41,17 @@ OperatorBase::VarNameMap OpRegistry::ConvertOpDescVarsToVarNameMap(
  return ret_val;
 }
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
+  VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
+  VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
+  AttributeMap attrs;
+  for (auto& attr : op_desc.attrs()) {
+    attrs[attr.name()] = GetAttrValue(attr);
+  }
+  return CreateOp(op_desc.type(), inputs, outputs, attrs);
+}
 std::unique_ptr<OperatorBase> OpRegistry::CreateGradOp(const OperatorBase& op) {
  PADDLE_ENFORCE(!op.IsNetOp(), "Use framework::Backward to get backward ops");
  return std::unique_ptr<OperatorBase>(BuildGradOp(&op));

--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/grad_op_builder.h"
+#include "paddle/framework/op_info.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"
@@ -30,28 +31,16 @@ namespace paddle {
 namespace framework {
 class OpRegistry {
-  using VarNameMap = OperatorBase::VarNameMap;
-  using OpCreator = std::function<OperatorBase*(
-      const std::string& /*type*/, const VarNameMap& /*inputs*/,
-      const VarNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
 public:
-  struct OpInfo {
-    OpCreator creator_;
-    std::string grad_op_type_;
-    OpProto* proto_;
-    OpAttrChecker* checker_;
-  };
  template <typename OpType, typename ProtoMakerType, typename GradOpType>
  static void RegisterOp(const std::string& op_type,
                         const std::string& grad_op_type) {
-    PADDLE_ENFORCE(op_info_map().count(op_type) == 0,
+    PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type),
                   "'%s' is registered more than once.", op_type);
    OpInfo op_info;
-    op_info.creator_ = [](const std::string& type, const VarNameMap& inputs,
+    op_info.creator_ = [](
-                          const VarNameMap& outputs,
+        const std::string& type, const VariableNameMap& inputs,
-                          const AttributeMap& attrs) {
+        const VariableNameMap& outputs, const AttributeMap& attrs) {
      return new OpType(type, inputs, outputs, attrs);
    };
    op_info.grad_op_type_ = grad_op_type;
@@ -70,7 +59,7 @@ class OpRegistry {
      op_info.proto_ = nullptr;
      op_info.checker_ = nullptr;
    }
-    op_info_map().insert(std::make_pair(op_type, op_info));
+    OpInfoMap::Instance().Insert(op_type, op_info);
    // register gradient op
    if (!grad_op_type.empty()) {
      RegisterOp<GradOpType, NOPMaker, NOP>(grad_op_type, "");
@@ -78,21 +67,13 @@ class OpRegistry {
  }
  static std::unique_ptr<OperatorBase> CreateOp(const std::string& type,
-                                                const VarNameMap& inputs,
+                                                const VariableNameMap& inputs,
-                                                const VarNameMap& outputs,
+                                                const VariableNameMap& outputs,
                                                AttributeMap attrs);
  static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);
-  static VarNameMap ConvertOpDescVarsToVarNameMap(
-      const google::protobuf::RepeatedPtrField<OpDesc::Var>& op_desc_vars);
  static std::unique_ptr<OperatorBase> CreateGradOp(const OperatorBase& op);
-  static std::unordered_map<std::string, const OpInfo>& op_info_map() {
-    static std::unordered_map<std::string, const OpInfo> op_info_map_;
-    return op_info_map_;
-  }
 };
 class Registrar {

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -115,8 +115,8 @@ void OperatorBase::Rename(const std::string& old_name,
 }
 OperatorBase::OperatorBase(const std::string& type,
-                           const OperatorBase::VarNameMap& inputs,
+                           const VariableNameMap& inputs,
-                           const OperatorBase::VarNameMap& outputs,
+                           const VariableNameMap& outputs,
                           const AttributeMap& attrs)
    : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) {
  static std::atomic<size_t> gUniqId(0UL);
@@ -141,18 +141,10 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
    }
    return ret_val;
  }
-  auto it = OpRegistry::op_info_map().find(type_);
+  auto& info = OpInfoMap::Instance().Get(Type());
-  PADDLE_ENFORCE(
-      it != OpRegistry::op_info_map().end(),
-      "Operator %s not registered, cannot figure out intermediate outputs",
-      type_);
-  PADDLE_ENFORCE(
-      it->second.proto_ != nullptr,
-      "Operator %s has no OpProto, cannot figure out intermediate outputs",
-      type_);
  // get all OpProto::Var for outputs
-  for (auto& o : it->second.proto_->outputs()) {
+  for (auto& o : info.Proto().outputs()) {
    // ignore all intermediate output
    if (o.intermediate()) continue;
    auto out = outputs_.find(o.name());

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
+#include "op_info.h"
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/scope.h"
@@ -62,10 +63,8 @@ class ExecutionContext;
 */
 class OperatorBase {
 public:
-  using VarNameMap = std::map<std::string, std::vector<std::string>>;
+  OperatorBase(const std::string& type, const VariableNameMap& inputs,
+               const VariableNameMap& outputs, const AttributeMap& attrs);
-  OperatorBase(const std::string& type, const VarNameMap& inputs,
-               const VarNameMap& outputs, const AttributeMap& attrs);
  virtual ~OperatorBase() {}
@@ -93,8 +92,8 @@ class OperatorBase {
  /// rename inputs outputs name
  void Rename(const std::string& old_name, const std::string& new_name);
-  const VarNameMap& Inputs() const { return inputs_; }
+  const VariableNameMap& Inputs() const { return inputs_; }
-  const VarNameMap& Outputs() const { return outputs_; }
+  const VariableNameMap& Outputs() const { return outputs_; }
  //! Get a input with argument's name described in `op_proto`
  const std::string& Input(const std::string& name) const;
  //! Get a input which has multiple variables.
@@ -122,30 +121,32 @@ class OperatorBase {
  // I (Inputs)opear
  // O (Outputs)
  // OG (Output Gradients)
-  VarNameMap inputs_;
+  VariableNameMap inputs_;
  // NOTE: in case of OpGrad, outputs_ contains
  // IG (Inputs Gradients)
-  VarNameMap outputs_;
+  VariableNameMap outputs_;
  AttributeMap attrs_;
 };
 // Macro for define a clone method.
 // If you are writing an kernel operator, `Clone` will be defined when you
 // register it. i.e. `Clone` method is not needed to define by yourself.
-#define DEFINE_OP_CLONE_METHOD(CLS)                       \
+#define DEFINE_OP_CLONE_METHOD(cls)                       \
  std::unique_ptr<OperatorBase> Clone() const final {     \
-    return std::unique_ptr<OperatorBase>(new CLS(*this)); \
+    return std::unique_ptr<OperatorBase>(new cls(*this)); \
  }
 // Macro for define a default constructor for Operator.
 // You can also use
 //   using PARENT_CLASS::PARENT_CLASS;
 // to use parent's constructor.
-#define DEFINE_OP_CONSTRUCTOR(CLS, PARENT_CLS)                                 \
+#define DEFINE_OP_CONSTRUCTOR(cls, parent_cls)             \
-  CLS(const std::string& type, const VarNameMap& inputs,                       \
+  cls(const std::string& type,                             \
-      const VarNameMap& outputs, const paddle::framework::AttributeMap& attrs) \
+      const ::paddle::framework::VariableNameMap& inputs,  \
-      : PARENT_CLS(type, inputs, outputs, attrs) {}
+      const ::paddle::framework::VariableNameMap& outputs, \
+      const paddle::framework::AttributeMap& attrs)        \
+      : parent_cls(type, inputs, outputs, attrs) {}
 class NOP : public OperatorBase {
 public:
@@ -389,8 +390,8 @@ class OperatorWithKernel : public OperatorBase {
  using OpKernelMap =
      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
-  OperatorWithKernel(const std::string& type, const VarNameMap& inputs,
+  OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
-                     const VarNameMap& outputs, const AttributeMap& attrs)
+                     const VariableNameMap& outputs, const AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void InferShape(const Scope& scope) const override {

--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -23,8 +23,8 @@ static int op_run_num = 0;
 class OpWithoutKernelTest : public OperatorBase {
 public:
-  OpWithoutKernelTest(const std::string& type, const VarNameMap& inputs,
+  OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs,
-                      const VarNameMap& outputs, const AttributeMap& attrs)
+                      const VariableNameMap& outputs, const AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs), x(1) {}
  void InferShape(const Scope& scope) const override {}
  void Run(const Scope& scope,
@@ -249,8 +249,9 @@ TEST(OpKernel, multi_inputs) {
 class OperatorClone : public paddle::framework::OperatorBase {
 public:
  DEFINE_OP_CLONE_METHOD(OperatorClone);
-  OperatorClone(const std::string& type, const VarNameMap& inputs,
+  OperatorClone(const std::string& type,
-                const VarNameMap& outputs,
+                const paddle::framework::VariableNameMap& inputs,
+                const paddle::framework::VariableNameMap& outputs,
                const paddle::framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void InferShape(const paddle::framework::Scope& scope) const override {}

--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -105,7 +105,10 @@ class Tensor {
  template <typename T>
  inline Tensor Slice(const int& begin_idx, const int& end_idx) const;
-  platform::Place place() const { return holder_->place(); }
+  platform::Place place() const {
+    PADDLE_ENFORCE_NOT_NULL(holder_, "Tensor get place() must contains holder");
+    return holder_->place();
+  }
 private:
  template <typename T>

--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -117,6 +117,8 @@ inline void Tensor::CopyFrom(const Tensor& src,
    memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
                 boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
  }
+  PADDLE_ENFORCE(cudaStreamSynchronize(0),
+                 "cudaStreamSynchronize failed in Tensor CopyFrom");
 #endif
 }

--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -21,6 +21,8 @@ if(USE_NNPACK)
  endif()
 endif()
+list(APPEND cpp_files neon/NeonDepthwiseConv.cpp)
 add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
 add_dependencies(paddle_function ${external_project_dependencies})
 add_dependencies(paddle_function paddle_proto)
@@ -42,11 +44,11 @@ if(WITH_GPU)
    add_simple_unittest(RowConvOpTest)
    add_simple_unittest(BlockExpandOpTest)
    add_simple_unittest(CropOpTest)
-    add_simple_unittest(DepthwiseConvOpTest)
 endif()
 add_simple_unittest(Im2ColTest)
 add_simple_unittest(GemmConvOpTest)
+add_simple_unittest(DepthwiseConvOpTest)
 endif()
 add_style_check_target(paddle_function ${h_files})

--- a/paddle/function/DepthwiseConvOpTest.cpp
+++ b/paddle/function/DepthwiseConvOpTest.cpp
@@ -34,4 +34,13 @@ TEST(DepthwiseConv, BackwardFilter) {
 }
 #endif
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+TEST(DepthwiseConv, Forward) {
+  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
+      "GemmConv-CPU", "NeonDepthwiseConv-CPU", forward);
+}
+#endif
 }  // namespace paddle
--- a/paddle/function/GruFunctor.h
+++ b/paddle/function/GruFunctor.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "GemmFunctor.h"
+#include "GruFunctor.h"
+#include "hl_cpu_gru.cuh"
+namespace paddle {
+template <DeviceType Device, class T>
+struct GruFunctor {
+  template <class OpResetOutput, class OpFinalOutput>
+  static void compute(OpResetOutput opResetOutput,
+                      OpFinalOutput opFinalOutput,
+                      hl_gru_value value,
+                      int frameSize,
+                      int batchSize,
+                      hl_activation_mode_t active_node,
+                      hl_activation_mode_t active_gate) {
+#ifndef __NVCC__
+    if (value.prevOutValue) {
+      BlasGemm<Device, T>::compute(false,
+                                   false,
+                                   batchSize,
+                                   2 * frameSize,
+                                   frameSize,
+                                   1,
+                                   value.prevOutValue,
+                                   frameSize,
+                                   value.gateWeight,
+                                   frameSize * 2,
+                                   1,
+                                   value.gateValue,
+                                   frameSize * 3);
+    }
+    forward_reset_output(
+        opResetOutput, value, frameSize, batchSize, active_gate);
+    if (value.prevOutValue) {
+      BlasGemm<Device, T>::compute(false,
+                                   false,
+                                   batchSize,
+                                   frameSize,
+                                   frameSize,
+                                   1,
+                                   value.resetOutputValue,
+                                   frameSize,
+                                   value.stateWeight,
+                                   frameSize,
+                                   1,
+                                   value.gateValue + frameSize * 2,
+                                   frameSize * 3);
+    }
+    forward_final_output(
+        opFinalOutput, value, frameSize, batchSize, active_node);
+#endif
+  }
+};
+template <DeviceType Device, class T>
+struct GruGradFunctor {
+  template <class OpStateGrad, class OpResetGrad>
+  static void compute(OpStateGrad opStateGrad,
+                      OpResetGrad opResetGrad,
+                      hl_gru_value value,
+                      hl_gru_grad grad,
+                      int frameSize,
+                      int batchSize,
+                      hl_activation_mode_t active_node,
+                      hl_activation_mode_t active_gate) {
+#ifndef __NVCC__
+    backward_state_grad(
+        opStateGrad, value, grad, frameSize, batchSize, active_node);
+    if (value.prevOutValue && grad.prevOutGrad) {
+      BlasGemm<Device, T>::compute(false,
+                                   true,
+                                   batchSize,
+                                   frameSize,
+                                   frameSize,
+                                   1,
+                                   grad.gateGrad + frameSize * 2,
+                                   frameSize * 3,
+                                   value.stateWeight,
+                                   frameSize,
+                                   0,
+                                   grad.resetOutputGrad,
+                                   frameSize);
+      if (grad.stateWeightGrad) {
+        BlasGemm<Device, T>::compute(true,
+                                     false,
+                                     frameSize,
+                                     frameSize,
+                                     batchSize,
+                                     1,
+                                     value.resetOutputValue,
+                                     frameSize,
+                                     grad.gateGrad + frameSize * 2,
+                                     frameSize * 3,
+                                     1,
+                                     grad.stateWeightGrad,
+                                     frameSize);
+      }
+    }
+    backward_reset_grad(
+        opResetGrad, value, grad, frameSize, batchSize, active_gate);
+    if (grad.prevOutGrad && value.prevOutValue) {
+      BlasGemm<Device, T>::compute(false,
+                                   true,
+                                   batchSize,
+                                   frameSize,
+                                   frameSize * 2,
+                                   1,
+                                   grad.gateGrad,
+                                   frameSize * 3,
+                                   value.gateWeight,
+                                   frameSize * 2,
+                                   1,
+                                   grad.prevOutGrad,
+                                   frameSize);
+      if (grad.gateWeightGrad) {
+        BlasGemm<Device, T>::compute(true,
+                                     false,
+                                     frameSize,
+                                     frameSize * 2,
+                                     batchSize,
+                                     1,
+                                     value.prevOutValue,
+                                     frameSize,
+                                     grad.gateGrad,
+                                     frameSize * 3,
+                                     1,
+                                     grad.gateWeightGrad,
+                                     frameSize * 2);
+      }
+    }
+#endif
+  }
+};
+}  // namespace paddle
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "TensorShape.h"
 #include "TensorType.h"
+#include "neon/neon_util.h"
 namespace paddle {
@@ -93,4 +94,95 @@ public:
                  int paddingWidth);
 };
+template <class T>
+struct Padding {
+  static void run(const T* src,
+                  T* dest,
+                  int channels,
+                  int inputHeight,
+                  int inputWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    const int destWidth = inputWidth + 2 * paddingWidth;
+    for (int c = 0; c < channels; c++) {
+      if (paddingHeight > 0) {
+        memset(dest, 0, destWidth * paddingHeight * sizeof(T));
+        dest += destWidth * paddingHeight;
+      }
+      for (int i = 0; i < inputHeight; i++) {
+        // padding head
+        for (int j = 0; j < paddingWidth; j++) {
+          *dest++ = T(0);
+        }
+        memcpy(dest, src, inputWidth * sizeof(T));
+        dest += inputWidth;
+        src += inputWidth;
+        // padding tail
+        for (int j = 0; j < paddingWidth; j++) {
+          *dest++ = T(0);
+        }
+      }
+      if (paddingHeight > 0) {
+        memset(dest, 0, destWidth * paddingHeight * sizeof(T));
+        dest += destWidth * paddingHeight;
+      }
+    }
+  }
+};
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+template <>
+struct Padding<float> {
+  static void run(const float* src,
+                  float* dest,
+                  int channels,
+                  int inputHeight,
+                  int inputWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    const int destWidth = inputWidth + 2 * paddingWidth;
+    for (int c = 0; c < channels; c++) {
+      if (paddingHeight > 0) {
+        memset(dest, 0, destWidth * paddingHeight * sizeof(float));
+        dest += destWidth * paddingHeight;
+      }
+      for (int i = 0; i < inputHeight; i++) {
+        // padding head
+        for (int j = 0; j < paddingWidth; j++) {
+          *dest++ = float(0);
+        }
+        int step = inputWidth >> 2;
+        int remain = inputWidth & 3;
+        for (int s = 0; s < step; s++) {
+          float32x4_t s0 = vld1q_f32(src);
+          vst1q_f32(dest, s0);
+          src += 4;
+          dest += 4;
+        }
+        for (int r = 0; r < remain; r++) {
+          *dest++ = *src++;
+        }
+        // padding tail
+        for (int j = 0; j < paddingWidth; j++) {
+          *dest++ = float(0);
+        }
+      }
+      if (paddingHeight > 0) {
+        memset(dest, 0, destWidth * paddingHeight * sizeof(float));
+        dest += destWidth * paddingHeight;
+      }
+    }
+  }
+};
+#endif
 }  // namespace paddle
--- a/paddle/function/neon/NeonDepthwiseConv.cpp
+++ b/paddle/function/neon/NeonDepthwiseConv.cpp
--- a/paddle/function/neon/neon_util.h
+++ b/paddle/function/neon/neon_util.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+namespace paddle {
+namespace neon {
+inline float32x4_t vld1q_f32_aligned(const float* p) {
+  return vld1q_f32(
+      (const float*)__builtin_assume_aligned(p, sizeof(float32x4_t)));
+}
+#ifndef __aarch64__
+inline float32_t vaddvq_f32(float32x4_t a) {
+  float32x2_t v = vadd_f32(vget_high_f32(a), vget_low_f32(a));
+  return vget_lane_f32(vpadd_f32(v, v), 0);
+}
+template <int lane>
+inline float32x4_t vmlaq_laneq_f32(float32x4_t a,
+                                   float32x4_t b,
+                                   float32x4_t v) {
+  return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane));
+}
+#endif
+}  // namespace neon
+}  // namespace paddle
+#endif
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -1012,11 +1012,6 @@ void RecurrentGradientMachine::generateSequence() {
                           /* width */ resultNum,
                           false,
                           /* useGpu */ false);
-    Matrix::resizeOrCreate(generator_.outArg.value,
-                           /* height */ maxGenWordCount,
-                           /* width */ 1,
-                           false,
-                           /* useGpu */ false);
  }
  ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions,
                                numSequences + 1,
@@ -1026,7 +1021,7 @@ void RecurrentGradientMachine::generateSequence() {
  } else {
    oneWaySearch(numSequences);
  }
-  if (dataArgsSize_) createDataOutlink(batchMachineIdVec_);
+  if (dataArgsSize_) createDataOutlink();
  size_t size = generator_.ids.size();
  generator_.outArg.ids->resize(size);
@@ -1106,6 +1101,7 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) {
  }
  batchMachineIdVec_.clear();
+  batchMachineStartPos_.clear();
  int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
  starts[0] = 0;
  generator_.ids.clear();
@@ -1312,13 +1308,20 @@ void RecurrentGradientMachine::fillGenOutputs() {
    finalPaths_[i].resize(minFinalPathsSize);
  }
-  batchMachineIdVec_.clear();
  generator_.ids.clear();
  int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
  starts[0] = 0;
  if (numResults > 1) {
-    real* probs = generator_.outArg.in->getData();
+    int idsProbSaveSize = 0;
+    for (auto inSeq : finalPaths_) {
+      for (auto path : inSeq) idsProbSaveSize += path.ids.size();
+      idsProbSaveSize += inSeq.size();
+    }
+    Matrix::resizeOrCreate(
+        generator_.outArg.value, idsProbSaveSize, 1, false, false);
    real* idsProb = generator_.outArg.value->getData();
+    real* probs = generator_.outArg.in->getData();
    size_t curPos = 0;
    for (size_t i = 0; i < finalPaths_.size(); ++i) {
      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
@@ -1333,24 +1336,16 @@ void RecurrentGradientMachine::fillGenOutputs() {
        curPos += genLen;
        idsProb[curPos++] = -1.0;
        probs[i * numResults + j] = path.logProb;
-        if (!j && dataArgsSize_) {
-          // in beam search, here only reserved the top 1 generated result
-          // for out_links that are not the generated word indices.
-          batchMachineIdVec_.insert(batchMachineIdVec_.end(),
-                                    path.machineIdVec.begin(),
-                                    path.machineIdVec.end());
-        }
      }
      starts[i + 1] = generator_.ids.size();
    }
  } else {
    for (size_t i = 0; i < finalPaths_.size(); ++i) {
      CHECK(!finalPaths_[i].empty());
-      generator_.ids.insert(generator_.ids.begin(),
+      Path& path = finalPaths_[i][0];
-                            finalPaths_[i][0].ids.begin(),
+      generator_.ids.insert(
-                            finalPaths_[i][0].ids.end());
+          generator_.ids.end(), path.ids.begin(), path.ids.end());
-      starts[i + 1] = starts[i] + finalPaths_[i][0].ids.size();
+      starts[i + 1] = starts[i] + path.ids.size();
    }
  }
 }
@@ -1364,25 +1359,76 @@ void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) {
  }
 }
-void RecurrentGradientMachine::createDataOutlink(
+void RecurrentGradientMachine::createDataOutlinkSelRowsInfo(
-    std::vector<int>& machineIdVec) {
+    bool isSeq, std::vector<Argument>& outArgs) {
-  size_t seqNum =
+  batchMachineIdVec_.clear();
-      getBeamSize() > 1UL ? finalPaths_.size() : finalPaths_[0].size();
-  std::vector<int> starts(seqNum + 1, 0);
+  size_t seqIdx = 0;
-  for (size_t i = 0; i < seqNum; ++i) {
+  for (size_t i = 0; i < finalPaths_.size(); ++i) {
-    size_t seqLen = getBeamSize() > 1UL ? finalPaths_[i][0].ids.size()
+    for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
-                                        : finalPaths_[0][i].ids.size();
+      std::vector<int>& machineIdVec = finalPaths_[i][j].machineIdVec;
-    starts[i + 1] = starts[i] + seqLen;
+      if (isSeq) {
+        for (size_t i = 0; i < machineIdVec.size(); ++i) {
+          size_t rowId = machineIdVec[i];
+          int* seqPos =
+              outArgs[i].sequenceStartPositions->getMutableData(false);
+          batchMachineIdVec_.push_back(seqPos[rowId]);
+        }
+      } else {
+        batchMachineIdVec_.insert(
+            batchMachineIdVec_.end(), machineIdVec.begin(), machineIdVec.end());
+      }
+      seqIdx++;
+    }
+  }
+}
+void RecurrentGradientMachine::createDataOutlinkCopySizeInfo(
+    bool isSeq, std::vector<Argument>& outArgs, std::vector<int>& copySize) {
+  size_t totalSeqNum = std::accumulate(
+      finalPaths_.begin(),
+      finalPaths_.end(),
+      0UL,
+      [](size_t a, const std::vector<Path>& b) { return a + b.size(); });
+  copySize.resize(totalSeqNum, 1);
+  batchMachineStartPos_.resize(totalSeqNum + 1, 0);
+  if (isSeq) {
+    ICpuGpuVectorPtr inputSeqStartPos = outArgs[0].sequenceStartPositions;
+    CHECK_EQ(static_cast<size_t>(inputSeqStartPos->getSize() - 1),
+             getBeamSize() > 1 ? finalPaths_.size() : finalPaths_[0].size());
+    int* starts = inputSeqStartPos->getMutableData(false);
+    int seqId = 0;
+    for (size_t i = 0; i < finalPaths_.size(); ++i) {
+      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
+        copySize[seqId] = getBeamSize() > 1 ? starts[i + 1] - starts[i]
+                                            : starts[j + 1] - starts[j];
+        batchMachineStartPos_[seqId + 1] =
+            batchMachineStartPos_[seqId] + finalPaths_[i][j].ids.size();
+        seqId++;
+      }
+    }
+  } else {
+    for (size_t i = 0; i < finalPaths_[0].size(); ++i)
+      batchMachineStartPos_[i + 1] =
+          batchMachineStartPos_[i] + finalPaths_[0][i].ids.size();
  }
+}
+void RecurrentGradientMachine::createDataOutlink() {
  for (size_t i = 0; i < dataArgsSize_; i++) {
+    bool isSeq = dataArgsFrame_[i][0].hasSeq();
+    std::vector<int> copySize;
+    createDataOutlinkCopySizeInfo(isSeq, dataArgsFrame_[i], copySize);
+    createDataOutlinkSelRowsInfo(isSeq, dataArgsFrame_[i]);
    dataArgs_[i].concat(dataArgsFrame_[i],
-                        machineIdVec,
+                        batchMachineIdVec_,
-                        starts,
+                        batchMachineStartPos_,
+                        copySize,
                        useGpu_,
                        HPPL_STREAM_1,
                        PASS_TEST);
    auto dataAgent =
        dynamic_cast<DataLayer*>(outFrameLines_[i + 1].agentLayer.get());
    CHECK_NOTNULL(dataAgent);

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -190,7 +190,7 @@ public:
    std::vector<int> ids;
    /**
-     * @brief idsProb, log probability of each generated words.
+     * @brief idsProb, log probability of each generated word.
     */
    std::vector<real> idsProb;
@@ -472,15 +472,43 @@ private:
  void copyDataOutlinkFrame(size_t machineCur);
  /*
-   * @brief In generation, if the layer group has more than 1 outlink, outlinks
+   * @brief In generation, if the layer group has more than 1 outlink, outlink
-   * except the first one are data outlinks. This function creates the data
+   * except the first one is a data outlink. In RecurrentLayerGroup, each time
-   * outlinks.
+   * step is a separate Network, outputs of a layer inside the
-   * @note In beam search, only one generated sequence with the hightest log
+   * RecurrentLayerGroup are stored in separate Arguments. If one layer is
-   * probabilites are retained.
+   * specified as an outlink of RecurrentLayerGroup. This function will
-   * @param machineIdVec : select a row of output matrix in each frame
+   * collect outputs in each time step of each generated sequence which are
-   * that the generation process expanded.
+   * dispersed in separate Arguments to form a new single Argument as output of
+   * RecurrentLayerGroup.
   */
-  void createDataOutlink(std::vector<int>& machineIdVec);
+  void createDataOutlink();
+  /*
+   * @brief decide to select how many rows from the Matrix stored the forward
+   * pass results from a start position.
+   *
+   * @param isSeq: a flag indicating whetehr the layer to be output of the
+   * RecurrentGradientMachine is a sequence or not
+   * @param outArgs: all of the the returned Arguments of the forward pass
+   * during the generation process.
+   * @param copySize: the returned result, number of rows to select from the
+   * Matrix stored the forward pass results from a start position.
+   */
+  void createDataOutlinkCopySizeInfo(bool isSeq,
+                                     std::vector<Argument>& outArgs,
+                                     std::vector<int>& copySize);
+  /*
+   * @brief decide index of the start row for each time step of a generated
+   * sequence in Matrix stored the entire beam search batch's forward pass
+   * results.
+   *
+   * @param isSeq: a flag indicating whether the layer to be output of the
+   * RecurrentGradientMachine is a sequence or not
+   * @param outArgs: all of the returned Arguments of the forward pass
+   * during the generation process.
+   */
+  void createDataOutlinkSelRowsInfo(bool isSeq, std::vector<Argument>& outArgs);
  /*
   * @brief used in beam search, connect previous frame to form recurrent link
@@ -543,6 +571,7 @@ private:
  std::vector<int> topIds_;
  std::vector<int> seqIds_;
  std::vector<int> batchMachineIdVec_;
+  std::vector<int> batchMachineStartPos_;
  std::vector<std::vector<Path>> finalPaths_;
  std::vector<real> minFinalPathLogProb_;
  BeamSearchControlCallbacks* beamSearchCtrlCallbacks_;

--- a/paddle/gserver/layers/Conv3DLayer.cpp
+++ b/paddle/gserver/layers/Conv3DLayer.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "Conv3DLayer.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+REGISTER_LAYER(conv3d, Conv3DLayer);
+bool Conv3DLayer::init(const LayerMap &layerMap,
+                       const ParameterMap &parameterMap) {
+  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
+  int index = 0;
+  for (auto &inputConfig : config_.inputs()) {
+    const ConvConfig &conf = inputConfig.conv_conf();
+    M_.push_back(numFilters_ / conf.groups());
+    K_.push_back(filterPixels_[index] * filterChannels_[index]);
+    // create a new weight
+    size_t height, width;
+    width = filterPixels_[index] * filterChannels_[index];
+    height = numFilters_;
+    CHECK_EQ(parameters_[index]->getSize(), width * height);
+    Weight *w = new Weight(height, width, parameters_[index]);
+    weights_.emplace_back(w);
+    ++index;
+  }
+  if (biasParameter_.get()) {
+    if (sharedBiases_) {
+      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(1, numFilters_, biasParameter_));
+    } else {
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+    }
+  }
+  return true;
+}
+size_t Conv3DLayer::getSize() {
+  CHECK_NE(inputLayers_.size(), 0UL);
+  outputH_.clear();
+  outputW_.clear();
+  outputD_.clear();
+  N_.clear();
+  size_t layerSize = 0;
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    outputW_.push_back(outputSize(
+        imgSizeW_[i], filterSize_[i], padding_[i], stride_[i], true));
+    outputH_.push_back(outputSize(
+        imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
+    outputD_.push_back(outputSize(
+        imgSizeD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
+    N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
+    CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize);
+    layerSize += N_[i] * numFilters_;
+  }
+  getOutput().setFrameHeight(outputH_[0]);
+  getOutput().setFrameWidth(outputW_[0]);
+  getOutput().setFrameDepth(outputD_[0]);
+  return layerSize;
+}
+void Conv3DLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  int outWidth = getSize();
+  resetOutput(batchSize, outWidth);
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    REGISTER_TIMER_INFO("FwdConv3D", getName().c_str());
+    const MatrixPtr &inMat = getInputValue(i);
+    const MatrixPtr &outMat = getOutputValue();
+    int M = M_[i];
+    int N = N_[i];
+    int K = K_[i];
+    Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+    MatrixPtr wMat = weights_[i]->getW();
+    for (int n = 0; n < batchSize; ++n) {
+      colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
+                       channels_[i],
+                       imgSizeD_[i],
+                       imgSizeH_[i],
+                       imgSizeW_[i],
+                       filterSizeZ_[i],
+                       filterSizeY_[i],
+                       filterSize_[i],
+                       strideZ_[i],
+                       strideY_[i],
+                       stride_[i],
+                       paddingZ_[i],
+                       paddingY_[i],
+                       padding_[i]);
+      real *outData = outMat->getData() + n * outMat->getStride();
+      MatrixPtr outMatSub =
+          Matrix::create(outData, groups_[i] * M, N, false, useGpu_);
+      for (int g = 0; g < groups_[i]; g++) {
+        MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
+        MatrixPtr in = colBuf_->subMatrix(g * K, K);
+        MatrixPtr out = outMatSub->subMatrix(g * M, M);
+        out->mul(*wMatSub, *in, 1.0, 1.0);
+      }
+    }
+  }
+  if (nullptr != this->biasParameter_) {
+    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
+    this->addBias();
+  }
+  forwardActivation();
+}
+void Conv3DLayer::backward(const UpdateCallback &callback) {
+  backwardActivation();
+  if (biases_ && biases_->getWGrad()) {
+    bpropBiases();
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    REGISTER_TIMER_INFO("BwdConv3D", getName().c_str());
+    if (weights_[i]->getWGrad()) {
+      bpropWeights(i);
+    }
+    if (getInputGrad(i)) {
+      bpropData(i);
+    }
+    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+    weights_[i]->getParameterPtr()->incUpdate(callback);
+  }
+}
+void Conv3DLayer::bpropWeights(int i) {
+  int M = M_[i];
+  int N = N_[i];
+  int K = K_[i];
+  const MatrixPtr &inMat = getInputValue(i);
+  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+  MatrixPtr wGradMat = weights_[i]->getWGrad();
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  for (int n = 0; n < batchSize; ++n) {
+    colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
+                     channels_[i],
+                     imgSizeD_[i],
+                     imgSizeH_[i],
+                     imgSizeW_[i],
+                     filterSizeZ_[i],
+                     filterSizeY_[i],
+                     filterSize_[i],
+                     strideZ_[i],
+                     strideY_[i],
+                     stride_[i],
+                     paddingZ_[i],
+                     paddingY_[i],
+                     padding_[i]);
+    real *outGradData =
+        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
+    MatrixPtr outGradSub =
+        Matrix::create(outGradData, groups_[i] * M, N, false, useGpu_);
+    for (int g = 0; g < groups_[i]; ++g) {
+      MatrixPtr inMatSub = colBuf_->subMatrix(g * K, K);
+      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
+      MatrixPtr wGradSub = wGradMat->subMatrix(g * M, M);
+      wGradSub->mul(*outG, *(inMatSub->getTranspose()), 1.0, 1.0);
+    }
+  }
+}
+void Conv3DLayer::bpropData(int i) {
+  int M = M_[i];
+  int N = N_[i];
+  int K = K_[i];
+  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+  MatrixPtr wMat = weights_[i]->getW();
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  for (int n = 0; n < batchSize; ++n) {
+    real *outGradData =
+        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
+    real *preGradData =
+        getInputGrad(i)->getData() + n * getInputGrad(i)->getStride();
+    MatrixPtr outGradSub =
+        Matrix::create(outGradData, M * groups_[i], N, false, useGpu_);
+    for (int g = 0; g < groups_[i]; ++g) {
+      MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
+      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
+      MatrixPtr inGradMatSub = colBuf_->subMatrix(g * K, K);
+      inGradMatSub->mul(*(wMatSub->getTranspose()), *outG, 1.0, 0.0);
+    }
+    colBuf_->col2Vol(preGradData,
+                     channels_[i],
+                     imgSizeD_[i],
+                     imgSizeH_[i],
+                     imgSizeW_[i],
+                     filterSizeZ_[i],
+                     filterSizeY_[i],
+                     filterSize_[i],
+                     strideZ_[i],
+                     strideY_[i],
+                     stride_[i],
+                     paddingZ_[i],
+                     paddingY_[i],
+                     padding_[i],
+                     1.0,
+                     1.0);
+  }
+}
+void Conv3DLayer::bpropBiases() {
+  MatrixPtr outGradMat = getOutputGrad();
+  if (this->sharedBiases_) {
+    biases_->getWGrad()->collectSharedBias(*outGradMat, 1.0f);
+  } else {
+    biases_->getWGrad()->collectBias(*outGradMat, 1.0f);
+  }
+}
+void Conv3DLayer::addBias() {
+  MatrixPtr outMat = getOutputValue();
+  if (this->sharedBiases_) {
+    outMat->addSharedBias(*(biases_->getW()), 1.0f);
+  } else {
+    outMat->addBias(*(biases_->getW()), 1.0f);
+  }
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/Conv3DLayer.h
+++ b/paddle/gserver/layers/Conv3DLayer.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <vector>
+#include "ConvBaseLayer.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+namespace paddle {
+/**
+ * @brief A subclass of convolution layer.
+ * This layer expands input and use matrix multiplication to
+ * calculate convolution operation.
+ */
+class Conv3DLayer : public ConvBaseLayer {
+public:
+  explicit Conv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
+  ~Conv3DLayer() {}
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  void forward(PassType passType);
+  void addBias();
+  void backward(const UpdateCallback& callback);
+  void bpropBiases();
+  void bpropData(int i);
+  void bpropWeights(int i);
+  size_t getSize();
+protected:
+  // Figure out the dimensions for individual gemms.
+  IntV M_;  /// numFilters_ / filter_group_;
+  IntV N_;  /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_
+  IntV K_;  /// outputD_ * outputH_ * outputW_
+  MatrixPtr colBuf_;
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/ConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ConvBaseLayer.cpp
@@ -32,11 +32,12 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
    const ConvConfig& conf = inputConfig.conv_conf();
    padding_.push_back(conf.padding());
    stride_.push_back(conf.stride());
+    dilation_.push_back(conf.dilation());
    filterSize_.push_back(conf.filter_size());
    paddingY_.push_back(conf.padding_y());
    strideY_.push_back(conf.stride_y());
+    dilationY_.push_back(conf.dilation_y());
    filterSizeY_.push_back(conf.filter_size_y());
-    filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
    channels_.push_back(conf.channels());
    imgSizeH_.push_back(conf.has_img_size_y() ? conf.img_size_y()
                                              : conf.img_size());
@@ -45,31 +46,20 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
    filterChannels_.push_back(conf.filter_channels());
    outputH_.push_back(conf.has_output_y() ? conf.output_y() : conf.output_x());
    outputW_.push_back(conf.output_x());
+    paddingZ_.push_back(conf.padding_z());
+    strideZ_.push_back(conf.stride_z());
+    filterSizeZ_.push_back(conf.filter_size_z());
+    imgSizeD_.push_back(conf.img_size_z());
+    outputD_.push_back(conf.output_z());
+    filterPixels_.push_back(filterSize_.back() * filterSizeY_.back() *
+                            filterSizeZ_.back());
  }
  CHECK(inputLayers_.size() == parameters_.size());
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    size_t height, width;
-    height = filterPixels_[i] * filterChannels_[i];
-    width = (!isDeconv_) ? numFilters_ : channels_[i];
-    // create a new weight
-    CHECK_EQ(parameters_[i]->getSize(), width * height);
-    Weight* w = new Weight(height, width, parameters_[i]);
-    weights_.emplace_back(w);
-  }
-  /* initialize the biases_ */
+  // create new weights_ in derived class
-  if (biasParameter_.get()) {
+  // create new biases_ in derived class
-    if (sharedBiases_) {
-      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
-    } else {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
-    }
-  }
  // default caffe model
  caffeMode_ = true;
@@ -89,7 +79,11 @@ size_t ConvBaseLayer::calOutputSize() {
  size_t layerSize = 0;
  auto setLayerSize = [&](IntV& inH, IntV& inW, IntV& outH, IntV& outW) {
+    size_t filterSizeY;
+    size_t filterSize;
    for (size_t i = 0; i < inputLayers_.size(); i++) {
+      filterSizeY = (filterSizeY_[i] - 1) * dilationY_[i] + 1;
+      filterSize = (filterSize_[i] - 1) * dilation_[i] + 1;
      inH.push_back(inputLayers_[i]->getOutput().getFrameHeight());
      inW.push_back(inputLayers_[i]->getOutput().getFrameWidth());
      const ConvConfig& conf = config_.inputs(i).conv_conf();
@@ -98,17 +92,17 @@ size_t ConvBaseLayer::calOutputSize() {
          inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x();
        if (inW[i] == 0) inW[i] = conf.output_x();
        outH.push_back(imageSize(
-            inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_));
+            inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
-        outW.push_back(imageSize(
+        outW.push_back(
-            inW[i], filterSize_[i], padding_[i], stride_[i], caffeMode_));
+            imageSize(inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
      } else {
        if (inH[i] == 0)
          inH[i] = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
        if (inW[i] == 0) inW[i] = conf.img_size();
        outH.push_back(outputSize(
-            inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_));
+            inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
        outW.push_back(outputSize(
-            inW[i], filterSize_[i], padding_[i], stride_[i], caffeMode_));
+            inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
      }
      CHECK_EQ(outH[i], outH[0]);
      CHECK_EQ(outW[i], outW[0]);

--- a/paddle/gserver/layers/ConvBaseLayer.h
+++ b/paddle/gserver/layers/ConvBaseLayer.h
@@ -40,6 +40,10 @@ protected:
  IntV stride_;
  /// The y dimension of the stride.
  IntV strideY_;
+  /// The x dimension of the dilation.
+  IntV dilation_;
+  /// The y dimension of the dilation.
+  IntV dilationY_;
  /// The x dimension of a filter kernel.
  IntV filterSize_;
  /// The y dimension of a filter kernel.
@@ -58,6 +62,13 @@ protected:
  IntV outputH_;
  /// The spatial dimensions of output feature map width.
  IntV outputW_;
+  IntV outputD_;
+  IntV imgSizeD_;
+  IntV filterSizeZ_;
+  IntV strideZ_;
+  IntV paddingZ_;
  /// Group size, refer to grouped convolution in
  /// Alex Krizhevsky's paper: when group=2, the first half of the
  /// filters are only connected to the first half of the input channels,

--- a/paddle/gserver/layers/ConvBaseOperator.cpp
+++ b/paddle/gserver/layers/ConvBaseOperator.cpp
@@ -59,7 +59,8 @@ void ConvBaseOperator::allocConvWorkSpace() {
                    &bwdDataAlgo_,
                    &bwdDataLimitBytes_,
                    &bwdFilterAlgo_,
-                    &bwdFilterLimitBytes_);
+                    &bwdFilterLimitBytes_,
+                    /*useDilation*/ false);
  size_t maxWorkSpace = 0;
  maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);

--- a/paddle/gserver/layers/ConvBaseProjection.cpp
+++ b/paddle/gserver/layers/ConvBaseProjection.cpp
@@ -41,6 +41,11 @@ void ConvBaseProjection::getConvParams() {
  strideH_ = conf.stride_y();
  strideW_ = conf.stride();
+  dilationH_ = conf.dilation_y();
+  dilationW_ = conf.dilation();
+  CHECK_GT(dilationH_, 0);
+  CHECK_GT(dilationW_, 0);
  filterH_ = conf.filter_size_y();
  filterW_ = conf.filter_size();
@@ -77,7 +82,9 @@ void ConvBaseProjection::initCudnn() {
                                   paddingH_,
                                   paddingW_,
                                   strideH_,
-                                   strideW_);
+                                   strideW_,
+                                   dilationH_,
+                                   dilationW_);
  // initialize all to default algorithms
  fwdAlgo_ = 0;
@@ -131,7 +138,9 @@ void ConvBaseProjection::reshapeTensorDesc(int batchSize) {
                                  paddingH_,
                                  paddingW_,
                                  strideH_,
-                                  strideW_);
+                                  strideW_,
+                                  dilationH_,
+                                  dilationW_);
 }
 void ConvBaseProjection::reshape(int batchSize) {
@@ -140,6 +149,10 @@ void ConvBaseProjection::reshape(int batchSize) {
  CHECK_EQ(calInputSize(), in_->value->getWidth());
  reshapeTensorDesc(batchSize);
+  bool useDilation = false;
+  if (dilationH_ > 1 || dilationW_ > 1) {
+    useDilation = true;
+  }
  hl_conv_workspace(imageDesc_,
                    outputDesc_,
                    filterDesc_,
@@ -149,7 +162,8 @@ void ConvBaseProjection::reshape(int batchSize) {
                    &bwdDataAlgo_,
                    &bwdDataLimitBytes_,
                    &bwdFilterAlgo_,
-                    &bwdFilterLimitBytes_);
+                    &bwdFilterLimitBytes_,
+                    useDilation);
  size_t maxWorkSpace = 0;
  maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);

--- a/paddle/gserver/layers/ConvBaseProjection.h
+++ b/paddle/gserver/layers/ConvBaseProjection.h
@@ -63,6 +63,7 @@ protected:
  int configChannels_, configNumFilters_;
  int paddingH_, paddingW_;
  int strideH_, strideW_;
+  int dilationH_, dilationW_;
  int filterH_, filterW_;
  /// One group offset of input data.
  int inputOffset_;

--- a/paddle/gserver/layers/ConvProjection.cpp
+++ b/paddle/gserver/layers/ConvProjection.cpp
@@ -25,12 +25,12 @@ size_t ConvProjection::calOutputSize() {
  if (imageH_ == 0) imageH_ = configImgH_;
  if (imageW_ == 0) imageW_ = configImgW_;
  outputH_ = outputSize(imageH_,
-                        filterH_,
+                        (filterH_ - 1) * dilationH_ + 1,
                        paddingH_,
                        strideH_,
                        /* caffeMode */ true);
  outputW_ = outputSize(imageW_,
-                        filterW_,
+                        (filterW_ - 1) * dilationW_ + 1,
                        paddingW_,
                        strideW_,
                        /* caffeMode */ true);

--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -572,13 +572,8 @@ void MultiBinaryLabelCrossEntropy::backwardImp(Matrix& output,
  }
 }
-//
+bool HuberCost::init(const LayerMap& layerMap,
-// Huber loss for robust 2-classes classification
+                     const ParameterMap& parameterMap) {
-//
-REGISTER_LAYER(huber, HuberTwoClass);
-bool HuberTwoClass::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
  CostLayer::init(layerMap, parameterMap);
  if (useGpu_) {
    tmpCpuInput_.reserve(inputLayers_.size());
@@ -589,7 +584,7 @@ bool HuberTwoClass::init(const LayerMap& layerMap,
  return true;
 }
-void HuberTwoClass::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
+void HuberCost::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
  if (useGpu_) {
    for (size_t i = 0; i < inputLayers_.size(); i++) {
      tmpCpuInput_[i].resizeAndCopyFrom(
@@ -597,13 +592,87 @@ void HuberTwoClass::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
    }
    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
  }
-  forwardImpIn(output, label, cost);
 }
-void HuberTwoClass::forwardImpIn(Matrix& output,
+//
-                                 Argument& label,
+// Huber loss for robust regression.
-                                 Matrix& target) {
+//
+REGISTER_LAYER(huber_regression, HuberRegressionLoss);
+bool HuberRegressionLoss::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  HuberCost::init(layerMap, parameterMap);
+  delta_ = config_.delta();
+  return true;
+}
+void HuberRegressionLoss::forwardImp(Matrix& output,
+                                     Argument& label,
+                                     Matrix& target) {
+  HuberCost::forwardImp(output, label, target);
+  size_t numSamples = target.getHeight();
+  size_t dim = output.getWidth();
+  CHECK(label.value);
+  CHECK_EQ((*label.value).getHeight(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(dim, (*label.value).getWidth());
+  CHECK_EQ(target.getWidth(), (size_t)1);
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  real* lbl =
+      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
+  std::vector<real> cost(numSamples, 0);
+  for (size_t i = 0; i < numSamples; ++i) {
+    for (size_t j = 0; j < dim; ++j) {
+      int index = i * dim + j;
+      real a = std::abs(lbl[index] - out[index]);
+      if (a <= delta_)
+        cost[i] += a * a / 2;
+      else
+        cost[i] += delta_ * (a - delta_ / 2);
+    }
+  }
+  target.copyFrom(cost.data(), numSamples);
+}
+void HuberRegressionLoss::backwardImp(Matrix& output,
+                                      Argument& label,
+                                      Matrix& outputG) {
+  size_t numSamples = output.getHeight();
+  size_t dim = output.getWidth();
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  real* lbl =
+      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
+  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
+  for (size_t i = 0; i < numSamples; ++i) {
+    for (size_t j = 0; j < dim; ++j) {
+      int index = i * dim + j;
+      real a = lbl[index] - out[index];
+      if (std::abs(a) <= delta_)
+        grad[index] += -a;
+      else
+        grad[index] += a > 0 ? -delta_ : delta_;
+    }
+  }
+  if (useGpu_) outputG.copyFrom(grad, numSamples * dim);
+}
+//
+// Huber loss for robust 2-classes classification
+//
+REGISTER_LAYER(huber_classification, HuberTwoClassification);
+bool HuberTwoClassification::init(const LayerMap& layerMap,
+                                  const ParameterMap& parameterMap) {
+  return HuberCost::init(layerMap, parameterMap);
+}
+void HuberTwoClassification::forwardImp(Matrix& output,
+                                        Argument& label,
+                                        Matrix& target) {
+  HuberCost::forwardImp(output, label, target);
  size_t numSamples = target.getHeight();
+  CHECK(label.ids);
  CHECK_EQ((*label.ids).getSize(), numSamples);
  CHECK_EQ(output.getHeight(), numSamples);
  CHECK_EQ(output.getWidth(), (size_t)1);
@@ -611,47 +680,35 @@ void HuberTwoClass::forwardImpIn(Matrix& output,
  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
  int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
-  std::vector<real> cost(numSamples);
+  std::vector<real> cost(numSamples, 0);
  for (size_t i = 0; i < numSamples; ++i) {
    int y = 2 * lbl[i] - 1;
-    if (out[i] * y < -1)
+    real a = out[i] * y;
-      cost[i] = -4 * out[i] * y;
+    if (a < -1)
-    else if (out[i] * y < 1)
+      cost[i] = -4 * a;
-      cost[i] = (1 - out[i] * y) * (1 - out[i] * y);
+    else if (a < 1)
-    else
+      cost[i] = (1 - a) * (1 - a);
-      cost[i] = 0;
  }
  target.copyFrom(cost.data(), numSamples);
 }
-void HuberTwoClass::backwardImp(Matrix& outputValue,
+void HuberTwoClassification::backwardImp(Matrix& output,
-                                Argument& label,
+                                         Argument& label,
-                                Matrix& outputGrad) {
+                                         Matrix& outputG) {
-  if (useGpu_) {
-    backwardImpIn(
-        *tmpCpuInput_[0].value, tmpCpuInput_[1], *tmpCpuInput_[0].grad);
-    outputGrad.copyFrom(*tmpCpuInput_[0].grad);
-  } else {
-    backwardImpIn(outputValue, label, outputGrad);
-  }
-}
-void HuberTwoClass::backwardImpIn(Matrix& output,
-                                  Argument& label,
-                                  Matrix& outputG) {
  size_t numSamples = output.getHeight();
-  real* out = output.getData();
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
-  real* grad = outputG.getData();
+  int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
-  int* lbl = (*label.ids).getData();
+  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
  for (size_t i = 0; i < numSamples; ++i) {
    int y = 2 * lbl[i] - 1;
-    if (y * out[i] < -1)
+    real a = out[i] * y;
+    if (a < -1)
      grad[i] += -4 * y;
-    else if (y * out[i] < 1)
+    else if (a < 1)
-      grad[i] += -2 * (1 - y * out[i]) * y;
+      grad[i] += -2 * (1 - a) * y;
  }
+  if (useGpu_) outputG.copyFrom(grad, numSamples);
 }
 /**
 * This cost layer compute the sum of its input as loss.
 * \f[

--- a/paddle/gserver/layers/CostLayer.h
+++ b/paddle/gserver/layers/CostLayer.h
@@ -304,37 +304,70 @@ public:
                   Matrix& outputGrad) override;
 };
-/**
+/*
- * Huber loss for robust 2-classes classification.
+ * A base layer for HuberRegressionLoss and HuberTwoClassification.
- *
- * For label={0, 1}, let y=2*label-1. Given output f, the loss is:
- * \f[
- * Loss =
- * \left\{\begin{matrix}
- *  4 * y * f     &   \textit{if}  \ \  y* f < -1 \\
- *  (1 - y * f)^2 &  \textit{if}   \ \  -1 < y * f < 1  \\
- *  0             &                    \textit{otherwise}
- * \end{matrix}\right.
- * \f]
 */
-class HuberTwoClass : public CostLayer {
+class HuberCost : public CostLayer {
+public:
  std::vector<Argument> tmpCpuInput_;
-public:
+  explicit HuberCost(const LayerConfig& config) : CostLayer(config) {}
-  explicit HuberTwoClass(const LayerConfig& config) : CostLayer(config) {}
  bool init(const LayerMap& layerMap,
            const ParameterMap& parameterMap) override;
  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-  void forwardImpIn(Matrix& output, Argument& label, Matrix& cost);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override {}
+};
+/**
+ * Huber loss for robust regression.
+ *
+ * Given output f(x), label y and delta, the loss is:
+ * Loss = 0.5 * (1 - y * f)^2, if abs(y - f) <= delta \\
+ * Loss = delta * abs(y - f) - 0.5 * delta^2, otherwise
+ */
+class HuberRegressionLoss : public HuberCost {
+public:
+  explicit HuberRegressionLoss(const LayerConfig& config) : HuberCost(config) {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
  void backwardImp(Matrix& outputValue,
                   Argument& label,
                   Matrix& outputGrad) override;
-  void backwardImpIn(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+protected:
+  real delta_;
+};
+/**
+ * Huber loss for robust 2-classes classification.
+ *
+ * For label={0, 1}, let y=2*label-1. Given output f(x), the loss is:
+ * Loss = 4 * y * f, if y* f < -1 \\
+ * Loss = (1 - y * f)^2, if -1 < y * f < 1  \\
+ * Loss = 0, otherwise
+ */
+class HuberTwoClassification : public HuberCost {
+public:
+  explicit HuberTwoClassification(const LayerConfig& config)
+      : HuberCost(config) {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 };
 typedef std::shared_ptr<CostLayer> CostLayerPtr;

--- a/paddle/gserver/layers/CrossEntropyOverBeam.cpp
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.cpp
--- a/paddle/gserver/layers/CrossEntropyOverBeam.h
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "CrossEntropyOverBeam.h"
+#include "Layer.h"
+namespace paddle {
+/* This struct stores the beams in all search steps for a single sequence. */
+struct BeamExpansion {
+  std::vector<MatrixPtr> scores;
+  std::vector<IVectorPtr> seqInfo;
+  std::vector<MatrixPtr> candidateIds;
+  std::vector<int> gold;
+  std::vector<MatrixPtr> scoreGrad;
+  size_t expansionCount;
+  explicit BeamExpansion(int n) {
+    expansionCount = n;
+    scores.resize(expansionCount);
+    seqInfo.resize(expansionCount);
+    candidateIds.resize(expansionCount);
+    scoreGrad.resize(expansionCount);
+    gold.resize(expansionCount);
+  }
+};
+typedef std::shared_ptr<BeamExpansion> BeamExpansionPtr;
+class CostForOneSequence {
+public:
+  CostForOneSequence()
+      : beamSize_(0), validExpansionCount_(0), goldAsExtraPath_(false) {}
+  void setData(const BeamExpansionPtr bPtr, size_t beamSize) {
+    beams_ = bPtr;
+    beamSize_ = beamSize;
+    expandedPathScores_.clear();
+    expandedPathScores_.resize(beams_->expansionCount);
+    goldRowIds_.clear();
+    goldRowIds_.resize(beams_->expansionCount, 0);
+    goldColIds_.clear();
+    goldColIds_.resize(beams_->expansionCount, -1);
+  }
+  size_t getValidExpansionCount() { return validExpansionCount_; }
+  real forward();
+  void backward();
+private:
+  void calValidExpandStep();
+  void constructTotalExpansion();
+  size_t initLastExpansion();
+  real globallyNormalizedScore();
+  int getSeqStartPos(size_t beamId, size_t rowId) {
+    CHECK_GT(beams_->seqInfo[beamId]->getSize() - 1, rowId);
+    int* starts = beams_->seqInfo[beamId]->getData();
+    return starts[rowId] - starts[0];
+  }
+  size_t beamSize_;
+  size_t validExpansionCount_;
+  bool goldAsExtraPath_;
+  std::vector<int> goldRowIds_;
+  std::vector<int> goldColIds_;
+  BeamExpansionPtr beams_;
+  std::vector<std::vector<int>> pathRowIdsInEachBeam_;
+  std::vector<int> parentIdsInBeam_;
+  size_t goldIdsInFinalExpansion_;
+  std::vector<MatrixPtr> expandedPathScores_;
+  MatrixPtr softmaxOut_;
+};
+class CrossEntropyOverBeam : public Layer {
+public:
+  explicit CrossEntropyOverBeam(const LayerConfig& config) : Layer(config) {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+private:
+  void checkInputs();
+  void copyInputsToCpu();
+  void resizeOutput();
+  void copyGradToGpu(size_t copyCount);
+  void splitBatchBeams();
+  size_t beamExpanCount_;
+  size_t batchSize_;
+  size_t beamSize_;
+  /*
+   * the process of constructing beams is not friendly to GPU, currently, this
+   * layer only runs on CPU, if any of its inputs is on GPU memory, then copy
+   * it to CPU memory.
+   */
+  std::vector<MatrixPtr> candidateScores_;
+  std::vector<MatrixPtr> candidateScoreGrad_;
+  std::vector<MatrixPtr> candidateInBeam_;
+  std::vector<MatrixPtr> gradToInputs_;
+  std::vector<IVectorPtr> goldSequence_;
+  std::vector<std::vector<int>> beamSplitPos_;
+  /*
+   * split entire bath of beams into beam per sequnence and store the result
+   * into this member.
+   */
+  std::vector<BeamExpansion> beamPerSeq_;
+  /* beamCosts_ is used to propagate error in one sequence. */
+  std::vector<CostForOneSequence> beamCosts_;
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/CudnnConvBaseLayer.cpp
+++ b/paddle/gserver/layers/CudnnConvBaseLayer.cpp
@@ -46,8 +46,26 @@ bool CudnnConvBaseLayer::init(const LayerMap &layerMap,
    projConf_.emplace_back(conf);
    projections_.emplace_back(
        Projection::create(*projConf_[i], parameters_[i], useGpu_));
+    // create a new weight
+    size_t height, width;
+    height = filterPixels_[i] * filterChannels_[i];
+    width = (!isDeconv_) ? numFilters_ : channels_[i];
+    CHECK_EQ(parameters_[i]->getSize(), width * height);
+    Weight *w = new Weight(height, width, parameters_[i]);
+    weights_.emplace_back(w);
  }
+  if (biasParameter_.get()) {
+    if (sharedBiases_) {
+      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
+    } else {
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
+    }
+  }
  if (biases_.get() && sharedBiases_) {
    hl_create_tensor_descriptor(&biasDesc_);
    hl_create_tensor_descriptor(&outputDesc_);

--- a/paddle/gserver/layers/DeConv3DLayer.cpp
+++ b/paddle/gserver/layers/DeConv3DLayer.cpp
--- a/paddle/gserver/layers/DeConv3DLayer.h
+++ b/paddle/gserver/layers/DeConv3DLayer.h
--- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
@@ -22,12 +22,31 @@ bool ExpandConvBaseLayer::init(const LayerMap &layerMap,
  /* Initialize the basic convolutional parent class */
  ConvBaseLayer::init(layerMap, parameterMap);
+  int index = 0;
  for (auto &inputConfig : config_.inputs()) {
    const ConvConfig &conf = inputConfig.conv_conf();
    /* Consistent caffe mode for multiple input */
    caffeMode_ = conf.caffe_mode();
-  }
+    // create a new weight
+    size_t height, width;
+    height = filterPixels_[index] * filterChannels_[index];
+    width = (!isDeconv_) ? numFilters_ : channels_[index];
+    CHECK_EQ(parameters_[index]->getSize(), width * height);
+    Weight *w = new Weight(height, width, parameters_[index]);
+    weights_.emplace_back(w);
+    index++;
+  }
+  if (biasParameter_.get()) {
+    if (sharedBiases_) {
+      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
+    } else {
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
+    }
+  }
  getOutputSize();
  return true;

--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
--- a/paddle/gserver/layers/GruCompute.cpp
+++ b/paddle/gserver/layers/GruCompute.cpp
--- a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
+++ b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
--- a/paddle/gserver/layers/Pool3DLayer.cpp
+++ b/paddle/gserver/layers/Pool3DLayer.cpp
--- a/paddle/gserver/layers/Pool3DLayer.h
+++ b/paddle/gserver/layers/Pool3DLayer.h
--- a/paddle/gserver/layers/SequenceSliceLayer.cpp
+++ b/paddle/gserver/layers/SequenceSliceLayer.cpp
--- a/paddle/gserver/layers/SubNestedSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
--- a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
+++ b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
--- a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
+++ b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
--- a/paddle/operators/gather.h
+++ b/paddle/operators/gather.h
--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
--- a/paddle/operators/gather_op.cu
+++ b/paddle/operators/gather_op.cu
--- a/paddle/operators/gather_op.h
+++ b/paddle/operators/gather_op.h
--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
--- a/paddle/operators/lookup_table_op.h
+++ b/paddle/operators/lookup_table_op.h
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
--- a/paddle/operators/minus_op.cu
+++ b/paddle/operators/minus_op.cu
--- a/paddle/operators/minus_op.h
+++ b/paddle/operators/minus_op.h
--- a/paddle/operators/net_op.cc
+++ b/paddle/operators/net_op.cc
--- a/paddle/operators/net_op.h
+++ b/paddle/operators/net_op.h
--- a/paddle/operators/net_op_test.cc
+++ b/paddle/operators/net_op_test.cc
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
--- a/paddle/operators/recurrent_op.h
+++ b/paddle/operators/recurrent_op.h
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
--- a/paddle/operators/scale_op.cu
+++ b/paddle/operators/scale_op.cu
--- a/paddle/operators/scale_op.h
+++ b/paddle/operators/scale_op.h
--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
--- a/paddle/operators/scatter_op.cu
+++ b/paddle/operators/scatter_op.cu
--- a/paddle/operators/scatter_op.h
+++ b/paddle/operators/scatter_op.h
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
--- a/paddle/platform/cuda_helper.h
+++ b/paddle/platform/cuda_helper.h
--- a/paddle/pserver/LightNetwork.cpp
+++ b/paddle/pserver/LightNetwork.cpp
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
--- a/paddle/framework/pybind.cc
+++ b/paddle/framework/pybind.cc
--- a/paddle/framework/tensor_py.h
+++ b/paddle/framework/tensor_py.h
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
--- a/paddle/scripts/docker/build_android.sh
+++ b/paddle/scripts/docker/build_android.sh
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
--- a/python/paddle/trainer/recurrent_units.py
+++ b/python/paddle/trainer/recurrent_units.py
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
--- a/python/paddle/trainer_config_helpers/tests/configs/img_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/img_layers.py
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_conv3d_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_conv3d_layer.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_deconv3d_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_deconv3d_layer.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pooling3D_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pooling3D_layer.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_slice_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_slice_layer.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py
--- a/python/paddle/trainer_config_helpers/tests/layers_test.py
+++ b/python/paddle/trainer_config_helpers/tests/layers_test.py
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
--- a/python/paddle/v2/framework/tests/mnist.py
+++ b/python/paddle/v2/framework/tests/mnist.py
--- a/python/paddle/v2/framework/tests/test_gather_op.py
+++ b/python/paddle/v2/framework/tests/test_gather_op.py
--- a/python/paddle/v2/framework/tests/test_lookup_table.py
+++ b/python/paddle/v2/framework/tests/test_lookup_table.py
--- a/python/paddle/v2/framework/tests/test_minus_op.py
+++ b/python/paddle/v2/framework/tests/test_minus_op.py
--- a/python/paddle/v2/framework/tests/test_scale_and_identity_op.py
+++ b/python/paddle/v2/framework/tests/test_scale_and_identity_op.py
--- a/python/paddle/v2/framework/tests/test_scatter_op.py
+++ b/python/paddle/v2/framework/tests/test_scatter_op.py
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
--- a/python/paddle/v2/tests/test_layer.py
+++ b/python/paddle/v2/tests/test_layer.py