Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into pad_op

Conflicts: paddle/pybind/pybind.cc

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into pad_op
Conflicts: paddle/pybind/pybind.cc
2db7dede · wanghaoshuang · 26cec839 · 9b6e063b · 2db7dede · 2db7dede
59 changed file
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,13 +10,11 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
 ARG WITH_GPU
 ARG WITH_AVX
 ARG WITH_DOC
-ARG WITH_STYLE_CHECK
 ENV WOBOQ OFF
-ENV WITH_GPU=${WITH_GPU:-OFF}
+ENV WITH_GPU=${WITH_GPU:-ON}
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV WITH_DOC=${WITH_DOC:-OFF}
-ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 ENV HOME /root
 # Add bash enhancements

--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -51,7 +51,7 @@ ExternalProject_Add(
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLDNN_DEPENDS}
    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "v0.9"
+    GIT_TAG             "v0.10"
    PREFIX              ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}

--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)
 SET(MKLML_PROJECT       "extern_mklml")
 SET(MKLML_VER           "mklml_lnx_2018.0.20170720")
-SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
+SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")

--- a/doc/about/index_cn.md
+++ b/doc/about/index_cn.md
-关于PaddlePaddle
-================
-PaddlePaddle是一个最早由百度科学家和工程师共同研发的并行分布式深度学习平台，兼备易用性、高效性、灵活性和可扩展性，目前已被百度内部多个产品线广泛使用。
-PaddlePaddle目前已经开放源码, 但是远未完善，我们希望能在这个基础上不断的改进、扩展和延伸。
-同时我们希望广大开发者积极提供反馈和贡献源代码，建立一个活跃的开源社区。
-致谢
--------
-在此，特别感谢PaddlePaddle的[所有贡献者](https://github.com/PaddlePaddle/Paddle/graphs/contributors)。
--- a/doc/about/index_en.rst
+++ b/doc/about/index_en.rst
-ABOUT
-=======
-PaddlPaddle is an easy-to-use, efficient, flexible and scalable deep learning platform,
-which is originally developed by Baidu scientists and engineers for the purpose of applying deep learning to many products at Baidu.
-PaddlePaddle is now open source but far from complete, which is intended to be built upon, improved, scaled, and extended.
-We hope to build an active open source community both by providing feedback and by actively contributing to the source code.
-Credits
--------
-We owe many thanks to `all contributors and developers <https://github.com/PaddlePaddle/Paddle/graphs/contributors>`_ of PaddlePaddle!
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -419,9 +419,14 @@ multi_binary_label_cross_entropy_cost
 ..  autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
    :noindex:
-huber_cost
+huber_regression_cost
----------
+-------------------------
-..  autoclass:: paddle.v2.layer.huber_cost
+..  autoclass:: paddle.v2.layer.huber_regression_cost
+    :noindex:
+huber_classification_cost
+-------------------------
+..  autoclass:: paddle.v2.layer.huber_classification_cost
    :noindex:
 lambda_cost

--- a/doc/getstarted/build_and_install/index_cn.rst
+++ b/doc/getstarted/build_and_install/index_cn.rst
@@ -6,14 +6,12 @@
 安装流程
 ++++++++
-PaddlePaddle提供数个预编译的二进制来进行安装，包括Docker镜像，ubuntu的deb安装包等。我们推荐使用Docker镜像来部署环境，同时欢迎贡献更多的安装包。
+PaddlePaddle提供Docker镜像来部署环境。
 .. toctree::
   :maxdepth: 1
   docker_install_cn.rst 
-   ubuntu_install_cn.rst
 编译流程

--- a/doc/getstarted/build_and_install/index_en.rst
+++ b/doc/getstarted/build_and_install/index_en.rst
@@ -8,14 +8,13 @@ Install PaddlePaddle
    :maxdepth: 1
    docker_install_en.rst
-    ubuntu_install_en.rst
 Build from Source
 -----------------
 ..  warning::
-    Please use :code:`deb` package or :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code.
+    Please use :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code.
 ..  toctree::
    :maxdepth: 1

--- a/doc/getstarted/build_and_install/ubuntu_install_cn.rst
+++ b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
-Ubuntu部署PaddlePaddle
-===================================
-PaddlePaddle提供了ubuntu 14.04 deb安装包。
-安装
------
-安装包的下载地址是\: https://github.com/PaddlePaddle/Paddle/releases
-它包含四个版本\:
-* cpu版本: 支持主流x86处理器平台, 使用了avx指令集。
-* cpu-noavx版本：支持主流x86处理器平台，没有使用avx指令集。
-* gpu版本：支持主流x86处理器平台，支持nvidia cuda平台，使用了avx指令集。
-* gpu-noavx版本：支持主流x86处理器平台，支持nvidia cuda平台，没有使用avx指令集。
-下载完相关安装包后，执行:
-..  code-block:: shell
-    sudo apt-get install gdebi
-    gdebi paddle-*-cpu.deb
-或者:
-..  code-block:: shell
-    dpkg -i paddle-*-cpu.deb
-    apt-get install -f
-在 :code:`dpkg -i` 的时候如果报一些依赖未找到的错误是正常的，
-在 :code:`apt-get install -f` 里会继续安装 PaddlePaddle。
-安装完成后，可以使用命令 :code:`paddle version` 查看安装后的paddle 版本:
-..  code-block:: shell
-    PaddlePaddle 0.8.0b1, compiled with
-        with_avx: ON
-        with_gpu: OFF
-        with_double: OFF
-        with_python: ON
-        with_rdma: OFF
-        with_timer: OFF
-        with_predict_sdk:
-可能遇到的问题
--------------
-libcudart.so/libcudnn.so找不到
-++++++++++++++++++++++++++++++
-安装完成后，运行 :code:`paddle train` 报错\:
-..  code-block:: shell
-      0831 12:36:04.151525  1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH.
-原因是未设置cuda运行时环境变量。 如果使用GPU版本的PaddlePaddle，请安装CUDA 7.5 和CUDNN 5到本地环境中，并设置：
-..  code-block:: shell
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib:$LD_LIBRARY_PATH
-    export PATH=/usr/local/cuda/bin:$PATH
--- a/doc/getstarted/build_and_install/ubuntu_install_en.rst
+++ b/doc/getstarted/build_and_install/ubuntu_install_en.rst
-Debian Package installation guide
-=================================
-PaddlePaddle supports :code:`deb` pacakge. The installation of this :code:`deb` package is tested in ubuntu 14.04, but it should be support other debian based linux, too.
-There are four versions of debian package, :code:`cpu`, :code:`gpu`, :code:`cpu-noavx`, :code:`gpu-noavx`. And :code:`noavx` version is used to support CPU which does not contain :code:`AVX` instructions. The download url of :code:`deb` package is \: https://github.com/baidu/Paddle/releases/
-After downloading PaddlePaddle deb packages, you can use :code:`gdebi` install.
-..	code-block:: bash
-	gdebi paddle-*.deb
-If :code:`gdebi` is not installed, you can use :code:`sudo apt-get install gdebi` to install it.
-Or you can use following commands to install PaddlePaddle.
-..	code-block:: bash
-	dpkg -i paddle-*.deb
-	apt-get install -f
-And if you use GPU version deb package, you need to install CUDA toolkit and cuDNN, and set related environment variables(such as LD_LIBRARY_PATH) first. It is normal when `dpkg -i` get errors. `apt-get install -f` will continue install paddle, and install dependences. 
--- a/doc/howto/dev/build_cn.md
+++ b/doc/howto/dev/build_cn.md
+# 编译PaddlePaddle和运行单元测试
+## 需要的软硬件
+为了开发PaddlePaddle，我们需要
+1. 一台电脑，可以装的是 Linux, BSD, Windows 或者 MacOS 操作系统，以及
+1. Docker。
+不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要，因为我们会把所有编译工具都安装进一个 Docker image 里。
+## 总体流程
+1. 获取源码
+   ```bash
+   git clone https://github.com/paddlepaddle/paddle
+   ```
+2. 安装开发工具到 Docker image 里
+   ```bash
+   cd paddle; docker build -t paddle:dev .
+   ```
+   请注意这个命令结尾处的 `.`；它表示 `docker build` 应该读取当前目录下的 [`Dockerfile`文件](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile)，按照其内容创建一个名为 `paddle:dev` 的 Docker image，并且把各种开发工具安装进去。
+3. 编译
+   以下命令启动一个 Docker container 来执行 `paddle:dev` 这个 Docker image，同时把当前目录（源码树根目录）映射为 container 里的 `/paddle` 目录，并且运行 `Dockerfile` 描述的默认入口程序 [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `cmake` 和 `make` 来编译 `/paddle` 里的源码，结果输出到 `/paddle/build`，也就是本地的源码树根目录里的 `build` 子目录。
+   ```bash
+   docker run --rm -v $PWD:/paddle paddle:dev
+   ```
+   上述命令编译出一个 CUDA-enabled 版本。如果我们只需要编译一个只支持 CPU 的版本，可以用
+   ```bash
+   docker run --rm -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
+   ```
+4. 运行单元测试
+   用本机的第一个 GPU 来运行包括 GPU 单元测试在内的所有单元测试：
+   ```bash
+   NV_GPU=0 nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+   如果编译的时候我们用了 `WITH_GPU=OFF` 选项，那么编译过程只会产生 CPU-based 单元测试，那么我们也就不需要 nvidia-docker 来运行单元测试了。我们只需要：
+   ```bash
+   docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+   有时候我们只想运行一个特定的单元测试，比如 `memory_test`，我们可以
+   ```bash
+   nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
+   ```
+5. 清理
+   有时候我们会希望清理掉已经下载的第三方依赖以及已经编译的二进制文件。此时只需要：
+   ```bash
+   rm -rf build
+   ```
+## 为什么要 Docker 呀？
+- 什么是 Docker?
+  如果您没有听说 Docker，可以把它想象为一个类似 virtualenv 的系统，但是虚拟的不仅仅是 Python 的运行环境。
+- Docker 还是虚拟机？
+  有人用虚拟机来类比 Docker。需要强调的是：Docker 不会虚拟任何硬件，Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的，性能和把编译工具安装在本机运行一样。
+- 为什么用 Docker?
+  把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题，其他人可以复现问题以便帮助。
+  另外，对于习惯使用Windows和MacOS的开发者来说，使用Docker就不用配置交叉编译环境了。
+- 我可以选择不用Docker吗？
+  当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式，把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程，是因为这个流程比其他方法都更简便。
+- 学习 Docker 有多难？
+  理解 Docker 并不难，大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
+- 我可以用 IDE 吗？
+  当然可以，因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码，我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
+  很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
+  ```emacs
+  (global-set-key "\C-cc" 'compile)
+  (setq compile-command
+   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+  ```
+  就可以按 `Ctrl-C` 和 `c` 键来启动编译了。
+- 可以并行编译吗？
+  是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
+## 可能碰到的问题
+- Docker 需要 sudo
+  如果用自己的电脑开发，自然也就有管理员权限（sudo）了。如果用公用的电脑开发，需要请管理员安装和配置好 Docker。此外，PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术，比如 rkt。
+- 在 Windows/MacOS 上编译很慢
+  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)。
+- 磁盘不够
+  本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。
--- a/doc/howto/dev/build_en.md
+++ b/doc/howto/dev/build_en.md
+# Build PaddlePaddle from Source Code and Run Unit Test
+## What Developers Need
+To contribute to PaddlePaddle, you need
+1. A computer -- Linux, BSD, Windows, MacOS, and
+1. Docker.
+Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image.  We run all the tools by running this image.
+## General Process
+1. Retrieve source code.
+   ```bash
+   git clone https://github.com/paddlepaddle/paddle
+   ```
+2. Install build tools into a Docker image.
+   ```bash
+   cd paddle; docker build -t paddle:dev .
+   ```
+   Please be aware of the `.` at the end of the command, which refers to the [`./Dockerfile` file](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile).  `docker build` follows instructions in this file to create a Docker image named `paddle:dev`, and installs building tools into it.
+3. Build from source.
+   This following command starts a Docker container that executes the Docker image `paddle:dev`, mapping the current directory to `/paddle/` in the container, and runs the default entry-point [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh) as specified in the Dockefile.  `build.sh` invokes `cmake` and `make` to build PaddlePaddle source code, which had been mapped to `/paddle`, and writes outputs to `/paddle/build`, which maps to `build` in the current source directory on the computer.
+   ```bash
+   docker run -v $PWD:/paddle paddle:dev
+   ```
+   Above command builds a CUDA-enabled version.  If we want to build a CPU-only version, we can type
+   ```bash
+   docker run -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
+   ```
+4. Run unit tests.
+   To run all unit tests using the first GPU of a node:
+   ```bash
+   NV_GPU=0 nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+   If we used `WITH_GPU=OFF` at build time, it generates only CPU-based unit tests, and we don't need nvidia-docker to run them.  We can just run
+   ```bash
+   docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+   Sometimes we want to run a specific unit test, say `memory_test`, we can run
+   ```bash
+   nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
+   ```
+5. Clean Build.
+   Sometimes, we might want to clean all thirt-party dependents and built binaries.  To do so, just
+   ```bash
+   rm -rf build
+   ```
+## Docker, Or Not?
+- What is Docker?
+  If you haven't heard of it, consider it something like Python's virtualenv.
+- Docker or virtual machine?
+  Some people compare Docker with VMs, but Docker doesn't virtualize any hardware nor running a guest OS, which means there is no compromise on the performance.
+- Why Docker?
+  Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help.
+  Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want.
+- Can I choose not to use Docker?
+  Sure, you don't have to install build tools into a Docker image; instead, you can install them in your local computer.  This document exists because Docker would make the development way easier.
+- How difficult is it to learn Docker?
+    It takes you ten minutes to read [an introductory article](https://docs.docker.com/get-started) and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools.  Not even to mention the time saved when other people trying to reproduce the issue you have.
+- Can I use my favorite IDE?
+  Yes, of course.  The source code resides on your local computer, and you can edit it using whatever editor you like.
+  Many PaddlePaddle developers are using Emacs.  They add the following few lines into their `~/.emacs` configure file:
+  ```emacs
+  (global-set-key "\C-cc" 'compile)
+  (setq compile-command
+   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+  ```
+  so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
+- Does Docker do parallel building?
+  Our building Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores.
+## Some Gotchas
+- Docker requires sudo
+  An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly.  If you use a shared computer for development, please ask the administrator to install and configure Docker.  We will do our best to support rkt, another container technology that doesn't require sudo.
+- Docker on Windows/MacOS builds slowly
+  On Windows and MacOS, Docker containers run in a Linux VM.  You might want to give this VM some more memory and CPUs so to make the building efficient.  Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details.
+- Not enough disk space
+  Examples in this article uses option `--rm` with the `docker run` command.  This option ensures that stopped containers do not exist on hard disks.  We can use `docker ps -a` to list all containers, including stopped.  Sometimes `docker build` generates some intermediate dangling images, which also take disk space.  To clean them, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/).
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
+# 如何写新的Operator
+ - [概念简介](#概念简介)
+ - [实现C++类](#实现C++类)
+   - [定义ProtoMaker类](#定义ProtoMaker类)
+   - [定义Operator类](#定义Operator类)
+   - [定义OpKernel类](#定义OpKernel类)
+   - [注册Operator](#注册Operator)
+   - [编译](#编译)
+ - [绑定Python](#绑定Python)
+ - [实现单元测试](#实现单元测试)
+   - [前向Operator单测](#前向Operator单测)
+   - [反向Operator单测](#反向Operator单测)
+   - [编译和执行](#编译和执行)
+## 概念简介
+简单介绍需要用到基类，详细介绍请参考设计文档。
+- `framework::OperatorBase`: Operator(简写，Op)基类。
+- `framework::OpKernel`: Op计算函数的基类，称作Kernel。
+- `framework::OperatorWithKernel`：继承自OperatorBase，Op有计算函数，称作有Kernel。
+- `class OpProtoAndCheckerMaker`：描述该Op的输入、输出、属性、注释,主要用于Python API接口生成
+依据是否包含kernel，将Op分为两种：包含Kernel的Op和不包含kernel的Op，前者Op的定义继承自`OperatorBase`，后者继承自`OperatorWithKernel`。本教程主要介绍带Kernel的Op如何写，简单总结Op需要包含的内容如下：
+ 内容            | 定义位置         
+--------------  | :----------------------  
+OpProtoMake定义  | `.cc`文件，Backward Op不需要定义OpProtoMake
+Op定义           | `.cc`文件 
+Kernel实现       | CPU、GPU共享Kernel在`.h`文件，否则，CPU可以在`.cc`文件，GPU可在`.cu`文件。 
+注册Op           | Op注册在`.cc`文件；Kernel注册CPU在`.cc`文件，GPU在`.cu`文件
+下面以矩阵乘操作，即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
+## 实现C++类
+### 1. 定义ProtoMaker类
+矩阵乘的公式：$Out = X * Y$, 可见该计算由两个输入，一个输出组成。首先定义`ProtoMaker`来描述该Op的输入、输出及注释：
+```
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of mul op");
+    AddInput("Y", "The second input of mul op");
+    AddOutput("Out", "The output of mul op");
+    AddComment(R"DOC(
+Two Element Mul Operator.
+The equation is: Out = X * Y
+)DOC");
+  }
+};
+```
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`，构造函数包括2个：
+   - `framework::OpProto` ： 前者存储Op的输入输出和参数属性，将用于Python API接口的生成。
+   - `framework::OpAttrChecker` ：后者用于检查参数属性的合法性。
+构造函数里通过`AddInput`添加输入参数，通过`AddOutput`添加输出参数，通过`AddComment`添加该Op的注释，这些函数会将对应内容添加到`OpProto`中。
+在`MulOp`中添加两个输入`X`和`Y`，添加了一个输出`Out`，并解释了各自含义，该命名尽可能的规范。
+再举个[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)的例子：
+```
+template <typename AttrType>
+class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input tensor of scale operator.").NotInGradient();
+    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
+    AddComment(R"DOC(Scale operator
+The equation is: Out = scale*X
+)DOC");
+    AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
+  }
+};
+```
+ 在这个例子里，两处不同：
+  - `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中。
+  - `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。
+### 2. 定义Operator类
+```c++
+class MulOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto dim0 = ctx.Input<Tensor>("X")->dims();
+    auto dim1 = ctx.Input<Tensor>("Y")->dims();
+    PADDLE_ENFORCE_EQ(dim0.size(), 2,
+                      "input X(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("X"));
+    PADDLE_ENFORCE_EQ(dim1.size(), 2,
+                      "input Y(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("Y"));
+    PADDLE_ENFORCE_EQ(
+        dim0[1], dim1[0],
+        "First matrix's width must be equal with second matrix's height.");
+    ctx.Output<Tensor>("Out")->Resize({dim0[0], dim1[1]});
+  }
+};
+```
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员：
+```c++
+using framework::OperatorWithKernel::OperatorWithKernel;
+```
+这句表示使用基类`OperatorWithKernel`的构造函数，也可写成：
+```c++
+MulOp(const std::string &type, const framework::VariableNameMap &inputs,
+      const framework::VariableNameMap &outputs,
+      const framework::AttributeMap &attrs)
+  : OperatorWithKernel(type, inputs, outputs, attrs) {}
+```	
+还需要重写`InferShape`接口。`InferShape`为const函数，不能修改Op的成员变量，参数为`const framework::InferShapeContext &ctx`，通过该参数可获取到输入输出以及属性。它的功能是：
+  - 1). 做检查， 尽早报错：检查输入数据维度、类型等是否合法。
+  - 2). 设置输出Tensor的形状。
+通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中，和要讲到的注册函数一起放在`.cc`中
+### 3. 定义OpKernel类
+```C++
+template <typename Place, typename T>
+class MulKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<Tensor>("X");
+    auto* Y = context.Input<Tensor>("Y");
+    auto* Z = context.Output<Tensor>("Out");
+    Z->mutable_data<T>(context.GetPlace());
+    auto* device_context =
+        const_cast<platform::DeviceContext*>(context.device_context_);
+    math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
+  }
+};
+```
+`MulKernel`继承自`framework::OpKernel`，带有模板参数:
+  - `typename  Place`: 表示设备类型，不同设备(CPU、GPU)共享同一个Kernel时，需加该模板参数，不共享则不加，一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
+ - `typename T` : 表示数据类型，如`float`, `double`等。
+`MulKernel`需要重写`Compute`接口，该接口参数为`const framework::ExecutionContext& context`, `ExecutionContext`相比`InferShapeContext`增加了设备类型，同样可获取到输入输出和属性参数，`Compute`函数里写具体实现时。
+注意，不同设备(CPU、GPU)共享一个Op定义，是否则共享同一个`OpKernel`，取决于`Compute`调用的函数是否支持不同设备。`MulOp`的CPU、GPU实现共享同一个`Kernel`，`OpKernel`不共享的例子可以参考[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 
+到此前向Op实现完成，需要在`.cc`文件中注册该op和kernel。反向Op类的定义和Kernel定义与前向Op类似，这里不再重复。但注意，反向Op没有`ProtoMaker`。
+### 4. 注册Operator
+在`.cc`文件中注册前向、反向Op类，注册CPU Kernel。
+```c++
+namespace ops = paddle::operators;
+REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mul_grad,
+              ops::MulGradKernel<paddle::platform::CPUPlace, float>);
+```
+  - `REGISTER_OP` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulOpGrad`，类型名为`mul_grad`，
+  - `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op。
+  - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulKernel`类。
+在 `.cu`文件中注册GPU Kernel。
+```c++
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(mul_grad,
+                       ops::MulGradKernel<paddle::platform::GPUPlace, float>);
+```
+### 5. 编译
+在[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt)文件中添加编译。
+```
+op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function)
+```
+下面命令可以编译：
+```
+make mul_op
+```
+## 绑定Python
+- 绑定Python 
+    在 [`paddle/pybind/pybind.cc 
+`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc)文件中添加该类：
+    ```
+    USE_OP(mul);
+    ```
+    如果只实现了CPU版本，则使用`USE_CPU_ONLY_OP`:
+    ```
+    USE_CPU_ONLY_OP(gather);
+    ```
+    使用`USE_OP`告知编译器需要链接该Op的目标文件，具体解释参考[代码注释](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h#L81)。
+ - 生成库
+   在 [`paddle/pybind/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/CMakeLists.txt)文件添加类到`DEPS`中，使得该Op可以链接到生成的lib库中。
+   ```
+   if(WITH_PYTHON)
+     cc_library(paddle_pybind SHARED
+     SRCS pybind.cc
+     DEPS pybind python backward
+     mul_op
+     minus_op)
+   endif(WITH_PYTHON)
+   ```
+## 实现单元测试
+单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单测](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。
+### 前向Operator单测
+前向Op单测继承自`unittest.TestCase`，并定义元类`__metaclass__ = OpTestMeta`，具体单测流程在`OpTestMeta`里完成。需在`setUp`函数定义输入输出和属性参数，以及Python对比的输出值。
+```
+import unittest
+import numpy as np
+from gradient_checker import GradientChecker, create_op
+from op_test_util import OpTestMeta
+class TestMulOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+    def setUp(self):
+        self.type = "mul"
+        self.inputs = {
+            'X': np.random.random((32, 84)).astype("float32"),
+            'Y': np.random.random((84, 100)).astype("float32")
+        }
+        self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+```
+   首先需要`import`必要的包,下面详细解释其他值：
+   - `self.type = "mul" ` : 定义类型，和注册的类型一致。
+   - `self.inputs` : 定义输入，类型为Numpy.array，并初始化。
+   - `self.outputs` : 定义输出，并得到Python结算结果。
+### 反向Operator单测
+反向Op单测继承自`GradientChecker`，而`GradientChecker`集成自`unittest.TestCase`，所以反向单测函数需要`test_`开头。
+ ```
+ class MulGradOpTest(GradientChecker):
+    def test_mul(self):
+        op = create_op("mul")
+        inputs = {
+            'X': np.random.random((32, 84)).astype("float32"),
+            'Y': np.random.random((84, 100)).astype("float32")
+        }
+        self.compare_grad(op, inputs)      
+        # mul op will enlarge the relative error
+        self.check_grad(
+            op, inputs, set(["X", "Y"]), "Out", max_relative_error=0.5)
+ ```
+   - 调用`create_op("mul")`创建反向Op对应的前向Op。
+   - 定义输入`inputs`。
+   - 调用`compare_grad`函数对比CPU、GPU计算结果。
+   - 调用`check_grad`检查梯度稳定性，这里采用数值法检测梯度正确性。
+      - 第一个参数`op` : 前向op。
+      - 第二个参数`inputs` : 输入词典，词典的Key和`ProtoMaker`定义保持一致。
+      - 第三个参数`set(["X", "Y"])` : 指定对输入变量`X`、`Y`做梯度检测。
+      - 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`
+### 编译和执行 
+单测完成之后，在[`python/paddle/v2/framework/tests/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/CMakeLists.txt)里添加编译：
+```
+py_test(test_mul_op SRCS test_mul_op.py)
+```
+编译时需要打开`WITH_TESTING`, 即 `cmake paddle_dir -DWITH_TESTING=ON`，编译成功之后执行单测命令为：
+```
+make test ARGS="-R test_mul_op -V"
+```
+或者:
+```
+ctest -R test_mul_op
+```
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -19,6 +19,7 @@
 ..  toctree::
  :maxdepth: 1
+  dev/build_cn.rst
  dev/write_docs_cn.rst
  dev/contribute_to_paddle_cn.md

--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -18,6 +18,7 @@ Development
 ..  toctree::
  :maxdepth: 1
+  dev/build_en.rst
  dev/new_layer_en.rst
  dev/contribute_to_paddle_en.md

--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -7,4 +7,3 @@ PaddlePaddle Documentation
  getstarted/index_en.rst
  howto/index_en.rst
  api/index_en.rst
-  about/index_en.rst
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -124,6 +124,9 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
    std::list<Pos> insert_position;
    for (auto& dup_output_op : dup_output_ops) {
      const std::string& name = dup_output_op.first;
+      // duplicate @Empty@ don't need to be added
+      if (name == kEmptyVarName) continue;
      auto& dup_op = dup_output_op.second;
      // no duplicate output
      if (dup_op.size() == 1) continue;
@@ -209,7 +212,7 @@ std::unique_ptr<OperatorBase> Backward(
    const OperatorBase& forwardOp,
    const std::unordered_set<std::string>& no_grad_vars) {
  std::unordered_set<std::string> no_grad_names;
-  no_grad_names.reserve(no_grad_vars.size());
+  no_grad_names.reserve(no_grad_vars.size() + 1);
  no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);

--- a/paddle/framework/backward.md
+++ b/paddle/framework/backward.md
-## Operator/expression 's Backward
+# Operator/expression 's Backward
-### Motivation
+## Motivation
-In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the fundmental gradient operators/expressions together with chain rule . Every forward network need a backward network to construct the full computation lineage, the operator/ expression's Backward feature will generate the backward pass respect to forward pass.
+In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the fundmental gradient operators/expressions together with chain rule . Every forward network need a backward network to construct the full computation graph, the operator/expression's backward pass will be generated respect to forward pass.
+## Backward Operator Registry
-### Implement : gradient operator registry
+A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs and output gradients and then calculate its input gradients.
-|                        | forward operator | backward operator                |
+|                        | forward operator | backward operator 
-| ---------------------- | ---------------- | -------------------------------- |
+| ---------------------- | ---------------- |------------------------- |		
-| **Operator::inputs_**  | Inputs           | Inputs, Outputs, OutputGradients |
+| **Operator::inputs_**  | Inputs       | Inputs, Outputs, OutputGradients |	
-| **Operator::outputs_** | Outputs          | InputGradients                   |
+| **Operator::outputs_** | Outputs          | InputGradients            |
-Inputs/Outputs means the input/output of the operator,  InputGradients/OutputGradients is the gradient respect to forward opeartor. Forward operator and Backward operator are isomorphic, save their corresponding needs into member attribute.
+ In most cases, there is a one-to-one correspondence between forward and backward operators. These correspondences are recorded by a global hash map(`OpInfoMap`). To follow the philosophy of minimum core and make operators pluggable, the registry mechanism is introduced.
-We use a global hash map record the gradient operators available, follow the philosophy  of minimum core, make operator pluggable unit. Each gradient is an operator and it needs to regist itself. 
+For example, we have got a `mul_op`, and we can register it's information and corresponding backward operator by the following macro:
-grad_op_builder(fengjiayi)
+```cpp
+REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
+```
-### Implement : Backward network
+`mul` is the operator's type. `MulOp` and `MulOpMaker` are the operator class and the operator maker class respectively.
+`mul_grad` is the type of backward operator, and `MulOpGrad` is its class name.
+## Backward Opeartor Creating
+Given a certain forward operator, we can get its corresponding backward opeartor by calling:
+```cpp
+OperatorBase* bwd_op = BuildGradOp(const OperatorBase* fwd_op);
+``` 
+The function `BuildGradOp` will sequentially execute following processes:
+1. Get the `type_` of given forward operator, and then get the corresponding backward operator's type by looking up the `OpInfoMap`.
+2. Build two maps named `inputs` and `outputs` to temporary storage backward operator's inputs and outputs. Copy forward operator's `inputs_` and `outputs_` to map `inputs`, except these are not necessary for gradient computing.
+3. Add forward inputs' gradient variables into map `output`, adding forward outputs' gradient variables into map `input`.
+4. Building backward operator with `inputs`, `outputs` and forward operator's attributes.
+## Backward Network Building
+A backward network is a series of backward operators. The main idea of building a backward network is creating backward operators in the inverted sequence and put them together.
+In our design, the network itself is also a kind of operator. So the operators contained by a big network may be some small network. 
 given a forward network, it generates the backward network. We only care about the Gradients—`OutputGradients`,`InputGradients`.
-1. bla bla bla (yuyang)
+1. Op 
+   when the input forward network is a Op, return its gradient Operator Immediately.
 2. NetOp 
-   when the input forward network is a NetOp, it need to call the sub NetOp/Operators backward function recursively and ensure them done. During the process, we need to collect the `OutputGradients` name.
+   when the input forward network is a NetOp, it need to call the sub NetOp/Operators backward function recursively. During the process, we need to collect the `OutputGradients` name according to forward NetOp.
+   **shared variable**. As illustrated in the pictures, two operator's `Output` `Gradient` will overwirte their shared input variable.  
+   <p align="center">
+   <img src="./images/duplicate_op.png" width="70%" ><br/>
+   1. shared variable in two operators. 
+   </p>
+   Share variable between operators or same input variable used in multiple operators lead to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively, and add a generic add operator replace the overwirte links. 
+   <p align="center">
+   <img src="images/duplicate_op2.png" width="90%" ><br/>
-   We share variable in the same scope, as a result, duplicate operator `OutputGradients` will overwirte then duplicate variable.  
+   2. replace shared variable gradient with `Add` Operator
-   ![./images/duplicate_op]()
+   </p>
-    Share variable between operators or same input variable used in multiple operators lead to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively, and add a generic add operator instead. 
-![./images/duplicate_op2]()
-	Then collect the sub graph OutputGradients/InputGradients as the NetOp's and return it.
+	Then collect the sub graph `OutputGradients`/`InputGradients` as the NetOp's and return it.
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -21,6 +21,8 @@ if(USE_NNPACK)
  endif()
 endif()
+list(APPEND cpp_files neon/NeonDepthwiseConv.cpp)
 add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
 add_dependencies(paddle_function ${external_project_dependencies})
 add_dependencies(paddle_function paddle_proto)
@@ -42,11 +44,11 @@ if(WITH_GPU)
    add_simple_unittest(RowConvOpTest)
    add_simple_unittest(BlockExpandOpTest)
    add_simple_unittest(CropOpTest)
-    add_simple_unittest(DepthwiseConvOpTest)
 endif()
 add_simple_unittest(Im2ColTest)
 add_simple_unittest(GemmConvOpTest)
+add_simple_unittest(DepthwiseConvOpTest)
 endif()
 add_style_check_target(paddle_function ${h_files})

--- a/paddle/function/DepthwiseConvOpTest.cpp
+++ b/paddle/function/DepthwiseConvOpTest.cpp
@@ -34,4 +34,13 @@ TEST(DepthwiseConv, BackwardFilter) {
 }
 #endif
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+TEST(DepthwiseConv, Forward) {
+  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
+      "GemmConv-CPU", "NeonDepthwiseConv-CPU", forward);
+}
+#endif
 }  // namespace paddle
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "TensorShape.h"
 #include "TensorType.h"
+#include "neon/neon_util.h"
 namespace paddle {
@@ -93,4 +94,95 @@ public:
                  int paddingWidth);
 };
+template <class T>
+struct Padding {
+  static void run(const T* src,
+                  T* dest,
+                  int channels,
+                  int inputHeight,
+                  int inputWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    const int destWidth = inputWidth + 2 * paddingWidth;
+    for (int c = 0; c < channels; c++) {
+      if (paddingHeight > 0) {
+        memset(dest, 0, destWidth * paddingHeight * sizeof(T));
+        dest += destWidth * paddingHeight;
+      }
+      for (int i = 0; i < inputHeight; i++) {
+        // padding head
+        for (int j = 0; j < paddingWidth; j++) {
+          *dest++ = T(0);
+        }
+        memcpy(dest, src, inputWidth * sizeof(T));
+        dest += inputWidth;
+        src += inputWidth;
+        // padding tail
+        for (int j = 0; j < paddingWidth; j++) {
+          *dest++ = T(0);
+        }
+      }
+      if (paddingHeight > 0) {
+        memset(dest, 0, destWidth * paddingHeight * sizeof(T));
+        dest += destWidth * paddingHeight;
+      }
+    }
+  }
+};
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+template <>
+struct Padding<float> {
+  static void run(const float* src,
+                  float* dest,
+                  int channels,
+                  int inputHeight,
+                  int inputWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    const int destWidth = inputWidth + 2 * paddingWidth;
+    for (int c = 0; c < channels; c++) {
+      if (paddingHeight > 0) {
+        memset(dest, 0, destWidth * paddingHeight * sizeof(float));
+        dest += destWidth * paddingHeight;
+      }
+      for (int i = 0; i < inputHeight; i++) {
+        // padding head
+        for (int j = 0; j < paddingWidth; j++) {
+          *dest++ = float(0);
+        }
+        int step = inputWidth >> 2;
+        int remain = inputWidth & 3;
+        for (int s = 0; s < step; s++) {
+          float32x4_t s0 = vld1q_f32(src);
+          vst1q_f32(dest, s0);
+          src += 4;
+          dest += 4;
+        }
+        for (int r = 0; r < remain; r++) {
+          *dest++ = *src++;
+        }
+        // padding tail
+        for (int j = 0; j < paddingWidth; j++) {
+          *dest++ = float(0);
+        }
+      }
+      if (paddingHeight > 0) {
+        memset(dest, 0, destWidth * paddingHeight * sizeof(float));
+        dest += destWidth * paddingHeight;
+      }
+    }
+  }
+};
+#endif
 }  // namespace paddle
--- a/paddle/function/neon/NeonDepthwiseConv.cpp
+++ b/paddle/function/neon/NeonDepthwiseConv.cpp
--- a/paddle/function/neon/neon_util.h
+++ b/paddle/function/neon/neon_util.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+namespace paddle {
+namespace neon {
+inline float32x4_t vld1q_f32_aligned(const float* p) {
+  return vld1q_f32(
+      (const float*)__builtin_assume_aligned(p, sizeof(float32x4_t)));
+}
+#ifndef __aarch64__
+inline float32_t vaddvq_f32(float32x4_t a) {
+  float32x2_t v = vadd_f32(vget_high_f32(a), vget_low_f32(a));
+  return vget_lane_f32(vpadd_f32(v, v), 0);
+}
+inline float32x4_t vmlaq_laneq_f32(float32x4_t a,
+                                   float32x4_t b,
+                                   float32x4_t v,
+                                   const int lane) {
+  return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane));
+}
+#endif
+}  // namespace neon
+}  // namespace paddle
+#endif
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -572,13 +572,8 @@ void MultiBinaryLabelCrossEntropy::backwardImp(Matrix& output,
  }
 }
-//
+bool HuberCost::init(const LayerMap& layerMap,
-// Huber loss for robust 2-classes classification
+                     const ParameterMap& parameterMap) {
-//
-REGISTER_LAYER(huber, HuberTwoClass);
-bool HuberTwoClass::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
  CostLayer::init(layerMap, parameterMap);
  if (useGpu_) {
    tmpCpuInput_.reserve(inputLayers_.size());
@@ -589,7 +584,7 @@ bool HuberTwoClass::init(const LayerMap& layerMap,
  return true;
 }
-void HuberTwoClass::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
+void HuberCost::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
  if (useGpu_) {
    for (size_t i = 0; i < inputLayers_.size(); i++) {
      tmpCpuInput_[i].resizeAndCopyFrom(
@@ -597,13 +592,87 @@ void HuberTwoClass::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
    }
    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
  }
-  forwardImpIn(output, label, cost);
 }
-void HuberTwoClass::forwardImpIn(Matrix& output,
+//
-                                 Argument& label,
+// Huber loss for robust regression.
-                                 Matrix& target) {
+//
+REGISTER_LAYER(huber_regression, HuberRegressionLoss);
+bool HuberRegressionLoss::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  HuberCost::init(layerMap, parameterMap);
+  delta_ = config_.delta();
+  return true;
+}
+void HuberRegressionLoss::forwardImp(Matrix& output,
+                                     Argument& label,
+                                     Matrix& target) {
+  HuberCost::forwardImp(output, label, target);
+  size_t numSamples = target.getHeight();
+  size_t dim = output.getWidth();
+  CHECK(label.value);
+  CHECK_EQ((*label.value).getHeight(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(dim, (*label.value).getWidth());
+  CHECK_EQ(target.getWidth(), (size_t)1);
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  real* lbl =
+      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
+  std::vector<real> cost(numSamples, 0);
+  for (size_t i = 0; i < numSamples; ++i) {
+    for (size_t j = 0; j < dim; ++j) {
+      int index = i * dim + j;
+      real a = std::abs(lbl[index] - out[index]);
+      if (a <= delta_)
+        cost[i] += a * a / 2;
+      else
+        cost[i] += delta_ * (a - delta_ / 2);
+    }
+  }
+  target.copyFrom(cost.data(), numSamples);
+}
+void HuberRegressionLoss::backwardImp(Matrix& output,
+                                      Argument& label,
+                                      Matrix& outputG) {
+  size_t numSamples = output.getHeight();
+  size_t dim = output.getWidth();
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  real* lbl =
+      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
+  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
+  for (size_t i = 0; i < numSamples; ++i) {
+    for (size_t j = 0; j < dim; ++j) {
+      int index = i * dim + j;
+      real a = lbl[index] - out[index];
+      if (std::abs(a) <= delta_)
+        grad[index] += -a;
+      else
+        grad[index] += a > 0 ? -delta_ : delta_;
+    }
+  }
+  if (useGpu_) outputG.copyFrom(grad, numSamples * dim);
+}
+//
+// Huber loss for robust 2-classes classification
+//
+REGISTER_LAYER(huber_classification, HuberTwoClassification);
+bool HuberTwoClassification::init(const LayerMap& layerMap,
+                                  const ParameterMap& parameterMap) {
+  return HuberCost::init(layerMap, parameterMap);
+}
+void HuberTwoClassification::forwardImp(Matrix& output,
+                                        Argument& label,
+                                        Matrix& target) {
+  HuberCost::forwardImp(output, label, target);
  size_t numSamples = target.getHeight();
+  CHECK(label.ids);
  CHECK_EQ((*label.ids).getSize(), numSamples);
  CHECK_EQ(output.getHeight(), numSamples);
  CHECK_EQ(output.getWidth(), (size_t)1);
@@ -611,47 +680,35 @@ void HuberTwoClass::forwardImpIn(Matrix& output,
  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
  int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
-  std::vector<real> cost(numSamples);
+  std::vector<real> cost(numSamples, 0);
  for (size_t i = 0; i < numSamples; ++i) {
    int y = 2 * lbl[i] - 1;
-    if (out[i] * y < -1)
+    real a = out[i] * y;
-      cost[i] = -4 * out[i] * y;
+    if (a < -1)
-    else if (out[i] * y < 1)
+      cost[i] = -4 * a;
-      cost[i] = (1 - out[i] * y) * (1 - out[i] * y);
+    else if (a < 1)
-    else
+      cost[i] = (1 - a) * (1 - a);
-      cost[i] = 0;
  }
  target.copyFrom(cost.data(), numSamples);
 }
-void HuberTwoClass::backwardImp(Matrix& outputValue,
+void HuberTwoClassification::backwardImp(Matrix& output,
-                                Argument& label,
+                                         Argument& label,
-                                Matrix& outputGrad) {
+                                         Matrix& outputG) {
-  if (useGpu_) {
-    backwardImpIn(
-        *tmpCpuInput_[0].value, tmpCpuInput_[1], *tmpCpuInput_[0].grad);
-    outputGrad.copyFrom(*tmpCpuInput_[0].grad);
-  } else {
-    backwardImpIn(outputValue, label, outputGrad);
-  }
-}
-void HuberTwoClass::backwardImpIn(Matrix& output,
-                                  Argument& label,
-                                  Matrix& outputG) {
  size_t numSamples = output.getHeight();
-  real* out = output.getData();
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
-  real* grad = outputG.getData();
+  int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
-  int* lbl = (*label.ids).getData();
+  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
  for (size_t i = 0; i < numSamples; ++i) {
    int y = 2 * lbl[i] - 1;
-    if (y * out[i] < -1)
+    real a = out[i] * y;
+    if (a < -1)
      grad[i] += -4 * y;
-    else if (y * out[i] < 1)
+    else if (a < 1)
-      grad[i] += -2 * (1 - y * out[i]) * y;
+      grad[i] += -2 * (1 - a) * y;
  }
+  if (useGpu_) outputG.copyFrom(grad, numSamples);
 }
 /**
 * This cost layer compute the sum of its input as loss.
 * \f[

--- a/paddle/gserver/layers/CostLayer.h
+++ b/paddle/gserver/layers/CostLayer.h
@@ -304,37 +304,70 @@ public:
                   Matrix& outputGrad) override;
 };
-/**
+/*
- * Huber loss for robust 2-classes classification.
+ * A base layer for HuberRegressionLoss and HuberTwoClassification.
- *
- * For label={0, 1}, let y=2*label-1. Given output f, the loss is:
- * \f[
- * Loss =
- * \left\{\begin{matrix}
- *  4 * y * f     &   \textit{if}  \ \  y* f < -1 \\
- *  (1 - y * f)^2 &  \textit{if}   \ \  -1 < y * f < 1  \\
- *  0             &                    \textit{otherwise}
- * \end{matrix}\right.
- * \f]
 */
-class HuberTwoClass : public CostLayer {
+class HuberCost : public CostLayer {
+public:
  std::vector<Argument> tmpCpuInput_;
-public:
+  explicit HuberCost(const LayerConfig& config) : CostLayer(config) {}
-  explicit HuberTwoClass(const LayerConfig& config) : CostLayer(config) {}
  bool init(const LayerMap& layerMap,
            const ParameterMap& parameterMap) override;
  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-  void forwardImpIn(Matrix& output, Argument& label, Matrix& cost);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override {}
+};
+/**
+ * Huber loss for robust regression.
+ *
+ * Given output f(x), label y and delta, the loss is:
+ * Loss = 0.5 * (1 - y * f)^2, if abs(y - f) <= delta \\
+ * Loss = delta * abs(y - f) - 0.5 * delta^2, otherwise
+ */
+class HuberRegressionLoss : public HuberCost {
+public:
+  explicit HuberRegressionLoss(const LayerConfig& config) : HuberCost(config) {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
  void backwardImp(Matrix& outputValue,
                   Argument& label,
                   Matrix& outputGrad) override;
-  void backwardImpIn(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+protected:
+  real delta_;
+};
+/**
+ * Huber loss for robust 2-classes classification.
+ *
+ * For label={0, 1}, let y=2*label-1. Given output f(x), the loss is:
+ * Loss = 4 * y * f, if y* f < -1 \\
+ * Loss = (1 - y * f)^2, if -1 < y * f < 1  \\
+ * Loss = 0, otherwise
+ */
+class HuberTwoClassification : public HuberCost {
+public:
+  explicit HuberTwoClassification(const LayerConfig& config)
+      : HuberCost(config) {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 };
 typedef std::shared_ptr<CostLayer> CostLayerPtr;

--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@@ -29,6 +29,10 @@ namespace paddle {
 REGISTER_LAYER(exconv, ExpandConvLayer);
 REGISTER_LAYER(exconvt, ExpandConvLayer);
+inline bool isDepthwiseConv(int channels, int groups) {
+  return channels == groups;
+}
 bool ExpandConvLayer::init(const LayerMap &layerMap,
                           const ParameterMap &parameterMap) {
  /* Initialize the basic convolutional parent class */
@@ -47,14 +51,27 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
    std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
    std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
-    if (useGpu_ && (size_t)groups_[i] == (size_t)channels_[i] && !isDeconv_) {
+    // Convolution Layer uses the GemmConv function by default.
+    convType = "GemmConv";
+    convGradInputType = "GemmConvGradInput";
+    convGradFilterType = "GemmConvGradFilter";
+    // If depth wise convolution and useGpu == true
+    if (useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
      convType = "DepthwiseConv";
      convGradInputType = "DepthwiseConvGradInput";
      convGradFilterType = "DepthwiseConvGradFilter";
-    } else {
+    }
-      convType = "GemmConv";
-      convGradInputType = "GemmConvGradInput";
+    // If depth wise convolution and useGpu == false and ARM-NEON
-      convGradFilterType = "GemmConvGradFilter";
+    if (!useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+      if ((filterSize_[i] == filterSizeY_[i]) &&
+          (filterSize_[i] == 3 || filterSize_[i] == 4) &&
+          (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2)) {
+        convType = "NeonDepthwiseConv";
+      }
+#endif
    }
    if (FLAGS_use_nnpack && !isDeconv_) {

--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -41,7 +41,7 @@ namespace paddle {
 Layer::Layer(const LayerConfig& config, bool useGpu)
    : config_(config),
      useGpu_(useGpu),
-      deviceId_(-1),
+      deviceId_(CPU_DEVICE),
      needSequenceInfo_(true) {}
 bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) {

--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -59,7 +59,12 @@ protected:
  LayerConfig config_;
  /// whether to use GPU
  bool useGpu_;
-  /// Device Id. CPU is -1, and GPU is 0, 1, 2 ...
+  /// Paddle device ID, MKLDNN is -2, CPU is -1
+  enum PADDLE_DEVICE_ID {
+    MKLDNN_DEVICE = -2,
+    CPU_DEVICE = -1,
+  };
+  /// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ...
  int deviceId_;
  /// Input layers
  std::vector<LayerPtr> inputLayers_;
@@ -77,6 +82,7 @@ protected:
  Argument output_;
  /// Several outputs stored on different devices, used in 'parallel_nn' case,
  /// and record them by deviceId_.
+  /// Also used in 'use_mkldnn' case.
  std::vector<Argument> outputOtherDevice_;
  /// If there are several outputs, map them by each name.
  std::map<std::string, Argument*> outputMap_;
@@ -172,6 +178,13 @@ protected:
    return inputLayer.getOutput(deviceId_);
  }
+  /**
+   * Get the argument of input layer with deviceId.
+   */
+  const Argument& getInput(size_t inputIndex, int deviceId) const {
+    return inputLayers_[inputIndex]->getOutput(deviceId);
+  }
  /**
   * Get the forward-input value.
   */
@@ -186,6 +199,13 @@ protected:
    return inputLayer.getOutput(deviceId_).value;
  }
+  /**
+   * Get the forward-input value with deviceId.
+   */
+  const MatrixPtr& getInputValue(int inputIndex, int deviceId) {
+    return inputLayers_[inputIndex]->getOutput(deviceId).value;
+  }
  /**
   * Get the forward-input grad.
   */
@@ -200,6 +220,13 @@ protected:
    return inputLayer.getOutput(deviceId_).grad;
  }
+  /**
+   * Get the forward-input grad.
+   */
+  const MatrixPtr& getInputGrad(int inputIndex, int deviceId) {
+    return inputLayers_[inputIndex]->getOutput(deviceId).grad;
+  }
  /**
   * Get the forward-input label.
   */

--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -61,43 +61,42 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
    return;
  }
-  // TODO(TJ): dst format should get from wgtVal_
+  CHECK(wgtVal_) << "should have been initialized";
-  int dstFmt = PARAM_FORMAT_MKLDNN_OI;
+  bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
-  int srcFmt = weight_->getParameterPtr()->getHeaderFormat();
+  auto targetDim = wgtVal_->getDims();
-  if (srcFmt == dstFmt) {
+  auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
-    return;
+  wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
-  }
-  // The weight_ is transposed from initial paddle weight
-  MatrixPtr paddleWgt = Matrix::create(
-      weight_->getW()->getData(), iLayerSize_, oc_, false, false);
-  // TODO(TJ): remove this print when do not need differ weights
-  std::ostringstream ostr;
-  paddleWgt->print(ostr);
-  VLOG(MKLDNN_ALL) << "Initial Weight from paddle: " << std::endl << ostr.str();
-  // The mkldnn weight is transposed from initial paddle matrix
-  MatrixPtr paddleWgtT;
-  paddleWgt->transpose(paddleWgtT, true);
-  weight_->getW()->copyFrom(*paddleWgtT);
-  weight_->getParameterPtr()->setHeaderFormat(dstFmt);
  hasInitedWgt_ = true;
 }
 void MKLDNNFcLayer::convertWeightsToPaddle() {
-  MatrixPtr dnnWgt = weight_->getW();
+  CHECK(wgtVal_) << "should have been initialized";
-  MatrixPtr paddleWgt;
+  bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
-  dnnWgt->transpose(paddleWgt, true);
+  auto targetDim = wgtVal_->getDims();
+  auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
-  // copy paddle weight and override on weight_
+  wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
-  MatrixPtr dnnWgtT = Matrix::create(
+}
-      dnnWgt->getData(), dnnWgt->getWidth(), dnnWgt->getHeight(), false, false);
-  dnnWgtT->copyFrom(*paddleWgt);
+void MKLDNNFcLayer::convertOutputToOtherDevice() {
+  copyOutputInfoToOtherDevice();
+  // find other cpu device and reorder output to cpu device
+  int cnt = 0;
+  for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+    if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
+      // fc cpu output value do not need convert
+      // just share point
+      outputOtherDevice_[i].value = output_.value;
+      ++cnt;
+    }
+  }
+  if (cnt > 1) {
+    LOG(WARNING) << "should not have more than one CPU devie";
+  }
 }
 void MKLDNNFcLayer::reshape() {
-  const Argument& input = getInput(0);
+  const Argument& input = getInput(0, getPrev(0)->getDeviceId());
  int batchSize = input.getBatchSize();
  if (bs_ == batchSize) {
    return;
@@ -111,10 +110,6 @@ void MKLDNNFcLayer::reshape() {
  if (iw_ == 0) {
    iw_ = 1;
  }
-  hasSpatial_ = true;
-  if (ih_ == 1 && iw_ == 1) {
-    hasSpatial_ = false;
-  }
  CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
  ic_ = iLayerSize_ / (ih_ * iw_);
  CHECK_EQ(size_t(ic_ * ih_ * iw_), iLayerSize_) << "not divisible";
@@ -135,37 +130,53 @@ void MKLDNNFcLayer::reshape() {
 void MKLDNNFcLayer::resetFwd() {
  bool hasBias = biases_ && biases_->getW();
-  real* iData = getInputValue(0)->getData();
+  const MatrixPtr& wgt = weight_->getW();
-  real* oData = getOutputValue()->getData();
+  const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr;
-  real* wData = weight_->getW()->getData();
+  const MatrixPtr& out = output_.value;
-  real* bData = hasBias ? biases_->getW()->getData() : NULL;
+  if (inputIsOnlyMKLDNN()) {
-  // TODO(TJ): below create should be covered in MkldnnMatrix
+    const MatrixPtr& in = getInputValue(0);
-  // create memory desc
+    inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in);
-  memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw)
+    CHECK(inVal_) << "Input should be MKLDNNMatrix";
-                                 : createMD({bs_, ic_}, format::nc);
+  } else {
-  memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
+    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
-                                 : createMD({oc_, ic_}, format::oi);
+    const MatrixPtr& in = getInputValue(0, CPU_DEVICE);
-  memory::desc bMD = bData != NULL ? createMD({oc_}, format::x)
+    inVal_ = MKLDNNMatrix::create(
-                                   : createMD({}, format::format_undef);
+        in, memory::dims{bs_, ic_, ih_, iw_}, format::nchw, engine_);
-  memory::desc oMD = createMD({bs_, oc_}, format::nc);
+  }
+  inVal_->downSpatial();
-  // create memory primitive desc and memory self
+  wgtVal_ = MKLDNNMatrix::create(
-  inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData));
+      wgt, memory::dims{oc_, ic_, ih_, iw_}, format::oihw, engine_);
-  wgtVal_.reset(new memory(memory::primitive_desc(wMD, engine_), wData));
+  wgtVal_->downSpatial();
-  outVal_.reset(new memory(memory::primitive_desc(oMD, engine_), oData));
+  biasVal_ =
+      hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr;
+  outVal_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_);
+  // change original output value to mkldnn output value
+  output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
+  if (!outputIsOnlyMKLDNN()) {
+    convertOutputToOtherDevice();
+  }
+  // create forward handle
  prop_kind pk = prop_kind::forward;
-  fc_fwd::desc fwdDesc = bData != NULL ? fc_fwd::desc(pk, iMD, wMD, bMD, oMD)
+  fc_fwd::desc fwdDesc = hasBias ? fc_fwd::desc(pk,
-                                       : fc_fwd::desc(pk, iMD, wMD, oMD);
+                                                inVal_->getMemoryDesc(),
+                                                wgtVal_->getMemoryDesc(),
+                                                biasVal_->getMemoryDesc(),
+                                                outVal_->getMemoryDesc())
+                                 : fc_fwd::desc(pk,
+                                                inVal_->getMemoryDesc(),
+                                                wgtVal_->getMemoryDesc(),
+                                                outVal_->getMemoryDesc());
  fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
+  if (hasBias) {
-  if (bData != NULL) {
-    biasVal_.reset(new memory(memory::primitive_desc(bMD, engine_), bData));
    fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_));
  } else {
    fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_));
  }
+  printValueFormatFlow();
  pipelineFwd_.clear();
  pipelineFwd_.push_back(*fwd_);
 }
@@ -175,45 +186,46 @@ void MKLDNNFcLayer::resetBwd() {
    return;
  }
  needResetBwd_ = false;
  bool hasBias = biases_ && biases_->getWGrad();
-  real* iData = getInputValue(0)->getData();
-  real* iDiff = getInputGrad(0) != nullptr ? getInputGrad(0)->getData() : NULL;
-  real* oDiff = getOutputGrad()->getData();
-  real* wDiff = weight_->getWGrad()->getData();
-  real* bDiff = hasBias ? biases_->getWGrad()->getData() : NULL;
  /// backward weight
-  // create memory desc for backward memory
+  CHECK(inVal_) << "Should have input value";
-  memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw)
+  const MatrixPtr& wgt = weight_->getWGrad();
-                                 : createMD({bs_, ic_}, format::nc);
+  const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr;
-  memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
-                                 : createMD({oc_, ic_}, format::oi);
+  // TODO(TJ): merge outgrad
-  memory::desc oMD = createMD({bs_, oc_}, format::nc);
+  int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
-  memory::desc bMD = bDiff != NULL ? createMD({oc_}, format::x)
+  // for MKLDNN device:
-                                   : createMD({}, format::format_undef);
+  // can not directly cast outputgrad to mkldnnmatrix,
+  // since each layer can not write the inputgrad to mkldnn inputgrad.
-  if (inVal_) {
+  // So just create from matrix with outputvalue format.
-    // update data
+  // for CPU device:
-    inVal_->set_data_handle(iData);
+  // fc do not need to convert from cpu device since output is always nc format
-  } else {
+  // only need create from cpu device
-    inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData));
+  const MatrixPtr& out = getOutput(device).grad;
-  }
+  outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());
+  wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPrimitiveDesc());
-  // create memory primitive desc and memory self
+  biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPrimitiveDesc())
-  wgtGrad_.reset(new memory(memory::primitive_desc(wMD, engine_), wDiff));
+                      : nullptr;
-  outGrad_.reset(new memory(memory::primitive_desc(oMD, engine_), oDiff));
+  // create memory primitive desc
-  fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, iMD, wMD, oMD);
+  fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward,
+                                      inVal_->getMemoryDesc(),
+                                      wgtGrad_->getMemoryDesc(),
+                                      outGrad_->getMemoryDesc());
  fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
-  fc_bwdWgt::desc bwdWgtDesc = bDiff != NULL
+  fc_bwdWgt::desc bwdWgtDesc = hasBias
-                                   ? fc_bwdWgt::desc(iMD, wMD, bMD, oMD)
+                                   ? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
-                                   : fc_bwdWgt::desc(iMD, wMD, oMD);
+                                                     wgtGrad_->getMemoryDesc(),
+                                                     biasGrad_->getMemoryDesc(),
+                                                     outGrad_->getMemoryDesc())
+                                   : fc_bwdWgt::desc(inVal_->getMemoryDesc(),
+                                                     wgtGrad_->getMemoryDesc(),
+                                                     outGrad_->getMemoryDesc());
  fc_bwdWgt::primitive_desc bwdWgtPD =
      fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD);
-  if (bDiff != NULL) {
+  if (hasBias) {
-    biasGrad_.reset(new memory(memory::primitive_desc(bMD, engine_), bDiff));
    bwdWgt_.reset(
        new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_, *biasGrad_));
  } else {
@@ -223,15 +235,26 @@ void MKLDNNFcLayer::resetBwd() {
  pipelineBwd_.push_back(*bwdWgt_);
  /// backward data
-  if (iDiff == NULL) {
+  device = inputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
+  const MatrixPtr& in = getInputGrad(0, device);
+  if (in == nullptr) {
    return;
  }
-  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(iMD, wMD, oMD);
+  if (getInput(0, device).getAllCount() > 1) {
+    // TODO(TJ): use outputMaps_ ways when merge outgrad done
+  } else {
+    inGrad_ = MKLDNNMatrix::create(in, inVal_->getPrimitiveDesc());
+  }
+  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(inVal_->getMemoryDesc(),
+                                                  wgtGrad_->getMemoryDesc(),
+                                                  outGrad_->getMemoryDesc());
  fc_bwdData::primitive_desc bwdDataPD =
      fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
-  inGrad_.reset(new memory(memory::primitive_desc(iMD, engine_), iDiff));
  CHECK(wgtVal_) << "Should have weight memory";
  bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_));
+  printGradFormatFlow();
  pipelineBwd_.push_back(*bwdData_);
 }
@@ -241,11 +264,7 @@ void MKLDNNFcLayer::forward(PassType passType) {
  {
    REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
+    syncInputValue();
-    // update input data
-    // since it might be changed if this is after data layer
-    real* iData = getInputValue(0)->getData();
-    inVal_->set_data_handle(iData);
    // just submit forward pipeline
    stream_->submit(pipelineFwd_);
@@ -267,10 +286,7 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
    REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
    resetBwd();
-    // update diff
+    syncOutputGrad();
-    real* oDiff = getOutputGrad()->getData();
-    outGrad_->set_data_handle(oDiff);
    // just sumbmit backward pipeline
    stream_->submit(pipelineBwd_);
  }

--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -32,16 +32,13 @@ protected:
  // if has already init the weight
  bool hasInitedWgt_;
-  // if input layer has image size info (ih>1 && iw>1)
-  bool hasSpatial_;
  // fc weight and bias
  std::unique_ptr<Weight> weight_;
  std::unique_ptr<Weight> biases_;
 public:
  explicit MKLDNNFcLayer(const LayerConfig& config)
-      : MKLDNNLayer(config), hasInitedWgt_(false), hasSpatial_(true) {}
+      : MKLDNNLayer(config), hasInitedWgt_(false) {}
  ~MKLDNNFcLayer() {}
@@ -75,6 +72,8 @@ protected:
   * only would be called when needed
   */
  void resetBwd();
+  void convertOutputToOtherDevice() override;
 };
 }  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -18,9 +18,9 @@ limitations under the License. */
 #include "Layer.h"
 #include "MKLDNNBase.h"
 #include "mkldnn.hpp"
+#include "paddle/math/MKLDNNMatrix.h"
 DECLARE_bool(use_mkldnn);
-DECLARE_bool(use_mkldnn_wgt);
 namespace paddle {
@@ -52,15 +52,15 @@ protected:
  std::vector<mkldnn::primitive> pipelineFwd_;
  std::vector<mkldnn::primitive> pipelineBwd_;
-  // TODO(TJ): change below memory as MKLDNNMatrixPtr type
+  // MKLDNNMatrixPtr
-  std::shared_ptr<mkldnn::memory> inVal_;
+  MKLDNNMatrixPtr inVal_;
-  std::shared_ptr<mkldnn::memory> inGrad_;
+  MKLDNNMatrixPtr inGrad_;
-  std::shared_ptr<mkldnn::memory> outVal_;
+  MKLDNNMatrixPtr outVal_;
-  std::shared_ptr<mkldnn::memory> outGrad_;
+  MKLDNNMatrixPtr outGrad_;
-  std::shared_ptr<mkldnn::memory> wgtVal_;
+  MKLDNNMatrixPtr wgtVal_;
-  std::shared_ptr<mkldnn::memory> wgtGrad_;
+  MKLDNNMatrixPtr wgtGrad_;
-  std::shared_ptr<mkldnn::memory> biasVal_;
+  MKLDNNMatrixPtr biasVal_;
-  std::shared_ptr<mkldnn::memory> biasGrad_;
+  MKLDNNMatrixPtr biasGrad_;
 public:
  explicit MKLDNNLayer(const LayerConfig& config)
@@ -83,17 +83,21 @@ public:
  virtual bool init(const LayerMap& layerMap,
                    const ParameterMap& parameterMap) {
+    CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
+                            << "Please set WITH_MKLDNN=ON "
+                            << "and set use_mkldnn=True";
+    CHECK(!useGpu_) << "Do not support GPU yet";
+    // set device id before Layer::init
+    setDevice(MKLDNN_DEVICE);
+    // change param device to MKLDNN device
+    setParamsDevice(MKLDNN_DEVICE, parameterMap);
    if (!Layer::init(layerMap, parameterMap)) {
      return false;
    }
-    CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
-                            << "Please set WITH_MKLDNN=ON "
-                            << "and set use_mkldnn=True";
    stream_.reset(new MKLDNNStream());
    engine_ = CPUEngine::Instance().getEngine();
-    // TODO(TJ): deivecId
    return true;
  }
@@ -109,6 +113,12 @@ public:
   */
  virtual void convertWeightsToPaddle() {}
+  /**
+   * convert MKLDNN output to other device.
+   * only support CPU device yet
+   */
+  virtual void convertOutputToOtherDevice() {}
  /**
   * print info about sizes
   */
@@ -118,14 +128,124 @@ public:
                       << ", oh: " << oh_ << ", ow: " << ow_;
  }
-  // TODO(TJ): move to MkldnnMatrix
+  /**
-  // create memory desc
+   * Print the mkldnn memory format flow of value
-  inline mkldnn::memory::desc createMD(
+   */
-      mkldnn::memory::dims dims,
+  virtual void printValueFormatFlow() {
-      mkldnn::memory::format fmt,
+    if (inVal_ && outVal_) {
-      mkldnn::memory::data_type type = mkldnn::memory::data_type::f32) {
+      VLOG(MKLDNN_FMTS) << "value format flow --- " << inVal_->getFormat()
-    // TODO(TJ): isFmtSuppoted(fmt)
+                        << " >>> " << outVal_->getFormat();
-    return mkldnn::memory::desc(dims, type, fmt);
+    }
+  }
+  /**
+   * Print the mkldnn memory format flow of grad
+   */
+  virtual void printGradFormatFlow() {
+    if (inGrad_ && outGrad_) {
+      VLOG(MKLDNN_FMTS) << "grad format flow --- " << inGrad_->getFormat()
+                        << " <<< " << outGrad_->getFormat();
+    }
+  }
+protected:
+  /**
+   * copy image size and sequence info to other device
+   * @note: can not directly use Layer::copyOutputToOtherDevice since here only
+   *        copy base info and do not copy data value
+   */
+  void copyOutputInfoToOtherDevice() {
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      outputOtherDevice_[i].setFrameHeight(output_.getFrameHeight());
+      outputOtherDevice_[i].setFrameWidth(output_.getFrameWidth());
+      outputOtherDevice_[i].sequenceStartPositions =
+          output_.sequenceStartPositions;
+      outputOtherDevice_[i].subSequenceStartPositions =
+          output_.subSequenceStartPositions;
+      outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
+    }
+  }
+  /**
+   * If input only has MKLDNN device.
+   * Otherwise, only support the previous layer using CPU device.
+   */
+  bool inputIsOnlyMKLDNN(int index = 0) {
+    int prevDevice = getPrev(index)->getDeviceId();
+    if (prevDevice == MKLDNN_DEVICE) {
+      return true;
+    } else {
+      // do not support GPU yet
+      CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
+      return false;
+    }
+  }
+  /**
+   * If output only has MKLDNN device.
+   * Otherwise, other devices should only using CPU device.
+   */
+  bool outputIsOnlyMKLDNN() {
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
+          << "Only support other device is CPU yet";
+    }
+    return outputOtherDevice_.size() == 0;
+  }
+  /**
+   * Sync input value data
+   */
+  void syncInputValue() {
+    if (inputIsOnlyMKLDNN()) {
+      return;
+    }
+    real* iData = getInputValue(0, CPU_DEVICE)->getData();
+    // update input data
+    // since it might be changed if this is after data layer
+    inVal_->updateData(iData);
+  }
+  /**
+   * Sync output grad data
+   */
+  void syncOutputGrad() {
+    if (outputIsOnlyMKLDNN()) {
+      return;
+    }
+    // update diff
+    real* oDiff = getOutput(CPU_DEVICE).grad->getData();
+    outGrad_->updateData(oDiff);
+  }
+  /**
+   * Set deviceId of this layer.
+   */
+  void setDevice(int id) { deviceId_ = id; }
+  /**
+   * Set deviceId of the params used in this layer.
+   */
+  void setParamsDevice(int id, const ParameterMap& parameterMap) {
+    for (auto& inputConfig : config_.inputs()) {
+      if (inputConfig.has_input_parameter_name()) {
+        ParameterPtr parameter;
+        std::string name = inputConfig.input_parameter_name();
+        CHECK(mapGet(name, parameterMap, &parameter))
+            << "Cannot find input parameter " << name << " for layer "
+            << getName();
+        parameter->setDevice(id);
+      }
+    }
+    if (config_.has_bias_parameter_name()) {
+      ParameterPtr parameter;
+      std::string name = config_.bias_parameter_name();
+      CHECK(mapGet(name, parameterMap, &parameter))
+          << "Cannot find bias parameter " << name << " for layer "
+          << getName();
+      parameter->setDevice(id);
+    }
  }
 };

--- a/paddle/gserver/layers/SequenceSliceLayer.cpp
+++ b/paddle/gserver/layers/SequenceSliceLayer.cpp
@@ -130,6 +130,8 @@ void SequenceSliceLayer::calSelectedRows(const MatrixPtr starts,
  CHECK(starts || ends) << "At least one of the start or end indices "
                        << "should be given.";
+  bool hasSubseq = getInput(0).hasSubseq();
  outSeqStartPos_.resize(1, 0);
  outSubSeqStartPos_.resize(1, 0);
  selectedRows_.clear();
@@ -151,14 +153,13 @@ void SequenceSliceLayer::calSelectedRows(const MatrixPtr starts,
        int seqLen = endPos - begPos + 1;
        CHECK_GT(seqLen, 0U);
        for (int m = begPos; m <= endPos; ++m) selectedRows_.push_back(m);
-        inputSeqInfoVec_.size() > 1
+        hasSubseq
            ? outSubSeqStartPos_.push_back(outSubSeqStartPos_.back() + seqLen)
            : outSeqStartPos_.push_back(outSeqStartPos_.back() + seqLen);
      }
      rowIdx++;
    }
-    if (inputSeqInfoVec_.size() > 1)
+    if (hasSubseq) outSeqStartPos_.push_back(outSubSeqStartPos_.back());
-      outSeqStartPos_.push_back(outSubSeqStartPos_.back());
  }
  if (useGpu_) {
@@ -175,7 +176,7 @@ void SequenceSliceLayer::calSelectedRows(const MatrixPtr starts,
  output_.sequenceStartPositions->copyFrom(
      outSeqStartPos_.data(), outSeqStartPos_.size(), false);
-  if (inputSeqInfoVec_.size() > 1) {
+  if (hasSubseq) {
    ICpuGpuVector::resizeOrCreate(
        output_.subSequenceStartPositions, outSubSeqStartPos_.size(), false);
    output_.subSequenceStartPositions->copyFrom(
@@ -204,10 +205,11 @@ void SequenceSliceLayer::forward(PassType passType) {
    copySliceIdsToCpu();
  }
-  // calculate the selected row indices in a batch,
+  /*
-  // and build the output sequence information.
+   * calculate the selected row indices in a batch, and build the output
-  calSelectedRows(startIdsOnCpu_ ? startIdsOnCpu_ : nullptr,
+   * sequence information.
-                  endIdsOnCpu_ ? endIdsOnCpu_ : nullptr);
+   */
+  calSelectedRows(startIdsOnCpu_, endIdsOnCpu_);
  resetOutput(selectedRows_.size(), getSize());

--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -850,9 +850,27 @@ TEST(Layer, square_error_weighted) {
  }
 }
+TEST(Layer, huber_regression_loss) {
+  TestConfig config;
+  config.layerConfig.set_type("huber_regression");
+  config.biasSize = 0;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  for (auto useGpu : {false, true}) {
+    for (auto delta : {1, 3, 5}) {
+      config.layerConfig.set_delta(delta);
+      testLayerGrad(config, "huber_regression", 100, /* trans */ false, useGpu);
+    }
+  }
+}
 TEST(Layer, huber_two_class) {
  TestConfig config;
-  config.layerConfig.set_type("huber");
+  config.layerConfig.set_type("huber_classification");
  config.biasSize = 0;
  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
@@ -861,7 +879,7 @@ TEST(Layer, huber_two_class) {
  config.layerConfig.add_inputs();
  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "huber", 100, /* trans */ false, useGpu);
+    testLayerGrad(config, "huber_two_class", 100, /* trans */ false, useGpu);
  }
 }

--- a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
+++ b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
@@ -30,6 +30,8 @@ const int MAX_SEQ_NUM = 17;
 const int MAX_SEQ_LEN = 23;
 const int MAX_BEAM_SIZE = 13;
+const size_t SEED = (size_t)(time(NULL));
 vector<real> randSampling(real range, int n) {
  CHECK_GE(range, n);
  vector<real> num(range);
@@ -46,7 +48,7 @@ void genSeqInfo(vector<int>& seqStartPos, vector<int>& subSeqStartPos) {
  seqStartPos.resize(1, 0);
  subSeqStartPos.resize(1, 0);
-  srand((size_t)(time(NULL)));
+  srand(SEED);
  int seqNum = 1 + (rand() % MAX_SEQ_NUM);
  for (int i = 0; i < seqNum; ++i) {
    int subSeqNum = 1 + (rand() % MAX_SEQ_NUM);

--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
@@ -48,7 +48,13 @@ public:
   */
  virtual void* alloc(size_t size) {
    void* ptr;
+#ifdef PADDLE_USE_MKLDNN
+    // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
+    // memory alignment
+    CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0);
+#else
    CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0);
+#endif
    CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
    return ptr;
  }

--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -14,6 +14,17 @@
 #
 file(GLOB MATH_HEADERS . *.h)
 file(GLOB MATH_SOURCES . *.cpp)
+if(NOT WITH_MKLDNN)
+    set(DNN_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.h")
+    set(DNN_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.cpp")
+    list(REMOVE_ITEM MATH_HEADERS "${DNN_HEADER}")
+    list(REMOVE_ITEM MATH_SOURCES "${DNN_SOURCE}")
+    message(STATUS "Skip compiling with MKLDNNMatrix")
+else()
+    message(STATUS "Compile with MKLDNNMatrix")
+endif()
 set(MATH_SOURCES
    "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu"
    "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu"

--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "MKLDNNMatrix.h"
+using namespace mkldnn;  // NOLINT
+namespace paddle {
+MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
+  memory::desc md = pd.desc();
+  size_t ndims = md.data.ndims;
+  int* dims = md.data.dims;
+  CHECK(ndims > 0) << "Input dims should not be empty";
+  size_t cnts = 1;
+  for (size_t i = 0; i < ndims; ++i) {
+    cnts *= dims[i];
+  }
+  if (m == nullptr) {
+    size_t height = dims[0];
+    size_t width = cnts / dims[0];
+    m = Matrix::create(height, width, false, false);
+  }
+  CHECK(m) << " Matrix should not be empty";
+  CpuMatrixPtr cpuMatrix = std::dynamic_pointer_cast<CpuMatrix>(m);
+  CHECK(cpuMatrix) << "Only support create from CPU matrix yet";
+  CHECK_EQ(cnts, m->getElementCnt()) << "Count size does not match";
+  return std::make_shared<MKLDNNMatrix>(
+      m->getData(), m->getHeight(), m->getWidth(), pd);
+}
+MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
+                                     memory::dims dims,
+                                     memory::format fmt,
+                                     engine& eg,
+                                     mkldnn::memory::data_type dtype) {
+  return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg));
+}
+void MKLDNNMatrix::reorderDataFrom(const MKLDNNMatrixPtr& m,
+                                   memory::format srcFmt,
+                                   memory::dims targetDim) {
+  memory::format dstFmt = getFormat();
+  if (srcFmt == dstFmt) {
+    return;
+  }
+  CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
+  reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
+}
+void MKLDNNMatrix::reorderDataTo(const MKLDNNMatrixPtr& m,
+                                 memory::format dstFmt,
+                                 memory::dims targetDim) {
+  memory::format srcFmt = getFormat();
+  if (srcFmt == dstFmt) {
+    return;
+  }
+  CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
+  reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
+}
+void MKLDNNMatrix::reorderOnce(void* srcData,
+                               void* dstData,
+                               memory::format srcFmt,
+                               memory::format dstFmt,
+                               memory::dims dm) {
+  CHECK(srcData);
+  CHECK(dstData);
+  MatrixPtr tmpSrc;
+  if (dstData == srcData) {
+    // inplace data
+    size_t sz = 1;
+    for (size_t i = 0; i < dm.size(); ++i) {
+      sz *= dm[i];
+    }
+    tmpSrc = Matrix::create(sz, 1, false, false);
+    tmpSrc->copyFrom((real*)srcData, sz);
+    srcData = tmpSrc->getData();
+  }
+  auto dtype = this->getDtype();
+  auto srcMD = memory::desc(dm, dtype, srcFmt);
+  auto dstMD = memory::desc(dm, dtype, dstFmt);
+  auto eg = this->getEngine();
+  auto src = memory(memory::primitive_desc(srcMD, eg), srcData);
+  auto dst = memory(memory::primitive_desc(dstMD, eg), dstData);
+  auto r = reorder(src, dst);
+  stream(stream::kind::eager).submit({r}).wait();
+}
+void MKLDNNMatrix::downSpatial() {
+  int fmt = getFormat();
+  if (!(fmt == memory::format::nchw || fmt == memory::format::oihw)) {
+    // only support nchw and oihw yet, later can support more like nhwc, ihwo
+    return;
+  }
+  // TODO(TJ): change H(height) and W(width) if support nhwc or more
+  const int H = 2, W = 3;
+  memory::dims srcDims = getDims();
+  if (srcDims[H] != 1 || srcDims[W] != 1) {
+    // can not down spatial
+    return;
+  }
+  memory::dims dstDims = memory::dims{srcDims[0], srcDims[1]};
+  memory::format dstFmt;
+  switch (fmt) {
+    case memory::format::nchw:
+      dstFmt = memory::format::nc;
+      break;
+    case memory::format::oihw:
+      dstFmt = memory::format::oi;
+      break;
+    default:
+      LOG(FATAL) << "unsupported format";
+  }
+  memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
+  memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
+  mkldnn_primitive_t result;
+  mkldnn::error::wrap_c_api(
+      mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
+      "could not create a memory primitive");
+  reset(result);
+  set_data_handle(getData());
+}
+}  // namespace paddle
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <vector>
+#include "Matrix.h"
+#include "mkldnn.hpp"
+#include "paddle/parameter/Parameter.h"
+namespace paddle {
+class MKLDNNMatrix;
+typedef std::shared_ptr<MKLDNNMatrix> MKLDNNMatrixPtr;
+/**
+ * @brief MKLDNN Matrix.
+ *
+ */
+class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
+public:
+  MKLDNNMatrix(real* data,
+               size_t height,
+               size_t width,
+               mkldnn::memory::primitive_desc pd)
+      : CpuMatrix(data, height, width, false), mkldnn::memory(pd, data) {}
+  ~MKLDNNMatrix() {}
+  /**
+   * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc
+   */
+  static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc pd);
+  /**
+   * Create MKLDNNMatrix from a MatrixPtr and memory details info
+   */
+  static MKLDNNMatrixPtr create(
+      MatrixPtr m,
+      mkldnn::memory::dims dims,
+      mkldnn::memory::format fmt,
+      mkldnn::engine& eg,
+      mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
+public:
+  /**
+   * Reorder this MKLDNNMatrix from other format.
+   * Support inplace reorder.
+   * @note: this function would only reorder the data layout.
+   *        will NOT change this original dim or format info
+   */
+  void reorderDataFrom(const MKLDNNMatrixPtr& m,
+                       memory::format srcFmt,
+                       memory::dims targetDim);
+  /**
+   * Reorder this MKLDNNMatrix to other format.
+   * Support inplace reorder.
+   * @note: this function would only reorder the data layout.
+   *        will NOT change the dst dim or format info
+   */
+  void reorderDataTo(const MKLDNNMatrixPtr& m,
+                     memory::format dstFmt,
+                     memory::dims targetDim);
+  /**
+   * Dimensionality reduction.
+   * Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1
+   */
+  void downSpatial();
+  /**
+   * Update the memory data handle.
+   * Caution: This will not check the buffer size of the data,
+   *          it should be coverd by user.
+   */
+  void updateData(void* data) { set_data_handle(data); }
+  /**
+   * Get primitive descriptor.
+   */
+  mkldnn::memory::primitive_desc getPrimitiveDesc() {
+    return this->get_primitive_desc();
+  }
+  /**
+   * Get memory descriptor.
+   */
+  mkldnn::memory::desc getMemoryDesc() { return getPrimitiveDesc().desc(); }
+  /**
+   * Get dimensions.
+   */
+  mkldnn::memory::dims getDims() {
+    mkldnn::memory::desc md = getMemoryDesc();
+    const int* src = md.data.dims;
+    int ndims = md.data.ndims;
+    mkldnn::memory::dims dst;
+    dst.resize(ndims);
+    for (int i = 0; i < ndims; ++i) {
+      dst[i] = src[i];
+    }
+    return dst;
+  }
+  /**
+   * Get format.
+   */
+  mkldnn::memory::format getFormat() {
+    return (mkldnn::memory::format)(getMemoryDesc().data.format);
+  }
+  /**
+   * Get memory data type.
+   */
+  mkldnn::memory::data_type getDtype() {
+    return (mkldnn::memory::data_type)(getMemoryDesc().data.data_type);
+  }
+  /**
+   * Get engine.
+   */
+  mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); }
+protected:
+  /**
+   * Do reorder once.
+   * Can support inplace.
+   */
+  void reorderOnce(void* srcData,
+                   void* dstData,
+                   memory::format srcFmt,
+                   memory::format dstFmt,
+                   memory::dims dm);
+};
+}  // namespace paddle
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -47,6 +47,7 @@ cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 op_library(gather_op SRCS gather_op.cc gather_op.cu)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
+op_library(scatter_op SRCS scatter_op.cc scatter_op.cu)
 cc_library(net_op SRCS net_op.cc DEPS op_registry)
 cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)

--- a/paddle/operators/net_op.cc
+++ b/paddle/operators/net_op.cc
@@ -31,10 +31,13 @@ void NetOp::CompleteAddOp(bool calc) {
  for (auto& op : ops_) {
    for (auto& ipt : op->Inputs()) {
      for (auto& var_name : ipt.second) {
-        if (!Contains(output_set, var_name)) {  // Not other op's output
+        // If input variable has been in output set, then it will be
-          input_set.insert(var_name);
+        // added into intermediate_outputs_. Otherwise, it will be
-        } else {
+        // added into input set.
+        if (Contains(output_set, var_name)) {
          intermediate_outputs_.insert(var_name);
+        } else {
+          input_set.insert(var_name);
        }
      }
    }

--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/scatter_op.h"
+#include "paddle/framework/ddim.h"
+namespace paddle {
+namespace operators {
+class ScatterOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Index")->dims().size(), 1,
+                      "Update Index should be 1-D.");
+    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Ref")->dims().size(),
+                      ctx.Input<Tensor>("Updates")->dims().size(),
+                      "Reference and Updates should have the same shape size");
+    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Updates")->dims()[0],
+                      ctx.Input<Tensor>("Index")->dims()[0],
+                      "Updates and Index should have same batch-size.");
+    framework::DDim data_dim(ctx.Input<Tensor>("Updates")->dims());
+    for (int i = 1; i < data_dim.size(); ++i)
+      PADDLE_ENFORCE_EQ(data_dim[i], ctx.Input<Tensor>("Updates")->dims()[i]);
+    ctx.Output<Tensor>("Out")->Resize(ctx.Input<Tensor>("Ref")->dims());
+  }
+};
+class ScatterGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
+    auto *Updates = ctx.Input<Tensor>("Updates");
+    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *Ref = ctx.Input<Tensor>("Ref");
+    dRef->Resize(Ref->dims());
+    dUpdates->Resize(Updates->dims());
+  }
+};
+class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ScatterOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Ref", "The source input of scatter op");
+    AddInput("Index",
+             "The index input of scatter op where Ref will be updated");
+    AddInput("Updates", "The updated value of updates op");
+    AddOutput("Out", "The output of add op");
+    AddComment(R"DOC(
+Scatter Operator by selecting from the first axis, 
+Out = Ref
+Out[Index] = Ref[Index] + Updates
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad,
+            ops::ScatterGradOp);
+REGISTER_OP_CPU_KERNEL(scatter,
+                       ops::ScatterOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    scatter_grad,
+    ops::ScatterGradientOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/scatter_op.cu
+++ b/paddle/operators/scatter_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#define EIGEN_USE_GPU
+#include "paddle/operators/scatter_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(scatter,
+                       ops::ScatterOpKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/scatter_op.h
+++ b/paddle/operators/scatter_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "gather.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "scatter.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+template <typename Place, typename T>
+class ScatterOpKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *Ref = ctx.Input<Tensor>("Ref");
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Updates = ctx.Input<Tensor>("Updates");
+    auto *Out = ctx.Output<Tensor>("Out");
+    // In place output: Out = Ref, Out[Index] += Updates
+    Out->ShareDataWith<T>(*Ref);
+    // Apply ScatterUpdate: Out[index] += Updates[:]
+    ScatterUpdate<T>(ctx.GetPlace(), Updates, Index, Out);
+  }
+};
+template <typename Place, typename T>
+class ScatterGradientOpKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    // In place gradient: dRef = dO
+    dRef->ShareDataWith<T>(*dOut);
+    dUpdates->mutable_data<T>(ctx.GetPlace());
+    // Gradient by Gather: dUpdates += dO[Index]
+    Gather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -281,7 +281,11 @@ public:
  /**
   * @brief Set the format in header.
   */
-  void setHeaderFormat(int32_t fmt) { headerFormat_ = fmt; }
+  void setHeaderFormat(int32_t fmt) {
+    CHECK(isHeaderFormatSupported(fmt)) << "Unsupported format version: "
+                                        << fmt;
+    headerFormat_ = fmt;
+  }
  /**
   * @brief  Parameter Update Hook.

--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
@@ -4,6 +4,7 @@ cc_library(paddle_pybind SHARED
    DEPS pybind python backward
    sgd_op
    gather_op
+    scatter_op
    add_op
    mul_op
    rowwise_add_op

--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -48,6 +48,7 @@ USE_OP_ITSELF(identity);
 USE_OP(minus);
 USE_CPU_ONLY_OP(gather);
 USE_OP(pad);
+USE_CPU_ONLY_OP(scatter);
 namespace paddle {
 namespace framework {

--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -38,7 +38,7 @@ Configuring cmake in /paddle/build ...
      -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
      -DCUDNN_ROOT=/usr/
      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
-      -DWITH_TESTING=${WITH_TESTING:-OFF}
+      -DWITH_TESTING=${WITH_TESTING:-ON}
      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
 ========================================
 EOF
@@ -56,19 +56,18 @@ cmake .. \
      -DWITH_C_API=${WITH_C_API:-OFF} \
      -DWITH_PYTHON=${WITH_PYTHON:-ON} \
      -DCUDNN_ROOT=/usr/ \
-      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} \
+      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
-      -DWITH_TESTING=${WITH_TESTING:-OFF} \
+      -DWITH_TESTING=${WITH_TESTING:-ON} \
      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
 cat <<EOF
 ============================================
 Building in /paddle/build ...
-   Build unit tests: ${WITH_TESTING:-OFF}
 ============================================
 EOF
 make -j `nproc`
-if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
+if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
 cat <<EOF
 ========================================
 Running unit tests ...

--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -499,6 +499,9 @@ message LayerConfig {
  optional int32 axis = 54 [ default = 2 ];
  repeated uint32 offset = 55;
  repeated uint32 shape = 56;
+  // for HuberRegressionLoss
+  optional double delta = 57 [ default = 1.0 ];
 }
 message EvaluatorConfig {

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2274,7 +2274,7 @@ define_cost('PnpairValidation', 'pnpair-validation')
 define_cost('SumOfSquaresCostLayer', 'square_error')
 define_cost('MultiBinaryLabelCrossEntropy', 'multi_binary_label_cross_entropy')
 define_cost('SoftBinaryClassCrossEntropy', 'soft_binary_class_cross_entropy')
-define_cost('HuberTwoClass', 'huber')
+define_cost('HuberTwoClassification', 'huber_classification')
 define_cost('SumCost', 'sum_cost')
 define_cost('SmoothL1Cost', 'smooth_l1')
@@ -2336,6 +2336,17 @@ class LambdaCost(LayerBase):
        self.config.max_sort_size = max_sort_size
+@config_layer('huber_regression')
+class HuberRegressionLoss(LayerBase):
+    def __init__(self, name, inputs, delta=1., coeff=1., device=None):
+        super(HuberRegressionLoss, self).__init__(
+            name, 'huber_regression', 1, inputs=inputs, device=device)
+        config_assert(
+            len(self.inputs) == 2, 'HuberRegression must have 2 inputs')
+        self.config.delta = delta
+        self.config.coeff = coeff
 @config_layer('nce')
 class NCELayer(LayerBase):
    def __init__(self,

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -110,7 +110,8 @@ __all__ = [
    'sum_cost',
    'rank_cost',
    'lambda_cost',
-    'huber_cost',
+    'huber_regression_cost',
+    'huber_classification_cost',
    'block_expand_layer',
    'maxout_layer',
    'out_prod_layer',
@@ -220,7 +221,8 @@ class LayerType(object):
    RANK_COST = 'rank-cost'
    LAMBDA_COST = 'lambda_cost'
-    HUBER = 'huber'
+    HUBER_REGRESSION = 'huber_regression'
+    HUBER_CLASSIFICATION = 'huber_classification'
    CROSS_ENTROPY = 'multi-class-cross-entropy'
    CROSS_ENTROPY_WITH_SELFNORM = 'multi_class_cross_entropy_with_selfnorm'
    SOFT_BIN_CLASS_CROSS_ENTROPY = 'soft_binary_class_cross_entropy'
@@ -5644,16 +5646,77 @@ def sum_cost(input, name=None, layer_attr=None):
 @wrap_name_default()
 @layer_support()
-def huber_cost(input, label, name=None, coeff=1.0, layer_attr=None):
+def huber_regression_cost(input,
+                          label,
+                          name=None,
+                          delta=1.0,
+                          coeff=1.0,
+                          layer_attr=None):
+    """
+    In statistics, the Huber loss is a loss function used in robust regression, 
+    that is less sensitive to outliers in data than the squared error loss. 
+    Given a prediction f(x), a label y and :math:`\delta`, the loss function 
+    is defined as:
+    .. math:
+       loss = 0.5*\left ( y-f(x) \right )^2, \left | y-f(x) \right |\leq \delta
+       loss = \delta \left | y-f(x) \right |-0.5\delta ^2, otherwise
+    The example usage is:
+    .. code-block:: python
+       cost = huber_regression_cost(input=input_layer, label=label_layer)
+    :param input: The first input layer.
+    :type input: LayerOutput.
+    :param label: The input label.
+    :type input: LayerOutput.
+    :param name: The name of this layers. It is not necessary.
+    :type name: None|basestring.
+    :param delta: The difference between the observed and predicted values.
+    :type delta: float.
+    :param coeff: The coefficient affects the gradient in the backward.
+    :type coeff: float.
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :return: LayerOutput object.
+    :rtype: LayerOutput.
+    """
+    assert isinstance(input, LayerOutput)
+    Layer(
+        name=name,
+        type=LayerType.HUBER_REGRESSION,
+        inputs=[input.name, label.name],
+        delta=delta,
+        coeff=coeff,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.HUBER_REGRESSION, parents=[input, label], size=1)
+@wrap_name_default()
+@layer_support()
+def huber_classification_cost(input,
+                              label,
+                              name=None,
+                              coeff=1.0,
+                              layer_attr=None):
    """
-    A loss layer for huber loss.
+    For classification purposes, a variant of the Huber loss called modified Huber 
+    is sometimes used. Given a prediction f(x) (a real-valued classifier score) and 
+    a true binary class label :math:`y\in \left \{-1, 1 \right \}`, the modified Huber 
+    loss is defined as:
+    .. math:
+       loss = \max \left ( 0, 1-yf(x) \right )^2, yf(x)\geq 1 
+       loss = -4yf(x), \text{otherwise}
    The example usage is:
    .. code-block:: python
-       cost = huber_cost(input=input_layer,
+       cost = huber_classification_cost(input=input_layer, label=label_layer)
-                         label=label_layer)
    :param input: The first input layer.
    :type input: LayerOutput.
@@ -5673,11 +5736,12 @@ def huber_cost(input, label, name=None, coeff=1.0, layer_attr=None):
        assert input.size == 1
    Layer(
        name=name,
-        type=LayerType.HUBER,
+        type=LayerType.HUBER_CLASSIFICATION,
        inputs=[input.name, label.name],
        coeff=coeff,
        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(name, LayerType.HUBER, parents=[input, label], size=1)
+    return LayerOutput(
+        name, LayerType.HUBER_CLASSIFICATION, parents=[input, label], size=1)
 @wrap_name_default()

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
@@ -167,6 +167,20 @@ layers {
  softmax_selfnorm_alpha: 0.1
  coeff: 1.0
 }
+layers {
+  name: "__huber_regression_cost_0__"
+  type: "huber_regression"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+  }
+  inputs {
+    input_layer_name: "labels"
+  }
+  coeff: 1.0
+  delta: 1.0
+}
 layers {
  name: "huber_probs"
  type: "data"
@@ -180,8 +194,8 @@ layers {
  active_type: ""
 }
 layers {
-  name: "__huber_cost_0__"
+  name: "__huber_classification_cost_0__"
-  type: "huber"
+  type: "huber_classification"
  size: 1
  active_type: ""
  inputs {
@@ -300,7 +314,8 @@ output_layer_names: "__rank_cost_0__"
 output_layer_names: "__lambda_cost_0__"
 output_layer_names: "__cross_entropy_0__"
 output_layer_names: "__cross_entropy_with_selfnorm_0__"
-output_layer_names: "__huber_cost_0__"
+output_layer_names: "__huber_regression_cost_0__"
+output_layer_names: "__huber_classification_cost_0__"
 output_layer_names: "__multi_binary_label_cross_entropy_0__"
 output_layer_names: "__sum_cost_0__"
 output_layer_names: "__nce_layer_0__"
@@ -324,9 +339,10 @@ sub_models {
  layer_names: "__lambda_cost_0__"
  layer_names: "__cross_entropy_0__"
  layer_names: "__cross_entropy_with_selfnorm_0__"
+  layer_names: "__huber_regression_cost_0__"
  layer_names: "huber_probs"
  layer_names: "huber_label"
-  layer_names: "__huber_cost_0__"
+  layer_names: "__huber_classification_cost_0__"
  layer_names: "__multi_binary_label_cross_entropy_0__"
  layer_names: "__sum_cost_0__"
  layer_names: "__nce_layer_0__"
@@ -349,7 +365,8 @@ sub_models {
  output_layer_names: "__lambda_cost_0__"
  output_layer_names: "__cross_entropy_0__"
  output_layer_names: "__cross_entropy_with_selfnorm_0__"
-  output_layer_names: "__huber_cost_0__"
+  output_layer_names: "__huber_regression_cost_0__"
+  output_layer_names: "__huber_classification_cost_0__"
  output_layer_names: "__multi_binary_label_cross_entropy_0__"
  output_layer_names: "__sum_cost_0__"
  output_layer_names: "__nce_layer_0__"

--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
@@ -33,7 +33,9 @@ outputs(
        input=probs, label=xe_label),
    cross_entropy_with_selfnorm(
        input=probs, label=xe_label),
-    huber_cost(
+    huber_regression_cost(
+        input=seq_in, label=labels),
+    huber_classification_cost(
        input=data_layer(
            name='huber_probs', size=1),
        label=data_layer(

--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -14,6 +14,7 @@ py_test(test_sigmoid_op SRCS test_sigmoid_op.py)
 py_test(test_softmax_op SRCS test_softmax_op.py)
 py_test(test_cross_entropy_op SRCS test_cross_entropy_op.py)
 py_test(test_gather_op SRCS test_gather_op.py)
+py_test(test_scatter_op SRCS test_scatter_op.py)
 py_test(test_fill_zeros_like_op SRCS test_fill_zeros_like_op.py)
 py_test(gradient_checker SRCS gradient_checker.py)

--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
@@ -32,7 +32,8 @@ def get_numeric_gradient(op,
                         output_name,
                         input_to_check,
                         delta=0.005,
-                         local_scope=None):
+                         local_scope=None,
+                         in_place=False):
    """
    Get Numeric Gradient for an operator's input.
@@ -81,6 +82,11 @@ def get_numeric_gradient(op,
    def product(dim):
        return reduce(lambda a, b: a * b, dim, 1)
+    def restore_inputs():
+        for var_name in input_values:
+            tensor_ = local_scope.find_var(var_name).get_tensor()
+            tensor_.set(numpy.copy(input_values[var_name]), core.CPUPlace())
    # get the input tensor that we want to get it's numeric gradient.
    tensor_to_check = local_scope.find_var(input_to_check).get_tensor()
    tensor_size = product(tensor_to_check.get_dims())
@@ -90,6 +96,8 @@ def get_numeric_gradient(op,
    # we only compute gradient of one element each time.
    # we use a for loop to compute the gradient of every element.
    for i in xrange(tensor_size):
+        if in_place:
+            restore_inputs()
        # get one input element throw it's index i.
        origin = tensor_to_check.get_float_element(i)
@@ -99,6 +107,8 @@ def get_numeric_gradient(op,
        y_pos = get_output()
        # plus delta to this element, run op and get the sum of the result tensor.
+        if in_place:
+            restore_inputs()
        x_neg = origin - delta
        tensor_to_check.set_float_element(i, x_neg)
        y_neg = get_output()
@@ -251,13 +261,14 @@ class GradientChecker(unittest.TestCase):
                   output_name,
                   no_grad_set=None,
                   only_cpu=False,
+                   in_place=False,
                   max_relative_error=0.005):
        """
        :param forward_op: used to create backward_op
        :param input_vars: numpy value of input variable. The following
            computation will use these variables.
        :param inputs_to_check: inputs var names that should check gradient.
-        :param output_name: output name that used to
+        :param output_name: the output variable name of forward network.
        :param max_relative_error: The relative tolerance parameter.
        :param no_grad_set: used when create backward ops
        :param only_cpu: only compute and check gradient on cpu kernel.
@@ -283,7 +294,8 @@ class GradientChecker(unittest.TestCase):
        # get numerical gradients
        numeric_grads = [
-            get_numeric_gradient(forward_op, input_vars, output_name, name)
+            get_numeric_gradient(
+                forward_op, input_vars, output_name, name, in_place=in_place)
            for name in inputs_to_check
        ]

--- a/python/paddle/v2/framework/tests/mnist.py
+++ b/python/paddle/v2/framework/tests/mnist.py
@@ -181,7 +181,7 @@ images = data_layer(name='pixel', dims=[BATCH_SIZE, 784])
 labels = data_layer(name='label', dims=[BATCH_SIZE])
 fc1 = fc_layer(net=forward_net, input=images, size=100, act="sigmoid")
 fc2 = fc_layer(net=forward_net, input=fc1, size=100, act="sigmoid")
-predict = fc_layer(net=forward_net, input=fc2, size=100, act="softmax")
+predict = fc_layer(net=forward_net, input=fc2, size=10, act="softmax")
 cost = cross_entropy_layer(net=forward_net, input=predict, label=labels)
 init_net.complete_add_op(True)

--- a/python/paddle/v2/framework/tests/test_gather_op.py
+++ b/python/paddle/v2/framework/tests/test_gather_op.py
@@ -21,12 +21,9 @@ class TestGatherOp(unittest.TestCase):
 class TestGatherGradOp(GradientChecker):
    def test_gather_grad(self):
-        print 'creating op'
        op = create_op("gather")
-        print 'creating op done'
        xnp = numpy.random.random((10, 20)).astype("float32")
        inputs = {'X': xnp, 'Index': numpy.array([1, 3, 5]).astype("int32")}
-        print 'correct before check gradient'
        self.check_grad(op, inputs, set("X"), "Out")

--- a/python/paddle/v2/framework/tests/test_scatter_op.py
+++ b/python/paddle/v2/framework/tests/test_scatter_op.py
+import unittest
+from op_test_util import OpTestMeta
+from gradient_checker import GradientChecker, create_op
+import numpy
+import paddle.v2.framework.core as core
+from paddle.v2.framework.op import Operator
+class TestScatterOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+    def setUp(self):
+        self.type = "scatter"
+        ref_np = numpy.ones((3, 3)).astype("float32")
+        index_np = numpy.array([1, 2]).astype("int32")
+        updates_np = numpy.random.random((2, 3)).astype("float32")
+        output_np = numpy.copy(ref_np)
+        output_np[index_np] += updates_np
+        self.inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np}
+        self.outputs = {'Out': output_np}
+class TestScatterGradOp(GradientChecker):
+    def test_scatter_grad(self):
+        op = create_op("scatter")
+        # test data setup
+        ref_np = numpy.ones((3, 10)).astype("float32")
+        index_np = numpy.array([1, 2]).astype("int32")
+        updates_np = numpy.random.random((2, 10)).astype("float32")
+        output_np = numpy.copy(ref_np)
+        output_np[index_np] += updates_np
+        inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np}
+        self.check_grad(
+            op, inputs, set(["Updates", "Ref"]), "Out", in_place=True)
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
@@ -14,6 +14,7 @@
 import numpy as np
 from paddle.proto.ParameterConfig_pb2 import ParameterConfig
+from collections import OrderedDict
 import paddle.trainer.config_parser as cp
 import struct
 import tarfile
@@ -42,9 +43,25 @@ def create(layers):
 class Parameters(object):
    """
-    Parameters is a dictionary contains Paddle's parameter. The key of
+    `Parameters` manages all the learnable parameters in a neural network.
-    Parameters is the name of parameter. The value of Parameters is a plain
+    It stores parameters' information in an OrderedDict. The key is
-    :code:`numpy.ndarry` .
+    the name of a parameter, and value is a parameter's configuration(in
+    protobuf format), such as initialization mean and std, its size, whether it
+    is a static parameter, and so on.
+    :param __param_conf__: store the configurations of learnable parameters in
+        the network in an OrderedDict. Parameter is added one by one into the
+        dict by following their created order in the network: parameters of
+        the previous layers in a network are careted first. You can visit the
+        parameters from bottom to top by iterating over this dict.
+    :type __param_conf__: OrderedDict
+    :param __gradient_machines__: all of the parameters in a neural network are
+        appended to a PaddlePaddle gradient machine, which is used internally to
+        copy parameter values between C++ and Python end.
+    :type __gradient_machines__: list
+    :param __tmp_params__: a dict to store dummy parameters if no
+        __gradient_machines__ is appended to `Parameters`.
+    :type __tmp_params__: dict
    Basically usage is
@@ -62,7 +79,7 @@ class Parameters(object):
    """
    def __init__(self):
-        self.__param_conf__ = dict()
+        self.__param_conf__ = OrderedDict()
        self.__gradient_machines__ = []
        self.__tmp_params__ = dict()
@@ -231,6 +248,9 @@ class Parameters(object):
        :rtype: np.ndarray
        """
        import py_paddle.swig_paddle as api
+        if self.__param_conf__[key].is_static:
+            return np.zeros(self.__param_conf__[key].size, dtype=np.float32)
        return self.__getter_inner(key, api.PARAMETER_GRADIENT)
    def set(self, parameter_name, value):
@@ -250,7 +270,7 @@ class Parameters(object):
        append gradient machine to parameters. This method is used internally in
        Trainer.train.
-        :param gradient_machine: Paddle C++ GradientMachine object.
+        :param gradient_machine: PaddlePaddle C++ GradientMachine object.
        :type gradient_machine: api.GradientMachine
        :return:
        """

--- a/python/paddle/v2/tests/test_layer.py
+++ b/python/paddle/v2/tests/test_layer.py
@@ -141,12 +141,13 @@ class CostLayerTest(unittest.TestCase):
        cost8 = layer.rank_cost(left=score, right=score, label=score)
        cost9 = layer.lambda_cost(input=inference, score=score)
        cost10 = layer.sum_cost(input=inference)
-        cost11 = layer.huber_cost(input=score, label=label)
+        cost11 = layer.huber_regression_cost(input=score, label=label)
+        cost12 = layer.huber_classification_cost(input=score, label=label)
        print layer.parse_network([cost1, cost2])
        print layer.parse_network([cost3, cost4])
        print layer.parse_network([cost5, cost6])
-        print layer.parse_network([cost7, cost8, cost9, cost10, cost11])
+        print layer.parse_network([cost7, cost8, cost9, cost10, cost11, cost12])
        crf = layer.crf(input=inference, label=label)
        crf_decoding = layer.crf_decoding(input=inference, size=3)