diff --git a/Dockerfile b/Dockerfile
index 136db772cc6a24b8084120fa6bab666bc1eda78e..150344a8116e2be9b5bab8e5fdcc9c37f4025020 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -22,7 +22,7 @@ COPY ./paddle/scripts/docker/root/ /root/
 
 RUN apt-get update && \
     apt-get install -y \
-    git python-pip python-dev openssh-server bison  \
+    git python-pip python-dev openssh-server bison libnccl-dev \
     wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
     curl sed grep graphviz libjpeg-dev zlib1g-dev  \
     python-matplotlib gcc-4.8 g++-4.8 \
diff --git a/doc/design/block.md b/doc/design/block.md
index 7cbf0d55b1faeb2093ee7cf234d1c2ad1905885b..4066122c0e8dfa33776796c3d205ba5aec9e0f52 100644
--- a/doc/design/block.md
+++ b/doc/design/block.md
@@ -189,7 +189,7 @@ OpDesc {
   inputs = {0} // the index of x in vars of BlockDesc above
   outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above
   attrs {
-    "memories" : {1} // the index of h
+    "states" : {1} // the index of h
     "step_net" : <above step net>
   }
 };
diff --git a/doc/design/cluster_train/src/trainer.graffle b/doc/design/cluster_train/src/trainer.graffle
index 42384a3f059966e22e22f5fa4295cc9ead5cef83..43415ed8cf61a5acfa34f8e56b9577f338dbf254 100644
Binary files a/doc/design/cluster_train/src/trainer.graffle and b/doc/design/cluster_train/src/trainer.graffle differ
diff --git a/doc/design/register_grad_op.md b/doc/design/register_grad_op.md
index 9f1ce4bae7b393cb9f04909e5e4917b8d660771c..8d973eb53178c3e889c845144553a453e11f067c 100644
--- a/doc/design/register_grad_op.md
+++ b/doc/design/register_grad_op.md
@@ -3,17 +3,17 @@
 
 ## The Problem Posed
 
-Currently, for each C++ operator class definition, there registers a *gradient operator creator* function, which takes a C++ operator instance and returns the corresponding gradient operator instance.
+Currently, for each C++ operator class definition, a *gradient operator creator* function is registered, which takes as input a C++ operator instance and returns the corresponding gradient operator instance.
 
-However, we noticed two problems with the current deisgn:
+However, we noticed two problems with the current design:
 
-1. As we decided to separate the *compilation* and *execution* phases, we need to change the creator to take an `OpDesc` protobuf message in a `ProgramDesc` and inserts corresponding `OpDesc` messages into the `ProgramDesc` message.
+1. As we decided to separate the *compilation* and the *execution* phases, we need to change the creator to take an `OpDesc` protobuf message in a `ProgramDesc` and insert corresponding `OpDesc` messages into the `ProgramDesc` message.
 
-1. Some operator's gradient computation requires more than one gradient operators.  For example, the gradient of *minus* consists of two operators -- an identity operaotr and a scale operator.  So we need to make the registration mechanism to support the mapping from an operator to a set of operators for gradient computation.
+1. For some operators, the gradient computation can be written in terms of existing operators.  For example, the gradient of *minus* operator consists of two operators -- an *identity* operator followed by a *scale* operator.  Hence the registration mechanism needs to support mapping from an operator to a set of operators for the gradient computation.
 
 ## The Current Implementation
 
-The C++ class `OpInfos` store in a association map which key is the operator type. The `grad_op_type` indicate associated gradient operator type. Operator can create gradient operator by `OpInfo::creator_` of gradient. The pseudo code is
+Instances of the C++ class `OpInfo` are stored in an associative map whose key is the operator type. The `grad_op_type` indicates the associated gradient operator type. An operator can create the gradient operator by invoking `OpInfo::creator_` of the gradient operator. The pseudo code is as follows:
 
 ```cpp
 struct OpInfo {
@@ -31,16 +31,16 @@ OperatorBase* CreateGradientOperator(const OperatorBase& op) {
 
 ## Proposed Solution
 
-The mapping relationship between an operator and its gradient operators is a function. The interface of that function is:
+The mapping relationship between an operator and its gradient operators is a function. The interface of this function is:
 
 ```cpp
 // (OpDesc) --> vector<OpDesc>
 std::function<std::vector<OpDescBind>(const OpDescBind&)>;
 ```
 
-The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper for protobuf message `OpDesc` to manipulate `OpDesc` fast.
+The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper for the protobuf message `OpDesc` for rapid manipulation of `OpDesc`.
 
-The `GradOpDescMaker` will be registered in `OpInfo`, to replace `grad_op_type_` field. The `OpInfo` should be
+The `GradOpDescMaker` will be registered in `OpInfo` and will replace the `grad_op_type_` field. The `OpInfo` should look like 
 
 ```cpp
 struct OpInfo {
@@ -49,7 +49,7 @@ struct OpInfo {
 };
 ```
 
-The `grad_op_maker_ ` is `nullptr` if the operator does not have associated gradient operators.
+The `grad_op_maker_ ` is a `nullptr` if the operator does not have any associated gradient operators.
 
 We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is
 
@@ -74,7 +74,7 @@ func = [] (const OpDescBind& fwd_op) {
 
 We can write many helper functions since the `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forwarding operator.
 
-We should chagne register macros at the same time. In the current solution, there is no difference between forwarding operators and backward operators. So `REGISTER_OP` just register one operator. If the `REGISTER_OPERATOR ` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. It can be done by a macro contains `__VA_ARGS__`.
+We should change register macros at the same time. In the current solution, there is no difference between forwarding operators and backward operators. So `REGISTER_OP` just registers one operator. If the `REGISTER_OPERATOR ` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. It can be done by a macro that contains `__VA_ARGS__`.
 
 The user interface should be
 
diff --git a/doc/faq/local/index_cn.rst b/doc/faq/local/index_cn.rst
index 75c4ba028e497e29e9030a86514348726d9c0a80..0e939a2671ace8682c90cdc1c1bb2da1dda0d568 100644
--- a/doc/faq/local/index_cn.rst
+++ b/doc/faq/local/index_cn.rst
@@ -174,7 +174,7 @@ decoder_inputs = paddle.layer.fc(
 1. 两者都是对梯度的截断，但截断时机不同，前者在 :code:`optimzier` 更新网络参数时应用；后者在激活函数反向计算时被调用；
 2. 截断对象不同：前者截断可学习参数的梯度，后者截断回传给前层的梯度;
 
-除此之外，还可以通过减小学习律或者对数据进行归一化处理来解决这类问题。
+除此之外，还可以通过减小学习率或者对数据进行归一化处理来解决这类问题。
 
 5.  如何调用 infer 接口输出多个layer的预测结果
 -----------------------------------------------
diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md
index 274452fbf0c595ad7b4dbeffe85ad9038f12b458..93c5544bcfa911f8bdcdaea39a75b3ab7ef218f8 100644
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
@@ -1,135 +1,215 @@
-```eval_rst
-.. _cluster_train:
+# PaddlePaddle分布式训练
+
+* [概述](#概述)
+* [环境准备](#环境准备)
+* [启动参数说明](#启动参数说明)
+  * [启动参数服务器](#启动参数服务器)
+  * [启动计算节点](#启动计算节点)
+  * [准备数据集](#准备数据集)
+  * [准备训练程序](#准备训练程序)
+* [使用分布式计算平台或工具](#使用分布式计算平台或工具)
+  * [使用Fabric启动集群作业](#使用fabric启动集群作业)
+     * [准备一个Linux集群](#准备一个linux集群)
+     * [启动集群作业](#启动集群作业)
+     * [终止集群作业](#终止集群作业)
+     * [检查集群训练结果](#检查集群训练结果)
+     * [检查模型输出](#检查模型输出)
+  * [在OpenMPI集群中提交训练作业](#在openmpi集群中提交训练作业)
+     * [准备OpenMPI集群](#准备openmpi集群)
+     * [启动集群作业](#启动集群作业-1)
+  * [在Kubernetes集群中提交训练作业](#在kubernetes集群中提交训练作业)
+
+# 概述
+本文将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示：
+
+<img src="https://user-images.githubusercontent.com/13348433/31772175-5f419eca-b511-11e7-9db7-5231fe3d9ccb.png" width="500">
+
+- 数据分片（Data shard）: 用于训练神经网络的数据，被切分成多个部分，每个部分分别给每个trainer使用。
+- 计算节点（Trainer）: 每个trainer启动后读取切分好的一部分数据，开始神经网络的“前馈”和“后馈”计算，并和参数服务器通信。在完成一定量数据的训练后，上传计算得出的梯度（gradients），然后下载优化更新后的神经网络参数（parameters）。
+- 参数服务器（Parameter server）:每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度，并完成参数优化更新，再将更新后的参数下发到每个计算节点。
+
+这样，通过计算节点和参数服务器的分布式协作，可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降（SGD）和异步随机梯度下降。
+
+在使用同步SGD训练神经网络时，PaddlePaddle使用同步屏障（barrier），使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中，则并不会等待所有trainer提交梯度才更新参数，这样极大地提高了计算的并行性：参数服务器之间不相互依赖，并行地接收梯度和更新参数，参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步，计算节点之间也不会相互依赖，并行地执行模型的训练。可以看出，虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新，在任意时间某一台参数服务器上保存的参数可能比另一台要更新，与同步SGD相比，梯度会有噪声。
+
+# 环境准备
+
+1. 准备您的计算集群。计算集群通常由一组（几台到几千台规模）的Linux服务器组成。服务器之间可以通过局域网（LAN）联通，每台服务器具有集群中唯一的IP地址（或者可被DNS解析的主机名）。集群中的每台计算机通常被称为一个“节点”。
+1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install)的多种安装方式。我们推荐使用[Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)安装方式来快速安装PaddlePaddle。
+
+安装完成之后，执行下面的命令可以查看已经安装的版本（docker安装方式可以进入docker容器执行：`docker run -it paddlepaddle/paddle:[tag] /bin/bash`）：
+```bash
+$ paddle version
+PaddlePaddle 0.10.0, compiled with
+    with_avx: ON
+    with_gpu: OFF
+    with_double: OFF
+    with_python: ON
+    with_rdma: OFF
+    with_timer: OFF
 ```
 
-# 运行分布式训练
+下面以`doc/howto/usage/cluster/src/word2vec`中的代码作为实例，介绍使用PaddlePaddle v2 API完成分布式训练。
 
-在本文中，我们将阐释如何在集群上运行分布式 Paddle 训练作业。我们将以[推荐系统](https://github.com/baidu/Paddle/tree/develop/demo/recommendation)为例创建分布式的单进程训练。
+# 启动参数说明
+## 启动参数服务器
+执行以下的命令启动一个参数服务器并等待和计算节点的数据交互
+```bash
+$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1
+```
 
-在本文中使用的[脚本](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train)通过 SSH 运行分布式作业。 它们还可以供那些运行更复杂的集群管理系统（如 MPI 和 [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s) ）的用户参考。
+如果希望可以在后台运行pserver程序，并保存输出到一个日志文件，可以运行：
+```bash
+$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
+```
 
-## 前提条件
+| 参数  | 是否必选 | 默认值 | 说明 |
+| ------------- | ------------- | ------------- | ------------- |
+| port  | 必选 | 7164 | pserver监听的起始端口，根据ports_num决定<br>总端口个数，从起始端口监听多个端口用于通信  |
+| ports_num  | 必选 | 1 | 监听的端口个数  |
+| ports_num_for_sparse  | 必选 | 1 | 用于稀疏类型参数通信的端口个数  |
+| num_gradient_servers  | 必选 | 1 | 当前训练任务pserver总数 |
+
+## 启动计算节点
+执行以下命令启动使用python编写的trainer程序（文件名为任意文件名，如train.py）
+```bash
+$ python train.py
+```
 
-1. 上述脚本使用 Python 库 [fabric](http://www.fabfile.org/) 来运行 SSH 命令。 我们使用 `pip` 来安装 fabric:
+trainer需要和pserver保持网络联通以完成训练。trainer启动需要传入端口、pserver地址等参数使trainer可以正确连接到pserver。这些参数可以通过环境变量（https://zh.wikipedia.org/wiki/环境变量 ）或编写程序时`paddle.init()`中传入参数。如果同时使用`paddle.init()`参数和环境变量，将会优先使用`paddle.init()`中传入的参数。
 
-   ```bash
-   pip install fabric
-   ```
+使用环境变量：
 
-2. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，需要在 `/usr/local/cuda` 中安装 CUDA; 否则 Paddle 将在运行时报错。
+```bash
+export PADDLE_INIT_USE_GPU=False
+export PADDLE_INIT_TRAINER_COUNT=1
+export PADDLE_INIT_PORT=7164
+export PADDLE_INIT_PORTS_NUM=1
+export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
+export PADDLE_INIT_NUM_GRADIENT_SERVERS=1
+export PADDLE_INIT_TRAINER_ID=0
+export PADDLE_INIT_PSERVERS=127.0.0.1
+```
 
-3. 在 [`cluster_train/conf.py`] 中设置 `ROOT_DIR`， 该 ROOT_DIR 要在所有节点上存在。为了方便起见，我们通常在所有节点上创建一个 Unix 用户 `paddle`，并设置 `ROOT_DIR=/home/paddle`。这样，我们可以将 SSH 公钥写入 `/home/paddle/.ssh/authorized_keys`，以便用户 `paddle` 可以 SSH 到所有节点而不用密码。
+使用参数：
 
-## 准备工作空间
+```python
+paddle.init(
+        use_gpu=False,
+        trainer_count=1,
+        port=7164,
+        ports_num=1,
+        ports_num_for_sparse=1,
+        num_gradient_servers=1,
+        trainer_id=0,
+        pservers="127.0.0.1")
+```
 
-我们将放置依赖库、配置等文件的目录视为 *工作空间（workspace）*。
+| 参数  | 是否必选 | 默认 | 说明 |
+| ------------- | ------------- | ------------- | ------------- |
+| use_gpu  | 可选 | False | 是否启用GPU训练 |
+| trainer_count  | 必选 | 1 | 当前训练任务trainer总个数 |
+| port  | 必选 | 7164 | 连接到pserver的端口  |
+| ports_num  | 必选 | 1 | 连接到pserver的端口个数  |
+| ports_num_for_sparse  | 必选 | 1 | 和pserver之间用于稀疏类型参数通信的端口个数  |
+| num_gradient_servers  | 必选 | 1 | 当前训练任务pserver总数 |
+| trainer_id  | 必选 | 0 | 每个trainer的唯一ID，从0开始的整数 |
+| pservers  | 必选 | 127.0.0.1 | 当前训练任务启动的pserver的IP列表，多个IP使用“,”隔开 |
 
-这些 `train/test` 数据应该在启动集群作业之前准备好。 为了满足训练/测试数据放置在工作空间中不同目录的要求，PADDLE 根据在模型配置文件中使用的名为 `train.list/test.list` 的索引文件引用训练/测试数据，所以训练/测试数据也包含 train.list/test.list 两个列表文件。所有本地训练 demo 已经提供了脚本来帮助您创建这两个文件，并且集群作业中的所有节点将在正常情况下处理具有相同逻辑代码的文件。
 
-通常，你可以使用本地训练中的相同模型文件进行集群训练。请记住，在模型文件的 `setting`函数中设置的 `batch_size` 表示在集群作业**每个**节点中的 batch 大小，而不是使用同步 SGD 的总 batch 大小。
+## 准备数据集
 
-以下步骤基于 demo 目录中的 [demo/recommendation](https://github.com/PaddlePaddle/Paddle/tree/develop/demo/recommendation)。
+参考样例数据准备脚本[prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py)，准备训练数据和验证数据集，我们使用paddle.dataset.imikolov数据集，并根据分布式训练并发数（trainer节点个数），在`prepare.py`开头部分指定`SPLIT_COUNT`将数据切分成多份。
 
-你只需完成 demo/recommendation 教程文档到 `Train` 的部分，之后你会得到训练/测试数据和模型配置文件。最后，只需使用 demo/recommendation 作为集群训练的工作空间。
+在线上系统中，通常会使用MapReduce任务的输出结果作为训练数据，这样训练文件的个数会比较多，而且个数并不确定。在trainer中可以使用下面取模的方法为每个trainer分配训练数据文件：
 
-最后，你的工作空间应如下所示：
-```
-.
-|-- common_utils.py
-|-- data
-|   |-- config.json
-|   |-- config_generator.py
-|   |-- meta.bin
-|   |-- meta_config.json
-|   |-- meta_generator.py
-|   |-- ml-1m
-|   |-- ml_data.sh
-|   |-- ratings.dat.test
-|   |-- ratings.dat.train
-|   |-- split.py
-|   |-- test.list
-|   `-- train.list
-|-- dataprovider.py
-|-- evaluate.sh
-|-- prediction.py
-|-- preprocess.sh
-|-- requirements.txt
-|-- run.sh
-`-- trainer_config.py
+```python
+import os
+train_list = []
+flist = os.listdir("/train_data/")
+for f in flist:
+  suffix = int(f.split("-")[1])
+  if suffix % TRAINER_COUNT == TRAINER_ID:
+    train_list.append(f)
 ```
-虽然这些文件并非都需要集群训练，但是也没有必要删除无用的文件。
-
-`trainer_config.py`
-表示模型配置文件。
 
-`train.list` 和 `test.list`
-文件索引。它存储当前节点所有训练/测试数据的所有相对或绝对文件路径。
+示例程序`prepare.py`会把训练集和测试集分别分割成多个文件（例子中为3个，后缀为`-00000`、`-00001`和`-00002`）:
+```
+train.txt
+train.txt-00000
+train.txt-00001
+train.txt-00002
+test.txt
+test.txt-00000
+test.txt-00001
+test.txt-00002
+```
 
-`dataprovider.py`
-用于读取训练/测试样本。这与本地训练相同。
+在进行分布式训练时，每个trainer进程需要能够读取属于自己的一份数据。在一些分布式系统中，系统会提供一个分布式存储服务，这样保存在分布式存储中的数据可以被集群中的每个节点读取到。如果不使用分布式存储，则需要手动拷贝属于每个trainer节点的训练数据到对应的节点上。
 
-`data`
-数据目录中的所有文件被 train.list/test.list 引用。
+对于不同的训练任务，训练数据格式和训练程序的`reader()`会大不相同，所以开发者需要根据自己训练任务的实际场景完成训练数据的分割和`reader()`的编写。
 
+## 准备训练程序
 
-## 准备集群作业配置
+对于每个训练任务，我们都会在每个节点上创建一个工作空间（workspace），其中包含了用户的训练程序、程序依赖、挂载或下载的训练数据分片。
 
-以下选项必须在 cluster_train/conf.py 中认真设置
+最后，工作空间应如下所示：
+```
+.
+|-- my_lib.py
+|-- word_dict.pickle
+|-- train.py
+|-- train_data_dir/
+|   |-- train.txt-00000
+|   |-- train.txt-00001
+|   |-- train.txt-00002
+`-- test_data_dir/
+    |-- test.txt-00000
+    |-- test.txt-00001
+    `-- test.txt-00002
+```
 
-`HOSTS`  所有节点运行集群作业的主机名或 IP 。你还可以将用户和 ssh 端口附加到主机名上，例如 root@192.168.100.17:9090。
+- `my_lib.py`：会被`train.py`调用的一些用户定义的库函数，比如PIL库等。
+- `word_dict.pickle`：在`train.py`中会使用到的字典数据文件。
+- `train.py`：训练程序，代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py)。***注意：*** 对于本样例代码，在使用不同的分布式计算平台时，您可能需要修改`train.py`开头的部分（如下），以便获得训练数据的位置和获取环境变量配置：
 
-`ROOT_DIR` 用于放置 JOB 工作空间目录的工作空间 ROOT 目录
+  ```python
+  cluster_train_file = "./train_data_dir/train/train.txt"
+  cluster_test_file = "./test_data_dir/test/test.txt"
+  node_id = os.getenv("OMPI_COMM_WORLD_RANK")
+  if not node_id:
+      raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK")
+  ```
 
-`PADDLE_NIC` 集群通信通道的 NIC(Network Interface Card, 网络接口卡) 接口名称，例如以太网的 eth0，infiniband 的 ib0。
+- `train_data_dir`：包含训练数据的目录，可以是从分布式存储挂载过来的，也可以是在任务启动前下载到本地的。
+- `test_data_dir`：包含测试数据集的目录。
 
-`PADDLE_PORT` 集群通信通道的端口号
+# 使用分布式计算平台或工具
 
-`PADDLE_PORTS_NUM` 用于集群通信通道的端口数。 如果集群节点数量少（少于5〜6个节点），建议将其设置为较大，如2〜8，以获得更好的网络性能。
+PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务，包括：
+- [Kubernetes](http://kubernetes.io) Google开源的容器集群的调度框架，支持大规模集群生产环境的完整集群方案。
+- [OpenMPI](https://www.open-mpi.org) 成熟的高性能并行计算框架。
+- [Fabric](http://www.fabfile.org) 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。
 
-`PADDLE_PORTS_NUM_FOR_SPARSE` 用于 sparse remote updater 集群通信信道的端口数。如果使用 sparse remote update，则可以像 `PADDLE_PORTS_NUM` 一样设置。
+对于不同的集群平台，会分别介绍集群作业的启动和停止方法。这些例子都可以在[cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2)找到。
 
-`LD_LIBRARY_PATH` 为集群作业设置额外的 LD_LIBRARY_PATH。你可以使用它来设置 CUDA 库的路径。
+在使用分布式计算平台进行训练时，任务被调度在集群中时，分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数，比如节点的ID、IP和任务节点个数等。
 
-默认配置如下：
+## 使用Fabric启动集群作业
 
-```python
-HOSTS = [
-        "root@192.168.100.17",
-        "root@192.168.100.18",
-        ]
-
-'''
-工作空间配置
-'''
-
-#工作空间根目录
-ROOT_DIR = "/home/paddle"
-
-'''
-网络配置
-'''
-#pserver NIC
-PADDLE_NIC = "eth0"
-#pserver 端口
-PADDLE_PORT = 7164
-#pserver 端口数
-PADDLE_PORTS_NUM = 2
-#pserver sparse ports num
-PADDLE_PORTS_NUM_FOR_SPARSE = 2
-
-#集群作业中所有进程的环境设置
-LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib64"
-```
+### 准备一个Linux集群
+可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下，执行`kubectl create -f ssh_servers.yaml`启动一个测试集群，并使用`kubectl get po -o wide`获得这些节点的IP地址。
 
 ### 启动集群作业
-`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下，所有命令行选项可以设置为```paddle.py``` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。
+
+`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下，所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。
 
 `paddle.py` 为方便作业启动提供了两个独特的命令选项。
 
-`job_dispatch_package`  设为本地 `workspace` 目录，它将被分发到 conf.py 中设置的所有节点。  它有助于帮助频繁修改和访问工作区文件的用户减少负担，否则频繁的多节点工作空间部署可能会很麻烦。
-`job_workspace`  设为已部署的工作空间目录，`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。
+-  `job_dispatch_package`  设为本地 `workspace` 目录，它将被分发到 `conf.py` 中设置的所有节点。它有助于帮助频繁修改和访问工作区文件的用户减少负担，否则频繁的多节点工作空间部署可能会很麻烦。
+-  `job_workspace`  设为已部署的工作空间目录，`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。
 
-`cluster_train/run.sh` 提供了命令样例来运行 `demo/recommendation` 集群工作，只需用你定义的目录修改 `job_dispatch_package` 和 `job_workspace`，然后：
+`cluster_train/run.sh` 提供了命令样例来运行 `doc/howto/usage/cluster/src/word2vec` 集群任务，只需用您定义的目录修改 `job_dispatch_package` 和 `job_workspace`，然后：
 ```
 sh run.sh
 ```
@@ -149,7 +229,7 @@ sh run.sh
 提供 pserver 运行日志，有助于诊断分布式错误。
 
 `server.log`
-提供 pserver 进程的 stderr 和 stdout。训练失败时可以检查错误日志。
+提供 parameter server 进程的 stderr 和 stdout。训练失败时可以检查错误日志。
 
 `train.log`
 提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。
@@ -157,3 +237,49 @@ sh run.sh
 ### 检查模型输出
 运行完成后，模型文件将被写入节点 0 的 `output` 目录中。
 工作空间中的 `nodefile` 表示当前集群作业的节点 ID。
+
+## 在OpenMPI集群中提交训练作业
+
+### 准备OpenMPI集群
+
+执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点：
+
+```bash
+cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
+kubectl create -f head.yaml
+kubectl create -f mpi-nodes.yaml
+```
+
+然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。
+
+### 启动集群作业
+
+您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务：
+
+```bash
+# 获得head和node节点的IP地址
+kubectl get po -o wide
+# 将node节点的IP地址保存到machines文件中
+kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
+# 拷贝必要的文件到head节点
+scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
+# ssh 登录到head节点
+ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
+# --------------- 以下操作均在head节点中执行 ---------------
+# 准备训练数据
+python prepare.py
+# 拷贝训练程序和字典文件到每台MPI节点
+cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
+# 创建日志目录
+mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
+# 拷贝训练数据到各自的节点
+scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
+scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
+scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
+# 启动训练任务
+mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
+```
+
+## 在Kubernetes集群中提交训练作业
+
+此部分的使用方法可以参考[here](../k8s/k8s_distributed_cn.md)。
diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md
index c60876721cbf5565d6e48c8061811aacada748cd..1e8b4d54b9ffa99b3beef35ecaf95bbd0866535f 100644
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
@@ -1,129 +1,220 @@
-# Run Distributed Training
+# PaddlePaddle Distributed Training
+
+* [Introduction](#introduction)
+* [Preparations](#preparations)
+* [Command-line arguments](#command-line-arguments)
+   * [Starting parameter server](#starting-parameter-server)
+   * [Starting trainer](#starting-trainer)
+   * [Prepare Training Dataset](#prepare-training-dataset)
+   * [Prepare Training program](#prepare-training-program)
+* [Use cluster platforms or cluster management tools](#use-cluster-platforms-or-cluster-management-tools)
+   * [Cluster Training Using Fabric](#cluster-training-using-fabric)
+      * [Prepare a Linux cluster](#prepare-a-linux-cluster)
+      * [Launching Cluster Job](#launching-cluster-job)
+      * [Kill Cluster Job](#kill-cluster-job)
+      * [Check Cluster Training Result](#check-cluster-training-result)
+      * [Check Model Output](#check-model-output)
+   * [Cluster Training Using OpenMPI](#cluster-training-using-openmpi)
+      * [Prepare an OpenMPI cluster](#prepare-an-openmpi-cluster)
+      * [Launching Cluster Job](#launching-cluster-job-1)
+   * [Cluster Training Using Kubernetes](#cluster-training-using-kubernetes)
+
+# Introduction
+
+In this article, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed training job:
+
+<img src="https://user-images.githubusercontent.com/13348433/31772146-41523d84-b511-11e7-8a12-a69fd136c283.png" width="500">
+
+- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job.
+- Trainer: each trainer reads the data shard, and trains the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer downloads optimized parameters and continues its training.
+- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
+
+PaddlePaddle can support both synchronous stochastic gradient descent (SGD) and asynchronous SGD.
+
+When training with synchronous SGD, PaddlePaddle uses an internal "synchronization barrier" which makes gradient updates and parameter downloads happen in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish uploading at a single step, which increases the parallelism of distributed training: parameter servers do not depend on each other, so they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noise in the gradient.
+
+# Preparations
+1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes".
+2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install) document. We strongly recommend using [Docker installation](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst).
+
+After installation, you can check the version by typing the below command (run a docker container  if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
+
+```bash
+$ paddle version
+PaddlePaddle 0.10.0rc, compiled with
+    with_avx: ON
+    with_gpu: OFF
+    with_double: OFF
+    with_python: ON
+    with_rdma: OFF
+    with_timer: OFF
+```
 
-In this article, we explain how to run distributed Paddle training jobs on clusters.  We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation).
+We'll take `doc/howto/usage/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
 
-[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH.  They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s).
+# Command-line arguments
 
-## Prerequisite
+## Starting parameter server
 
-1. Aforementioned scripts use a Python library [fabric](http://www.fabfile.org/) to run SSH commands.  We can use `pip` to install fabric:
+Type the below command to start a parameter server which will wait for trainers to connect:
 
-   ```bash
-   pip install fabric
-   ```
+```bash
+$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1
+```
 
-1. We need to install PaddlePaddle on all nodes in the cluster.  To enable GPUs, we need to install CUDA in `/usr/local/cuda`; otherwise Paddle would report errors at runtime.
+If you wish to run parameter servers in background, and save a log file, you can type:
+```bash
+$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
+```
 
-1. Set the `ROOT_DIR` variable in [`cluster_train/conf.py`] on all nodes.  For convenience, we often create a Unix user `paddle` on all nodes and set `ROOT_DIR=/home/paddle`.  In this way, we can write public SSH keys into `/home/paddle/.ssh/authorized_keys` so that user `paddle` can SSH to all nodes without password.
+| param  | required | default | description |
+| ------------- | ------------- | ------------- | ------------- |
+| port  | required | 7164 | port which parameter server will listen on. If ports_num is greater than 1, parameter server will listen on multiple ports for more network throughput |
+| ports_num  | required | 1 | total number of ports to listen on  |
+| ports_num_for_sparse  | required | 1 | number of ports which serves sparse parameter update  |
+| num_gradient_servers  | required | 1 | total number of gradient servers |
 
-## Prepare Job Workspace
+## Starting trainer
+Type the command below to start the trainer(name the file whatever you want, like "train.py")
 
-We refer to the directory where we put dependent libraries, config files, etc., as *workspace*.
+```bash
+$ python train.py
+```
 
-These `train/test` data should be prepared before launching cluster job. To  satisfy the requirement that train/test data are placed in different directory from workspace, PADDLE refers train/test data according to index file named as `train.list/test.list` which are used in model config file. So the train/test data also contains train.list/test.list two list file. All local training demo already provides scripts to help you create these two files,  and all nodes in cluster job will handle files with same logical code in normal condition.
+Trainers' network needs to be connected with parameter servers' network to finish the job. Trainers need to know the port and IPs to locate parameter servers. You can pass arguments to trainers through [environment variables](https://en.wikipedia.org/wiki/Environment_variable) or pass them to the `paddle.init()` function. Arguments passed to the `paddle.init()` function will override environment variables.
 
-Generally, you can use same model file from local training for cluster training. What you should have in mind that, the `batch_size` set in `setting` function in model file means batch size in `each` node of cluster job instead of total batch size if synchronization SGD was used.
+Use environment variables:
 
-Following steps are based on [demo/recommendation](https://github.com/PaddlePaddle/Paddle/tree/develop/demo/recommendation) demo in demo directory.
+```bash
+export PADDLE_INIT_USE_GPU=False
+export PADDLE_INIT_TRAINER_COUNT=1
+export PADDLE_INIT_PORT=7164
+export PADDLE_INIT_PORTS_NUM=1
+export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
+export PADDLE_INIT_NUM_GRADIENT_SERVERS=1
+export PADDLE_INIT_TRAINER_ID=0
+export PADDLE_INIT_PSERVERS=127.0.0.1
+python train.py
+```
 
-You just go through demo/recommendation tutorial doc until `Train` section, and at last you will get train/test data and model configuration file. Finaly, just use demo/recommendation as workspace for cluster training.
+Pass arguments:
 
-At last your workspace should look like as follow:
+```python
+paddle.init(
+        use_gpu=False,
+        trainer_count=1,
+        port=7164,
+        ports_num=1,
+        ports_num_for_sparse=1,
+        num_gradient_servers=1,
+        trainer_id=0,
+        pservers="127.0.0.1")
 ```
-.
-|-- common_utils.py
-|-- data
-|   |-- config.json
-|   |-- config_generator.py
-|   |-- meta.bin
-|   |-- meta_config.json
-|   |-- meta_generator.py
-|   |-- ml-1m
-|   |-- ml_data.sh
-|   |-- ratings.dat.test
-|   |-- ratings.dat.train
-|   |-- split.py
-|   |-- test.list
-|   `-- train.list
-|-- dataprovider.py
-|-- evaluate.sh
-|-- prediction.py
-|-- preprocess.sh
-|-- requirements.txt
-|-- run.sh
-`-- trainer_config.py
+
+| param  | required | default | description |
+| ------------- | ------------- | ------------- | ------------- |
+| use_gpu  | optional | False | set to "True" to enable GPU training |
+| trainer_count  | required | 1 | total count of trainers in the training job |
+| port  | required | 7164 | port to connect to parameter server  |
+| ports_num  | required | 1 | number of ports for communication |
+| ports_num_for_sparse  | required | 1 | number of ports for sparse type calculation |
+| num_gradient_servers  | required | 1 | total number of gradient servers |
+| trainer_id  | required | 0 | ID for every trainer, start from 0 |
+| pservers  | required | 127.0.0.1 | list of IPs of parameter servers, separated by "," |
+
+## Prepare Training Dataset
+
+Here's some example code, [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py). It will download the public `imikolov` dataset and split it into multiple files according to job parallelism (trainers count). Modify `SPLIT_COUNT` at the beginning of `prepare.py` to change the count of output files.
+
+In the real world, we often use `MapReduce` job's output as training data, so there will be lots of files. You can use `mod` to assign training file to trainers:
+
+```python
+import os
+train_list = []
+flist = os.listdir("/train_data/")
+for f in flist:
+  suffix = int(f.split("-")[1])
+  if suffix % TRAINER_COUNT == TRAINER_ID:
+    train_list.append(f)
+```
+
+Example code `prepare.py` will split training data and testing data into 3 files with numeric suffixes like `-00000`, `-00001` and `-00002`:
+
+```
+train.txt
+train.txt-00000
+train.txt-00001
+train.txt-00002
+test.txt
+test.txt-00000
+test.txt-00001
+test.txt-00002
 ```
-Not all of these files are needed for cluster training, but it's not necessary to remove useless files.
 
-`trainer_config.py`
-Indicates the model config file.
+When the job starts, every trainer needs to get its own part of the data. In some distributed systems a storage service will be provided, so the data under that path can be accessed by all the trainer nodes. Without the storage service, you must copy the training data to each trainer node.
 
-`train.list` and `test.list`
-File index. It stores all relative or absolute file paths of all train/test data at current node.
+Different training jobs may have different data formats and `reader()` functions, so developers may need to write different data preparation scripts and `reader()` functions for their jobs.
 
-`dataprovider.py`
-used to read train/test samples. It's same as local training.
+## Prepare Training program
 
-`data`
-all files in data directory are refered by train.list/test.list which are refered by data provider.
+We'll create a *workspace* directory on each node, storing your training program, dependencies, mounted or downloaded dataset directory.
 
 
-## Prepare Cluster Job Configuration
+Your workspace may look like:
+```
+.
+|-- my_lib.py
+|-- word_dict.pickle
+|-- train.py
+|-- train_data_dir/
+|   |-- train.txt-00000
+|   |-- train.txt-00001
+|   |-- train.txt-00002
+`-- test_data_dir/
+    |-- test.txt-00000
+    |-- test.txt-00001
+    `-- test.txt-00002
+```
 
-The options below must be carefully set in cluster_train/conf.py
+- `my_lib.py`: user defined libraries, like PIL libs. This is optional.
+- `word_dict.pickle`: dict file for training word embedding.
+- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py). ***NOTE:*** You may need to modify the head part of `train.py` when using a different cluster platform to retrieve configuration environment variables:
 
-`HOSTS`  all nodes hostname or ip that will run cluster job. You can also append user and ssh port with hostname, such as root@192.168.100.17:9090.
+  ```python
+  cluster_train_file = "./train_data_dir/train/train.txt"
+  cluster_test_file = "./test_data_dir/test/test.txt"
+  node_id = os.getenv("OMPI_COMM_WORLD_RANK")
+  if not node_id:
+      raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK")
+  ```
 
-`ROOT_DIR` workspace ROOT directory for placing JOB workspace directory
+- `train_data_dir`: containing training data. Mount from storage service or copy training data here.
+- `test_data_dir`: containing testing data.
 
-`PADDLE_NIC` the NIC(Network Interface Card) interface name for cluster communication channel, such as eth0 for ethternet, ib0 for infiniband.
+# Use cluster platforms or cluster management tools
 
-`PADDLE_PORT` port number for cluster commnunication channel
+PaddlePaddle supports running jobs on several platforms including:
+- [Kubernetes](http://kubernetes.io) open-source system for automating deployment, scaling, and management of containerized applications from Google.
+- [OpenMPI](https://www.open-mpi.org) Mature high performance parallel computing framework.
+- [Fabric](http://www.fabfile.org) A cluster management tool. Write scripts to submit jobs or manage the cluster.
 
-`PADDLE_PORTS_NUM` the number of port used for cluster communication channle. if the number of cluster nodes is small(less than 5~6nodes), recommend you set it to larger, such as 2 ~ 8, for better network performance.
+We'll introduce cluster job management on these platforms. The examples can be found under [cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2).
 
-`PADDLE_PORTS_NUM_FOR_SPARSE` the number of port used for sparse updater cluster commnunication channel. if sparse remote update is used, set it like `PADDLE_PORTS_NUM`
+These cluster platforms provide APIs or environment variables for training processes when the job is dispatched to different nodes, such as node ID, IP, or total number of nodes.
 
-`LD_LIBRARY_PATH` set addtional LD_LIBRARY_PATH for cluster job. You can use it to set CUDA libraries path.
+## Cluster Training Using Fabric
 
-Default Configuration as follow:
+### Prepare a Linux cluster
 
-```python
-HOSTS = [
-        "root@192.168.100.17",
-        "root@192.168.100.18",
-        ]
-
-'''
-workspace configuration
-'''
-
-#root dir for workspace
-ROOT_DIR = "/home/paddle"
-
-'''
-network configuration
-'''
-#pserver nics
-PADDLE_NIC = "eth0"
-#pserver port
-PADDLE_PORT = 7164
-#pserver ports num
-PADDLE_PORTS_NUM = 2
-#pserver sparse ports num
-PADDLE_PORTS_NUM_FOR_SPARSE = 2
-
-#environments setting for all processes in cluster job
-LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib64"
-```
+Running `kubectl create -f ssh_servers.yaml` under the directory `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get IP addresses of these nodes.
 
 ### Launching Cluster Job
-`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes.
+`paddle.py` provides automated scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can be set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes.
 
 `paddle.py`provides two distinguished command option for easy job launching.
 
-`job_dispatch_package`  set it with local `workspace`directory, it will be dispatched to all nodes set in conf.py. It could be helpful for frequent hacking workspace files, otherwise frequent mulit-nodes workspace deployment could make your crazy.
-`job_workspace`  set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy
+- `job_dispatch_package`: set it to the local `workspace` directory; it will be dispatched to all nodes which are set in `conf.py`. It could be helpful when frequently manipulating workspace files; otherwise, frequent multi-node workspace deployment is very annoying.
+- `job_workspace`  set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy
 dispatch latency.
 
 `cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then:
@@ -134,23 +225,69 @@ sh run.sh
 The cluster Job will start in several seconds.
 
 ### Kill Cluster Job
-`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should mannally kill job if program crashed.
+`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should manually kill the job if the program crashed.
 
 ### Check Cluster Training Result
 Check log in $workspace/log for details, each node owns same log structure.
 
 `paddle_trainer.INFO`
-It provides almost all interal output log for training,  same as local training. Check runtime model convergence here.
+It provides almost all internal output log for training,  same as local training. Check runtime model convergence here.
 
 `paddle_pserver2.INFO`
-It provides pserver running log, which could help to diagnose distributed error.
+It provides parameter server running log, which could help to diagnose distributed error.
 
 `server.log`
-It provides stderr and stdout of pserver process. Check error log if training crashs.
+It provides stderr and stdout of parameter server process. Check error log if training crashes.
 
 `train.log`
-It provides stderr and stdout of trainer process. Check error log if training crashs.
+It provides stderr and stdout of trainer process. Check error log if training crashes.
 
 ### Check Model Output
-After one pass finished, model files will be writed in `output` directory in node 0.
+After one pass finished, model files will be written in `output` directory in node 0.
 `nodefile` in workspace indicates the node id of current cluster job.
+
+## Cluster Training Using OpenMPI
+
+### Prepare an OpenMPI cluster
+
+Run the following command to start a 3-node MPI cluster and one "head" node.
+
+```bash
+cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
+kubectl create -f head.yaml
+kubectl create -f mpi-nodes.yaml
+```
+
+Then you can log in to every OpenMPI node using ssh without entering any passwords.
+
+### Launching Cluster Job
+
+Follow the steps to launch a PaddlePaddle training job in OpenMPI cluster:
+
+```bash
+# find out node IP addresses
+kubectl get po -o wide
+# generate a "machines" file containing node IP addresses
+kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
+# copy necessary files onto "head" node
+scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
+# login to head node using ssh
+ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
+# --------------- in head node ---------------
+# prepare training data
+python prepare.py
+# copy dict file, training program and scripts to MPI nodes
+cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
+# create a directory for storing log files
+mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
+# copy training data to every node
+scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
+scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
+scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
+# start the job
+mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
+```
+
+## Cluster Training Using Kubernetes
+
+The details can be found [here](../k8s/k8s_cn.md)
diff --git a/doc/howto/usage/cluster/src/trainer.png b/doc/howto/usage/cluster/src/trainer.png
new file mode 100644
index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0
Binary files /dev/null and b/doc/howto/usage/cluster/src/trainer.png differ
diff --git a/doc/howto/usage/cluster/src/trainer_cn.png b/doc/howto/usage/cluster/src/trainer_cn.png
new file mode 100644
index 0000000000000000000000000000000000000000..f9525739cc8bc6506adde642aafa0a85ae3ebebc
Binary files /dev/null and b/doc/howto/usage/cluster/src/trainer_cn.png differ
diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2.py b/doc/howto/usage/cluster/src/word2vec/api_train_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0940f0e56eafa22f8aeb7052c0ddc79d8862917
--- /dev/null
+++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2.py
@@ -0,0 +1,100 @@
+import gzip
+import math
+
+import paddle.v2 as paddle
+
+embsize = 32
+hiddensize = 256
+N = 5
+
+
+def wordemb(inlayer):
+    wordemb = paddle.layer.embedding(
+        input=inlayer,
+        size=embsize,
+        param_attr=paddle.attr.Param(
+            name="_proj",
+            initial_std=0.001,
+            learning_rate=1,
+            l2_rate=0,
+            sparse_update=True))
+    return wordemb
+
+
+def main():
+    # for local training
+    cluster_train = False
+
+    if not cluster_train:
+        paddle.init(use_gpu=False, trainer_count=1)
+    else:
+        paddle.init(
+            use_gpu=False,
+            trainer_count=2,
+            port=7164,
+            ports_num=1,
+            ports_num_for_sparse=1,
+            num_gradient_servers=1)
+    word_dict = paddle.dataset.imikolov.build_dict()
+    dict_size = len(word_dict)
+    firstword = paddle.layer.data(
+        name="firstw", type=paddle.data_type.integer_value(dict_size))
+    secondword = paddle.layer.data(
+        name="secondw", type=paddle.data_type.integer_value(dict_size))
+    thirdword = paddle.layer.data(
+        name="thirdw", type=paddle.data_type.integer_value(dict_size))
+    fourthword = paddle.layer.data(
+        name="fourthw", type=paddle.data_type.integer_value(dict_size))
+    nextword = paddle.layer.data(
+        name="fifthw", type=paddle.data_type.integer_value(dict_size))
+
+    Efirst = wordemb(firstword)
+    Esecond = wordemb(secondword)
+    Ethird = wordemb(thirdword)
+    Efourth = wordemb(fourthword)
+
+    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
+    hidden1 = paddle.layer.fc(input=contextemb,
+                              size=hiddensize,
+                              act=paddle.activation.Sigmoid(),
+                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
+                              bias_attr=paddle.attr.Param(learning_rate=2),
+                              param_attr=paddle.attr.Param(
+                                  initial_std=1. / math.sqrt(embsize * 8),
+                                  learning_rate=1))
+    predictword = paddle.layer.fc(input=hidden1,
+                                  size=dict_size,
+                                  bias_attr=paddle.attr.Param(learning_rate=2),
+                                  act=paddle.activation.Softmax())
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                with gzip.open("batch-" + str(event.batch_id) + ".tar.gz",
+                               'w') as f:
+                    trainer.save_parameter_to_tar(f)
+                result = trainer.test(
+                    paddle.batch(
+                        paddle.dataset.imikolov.test(word_dict, N), 32))
+                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    result.metrics)
+
+    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+
+    parameters = paddle.parameters.create(cost)
+    adagrad = paddle.optimizer.AdaGrad(
+        learning_rate=3e-3,
+        regularization=paddle.optimizer.L2Regularization(8e-4))
+    trainer = paddle.trainer.SGD(cost,
+                                 parameters,
+                                 adagrad,
+                                 is_local=not cluster_train)
+    trainer.train(
+        paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
+        num_passes=30,
+        event_handler=event_handler)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py b/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e6d8887124a5524505b097803a60a35478ca644
--- /dev/null
+++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
@@ -0,0 +1,123 @@
+import math
+import os
+import paddle.v2 as paddle
+import pickle
+
+embsize = 32
+hiddensize = 256
+N = 5
+cluster_train_file = "./train_data_dir/train/train.txt"
+cluster_test_file = "./test_data_dir/test/test.txt"
+node_id = os.getenv("OMPI_COMM_WORLD_RANK")
+if not node_id:
+    raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK")
+
+
+def wordemb(inlayer):
+    wordemb = paddle.layer.embedding(
+        input=inlayer,
+        size=embsize,
+        param_attr=paddle.attr.Param(
+            name="_proj",
+            initial_std=0.001,
+            learning_rate=1,
+            l2_rate=0,
+            sparse_update=True))
+    return wordemb
+
+
+def cluster_reader_cluster(filename, node_id):
+    def cluster_reader():
+        with open("-".join([filename, "%05d" % int(node_id)]), "r") as f:
+            for l in f:
+                csv_data = [int(cell) for cell in l.split(",")]
+                yield tuple(csv_data)
+
+    return cluster_reader
+
+
+def main():
+    # get arguments from env
+
+    # for local training
+    TRUTH = ["true", "True", "TRUE", "1", "yes", "Yes", "YES"]
+    cluster_train = os.getenv('PADDLE_CLUSTER_TRAIN', "False") in TRUTH
+    use_gpu = os.getenv('PADDLE_INIT_USE_GPU', "False")
+
+    if not cluster_train:
+        paddle.init(
+            use_gpu=use_gpu,
+            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")))
+    else:
+        paddle.init(
+            use_gpu=use_gpu,
+            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")),
+            port=int(os.getenv("PADDLE_INIT_PORT", "7164")),
+            ports_num=int(os.getenv("PADDLE_INIT_PORTS_NUM", "1")),
+            ports_num_for_sparse=int(
+                os.getenv("PADDLE_INIT_PORTS_NUM_FOR_SPARSE", "1")),
+            num_gradient_servers=int(
+                os.getenv("PADDLE_INIT_NUM_GRADIENT_SERVERS", "1")),
+            trainer_id=int(os.getenv("PADDLE_INIT_TRAINER_ID", "0")),
+            pservers=os.getenv("PADDLE_INIT_PSERVERS", "127.0.0.1"))
+    fn = open("thirdparty/wuyi_train_thdpty/word_dict.pickle", "r")
+    word_dict = pickle.load(fn)
+    fn.close()
+    dict_size = len(word_dict)
+    firstword = paddle.layer.data(
+        name="firstw", type=paddle.data_type.integer_value(dict_size))
+    secondword = paddle.layer.data(
+        name="secondw", type=paddle.data_type.integer_value(dict_size))
+    thirdword = paddle.layer.data(
+        name="thirdw", type=paddle.data_type.integer_value(dict_size))
+    fourthword = paddle.layer.data(
+        name="fourthw", type=paddle.data_type.integer_value(dict_size))
+    nextword = paddle.layer.data(
+        name="fifthw", type=paddle.data_type.integer_value(dict_size))
+
+    Efirst = wordemb(firstword)
+    Esecond = wordemb(secondword)
+    Ethird = wordemb(thirdword)
+    Efourth = wordemb(fourthword)
+
+    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
+    hidden1 = paddle.layer.fc(input=contextemb,
+                              size=hiddensize,
+                              act=paddle.activation.Sigmoid(),
+                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
+                              bias_attr=paddle.attr.Param(learning_rate=2),
+                              param_attr=paddle.attr.Param(
+                                  initial_std=1. / math.sqrt(embsize * 8),
+                                  learning_rate=1))
+    predictword = paddle.layer.fc(input=hidden1,
+                                  size=dict_size,
+                                  bias_attr=paddle.attr.Param(learning_rate=2),
+                                  act=paddle.activation.Softmax())
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                result = trainer.test(
+                    paddle.batch(
+                        cluster_reader_cluster(cluster_test_file, node_id), 32))
+                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    result.metrics)
+
+    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+    parameters = paddle.parameters.create(cost)
+    adagrad = paddle.optimizer.AdaGrad(
+        learning_rate=3e-3,
+        regularization=paddle.optimizer.L2Regularization(8e-4))
+    trainer = paddle.trainer.SGD(cost,
+                                 parameters,
+                                 adagrad,
+                                 is_local=not cluster_train)
+    trainer.train(
+        paddle.batch(cluster_reader_cluster(cluster_train_file, node_id), 32),
+        num_passes=30,
+        event_handler=event_handler)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/doc/howto/usage/cluster/src/word2vec/prepare.py b/doc/howto/usage/cluster/src/word2vec/prepare.py
new file mode 100644
index 0000000000000000000000000000000000000000..24f5c5b26d37ea03de3ab4dc2d967a4bd009eef0
--- /dev/null
+++ b/doc/howto/usage/cluster/src/word2vec/prepare.py
@@ -0,0 +1,41 @@
+import paddle.v2 as paddle
+import tarfile
+import os
+import pickle
+
+SPLIT_COUNT = 3
+N = 5
+
+
+def file_len(fd):
+    for i, l in enumerate(fd):
+        pass
+    return i + 1
+
+
+def split_from_reader_by_line(filename, reader, split_count):
+    fn = open(filename, "w")
+    for batch_id, batch_data in enumerate(reader()):
+        batch_data_str = [str(d) for d in batch_data]
+        fn.write(",".join(batch_data_str))
+        fn.write("\n")
+    fn.close()
+
+    fn = open(filename, "r")
+    total_line_count = file_len(fn)
+    fn.close()
+    per_file_lines = total_line_count / split_count + 1
+    cmd = "split -d -a 5 -l %d %s %s-" % (per_file_lines, filename, filename)
+    os.system(cmd)
+
+
+word_dict = paddle.dataset.imikolov.build_dict()
+with open("word_dict.pickle", "w") as dict_f:
+    pickle.dump(word_dict, dict_f)
+
+split_from_reader_by_line("train.txt",
+                          paddle.dataset.imikolov.train(word_dict, N),
+                          SPLIT_COUNT)
+split_from_reader_by_line("test.txt",
+                          paddle.dataset.imikolov.test(word_dict, N),
+                          SPLIT_COUNT)
diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt
index 2c458a78c598bf206b30c0c07599ce605af77701..e767856d5012fd205f6b57f9721d0cbca8dc46ed 100644
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -28,23 +28,37 @@ add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
 
 add_dependencies(paddle_capi paddle_proto)
 
-# combine all paddle static libraries together, into libpaddle_capi_whole.a
-# user should use PaddleCAPI as -lpaddle_capi_whole
-set(PADDLE_CAPI_INFER_LIBS
-    paddle_utils
-    paddle_parameter
-    paddle_math
-    paddle_cuda
-    paddle_function
-    paddle_gserver
-    paddle_proto)
-
+# TODO: paddle_capi_whole will be removed.
+if(MOBILE_INFERENCE)
+    set(PADDLE_CAPI_INFER_LIBS
+        paddle_utils
+        paddle_parameter
+        paddle_math
+        paddle_cuda
+        paddle_function
+        paddle_gserver
+        paddle_proto)
+else()
+    set(PADDLE_CAPI_INFER_LIBS
+        paddle_utils
+        paddle_parameter
+        paddle_math
+        paddle_cuda
+        paddle_function
+        paddle_gserver
+        paddle_proto
+        paddle_pserver
+        paddle_network)
+endif()
 cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})
 
-# No shared library for iOS
+# Link the static library for inference
+cc_library(paddle_capi_engine DEPS paddle_capi paddle_utils paddle_parameter paddle_math paddle_cuda paddle_proto)
+cc_library(paddle_capi_layers DEPS paddle_function paddle_gserver)
+
+# Link the shared library for inference
 if(NOT IOS)
-  set(LINK_FLAGS " -Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/export.sym -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/export.map")
-  # TODO: merge mkl into paddle_capi_shared
+  set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_capi.map")
   add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
   set_target_properties(paddle_capi_shared	PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
   target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
@@ -53,9 +67,10 @@ endif()
 
 # install library & headers.
 install(FILES ${CAPI_HEADERS} DESTINATION include/paddle)
+install(FILES paddle_capi.map DESTINATION include/paddle)
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle)
 if(ANDROID)
-  install(TARGETS paddle_capi_whole paddle_capi_shared
+  install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers paddle_capi_shared
           ARCHIVE DESTINATION lib/${ANDROID_ABI}
           LIBRARY DESTINATION lib/${ANDROID_ABI})
   execute_process(
@@ -80,7 +95,7 @@ if(ANDROID)
       )"
   )
 else(ANDROID)
-  install(TARGETS paddle_capi_whole ARCHIVE DESTINATION lib)
+  install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers ARCHIVE DESTINATION lib)
   if(NOT IOS)
     install(TARGETS paddle_capi_shared DESTINATION lib)
   endif()
diff --git a/paddle/capi/export.sym b/paddle/capi/export.sym
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/paddle/capi/export.map b/paddle/capi/paddle_capi.map
similarity index 100%
rename from paddle/capi/export.map
rename to paddle/capi/paddle_capi.map
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 6e32a1c99ba6b8bfac12c227e0cf66e0a9f16557..dbe76a8eaf134f7db08fb545297c8e4db68a7aab 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -19,15 +19,15 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
 proto_library(framework_proto SRCS framework.proto)
 
 cc_library(attribute SRCS attribute.cc DEPS framework_proto)
-cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute ddim op_info)
 cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc)
 cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
-cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope proto_desc glog)
+cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
+cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute ddim op_info operator)
 
-cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog)
+cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 
 py_proto_compile(framework_py_proto SRCS framework.proto)
@@ -43,7 +43,7 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
 
-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward)
+cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog)
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index fb552fe3448b3f17e97e1262b5c9a0842f68f8b9..1ae7fb60f01e4925ceb310f661171eb231eb6c96 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -21,6 +21,7 @@
 
 #include "paddle/framework/block_desc.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/dynamic_recurrent_op.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/operators/recurrent_op.h"
 
@@ -220,8 +221,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
     // process recurrent gradient op as a special operator.
     if (forwardOp.Type() == "recurrent") {
       // NOTE clean up cycle call somewhere (RNN's stepnet constains itself),
-      // or
-      // this will result in infinite loop.
+      // or this will result in infinite loop.
       const auto& rnnop =
           *static_cast<const operators::RecurrentOp*>(&forwardOp);
       auto rnn_grad_op =
@@ -231,6 +231,18 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
       // create stepnet's gradient op
       rnn_grad_op->set_stepnet(
           BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id));
+    } else if (forwardOp.Type() == "dynamic_recurrent") {
+      // NOTE clean up cycle call somewhere (RNN's stepnet constains itself),
+      // or this will result in infinite loop.
+      const auto& rnnop =
+          *static_cast<const operators::DynamicRecurrentOp*>(&forwardOp);
+      auto rnn_grad_op =
+          static_cast<operators::DynamicRecurrentGradientOp*>(grad_op.get());
+      const auto& stepnet_op =
+          *static_cast<const OperatorBase*>(&rnnop.rnn.GetStepUnit());
+      // create stepnet's gradient op
+      rnn_grad_op->rnn.SetStepUnit(
+          BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id));
     }
 
     if (net->ops_.empty()) {  // Current no aux op is added to network
diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc
index 21d4fdaf0680036a484ee4258e47c6c8854967c3..251e340e6ddcc17ba16bdcab63f2a8c907122eab 100644
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -41,6 +41,19 @@ bool BlockDescBind::HasVar(const std::string &name) const {
   return vars_.find(name) != vars_.end();
 }
 
+VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
+  auto it = vars_.find(name);
+  if (it == vars_.end()) {
+    return Parent() == kNoneBlockIndex ? nullptr
+                                       : ParentBlock()->FindVarRecursive(name);
+  }
+  return it->second.get();
+}
+
+bool BlockDescBind::HasVarRecursive(const std::string &name) const {
+  return FindVarRecursive(name) != nullptr;
+}
+
 std::vector<VarDescBind *> BlockDescBind::AllVars() const {
   std::vector<VarDescBind *> res;
   for (const auto &p : vars_) {
@@ -97,7 +110,7 @@ void BlockDescBind::Flush() {
 }
 
 BlockDescBind *BlockDescBind::ParentBlock() const {
-  if (this->desc_->parent_idx() == -1) {
+  if (this->desc_->parent_idx() == kNoneBlockIndex) {
     return nullptr;
   }
   return prog_->Block(static_cast<size_t>(this->desc_->parent_idx()));
diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h
index 7d1d33f6860aa90518abb379a5e9964d6029c6fa..c685050850dc25f346df49b5ce1d897974870460 100644
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/framework/op_desc.h"
+#include "paddle/framework/proto_desc.h"
 #include "paddle/framework/var_desc.h"
 #include "paddle/platform/macros.h"
 
@@ -56,6 +57,10 @@ class BlockDescBind {
 
   bool HasVar(const std::string &var_name) const;
 
+  VarDescBind *FindVarRecursive(const std::string &name_bytes) const;
+
+  bool HasVarRecursive(const std::string &var_name) const;
+
   std::set<std::string> LocalVarNames() const {
     std::set<std::string> var_names;
     for (auto &var : vars_) {
diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h
index 649899d42572c9a22adca5337dcd56b0bcf42e7c..c25a62c2b11ead614d93a4be8d63d40d0cc0165a 100644
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -26,6 +26,8 @@ inline DataType ToDataType(std::type_index type) {
     return DataType::FP64;
   } else if (typeid(int).hash_code() == type.hash_code()) {
     return DataType::INT32;
+  } else if (typeid(int64_t).hash_code() == type.hash_code()) {
+    return DataType::INT64;
   } else {
     PADDLE_THROW("Not supported");
   }
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 00caa6e1d53a4bcfae56c4459413bc1622321960..1f1e4edda823d62b169422672c855d96a2bd2ede 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -68,9 +68,13 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
 
   for (auto& var : block.vars()) {
     if (var.persistable()) {
-      scope->Var(var.name());
+      auto* ptr = scope->Var(var.name());
+      VLOG(3) << "Create Variable " << var.name()
+              << " global, which pointer is " << ptr;
     } else {
-      local_scope.Var(var.name());
+      auto* ptr = local_scope.Var(var.name());
+      VLOG(3) << "Create Variable " << var.name()
+              << " locally, which pointer is " << ptr;
     }
   }
 
@@ -80,8 +84,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
     op->Run(local_scope, *device);
   }
 
-  // TODO(tonyyang-svail):
-  //  - Destroy local_scope
+  scope->DeleteScope(&local_scope);
 }
 
 }  // namespace framework
diff --git a/paddle/framework/feed_fetch_method.h b/paddle/framework/feed_fetch_method.h
index 826d180bfc5445224a8d9292f06eeb58d9a46b29..7feacb1e24708411e7fbb610f9909447cba9e291 100644
--- a/paddle/framework/feed_fetch_method.h
+++ b/paddle/framework/feed_fetch_method.h
@@ -13,37 +13,45 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include "glog/logging.h"
+#include "paddle/framework/feed_fetch_type.h"
 #include "paddle/framework/scope.h"
 #include "paddle/framework/variable.h"
 
 namespace paddle {
 namespace framework {
 
-template <typename T>
-void SetFeedVariable(const LoDTensor& input, const std::string& var_name,
-                     size_t index) {
+inline void SetFeedVariable(Scope* scope, const LoDTensor& input,
+                            const std::string& var_name, size_t index) {
   // If var_name Variable is not found in GlobalScope, a new variable will
   // be created.
-  Variable* g_feed_value = GetGlobalScope().Var(var_name);
+  VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
+  Variable* g_feed_value = scope->Var(var_name);
   auto& feed_inputs =
       *(g_feed_value->GetMutable<std::vector<paddle::framework::LoDTensor>>());
   if (index >= feed_inputs.size()) {
     feed_inputs.resize(index + 1);
   }
   // shared data with input tensor
-  feed_inputs[index].ShareDataWith<T>(input);
+  feed_inputs[index].ShareDataWith(input);
   // set lod
   feed_inputs[index].set_lod(input.lod());
 }
 
-LoDTensor& GetFetchVariable(const std::string& var_name, size_t index) {
+inline LoDTensor& GetFetchVariable(const Scope& scope,
+                                   const std::string& var_name, size_t index) {
   // Since we want to fetch LodTensor from a variable, the variable must
   // be created alreadly.
-  Variable* g_fetch_value = GetGlobalScope().FindVar(var_name);
-  auto& fetch_outputs =
-      *(g_fetch_value->GetMutable<std::vector<paddle::framework::LoDTensor>>());
+  Variable* g_fetch_value = scope.FindVar(var_name);
+  PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
+                 "Only %s can be invoked by GetFetchVariable",
+                 typeid(FeedFetchList).name());
+  auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
   PADDLE_ENFORCE_LT(index, fetch_outputs.size());
-  return fetch_outputs[index];
+  auto& tensor = fetch_outputs[index];
+  VLOG(3) << "Fetch " << var_name << " with index " << index
+          << " shape= " << tensor.dims();
+  return tensor;
 }
 
 }  // namespace framework
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index 008fb45fb7bcb2f9b3d02376b15d2f88515f86d9..3d023535ef6c49326481ec7edc2bfc9d7c0d4ffa 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -68,6 +68,7 @@ message OpProto {
 
     optional bool duplicable = 3 [ default = false ];
     optional bool intermediate = 4 [ default = false ];
+    optional bool dispensable = 5 [ default = false ];
   }
 
   // AttrProto describes the C++ type Attribute.
@@ -112,6 +113,8 @@ message VarDesc {
   enum VarType {
     LOD_TENSOR = 1;
     SELECTED_ROWS = 2;
+    FEED_MINIBATCH = 3;
+    FETCH_LIST = 4;
   }
   required string name = 1;
   required VarType type = 2;
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index 6f1e1b870bcd29b928e0723fbeb3583cbcbef559..a7b2b5b1ec8c1ce3c2973cd17aca6c427c86af97 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -25,31 +25,50 @@ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
   for (size_t i = level_begin; i < level_end; i++) {
     new_lod.emplace_back(in.at(i));
   }
+  // transform the lowest level to absolute offset.
+  LoD abs_offset_lod = ToAbsOffset(in);
+  new_lod.back() = abs_offset_lod[level_end - 1];
   return new_lod;
 }
 
 LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
                  size_t elem_end) {
-  // slice the lod.
-  LoD new_lod;
-  new_lod.reserve(in.size() - level);
-  auto start = in.at(level)[elem_begin];
-  auto end = in.at(level)[elem_end];
-
-  for (auto it = in.begin() + level; it != in.end(); it++) {
-    auto it_begin = std::find(it->begin(), it->end(), start);
-    auto it_end = std::find(it_begin, it->end(), end);
-    PADDLE_ENFORCE(it_begin != it->end(), "error in parsing lod info");
-    PADDLE_ENFORCE(it_end != it->end(), "error in parsing lod info");
-    new_lod.emplace_back(it_begin, it_end + 1);
-    // reset offset if tensor is copyed and sliced.
-    std::transform(new_lod.back().begin(), new_lod.back().end(),
-                   new_lod.back().begin(),
-                   [start](int v) { return v - start; });
-    PADDLE_ENFORCE_EQ(new_lod.back().front(), 0, "error in slice LoD");
+  PADDLE_ENFORCE_LT(level, in.size());
+  PADDLE_ENFORCE_LT(elem_end, in[level].size());
+
+  LoD res;
+  res.resize(in.size() - level);
+  // copy the first level
+  res[0].assign(in[level].begin() + elem_begin,
+                in[level].begin() + elem_end + 1);
+  for (size_t lvl = 1; lvl < res.size(); lvl++) {
+    const auto& in_level = in[level + lvl];
+    const auto& above_level = res[lvl - 1];
+    auto& out_level = res[lvl];
+    out_level.assign(in_level.begin() + above_level.front(),
+                     in_level.begin() + above_level.back() + 1);
   }
-  PADDLE_ENFORCE_LE(new_lod.size(), in.size());
-  return new_lod;
+  for (size_t lvl = 0; lvl < res.size(); lvl++) {
+    // to make the first offset equals 0, all the elements minus the first
+    // element
+    size_t front = res[lvl].front();
+    for (auto& ele : res[lvl]) {
+      ele -= front;
+    }
+  }
+  return res;
+}
+
+LoD ToAbsOffset(const LoD& in) {
+  // a single (lowest) level already stores absolute offsets
+  if (in.empty() || in.size() == 1) return in;
+  LoD result = in;
+  for (int level = result.size() - 2; level >= 0; level--) {
+    for (auto& ele : result[level]) {
+      ele = result[level + 1][ele];
+    }
+  }
+  return result;
 }
 
 bool operator==(const LoD& a, const LoD& b) {
@@ -75,17 +94,7 @@ bool operator==(const LoD& a, const LoD& b) {
 size_t LoDTensor::NumElements(size_t level, size_t idx) const {
   PADDLE_ENFORCE_LT(level, NumLevels());
   PADDLE_ENFORCE_LT(idx, NumElements(level));
-  // the last level of LoD, just return number of records in Tensor
-  if (level == NumLevels() - 1) {
-    return lod_[level][idx + 1] - lod_[level][idx];
-  }
-  // high level of LoD, and there is another lower level, return number of
-  // lower-level elements
-  auto tmp = SliceInLevel(lod_, level, idx, idx + 1);
-  PADDLE_ENFORCE_GE(tmp.size(), 2);
-  // there is a 0 as a placeholder stored in LoD, so the number of elements
-  // equals lod.size() - 1
-  return tmp[1].size() - 1;
+  return lod_[level][idx + 1] - lod_[level][idx];
 }
 
 void LoDTensor::ShrinkLevels(size_t level_begin, size_t level_end) {
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 4d1ec29f6001c853b118f02e771f652f3219073e..ec0b34878b01ebf36705c3adf9e1889a8e223f86 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -39,23 +39,36 @@ using Vector = thrust::host_vector<
 #endif
 
 /*
- * 3-level LoD stores
+ * LoD is short for Level of Details.
  *
- * 0 10 20
- * 0 5 10 15 20
- * 0 2 5 7 10 12 15 20
- *
- * - in a level, each element indicates offset in the underlying Tensor
+ * - in a level, each element indicates relative offset of the lower level
  * - the first element should be 0 and that indicates that this sequence start
  * from 0
  * - each sequence's begin and end(no-inclusive) is level[id, id+1]
+ *
+ * For example:
+ *    3-level LoD stores
+ *
+ *    0 2 3
+ *    0 2 4 7
+ *    0 2 5 7 10 12 15 20
  */
 using LoD = std::vector<Vector<size_t>>;
 
+/*
+ * Slice levels from a LoD.
+ * NOTE the lowest level should always hold absolute offsets into the underlying
+ * tensor instances. So if higher levels are sliced without the lowest level,
+ * the lowest level of the sliced LoD is transformed to absolute offsets.
+ */
 LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end);
 
 LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
                  size_t elem_end);
+/*
+ * Transform an LoD from relative offsets to absolute offsets.
+ */
+LoD ToAbsOffset(const LoD& in);
 
 bool operator==(const LoD& a, const LoD& b);
 
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
index 44f09f584fb752d7003baa804979f3bb5cd9d651..e1e15abecf5534fb4fd94f7e2b65230c74d175de 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -30,8 +30,8 @@ class LoDTensorTester : public ::testing::Test {
     // 0 5 10 15 20
     // 0 2 5 7 10 12 15 20
     LoD lod;
-    lod.push_back(std::vector<size_t>{0, 10, 20});
-    lod.push_back(std::vector<size_t>{0, 5, 10, 15, 20});
+    lod.push_back(std::vector<size_t>{0, 2, 3});
+    lod.push_back(std::vector<size_t>{0, 2, 5, 8});
     lod.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20});
 
     ASSERT_EQ(lod.size(), 3UL);
@@ -52,14 +52,14 @@ TEST_F(LoDTensorTester, NumLevels) { ASSERT_EQ(lod_tensor_.NumLevels(), 3UL); }
 
 TEST_F(LoDTensorTester, NumElements) {
   ASSERT_EQ(lod_tensor_.NumElements(0), 2UL);
-  ASSERT_EQ(lod_tensor_.NumElements(1), 4UL);
+  ASSERT_EQ(lod_tensor_.NumElements(1), 3UL);
   ASSERT_EQ(lod_tensor_.NumElements(2), 8UL);
 }
 
 TEST_F(LoDTensorTester, NumElements2) {
   ASSERT_EQ(lod_tensor_.NumElements(0, 0), 2UL);
-  ASSERT_EQ(lod_tensor_.NumElements(0, 1), 2UL);
-  ASSERT_EQ(lod_tensor_.NumElements(1, 1), 2UL);
+  ASSERT_EQ(lod_tensor_.NumElements(0, 1), 1UL);
+  ASSERT_EQ(lod_tensor_.NumElements(1, 1), 3UL);
 }
 
 TEST_F(LoDTensorTester, ShrinkLevels) {
@@ -68,17 +68,16 @@ TEST_F(LoDTensorTester, ShrinkLevels) {
     LoDTensor new_lod_tensor = lod_tensor_;
     new_lod_tensor.ShrinkLevels(level, level + 1);
     ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
-    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level));
     ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
   }
   // shrink 2 level
   for (size_t level = 0; level < 2UL; ++level) {
     LoDTensor new_lod_tensor = lod_tensor_;
     new_lod_tensor.ShrinkLevels(level, level + 2);
+    // the lowest level's last element should be the tensor's batch_size.
+    ASSERT_EQ(new_lod_tensor.lod().back().back(),
+              lod_tensor_.lod().back().back());
     ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
-    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level));
-    ASSERT_EQ(new_lod_tensor.NumElements(1),
-              lod_tensor_.NumElements(level + 1));
     ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
   }
 }
@@ -86,19 +85,19 @@ TEST_F(LoDTensorTester, ShrinkLevels) {
 TEST_F(LoDTensorTester, ShrinkInLevel) {
   size_t level = 0;
   LoDTensor new_lod_tensor = lod_tensor_;
-  new_lod_tensor.ShrinkInLevel(level, 0, 2);
+  new_lod_tensor.ShrinkInLevel(level, 0, 1);
   EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL);
-  EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL);
-  EXPECT_EQ(new_lod_tensor.NumElements(1), 4UL);
-  EXPECT_EQ(new_lod_tensor.NumElements(2), 8UL);
+  EXPECT_EQ(new_lod_tensor.NumElements(0), 1UL);
+  EXPECT_EQ(new_lod_tensor.NumElements(1), 2UL);
+  EXPECT_EQ(new_lod_tensor.NumElements(2), 5UL);
   ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
 
   level = 1;
   new_lod_tensor = lod_tensor_;
-  new_lod_tensor.ShrinkInLevel(level, 0, 2);
+  new_lod_tensor.ShrinkInLevel(level, 1, 2);
   ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
-  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
-  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(0), 1UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(1), 3UL);
   ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
 }
 
diff --git a/paddle/framework/op_proto_maker.h b/paddle/framework/op_proto_maker.h
index a134befd90a1eaeff6f6ea62f11412df63cdc394..44e8ab16895cc604f85bb83e240eab55739f8ba0 100644
--- a/paddle/framework/op_proto_maker.h
+++ b/paddle/framework/op_proto_maker.h
@@ -44,6 +44,11 @@ class OpProtoAndCheckerMaker {
       var_->set_intermediate(true);
       return *this;
     }
+
+    VariableBuilder& AsDispensable() {
+      var_->set_dispensable(true);
+      return *this;
+    }
   };
 
   VariableBuilder AddInput(const std::string& name, const std::string& comment);
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 2fca816f353635d3bff184323755961ee82fbb68..a67625fa88fd2fbe4db43241ee824519ceac7017 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -252,5 +252,20 @@ std::ostream& operator<<(std::ostream& os,
   return os;
 }
 
+bool OpSupportGPU(const std::string& op_type) {
+  auto& all_kernels = OperatorWithKernel::AllOpKernels();
+  auto it = all_kernels.find(op_type);
+  if (it == all_kernels.end()) {
+    // All control operators must support GPU
+    return true;
+  }
+  for (auto& kern_pair : it->second) {
+    if (platform::is_gpu_place(kern_pair.first.place_)) {
+      return true;
+    }
+  }
+  return false;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 12cd307297d010201a47e048089ed7be0db52647..0d0304ac9e13089ef533b0a47f0ec989c8fd7078 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -327,37 +327,47 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   bool HasInput(const std::string& name) const override {
     const std::vector<std::string>& input_names = op_.Input(name);
     auto length = input_names.size();
+    if (length == 0) {
+      return false;
+    }
     PADDLE_ENFORCE_EQ(length, 1UL,
                       "Input(%s) should have only one value, "
                       "but it have %d now",
                       name, length);
-    return block_.HasVar(input_names[0]);
+    return block_.HasVarRecursive(input_names[0]);
   }
 
   bool HasOutput(const std::string& name) const override {
     const std::vector<std::string>& output_names = op_.Output(name);
     auto length = output_names.size();
+    if (length == 0) {
+      return false;
+    }
     PADDLE_ENFORCE_EQ(length, 1UL,
                       "Output(%s) should have only one value, "
                       "but it have %d now",
                       name, length);
-    return block_.HasVar(output_names[0]);
+    return block_.HasVarRecursive(output_names[0]);
   }
 
   bool HasInputs(const std::string& name) const override {
     const std::vector<std::string>& input_names = op_.Input(name);
-    PADDLE_ENFORCE(!input_names.empty(), "Inputs(%s) length is 0", name);
+    if (input_names.empty()) {
+      return false;
+    }
     for (auto& input : input_names) {
-      if (!block_.HasVar(input)) return false;
+      if (!block_.HasVarRecursive(input)) return false;
     }
     return true;
   }
 
   bool HasOutputs(const std::string& name) const override {
     const std::vector<std::string>& output_names = op_.Output(name);
-    PADDLE_ENFORCE(!output_names.empty(), "Inputs(%s) length is 0", name);
+    if (output_names.empty()) {
+      return false;
+    }
     for (auto& output : output_names) {
-      if (!block_.HasVar(output)) return false;
+      if (!block_.HasVarRecursive(output)) return false;
     }
     return true;
   }
@@ -404,11 +414,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
 
  private:
   DDim GetDim(const std::string& name) const override {
-    return framework::make_ddim(block_.FindVar(name)->Shape());
+    return framework::make_ddim(block_.FindVarRecursive(name)->Shape());
   }
 
   void SetDim(const std::string& name, const DDim& dim) override {
-    block_.FindVar(name)->SetShape(framework::vectorize(dim));
+    block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
   }
 
   const OpDescBind& op_;
@@ -421,13 +431,27 @@ class RuntimeInferShapeContext : public InferShapeContext {
       : op_(op), scope_(scope) {}
 
   bool HasInput(const std::string& name) const override {
-    auto ipt = op_.Input(name);
+    // An empty slot (no variable name bound) counts as "absent", not an error.
+    auto& ins = Inputs(name);
+    if (ins.empty()) {
+      return false;
+    }
+    PADDLE_ENFORCE_EQ(ins.size(), 1UL,
+                      "Input %s should not have more than one inputs", name);
+    auto& ipt = ins[0];
     auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
     return var != nullptr;
   }
 
   bool HasOutput(const std::string& name) const override {
-    auto ipt = op_.Output(name);
+    // An empty slot (no variable name bound) counts as "absent", not an error.
+    auto& outs = Outputs(name);
+    if (outs.empty()) {
+      return false;
+    }
+    PADDLE_ENFORCE_EQ(outs.size(), 1UL,
+                      "Output %s should not have more than one outputs", name);
+    auto& ipt = outs[0];
     auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
     return var != nullptr;
   }
@@ -649,5 +673,7 @@ class OperatorWithKernel : public OperatorBase {
 std::ostream& operator<<(std::ostream& os,
                          const OperatorWithKernel::OpKernelKey& kernel_key);
 
+extern bool OpSupportGPU(const std::string& op_type);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/program_desc.cc b/paddle/framework/program_desc.cc
index e2349cefe09a6c1e0b11f77775426fe5c7594c9d..8e99bba81117c9cc50227122527d6ab9a421c251 100644
--- a/paddle/framework/program_desc.cc
+++ b/paddle/framework/program_desc.cc
@@ -35,8 +35,8 @@ ProgramDesc *ProgramDescBind::Proto() {
 
 ProgramDescBind::ProgramDescBind() {
   auto *block = prog_.mutable_blocks()->Add();
-  block->set_idx(0);
-  block->set_parent_idx(-1);
+  block->set_idx(kRootBlockIndex);
+  block->set_parent_idx(kNoneBlockIndex);
   blocks_.emplace_back(new BlockDescBind(this, block));
 }
 
diff --git a/paddle/framework/program_desc.h b/paddle/framework/program_desc.h
index 20cc1a2325ffd6f8ef52879a749f106c268376d4..dc4cd7cc735b5e4e3466d9b82dc5eb8647c80ef9 100644
--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <memory>
 #include <vector>
 #include "paddle/framework/framework.pb.h"
+#include "paddle/framework/proto_desc.h"
 #include "paddle/platform/macros.h"
 
 namespace paddle {
diff --git a/paddle/framework/program_desc_test.cc b/paddle/framework/program_desc_test.cc
index 32ee275429687ae079ae8e15b3133428c6ff01b9..c9709a2d3f1d9e0be2bda1e8e9e7835ca49141b1 100644
--- a/paddle/framework/program_desc_test.cc
+++ b/paddle/framework/program_desc_test.cc
@@ -80,4 +80,4 @@ TEST(ProgramDesc, copy_ctor) {
   // different and it is correct.
 }
 }  // namespace framework
-}  // namespace paddle
\ No newline at end of file
+}  // namespace paddle
diff --git a/paddle/framework/proto_desc.h b/paddle/framework/proto_desc.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa01224fefce50eb3688ff407f0a7c948c5b7cfc
--- /dev/null
+++ b/paddle/framework/proto_desc.h
@@ -0,0 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+namespace framework {
+
+// The index of the first block in a program, also called the root block.
+constexpr int kRootBlockIndex = 0;
+// The parent index of the root block; no such parent block exists.
+constexpr int kNoneBlockIndex = -1;
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index 5bf5e91f25ab1d920ae368eaf2000fce77d2eb07..ac3ac649f96c492852a3bd69be69487736a4ddd7 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -65,12 +65,11 @@ void Scope::DropKids() {
   kids_.clear();
 }
 
-framework::Scope& GetGlobalScope() {
-  static framework::Scope* g_scope = nullptr;
-  if (g_scope == nullptr) {
-    g_scope = new framework::Scope();
-  }
-  return *g_scope;
+void Scope::DeleteScope(Scope* scope) {
+  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
+  PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
+  this->kids_.erase(it);
+  delete scope;
 }
 
 }  // namespace framework
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index a7fce3514b163d78bf96b3cc19d188744a383395..7206b53068bac3e16db385abc76359dc45a582df 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -59,6 +59,8 @@ class Scope {
   /// Find the scope or an ancestor scope that contains the given variable.
   const Scope* FindScope(const Variable* var) const;
 
+  void DeleteScope(Scope* scope);
+
   /// Drop all kids scopes belonged to this scope.
   void DropKids();
 
@@ -72,8 +74,5 @@ class Scope {
 
   DISABLE_COPY_AND_ASSIGN(Scope);
 };
-
-framework::Scope& GetGlobalScope();
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index bc430852de6384ce8a02780d4e90787d58f5574c..3a2bdaf086372d5d0b07cf260feb2ee6f3cfb508 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -60,6 +60,10 @@ class Tensor {
   template <typename T>
   inline T* mutable_data(platform::Place place);
 
+  inline void* mutable_data(platform::Place place, std::type_index type);
+
+  inline void* mutable_data(platform::Place place);
+
   /**
    * @brief     Return a pointer to mutable memory block.
    *
@@ -81,7 +85,6 @@ class Tensor {
   inline Tensor& Resize(const DDim& dims);
 
   /*! The internal of two tensors share the same memory block. */
-  template <typename T>
   inline Tensor& ShareDataWith(const Tensor& src);
 
   /**
@@ -96,26 +99,9 @@ class Tensor {
   // TODO(qijun): https://github.com/PaddlePaddle/Paddle/issues/4647
   // Remove `CopyFrom` and `CopyFromVector` from Tensor interface
   // and make them global functions
-  template <typename T>
   inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
                        const platform::DeviceContext& ctx);
 
-  // FIXME(yuyang18): CopyFrom should without template T, use the replace
-  // `CopyFrom` with `CopyFromTensor`
-  inline void CopyFromTensor(const Tensor& src,
-                             const platform::Place& dst_place,
-                             const platform::DeviceContext& ctx) {
-    // NOLINTNEXTLINES_8 cpplint.py will recognize below lines as functions.
-    // That is a bug of cpplint.py. Just ignore lint these lines.
-    if (src.type() == std::type_index(typeid(double))) {
-      CopyFrom<double>(src, dst_place, ctx);
-    } else if (src.type() == std::type_index(typeid(float))) {
-      CopyFrom<float>(src, dst_place, ctx);
-    } else if (src.type() == std::type_index(typeid(int))) {
-      CopyFrom<int>(src, dst_place, ctx);
-    }
-  }
-
   /**
    * @brief   Copy the content of an external vector to a tensor.
    *
@@ -135,7 +121,6 @@ class Tensor {
    * @param[in] begin_idx   The begin index of the slice.
    * @param[in] end_idx     The end index of the slice.
    */
-  template <typename T>
   inline Tensor Slice(const int& begin_idx, const int& end_idx) const;
 
   platform::Place place() const {
@@ -146,7 +131,6 @@ class Tensor {
   std::type_index type() const { return holder_->type(); }
 
  private:
-  template <typename T>
   inline void check_memory_size() const;
 
  private:
@@ -155,20 +139,22 @@ class Tensor {
    *          parameter of Variable.
    */
   struct Placeholder {
-    virtual ~Placeholder() {}
+    virtual ~Placeholder() = default;
     virtual void* ptr() const = 0;
     virtual size_t size() const = 0;
     virtual std::type_index type() const = 0;
     virtual platform::Place place() const = 0;
+    virtual void set_type(std::type_index type) = 0;
   };
 
-  template <typename T, typename Place>
+  template <typename Place>
   struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(Place place, size_t size)
-        : ptr_(static_cast<T*>(memory::Alloc(place, size)),
-               memory::PODDeleter<T, Place>(place)),
+    PlaceholderImpl(Place place, size_t size, std::type_index type)
+        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
+               memory::PODDeleter<uint8_t, Place>(place)),
           place_(place),
-          size_(size) {
+          size_(size),
+          type_(type) {
       PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
                               (is_cpu_place(place_) ? "CPU" : "GPU"));
     }
@@ -176,16 +162,20 @@ class Tensor {
     virtual size_t size() const { return size_; }
     virtual platform::Place place() const { return place_; }
     virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
-    virtual std::type_index type() const { return std::type_index(typeid(T)); }
+    virtual std::type_index type() const { return type_; }
+    virtual void set_type(std::type_index type) { type_ = type; }
 
     /*! the pointer of memory block. */
-    std::unique_ptr<T, memory::PODDeleter<T, Place>> ptr_;
+    std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
 
     /*! the place of memory block. */
     platform::Place place_;
 
     /*! the size of memory block. */
     size_t size_;
+
+    /* the current type of memory */
+    std::type_index type_;
   };
 
   /*! holds the memory block if allocated. */
diff --git a/paddle/framework/tensor_array.cc b/paddle/framework/tensor_array.cc
index 06459cbfd7b8c19c176452ff73c9f3a81ba1dc03..4c82c3638351c41df26503e2a26b5a4bb5822a67 100644
--- a/paddle/framework/tensor_array.cc
+++ b/paddle/framework/tensor_array.cc
@@ -106,8 +106,8 @@ void TensorArray::Write(size_t index, const LoDTensor& value) {
 
   values_[index].Resize(value.dims());
   values_[index].mutable_data<value_type>(platform::CPUPlace());
-  values_[index].CopyFrom<value_type>(value, platform::CPUPlace(),
-                                      platform::CPUDeviceContext());
+  values_[index].CopyFrom(value, platform::CPUPlace(),
+                          platform::CPUDeviceContext());
 }
 
 void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
@@ -116,7 +116,7 @@ void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
     values_.resize(index + 1);
   }
 
-  values_[index].ShareDataWith<value_type>(value);
+  values_[index].ShareDataWith(value);
 }
 
 LoDTensor TensorArray::Pack(size_t level, const std::vector<DySeqMeta>& meta,
@@ -163,9 +163,9 @@ LoDTensor TensorArray::Stack() const {
   result.mutable_data<value_type>(platform::CPUPlace());
 
   for (size_t idx = 0; idx < size(); idx++) {
-    result.Slice<value_type>(idx, idx + 1)
-        .CopyFrom<value_type>(Read(idx), platform::CPUPlace(),
-                              platform::CPUDeviceContext());
+    result.Slice(idx, idx + 1)
+        .CopyFrom(Read(idx), platform::CPUPlace(),
+                  platform::CPUDeviceContext());
   }
   return result;
 }
@@ -191,13 +191,12 @@ void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const {
     auto& value = values_[elem];
     if (data_shared) {
       // share memory
-      value.ShareDataWith<value_type>(source.Slice<value_type>(elem, elem + 1));
+      value.ShareDataWith(source.Slice(elem, elem + 1));
     } else {
       // copy
       value.Resize(value_dims);
-      value.CopyFrom<value_type>(source.Slice<value_type>(elem, elem + 1),
-                                 platform::CPUPlace(),
-                                 platform::CPUDeviceContext());
+      value.CopyFrom(source.Slice(elem, elem + 1), platform::CPUPlace(),
+                     platform::CPUDeviceContext());
     }
   }
 }
@@ -242,11 +241,10 @@ LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) {
 
   for (size_t i = 0; i < indice.size(); i++) {
     auto index = indice[i];
-    auto target = result.Slice<value_type>(i, i + 1);
-    auto slice = source->Slice<value_type>(index, index + 1);
+    auto target = result.Slice(i, i + 1);
+    auto slice = source->Slice(index, index + 1);
 
-    target.CopyFrom<value_type>(slice, platform::CPUPlace(),
-                                platform::CPUDeviceContext());
+    target.CopyFrom(slice, platform::CPUPlace(), platform::CPUDeviceContext());
   }
 
   return result;
@@ -277,10 +275,10 @@ LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
       // target is result[index]
       auto index = seq_meta.begin + batch_id;
       if (index >= seq_meta.end) break;
-      auto source_ = source[batch_id].Slice<float>(seq_id, seq_id + 1);
-      auto target = result.Slice<float>(index, index + 1);
-      target.CopyFrom<float>(source_, platform::CPUPlace(),
-                             platform::CPUDeviceContext());
+      auto source_ = source[batch_id].Slice(seq_id, seq_id + 1);
+      auto target = result.Slice(index, index + 1);
+      target.CopyFrom(source_, platform::CPUPlace(),
+                      platform::CPUDeviceContext());
     }
   }
 
diff --git a/paddle/framework/tensor_array_test.cc b/paddle/framework/tensor_array_test.cc
index d9f52509cdd1b79f6d53b5d4922f9e44279de08b..9470ac5e6ed714d5ba63f3743e683af7f8edd4b0 100644
--- a/paddle/framework/tensor_array_test.cc
+++ b/paddle/framework/tensor_array_test.cc
@@ -91,7 +91,7 @@ class TensorArrayPackTester : public ::testing::Test {
       size_t begin = level[i];
       size_t end = level[i + 1];
       for (size_t j = begin; j < end; j++) {
-        auto record = source.Slice<int>(j, j + 1);
+        auto record = source.Slice(j, j + 1);
         for (int dim = 0; dim < 128; dim++) {
           record.mutable_data<int>(platform::CPUPlace())[dim] = j - begin;
         }
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index ce73e0a9edbe340f1165e2dbcba8c976c55df348..f6e801bbb4a056b5590da95a4b140cb90638f322 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -19,12 +19,50 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+template <typename... T>
+struct SizeOfTypeFunctor;
+
 template <typename T>
+struct SizeOfTypeFunctor<T> {
+  size_t operator()(std::type_index type) const {
+    if (typeid(T).hash_code() == type.hash_code()) {
+      return sizeof(T);
+    } else {
+      return 0UL;
+    }
+  }
+};
+
+template <>
+struct SizeOfTypeFunctor<> {
+  size_t operator()(std::type_index type) const { return 0UL; }
+};
+
+template <typename HEAD, typename... TAIL>
+struct SizeOfTypeFunctor<HEAD, TAIL...> {
+  size_t operator()(std::type_index type) const {
+    SizeOfTypeFunctor<HEAD> head;
+    size_t head_size = head(type);
+    if (head_size != 0) {
+      return head_size;
+    }
+    SizeOfTypeFunctor<TAIL...> tail;
+    return tail(type);
+  }
+};
+
+static inline size_t SizeOfType(std::type_index type) {
+  SizeOfTypeFunctor<int, float, double, int16_t, int64_t> functor;
+  size_t size = functor(type);
+  PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
+  return size;
+}
+
 inline void Tensor::check_memory_size() const {
   PADDLE_ENFORCE_NOT_NULL(
       holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
   PADDLE_ENFORCE_GE(
-      holder_->size(), numel() * sizeof(T) + offset_,
+      holder_->size(), numel() * SizeOfType(type()) + offset_,
       "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
       "first to re-allocate memory.\n"
       "or maybe the required data-type mismatches the data already stored.");
@@ -32,14 +70,23 @@ inline void Tensor::check_memory_size() const {
 
 template <typename T>
 inline const T* Tensor::data() const {
-  check_memory_size<T>();
+  check_memory_size();
+  // Match on type identity; distinct types may share a hash_code().
+  PADDLE_ENFORCE(
+      std::is_same<T, void>::value || holder_->type() == typeid(T),
+      "Tensor holds the wrong type, it holds %s", holder_->type().name());
+
   return reinterpret_cast<const T*>(
       reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
 }
 
 template <typename T>
 inline T* Tensor::data() {
-  check_memory_size<T>();
+  check_memory_size();
+  // Match on type identity; distinct types may share a hash_code().
+  PADDLE_ENFORCE(
+      std::is_same<T, void>::value || holder_->type() == typeid(T),
+      "Tensor holds the wrong type, it holds %s", holder_->type().name());
   return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                               offset_);
 }
@@ -54,51 +101,62 @@ inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
 template <typename T>
 inline T* Tensor::mutable_data(platform::Place place) {
   static_assert(std::is_pod<T>::value, "T must be POD");
+  return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
+}
+
+inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
+  if (holder_ != nullptr) {
+    holder_->set_type(type);
+  }
   PADDLE_ENFORCE_GT(numel(), 0,
                     "Tensor's numel must be larger than zero to call "
                     "Tensor::mutable_data. Call Tensor::set_dim first.");
+  int64_t size = numel() * SizeOfType(type);
   /* some versions of boost::variant don't have operator!= */
-  int64_t size = numel() * sizeof(T);
   if (holder_ == nullptr || !(holder_->place() == place) ||
       holder_->size() < size + offset_) {
     if (platform::is_cpu_place(place)) {
-      holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), size));
+      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
+          boost::get<platform::CPUPlace>(place), size, type));
     } else if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
       PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
     }
 #else
-      holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
-          boost::get<platform::GPUPlace>(place), size));
+      holder_.reset(new PlaceholderImpl<platform::GPUPlace>(
+          boost::get<platform::GPUPlace>(place), size, type));
     }
 #endif
     offset_ = 0;
   }
-  return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                              offset_);
+  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                                 offset_);
+}
+
+inline void* Tensor::mutable_data(platform::Place place) {
+  PADDLE_ENFORCE(this->holder_ != nullptr,
+                 "Cannot invoke mutable data if current hold nothing");
+  return mutable_data(place, holder_->type());
 }
 
-template <typename T>
 inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
-  src.check_memory_size<T>();
+  src.check_memory_size();
   *this = src;
   return *this;
 }
 
-template <typename T>
 inline void Tensor::CopyFrom(const Tensor& src,
                              const platform::Place& dst_place,
                              const platform::DeviceContext& ctx) {
-  src.check_memory_size<T>();
+  src.check_memory_size();
   Resize(src.dims());
 
   auto src_place = src.holder_->place();
-  auto src_ptr = static_cast<const void*>(src.data<T>());
+  auto src_ptr = src.data<void>();
 
-  auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
+  auto dst_ptr = mutable_data(dst_place, src.type());
 
-  auto size = src.numel() * sizeof(T);
+  auto size = src.numel() * SizeOfType(src.type());
 
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
@@ -165,9 +223,8 @@ inline void Tensor::CopyFromVector(const std::vector<T>& src,
 #endif
 }
 
-template <typename T>
 inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
-  check_memory_size<T>();
+  check_memory_size();
   PADDLE_ENFORCE_GE(begin_idx, 0, "Slice begin index is less than zero.");
   PADDLE_ENFORCE_LE(end_idx, dims_[0], "Slice end index is out of bound.");
   PADDLE_ENFORCE_LT(begin_idx, end_idx,
@@ -182,7 +239,7 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
     DDim dst_dims = dims_;
     dst_dims[0] = end_idx - begin_idx;
     dst.Resize(dst_dims);
-    dst.offset_ = offset_ + begin_idx * base * sizeof(T);
+    dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
     return dst;
   }
 }
@@ -196,10 +253,9 @@ inline const DDim& Tensor::dims() const { return dims_; }
 
 inline int64_t Tensor::numel() const { return product(dims_); }
 
-template <typename T>
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
   Tensor res;
-  res.ShareDataWith<T>(src);
+  res.ShareDataWith(src);
   res.Resize(flatten_to_2d(src.dims(), num_col_dims));
   return res;
 }
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index 0b62fe08ce9e592384e55432861a943403453bb7..1bb0fb71b079940d35a995b78e04a531c074a8b2 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -108,7 +108,7 @@ TEST(Tensor, ShareDataWith) {
     // Try to share data form uninitialized tensor
     bool caught = false;
     try {
-      dst_tensor.ShareDataWith<float>(src_tensor);
+      dst_tensor.ShareDataWith(src_tensor);
     } catch (paddle::platform::EnforceNotMet err) {
       caught = true;
       std::string msg =
@@ -122,7 +122,7 @@ TEST(Tensor, ShareDataWith) {
     ASSERT_TRUE(caught);
 
     src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
-    dst_tensor.ShareDataWith<int>(src_tensor);
+    dst_tensor.ShareDataWith(src_tensor);
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
 
@@ -131,7 +131,7 @@ TEST(Tensor, ShareDataWith) {
     Tensor src_tensor;
     Tensor dst_tensor;
     src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
-    dst_tensor.ShareDataWith<int>(src_tensor);
+    dst_tensor.ShareDataWith(src_tensor);
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
 #endif
@@ -143,7 +143,7 @@ TEST(Tensor, Slice) {
   {
     Tensor src_tensor;
     src_tensor.mutable_data<int>(make_ddim({5, 3, 4}), CPUPlace());
-    Tensor slice_tensor = src_tensor.Slice<int>(1, 3);
+    Tensor slice_tensor = src_tensor.Slice(1, 3);
     DDim slice_dims = slice_tensor.dims();
     ASSERT_EQ(arity(slice_dims), 3);
     EXPECT_EQ(slice_dims[0], 2);
@@ -167,7 +167,7 @@ TEST(Tensor, Slice) {
   {
     Tensor src_tensor;
     src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
-    Tensor slice_tensor = src_tensor.Slice<double>(2, 6);
+    Tensor slice_tensor = src_tensor.Slice(2, 6);
     DDim slice_dims = slice_tensor.dims();
     ASSERT_EQ(arity(slice_dims), 2);
     EXPECT_EQ(slice_dims[0], 4);
@@ -202,7 +202,7 @@ TEST(Tensor, CopyFrom) {
     memcpy(src_ptr, arr, 9 * sizeof(int));
 
     auto cpu_place = new paddle::platform::CPUPlace();
-    dst_tensor.CopyFrom<int>(src_tensor, *cpu_place, cpu_ctx);
+    dst_tensor.CopyFrom(src_tensor, *cpu_place, cpu_ctx);
 
     const int* dst_ptr = dst_tensor.data<int>();
     ASSERT_NE(src_ptr, dst_ptr);
@@ -210,8 +210,8 @@ TEST(Tensor, CopyFrom) {
       EXPECT_EQ(src_ptr[i], dst_ptr[i]);
     }
 
-    Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
-    dst_tensor.CopyFrom<int>(slice_tensor, *cpu_place, cpu_ctx);
+    Tensor slice_tensor = src_tensor.Slice(1, 2);
+    dst_tensor.CopyFrom(slice_tensor, *cpu_place, cpu_ctx);
     const int* slice_ptr = slice_tensor.data<int>();
     dst_ptr = dst_tensor.data<int>();
     ASSERT_NE(dst_ptr, slice_ptr);
@@ -233,11 +233,11 @@ TEST(Tensor, CopyFrom) {
     // CPU Tensor to GPU Tensor
     auto gpu_place = new paddle::platform::GPUPlace(0);
     CUDADeviceContext gpu_ctx(*gpu_place);
-    gpu_tensor.CopyFrom<int>(src_tensor, *gpu_place, gpu_ctx);
+    gpu_tensor.CopyFrom(src_tensor, *gpu_place, gpu_ctx);
 
     // GPU Tensor to CPU Tensor
     auto cpu_place = new paddle::platform::CPUPlace();
-    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
+    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
 
     // Sync before Compare Tensors
     gpu_ctx.Wait();
@@ -247,13 +247,13 @@ TEST(Tensor, CopyFrom) {
       EXPECT_EQ(src_ptr[i], dst_ptr[i]);
     }
 
-    Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
+    Tensor slice_tensor = src_tensor.Slice(1, 2);
 
     // CPU Slice Tensor to GPU Tensor
-    gpu_tensor.CopyFrom<int>(slice_tensor, *gpu_place, gpu_ctx);
+    gpu_tensor.CopyFrom(slice_tensor, *gpu_place, gpu_ctx);
 
     // GPU Tensor to CPU Tensor
-    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
+    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
 
     // Sync before Compare Slice Tensors
     gpu_ctx.Wait();
@@ -320,7 +320,7 @@ TEST(Tensor, CopyFromVector) {
     CUDADeviceContext gpu_ctx(*gpu_place);
     gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
     // Copy from GPU to CPU tensor for comparison
-    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
+    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
 
     // Sync before Compare Tensors
     gpu_ctx.Wait();
@@ -340,7 +340,7 @@ TEST(Tensor, CopyFromVector) {
     cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
     gpu_tensor.Resize(make_ddim({2, 2}));
     gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
-    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
+    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
 
     // Sync before Compare Tensors
     gpu_ctx.Wait();
@@ -368,7 +368,7 @@ TEST(Tensor, ReshapeToMatrix) {
   for (int i = 0; i < 2 * 3 * 4 * 9; ++i) {
     src_ptr[i] = i;
   }
-  Tensor res = ReshapeToMatrix<int>(src, 2);
+  Tensor res = ReshapeToMatrix(src, 2);
   ASSERT_EQ(res.dims()[0], 2 * 3);
   ASSERT_EQ(res.dims()[1], 4 * 9);
 }
diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h
index 38fc2720a3023039aa113b32a394bda9c5def4c0..a80f0e66b5a59bf95efc200d159ad5dd9cf4111a 100644
--- a/paddle/framework/variable.h
+++ b/paddle/framework/variable.h
@@ -25,7 +25,10 @@ class Variable {
  public:
   template <typename T>
   const T& Get() const {
-    PADDLE_ENFORCE(IsType<T>(), "Variable must be type %s", typeid(T).name());
+    PADDLE_ENFORCE(holder_ != nullptr, "Variable must hold some thing");
+    PADDLE_ENFORCE(IsType<T>(),
+                   "Variable must be type %s, the holding type is %s",
+                   typeid(T).name(), holder_->Type().name());
     return *static_cast<const T*>(holder_->Ptr());
   }
 
diff --git a/paddle/gserver/activations/MKLDNNActivation.cpp b/paddle/gserver/activations/MKLDNNActivation.cpp
index 18c5638100065109fba1f0647a1c5f91256f7b9d..f3ccd68160859795f28a40f8d0d4032adb289ccf 100644
--- a/paddle/gserver/activations/MKLDNNActivation.cpp
+++ b/paddle/gserver/activations/MKLDNNActivation.cpp
@@ -126,7 +126,7 @@ void MKLDNNEltwiseActivation::resetFwd(Argument& act) {
   copyInVal_ = nullptr;
   if (act.grad && algo == algorithm::eltwise_tanh) {
     // tanh need save src input for backward
-    inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc());
+    inVal_ = MKLDNNMatrix::create(val_->getPrimitiveDesc());
     copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
     CHECK(copyInVal_) << "should not be emptry";
     pipelineFwd_.push_back(*copyInVal_);
@@ -145,7 +145,7 @@ void MKLDNNEltwiseActivation::resetBwd(Argument& act) {
   algorithm algo = getAlgo(this->getName());
   float alpha = getBwdAlpha();
   float beta = getBeta();
-  grad_ = MKLDNNMatrix::create(act.grad, val_->getPrimitiveDesc());
+  grad_ = MKLDNNMatrix::create(val_->getPrimitiveDesc(), act.grad);
   auto eng = CPUEngine::Instance().getEngine();
   auto bwdDesc = eltwise_bwd::desc(
       algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
@@ -230,7 +230,7 @@ void MKLDNNActivation::resetFwd(Argument& act) {
     int ic = cnt_ / bs / ih / iw;
     CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);
     val_ = MKLDNNMatrix::create(
-        act.value, {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_);
+        {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_, act.value);
     CHECK(val_);
     val_->downSpatial();
   }
diff --git a/paddle/gserver/layers/MKLDNNBase.h b/paddle/gserver/layers/MKLDNNBase.h
index 4c0234e7b3a91053596c32cea581fa5d1e26b9d5..af02a37cad668708f77ecf423549a8ec993e54fb 100644
--- a/paddle/gserver/layers/MKLDNNBase.h
+++ b/paddle/gserver/layers/MKLDNNBase.h
@@ -21,8 +21,8 @@ namespace paddle {
 typedef enum {
   MKLDNN_BASE = 1,   // basical info of MKLDNN
   MKLDNN_TESTS = 1,  // gtest info of MKLDNN
-  MKLDNN_SIZES = 2,  // size info of MKLDNN
-  MKLDNN_FMTS = 3,   // format info of MKLDNN
+  MKLDNN_FMTS = 2,   // format info of MKLDNN
+  MKLDNN_SIZES = 3,  // size info of MKLDNN
   MKLDNN_ALL = 4,    // show all info of MKLDNN
 } MKLDNN_LOG_LEVEL;
 
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp
index 26810a648343d6203f7937740641325ae8ea6879..83f4e4e6151d727b3e6cf367bb7ecae55dd7df73 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -116,8 +116,6 @@ void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
   resetFwdBuffers(fwdPD_, in, wgt, bias, out);
 
   resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
-
-  printValueFormatFlow();
 }
 
 void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
@@ -135,12 +133,6 @@ void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
   resetBwdBuffers(bwdWgtPD, bwdDataPD, in, wgt, bias, out);
 
   resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
-
-  printGradFormatFlow();
-}
-
-void MKLDNNConvLayer::updateInputData() {
-  cpuInVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
 }
 
 void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) {
@@ -211,11 +203,18 @@ void MKLDNNConvLayer::resetFwdBuffers(
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
   CHECK(pd);
-  resetInValue(pd, in);
+  resetInValue(
+      in, std::make_shared<memory::primitive_desc>(pd->src_primitive_desc()));
+
+  resetOutValue(out, pd->dst_primitive_desc());
 
-  resetWgtBiasValue(pd, wgt, bias);
+  resetWithMatrix(wgt, weight_->getW(), pd->weights_primitive_desc());
 
-  resetOutValue(pd, out);
+  if (biases_ && biases_->getW()) {
+    resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc());
+  } else {
+    bias = nullptr;
+  }
 }
 
 void MKLDNNConvLayer::resetFwdPipeline(
@@ -225,104 +224,12 @@ void MKLDNNConvLayer::resetFwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  if (cvtInVal_) {
-    pipeline.push_back(*cvtInVal_);
-  }
-
   if (bias) {
     fwd_.reset(new conv_fwd(*pd, *in, *wgt, *bias, *out));
   } else {
     fwd_.reset(new conv_fwd(*pd, *in, *wgt, *out));
   }
   pipeline.push_back(*fwd_);
-
-  if (cvtOutVal_) {
-    pipeline.push_back(*cvtOutVal_);
-  }
-}
-
-void MKLDNNConvLayer::resetInValue(
-    std::shared_ptr<conv_fwd::primitive_desc>& pd, MKLDNNMatrixPtr& in) {
-  const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
-  in = MKLDNNMatrix::create(inMat, pd->src_primitive_desc());
-
-  // create buffer and reorder if input value do not match
-  cpuInVal_ = nullptr;
-  cvtInVal_ = nullptr;
-
-  MKLDNNMatrixPtr dnnIn = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
-  CHECK_EQ(inputIsOnlyMKLDNN(), dnnIn != nullptr);
-  if (dnnIn != nullptr && dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) {
-    in = dnnIn;
-    return;
-  }
-  if (dnnIn) {
-    if (dnnIn->getFormat() == format::nc) {
-      CHECK(ih_ == 1 && iw_ == 1) << "when input is nc format";
-      // create a new one with nchw format and same data
-      memory::dims inDims = memory::dims{bs_, ic_, 1, 1};
-      dnnIn = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_);
-    }
-    if (dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) {
-      in = dnnIn;
-      return;
-    }
-    cpuInVal_ = dnnIn;
-    in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc());
-    cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in);
-    CHECK(cvtInVal_) << "should not be emptry";
-  } else {
-    memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
-    cpuInVal_ = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_);
-    if (cpuInVal_->getPrimitiveDesc() != in->getPrimitiveDesc()) {
-      // create new mkldnn matrix
-      in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc());
-      cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in);
-      CHECK(cvtInVal_) << "should not be emptry";
-    } else {
-      in = cpuInVal_;
-    }
-  }
-}
-
-void MKLDNNConvLayer::resetWgtBiasValue(
-    std::shared_ptr<conv_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias) {
-  wgt = MKLDNNMatrix::create(weight_->getW(), pd->weights_primitive_desc());
-  VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat();
-
-  bias = (biases_ && biases_->getW())
-             ? MKLDNNMatrix::create(biases_->getW(), pd->bias_primitive_desc())
-             : nullptr;
-}
-
-void MKLDNNConvLayer::resetOutValue(
-    std::shared_ptr<conv_fwd::primitive_desc>& pd, MKLDNNMatrixPtr& out) {
-  out = MKLDNNMatrix::create(output_.value, pd->dst_primitive_desc());
-
-  // create reorder if output value has cpu device and pd do not match
-  cpuOutVal_ = nullptr;
-  cvtOutVal_ = nullptr;
-  if (!outputIsOnlyMKLDNN()) {
-    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
-    memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
-    cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
-    if (cpuOutVal_->getPrimitiveDesc() != pd->dst_primitive_desc()) {
-      out = MKLDNNMatrix::create(nullptr, pd->dst_primitive_desc());
-      cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
-      CHECK(cvtOutVal_) << "should not be empty";
-    } else {
-      cpuOut->setData(output_.value->getData());
-      cpuOutVal_ = out;
-    }
-    // when output is cpu device, change the mkldnn output value and make them
-    // share the same data. Then if next layer use inputlayer->getOuputValue()
-    // to achieve the input value, it will get the right data.
-    output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_);
-    return;
-  }
-  output_.value = std::dynamic_pointer_cast<Matrix>(out);
 }
 
 void MKLDNNConvLayer::resetBwdWgtPD(
@@ -331,8 +238,8 @@ void MKLDNNConvLayer::resetBwdWgtPD(
   loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
 
   // create backward weight using input, output and weight value memory desc
-  CHECK(inVal_) << "Should have input value";
-  CHECK(outVal_) << "Should have output value";
+  CHECK(inVal_) << "Should have internal input value";
+  CHECK(outVal_) << "Should have internal output value";
   CHECK(wgtVal_) << "Should have weight value";
   algorithm algo = algorithm::convolution_direct;
   padding_kind padKind = padding_kind::zero;
@@ -372,8 +279,8 @@ void MKLDNNConvLayer::resetBwdDataPD(
 
   memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
   loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
-  CHECK(inVal_) << "Should have input value";
-  CHECK(outVal_) << "Should have output value";
+  CHECK(inVal_) << "Should have internal input value";
+  CHECK(outVal_) << "Should have internal output value";
   // create backward data using input and output value memory desc
   // but using weight memory desc with any format
   auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct,
@@ -399,12 +306,27 @@ void MKLDNNConvLayer::resetBwdBuffers(
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
   CHECK(wgtPD);
-  resetOutGrad(wgtPD, out);
+  resetOutGrad(out, wgtPD->diff_dst_primitive_desc());
 
-  resetWgtBiasGrad(wgtPD, wgt, bias);
+  resetWithMatrix(
+      wgt, weight_->getWGrad(), wgtPD->diff_weights_primitive_desc());
+  CHECK(wgtVal_ != nullptr &&
+        wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc())
+      << "primitive desc of weight grad and value should be equal";
 
-  resetInGrad(dataPD, in);
+  bias = nullptr;
+  if (biases_ && biases_->getWGrad()) {
+    resetWithMatrix(
+        bias, biases_->getWGrad(), wgtPD->diff_bias_primitive_desc());
+    CHECK(bias && biasVal_ &&
+          bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc())
+        << "primitive desc of bias grad should equal the bias value";
+  }
 
+  if (dataPD == nullptr) {
+    return;
+  }
+  resetInGrad(in, dataPD->diff_src_primitive_desc());
   resetWgtValBwdData(dataPD, wgtValBwdData_);
 }
 
@@ -416,10 +338,7 @@ void MKLDNNConvLayer::resetBwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  if (cvtOutGrad_) {
-    pipeline.push_back(*cvtOutGrad_);
-  }
-
+  CHECK(inVal_);
   // add bwdWgt handle
   if (bias) {
     bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt, *bias));
@@ -431,99 +350,13 @@ void MKLDNNConvLayer::resetBwdPipeline(
   if (dataPD == nullptr) {
     return;
   }
-
   if (cvtWgtVal_) {
     pipeline.push_back(*cvtWgtVal_);
   }
-
   // add bwdData handle
   CHECK(wgtValBwdData_) << "Should have weight memory";
   bwdData_.reset(new conv_bwdData(*dataPD, *out, *wgtValBwdData_, *in));
   pipeline.push_back(*bwdData_);
-
-  if (cvtInGrad_) {
-    pipeline.push_back(*cvtInGrad_);
-  }
-}
-
-void MKLDNNConvLayer::resetOutGrad(
-    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD, MKLDNNMatrixPtr& out) {
-  cpuOutGrad_ = nullptr;
-  cvtOutGrad_ = nullptr;
-  CHECK(outVal_ != nullptr &&
-        outVal_->getPrimitiveDesc() == wgtPD->diff_dst_primitive_desc())
-      << "primitive desc of out grad and value should be equal";
-  if (outputIsOnlyMKLDNN()) {
-    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
-  } else {
-    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
-    // always share the same grad data of CPU output
-    // then the activation can get the right grad from output_.grad
-    output_.grad->setData(cpuOut->getData());
-    // same PrimitiveDesc with cpuInVal_
-    CHECK(cpuOutVal_);
-    cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc());
-    // create reorder if primitive desc does not match
-    if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
-      out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc());
-      cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
-      CHECK(cvtOutGrad_);
-    } else {
-      out = cpuOutGrad_;
-    }
-  }
-}
-
-void MKLDNNConvLayer::resetWgtBiasGrad(
-    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias) {
-  wgt = MKLDNNMatrix::create(weight_->getWGrad(),
-                             wgtPD->diff_weights_primitive_desc());
-  CHECK(nullptr != wgtVal_ &&
-        wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc())
-      << "primitive desc of weight grad and value should be equal";
-  VLOG(MKLDNN_FMTS) << "weight grad format: " << wgt->getFormat();
-
-  bias = nullptr;
-  if (biasVal_ == nullptr) {
-    return;
-  }
-  bias = MKLDNNMatrix::create(biases_->getWGrad(),
-                              wgtPD->diff_bias_primitive_desc());
-  CHECK(bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc())
-      << "primitive desc of bias grad should equal the bias value";
-}
-
-void MKLDNNConvLayer::resetInGrad(
-    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-    MKLDNNMatrixPtr& in) {
-  in = nullptr;
-  cpuInGrad_ = nullptr;
-  cvtInGrad_ = nullptr;
-  if (dataPD == nullptr) {
-    return;
-  }
-
-  if (inputIsOnlyMKLDNN()) {
-    MKLDNNLayer::resetInGrad(in, dataPD->diff_src_primitive_desc());
-    CHECK(nullptr != inVal_ &&
-          in->getPrimitiveDesc() == inVal_->getPrimitiveDesc())
-        << "primitive desc of input grad and value should be equal";
-  } else {
-    const MatrixPtr& cpuIn = getInputGrad(0, CPU_DEVICE);
-    // same PrimitiveDesc with cpuInVal_
-    CHECK(cpuInVal_);
-    cpuInGrad_ = MKLDNNMatrix::create(cpuIn, cpuInVal_->getPrimitiveDesc());
-    in = cpuInGrad_;
-    // create reorder if PrimitiveDesc does not match
-    if (cpuInGrad_->getPrimitiveDesc() != dataPD->diff_src_primitive_desc()) {
-      in = MKLDNNMatrix::create(getInputGrad(0, MKLDNN_DEVICE),
-                                dataPD->diff_src_primitive_desc());
-      cvtInGrad_ = MKLDNNMatrix::createReorder(in, cpuInGrad_);
-      CHECK(cvtInGrad_);
-    }
-  }
 }
 
 void MKLDNNConvLayer::resetWgtValBwdData(
@@ -537,8 +370,7 @@ void MKLDNNConvLayer::resetWgtValBwdData(
   // since the primitive_desc would be different with wgtVal_
   CHECK(wgtVal_) << "should have weight value";
   if (dataPD->weights_primitive_desc() != wgtVal_->getPrimitiveDesc()) {
-    wgtValBwdData_ =
-        MKLDNNMatrix::create(nullptr, dataPD->weights_primitive_desc());
+    wgtValBwdData_ = MKLDNNMatrix::create(dataPD->weights_primitive_desc());
     cvtWgtVal_ = MKLDNNMatrix::createReorder(wgtVal_, wgtValBwdData_);
     CHECK(cvtWgtVal_);
   } else {
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.h b/paddle/gserver/layers/MKLDNNConvLayer.h
index f84f2f737c47a1b8adc2b83360a0396ffbc6ae24..1fed0e1c6565b763a3ee73a0853f560ddfbd44c6 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.h
+++ b/paddle/gserver/layers/MKLDNNConvLayer.h
@@ -48,17 +48,6 @@ protected:
   // save forward primitive_desc, which can be used backward
   std::shared_ptr<conv_fwd::primitive_desc> fwdPD_;
 
-  // MKLDNNMatrixPtr which should be created from CPU Device
-  MKLDNNMatrixPtr cpuInVal_;
-  MKLDNNMatrixPtr cpuInGrad_;
-  MKLDNNMatrixPtr cpuOutVal_;
-  MKLDNNMatrixPtr cpuOutGrad_;
-  // convert handle between CPU device and MKLDNN device
-  std::shared_ptr<mkldnn::reorder> cvtInVal_;
-  std::shared_ptr<mkldnn::reorder> cvtInGrad_;
-  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
-  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
-
   // whether the weight has been init
   bool hasInitedWgt_;
 
@@ -94,8 +83,6 @@ public:
                 MKLDNNMatrixPtr& bias,
                 MKLDNNMatrixPtr& out) override;
 
-  void updateInputData() override;
-
   void updateWeights(const UpdateCallback& callback) override;
 
   void convertWeightsFromPaddle() override;
@@ -109,26 +96,6 @@ public:
                        << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
   }
 
-  void printValueFormatFlow() override {
-    if (cpuInVal_) {
-      VLOG(MKLDNN_FMTS) << cpuInVal_->getFormat() << " >>>";
-    }
-    MKLDNNLayer::printValueFormatFlow();
-    if (cpuOutVal_) {
-      VLOG(MKLDNN_FMTS) << " >>> " << cpuOutVal_->getFormat();
-    }
-  }
-
-  void printGradFormatFlow() override {
-    if (cpuInGrad_) {
-      VLOG(MKLDNN_FMTS) << cpuInGrad_->getFormat() << " <<<";
-    }
-    MKLDNNLayer::printGradFormatFlow();
-    if (cpuOutGrad_) {
-      VLOG(MKLDNN_FMTS) << " <<< " << cpuOutGrad_->getFormat();
-    }
-  }
-
 protected:
   /**
    * load the dims settings of this conv
@@ -162,23 +129,6 @@ protected:
                         MKLDNNMatrixPtr& bias,
                         MKLDNNMatrixPtr& out);
 
-  /**
-   * reset MKLDNNMatrix of input value
-   */
-  void resetInValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
-                    MKLDNNMatrixPtr& in);
-  /**
-   * reset MKLDNNMatrix of weight and bias value
-   */
-  void resetWgtBiasValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
-                         MKLDNNMatrixPtr& wgt,
-                         MKLDNNMatrixPtr& bias);
-  /**
-   * reset MKLDNNMatrix of output value
-   */
-  void resetOutValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
-                     MKLDNNMatrixPtr& out);
-
   /**
    * reset the backward weight primitive descriptor.
    */
@@ -207,22 +157,6 @@ protected:
                         MKLDNNMatrixPtr& bias,
                         MKLDNNMatrixPtr& out);
 
-  /**
-   * reset MKLDNNMatrix of output grad
-   */
-  void resetOutGrad(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-                    MKLDNNMatrixPtr& out);
-  /**
-   * reset MKLDNNMatrix of weight and bias grad
-   */
-  void resetWgtBiasGrad(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias);
-  /**
-   * reset MKLDNNMatrix of input grad
-   */
-  void resetInGrad(std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-                   MKLDNNMatrixPtr& in);
   /**
    * reset MKLDNNMatrix of weight value for backward data
    * since the primitive_desc would be different with wgtVal_
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index cf19a155681f3a1ceb20af67245c8f2b8fa8fa73..d82063a7130ca928ba042e210eb216f90c7207cd 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -62,7 +62,7 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
   CHECK(wgtVal_) << "should have been initialized";
   bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
   auto targetDim = wgtVal_->getDims();
-  auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
+  auto srcFmt = hasNoSpatial_ ? format::io : format::ihwo;
   wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
   hasInitedWgt_ = true;
 }
@@ -71,7 +71,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
   CHECK(wgtVal_) << "should have been initialized";
   bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
   auto targetDim = wgtVal_->getDims();
-  auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
+  auto dstFmt = hasNoSpatial_ ? format::io : format::ihwo;
   wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
 }
 
@@ -100,8 +100,6 @@ void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
   resetFwdPD(fwdPD_, in, wgt, bias, out);
 
   resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
-
-  printValueFormatFlow();
 }
 
 void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
@@ -119,12 +117,6 @@ void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
   resetBwdDataPD(bwdDataPD, in, out);
 
   resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
-
-  printGradFormatFlow();
-}
-
-void MKLDNNFcLayer::updateInputData() {
-  inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
 }
 
 void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
@@ -139,51 +131,30 @@ void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
                                     MKLDNNMatrixPtr& bias,
                                     MKLDNNMatrixPtr& out) {
   resetInValue(in);
-
-  resetWgtBiasValue(wgt, bias);
-
-  resetOutValue(out);
-}
-
-void MKLDNNFcLayer::resetInValue(MKLDNNMatrixPtr& in) {
-  if (inputIsOnlyMKLDNN()) {
-    const MatrixPtr& dnnIn = getInputValue(0);
-    in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
-    CHECK(in) << "Input should be MKLDNNMatrix";
-  } else {
-    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
-    const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
-    in = MKLDNNMatrix::create(
-        cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
-  }
+  CHECK(in);
   in->downSpatial();
-}
 
-void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt,
-                                      MKLDNNMatrixPtr& bias) {
+  auto outPD =
+      MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_);
+  resetOutValue(out, outPD);
+
   format wgtFmt = format::oihw;
-  if (inVal_->getFormat() == format::nChw8c) {
+  if (in->getFormat() == format::nChw8c) {
     wgtFmt = format::oIhw8i;
-  } else if (inVal_->getFormat() == format::nChw16c) {
+  } else if (in->getFormat() == format::nChw16c) {
     wgtFmt = format::oIhw16i;
   }
-  wgt = MKLDNNMatrix::create(
-      weight_->getW(), {oc_, ic_, ih_, iw_}, wgtFmt, engine_);
+  auto wgtPD =
+      MKLDNNMatrix::createPrimitiveDesc({oc_, ic_, ih_, iw_}, wgtFmt, engine_);
+  resetWithMatrix(wgt, weight_->getW(), wgtPD);
   wgt->downSpatial();
-  VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat();
-
-  bias = (biases_ && biases_->getW())
-             ? MKLDNNMatrix::create(biases_->getW(), {oc_}, format::x, engine_)
-             : nullptr;
-}
 
-void MKLDNNFcLayer::resetOutValue(MKLDNNMatrixPtr& out) {
-  out = MKLDNNMatrix::create(output_.value, {bs_, oc_}, format::nc, engine_);
-  if (!outputIsOnlyMKLDNN()) {
-    // fc cpu output value do not need create convert, just share data
-    getOutput(CPU_DEVICE).value->setData(out->getData());
+  if (biases_ && biases_->getW()) {
+    auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_);
+    resetWithMatrix(bias, biases_->getW(), biasPD);
+  } else {
+    bias = nullptr;
   }
-  output_.value = std::dynamic_pointer_cast<Matrix>(out);
 }
 
 void MKLDNNFcLayer::resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
@@ -219,7 +190,6 @@ void MKLDNNFcLayer::resetFwdPipeline(
   } else {
     fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out));
   }
-
   pipeline.push_back(*fwd_);
 }
 
@@ -227,44 +197,18 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                     MKLDNNMatrixPtr& wgt,
                                     MKLDNNMatrixPtr& bias,
                                     MKLDNNMatrixPtr& out) {
-  resetOutGrad(out);
-
-  resetWgtBiasGrad(wgt, bias);
-
-  resetInGrad(in);
-}
-
-void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
-  CHECK(outVal_);
-  if (outputIsOnlyMKLDNN()) {
-    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
-  } else {
-    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
-    output_.grad->setData(cpuOut->getData());
-    out = MKLDNNMatrix::create(cpuOut, outVal_->getPrimitiveDesc());
-  }
-}
+  CHECK(inVal_ && outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  resetInGrad(in, inVal_->getPrimitiveDesc());
 
-void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,
-                                     MKLDNNMatrixPtr& bias) {
   CHECK(wgtVal_);
-  wgt = MKLDNNMatrix::create(weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
+  resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
 
-  bias = nullptr;
-  if (biasVal_ == nullptr) {
-    return;
-  }
-  bias =
-      MKLDNNMatrix::create(biases_->getWGrad(), biasVal_->getPrimitiveDesc());
-}
-
-void MKLDNNFcLayer::resetInGrad(MKLDNNMatrixPtr& in) {
-  in = nullptr;
-  if (inputLayers_[0]->getOutput().grad == nullptr) {
-    return;
+  if (biasVal_) {
+    resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc());
+  } else {
+    bias = nullptr;
   }
-  CHECK(inVal_);
-  MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
 }
 
 void MKLDNNFcLayer::resetBwdWgtPD(
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h
index c76878aafab7e986d2bf478eaba02f2f0aced293..ee861763ff3dc10ddb4c119358b80dbe1614aecb 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -66,8 +66,6 @@ public:
                 MKLDNNMatrixPtr& bias,
                 MKLDNNMatrixPtr& out) override;
 
-  void updateInputData() override;
-
   void updateWeights(const UpdateCallback& callback) override;
 
   void convertWeightsFromPaddle() override;
@@ -84,9 +82,6 @@ protected:
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
-  void resetInValue(MKLDNNMatrixPtr& in);
-  void resetWgtBiasValue(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias);
-  void resetOutValue(MKLDNNMatrixPtr& out);
   void resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr in,
                   MKLDNNMatrixPtr wgt,
@@ -109,9 +104,6 @@ protected:
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
-  void resetOutGrad(MKLDNNMatrixPtr& out);
-  void resetWgtBiasGrad(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias);
-  void resetInGrad(MKLDNNMatrixPtr& in);
   void resetBwdWgtPD(std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
                      MKLDNNMatrixPtr& wgt,
                      MKLDNNMatrixPtr& bias,
diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6bb19976b5552fcd2e420f03de45c77a90ffb9d2
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@@ -0,0 +1,333 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNLayer.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+bool MKLDNNLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
+                          << "Please set WITH_MKLDNN=ON "
+                          << "and set use_mkldnn=True";
+  CHECK(!useGpu_) << "Do not support GPU yet";
+
+  // set device id before Layer::init
+  setDevice(MKLDNN_DEVICE);
+  // change param device to MKLDNN device
+  setParamsDevice(MKLDNN_DEVICE, parameterMap);
+  if (!Layer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  setOutputMap();
+  checkCPUOutputsNumber();
+
+  stream_.reset(new MKLDNNStream());
+  engine_ = CPUEngine::Instance().getEngine();
+  return true;
+}
+
+void MKLDNNLayer::forward(PassType passType) {
+  passType_ = passType;
+
+  {
+    REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
+    CHECK(!inputLayers_.empty());
+    copySeqInfoToOutputs();
+    size_t elemenCnt = inputLayers_[0]->getOutputValue()->getElementCnt();
+    if (inputElemenCnt_ != elemenCnt) {
+      VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
+      // reset when input total sizes changed, not only the batchsize
+      inputElemenCnt_ = elemenCnt;
+      pipelineFwd_.clear();
+      reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
+      // all cpu device output grad or value share output's
+      shareCPUDevice();
+      resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
+      // MKLDNNLayer output value should be MKLDNNMatrix
+      // so external output value is necessary.
+      // Then external input value is not necessary,
+      // since input may be mkldnn internal buffer.
+      CHECK(extOutVal_) << "external output value is necessary";
+      output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
+      CHECK(inVal_ && outVal_) << "internal memories are necessary";
+      if (cvtInVal_) {
+        pipelineFwd_.insert(pipelineFwd_.begin(), *cvtInVal_);
+      }
+      if (cvtOutVal_) {
+        pipelineFwd_.push_back(*cvtOutVal_);
+      }
+      convertWeightsFromPaddle();
+      printSizeInfo();
+      printValueFormat();
+      needResetBwd_ = true;
+    }
+
+    if (inputLayers_[0]->getType() == "data") {
+      // Update input value data when input layer is "data" type,
+      // since the input value data address might be changed.
+      CHECK(extInVal_);
+      extInVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
+    }
+
+    if (!outputOnlyMKLDNN_) {
+      clearGrads();
+    }
+    stream_->submit(pipelineFwd_);
+  }
+  {
+    REGISTER_TIMER_INFO("FwActTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void MKLDNNLayer::backward(const UpdateCallback& callback) {
+  if (needResetBwd_) {
+    VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
+    pipelineBwd_.clear();
+    pipelineMergeGrad_.clear();
+    mergeGrad_ = nullptr;
+    resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
+    // external output grad is not necessary
+    // since output may be mkldnn internal buffer or merge them directly.
+    CHECK(outGrad_) << "internal output grad is necessary";
+    if (extOutGrad_) {
+      CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
+          << "the external buffer should share the same data with output_.grad";
+    }
+    if (cvtOutGrad_) {
+      pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_);
+    }
+    if (cvtInGrad_) {
+      pipelineBwd_.push_back(*cvtInGrad_);
+    }
+    printGradFormat();
+    needResetBwd_ = false;
+  }
+
+  // merge grad must run before backward activation
+  if (mergeGrad_) {
+    REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str());
+    stream_->submit(pipelineMergeGrad_);
+  }
+  {
+    REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
+    backwardActivation();
+  }
+  {
+    REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
+    stream_->submit(pipelineBwd_);
+  }
+  {
+    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+    updateWeights(callback);
+  }
+}
+
+void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) {
+  const Argument& input = inputLayers_[0]->getOutput();
+  batchsize = input.getBatchSize();
+  int h = input.getFrameHeight();
+  int w = input.getFrameWidth();
+  if (h != 0) {
+    height = h;
+  }
+  if (w != 0) {
+    width = w;
+  }
+}
+
+void MKLDNNLayer::reshapeOutput(size_t height, size_t width) {
+  output_.setFrameHeight(height);
+  output_.setFrameWidth(width);
+  for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+    outputOtherDevice_[i].setFrameHeight(height);
+    outputOtherDevice_[i].setFrameWidth(width);
+  }
+}
+
+void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
+                                  const MatrixPtr& mat,
+                                  memory::primitive_desc pd) {
+  dnn = nullptr;
+  if (mat == nullptr) {
+    return;
+  }
+  dnn = MKLDNNMatrix::create(pd, mat);
+}
+
+void MKLDNNLayer::resetInValue(
+    MKLDNNMatrixPtr& in, const std::shared_ptr<memory::primitive_desc>& intPD) {
+  cvtInVal_ = nullptr;
+  extInVal_ = nullptr;
+  in = nullptr;
+  CHECK_GT(bs_ * ic_ * ih_ * iw_, 0);
+  auto extPD = MKLDNNMatrix::createPrimitiveDesc(
+      {bs_, ic_, ih_, iw_}, format::nchw, engine_);
+  const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
+  in = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
+  CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr);
+  if (in == nullptr || in->getFormat() == format::nc) {
+    in = MKLDNNMatrix::create(extPD, inMat);
+  }
+  extInVal_ = isPaddleFormat(in->getFormat()) ? in : nullptr;
+  if (in->getFormat() == format::nc) {
+    CHECK(ih_ == 1 && iw_ == 1);
+  }
+  if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
+    return;
+  }
+  // need create reorder
+  in = MKLDNNMatrix::create(*intPD);
+  extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(extPD, inMat);
+  cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in);
+  CHECK(cvtInVal_) << "should not be emptry";
+}
+
+void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
+                                memory::primitive_desc intPD) {
+  cvtOutVal_ = nullptr;
+  out = MKLDNNMatrix::create(intPD, output_.value);
+  extOutVal_ = out;
+  if (outputIsOnlyMKLDNN() || isPaddleFormat(extOutVal_->getFormat())) {
+    return;
+  }
+  // need create reorder
+  CHECK_GT(bs_ * oc_ * oh_ * ow_, 0);
+  extOutVal_ = MKLDNNMatrix::create(
+      memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_, output_.value);
+  out = MKLDNNMatrix::create(intPD);
+  cvtOutVal_ = MKLDNNMatrix::createReorder(out, extOutVal_);
+  CHECK(cvtOutVal_) << "should not be empty";
+}
+
+void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
+                              memory::primitive_desc intPD) {
+  cvtInGrad_ = nullptr;
+  extInGrad_ = nullptr;
+  in = nullptr;
+  LayerPtr& input = inputLayers_[0];
+  if (input->getOutputGrad() == nullptr) {
+    // no need input grad
+    return;
+  }
+  CHECK(inputIsOnlyMKLDNN() || input->getOutputMapSize() <= 1)
+      << "only support input is MKLDNN layer or only have one output layer";
+  // when input is a mkldnn branch node,
+  // this layer will save input grad to an internal buffer,
+  // and the mkldnn input layer will merge them to actual prev->output_.grad
+  const MatrixPtr& inMat =
+      input->getOutputMapSize() <= 1 ? input->getOutputGrad() : nullptr;
+  in = MKLDNNMatrix::create(intPD, inMat);
+  Argument& arg = input->getOutput(this->getName());
+  arg.grad = std::dynamic_pointer_cast<Matrix>(in);
+  CHECK(inVal_);
+  CHECK(inVal_->getPrimitiveDesc() == intPD) << "the primitive desc must equal";
+  if (inputIsOnlyMKLDNN()) {
+    return;
+  }
+
+  extInGrad_ = in;
+  if (isPaddleFormat(extInGrad_->getFormat())) {
+    return;
+  }
+  // need create reorder
+  // TODO(TJ): add macro definition to simplify it
+  CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
+      << "should have external input value and the format must be nchw(nc)";
+  extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
+  CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
+      << "should have internal input value and primitive desc must equal";
+  in = MKLDNNMatrix::create(intPD);
+  cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_);
+  CHECK(cvtInGrad_);
+}
+
+void MKLDNNLayer::resetOutGrad(MKLDNNMatrixPtr& out,
+                               memory::primitive_desc intPD) {
+  cvtOutGrad_ = nullptr;
+  extOutGrad_ = nullptr;
+  out = nullptr;
+  MatrixPtr& outMat = output_.grad;
+  out = MKLDNNMatrix::create(intPD, outMat);
+  resetMergeGrad(out);
+  if (outputIsOnlyMKLDNN()) {
+    return;
+  }
+  CHECK_LE(outputMap_.size(), 1U) << "do not support mixed with cpu device";
+  extOutGrad_ = out;
+  if (isPaddleFormat(extOutGrad_->getFormat())) {
+    return;
+  }
+  // need create reorder
+  CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat()))
+      << "should have external output value and the format must be nchw(nc)";
+  extOutGrad_ = MKLDNNMatrix::create(extOutVal_->getPrimitiveDesc(), outMat);
+  CHECK(outVal_ != nullptr && outVal_->getPrimitiveDesc() == intPD)
+      << "should have internal output value and primitive desc must equal";
+  out = MKLDNNMatrix::create(intPD);
+  cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out);
+  CHECK(cvtOutGrad_);
+}
+
+void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
+  mergeGrad_ = nullptr;
+  pipelineMergeGrad_.clear();
+  if (outputMap_.size() <= 1 || !outputIsOnlyMKLDNN()) {
+    // do not merge when output is not all MKLDNN or only one output
+    return;
+  }
+  CHECK(out) << "should have reset internal ouput grad";
+  std::vector<double> scales(outputMap_.size(), 1.0);
+  std::vector<memory::primitive_desc> srcPDs;
+  std::vector<primitive::at> srcs;
+  for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
+    MKLDNNMatrixPtr src =
+        std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
+    CHECK(src) << "should be MKLDNNMatrix";
+    auto srcDims = src->getDims();
+    auto dstDims = out->getDims();
+    CHECK_EQ(srcDims.size(), dstDims.size());
+    for (size_t i = 0; i < srcDims.size(); ++i) {
+      CHECK_EQ(srcDims[i], dstDims[i]);
+    }
+    VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first
+                      << ", format " << src->getFormat();
+    srcPDs.push_back(src->getPrimitiveDesc());
+    srcs.push_back(*src);
+  }
+
+  // TODO(TJ): remove me when mkldnn sum support different formats
+  for (size_t i = 1; i < srcPDs.size(); ++i) {
+    CHECK(srcPDs[0] == srcPDs[i]);
+  }
+  tmpOutGrad_ = out;
+  tmpCvt_ = nullptr;
+  if (out->getPrimitiveDesc() != srcPDs[0]) {
+    tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]);
+    tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
+    CHECK(tmpCvt_);
+    pipelineMergeGrad_.push_back(*tmpCvt_);
+  }
+
+  auto sumPD =
+      sum::primitive_desc(tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
+  mergeGrad_.reset(new sum(sumPD, srcs, *tmpOutGrad_));
+  pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 4e2753eba2350d2c3df81b57fe98270a3c38cb24..9b54c95b55cc9b503de5ff527ac983eb4752ddb0 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -58,11 +58,31 @@ protected:
   std::vector<mkldnn::primitive> pipelineFwd_;
   std::vector<mkldnn::primitive> pipelineBwd_;
 
-  // MKLDNNMatrixPtr with internal format
+  /* Value and grad are separated into internal and external buffers.
+   * Each MKLDNNLayer must init or reset at least the internal buffer,
+   * and the external buffer format is always nchw or nc (when h==w==1),
+   * which is the same format as paddle.
+   * The output_.value and output_.grad always save the external data,
+   * when mixed with cpu device.
+   * When all layers are mkldnn layers, they could save internal data.
+   */
+  // below MKLDNNMatrix buffers are all internal buffers
   MKLDNNMatrixPtr inVal_;
   MKLDNNMatrixPtr inGrad_;
   MKLDNNMatrixPtr outVal_;
   MKLDNNMatrixPtr outGrad_;
+  // below are external value and grad
+  MKLDNNMatrixPtr extInVal_;
+  MKLDNNMatrixPtr extInGrad_;
+  MKLDNNMatrixPtr extOutVal_;
+  MKLDNNMatrixPtr extOutGrad_;
+  // convert handle between external and internal buffers
+  std::shared_ptr<mkldnn::reorder> cvtInVal_;
+  std::shared_ptr<mkldnn::reorder> cvtInGrad_;
+  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
+  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
+
+  // weight and bias are always internal buffers
   MKLDNNMatrixPtr wgtVal_;
   MKLDNNMatrixPtr wgtGrad_;
   MKLDNNMatrixPtr biasVal_;
@@ -91,6 +111,7 @@ public:
         oh_(0),
         ow_(0),
         needResetBwd_(true),
+        outputOnlyMKLDNN_(false),
         engine_(mkldnn::engine::cpu, 0),
         stream_(nullptr),
         fwd_(nullptr),
@@ -99,92 +120,9 @@ public:
 
   ~MKLDNNLayer() {}
 
-  virtual bool init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
-    CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
-                            << "Please set WITH_MKLDNN=ON "
-                            << "and set use_mkldnn=True";
-    CHECK(!useGpu_) << "Do not support GPU yet";
-
-    // set device id before Layer::init
-    setDevice(MKLDNN_DEVICE);
-    // change param device to MKLDNN device
-    setParamsDevice(MKLDNN_DEVICE, parameterMap);
-    if (!Layer::init(layerMap, parameterMap)) {
-      return false;
-    }
-    setOutputMap();
-    checkCPUOutputsNumber();
-
-    stream_.reset(new MKLDNNStream());
-    engine_ = CPUEngine::Instance().getEngine();
-    return true;
-  }
-
-  void forward(PassType passType) override {
-    passType_ = passType;
-
-    {
-      REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
-      CHECK(!inputLayers_.empty());
-      copySeqInfoToOutputs();
-      size_t elemenCnt = inputLayers_[0]->getOutput().value->getElementCnt();
-      if (inputElemenCnt_ != elemenCnt) {
-        VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
-        // reset when input total sizes changed, not only the batchsize
-        inputElemenCnt_ = elemenCnt;
-        pipelineFwd_.clear();
-        reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
-        resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
-        convertWeightsFromPaddle();
-        needResetBwd_ = true;
-      }
-
-      if (inputLayers_[0]->getType() == "data") {
-        updateInputData();
-      }
-
-      if (!outputOnlyMKLDNN_) {
-        clearGrads();
-      }
-      stream_->submit(pipelineFwd_);
-    }
-
-    /* activation */ {
-      REGISTER_TIMER_INFO("FwActTimer", getName().c_str());
-      forwardActivation();
-    }
-  }
-
-  void backward(const UpdateCallback& callback) override {
-    if (needResetBwd_) {
-      VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
-      pipelineBwd_.clear();
-      pipelineMergeGrad_.clear();
-      mergeGrad_ = nullptr;
-      resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
-      needResetBwd_ = false;
-    }
-
-    // merge grad must before backward activation
-    if (mergeGrad_) {
-      REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str());
-      stream_->submit(pipelineMergeGrad_);
-    }
-    {
-      REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
-      backwardActivation();
-    }
-    {
-      REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
-      stream_->submit(pipelineBwd_);
-    }
-
-    {
-      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-      updateWeights(callback);
-    }
-  }
+  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  virtual void forward(PassType passType);
+  virtual void backward(const UpdateCallback& callback);
 
   /**
    * reshape the input image sizes
@@ -195,7 +133,7 @@ public:
       int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) = 0;
 
   /**
-   * reset the mkldnn forward primitve and memory
+   * reset the mkldnn forward primitive and memories
    * only would be called when input size changes
    */
   virtual void resetFwd(std::vector<mkldnn::primitive>& pipeline,
@@ -205,7 +143,7 @@ public:
                         MKLDNNMatrixPtr& out) = 0;
 
   /**
-   * reset the mkldnn backward primitve and memory for mkldnn fc
+   * reset the mkldnn backward primitive and memories
    * only would be called when needed
    */
   virtual void resetBwd(std::vector<mkldnn::primitive>& pipeline,
@@ -214,12 +152,6 @@ public:
                         MKLDNNMatrixPtr& bias,
                         MKLDNNMatrixPtr& out) = 0;
 
-  /**
-   * Update input value data when input layer is "data" type.
-   * Since the input value data address might be changed.
-   */
-  virtual void updateInputData() {}
-
   /**
    * Update weights and biases if necessary.
    */
@@ -246,131 +178,78 @@ protected:
   /**
    * reshape the input image sizes and input batchsize
    */
-  virtual void reshapeInput(int& batchsize, int& height, int& width) {
-    const Argument& input = inputLayers_[0]->getOutput();
-    batchsize = input.getBatchSize();
-    int h = input.getFrameHeight();
-    int w = input.getFrameWidth();
-    if (h != 0) {
-      height = h;
-    }
-    if (w != 0) {
-      width = w;
-    }
-  }
+  void reshapeInput(int& batchsize, int& height, int& width);
 
   /**
    * reshape output image sizes
    */
-  virtual void reshapeOutput(size_t height, size_t width) {
-    output_.setFrameHeight(height);
-    output_.setFrameWidth(width);
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      outputOtherDevice_[i].setFrameHeight(height);
-      outputOtherDevice_[i].setFrameWidth(width);
-    }
-  }
+  void reshapeOutput(size_t height, size_t width);
 
   /**
-   * reset the output grad matrix from primitive desc.
-   * and reset the merge grad primitive if needed.
-   * note: when this layer has serval outputs,
-   *       it could not be mixed with cpu device,
-   *       since it can not get memory desc from cpu device.
+   * reset MKLDNNMatrix from Matrix and internal primitive desc.
+   * reset to nullptr if the matrix is nullptr
    */
-  virtual void resetOutGrad(MKLDNNMatrixPtr& out,
-                            mkldnn::memory::primitive_desc pd) {
-    CHECK(outputIsOnlyMKLDNN()) << "do not support mixed with other device yet";
-    mergeGrad_ = nullptr;
-    pipelineMergeGrad_.clear();
-    out = MKLDNNMatrix::create(output_.grad, pd);
-    if (outputMap_.size() <= 1) {
-      return;
-    }
-    std::vector<double> scales(outputMap_.size(), 1.0);
-    std::vector<mkldnn::memory::primitive_desc> srcPDs;
-    std::vector<mkldnn::primitive::at> srcs;
-    for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
-      MKLDNNMatrixPtr src =
-          std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
-      VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first;
-      CHECK(src) << "should be MKLDNNMatrix";
-      auto srcDims = src->getDims();
-      auto dstDims = out->getDims();
-      CHECK_EQ(srcDims.size(), dstDims.size());
-      for (size_t i = 0; i < srcDims.size(); ++i) {
-        CHECK_EQ(srcDims[i], dstDims[i]);
-      }
-      srcPDs.push_back(src->getPrimitiveDesc());
-      srcs.push_back(*src);
-    }
+  void resetWithMatrix(MKLDNNMatrixPtr& dnn,
+                       const MatrixPtr& mat,
+                       mkldnn::memory::primitive_desc pd);
 
-    // TODO(TJ): remove me when mkldnn sum support different formats
-    for (size_t i = 1; i < srcPDs.size(); ++i) {
-      CHECK(srcPDs[0] == srcPDs[i]);
-    }
-    tmpOutGrad_ = nullptr;
-    tmpCvt_ = nullptr;
-    if (out->getPrimitiveDesc() != srcPDs[0]) {
-      tmpOutGrad_ = MKLDNNMatrix::create(nullptr, srcPDs[0]);
-      tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
-      CHECK(tmpCvt_);
-      pipelineMergeGrad_.push_back(*tmpCvt_);
-    } else {
-      tmpOutGrad_ = out;
-    }
+  /**
+   * reset input value from input MKLDNNMatrix and internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   */
+  void resetInValue(
+      MKLDNNMatrixPtr& in,
+      const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr);
 
-    auto sumPD = mkldnn::sum::primitive_desc(
-        tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
-    mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *tmpOutGrad_));
-    pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
-  }
+  /**
+   * reset output value from internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   */
+  void resetOutValue(MKLDNNMatrixPtr& out,
+                     mkldnn::memory::primitive_desc intPD);
 
   /**
-   * reset input grad from primitive desc.
-   * this function is avaiable for input is only mkldnn
-   * or input do not care cpu device
+   * reset input grad from internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
    */
-  virtual void resetInGrad(MKLDNNMatrixPtr& in,
-                           mkldnn::memory::primitive_desc pd) {
-    LayerPtr& input = inputLayers_[0];
-    const MatrixPtr& grad =
-        input->getOutputMapSize() > 1 ? nullptr : input->getOutput().grad;
-    in = MKLDNNMatrix::create(grad, pd);
-    Argument& arg = input->getOutput(this->getName());
-    arg.grad = std::dynamic_pointer_cast<Matrix>(in);
-  }
+  void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD);
 
   /**
-   * print info about sizes
+   * reset output grad from internal primitive desc.
+   * merge grad if necessary.
+   * reset both internal and external buffer and create reorder if necessary.
+   * note: about merge grad, when this layer has several outputs,
+   *       it could not be mixed with cpu device,
+   *       since it can not get memory desc from cpu device.
    */
-  virtual void printSizeInfo() {
-    VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_
-                       << ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_
-                       << ", oh: " << oh_ << ", ow: " << ow_;
-  }
+  void resetOutGrad(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc intPD);
 
   /**
-   * Print the mkldnn memory format flow of value
+   * reset the merge grad primitive if necessary.
+   * note: do not support the grads mixed with cpu device,
+   *       since it can not get memory desc from cpu device.
    */
-  virtual void printValueFormatFlow() {
-    if (inVal_ && outVal_) {
-      VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>> "
-                        << outVal_->getFormat();
-    }
-  }
+  void resetMergeGrad(MKLDNNMatrixPtr& out);
 
+protected:
   /**
-   * Print the mkldnn memory format flow of grad
+   * Set deviceId of this layer.
    */
-  virtual void printGradFormatFlow() {
-    if (inGrad_ && outGrad_) {
-      VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<< "
-                        << outGrad_->getFormat();
+  void setDevice(int id) { deviceId_ = id; }
+
+  /**
+   * check the format is nchw or nc,
+   * which is supported by Paddle default memory layout
+   */
+  bool isPaddleFormat(mkldnn::memory::format fmt) {
+    if (fmt == mkldnn::memory::format::nchw ||
+        fmt == mkldnn::memory::format::nc) {
+      return true;
+    } else {
+      return false;
     }
   }
 
-protected:
   /**
    * If input only has MKLDNN device.
    * Otherwise, only support the previous layer using CPU device.
@@ -380,7 +259,6 @@ protected:
     if (prevDevice == MKLDNN_DEVICE) {
       return true;
     } else {
-      // do not support GPU yet
       CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
       return false;
     }
@@ -400,9 +278,61 @@ protected:
   }
 
   /**
-   * Set deviceId of this layer.
+   * print info about sizes
    */
-  void setDevice(int id) { deviceId_ = id; }
+  virtual void printSizeInfo() {
+    VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_
+                       << ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_
+                       << ", oh: " << oh_ << ", ow: " << ow_;
+  }
+
+  /**
+   * print the mkldnn memory format of value
+   */
+  virtual void printValueFormat() {
+    if (extInVal_) {
+      VLOG(MKLDNN_FMTS) << extInVal_->getFormat() << " >>> ";
+    }
+    if (inVal_) {
+      VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>>";
+    }
+    if (outVal_) {
+      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
+    }
+    if (extOutVal_) {
+      VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
+    }
+    if (wgtVal_) {
+      VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat();
+    }
+    if (biasVal_) {
+      VLOG(MKLDNN_FMTS) << "Bias value format: " << biasVal_->getFormat();
+    }
+  }
+
+  /**
+   * print the mkldnn memory format of grad
+   */
+  virtual void printGradFormat() {
+    if (extOutGrad_) {
+      VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
+    }
+    if (outGrad_) {
+      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
+    }
+    if (inGrad_) {
+      VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<<";
+    }
+    if (extInGrad_) {
+      VLOG(MKLDNN_FMTS) << extInGrad_->getFormat() << " <<< ";
+    }
+    if (wgtGrad_) {
+      VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat();
+    }
+    if (biasGrad_) {
+      VLOG(MKLDNN_FMTS) << "Bias grad format: " << biasGrad_->getFormat();
+    }
+  }
 
 private:
   /**
@@ -449,6 +379,19 @@ private:
     }
   }
 
+  /**
+   * If a CPU device exists, share the value and grad data of output_ with it
+   */
+  void shareCPUDevice() {
+    if (outputIsOnlyMKLDNN()) {
+      return;
+    }
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      outputOtherDevice_[i].value = output_.value;
+      outputOtherDevice_[i].grad = output_.grad;
+    }
+  }
+
   /**
    * Check the cpu device number of outputOtherDevice_.
    * should have only one at most.
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
index 0e53e2d1b7e6691909955eeacd345981a9960ec6..6e89260f49979d4edb4da138507a73dc2bf120de 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
@@ -85,8 +85,6 @@ void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
   resetFwdPD(fwdPD_, in, out);
 
   resetFwdPipeline(pipeline, fwdPD_, in, out);
-
-  printValueFormatFlow();
 }
 
 void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
@@ -101,65 +99,22 @@ void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
   resetBwdPD(pd, in, out);
 
   resetBwdPipeline(pipeline, pd, in, out);
-
-  printGradFormatFlow();
-}
-
-void MKLDNNPoolLayer::updateInputData() {
-  inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
 }
 
 void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
                                       MKLDNNMatrixPtr& out) {
   resetInValue(in);
 
-  resetOutValue(out);
-}
-
-void MKLDNNPoolLayer::resetInValue(MKLDNNMatrixPtr& in) {
-  if (inputIsOnlyMKLDNN()) {
-    const MatrixPtr& dnnIn = getInputValue(0);
-    in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
-    CHECK(in) << "Input should be MKLDNNMatrix";
-  } else {
-    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
-    const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
-    in = MKLDNNMatrix::create(
-        cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
-  }
-}
-
-void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {
-  CHECK(inVal_) << "Should reset input value first";
   memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
-  out = MKLDNNMatrix::create(
-      output_.value, outDims, inVal_->getFormat(), engine_);
-
-  // create reorder if output value has cpu device and pd do not match
-  cpuOutVal_ = nullptr;
-  cvtOutVal_ = nullptr;
-  if (!outputIsOnlyMKLDNN()) {
-    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
-    cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
-    if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
-      out = MKLDNNMatrix::create(nullptr, out->getPrimitiveDesc());
-      cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
-      CHECK(cvtOutVal_) << "should not be emptry";
-    } else {
-      cpuOut->setData(output_.value->getData());
-      cpuOutVal_ = out;
-    }
-    output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_);
-    return;
-  }
-  output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
+  CHECK(in);
+  auto outPD =
+      MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_);
+  resetOutValue(out, outPD);
 }
 
 void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
                                  MKLDNNMatrixPtr in,
                                  MKLDNNMatrixPtr out) {
-  memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
-  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
   memory::dims kernels = memory::dims{fh_, fw_};
   memory::dims strides = memory::dims{sh_, sw_};
   memory::dims padL = memory::dims{ph_, pw_};
@@ -194,58 +149,26 @@ void MKLDNNPoolLayer::resetFwdPipeline(
              ? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
              : std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
   pipeline.push_back(*fwd_);
-
-  if (cvtOutVal_) {
-    pipeline.push_back(*cvtOutVal_);
-  }
 }
 
 void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                       MKLDNNMatrixPtr& out) {
-  resetOutGrad(out);
-
-  resetInGrad(in);
-}
-void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
-  cpuOutGrad_ = nullptr;
-  cvtOutGrad_ = nullptr;
-  CHECK(outVal_);
-  if (outputIsOnlyMKLDNN()) {
-    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
-  } else {
-    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
-    // always share the same grad data of CPU output
-    // then the activation can get the right grad from output_.grad
-    output_.grad->setData(cpuOut->getData());
-    cpuOutGrad_ = MKLDNNMatrix::create(
-        cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
-    if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
-      out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc());
-      cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
-      CHECK(cvtOutGrad_) << "should not be emptry";
-    } else {
-      out = cpuOutGrad_;
-    }
-  }
-}
-
-void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {
-  in = nullptr;
-  if (inputLayers_[0]->getOutput().grad == nullptr) {
-    return;
-  }
-  CHECK(inVal_);
-  MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
+  CHECK(inVal_ && outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  resetInGrad(in, inVal_->getPrimitiveDesc());
 }
 
 void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
                                  MKLDNNMatrixPtr& in,
                                  MKLDNNMatrixPtr& out) {
+  pd = nullptr;
+  if (in == nullptr) {
+    return;
+  }
   memory::dims kernels = memory::dims{fh_, fw_};
   memory::dims strides = memory::dims{sh_, sw_};
   memory::dims padL = memory::dims{ph_, pw_};
   memory::dims padR = getPaddingR();
-  CHECK(in);
   CHECK(out);
   auto bwdDesc = pool_bwd::desc(poolAlgo_,
                                 in->getMemoryDesc(),
@@ -263,8 +186,8 @@ void MKLDNNPoolLayer::resetBwdPipeline(
     std::shared_ptr<pool_bwd::primitive_desc>& pd,
     MKLDNNMatrixPtr& in,
     MKLDNNMatrixPtr& out) {
-  if (cvtOutGrad_) {
-    pipeline.push_back(*cvtOutGrad_);
+  if (pd == nullptr) {
+    return;
   }
 
   bwdData_ =
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.h b/paddle/gserver/layers/MKLDNNPoolLayer.h
index 891e15a7efcdd2e54f61352efc1ba7345b91c76b..c5ec87828bfb28b4502b4ec6b47287089c514204 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.h
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.h
@@ -38,13 +38,6 @@ protected:
   // pooling_avg or pooling_max
   mkldnn::algorithm poolAlgo_;
 
-  // MKLDNNMatrixPtr which should be created from CPU Device
-  MKLDNNMatrixPtr cpuOutVal_;
-  MKLDNNMatrixPtr cpuOutGrad_;
-  // convert handle between CPU device and MKLDNN device
-  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
-  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
-
   // save forward primitive_desc, which can be used backward
   std::shared_ptr<pool_fwd::primitive_desc> fwdPD_;
   // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
@@ -74,8 +67,6 @@ public:
                 MKLDNNMatrixPtr& bias,
                 MKLDNNMatrixPtr& out) override;
 
-  void updateInputData() override;
-
   void printSizeInfo() override {
     MKLDNNLayer::printSizeInfo();
     VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
@@ -90,8 +81,6 @@ protected:
    *                    reset pipeline.
    */
   void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetInValue(MKLDNNMatrixPtr& in);
-  void resetOutValue(MKLDNNMatrixPtr& out);
   void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr in,
                   MKLDNNMatrixPtr out);
@@ -106,8 +95,6 @@ protected:
    *                     reset pipeline.
    */
   void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetOutGrad(MKLDNNMatrixPtr& out);
-  void resetInGrad(MKLDNNMatrixPtr& in);
   void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr& in,
                   MKLDNNMatrixPtr& out);
diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp
index 3bf6a9e176cc1235aa5ddefcedd4253e6afc1342..0a19fe23336ea943cb8a572dc40f8c0fbbd7236a 100644
--- a/paddle/gserver/tests/MKLDNNTester.cpp
+++ b/paddle/gserver/tests/MKLDNNTester.cpp
@@ -97,7 +97,7 @@ void MKLDNNTester::randomWgtDatas() {
     parameters_[REF][i]->randomize();
     dnnValue->copyFrom(*refValue);
 
-    VLOG(lvl_) << "Random weight data " << parameters_[DNN][i]->getName();
+    VLOG(MKLDNN_TESTS) << "Random weight " << parameters_[DNN][i]->getName();
     printVector(dnnValue);
   }
 }
@@ -109,7 +109,7 @@ void MKLDNNTester::randomBotDatas() {
     dataLayers_[REF][i]->getOutputValue()->randomizeUniform();
     dataLayers_[DNN][i]->getOutputValue()->copyFrom(
         *(dataLayers_[REF][i]->getOutputValue()));
-    VLOG(lvl_) << "Input " << i << " data:";
+    VLOG(MKLDNN_TESTS) << "Random Forward, InputValue " << i;
     printMatrix(dataLayers_[REF][i]->getOutputValue());
   }
 }
@@ -118,12 +118,12 @@ void MKLDNNTester::randomTopDiffs() {
   refLayer_->getOutputGrad()->randomizeUniform();
   dnnLayer_->getOutput(CPU_DEVICE)
       .grad->copyFrom(*(refLayer_->getOutputGrad()));
-  VLOG(lvl_) << "Random Backward Input, TopDiff: ";
+  VLOG(MKLDNN_TESTS) << "Random Backward, OutputGrad";
   printMatrix(refLayer_->getOutputGrad());
 }
 
 void MKLDNNTester::checkForward() {
-  VLOG(MKLDNN_ALL) << "Check Forward";
+  VLOG(MKLDNN_TESTS) << "Check Forward";
   printTopDatas();
   double delta =
       compareMatrix(dnnLayer_->getOutputValue(), refLayer_->getOutputValue());
@@ -131,15 +131,15 @@ void MKLDNNTester::checkForward() {
 }
 
 void MKLDNNTester::checkBackwardData() {
-  VLOG(MKLDNN_ALL) << "Check Backward Data";
+  VLOG(MKLDNN_TESTS) << "Check Backward Data";
   // TODO(TJ): uncomment me when batch norm ready
   // const bool isBN = dnnLayer_->getType() == "mkldnn_batch_norm";
   for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
     const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad();
     const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad();
-    VLOG(lvl_) << "Mkldnn Backward Output BotDiff " << i;
+    VLOG(MKLDNN_ALL) << "MKLDNN Backward Result: InputGrad " << i;
     printMatrix(dnnDiff);
-    VLOG(lvl_) << "Reference Backward Output BotDiff " << i;
+    VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
     printMatrix(refDiff);
 
     double delta = compareMatrix(dnnDiff, refDiff);
@@ -153,7 +153,7 @@ void MKLDNNTester::checkBackwardData() {
 }
 
 void MKLDNNTester::checkBackwardWgts() {
-  VLOG(MKLDNN_ALL) << "Check Backward Weight";
+  VLOG(MKLDNN_TESTS) << "Check Backward Weight";
   CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
   vector<VectorPtr> dnnWgts;  // used to temply save mkldnn weights
   saveWgt(parameters_[DNN], dnnWgts);
@@ -165,9 +165,11 @@ void MKLDNNTester::checkBackwardWgts() {
   for (size_t i = 0; i < parameters_[DNN].size(); ++i) {
     const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
     const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
-    VLOG(lvl_) << "Mkldnn Output weight " << parameters_[DNN][i]->getName();
+    VLOG(MKLDNN_ALL) << "MKLDNN Result: weight value "
+                     << parameters_[DNN][i]->getName();
     printVector(dnn);
-    VLOG(lvl_) << "Reference Output weight " << parameters_[REF][i]->getName();
+    VLOG(MKLDNN_ALL) << "Reference Result: weight value "
+                     << parameters_[REF][i]->getName();
     printVector(ref);
 
     double delta = compareVector(dnn, ref);
@@ -240,7 +242,8 @@ void MKLDNNTester::printTopDatas() {
   }
 
   for (int n = 0; n < NUM; ++n) {
-    VLOG(lvl_) << testLayers_[n]->getType() << " forward output TopData: ";
+    VLOG(MKLDNN_ALL) << testLayers_[n]->getType()
+                     << " Forward Result: OutputValue";
     printMatrix(testLayers_[n]->getOutputValue());
   }
 }
@@ -252,7 +255,7 @@ void MKLDNNTester::printMatrix(const MatrixPtr& m) {
 
   std::ostringstream ostr;
   m->print(ostr);
-  VLOG(lvl_) << std::endl << ostr.str();
+  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
 }
 
 void MKLDNNTester::printVector(const VectorPtr& v) {
@@ -262,7 +265,7 @@ void MKLDNNTester::printVector(const VectorPtr& v) {
 
   std::ostringstream ostr;
   v->print(ostr, v->getSize());
-  VLOG(lvl_) << std::endl << ostr.str();
+  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
 }
 
 double MKLDNNTester::getDelta(const real* d1,
@@ -314,7 +317,7 @@ void MKLDNNTester::runOnce() {
   UpdateCallback updateCallback = [](Parameter* para) {
     auto& grad = para->getBuf(PARAMETER_GRADIENT);
     auto& value = para->getBuf(PARAMETER_VALUE);
-    real lr = 1e-3;
+    real lr = 1e-2;
     value->add(*grad, lr);
     grad->zeroMem();
   };
@@ -340,10 +343,9 @@ void MKLDNNTester::run(const TestConfig& dnn,
                        size_t batchSize,
                        size_t inputImgH,
                        size_t inputImgW,
+                       bool printDetails,
                        size_t iter,
-                       float epsilon,
-                       bool log,
-                       int level) {
+                       float epsilon) {
   CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 ||
         dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0)
       << "should be MKLDNN layer or MKLDNN activation";
@@ -359,10 +361,9 @@ void MKLDNNTester::run(const TestConfig& dnn,
 
   ih_ = inputImgH;
   iw_ = inputImgW;
+  log_ = printDetails;
   iter_ = iter;
   eps_ = epsilon;
-  log_ = log;
-  lvl_ = level;
 
   // Firstly test mkldnn init from PARAM_FORMAT_ORIGINAL weight
   reset(dnn, ref, batchSize);
@@ -531,9 +532,11 @@ void MKLDNNTester::getOutResult(const std::string& configPath,
 void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) {
   CHECK_EQ(ref.outValues.size(), dnn.outValues.size());
   CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size());
+  VLOG(MKLDNN_TESTS) << "compare value size: " << ref.outValues.size();
   for (size_t i = 0; i < ref.outValues.size(); i++) {
     EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps);
   }
+  VLOG(MKLDNN_TESTS) << "compare param size: " << ref.paraValues.size();
   for (size_t i = 0; i < ref.paraValues.size(); i++) {
     EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps);
   }
@@ -544,9 +547,10 @@ void MKLDNNTester::runBranchesTest(const std::string& configPath,
                                    float eps) {
   DataIn in;
   initArgument(in, configPath, iter);
-
   DataOut outCpu, outDnn;
+  VLOG(MKLDNN_TESTS) << "running cpu network";
   getOutResult(configPath, in, outCpu, false, iter);
+  VLOG(MKLDNN_TESTS) << "running mkldnn network";
   getOutResult(configPath, in, outDnn, true, iter);
 
   compareResult(outCpu, outDnn, eps);
diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h
index 51abfcb67e2ec35fe1b0179e742a7d18f08f8a2c..c385d1c72717d120211f167b5c5eb9a557da3714 100644
--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
@@ -58,8 +58,6 @@ protected:
   size_t iter_;
   /// whether to print out the details
   bool log_;
-  /// vlog level to print the matrix details datas
-  int lvl_;
   /// epsilon
   float eps_;
   /// input image size, default 1
@@ -70,7 +68,6 @@ public:
     iter_ = iter;
     eps_ = epsilon;
     log_ = false;
-    lvl_ = MKLDNN_ALL;
   }
 
   ~MKLDNNTester() {}
@@ -81,10 +78,9 @@ public:
            size_t batchSize,
            size_t inputImgH = 1,
            size_t inputImgW = 1,
+           bool printDetails = false,
            size_t iter = 3,
-           float epsilon = 1e-4,
-           bool log = false,
-           int level = MKLDNN_ALL);
+           float epsilon = 1e-4);
   static void runBranchesTest(const std::string& configPath,
                               size_t iter = 3,
                               float eps = 1e-4);
diff --git a/paddle/gserver/tests/mkldnn_branches_fc.conf b/paddle/gserver/tests/mkldnn_branches_fc.conf
new file mode 100644
index 0000000000000000000000000000000000000000..fb85425c2b63c7604d636e2b0c5d20d91fb5de1b
--- /dev/null
+++ b/paddle/gserver/tests/mkldnn_branches_fc.conf
@@ -0,0 +1,58 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=16)
+channels = get_config_arg("channels", int, 2)
+
+def two_fc(input, group_name):
+  out1 = fc_layer(input=input,
+            name=group_name+'_fc1',
+            size=channels,
+            bias_attr=False,
+            act=LinearActivation())
+
+  out2 = fc_layer(input=input,
+            name=group_name+'_fc2',
+            size=channels,
+            bias_attr=False,
+            act=LinearActivation())
+  return out1, out2
+
+data = data_layer(name ="input", size=channels*16*16)
+
+conv = img_conv_layer(input=data,
+            num_channels=channels,
+            filter_size=3,
+            num_filters=channels,
+            padding=1,
+            shared_biases=True,
+            act=LinearActivation())
+
+pool = img_pool_layer(input=conv,
+            pool_size=3,
+            stride=2,
+            padding=1,
+            pool_type=AvgPooling())
+
+a1, a2 = two_fc(input=pool, group_name='a')
+
+concat = concat_layer(input=[a1, a2])
+
+b1, b2 = two_fc(input=pool, group_name='b')
+
+addto = addto_layer(input=[b1, b2])
+
+outputs([concat, addto])
diff --git a/paddle/gserver/tests/mkldnn_branches_pool.conf b/paddle/gserver/tests/mkldnn_branches_pool.conf
new file mode 100644
index 0000000000000000000000000000000000000000..ca17c74752ab0777a69f818d9f43275a6140cb4c
--- /dev/null
+++ b/paddle/gserver/tests/mkldnn_branches_pool.conf
@@ -0,0 +1,60 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=16)
+channels = get_config_arg("channels", int, 2)
+
+def two_pool(input, group_name):
+  out1 = img_pool_layer(input=input,
+            name=group_name+'_pool1',
+            pool_size=3,
+            stride=2,
+            padding=0,
+            pool_type=MaxPooling())
+
+  out2 = img_pool_layer(input=input,
+            name=group_name+'_pool2',
+            pool_size=5,
+            stride=2,
+            padding=1,
+            pool_type=MaxPooling())
+  return out1, out2
+
+data = data_layer(name ="input", size=channels*16*16)
+
+conv = img_conv_layer(input=data,
+            num_channels=channels,
+            filter_size=3,
+            num_filters=channels,
+            padding=1,
+            shared_biases=True,
+            act=LinearActivation())
+
+pool = img_pool_layer(input=conv,
+            pool_size=3,
+            stride=1,
+            padding=1,
+            pool_type=AvgPooling())
+
+a1, a2 = two_pool(input=pool, group_name='a')
+
+concat = concat_layer(input=[a1, a2])
+
+b1, b2 = two_pool(input=pool, group_name='b')
+
+addto = addto_layer(input=[b1, b2])
+
+outputs([concat, addto])
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index 3571fbb9e335fc6652bdbfc3f9e35beabda5044f..6cb4ca5e08eab5b979e404c9e09dcfec11086c22 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -250,7 +250,7 @@ TEST(MKLDNNActivation, Activations) {
 
 DECLARE_string(config_args);
 TEST(MKLDNNLayer, branches) {
-  std::vector<std::string> cases = {"conv"};
+  std::vector<std::string> cases = {"conv", "pool", "fc"};
   for (auto name : cases) {
     std::string config = "./gserver/tests/mkldnn_branches_" + name + ".conf";
     for (auto channels : {2, 32}) {
diff --git a/paddle/gserver/tests/test_PyDataProvider2.py b/paddle/gserver/tests/test_PyDataProvider2.py
index 2e6225519f4681238f4b40fb33764ead4a16b24a..0d0fe476ff5eac8bf8ad1c9fe09b32c1a8f73ebc 100644
--- a/paddle/gserver/tests/test_PyDataProvider2.py
+++ b/paddle/gserver/tests/test_PyDataProvider2.py
@@ -51,7 +51,10 @@ def test_sparse_non_value_no_seq(setting, filename):
         yield [(i + 1) * (j + 1) for j in xrange(10)]
 
 
-@provider(input_types=[sparse_vector(30000, seq_type=SequenceType.NO_SEQUENCE)])
+@provider(input_types=[
+    sparse_float_vector(
+        30000, seq_type=SequenceType.NO_SEQUENCE)
+])
 def test_sparse_value_no_seq(setting, filename):
     for i in xrange(200):
         yield [((i + 1) * (j + 1), float(j) / float(i + 1)) for j in xrange(10)]
diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp
index 0778bb63b7b3bca9b3d2647ca43dad72d783950a..21a8f73c3e650d4b3c3b86247594cd965f4ead35 100644
--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -18,7 +18,7 @@ using namespace mkldnn;  // NOLINT
 
 namespace paddle {
 
-MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
+MKLDNNMatrixPtr MKLDNNMatrix::create(memory::primitive_desc pd, MatrixPtr m) {
   memory::desc md = pd.desc();
   size_t ndims = md.data.ndims;
   int* dims = md.data.dims;
@@ -41,12 +41,12 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
   return std::make_shared<MKLDNNMatrix>(cpuMatrix, pd);
 }
 
-MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
-                                     memory::dims dims,
+MKLDNNMatrixPtr MKLDNNMatrix::create(memory::dims dims,
                                      memory::format fmt,
                                      engine& eg,
+                                     MatrixPtr m,
                                      mkldnn::memory::data_type dtype) {
-  return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg));
+  return create(createPrimitiveDesc(dims, fmt, eg, dtype), m);
 }
 
 std::shared_ptr<reorder> MKLDNNMatrix::createReorder(const MKLDNNMatrixPtr& src,
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
index c843115eb9a5be50d6ff873f1510844228c9d89f..fe755d096da9713e39581a909e5d21aa93d69f0f 100644
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -40,24 +40,37 @@ public:
   /**
    * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc
    */
-  static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc pd);
+  static MKLDNNMatrixPtr create(mkldnn::memory::primitive_desc pd,
+                                MatrixPtr m = nullptr);
 
   /**
    * Create MKLDNNMatrix from a MatrixPtr and memory details info
    */
   static MKLDNNMatrixPtr create(
-      MatrixPtr m,
       mkldnn::memory::dims dims,
       mkldnn::memory::format fmt,
       mkldnn::engine& eg,
+      MatrixPtr m = nullptr,
       mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
 
+  /**
+   * Create primitive descriptor.
+   * default with f32 dtype
+   */
+  static mkldnn::memory::primitive_desc createPrimitiveDesc(
+      const mkldnn::memory::dims dims,
+      const mkldnn::memory::format& fmt,
+      const mkldnn::engine& eg,
+      const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
+    return mkldnn::memory::primitive_desc(memory::desc(dims, dtype, fmt), eg);
+  }
+
   /**
    * Create Memory descriptor.
    * default with any format and f32 dtype
    */
   static mkldnn::memory::desc createMemoryDesc(
-      const mkldnn::memory::dims& dims,
+      const mkldnn::memory::dims dims,
       const mkldnn::memory::format& fmt = mkldnn::memory::format::any,
       const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
     return mkldnn::memory::desc(dims, dtype, fmt);
diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc
index 037bb49abc6c272eed2d27ea5d8425866ef9a1d5..e0a00ecaf04335800eab9e2e5a03628a2ce2ca8d 100644
--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -69,5 +69,8 @@ information, or not. But the output only shares the LoD with input `Inference`.
 
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(accuracy, ops::AccuracyOp, ops::AccuracyOpMaker);
-REGISTER_OP_CPU_KERNEL(accuracy,
-                       ops::AccuracyKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    accuracy, ops::AccuracyKernel<paddle::platform::CPUPlace, float>,
+    ops::AccuracyKernel<paddle::platform::CPUPlace, int>,
+    ops::AccuracyKernel<paddle::platform::CPUPlace, double>,
+    ops::AccuracyKernel<paddle::platform::CPUPlace, int64_t>);
diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu
index 0ca9ef941d4cb15619caea2b6baed197e4b15e5a..54e6ab99dc8c8ff1afbc636e6595cd67fb64eccf 100644
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -21,9 +21,9 @@ namespace paddle {
 namespace operators {
 using platform::PADDLE_CUDA_NUM_THREADS;
 
-template <int BlockSize>
-__global__ void AccuracyCudaKernel(const int N, const int D, const int* Xdata,
-                                   const int* labeldata, float* accuracy) {
+template <typename T, int BlockSize>
+__global__ void AccuracyCudaKernel(const int N, const int D, const T* Xdata,
+                                   const T* labeldata, float* accuracy) {
   int count = 0;
   __shared__ int total[BlockSize];
 
@@ -57,8 +57,8 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
     auto* accuracy = ctx.Output<Tensor>("Accuracy");
     // FIXME(typhoonzero): only support indices currently
     // if add support for output values, how to detect the data type?
-    const int* inference_data = inference->data<int>();
-    const int* label_data = label->data<int>();
+    const T* inference_data = inference->data<T>();
+    const T* label_data = label->data<T>();
     float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
 
     size_t num_samples = inference->dims()[0];
@@ -69,7 +69,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
       return;
     }
 
-    AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<
+    AccuracyCudaKernel<T, PADDLE_CUDA_NUM_THREADS><<<
         1, PADDLE_CUDA_NUM_THREADS, 0,
         reinterpret_cast<const platform::CUDADeviceContext&>(
             ctx.device_context())
@@ -81,5 +81,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_GPU_KERNEL(accuracy,
-                       paddle::operators::AccuracyOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
+                       paddle::operators::AccuracyOpCUDAKernel<double>,
+                       paddle::operators::AccuracyOpCUDAKernel<int>,
+                       paddle::operators::AccuracyOpCUDAKernel<int64_t>);
diff --git a/paddle/operators/adam_op.cc b/paddle/operators/adam_op.cc
index e3db70ea129880434add21e71d15e5129c4551bd..3572de06bd60f7979e3bfbf39856b04942ce81c0 100644
--- a/paddle/operators/adam_op.cc
+++ b/paddle/operators/adam_op.cc
@@ -43,10 +43,6 @@ class AdamOp : public framework::OperatorWithKernel {
                    "Output(Moment1Out) of AdamOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
                    "Output(Moment2Out) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Beta1PowOut"),
-                   "Output(Beta1PowOut) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Beta2PowOut"),
-                   "Output(Beta2PowOut) of AdamOp should not be null.");
 
     auto lr_dims = ctx->GetInputDim("LearningRate");
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
@@ -72,8 +68,6 @@ class AdamOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("ParamOut", param_dims);
     ctx->SetOutputDim("Moment1Out", param_dims);
     ctx->SetOutputDim("Moment2Out", param_dims);
-    ctx->SetOutputDim("Beta1PowOut", beta1_pow_dims);
-    ctx->SetOutputDim("Beta2PowOut", beta2_pow_dims);
   }
 };
 
@@ -92,8 +86,6 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("ParamOut", "(Tensor) Output parameter");
     AddOutput("Moment1Out", "(Tensor) Output first moment");
     AddOutput("Moment2Out", "(Tensor) Output second moment");
-    AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator");
-    AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator");
 
     AddAttr<float>("beta1",
                    "(float, default 0.9) "
@@ -121,10 +113,8 @@ Adam updates:
 
 moment1_out = beta1 * moment1 + (1 − beta1) * grad
 moment2_out = beta2 * moment2 + (1 − beta2) * grad * grad
-beta1_pow_out = beta1_pow * beta1
-beta2_pow_out = beta2_pow * beta2
 learning_rate_t = learning_rate_t *
-                  sqrt(1 - beta2_pow_out) / (1 - beta1_pow_out)
+                  sqrt(1 - beta2_pow) / (1 - beta1_pow)
 param_out = param - learning_rate_t * moment1/ (sqrt(moment2) + epsilon)
 
 References:
diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h
index 789c2f14b32478bf9ddc967fc5725bcf65ed2146..45938006db1231a7a134964d729df6ca114d4dbe 100644
--- a/paddle/operators/adam_op.h
+++ b/paddle/operators/adam_op.h
@@ -26,14 +26,10 @@ class AdamOpKernel : public framework::OpKernel<T> {
     auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
     auto moment1_out_tensor = ctx.Output<framework::Tensor>("Moment1Out");
     auto moment2_out_tensor = ctx.Output<framework::Tensor>("Moment2Out");
-    auto beta1_pow_out_tensor = ctx.Output<framework::Tensor>("Beta1PowOut");
-    auto beta2_pow_out_tensor = ctx.Output<framework::Tensor>("Beta2PowOut");
 
     param_out_tensor->mutable_data<T>(ctx.GetPlace());
     moment1_out_tensor->mutable_data<T>(ctx.GetPlace());
     moment2_out_tensor->mutable_data<T>(ctx.GetPlace());
-    beta1_pow_out_tensor->mutable_data<T>(ctx.GetPlace());
-    beta2_pow_out_tensor->mutable_data<T>(ctx.GetPlace());
 
     float beta1 = ctx.Attr<float>("beta1");
     float beta2 = ctx.Attr<float>("beta2");
@@ -56,18 +52,13 @@ class AdamOpKernel : public framework::OpKernel<T> {
     auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
     auto moment1_out = framework::EigenVector<T>::Flatten(*moment1_out_tensor);
     auto moment2_out = framework::EigenVector<T>::Flatten(*moment2_out_tensor);
-    auto beta1_pow_out =
-        framework::EigenVector<T>::Flatten(*beta1_pow_out_tensor);
-    auto beta2_pow_out =
-        framework::EigenVector<T>::Flatten(*beta2_pow_out_tensor);
     auto place = ctx.GetEigenDevice<Place>();
 
     moment1_out.device(place) = beta1 * moment1 + (1 - beta1) * grad;
     moment2_out.device(place) = beta2 * moment2 + (1 - beta2) * grad.square();
-    beta1_pow_out.device(place) = beta1_pow * beta1;
-    beta2_pow_out.device(place) = beta2_pow * beta2;
+
     // All of these are tensors of 1 element
-    auto lr_t = lr * (1 - beta2_pow_out).sqrt() / (1 - beta1_pow_out);
+    auto lr_t = lr * (1 - beta2_pow).sqrt() / (1 - beta1_pow);
     // Eigen does not support automatic broadcast
     // Get dimensions of moment vector to broadcast lr_t
     Eigen::DSizes<int, 1> m_dsize(moment1_out_tensor->numel());
diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc
index e848333ef8a819648cc3056ae2f4a0e33fc58405..ff2565774115571166712b03c8990e5bf8de12a5 100644
--- a/paddle/operators/adamax_op.cc
+++ b/paddle/operators/adamax_op.cc
@@ -41,8 +41,6 @@ class AdamaxOp : public framework::OperatorWithKernel {
                    "Output(MomentOut) of AdamaxOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("InfNormOut"),
                    "Output(InfNormOut) of AdamaxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Beta1PowOut"),
-                   "Output(Beta1PowOut) of AdamaxOp should not be null.");
 
     auto lr_dims = ctx->GetInputDim("LearningRate");
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
@@ -64,7 +62,6 @@ class AdamaxOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("ParamOut", param_dims);
     ctx->SetOutputDim("MomentOut", param_dims);
     ctx->SetOutputDim("InfNormOut", param_dims);
-    ctx->SetOutputDim("Beta1PowOut", beta1_pow_dims);
   }
 };
 
@@ -86,7 +83,6 @@ class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("InfNormOut",
               "(Tensor) "
               "Output exponentially weighted infinity norm");
-    AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator");
 
     AddAttr<float>("beta1",
                    "(float, default 0.9) "
@@ -113,8 +109,7 @@ Adamax updates:
 
 moment_out = beta1 * moment + (1 - beta1) * grad
 inf_norm_out = max(beta2 * inf_norm + epsilon, abs(grad))
-beta1_pow_out = beta1_pow * beta1
-learning_rate_t = learning_rate/(1 - beta1_pow_out)
+learning_rate_t = learning_rate/(1 - beta1_pow)
 param_out = param - learning_rate_t * moment_out/inf_norm_out
 
 The original paper does not have an epsilon attribute.
diff --git a/paddle/operators/adamax_op.h b/paddle/operators/adamax_op.h
index 9677b1bb786002aadfaeb571b2ba2e6aa2481ca5..2c99832ec08e9c1d9b5458c467d5238f9b1b3c37 100644
--- a/paddle/operators/adamax_op.h
+++ b/paddle/operators/adamax_op.h
@@ -26,12 +26,10 @@ class AdamaxOpKernel : public framework::OpKernel<T> {
     auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
     auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
     auto inf_norm_out_tensor = ctx.Output<framework::Tensor>("InfNormOut");
-    auto beta1_pow_out_tensor = ctx.Output<framework::Tensor>("Beta1PowOut");
 
     param_out_tensor->mutable_data<T>(ctx.GetPlace());
     moment_out_tensor->mutable_data<T>(ctx.GetPlace());
     inf_norm_out_tensor->mutable_data<T>(ctx.GetPlace());
-    beta1_pow_out_tensor->mutable_data<T>(ctx.GetPlace());
 
     float beta1 = ctx.Attr<float>("beta1");
     float beta2 = ctx.Attr<float>("beta2");
@@ -53,15 +51,12 @@ class AdamaxOpKernel : public framework::OpKernel<T> {
     auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
     auto inf_norm_out =
         framework::EigenVector<T>::Flatten(*inf_norm_out_tensor);
-    auto beta1_pow_out =
-        framework::EigenVector<T>::Flatten(*beta1_pow_out_tensor);
     auto place = ctx.GetEigenDevice<Place>();
 
     moment_out.device(place) = beta1 * moment + (1 - beta1) * grad;
     inf_norm_out.device(place) =
         grad.abs().cwiseMax((beta2 * inf_norm) + epsilon);
-    beta1_pow_out.device(place) = beta1_pow * beta1;
-    auto lr_t = lr / (1 - beta1_pow_out);
+    auto lr_t = lr / (1 - beta1_pow);
     Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
     param_out.device(place) =
         param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out);
diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc
index 2d029394dd97a9c33c9c57fd3565345139cdff92..f80204c6833d6436f2cf21610beea45b36787eea 100644
--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
@@ -27,8 +27,8 @@ class ClipOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of ClipOp should not be null.");
     auto x_dims = ctx->GetInputDim("X");
-    auto max = Attr<float>("max");
-    auto min = Attr<float>("min");
+    auto max = ctx->Attrs().Get<float>("max");
+    auto min = ctx->Attrs().Get<float>("min");
     PADDLE_ENFORCE_LT(min, max, "max should be greater than min.");
     ctx->SetOutputDim("Out", x_dims);
     ctx->ShareLoD("X", /*->*/ "Out");
diff --git a/paddle/operators/conv2d_op.h b/paddle/operators/conv2d_op.h
index bd1734879ef2569bfc7c3bef21677d3b0dc49a78..f629728f68d65ce81b4910cae7f89ab06d6d94b8 100644
--- a/paddle/operators/conv2d_op.h
+++ b/paddle/operators/conv2d_op.h
@@ -108,17 +108,17 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
     int in_step = input_channels / groups;
     int out_step = output_channels / groups;
     for (int i = 0; i < batch_size; i++) {
-      Tensor in_batch = input->Slice<T>(i, i + 1).Resize(input_shape);
-      Tensor out_batch = output->Slice<T>(i, i + 1).Resize(output_matrix_shape);
+      Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
+      Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
       for (int g = 0; g < groups; g++) {
         // im2col
-        Tensor in_slice = in_batch.Slice<T>(g * in_step, (g + 1) * in_step);
+        Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
         im2col(context.device_context(), in_slice, col, strides[0], strides[1],
                paddings[0], paddings[1]);
 
         // gemm
-        Tensor out_slice = out_batch.Slice<T>(g * out_step, (g + 1) * out_step);
-        Tensor filter_slice = filter.Slice<T>(g * out_step, (g + 1) * out_step);
+        Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
+        Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
         math::matmul<Place, T>(context.device_context(), filter_slice, false,
                                col_matrix, false, T(1.0), &out_slice, T(0.0));
       }
@@ -198,22 +198,20 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
 
       for (int i = 0; i < batch_size; i++) {
         Tensor out_grad_batch =
-            output_grad->Slice<T>(i, i + 1).Resize(output_matrix_shape);
-        Tensor in_grad_batch =
-            input_grad->Slice<T>(i, i + 1).Resize(input_shape);
+            output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
+        Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape);
         for (int g = 0; g < groups; g++) {
           // gemm
           Tensor out_grad_slice =
-              out_grad_batch.Slice<T>(g * out_step, (g + 1) * out_step);
-          Tensor filter_slice =
-              filter.Slice<T>(g * out_step, (g + 1) * out_step);
+              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
+          Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
           math::matmul<Place, T>(context.device_context(), filter_slice, true,
                                  out_grad_slice, false, T(1.0), &col_matrix,
                                  T(0.0));
 
           // col2im
           Tensor in_grad_slice =
-              in_grad_batch.Slice<T>(g * in_step, (g + 1) * in_step);
+              in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
           col2im(context.device_context(), in_grad_slice, col, strides[0],
                  strides[1], paddings[0], paddings[1]);
         }
@@ -229,19 +227,19 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
 
       for (int i = 0; i < batch_size; i++) {
         Tensor out_grad_batch =
-            output_grad->Slice<T>(i, i + 1).Resize(output_matrix_shape);
-        Tensor in_batch = input->Slice<T>(i, i + 1).Resize(input_shape);
+            output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
+        Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
         for (int g = 0; g < groups; g++) {
           // im2col
           Tensor out_grad_slice =
-              out_grad_batch.Slice<T>(g * out_step, (g + 1) * out_step);
-          Tensor in_slice = in_batch.Slice<T>(g * in_step, (g + 1) * in_step);
+              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
+          Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
           im2col(context.device_context(), in_slice, col, strides[0],
                  strides[1], paddings[0], paddings[1]);
 
           // gemm
           Tensor filter_grad_slice =
-              filter_grad_.Slice<T>(g * out_step, (g + 1) * out_step);
+              filter_grad_.Slice(g * out_step, (g + 1) * out_step);
           math::matmul<Place, T>(context.device_context(), out_grad_slice,
                                  false, col_matrix, true, T(1.0),
                                  &filter_grad_slice, T(1.0));
diff --git a/paddle/operators/dynamic_recurrent_op.cc b/paddle/operators/dynamic_recurrent_op.cc
index 03f33e28d49fdaeccb9b6266359e0b41a1cb847f..a0b06ac1dc305bc899f9abaafcc980a6150ecda9 100644
--- a/paddle/operators/dynamic_recurrent_op.cc
+++ b/paddle/operators/dynamic_recurrent_op.cc
@@ -23,6 +23,7 @@ using framework::Scope;
 using framework::TensorArray;
 using framework::LoDTensor;
 using framework::Variable;
+using framework::OperatorBase;
 using framework::DySeqMetaBatch;
 
 namespace detail {
@@ -43,72 +44,72 @@ inline void CreateVariables(Scope& scope,
  * be reordered, but the RNN op should not change the `boot_state` as an input
  * variable's content.
  */
-template <typename T>
-inline void ReorderBootState(const DySeqMetaBatch& metas,
-                             const LoDTensor& boot_state, LoDTensor* tensor,
-                             const platform::Place& dst_place) {
+inline void ReorderInitialState(const DySeqMetaBatch& metas,
+                                const LoDTensor& boot_state, LoDTensor* tensor,
+                                const platform::Place& dst_place) {
   for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) {
-    auto slice = tensor->Slice<T>(seq_id, seq_id + 1);
+    auto slice = tensor->Slice(seq_id, seq_id + 1);
     auto boot_slice =
-        boot_state.Slice<T>(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1);
+        boot_state.Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1);
     // TODO(superjom) pass in device context as an argument
-    slice.template CopyFrom<T>(boot_slice, dst_place,
-                               platform::CPUDeviceContext());
+    slice.CopyFrom(boot_slice, dst_place, platform::CPUDeviceContext());
   }
 }
 
-}  // namespace detail
-
-class DynamicRecurrentOpProtoAndCheckerMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  DynamicRecurrentOpProtoAndCheckerMaker(framework::OpProto* proto,
-                                         framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    const auto& name = DynamicRecurrentOp::kArgName;
-    // inputs and outputs stored in proto
-    AddInput(name.inlinks,
-             "the inputs that need to be segmented for each step.")
-        .AsDuplicable();
-    AddInput(name.boot_memories, "variables to initialize memories.")
-        .AsDuplicable();
-
-    AddOutput(name.outlinks, "the outputs that need to concated for all steps.")
-        .AsDuplicable();
-    AddOutput(name.step_scopes, "step scopes");
-
-    // Attributes stored in AttributeMap
-    AddAttr<std::vector<std::string>>(name.pre_memories,
-                                      "names of pre-memories");
-    AddAttr<std::vector<std::string>>(name.memories, "names of memories");
-
-    AddComment("This is a RNN operator for varience-length sequences.");
+inline void RestoreInitialState(const DySeqMetaBatch& metas,
+                                const LoDTensor& tensor, LoDTensor* boot_state,
+                                const platform::Place& dst_place) {
+  for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) {
+    auto slice = tensor.Slice(seq_id, seq_id + 1);
+    auto boot_slice =
+        boot_state->Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1);
+    boot_slice.CopyFrom(slice, dst_place, platform::CPUDeviceContext());
   }
-};
+}
 
-void DynamicRecurrentOp::Run(const Scope& scope,
-                             const platform::DeviceContext& dev_ctx) const {
-  cache_.Init(kArgName, *this, scope, &arg_);
+}  // namespace detail
+
+// Implementation for forward propagation.
+template <>
+void RNNAlgorithm::Run<RNNAlgorithm::ComputeMode::kForward>(
+    const framework::Scope& scope, const framework::OperatorBase& op,
+    const platform::DeviceContext& dev_ctx) {
+  SetComputeMode(ComputeMode::kForward);
+  cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_);
   SplitInputs();
   CreateScopes();
   WriteStepInputs();
   InitStates();
   WriteStepOutputs();
+  RunSteps();
+  ConcatOutputs();
+}
 
-  // call stepnet in all the time steps
-  for (size_t step = 0; step < cache_.num_steps; step++) {
-    auto& step_scope = cache_.GetScope(step);
-    stepnet_->Run(step_scope, dev_ctx);
+// Implementation for backward propagation.
+template <>
+void RNNAlgorithm::Run<RNNAlgorithm::ComputeMode::kBackward>(
+    const framework::Scope& scope, const framework::OperatorBase& op,
+    const platform::DeviceContext& dev_ctx) {
+  SetComputeMode(ComputeMode::kBackward);
+  cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_);
+  SplitInputs();
+  WriteStepInputs();
+  InitStates();
+  WriteStepOutputs();
+  RunSteps();
+  // copy boot-states' gradients back.
+  for (const auto& state : arg_.states) {
+    ExportInitialStateGradient(state);
   }
 
   ConcatOutputs();
 }
 
-void DynamicRecurrentOp::SplitInputs() const {
+void RNNAlgorithm::SplitInputs() {
   // TODO(superjom) make level a config
   // TODO(superjom) check all the inputs has the same LoD
   int level = 0;
-  for (const auto& item : cache_.inlinks) {
+  for (const auto& item : cache_.inputs) {
     const auto& var = item.second;
     const auto& tensor = var->Get<LoDTensor>();
     TensorArray& ta = step_inputs_[item.first];
@@ -125,8 +126,8 @@ void DynamicRecurrentOp::SplitInputs() const {
   }
 }
 
-void DynamicRecurrentOp::WriteStepInputs() const {
-  for (const auto& item : cache_.inlinks) {
+void RNNAlgorithm::WriteStepInputs() {
+  for (const auto& item : cache_.inputs) {
     auto ta_it = step_inputs_.find(item.first);
     PADDLE_ENFORCE(ta_it != step_inputs_.end(),
                    "step_inputs_ not compatible with memory set");
@@ -138,20 +139,20 @@ void DynamicRecurrentOp::WriteStepInputs() const {
       if (var == nullptr) {
         var = step_scope.Var(item.first);
       }
-      var->GetMutable<LoDTensor>()->ShareDataWith<value_type>(tensor);
+      var->GetMutable<LoDTensor>()->ShareDataWith(tensor);
     }
   }
 }
 
-void DynamicRecurrentOp::WriteStepOutputs() const {
+void RNNAlgorithm::WriteStepOutputs() {
   // initialize step outputs
-  for (const auto& item : cache_.outlinks) {
+  for (const auto& item : cache_.outputs) {
     step_outputs_.emplace(item.first, TensorArray());
   }
   PADDLE_ENFORCE_GT(step_outputs_.size(), 0UL);
 }
 
-void DynamicRecurrentOp::CreateScopes() const {
+void RNNAlgorithm::CreateScopes() {
   PADDLE_ENFORCE_GT(cache_.num_steps, 0);
   // resize scopes
   size_t num_scopes_need_create = cache_.num_steps - cache_.scopes->size();
@@ -160,19 +161,19 @@ void DynamicRecurrentOp::CreateScopes() const {
   }
 
   // init temporary inputs
-  PADDLE_ENFORCE_NOT_NULL(stepnet_, "stepnet should be set first");
-  std::vector<std::string> memories;
-  std::vector<std::string> pre_memories;
-  std::vector<std::string> stepnet_outputs;
-  std::transform(arg_.memories.begin(), arg_.memories.end(),
-                 std::back_inserter(memories),
-                 [](const rnn::MemoryAttr& m) { return m.var; });
-  std::transform(arg_.memories.begin(), arg_.memories.end(),
-                 std::back_inserter(pre_memories),
-                 [](const rnn::MemoryAttr& m) { return m.pre_var; });
-  for (const auto& item : stepnet_->Outputs()) {
+  PADDLE_ENFORCE_NOT_NULL(step_unit_, "stepnet should be set first");
+  std::vector<std::string> states;
+  std::vector<std::string> ex_states;
+  std::vector<std::string> step_unit_outputs;
+  std::transform(arg_.states.begin(), arg_.states.end(),
+                 std::back_inserter(states),
+                 [](const rnn::StateAttr& m) { return m.var; });
+  std::transform(arg_.states.begin(), arg_.states.end(),
+                 std::back_inserter(ex_states),
+                 [](const rnn::StateAttr& m) { return m.pre_var; });
+  for (const auto& item : step_unit_->Outputs()) {
     for (const auto& var : item.second) {
-      stepnet_outputs.push_back(var);
+      step_unit_outputs.push_back(var);
     }
   }
 
@@ -180,13 +181,13 @@ void DynamicRecurrentOp::CreateScopes() const {
     auto& scope = cache_.GetScope(step);
     detail::CreateVariables(scope, arg_.inlinks);
     detail::CreateVariables(scope, arg_.outlinks);
-    detail::CreateVariables(scope, memories);
-    detail::CreateVariables(scope, pre_memories);
-    detail::CreateVariables(scope, stepnet_outputs);
+    detail::CreateVariables(scope, states);
+    detail::CreateVariables(scope, ex_states);
+    detail::CreateVariables(scope, step_unit_outputs);
   }
 }
 
-void DynamicRecurrentOp::ConcatOutputs() const {
+void RNNAlgorithm::ConcatOutputs() {
   // TODO(superjom) transform this to a config
   int level = 0;
   for (size_t step = 0; step < cache_.num_steps; step++) {
@@ -199,31 +200,45 @@ void DynamicRecurrentOp::ConcatOutputs() const {
       item.second.WriteShared(step, *tensor);
     }
   }
-  // the inlinks' lods should be the same, so randomly get one lod.
+  // the inputs' lods should be the same, so randomly get one lod.
   const auto& some_lod =
       cache_.scope->FindVar(arg_.inlinks.front())->Get<LoDTensor>().lod();
   const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()];
   for (auto& item : step_outputs_) {
     auto tensor = item.second.Pack(level, some_meta, some_lod);
-    auto* output = cache_.outlinks[item.first]->GetMutable<LoDTensor>();
-    const_cast<LoDTensor*>(output)->ShareDataWith<value_type>(tensor);
+    auto* output = cache_.outputs[item.first]->GetMutable<LoDTensor>();
+    const_cast<LoDTensor*>(output)->ShareDataWith(tensor);
+  }
+}
+
+void RNNAlgorithm::RunSteps() {
+  if (IsBackward()) {
+    // call the step unit on all the time steps in reverse order
+    for (int step = cache_.num_steps - 1; step >= 0; step--) {
+      auto& step_scope = cache_.GetScope(step);
+      step_unit_->Run(step_scope, *cache_.dev_ctx);
+    }
+  } else {
+    for (size_t step = 0; step < cache_.num_steps; step++) {
+      auto& step_scope = cache_.GetScope(step);
+      step_unit_->Run(step_scope, *cache_.dev_ctx);
+    }
   }
 }
 
-void DynamicRecurrentOp::InitStates() const {
+void RNNAlgorithm::InitStates() {
   for (size_t step = 0; step < cache_.num_steps; step++) {
-    for (const auto& memory : arg_.memories) {
-      CreateState(memory, step);
-      LinkState(memory, step);
+    for (const auto& state : arg_.states) {
+      CreateState(state, step);
+      LinkState(state, step);
     }
   }
 }
 
-void DynamicRecurrentOp::CreateState(const rnn::MemoryAttr& memory,
-                                     size_t step) const {
+void RNNAlgorithm::CreateState(const rnn::StateAttr& state_attr, size_t step) {
   auto& scope = cache_.GetScope(step);
-  auto& state = *cache_.GetTensor(scope, memory.var);
-  auto& boot_state = *cache_.GetTensor(*cache_.scope, memory.boot_var);
+  auto& state = *cache_.GetTensor(scope, state_attr.var);
+  auto& boot_state = *cache_.GetTensor(*cache_.scope, state_attr.boot_var);
 
   size_t num_instances =
       step_inputs_[arg_.inlinks.front()].Read(step).dims()[0];
@@ -232,56 +247,79 @@ void DynamicRecurrentOp::CreateState(const rnn::MemoryAttr& memory,
 
   state.Resize(dims);
   state.mutable_data<value_type>(platform::CPUPlace());
-  states_[memory.var].WriteShared(step, state);
+  states_[state_attr.var].WriteShared(step, state);
 }
 
-void DynamicRecurrentOp::LinkState(const rnn::MemoryAttr& memory,
-                                   size_t step) const {
+void RNNAlgorithm::LinkState(const rnn::StateAttr& state, size_t step) {
   auto& scope = cache_.GetScope(step);
-  auto& state_pre = *cache_.GetTensor(scope, memory.pre_var);
+  auto& state_pre = *cache_.GetTensor(scope, state.pre_var);
+
+  // Process the first state's boot-state (the 0-step in forward mode or the
+  // last step in backward mode).
+  // Only forward mode needs to link the boot-state to the `pre-state` in the
+  // first time step. Backward mode instead copies the gradient of `pre-state`
+  // in the first time step to the gradient of `boot-state`.
+  if (step == 0 && IsForward()) {
+    LinkInitialState(state);
+  } else {
+    size_t num_instances =
+        step_inputs_[arg_.inlinks.front()].Read(step).dims()[0];
+    auto* pre_state = cache_.GetTensor(cache_.GetScope(step - 1), state.var);
+    // shrink and share from previous state
+    auto shrinked_pre_state = pre_state->Slice(0, num_instances);
+    state_pre.ShareDataWith(shrinked_pre_state);
+  }
+}
 
+void RNNAlgorithm::LinkInitialState(const rnn::StateAttr& state) {
   // all the step_inputs' metas should be the same, just randomly select one
   // and get the dyseq meta.
   const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()];
-  size_t num_instances =
-      step_inputs_[arg_.inlinks.front()].Read(step).dims()[0];
+  auto& scope = cache_.GetScope(0);
+  auto& state_pre = *cache_.GetTensor(scope, state.pre_var);
+  auto* pre_state = cache_.GetTensor(*cache_.scope, state.boot_var);
+  pre_state->mutable_data<float>(platform::CPUPlace());
+  // allocate state
+  state_pre.Resize(pre_state->dims());
+  state_pre.mutable_data<value_type>(platform::CPUPlace());
+  detail::ReorderInitialState(some_meta, *pre_state, &state_pre,
+                              pre_state->place());
+}
 
-  LoDTensor* pre_state{nullptr};
-  if (step == 0) {
-    pre_state = cache_.GetTensor(*cache_.scope, memory.boot_var);
-    pre_state->mutable_data<float>(platform::CPUPlace());
-    // allocate memory
-    state_pre.Resize(pre_state->dims());
-    state_pre.mutable_data<value_type>(platform::CPUPlace());
-    detail::ReorderBootState<value_type>(some_meta, *pre_state, &state_pre,
-                                         pre_state->place());
-  } else {
-    pre_state = cache_.GetTensor(cache_.GetScope(step - 1), memory.var);
-  }
+void RNNAlgorithm::ExportInitialStateGradient(const rnn::StateAttr& state) {
+  // all the step_inputs' metas should be the same, just randomly select one
+  // and get the dyseq meta.
+  const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()];
+  auto& scope = cache_.GetScope(0);
 
-  // shink and share from previous state
-  auto shrinked_pre_state = pre_state->Slice<value_type>(0, num_instances);
-  state_pre.ShareDataWith<value_type>(shrinked_pre_state);
+  auto& state_pre = *cache_.GetTensor(scope, state.pre_var);
+  auto& pre_state = *cache_.GetTensor(*cache_.scope, state.boot_var);
+  pre_state.Resize(state_pre.dims());
+  detail::RestoreInitialState(some_meta, state_pre, &pre_state,
+                              pre_state.place());
 }
 
-void DynamicRecurrentOp::ArgCache::Init(
-    const rnn::ArgumentName& name, const paddle::framework::OperatorBase& op,
-    const paddle::framework::Scope& scope, rnn::Argument* arg) {
+void RNNAlgorithm::ArgCache::Init(const rnn::ArgumentName& name,
+                                  const paddle::framework::OperatorBase& op,
+                                  const paddle::framework::Scope& scope,
+                                  platform::DeviceContext const* dev_ctx,
+                                  rnn::Argument* arg) {
   this->scope = &scope;
   InitArgument(name, op, arg);
   CacheScopes(scope, *arg);
   CacheInlinks(scope, arg->inlinks);
   CacheOutlinks(scope, arg->outlinks);
+  this->dev_ctx = dev_ctx;
 }
 
-void DynamicRecurrentOp::ArgCache::InitArgument(const rnn::ArgumentName& name,
-                                                const OperatorBase& op,
-                                                rnn::Argument* arg) {
+void RNNAlgorithm::ArgCache::InitArgument(const rnn::ArgumentName& name,
+                                          const OperatorBase& op,
+                                          rnn::Argument* arg) {
   rnn::InitArgument(name, arg, op, false /*is_grad*/);
 }
 
-void DynamicRecurrentOp::ArgCache::CacheScopes(const Scope& scope,
-                                               const rnn::Argument& arg) {
+void RNNAlgorithm::ArgCache::CacheScopes(const Scope& scope,
+                                         const rnn::Argument& arg) {
   auto scopes_var = scope.FindVar(arg.step_scopes);
   PADDLE_ENFORCE(scopes_var != nullptr,
                  "the step_scopes output argument [%s] should be created first "
@@ -290,45 +328,85 @@ void DynamicRecurrentOp::ArgCache::CacheScopes(const Scope& scope,
   this->scopes = scopes_var->GetMutable<std::vector<Scope*>>();
 }
 
-void DynamicRecurrentOp::ArgCache::CacheInlinks(
+void RNNAlgorithm::ArgCache::CacheInlinks(
     const Scope& scope, const std::vector<std::string>& names) {
   for (auto name : names) {
     auto* var = GetVariable(scope, name);
-    inlinks[name] = var;
+    inputs[name] = var;
   }
 }
 
-void DynamicRecurrentOp::ArgCache::CacheOutlinks(
+void RNNAlgorithm::ArgCache::CacheOutlinks(
     const Scope& scope, const std::vector<std::string>& names) {
   for (auto name : names) {
     auto* var = GetVariable(scope, name);
-    outlinks[name] = var;
+    outputs[name] = var;
   }
 }
 
-Variable* DynamicRecurrentOp::ArgCache::GetVariable(const Scope& scope,
-                                                    const std::string& name) {
+Variable* RNNAlgorithm::ArgCache::GetVariable(const Scope& scope,
+                                              const std::string& name) {
   auto* var = scope.FindVar(name);
   PADDLE_ENFORCE_NOT_NULL(var, "variable [%s] not exist in scope", name);
   return var;
 }
 
-LoDTensor* DynamicRecurrentOp::ArgCache::GetTensor(
-    const framework::Scope& scope, const std::string& name) {
+LoDTensor* RNNAlgorithm::ArgCache::GetTensor(const framework::Scope& scope,
+                                             const std::string& name) {
   auto* var = GetVariable(scope, name);
   return var->GetMutable<LoDTensor>();
 }
 
-const rnn::ArgumentName DynamicRecurrentOp::kArgName{
-    "step_net", "step_scopes",  "inlinks",      "outlinks",
-    "memories", "pre_memories", "boot_memories"};
+const std::array<rnn::ArgumentName, 2> RNNAlgorithm::kArgNames{
+    {rnn::ArgumentName{"step_unit", "step_scopes", "inputs", "outputs",
+                       "states", "ex_states", "initial_states"},
+     rnn::ArgumentName{"step_unit", "step_scopes@GRAD", "outputs@GRAD",
+                       "inputs@GRAD", "states", "ex_states",
+                       "initial_states@GRAD"}}};
+
+void DynamicRecurrentOp::Run(const framework::Scope& scope,
+                             const platform::DeviceContext& dev_ctx) const {
+  rnn.Run<RNNAlgorithm::ComputeMode::kForward>(
+      scope, *dynamic_cast<const OperatorBase*>(this), dev_ctx);
+}
 
 void DynamicRecurrentGradientOp::Run(
-    const Scope& scope, const platform::DeviceContext& dev_ctx) const {}
+    const Scope& scope, const platform::DeviceContext& dev_ctx) const {
+  rnn.Run<RNNAlgorithm::ComputeMode::kBackward>(
+      scope, *dynamic_cast<const OperatorBase*>(this), dev_ctx);
+}
+
+class DynamicRecurrentOpProtoAndCheckerMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  DynamicRecurrentOpProtoAndCheckerMaker(framework::OpProto* proto,
+                                         framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    const auto& name =
+        RNNAlgorithm::kArgNames[RNNAlgorithm::ComputeMode::kForward];
+    // inputs and outputs stored in proto
+    AddInput(name.inlinks,
+             "the inputs that need to be segmented for each step.")
+        .AsDuplicable();
+    AddInput(name.initial_states, "variables to initialize states.")
+        .AsDuplicable();
+
+    AddOutput(name.outlinks, "the outputs that need to concated for all steps.")
+        .AsDuplicable();
+    AddOutput(name.step_scopes, "step scopes");
+
+    // Attributes stored in AttributeMap
+    AddAttr<std::vector<std::string>>(name.ex_states, "names of ex_states");
+    AddAttr<std::vector<std::string>>(name.states, "names of states");
+
+    AddComment("This is a RNN operator for varience-length sequences.");
+  }
+};
 
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_WITHOUT_GRADIENT(
-    dynamic_recurrent, paddle::operators::DynamicRecurrentOp,
-    paddle::operators::DynamicRecurrentOpProtoAndCheckerMaker);
+REGISTER_OP(dynamic_recurrent, paddle::operators::DynamicRecurrentOp,
+            paddle::operators::DynamicRecurrentOpProtoAndCheckerMaker,
+            dynamic_recurrent_grad,
+            paddle::operators::DynamicRecurrentGradientOp);
diff --git a/paddle/operators/dynamic_recurrent_op.h b/paddle/operators/dynamic_recurrent_op.h
index ec80a1c90eee3a655febe0dd3d6c67c16ec6c64b..5b0548c3a44c9f58838ecc567ee41a587883c26a 100644
--- a/paddle/operators/dynamic_recurrent_op.h
+++ b/paddle/operators/dynamic_recurrent_op.h
@@ -27,47 +27,39 @@
 namespace paddle {
 namespace operators {
 
-class DynamicRecurrentOp : public framework::OperatorBase {
+class RNNAlgorithm {
  public:
-  static const rnn::ArgumentName kArgName;
+  enum ComputeMode { kForward = 0, kBackward = 1 };
+  static const std::array<rnn::ArgumentName, 2> kArgNames;
   using value_type = float;
 
-  DynamicRecurrentOp(const std::string& type,
-                     const framework::VariableNameMap& inputs,
-                     const framework::VariableNameMap& outputs,
-                     const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  DynamicRecurrentOp(const DynamicRecurrentOp& o)
-      : framework::OperatorBase(
-            static_cast<const framework::OperatorBase&>(o)) {
-    // TODO(yuyang18): Implement copy ctor well.
-    PADDLE_THROW("Not implemented");
-  }
-
-  void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override;
-
+  /*
+   * Different `Run` method for forward and backward, `_` is just for template
+   * specialization.
+   */
+  template <ComputeMode _>
+  void Run(const framework::Scope& scope, const framework::OperatorBase& op,
+           const platform::DeviceContext& dev_ctx);
   /*
    * Split the inputs(LoDTensors) to segments for each time step.
    */
-  void SplitInputs() const;
+  void SplitInputs();
 
   /*
    * Create step-scopes to store temporary outputs in each time steps.
    */
-  void CreateScopes() const;
+  void CreateScopes();
 
   /*
    * Link TensorArray steps to the corresponding variables located in
    * step-scopes.
    */
-  void WriteStepInputs() const;
+  void WriteStepInputs();
 
   /*
    * Write output of each step to the corresponding TensorArray.
    */
-  void WriteStepOutputs() const;
+  void WriteStepOutputs();
 
   /*
    * Initialize the states, each state will have a corresponding pre-state,
@@ -75,54 +67,83 @@ class DynamicRecurrentOp : public framework::OperatorBase {
    * pre-state in the first time step will be initialized with an zero tensor or
    * a tensor in parent scope if is provided.
    */
-  void InitStates() const;
+  void InitStates();
 
   /*
    * Create state variables for each time step.
    */
-  void CreateState(const rnn::MemoryAttr& memory, size_t step) const;
+  void CreateState(const rnn::StateAttr& state, size_t step);
 
   /*
    * Link pre-state variable in current scope to the state variable in the
-   * previous time step (scope).
+   * previous time step (scope) by reference.
+   */
+  void LinkState(const rnn::StateAttr& state, size_t step);
+
+  /*
+   * Link the pre-state of the first time step to the `boot-state` in parent's
+   * scope.
+   */
+  void LinkInitialState(const rnn::StateAttr& state);
+
+  /*
+   * Copy the gradient from `pre-state` in the first step-scope to the
+   * `boot-state` in parent's scope.
+   */
+  void ExportInitialStateGradient(const rnn::StateAttr& state);
+
+  /*
+   * Calculate time steps.
    */
-  void LinkState(const rnn::MemoryAttr& memory, size_t step) const;
+  void RunSteps();
 
   /*
    * Concatenate outputs in each time step and generate a LoDTensor.
    */
-  void ConcatOutputs() const;
+  void ConcatOutputs();
+
+  void SetComputeMode(ComputeMode mode) { mode_ = mode; }
+  bool IsForward() const { return mode_ == ComputeMode::kForward; }
+  bool IsBackward() const { return mode_ == ComputeMode::kBackward; }
 
   /*
-   * set a stepnet that is created according to a RecurrentOp's stepnet.
+   * set a step unit that is created according to a RecurrentOp's step unit.
    */
-  void SetStepNet(std::unique_ptr<OperatorBase> net) {
-    PADDLE_ENFORCE_NOT_NULL(net);
-    stepnet_ = std::move(net);
+  void SetStepUnit(std::unique_ptr<framework::OperatorBase> step_unit) {
+    PADDLE_ENFORCE_NOT_NULL(step_unit);
+    step_unit_ = std::move(step_unit);
   }
-  const OperatorBase& GetStepNet() const { return *stepnet_; }
+  const framework::OperatorBase& GetStepUnit() const { return *step_unit_; }
 
   const framework::TensorArray& state(const std::string& name) const {
-    return states_[name];
+    auto it = states_.find(name);
+    PADDLE_ENFORCE(it != states_.end());
+    return it->second;
   }
   const framework::TensorArray& step_input(const std::string& name) const {
-    return step_inputs_[name];
+    auto it = step_inputs_.find(name);
+    PADDLE_ENFORCE(it != step_inputs_.end());
+    return it->second;
   }
   const framework::TensorArray& step_output(const std::string& name) const {
-    return step_outputs_[name];
+    auto it = step_outputs_.find(name);
+    PADDLE_ENFORCE(it != step_outputs_.end());
+    return it->second;
   }
 
  protected:
   struct ArgCache {
     framework::Scope const* scope;
     std::vector<framework::Scope*>* scopes;
-    std::map<std::string, framework::Variable*> inlinks;
-    std::map<std::string, framework::Variable*> outlinks;
+    std::map<std::string, framework::Variable*> inputs;
+    std::map<std::string, framework::Variable*> outputs;
+    platform::DeviceContext const* dev_ctx;
 
     size_t num_steps{0};
 
-    void Init(const rnn::ArgumentName& name, const OperatorBase& op,
-              const framework::Scope& scope, rnn::Argument* arg);
+    void Init(const rnn::ArgumentName& name, const framework::OperatorBase& op,
+              const framework::Scope& scope,
+              platform::DeviceContext const* dev_ctx, rnn::Argument* arg);
 
     framework::Scope& GetScope(size_t index) {
       PADDLE_ENFORCE_LT(index, num_steps);
@@ -133,8 +154,8 @@ class DynamicRecurrentOp : public framework::OperatorBase {
                                     const std::string& name);
 
    private:
-    void InitArgument(const rnn::ArgumentName& name, const OperatorBase& op,
-                      rnn::Argument* arg);
+    void InitArgument(const rnn::ArgumentName& name,
+                      const framework::OperatorBase& op, rnn::Argument* arg);
     void CacheScopes(const framework::Scope& scope, const rnn::Argument& arg);
     void CacheInlinks(const framework::Scope& scope,
                       const std::vector<std::string>& names);
@@ -145,27 +166,49 @@ class DynamicRecurrentOp : public framework::OperatorBase {
   };
 
  private:
-  std::unique_ptr<OperatorBase> stepnet_;
-  mutable std::map<std::string, framework::TensorArray> states_;
-  mutable std::map<std::string, framework::TensorArray> step_inputs_;
-  mutable std::map<std::string, framework::TensorArray> step_outputs_;
-  mutable std::map<std::string, std::vector<framework::DySeqMeta>>
-      dy_seq_metas_;
-  mutable rnn::Argument arg_;
-  mutable ArgCache cache_;
+  std::unique_ptr<framework::OperatorBase> step_unit_;
+  std::map<std::string, framework::TensorArray> states_;
+  std::map<std::string, framework::TensorArray> step_inputs_;
+  std::map<std::string, framework::TensorArray> step_outputs_;
+  std::map<std::string, std::vector<framework::DySeqMeta>> dy_seq_metas_;
+  rnn::Argument arg_;
+  ArgCache cache_;
+  ComputeMode mode_{ComputeMode::kForward};
 
 #ifdef PADDLE_WITH_TESTING
-  friend class DynamicRecurrentOpTestHelper;
-  FRIEND_TEST(DynamicRecurrentOpTestHelper, SplitInputs);
-  FRIEND_TEST(DynamicRecurrentOpTestHelper, CreateCache);
-  FRIEND_TEST(DynamicRecurrentOpTestHelper, CreateScopes);
-  FRIEND_TEST(DynamicRecurrentOpTestHelper, WriteStepInputs);
-  FRIEND_TEST(DynamicRecurrentOpTestHelper, WriteStepOutputs);
-  FRIEND_TEST(DynamicRecurrentOpTestHelper, InitStates);
-  FRIEND_TEST(DynamicRecurrentOpTestHelper, ConcatOutputs);
+  // test forward
+  friend class RNNAlgorithmTestHelper;
+  FRIEND_TEST(RNNAlgorithmTestHelper, SplitInputs);
+  FRIEND_TEST(RNNAlgorithmTestHelper, CreateCache);
+  FRIEND_TEST(RNNAlgorithmTestHelper, CreateScopes);
+  FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepInputs);
+  FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepOutputs);
+  FRIEND_TEST(RNNAlgorithmTestHelper, InitStates);
+  FRIEND_TEST(RNNAlgorithmTestHelper, ConcatOutputs);
+// TODO(superjom) test backward
 #endif
 };
 
+class DynamicRecurrentOp : public framework::OperatorBase {
+ public:
+  DynamicRecurrentOp(const std::string& type,
+                     const framework::VariableNameMap& inputs,
+                     const framework::VariableNameMap& outputs,
+                     const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  DynamicRecurrentOp(const DynamicRecurrentOp& o)
+      : framework::OperatorBase(
+            static_cast<const framework::OperatorBase&>(o)) {
+    PADDLE_THROW("Not implemented");
+  }
+
+  void Run(const framework::Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override;
+
+  mutable RNNAlgorithm rnn;
+};
+
 class DynamicRecurrentGradientOp : public framework::OperatorBase {
  public:
   DynamicRecurrentGradientOp(const std::string& type,
@@ -174,8 +217,16 @@ class DynamicRecurrentGradientOp : public framework::OperatorBase {
                              const framework::AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
+  DynamicRecurrentGradientOp(const DynamicRecurrentGradientOp& o)
+      : framework::OperatorBase(
+            static_cast<const framework::OperatorBase&>(o)) {
+    PADDLE_THROW("Not implemented");
+  }
+
   void Run(const framework::Scope& scope,
            const platform::DeviceContext& dev_ctx) const override;
+
+  mutable RNNAlgorithm rnn;
 };
 
 }  // namespace operators
diff --git a/paddle/operators/dynamic_recurrent_op_test.cc b/paddle/operators/dynamic_recurrent_op_test.cc
index 36f405568d7e4ed9a469c3af7a80192b83142b7a..fff63efb24c70b7e864e2d5b011a22883c13dede 100644
--- a/paddle/operators/dynamic_recurrent_op_test.cc
+++ b/paddle/operators/dynamic_recurrent_op_test.cc
@@ -43,16 +43,16 @@ LoDTensor* CreateVar(Scope& scope, std::string name, framework::DDim dims,
   return tensor;
 }
 
-class DynamicRecurrentOpTestHelper : public ::testing::Test {
+class RNNAlgorithmTestHelper : public ::testing::Test {
  protected:
-  const rnn::ArgumentName argname = DynamicRecurrentOp::kArgName;
+  const rnn::ArgumentName argname = RNNAlgorithm::kArgNames[0];
 
   virtual void SetUp() override {
     CreateGlobalVariables();
 
     auto op_desc = CreateOpDesc();
     op = paddle::framework::OpRegistry::CreateOp(op_desc, nullptr);
-    dop = dynamic_cast<DynamicRecurrentOp*>(op.get());
+    dop = &(dynamic_cast<DynamicRecurrentOp*>(op.get())->rnn);
     InitCacheManually();
     InitStepNet();
   }
@@ -63,20 +63,20 @@ class DynamicRecurrentOpTestHelper : public ::testing::Test {
     op_desc.set_type("dynamic_recurrent");
 
     OpDescNewVar(argname.inlinks, {"in0"}, op_desc.add_inputs());
-    OpDescNewVar(argname.boot_memories, {"boot_mem"}, op_desc.add_inputs());
+    OpDescNewVar(argname.initial_states, {"boot_mem"}, op_desc.add_inputs());
     OpDescNewVar(argname.step_scopes, {"step_scopes"}, op_desc.add_outputs());
     OpDescNewVar(argname.outlinks, {"out0"}, op_desc.add_outputs());
 
-    // set pre-memories
+    // set pre-states
     auto pre_memories = op_desc.mutable_attrs()->Add();
-    pre_memories->set_name(argname.pre_memories);
+    pre_memories->set_name(argname.ex_states);
     pre_memories->set_type(paddle::framework::AttrType::STRINGS);
     auto pre_memories_item = pre_memories->add_strings();
     *pre_memories_item = "mem@pre";
 
-    // set memories
+    // set states
     auto memories = op_desc.mutable_attrs()->Add();
-    memories->set_name(argname.memories);
+    memories->set_name(argname.states);
     memories->set_type(paddle::framework::AttrType::STRINGS);
     auto memories_item = memories->add_strings();
     *memories_item = "mem";
@@ -113,32 +113,33 @@ class DynamicRecurrentOpTestHelper : public ::testing::Test {
   }
 
   void InitCacheManually() {
-    dop->cache_.Init(DynamicRecurrentOp::kArgName, *dop, scope, &dop->arg_);
+    dop->cache_.Init(RNNAlgorithm::kArgNames[0], *op, scope, &device_context,
+                     &dop->arg_);
   }
 
   void InitStepNet() {
     std::unique_ptr<framework::OperatorBase> stepnet{new NetOp};
     dynamic_cast<NetOp*>(stepnet.get())
         ->AppendOp(std::unique_ptr<TestOp>(new TestOp(
-            "test", {{"inlinks", {"in0"}}, {"boot_memories", {"boot_mem"}}},
-            {{"outlinks", {"out0"}}, {"step_scopes", {"step_scopes"}}}, {})));
-    dop->SetStepNet(std::move(stepnet));
+            "test", {{"inputs", {"in0"}}, {"initial_states", {"boot_mem"}}},
+            {{"outputs", {"out0"}}, {"step_scopes", {"step_scopes"}}}, {})));
+    dop->SetStepUnit(std::move(stepnet));
   }
 
  protected:
-  DynamicRecurrentOp* dop;
+  RNNAlgorithm* dop;
   std::unique_ptr<framework::OperatorBase> op;
   paddle::platform::CPUDeviceContext device_context;
   paddle::framework::Scope scope;
 };
 
-TEST_F(DynamicRecurrentOpTestHelper, CreateCache) {
+TEST_F(RNNAlgorithmTestHelper, CreateCache) {
   const rnn::Argument& arg = dop->arg_;
   ASSERT_EQ(arg.inlinks.size(), 1UL);
   ASSERT_EQ(arg.outlinks.size(), 1UL);
 }
 
-TEST_F(DynamicRecurrentOpTestHelper, SplitInputs) {
+TEST_F(RNNAlgorithmTestHelper, SplitInputs) {
   dop->SplitInputs();
   auto& in0_ta = dop->step_inputs_["in0"];
   ASSERT_EQ(in0_ta.size(), 4UL);
@@ -153,14 +154,14 @@ TEST_F(DynamicRecurrentOpTestHelper, SplitInputs) {
   EXPECT_EQ(batch3.dims()[0], 1);
 }
 
-TEST_F(DynamicRecurrentOpTestHelper, CreateScopes) {
+TEST_F(RNNAlgorithmTestHelper, CreateScopes) {
   dop->SplitInputs();
   dop->CreateScopes();
   ASSERT_EQ(dop->cache_.num_steps, 4UL);
   ASSERT_EQ(dop->cache_.scopes->size(), 4UL);
 }
 
-TEST_F(DynamicRecurrentOpTestHelper, WriteStepInputs) {
+TEST_F(RNNAlgorithmTestHelper, WriteStepInputs) {
   dop->SplitInputs();
   dop->CreateScopes();
   dop->WriteStepInputs();
@@ -173,7 +174,7 @@ TEST_F(DynamicRecurrentOpTestHelper, WriteStepInputs) {
   }
 }
 
-TEST_F(DynamicRecurrentOpTestHelper, WriteStepOutputs) {
+TEST_F(RNNAlgorithmTestHelper, WriteStepOutputs) {
   dop->SplitInputs();
   dop->CreateScopes();
   dop->WriteStepInputs();
@@ -187,11 +188,12 @@ TEST_F(DynamicRecurrentOpTestHelper, WriteStepOutputs) {
   }
 }
 
-TEST_F(DynamicRecurrentOpTestHelper, ConcatOutputs) {
+TEST_F(RNNAlgorithmTestHelper, ConcatOutputs) {
   // Let's leave this test to python unittest.
 }
 
-TEST_F(DynamicRecurrentOpTestHelper, InitStates) {
+TEST_F(RNNAlgorithmTestHelper, InitStates) {
+  dop->SetComputeMode(RNNAlgorithm::ComputeMode::kForward);
   dop->SplitInputs();
   dop->CreateScopes();
   dop->WriteStepInputs();
@@ -208,12 +210,6 @@ TEST_F(DynamicRecurrentOpTestHelper, InitStates) {
 
     auto* boot_state = scope.FindVar("boot_mem");
     ASSERT_TRUE(boot_state != nullptr);
-
-    if (step == 0) {
-      // check pre_state is a reference of boot_state
-      ASSERT_EQ(boot_state->Get<LoDTensor>().data<float>(),
-                pre_state->Get<LoDTensor>().data<float>());
-    }
   }
 }
 
diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h
index 3eb97f60b59848d23bcd15ea1e3d2f21b721f6a4..488a35aafc8600bb8bb252fc3a5161c72a2f6df1 100644
--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
@@ -108,7 +108,7 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) {
   PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
                     "Rank of first input must >= rank of second input.")
 
-  if (x_dims == y_dims || product(y_dims) == 1) {
+  if (x_dims == y_dims) {
     functor f;
     f.template Run<Place, T>(x, y, z, ctx);
     return;
@@ -174,12 +174,6 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
     return;
   }
 
-  if (product(y_dims) == 1) {
-    functor1 f;
-    f(place, x, y, out, dx, dy, dout);
-    return;
-  }
-
   int axis = ctx.Attr<int>("axis");
   axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
 
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
index d742bbe51b678fcdaf54826947d29060bf3e4e0d..0f1722a5383c80ff2ede0801d34f22a80fbc6e52 100644
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -26,8 +26,9 @@ class FeedOp : public framework::OperatorBase {
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
            const platform::DeviceContext &dev_ctx) const override {
-    auto feed_var_name = Input("Input");
+    auto feed_var_name = Input("X");
     auto *feed_var = scope.FindVar(feed_var_name);
+
     PADDLE_ENFORCE(feed_var != nullptr,
                    "Cannot find feed_var in scope, feed_var_name is %s",
                    feed_var_name);
@@ -40,18 +41,32 @@ class FeedOp : public framework::OperatorBase {
 
     auto col = Attr<int>("col");
 
+    VLOG(3) << "Feed Var " << feed_var_name << "'s " << col
+            << " column to var " << out_name;
+
     auto &feed_list = feed_var->Get<framework::FeedFetchList>();
     auto &feed_item = feed_list.at(static_cast<size_t>(col));
     auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
-    out_item->CopyFromTensor(feed_item, dev_ctx.GetPlace(), dev_ctx);
+    out_item->CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx);
     out_item->set_lod(feed_item.lod());
   }
 };
 
+class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FeedOpInfoMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of feed op");
+    AddOutput("Out", "The output of feed op");
+    AddComment("feed op, it should not be configured by users directly");
+    AddAttr<int>("col", "column of feed");
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
-// We do not need to register OpInfoMaker,
-// since feed operator will not be used by end users directly
 REGISTER_OPERATOR(feed, paddle::operators::FeedOp,
-                  paddle::framework::EmptyGradOpMaker);
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::FeedOpInfoMaker);
diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc
index 55d6ac093959a6e1c11457085a8ebdd8a14adaf3..c1b3d66bac4c703ce78b247aadc2975bb146b5b0 100644
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -27,7 +27,7 @@ class FetchOp : public framework::OperatorBase {
 
   void Run(const framework::Scope &scope,
            const platform::DeviceContext &dev_ctx) const override {
-    auto fetch_var_name = Input("Input");
+    auto fetch_var_name = Input("X");
     auto *fetch_var = scope.FindVar(fetch_var_name);
     PADDLE_ENFORCE(fetch_var != nullptr,
                    "Cannot find fetch variable in scope, fetch_var_name is %s",
@@ -51,14 +51,26 @@ class FetchOp : public framework::OperatorBase {
 
     // FIXME(yuyang18): Should we assume the fetch operator always generate
     // CPU outputs?
-    dst_item.CopyFromTensor(src_item, platform::CPUPlace(), dev_ctx);
+    dst_item.CopyFrom(src_item, platform::CPUPlace(), dev_ctx);
+
+    VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name;
   }
 };
 
+class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FetchOpInfoMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of fetch op");
+    AddOutput("Out", "The output of fetch op");
+    AddComment("fetch op, it should not be configured by users directly");
+    AddAttr<int>("col", "column of fetch");
+  }
+};
 }  // namespace operators
 }  // namespace paddle
 
-// We do not need to register OpInfoMaker,
-// since fetch operator will not be used by end users directly
 REGISTER_OPERATOR(fetch, paddle::operators::FetchOp,
-                  paddle::framework::EmptyGradOpMaker);
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::FetchOpInfoMaker);
diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc
index f59f497d9f32069b764a9f777c7e9d6da9cdb108..04dfdf7c48381240108cf924979764966599151f 100644
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -59,7 +59,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
  protected:
   framework::DataType IndicateDataType(
       const framework::ExecutionContext& ctx) const override {
-    return static_cast<framework::DataType>(Attr<int>("data_type"));
+    return static_cast<framework::DataType>(ctx.Attr<int>("data_type"));
   }
 };
 
diff --git a/paddle/operators/increment_op.cc b/paddle/operators/increment_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..139392c691e00b2a94f46801f1cfc2018ce139f5
--- /dev/null
+++ b/paddle/operators/increment_op.cc
@@ -0,0 +1,85 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/increment_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Operator definition of `increment`: Out = X + step.
+class IncrementOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Checks that Input(X) and Output(Out) are present, then sets the dims of
+  // Out to those of X and calls ShareLoD from X to Out.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of IncrementOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of IncrementOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+// Declares the proto of `increment`: input X, output Out and the attribute
+// `step` of type AttrType, whose default is set through SetDefault(1.0).
+template <typename AttrType>
+class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  IncrementOpMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) The input tensor of increment operator");
+    AddOutput("Out", "(Tensor) The output tensor of increment operator.");
+    AddComment(R"DOC(Increment operator
+
+The equation is: Out = X + step
+)DOC");
+    AddAttr<AttrType>("step",
+                      "The step size by which the "
+                      "input tensor will be incremented.")
+        .SetDefault(1.0);
+  }
+};
+
+// Describes the gradient of `increment` as a `scale` op with scale 1.0 that
+// takes the gradient of Out as its X and emits the gradient of X as its Out.
+class IncrementGradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    // Own the desc through unique_ptr from creation so it is released if any
+    // of the calls below throws.
+    std::unique_ptr<framework::OpDescBind> grad_op(
+        new framework::OpDescBind());
+    grad_op->SetType("scale");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttr("scale", 1.0f);
+    return grad_op;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker<float>,
+                  ops::IncrementGradOpMaker);
+REGISTER_OP_CPU_KERNEL(increment,
+                       ops::IncrementKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/increment_op.cu b/paddle/operators/increment_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..659c380d147a36650452bea23b30cbcf1ff516ee
--- /dev/null
+++ b/paddle/operators/increment_op.cu
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/increment_op.h"
+
+REGISTER_OP_GPU_KERNEL(
+    increment,
+    paddle::operators::IncrementKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/increment_op.h b/paddle/operators/increment_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..342e254fc453555c70923efbca02fdfd014af015
--- /dev/null
+++ b/paddle/operators/increment_op.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Kernel of the `increment` op: writes X + step into Out, with both tensors
+// viewed as flat Eigen vectors.
+template <typename Place, typename T, typename AttrType = T>
+class IncrementKernel : public framework::OpKernel<T> {
+ public:
+  // Calls mutable_data<T> on Out with the place of X, reads `step` as
+  // AttrType and casts it to T, then evaluates Out = X + step on the Eigen
+  // device obtained for `Place`.
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto* in = context.Input<framework::Tensor>("X");
+    out->mutable_data<T>(in->place());
+
+    auto step = static_cast<T>(context.Attr<AttrType>("step"));
+
+    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
+    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
+    auto& place = context.GetEigenDevice<Place>();
+    eigen_out.device(place) = eigen_in + step;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc
index 9c506ae89bdda38f40fb37e4c4e5f990cd5978b7..443c94b83f0bf24837afe703b19e2ab47a0dd786 100644
--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/operators/math/im2col_test.cc
@@ -64,7 +64,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    input.CopyFrom<float>(input_tmp, *place, *context);
+    input.CopyFrom(input_tmp, *place, *context);
   }
   output_cfo.mutable_data<float>(
       {1, filter_size, filter_size, output_height, output_width}, *place);
@@ -85,8 +85,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     out_cfo_ptr = output_cfo.data<float>();
   } else {
-    output_tmp.CopyFrom<float>(output_cfo, paddle::platform::CPUPlace(),
-                               *context);
+    output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace(), *context);
     out_cfo_ptr = output_tmp.data<float>();
   }
   EXPECT_EQ(out_cfo_ptr[0], 0);
@@ -102,8 +101,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     out_ocf_ptr = output_ocf.data<float>();
   } else {
-    output_tmp.CopyFrom<float>(output_ocf, paddle::platform::CPUPlace(),
-                               *context);
+    output_tmp.CopyFrom(output_ocf, paddle::platform::CPUPlace(), *context);
     out_ocf_ptr = output_tmp.data<float>();
   }
   EXPECT_EQ(out_ocf_ptr[0], 0);
diff --git a/paddle/operators/math/math_function_test.cu b/paddle/operators/math/math_function_test.cu
index 14359d835bba794703a313d70f34082868474b20..8b22c71552a65044cbd02441fb35c1eafe0173dc 100644
--- a/paddle/operators/math/math_function_test.cu
+++ b/paddle/operators/math/math_function_test.cu
@@ -16,15 +16,15 @@ TEST(math_function, notrans_mul_trans) {
   auto* gpu_place = new paddle::platform::GPUPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
 
-  input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
-  input2_gpu.CopyFrom<float>(input1, *gpu_place, context);
+  input1_gpu.CopyFrom(input1, *gpu_place, context);
+  input2_gpu.CopyFrom(input1, *gpu_place, context);
 
   out_gpu.mutable_data<float>({2, 2}, *gpu_place);
 
   paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
       context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
 
-  out.CopyFrom<float>(out_gpu, *cpu_place, context);
+  out.CopyFrom(out_gpu, *cpu_place, context);
 
   float* out_ptr = out.data<float>();
   context.Wait();
@@ -50,15 +50,15 @@ TEST(math_function, trans_mul_notrans) {
   auto* gpu_place = new paddle::platform::GPUPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
 
-  input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
-  input2_gpu.CopyFrom<float>(input1, *gpu_place, context);
+  input1_gpu.CopyFrom(input1, *gpu_place, context);
+  input2_gpu.CopyFrom(input1, *gpu_place, context);
 
   out_gpu.mutable_data<float>({3, 3}, *gpu_place);
 
   paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
       context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
 
-  out.CopyFrom<float>(out_gpu, *cpu_place, context);
+  out.CopyFrom(out_gpu, *cpu_place, context);
 
   float* out_ptr = out.data<float>();
   context.Wait();
@@ -99,9 +99,9 @@ TEST(math_function, gemm_notrans_cublas) {
   auto* gpu_place = new paddle::platform::GPUPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
 
-  input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
-  input2_gpu.CopyFrom<float>(input2, *gpu_place, context);
-  input3_gpu.CopyFrom<float>(input3, *gpu_place, context);
+  input1_gpu.CopyFrom(input1, *gpu_place, context);
+  input2_gpu.CopyFrom(input2, *gpu_place, context);
+  input3_gpu.CopyFrom(input3, *gpu_place, context);
   float* a = input1_gpu.data<float>();
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(*gpu_place);
@@ -109,7 +109,7 @@ TEST(math_function, gemm_notrans_cublas) {
   paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
       context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
 
-  input3.CopyFrom<float>(input3_gpu, *cpu_place, context);
+  input3.CopyFrom(input3_gpu, *cpu_place, context);
 
   // numpy code:
   // a = np.arange(6).reshape(2, 3)
@@ -154,9 +154,9 @@ TEST(math_function, gemm_trans_cublas) {
   auto* gpu_place = new paddle::platform::GPUPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
 
-  input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
-  input2_gpu.CopyFrom<float>(input2, *gpu_place, context);
-  input3_gpu.CopyFrom<float>(input3, *gpu_place, context);
+  input1_gpu.CopyFrom(input1, *gpu_place, context);
+  input2_gpu.CopyFrom(input2, *gpu_place, context);
+  input3_gpu.CopyFrom(input3, *gpu_place, context);
   float* a = input1_gpu.data<float>();
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(*gpu_place);
@@ -164,7 +164,7 @@ TEST(math_function, gemm_trans_cublas) {
   paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
       context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
 
-  input3.CopyFrom<float>(input3_gpu, *cpu_place, context);
+  input3.CopyFrom(input3_gpu, *cpu_place, context);
   context.Wait();
 
   EXPECT_EQ(input3_ptr[0], 0);
diff --git a/paddle/operators/math/selected_rows_functor_test.cu b/paddle/operators/math/selected_rows_functor_test.cu
index 8a9f25b98263c3bef50c38f358a20ea98ebe6324..69607c5afc46921c08ce278bf164e5bed7b446f8 100644
--- a/paddle/operators/math/selected_rows_functor_test.cu
+++ b/paddle/operators/math/selected_rows_functor_test.cu
@@ -67,7 +67,7 @@ TEST(selected_rows_functor, gpu_add) {
   EXPECT_EQ(out_rows[6], 9);
 
   Tensor out_cpu;
-  out_cpu.CopyFrom<float>(*out_value, cpu_place, ctx);
+  out_cpu.CopyFrom(*out_value, cpu_place, ctx);
   ctx.Wait();
 
   auto* out_cpu_data = out_cpu.data<float>();
@@ -94,7 +94,7 @@ TEST(selected_rows_functor, gpu_add) {
   add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
 
   Tensor tensor2_cpu;
-  tensor2_cpu.CopyFrom<float>(*tensor2, cpu_place, ctx);
+  tensor2_cpu.CopyFrom(*tensor2, cpu_place, ctx);
   ctx.Wait();
 
   auto* tensor2_cpu_data = tensor2_cpu.data<float>();
diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc
index 2d69218843a69497b5b501d4297f2ec5ab26a844..74590d17cd0f974f830e760d85daef8ab5318a43 100644
--- a/paddle/operators/math/vol2col_test.cc
+++ b/paddle/operators/math/vol2col_test.cc
@@ -78,7 +78,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    input.CopyFrom<float>(input_tmp, *place, *context);
+    input.CopyFrom(input_tmp, *place, *context);
   }
   output.mutable_data<float>({1, filter_size, filter_size, filter_size,
                               output_depth, output_height, output_width},
@@ -93,7 +93,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     out_cfo_ptr = output.data<float>();
   } else {
-    output_tmp.CopyFrom<float>(output, paddle::platform::CPUPlace(), *context);
+    output_tmp.CopyFrom(output, paddle::platform::CPUPlace(), *context);
     out_cfo_ptr = output_tmp.data<float>();
   }
 
@@ -107,7 +107,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    input.CopyFrom<float>(input_tmp, *place, *context);
+    input.CopyFrom(input_tmp, *place, *context);
   }
 
   paddle::operators::math::Col2VolFunctor<Place, float> col2vol;
@@ -118,7 +118,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     in_ptr = input.data<float>();
   } else {
-    input_tmp.CopyFrom<float>(input, paddle::platform::CPUPlace(), *context);
+    input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context);
     in_ptr = input_tmp.data<float>();
   }
 
diff --git a/paddle/operators/matmul_op.h b/paddle/operators/matmul_op.h
index 8ae54e1eec33c4bce563f697bafbdc68f97ab746..5ce30740c90b5cd0bd4f8ab183cf985ed5d827c1 100644
--- a/paddle/operators/matmul_op.h
+++ b/paddle/operators/matmul_op.h
@@ -46,7 +46,7 @@ class MatMulKernel : public framework::OpKernel<T> {
 template <typename T>
 inline Tensor Reshape(const Tensor& input, const DDim& dims) {
   Tensor output;
-  output.ShareDataWith<T>(input);
+  output.ShareDataWith(input);
   output.Resize(dims);
   return output;
 }
@@ -56,7 +56,7 @@ inline Tensor Reshape(const Tensor& input, const DDim& dims) {
 template <typename T>
 Tensor CombineBatchAndM(const Tensor& input) {
   Tensor output;
-  output.ShareDataWith<T>(input);
+  output.ShareDataWith(input);
   auto in_dims = input.dims();
   if (in_dims.size() == 3) {
     std::vector<int64_t> out_dims = {in_dims[0] * in_dims[1], in_dims[2]};
@@ -80,7 +80,7 @@ Tensor CombineBatchAndN(const framework::ExecutionContext& context,
     std::vector<int64_t> out_dims = {in_dims[1], in_dims[0] * in_dims[2]};
     output.Resize(make_ddim(out_dims));
   } else {
-    output.ShareDataWith<T>(input);
+    output.ShareDataWith(input);
   }
   return output;
 }
diff --git a/paddle/operators/momentum_op.cc b/paddle/operators/momentum_op.cc
index 9be4d15a43d87ae1a27c81498e8b19b0049a3bfa..2d4d6f13720f0e6888edbddcb3243116506227ba 100644
--- a/paddle/operators/momentum_op.cc
+++ b/paddle/operators/momentum_op.cc
@@ -75,12 +75,17 @@ class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("VelocityOut", "(Tensor) Output updated velocity");
 
     AddAttr<float>("mu", "(float) Momentum coefficient");
+    AddAttr<bool>("useNesterov", "(bool) Use Nesterov Momentum")
+        .SetDefault(false);
     AddComment(R"DOC(
 
-Momentum Algorithm (momentum).
+Momentum Algorithm with a flag for Nesterov Momentum (momentum).
 
 velocity = mu * velocity + gradient
-param = param - learning_rate * velocity
+if (use_nesterov):
+  param = param - (gradient + mu * velocity) * learning_rate
+else:
+  param = param - learning_rate * velocity
 
 )DOC");
   }
diff --git a/paddle/operators/momentum_op.h b/paddle/operators/momentum_op.h
index f7a724f048782ceee8509ddafcb4834fd8dbba8a..e6d6d1da3df9f7e43a93fcc2e12658a01a491f81 100644
--- a/paddle/operators/momentum_op.h
+++ b/paddle/operators/momentum_op.h
@@ -34,6 +34,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
     velocity_out->mutable_data<T>(ctx.GetPlace());
 
     float mu = ctx.Attr<float>("mu");
+    bool use_nesterov = ctx.Attr<bool>("useNesterov");
 
     auto p_out = framework::EigenVector<T>::Flatten(*param_out);
     auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
@@ -46,8 +47,14 @@ class MomentumOpKernel : public framework::OpKernel<T> {
     auto place = ctx.GetEigenDevice<Place>();
 
     Eigen::DSizes<int, 1> grad_dsize(grad->numel());
+
     v_out.device(place) = v * mu + g;
-    p_out.device(place) = p - lr.broadcast(grad_dsize) * v_out;
+    if (use_nesterov) {
+      // Nesterov look-ahead step: subtract lr * (g + mu * v_out).
+      p_out.device(place) = p - (g + v_out * mu) * lr.broadcast(grad_dsize);
+    } else {
+      p_out.device(place) = p - lr.broadcast(grad_dsize) * v_out;
+    }
   }
 };
 
diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h
index 684b1ea0c0c8ddabc9809cc05ed985e0cc250955..3f3e77595b701d428a728fc4727dd3ff4abee45f 100644
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -36,12 +36,12 @@ class MulKernel : public framework::OpKernel<T> {
     Tensor* z = context.Output<Tensor>("Out");
     const Tensor x_matrix =
         x->dims().size() > 2
-            ? framework::ReshapeToMatrix<T>(
+            ? framework::ReshapeToMatrix(
                   *x, context.template Attr<int>("x_num_col_dims"))
             : *x;
     const Tensor y_matrix =
         y->dims().size() > 2
-            ? framework::ReshapeToMatrix<T>(
+            ? framework::ReshapeToMatrix(
                   *y, context.template Attr<int>("y_num_col_dims"))
             : *y;
 
@@ -59,30 +59,30 @@ class MulGradKernel : public framework::OpKernel<T> {
     int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
     const Tensor* x = ctx.Input<Tensor>("X");
     const Tensor* y = ctx.Input<Tensor>("Y");
-    const Tensor x_matrix =
-        x->dims().size() > 2 ? framework::ReshapeToMatrix<T>(*x, x_num_col_dims)
-                             : *x;
-    const Tensor y_matrix =
-        y->dims().size() > 2 ? framework::ReshapeToMatrix<T>(*y, y_num_col_dims)
-                             : *y;
+    const Tensor x_matrix = x->dims().size() > 2
+                                ? framework::ReshapeToMatrix(*x, x_num_col_dims)
+                                : *x;
+    const Tensor y_matrix = y->dims().size() > 2
+                                ? framework::ReshapeToMatrix(*y, y_num_col_dims)
+                                : *y;
     const Tensor* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
     Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     Tensor* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     if (dx) {
       dx->mutable_data<T>(ctx.GetPlace());
-      Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix<T>(
-                                                     *dx, x_num_col_dims)
-                                               : *dx;
+      Tensor dx_matrix = dx->dims().size() > 2
+                             ? framework::ReshapeToMatrix(*dx, x_num_col_dims)
+                             : *dx;
       // dx = dout * y'. dx: M x K, dout : M x N, y : K x N
       math::matmul<Place, T>(ctx.device_context(), *dout, false, y_matrix, true,
                              1, &dx_matrix, 0);
     }
     if (dy) {
       dy->mutable_data<T>(ctx.GetPlace());
-      Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix<T>(
-                                                     *dy, y_num_col_dims)
-                                               : *dy;
+      Tensor dy_matrix = dy->dims().size() > 2
+                             ? framework::ReshapeToMatrix(*dy, y_num_col_dims)
+                             : *dy;
       // dy = x' * dout. dy K x N, dout : M x N, x : M x K
       math::matmul<Place, T>(ctx.device_context(), x_matrix, true, *dout, false,
                              1, &dy_matrix, 0);
diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu
index 10cb0e005f483abe91b4ee862ea5b48305ec08c7..143a14fef5783f8ed085d4c4ce2afb3b190d0600 100644
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
@@ -33,8 +33,7 @@ class MultiplexGPUKernel : public framework::OpKernel<T> {
     auto cols = ins[0]->numel() / rows;
     // copy index to cpu
     Tensor index_t_cpu;
-    index_t_cpu.CopyFrom<int32_t>(*ids, platform::CPUPlace(),
-                                  ctx.device_context());
+    index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context());
     auto* index = index_t_cpu.data<int32_t>();
     auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
                       ctx.device_context())
@@ -71,8 +70,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
     auto cols = ins[0]->numel() / rows;
     // copy index to cpu
     Tensor index_t_cpu;
-    index_t_cpu.CopyFrom<int32_t>(*ids, platform::CPUPlace(),
-                                  ctx.device_context());
+    index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context());
     auto* index = index_t_cpu.data<int32_t>();
 
     auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index e3d08378c2f29fa5d84c24ae7cebfcb0e7a53b25..40303e3adf4db7e8336ed72667fe69afa56c3f69 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -42,7 +42,7 @@ void RecurrentAlgorithm::Run(const Scope& scope,
 
   for (size_t step_id = 0; step_id < seq_len; step_id++) {
     if (step_id > 0) {
-      rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1);
+      rnn::LinkMemories(step_scopes, arg_->states, step_id, -1);
     }
     (*stepnet_)->Run(*step_scopes[step_id], dev_ctx);
   }
@@ -59,7 +59,8 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope,
 
   // Now all variables in scope must be created outside of op.
   PADDLE_ENFORCE_NOT_NULL(stepnet_);
-  PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(), "stepnet_ op has no outputs");
+  PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(),
+                 "step_unit_ op has no outputs");
 
   if (seq_len > step_scopes->size()) {
     for (size_t i = step_scopes->size(); i < seq_len; ++i) {
@@ -86,7 +87,7 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope,
 }
 
 void RecurrentAlgorithm::InitMemories(Scope* step_scope) const {
-  for (auto& attr : arg_->memories) {
+  for (auto& attr : arg_->states) {
     auto* pre_mem = step_scope->Var(attr.pre_var)->GetMutable<LoDTensor>();
     PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
                    "memory [%s]'s boot variable [%s] not exists", attr.var,
@@ -95,17 +96,17 @@ void RecurrentAlgorithm::InitMemories(Scope* step_scope) const {
         step_scope->FindVar(attr.boot_var)->GetMutable<LoDTensor>();
     pre_mem->Resize(boot_mem->dims());
     PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2);
-    pre_mem->ShareDataWith<float>(*boot_mem);
+    pre_mem->ShareDataWith(*boot_mem);
   }
 }
 
 const rnn::ArgumentName RecurrentOp::kArgName{
-    "step_net", "step_scopes",  "inlinks",      "outlinks",
-    "memories", "pre_memories", "boot_memories"};
+    "step_net", "step_scopes", "inputs",        "outputs",
+    "states",   "ex_states",   "initial_states"};
 
 const rnn::ArgumentName RecurrentGradientOp::kArgName{
-    "step_net", "step_scopes@GRAD", "outlinks@GRAD",     "inlinks@GRAD",
-    "memories", "pre_memories",     "boot_memories@GRAD"};
+    "step_net", "step_scopes@GRAD", "outputs@GRAD",       "inputs@GRAD",
+    "states",   "ex_states",        "initial_states@GRAD"};
 
 RecurrentOp::RecurrentOp(const std::string& type,
                          const framework::VariableNameMap& inputs,
@@ -127,7 +128,7 @@ class RecurrentAlgorithmProtoAndCheckerMaker
     AddInput(name.inlinks,
              "the inputs that need to be segmented for each step.")
         .AsDuplicable();
-    AddInput(name.boot_memories, "variables to initialize memories.")
+    AddInput(name.initial_states, "variables to initialize states.")
         .AsDuplicable();
 
     AddOutput(name.outlinks, "the outputs that need to concated for all steps.")
@@ -135,9 +136,8 @@ class RecurrentAlgorithmProtoAndCheckerMaker
     AddOutput(name.step_scopes, "step scopes");
 
     // Attributes stored in AttributeMap
-    AddAttr<std::vector<std::string>>(name.pre_memories,
-                                      "names of pre-memories");
-    AddAttr<std::vector<std::string>>(name.memories, "names of memories");
+    AddAttr<std::vector<std::string>>(name.ex_states, "names of pre-states");
+    AddAttr<std::vector<std::string>>(name.states, "names of states");
 
     AddComment("This is a recurrent group operator.");
   }
@@ -152,7 +152,7 @@ void RecurrentGradientAlgorithm::Run(
   rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len);
   for (int step_id = seq_len - 1; step_id >= 0; --step_id) {
     if (static_cast<size_t>(step_id) != seq_len - 1) {
-      rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1);
+      rnn::LinkMemories(step_scopes, arg_->states, step_id, 1);
     }
     (*stepnet_)->Run(*step_scopes[step_id], dev_ctx);
   }
@@ -162,7 +162,7 @@ void RecurrentGradientAlgorithm::Run(
 
 void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
     Scope* step_scope) const {
-  for (auto& attr : arg_->memories) {
+  for (auto& attr : arg_->states) {
     PADDLE_ENFORCE(step_scope->FindVar(attr.var) != nullptr,
                    "memory variable [%s] does not exists", attr.var);
     PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
@@ -171,7 +171,7 @@ void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
     auto* boot_mem_grad =
         step_scope->Var(attr.boot_var)->GetMutable<LoDTensor>();
     boot_mem_grad->Resize(mem_grad->dims());
-    boot_mem_grad->ShareDataWith<float>(*mem_grad);
+    boot_mem_grad->ShareDataWith(*mem_grad);
   }
 }
 
diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc
index 5e878353ce978830ede03ca6284719615ed39718..46f66a1370a35593d1911fc9b3ce76beb38c0956 100644
--- a/paddle/operators/reduce_op.cc
+++ b/paddle/operators/reduce_op.cc
@@ -13,6 +13,7 @@
    limitations under the License. */
 
 #include "paddle/operators/reduce_op.h"
+#include "paddle/operators/net_op.h"
 
 namespace paddle {
 namespace operators {
@@ -159,6 +160,66 @@ class ReduceMinOpMaker : public ReduceOpMaker {
   }
 };
 
+class NormOp : public NetOp {
+ public:
+  NormOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : NetOp(type, inputs, outputs, attrs) {
+    PADDLE_ENFORCE_NE(Input("X"), framework::kEmptyVarName,
+                      "Input(X) of NormOp should not be null.");
+    PADDLE_ENFORCE_NE(Output("AbsOut"), framework::kEmptyVarName,
+                      "Output(AbsOut) of NormOp should not be null.");
+    PADDLE_ENFORCE_NE(Output("PowOut"), framework::kEmptyVarName,
+                      "Output(PowOut) of NormOp should not be null.");
+    PADDLE_ENFORCE_NE(Output("SumOut"), framework::kEmptyVarName,
+                      "Output(SumOut) of NormOp should not be null.");
+    PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName,
+                      "Output(Out) of NormOp should not be null.");
+    auto dim = Attr<int>("dim");
+    auto keep_dim = Attr<bool>("keep_dim");
+    auto p = Attr<float>("p");
+    PADDLE_ENFORCE_GT(p, 0, "Order of the norm should be positive.");
+    AppendOp(framework::OpRegistry::CreateOp("abs", {{"X", {Input("X")}}},
+                                             {{"Y", {Output("AbsOut")}}}, {}));
+    AppendOp(framework::OpRegistry::CreateOp("pow", {{"X", {Output("AbsOut")}}},
+                                             {{"Y", {Output("PowOut")}}},
+                                             {{"factor", p}}));
+    framework::AttributeMap sum_attr;
+    sum_attr["dim"] = dim;
+    sum_attr["keep_dim"] = keep_dim;
+    AppendOp(framework::OpRegistry::CreateOp(
+        "reduce_sum", {{"X", {Output("PowOut")}}},
+        {{"Out", {Output("SumOut")}}}, sum_attr));
+    AppendOp(framework::OpRegistry::CreateOp(
+        "pow", {{"X", {Output("SumOut")}}}, {{"Y", {Output("Out")}}},
+        {{"factor", static_cast<float>(1. / p)}}));
+    CompleteAddOp(false);
+  }
+};
+
+class NormOpMaker : public ReduceOpMaker {
+ public:
+  NormOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : ReduceOpMaker(proto, op_checker) {
+    AddOutput("AbsOut",
+              "(Tensor) The intermediate output of Norm operator, "
+              "saving the absolute value of the input tensor X.")
+        .AsIntermediate();
+    AddOutput("PowOut",
+              "(Tensor) The intermediate output of Norm operator, "
+              "saving the p-th power of the output tensor AbsOut.")
+        .AsIntermediate();
+    AddOutput("SumOut",
+              "(Tensor) the intermediate output of Norm operator, "
+              "saving the sum of PowOut reduced on the given dimension.")
+        .AsIntermediate();
+    AddAttr<float>("p", "(float, default 2) The order of Norm.").SetDefault(2);
+    SetComment("Norm", "vector p-norm");
+    AddComment(comment_);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -176,6 +237,8 @@ REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad,
 REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad,
             ops::ReduceGradOp);
 
+REGISTER_OP_WITHOUT_GRADIENT(norm, ops::NormOp, ops::NormOpMaker);
+
 #define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor)     \
   REGISTER_OP_CPU_KERNEL(                                                  \
       reduce_type,                                                         \
diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h
index 3ba4611458fda0aa2f234c29d27086cd6f5742cc..c89cdf8cab9f209667c5e09b521b8f6e30f202fd 100644
--- a/paddle/operators/reshape_op.h
+++ b/paddle/operators/reshape_op.h
@@ -33,7 +33,7 @@ class ReshapeKernel : public framework::OpKernel<T> {
     std::transform(shape.begin(), shape.end(), shape_int64.begin(),
                    [](int a) { return static_cast<int64_t>(a); });
     auto out_dims = framework::make_ddim(shape_int64);
-    out->CopyFrom<T>(*in, ctx.GetPlace(), ctx.device_context());
+    out->CopyFrom(*in, ctx.GetPlace(), ctx.device_context());
     out->Resize(out_dims);
   }
 };
@@ -47,7 +47,7 @@ class ReshapeGradKernel : public framework::OpKernel<T> {
     d_x->mutable_data<T>(ctx.GetPlace());
 
     auto in_dims = d_x->dims();
-    d_x->CopyFrom<T>(*d_out, ctx.GetPlace(), ctx.device_context());
+    d_x->CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context());
     d_x->Resize(in_dims);
   }
 };
diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc
index 30b8ddeb5bc4220e261a5c37ac195b0348fef936..ee61ea300c33722471189d06eb09f67a083d2a4d 100644
--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
@@ -36,14 +36,14 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
     LoDTensor* input = input_var->GetMutable<LoDTensor>();
     f::DDim dims = input->dims();
     PADDLE_ENFORCE_EQ(static_cast<size_t>(dims[0]), seq_len,
-                      "all the inlinks be the same length");
+                      "all the inputs be the same length");
     f::DDim step_dims = slice_ddim(dims, 1, dims.size());
     for (size_t j = 0; j < seq_len; j++) {
       Tensor* step_input =
           step_scopes[j]->Var(inlinks[i])->GetMutable<Tensor>();
       // The input of operators of each step is Tensor here.
       // Maybe need to modify Slice function.
-      *step_input = input->Slice<float>(j, j + 1);
+      *step_input = input->Slice(j, j + 1);
       step_input->Resize(step_dims);
     }
   }
@@ -71,14 +71,14 @@ void ConcatOutputs(const std::vector<Scope*>& step_scopes,
           step_scopes[j]->FindVar(outlinks[i])->GetMutable<LoDTensor>();
       // TODO(luotao02) data type and platform::DeviceContext() should set
       // correctly
-      (output->Slice<float>(j, j + 1))
-          .CopyFrom<float>(*step_output, platform::CPUPlace(), ctx);
+      (output->Slice(j, j + 1))
+          .CopyFrom(*step_output, platform::CPUPlace(), ctx);
     }
   }
 }
 
 void LinkMemories(const std::vector<Scope*>& scopes,
-                  const std::vector<rnn::MemoryAttr>& memories,
+                  const std::vector<rnn::StateAttr>& memories,
                   const size_t step_id, const int offset) {
   PADDLE_ENFORCE_LT(step_id, scopes.size(),
                     "step [%d] is out of range of step scopes' size [%d]",
@@ -95,7 +95,7 @@ void LinkMemories(const std::vector<Scope*>& scopes,
     auto* mem = scope->FindVar(attr.pre_var)->GetMutable<LoDTensor>();
     auto* linked_mem = linked_scope->FindVar(attr.var)->GetMutable<LoDTensor>();
     mem->Resize(linked_mem->dims());
-    mem->ShareDataWith<float>(*linked_mem);
+    mem->ShareDataWith(*linked_mem);
   }
 }
 
@@ -106,26 +106,26 @@ void InitArgument(const ArgumentName& name, Argument* arg,
   arg->inlinks = op.Inputs(name.inlinks);
   arg->outlinks = op.Outputs(name.outlinks);
 
-  auto& boot_memories =
-      is_grad ? op.Outputs(name.boot_memories) : op.Inputs(name.boot_memories);
+  auto& boot_memories = is_grad ? op.Outputs(name.initial_states)
+                                : op.Inputs(name.initial_states);
   // attributes
-  auto& memories = op.Attr<std::vector<std::string>>(name.memories);
-  auto& pre_memories = op.Attr<std::vector<std::string>>(name.pre_memories);
+  auto& memories = op.Attr<std::vector<std::string>>(name.states);
+  auto& pre_memories = op.Attr<std::vector<std::string>>(name.ex_states);
 
   PADDLE_ENFORCE(memories.size() == boot_memories.size(),
-                 "the size of memories, boot_memories don't match:%d,%d",
+                 "the size of states, initial_states don't match:%d,%d",
                  memories.size(), boot_memories.size());
   PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(),
-                 "the size of pre_memories, boot_memories don't match:%d,%d",
+                 "the size of ex_states, initial_states don't match:%d,%d",
                  pre_memories.size(), boot_memories.size());
-  PADDLE_ENFORCE(memories.size() > 0, "more than 1 memories should be set");
+  PADDLE_ENFORCE(memories.size() > 0, "at least 1 state should be set");
 
   for (size_t i = 0; i < memories.size(); ++i) {
-    rnn::MemoryAttr mem_attr;
+    rnn::StateAttr mem_attr;
     mem_attr.var = memories[i];
     mem_attr.pre_var = pre_memories[i];
     mem_attr.boot_var = boot_memories[i];
-    (arg->memories).push_back(mem_attr);
+    (arg->states).push_back(mem_attr);
   }
 }
 
diff --git a/paddle/operators/rnn/recurrent_op_utils.h b/paddle/operators/rnn/recurrent_op_utils.h
index fe173edb24ad015b9546546565027358f9b93476..fb0e158e07745d58c6211d33e385b324e492b95e 100644
--- a/paddle/operators/rnn/recurrent_op_utils.h
+++ b/paddle/operators/rnn/recurrent_op_utils.h
@@ -31,7 +31,7 @@ using Scope = framework::Scope;
  * boot memories in father scope. Other attributes are copied from Op's proto
  * attributes.
  */
-struct MemoryAttr {
+struct StateAttr {
   // name of current state variable
   std::string var;
   // name of previous step's state variable
@@ -46,7 +46,7 @@ struct Argument {
   std::string step_scopes;
   std::vector<std::string> inlinks;
   std::vector<std::string> outlinks;
-  std::vector<rnn::MemoryAttr> memories;
+  std::vector<rnn::StateAttr> states;
 };
 
 struct ArgumentName {
@@ -54,9 +54,9 @@ struct ArgumentName {
   std::string step_scopes;
   std::string inlinks;
   std::string outlinks;
-  std::string memories;       // the memory name
-  std::string pre_memories;   // the previous memory name
-  std::string boot_memories;  // the boot memory name
+  std::string states;          // the state name
+  std::string ex_states;       // the previous state name
+  std::string initial_states;  // the initial state name
 };
 
 /**
@@ -74,7 +74,7 @@ void ConcatOutputs(const std::vector<Scope*>& step_scopes,
                    const size_t seq_len, const platform::DeviceContext& ctx);
 
 void LinkMemories(const std::vector<Scope*>& step_scopes,
-                  const std::vector<MemoryAttr>& memories, const size_t step_id,
+                  const std::vector<StateAttr>& memories, const size_t step_id,
                   const int offset);
 
 void InitArgument(const ArgumentName& name, Argument* arg,
diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu
index 06f4d759447b6dcd28b50576dfc246fc466d9336..3b32ae2fb77a5d3d4c558742ec469c74d15eee07 100644
--- a/paddle/operators/scatter_op.cu
+++ b/paddle/operators/scatter_op.cu
@@ -30,7 +30,7 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
     auto *Updates = ctx.Input<Tensor>("Updates");
     auto *Out = ctx.Output<Tensor>("Out");
 
-    Out->ShareDataWith<T>(*Ref);
+    Out->ShareDataWith(*Ref);
 
     GPUScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
   }
@@ -48,7 +48,7 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
     auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
     // In place gradient: dRef = dO
-    dRef->ShareDataWith<T>(*dOut);
+    dRef->ShareDataWith(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
     // Gradient by Gather: dUpdates = dO[Index]
     GPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
diff --git a/paddle/operators/scatter_op.h b/paddle/operators/scatter_op.h
index 6101219006414e4865f676e3ca5d2a88949ad17a..1a4f6f99bfe36cd0de2d4f2af3f6054571d8f188 100644
--- a/paddle/operators/scatter_op.h
+++ b/paddle/operators/scatter_op.h
@@ -35,7 +35,7 @@ class ScatterOpKernel : public framework::OpKernel<T> {
     auto *Out = ctx.Output<Tensor>("Out");
 
     // In place output: Out = Ref, Out[Index] += Updates
-    Out->ShareDataWith<T>(*Ref);
+    Out->ShareDataWith(*Ref);
     // Apply ScatterUpdate: Out[index] += Updates[:]
     ScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
   }
@@ -53,7 +53,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
     auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
     // In place gradient: dRef = dO
-    dRef->ShareDataWith<T>(*dOut);
+    dRef->ShareDataWith(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
     // Gradient by Gather: dUpdates += dO[Index]
     CPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
diff --git a/paddle/operators/sequence_concat_op.h b/paddle/operators/sequence_concat_op.h
index a197a05bbb881806b24f9dcce5282a4d972e3adc..6adf96120c99f9b84a1ff947058e65ac3ddff1d4 100644
--- a/paddle/operators/sequence_concat_op.h
+++ b/paddle/operators/sequence_concat_op.h
@@ -87,16 +87,16 @@ class SequenceConcatOpKernel : public framework::OpKernel<T> {
 
     auto out_lod_level = out_lod[level];
     for (size_t i = 0; i < out_lod_level.size() - 1; ++i) {
-      Tensor out_t = out->Slice<T>(static_cast<int>(out_lod_level[i]),
-                                   static_cast<int>(out_lod_level[i + 1]));
+      Tensor out_t = out->Slice(static_cast<int>(out_lod_level[i]),
+                                static_cast<int>(out_lod_level[i + 1]));
       auto out_stride = framework::stride(out_t.dims());
       size_t offset = 0;
 
       for (size_t j = 0; j < n; ++j) {
         auto in_lod_level = ins[j]->lod()[level];
         auto in_stride = framework::stride(ins[j]->dims());
-        Tensor in_t = ins[j]->Slice<T>(static_cast<int>(in_lod_level[i]),
-                                       static_cast<int>(in_lod_level[i + 1]));
+        Tensor in_t = ins[j]->Slice(static_cast<int>(in_lod_level[i]),
+                                    static_cast<int>(in_lod_level[i + 1]));
         size_t axis_dim = in_t.dims()[axis];
         StridedMemcpy<T>(ctx.device_context(), in_t.data<T>(), in_stride,
                          in_t.dims(), out_stride, out_t.data<T>() + offset);
@@ -130,8 +130,8 @@ class SequenceConcatGradOpKernel : public framework::OpKernel<T> {
 
     for (size_t i = 0; i < out_lod_level.size() - 1; ++i) {
       Tensor out_grad_t =
-          out_grad->Slice<T>(static_cast<int>(out_lod_level[i]),
-                             static_cast<int>(out_lod_level[i + 1]));
+          out_grad->Slice(static_cast<int>(out_lod_level[i]),
+                          static_cast<int>(out_lod_level[i + 1]));
       auto out_grad_stride = framework::stride(out_grad_t.dims());
       size_t offset = 0;
 
@@ -139,8 +139,8 @@ class SequenceConcatGradOpKernel : public framework::OpKernel<T> {
         auto x_grad_lod_level = x_grads[j]->lod()[level];
         auto x_grad_stride = framework::stride(x_grads[j]->dims());
         Tensor x_grad_t =
-            x_grads[j]->Slice<T>(static_cast<int>(x_grad_lod_level[i]),
-                                 static_cast<int>(x_grad_lod_level[i + 1]));
+            x_grads[j]->Slice(static_cast<int>(x_grad_lod_level[i]),
+                              static_cast<int>(x_grad_lod_level[i + 1]));
         size_t axis_dim = x_grad_t.dims()[axis];
         StridedMemcpy<T>(ctx.device_context(), out_grad_t.data<T>() + offset,
                          out_grad_stride, out_grad_t.dims(), x_grad_stride,
diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h
index a5569d1aace215c848de43dd9c3dcb414b709083..0de6cafe9ca83f09636a69b5579d19afde1c73b5 100644
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
@@ -64,9 +64,9 @@ class SequencePoolKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(context.GetPlace());
     auto place = context.GetEigenDevice<Place>();
     for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
-      Tensor in_t = in->Slice<T>(static_cast<int>(lod_level_0[i]),
-                                 static_cast<int>(lod_level_0[i + 1]));
-      Tensor out_t = out->Slice<T>(i, i + 1);
+      Tensor in_t = in->Slice(static_cast<int>(lod_level_0[i]),
+                              static_cast<int>(lod_level_0[i + 1]));
+      Tensor out_t = out->Slice(i, i + 1);
       int64_t h = static_cast<int64_t>(lod_level_0[i + 1] - lod_level_0[i]);
       auto in_e = EigenMatrix<T>::From(in_t, framework::make_ddim({h, w}));
       auto out_e = EigenVector<T>::Flatten(out_t);
@@ -116,9 +116,9 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
     }
     auto place = context.GetEigenDevice<Place>();
     for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
-      auto in_g_t = in_g->Slice<T>(static_cast<int>(lod[i]),
-                                   static_cast<int>(lod[i + 1]));
-      auto out_g_t = out_g->Slice<T>(i, i + 1);
+      auto in_g_t =
+          in_g->Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
+      auto out_g_t = out_g->Slice(i, i + 1);
       int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
       auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
       auto out_g_e = EigenMatrix<T>::From(out_g_t, {1, w});
diff --git a/paddle/operators/sequence_softmax_op.h b/paddle/operators/sequence_softmax_op.h
index 96d87c404d217280d74bd088e7a23f539ef6e7ce..3eb1e2844dff6ac94e86dcf4586bb51bc33adbec 100644
--- a/paddle/operators/sequence_softmax_op.h
+++ b/paddle/operators/sequence_softmax_op.h
@@ -46,8 +46,8 @@ class SequenceSoftmaxKernel : public framework::OpKernel<T> {
     for (int i = 0; i < static_cast<int>(lod[level].size()) - 1; ++i) {
       int start_pos = static_cast<int>(lod[level][i]);
       int end_pos = static_cast<int>(lod[level][i + 1]);
-      Tensor x_i = x->Slice<T>(start_pos, end_pos);
-      Tensor out_i = out->Slice<T>(start_pos, end_pos);
+      Tensor x_i = x->Slice(start_pos, end_pos);
+      Tensor out_i = out->Slice(start_pos, end_pos);
 
       // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
       framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos});
@@ -75,9 +75,9 @@ class SequenceSoftmaxGradKernel : public framework::OpKernel<T> {
       int start_pos = static_cast<int>(lod[level][i]);
       int end_pos = static_cast<int>(lod[level][i + 1]);
 
-      Tensor out_i = out->Slice<T>(start_pos, end_pos);
-      Tensor out_grad_i = out_grad->Slice<T>(start_pos, end_pos);
-      Tensor x_grad_i = x_grad->Slice<T>(start_pos, end_pos);
+      Tensor out_i = out->Slice(start_pos, end_pos);
+      Tensor out_grad_i = out_grad->Slice(start_pos, end_pos);
+      Tensor x_grad_i = x_grad->Slice(start_pos, end_pos);
 
       // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
       framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos});
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu
index d03a1a76585bc79633d089b776ca07ba908085ba..68ac2b0ea36dda55ac1161eecb80f03178b4f303 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/operators/softmax_with_cross_entropy_op.cu
@@ -85,7 +85,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
         context.Input<Tensor>(framework::GradVarName("Loss"))->data<T>();
     Tensor* logit_grad =
         context.Output<Tensor>(framework::GradVarName("Logits"));
-    logit_grad->ShareDataWith<T>(*context.Input<Tensor>("Softmax"));
+    logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));
     T* logit_grad_data = logit_grad->data<T>();
 
     const int batch_size = logit_grad->dims()[0];
diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h
index 66d7bc1569e124096f30b6cd91fe22189506e4a5..01027cf63fc1010a226346609d583af0b400ecbb 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/operators/softmax_with_cross_entropy_op.h
@@ -57,7 +57,7 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
     const Tensor* labels = context.Input<Tensor>("Label");
     Tensor* logit_grad =
         context.Output<Tensor>(framework::GradVarName("Logits"));
-    logit_grad->ShareDataWith<T>(*context.Input<Tensor>("Softmax"));
+    logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));
 
     const int class_num = logit_grad->dims()[1];
     if (context.Attr<bool>("soft_label")) {
diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc
index 612bdd70db28f2c1fbeb66456fae4ca865530f1f..39b53948e3cc58ff1d0ab481143b066b1a2fae16 100644
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -53,10 +53,10 @@ class UniformRandomOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(
         ctx->Attrs().Get<float>("min") < ctx->Attrs().Get<float>("max"),
         "uniform_random's min must less then max");
-    auto& dims = ctx->Attrs().Get<std::vector<int>>("dims");
+    auto& shape = ctx->Attrs().Get<std::vector<int>>("shape");
     std::vector<int64_t> temp;
-    temp.reserve(dims.size());
-    for (auto dim : dims) {
+    temp.reserve(shape.size());
+    for (auto dim : shape) {
       temp.push_back(static_cast<int64_t>(dim));
     }
     ctx->SetOutputDim("Out", framework::make_ddim(temp));
@@ -65,7 +65,7 @@ class UniformRandomOp : public framework::OperatorWithKernel {
  protected:
   framework::DataType IndicateDataType(
       const framework::ExecutionContext& ctx) const override {
-    return static_cast<framework::DataType>(Attr<int>("data_type"));
+    return static_cast<framework::DataType>(ctx.Attr<int>("data_type"));
   }
 };
 
@@ -78,7 +78,7 @@ class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(Uniform random operator.
 Used to initialize tensor with uniform random generator.
 )DOC");
-    AddAttr<std::vector<int>>("dims", "the dimension of random tensor");
+    AddAttr<std::vector<int>>("shape", "the dimension of random tensor");
     AddAttr<float>("min", "Minimum value of uniform random").SetDefault(-1.0f);
     AddAttr<float>("max", "Maximun value of uniform random").SetDefault(1.0f);
     AddAttr<int>("seed",
diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h
index 895e8d6a63d1fad0ee7a6f5647402435d418b2f1..f157188a4f736319ea187052b90a17f8be9e9edb 100644
--- a/paddle/parameter/FirstOrderOptimizer.h
+++ b/paddle/parameter/FirstOrderOptimizer.h
@@ -265,6 +265,10 @@ public:
     addParameterType(PARAMETER_SECOND_MOMENTUM);
   }
 
+  virtual void startBatch(int64_t numSamplesProcessed) {
+    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
+  }
+
   virtual void finishBatch() { ++step_; }
 
   virtual void update(const VectorPtr vecs[],
diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc
index 58739d888ac537222fdfeff0c52bd90ab58362c1..405ac544e10f19a33399a649f76699fefc3d49b9 100644
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -222,7 +222,9 @@ void BindVarDsec(py::module &m) {
 
   py::enum_<VarDesc::VarType>(var_desc, "VarType", "")
       .value("LOD_TENSOR", VarDesc::LOD_TENSOR)
-      .value("SELECTED_ROWS", VarDesc::SELECTED_ROWS);
+      .value("SELECTED_ROWS", VarDesc::SELECTED_ROWS)
+      .value("FEED_MINIBATCH", VarDesc::FEED_MINIBATCH)
+      .value("FETCH_LIST", VarDesc::FETCH_LIST);
 }
 
 void BindOpDesc(py::module &m) {
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 16661b93e56da30ecd3848d28a0f4667b710e80c..26b793a4bbf5df7a2635838a6c6a8264ca8ebb67 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -84,10 +84,12 @@ PYBIND11_PLUGIN(core) {
       .def("set", PyCPUTensorSetFromArray<float>)
       .def("set", PyCPUTensorSetFromArray<int>)
       .def("set", PyCPUTensorSetFromArray<double>)
+      .def("set", PyCPUTensorSetFromArray<int64_t>)
 #ifdef PADDLE_WITH_CUDA
       .def("set", PyCUDATensorSetFromArray<float>)
       .def("set", PyCUDATensorSetFromArray<int>)
       .def("set", PyCUDATensorSetFromArray<double>)
+      .def("set", PyCUDATensorSetFromArray<int64_t>)
 #endif
       .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
       .def("set_float_element", TensorSetElement<float>)
@@ -111,6 +113,7 @@ PYBIND11_PLUGIN(core) {
              new (&instance) LoDTensor(new_lod);
 #endif
           })
+      .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); })
       .def("set_lod",
            [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
 #ifndef PADDLE_WITH_CUDA
@@ -264,6 +267,17 @@ All parameter, weight, gradient are variables in Paddle.
       .def(py::init<>())
       .def("__str__", string::to_string<const platform::CPUPlace &>);
 
+  py::class_<platform::Place>(m, "Place")
+      .def(py::init<>())
+      .def("set_place",
+           [](platform::Place &self, const platform::CPUPlace &cpu_place) {
+             self = cpu_place;
+           })
+      .def("set_place",
+           [](platform::Place &self, const platform::GPUPlace &gpu_place) {
+             self = gpu_place;
+           });
+
   py::class_<OperatorBase>(m, "Operator")
       .def_static("create",
                   [](py::bytes protobin) {
@@ -399,18 +413,18 @@ All parameter, weight, gradient are variables in Paddle.
                     return static_cast<operators::DynamicRecurrentOp *>(
                         rnn_op.release());
                   })
-      .def("set_stepnet",
+      .def("set_step_unit",
            [](operators::DynamicRecurrentOp &self, const operators::NetOp &net)
-               -> void { self.SetStepNet(net.Clone()); })
+               -> void { self.rnn.SetStepUnit(net.Clone()); })
       .def("get_state",
            [](operators::DynamicRecurrentOp &self, const std::string &name)
-               -> const TensorArray & { return self.state(name); })
+               -> const TensorArray & { return self.rnn.state(name); })
       .def("get_step_input",
            [](operators::DynamicRecurrentOp &self, const std::string &name)
-               -> const TensorArray & { return self.step_input(name); })
+               -> const TensorArray & { return self.rnn.step_input(name); })
       .def("get_step_output",
            [](operators::DynamicRecurrentOp &self, const std::string &name)
-               -> const TensorArray & { return self.step_output(name); });
+               -> const TensorArray & { return self.rnn.step_output(name); });
 
   // cond_op
   py::class_<operators::CondOp, OperatorBase>(m, "CondOp")
@@ -436,18 +450,15 @@ All parameter, weight, gradient are variables in Paddle.
 
   py::class_<framework::Executor>(m, "Executor")
       .def(py::init<std::vector<platform::Place> &>())
-      .def("run",
-           [](Executor &self, const ProgramDesc &program_desc, int block_id) {
-             framework::Scope &global_scope = GetGlobalScope();
-             self.Run(program_desc, &global_scope, block_id);
-           });
+      .def("run", [](Executor &self, ProgramDescBind *program_bind,
+                     Scope *scope, int block_id) {
+        self.Run(*program_bind->Proto(), scope, block_id);
+      });
 
   m.def("unique_integer", UniqueIntegerGenerator);
 
   m.def("is_compile_gpu", IsCompileGPU);
-  m.def("set_feed_variable_float", framework::SetFeedVariable<float>);
-  m.def("set_feed_variable_double", framework::SetFeedVariable<double>);
-  m.def("set_feed_variable_int", framework::SetFeedVariable<int>);
+  m.def("set_feed_variable", framework::SetFeedVariable);
   m.def("get_fetch_variable", framework::GetFetchVariable);
 
   BindProgramDesc(m);
@@ -455,6 +466,8 @@ All parameter, weight, gradient are variables in Paddle.
   BindVarDsec(m);
   BindOpDesc(m);
 
+  m.def("op_support_gpu", OpSupportGPU);
+
   return m.ptr();
 }
 }  // namespace pybind
diff --git a/paddle/scripts/cluster_train_v2/fabric/conf.py b/paddle/scripts/cluster_train_v2/fabric/conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..e96503d093a4317df7bb006043eb42098f51b6f5
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/fabric/conf.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+HOSTS = [
+    "root@10.1.9.7",
+    "root@10.1.18.7",
+    "root@10.1.32.9",
+]
+'''
+workspace configuration
+'''
+#root dir for workspace, can be set as any directory with real user account
+ROOT_DIR = "/root"
+'''
+network configuration
+'''
+#pserver nics
+PADDLE_NIC = "eth0"
+#pserver port
+PADDLE_PORT = 7164
+#pserver ports num
+PADDLE_PORTS_NUM = 1
+#pserver sparse ports num
+PADDLE_PORTS_NUM_FOR_SPARSE = 1
+#trainer whether use gpu
+PADDLE_USE_GPU = "False"
+#environments setting for all processes in cluster job
+LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
diff --git a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..6606c01265af1fa8009e67906a3dbbe5c95ebc0d
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
@@ -0,0 +1,11 @@
+FROM docker.paddlepaddlehub.com/paddle:0.10.0rc2
+RUN apt-get update && apt-get install -y openssh-server
+RUN mkdir /var/run/sshd
+
+RUN echo 'root:root' |chpasswd
+
+RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
+RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
+
+EXPOSE 22
+CMD ["/usr/sbin/sshd", "-D"]
diff --git a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0784b2d1b8785796f94fff1607643218564fc126
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
@@ -0,0 +1,23 @@
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  name: ssh-servers
+spec:
+  replicas: 3
+  template:
+    metadata:
+      labels:
+        app: ssh-servers
+    spec:
+      containers:
+      - name: ssh-servers
+        image: docker.paddlepaddlehub.com/paddlessh
+        resources:
+          limits:
+            cpu: 500m
+            memory: 1Gi
+          requests:
+            cpu: 500m
+            memory: 1Gi
+        ports:
+        - containerPort: 22
diff --git a/paddle/scripts/cluster_train_v2/fabric/run.sh b/paddle/scripts/cluster_train_v2/fabric/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f6324bcb136803ebc30e69bcdaa2f8725cb0ccba
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/fabric/run.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+python paddle.py \
+  --job_dispatch_package="/root/wuyi/fabric_submit/workspace" \
+  --dot_period=10 \
+  --ports_num_for_sparse=1 \
+  --log_period=50 \
+  --num_passes=5 \
+  --trainer_count=2 \
+  --saving_period=1 \
+  --local=0 \
+  --config=./trainer_config.py \
+  --save_dir=./output \
+  --use_gpu=0
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..1a2d19e823541750830fcaa25f65b2f8e1ea2b49
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
@@ -0,0 +1,43 @@
+# Build this image:  docker build -t mpi .
+#
+
+FROM paddledev/paddle:0.10.0rc3
+
+ENV DEBIAN_FRONTEND noninteractive
+
+RUN apt-get update -y && \
+    apt-get upgrade -y && \
+    apt-get install -y openssh-server zip unzip vim sudo \
+gcc gfortran openmpi-checkpoint binutils wget curl git openmpi-bin openmpi-common libopenmpi-dev && \
+pip install mpi4py numpy virtualenv scipy matplotlib lxml sqlalchemy suds ipython obspy && \
+mkdir /var/run/sshd && \
+echo 'root:tutorial' | chpasswd && \
+sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
+# SSH login fix. Otherwise user is kicked off after login
+sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
+echo "export VISIBLE=now" >> /etc/profile && \
+adduser --disabled-password --gecos "" tutorial && \
+echo "tutorial ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers && \
+mkdir /home/tutorial/.ssh/
+
+ENV HOME /home/tutorial
+ENV NOTVISIBLE "in users profile"
+
+# ------------------------------------------------------------
+# Set-Up SSH with our Github deploy key
+# ------------------------------------------------------------
+
+ADD ssh/config /home/tutorial/.ssh/config
+ADD ssh/id_rsa.mpi /home/tutorial/.ssh/id_rsa
+ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/id_rsa.pub
+ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/authorized_keys
+
+#---------------------------------------------------------------
+# LD_LIBRARY_PATH: use ENV, a `RUN export` is lost when that build step ends
+#---------------------------------------------------------------
+
+ENV LD_LIBRARY_PATH /usr/lib/openmpi/lib/
+
+WORKDIR /home/tutorial
+EXPOSE 22
+CMD ["/usr/sbin/sshd", "-D"]
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..34835e5eb8d7cb92ad3cf7758a47c9e565a7dcf6
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
@@ -0,0 +1,25 @@
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  name: mpi-header
+  labels:
+    app: mpi-header
+spec:
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        app: mpi-header
+    spec:
+      containers:
+      - image: typhoon1986/paddle-openmpi
+        name : mpi-header
+        resources:
+          limits:
+            cpu: 500m
+            memory: 2Gi
+          requests:
+            cpu: 500m
+            memory: 2Gi
+        ports:
+        - containerPort: 22
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2fd5cb4d44a25efac68dd8c9195dea9fd8f84a26
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
@@ -0,0 +1,26 @@
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  name: mpi-nodes
+  labels:
+    app: mpi-nodes
+spec:
+  replicas: 3
+  template:
+    metadata:
+      labels:
+        app: mpi-nodes
+    spec:
+      containers:
+      - image: typhoon1986/paddle-openmpi
+        name : mpi-nodes
+        resources:
+          limits:
+            cpu: 500m
+            memory: 2Gi
+          requests:
+            cpu: 500m
+            memory: 2Gi
+        ports:
+        - containerPort: 22
+        imagePullPolicy: Always
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
new file mode 100644
index 0000000000000000000000000000000000000000..a9ecad07c39e4a9d6f0572d6cbf77795d99681f2
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
@@ -0,0 +1 @@
+StrictHostKeyChecking no
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
new file mode 100644
index 0000000000000000000000000000000000000000..23768343edf5258cf525523d471f67071a24f5de
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
@@ -0,0 +1,27 @@
+-----BEGIN RSA PRIVATE KEY-----
+MIIEogIBAAKCAQEA7PWLZmgdJ508dD15T6+xqGDvL9Ehzo9SgsnN6xJ+qpUvvOi4
+1axW0AqR4MnPTg/uuvk+x4tUpuufOW4w22UTGjsdvmIVWa9ujLtcRiN3YPY+SU+Y
+O5FfqKg7r/hBn+/GMcSoffwSs7vVgmhBBnp/mJh2O1cOAFZEe98/47mbg3/kHBAk
+36NOQktaU3l48B38EhBTnjWfcEGm1HcTRPFxXV5Wiko6ZhKFEuHcTVKng4ROtUqE
+mgHyI0aB7TAxg4na0ejItsYWEPWGeDOw6ms/4MwylxNosWzHFPW9p4zgLCLNr+b6
+bDDfYKjXZflAuTQtQhLmJUwD9uuYLAijpSE2fQIDAQABAoIBADgcgRET8Gt0CV/B
+OtvKz/f+VEVvcWD3gWNlJDTZIVOFllNWjIZUlA4ZoqenQkbK8Q4nfV1FOht4yjCQ
+TlN1oMtiWk297i5Zo4UBzPzy4w774I39oh/g8dT/WXr2/5s+7SDV38xNh6Q2A34o
+79T35wUcfUrZ93/O7dKjb/6d8hx2FMha0wVKqY4lmG1lQE3bbx3kakec0PdvU5kO
+YHKlpqj3pMR7CpMa+4yL/iXFwWYmnK+uu+zw7JR7PwvH1CzrnvW438wjQ1QmYbSx
+mHHOE89X67Lsl5hn81qYWBhpwAlBwi1qscsE0cV9GcFyKqWFqZsj5coM9u3CRfvy
+lrWe1OUCgYEA+LBUFEd3Hxs4sFiYElJ8R9SAs1udaqPvAl01hTEijJLfYlMMVs/y
+rgNN7j22zjDak2f8QdyMJZX7EZdRmdYcHO0csYOwbYvalzcnwk+U3mxmdD3r4xSo
+DSvkJ70fogAqUlcVIg2re6fCmZVJQTvMQYTVEM8zQomJRt/Lb2esSfsCgYEA8+zv
+44aToe8uqiDs4w8guRW7LCDkTw4z4IVo9JUibIaPjaAs5bZEBXSB43EEywXCR75H
+fML0rU1PVvKh1rqcvZdVzm+XMWVr3asPk0sapaiHaTcmyZvJRDxxqbLFp0zRP1T6
+cCtXNFdHWU4KiuKrUi6cDyOKchpfkSZa4seiT+cCgYB+n4FgBfdQPlMB70oW4irn
+g/q32CjxuGCk6oKqu5bkzo+xB6obtavSEFqouIGQwO056tNVUY+GP7Rjg5GH663K
+yKw4cl3tmS0Gm43B8TVSfw03mKO3rrfWZQe5eCFYIg9qd26KNT2gK435FzsCXQkm
+PxUhhu6JrW/ZR2/U3Iur6wKBgADrWLAb1ryagSuE+j+U1AO+kDkHWrTtkcZ72jxp
+v3p3O11GSEUJXdJDcSXhTCpTuDq6/dv7hB6PFwh126RKicKxKlKf2wsFndV1Cpb8
+hnovW2tLGOtTmfuW2rrQAKyzvmolsNfxYd/BoHQ2thV16z1hDZeFA8WQUeHjKh6G
+sBbrAoGATdtQlaUxx4izua6k02ihkxx/cRYwDl2N8UDvDBHokS7vJFMX8b8NpsGg
+zMElnqSpu/pe/0UG7N2MtPF6uyMcX8AZzzcsRkiMkDvWJzYt8Jpf+Eyd/uryF+Yv
+yrXaOEY83tm6x/fny5ZaZmk8lNth7bfWywuTMkZLX3fYpWtIeE4=
+-----END RSA PRIVATE KEY-----
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
new file mode 100644
index 0000000000000000000000000000000000000000..015f2b42e71920e00de090cbb1108d9a12ed5f0c
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
@@ -0,0 +1 @@
+ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDs9YtmaB0nnTx0PXlPr7GoYO8v0SHOj1KCyc3rEn6qlS+86LjVrFbQCpHgyc9OD+66+T7Hi1Sm6585bjDbZRMaOx2+YhVZr26Mu1xGI3dg9j5JT5g7kV+oqDuv+EGf78YxxKh9/BKzu9WCaEEGen+YmHY7Vw4AVkR73z/juZuDf+QcECTfo05CS1pTeXjwHfwSEFOeNZ9wQabUdxNE8XFdXlaKSjpmEoUS4dxNUqeDhE61SoSaAfIjRoHtMDGDidrR6Mi2xhYQ9YZ4M7Dqaz/gzDKXE2ixbMcU9b2njOAsIs2v5vpsMN9gqNdl+UC5NC1CEuYlTAP265gsCKOlITZ9 oweidner@peahi
diff --git a/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh b/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c645495448f9844de5ae9024b6a0f41452522765
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# General training configurations (exported so child processes can read them)
+
+export NICS=eth0
+export PADDLE_INIT_PORT=7164
+export PADDLE_INIT_PORTS_NUM=1
+export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
+export PADDLE_INIT_PSERVERS=$(cat machines | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n/,/g')
+export PADDLE_INIT_USE_GPU=False
+
+export PADDLE_INIT_NUM_GRADIENT_SERVERS=${OMPI_COMM_WORLD_SIZE}
+export PADDLE_INIT_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
+export PADDLE_CLUSTER_TRAIN=True
+
+env
+
+# start pserver
+stdbuf -oL nohup paddle pserver --port=$PADDLE_INIT_PORT --ports_num=$PADDLE_INIT_PORTS_NUM \
+  --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE --nics=$NICS \
+  --comment=paddle_cluster_pserver \
+  --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS &> logs/pserver.log &
+
+# start trainer
+# NOTE: train.py will use the above environment variables as configuration
+python train.py &> logs/train.log
+
+# kill background pservers when train finishes
+ps -ef | grep pserver | awk '{print $2}' | xargs kill
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 2ac455d771bf78377ce4ee7d921393d3b3958e3c..a08716c5a559def54bb7b989f250b489f6a805a2 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -141,10 +141,17 @@ RUN sed -i '${APT_MIRROR}' /etc/apt/sources.list
 EOF
 fi
 
+if [[ ${WITH_GPU} == "ON"  ]]; then
+  NCCL_DEPS="apt-get install -y libnccl-dev &&"
+else
+  NCCL_DEPS="" 
+fi
+
 cat >> /paddle/build/Dockerfile <<EOF
 ADD python/dist/*.whl /
 # run paddle version to install python packages first
 RUN apt-get update &&\
+    ${NCCL_DEPS}\
     apt-get install -y wget python-pip && pip install -U pip && \
     pip install /*.whl; apt-get install -f -y && \
     apt-get clean -y && \
diff --git a/paddle/trainer/tests/sample_trainer_config_branch_net.conf b/paddle/trainer/tests/sample_trainer_config_branch_net.conf
index c2594bc13c250a877a7b8a77e11405671c4d8907..a073708a184d6392a4eead69272e684013f1c751 100644
--- a/paddle/trainer/tests/sample_trainer_config_branch_net.conf
+++ b/paddle/trainer/tests/sample_trainer_config_branch_net.conf
@@ -17,7 +17,7 @@ from paddle.trainer_config_helpers import *
 ################################### Data Configuration ###################################
 TrainData(ProtoData(files = "trainer/tests/mnist.list"))
 ################################### Algorithm Configuration ###################################
-settings(batch_size = 256,
+settings(batch_size = 128,
          learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
 ################################### Network Configuration ###################################
 data = data_layer(name ="input", size=784)
@@ -44,10 +44,11 @@ a2 = img_conv_layer(input=tmp,
             shared_biases=True,
             act=ReluActivation())
 
-tmp = concat_layer(input=[a1, a2])
+tmp = addto_layer(input=[a1, a2],
+            act=ReluActivation(),
+            bias_attr=False)
 
 tmp = img_pool_layer(input=tmp,
-            num_channels=64,
             pool_size=3,
             stride=2,
             padding=1,
@@ -55,35 +56,34 @@ tmp = img_pool_layer(input=tmp,
 
 b1 = img_conv_layer(input=tmp,
             filter_size=3,
-            num_filters=64,
+            num_filters=32,
             padding=1,
             shared_biases=True,
             act=ReluActivation())
 
 b1 = img_pool_layer(input=b1,
             pool_size=3,
-            stride=1,
-            padding=1,
+            stride=2,
+            padding=0,
             pool_type=MaxPooling())
 
 b2 = img_conv_layer(input=tmp,
-            filter_size=5,
+            filter_size=3,
             num_filters=64,
-            padding=2,
+            padding=1,
             shared_biases=True,
             act=ReluActivation())
 
 b2 = img_pool_layer(input=b2,
             pool_size=5,
-            stride=1,
-            padding=2,
+            stride=2,
+            padding=1,
             pool_type=MaxPooling())
 
-tmp = addto_layer(input=[b1, b2],
-            act=ReluActivation(),
-            bias_attr=False)
+tmp = concat_layer(input=[b1, b2])
 
 tmp = img_pool_layer(input=tmp,
+            num_channels=96,
             pool_size=3,
             stride=2,
             padding=1,
diff --git a/paddle/trainer/tests/sample_trainer_config_simple_net.conf b/paddle/trainer/tests/sample_trainer_config_simple_net.conf
index 77f78161535c49da4ef7fc1563cff58c021aecef..2ba71884d0953dc721808732fde12e695c6a757d 100644
--- a/paddle/trainer/tests/sample_trainer_config_simple_net.conf
+++ b/paddle/trainer/tests/sample_trainer_config_simple_net.conf
@@ -17,7 +17,7 @@ from paddle.trainer_config_helpers import *
 ################################### Data Configuration ###################################
 TrainData(ProtoData(files = "trainer/tests/mnist.list"))
 ################################### Algorithm Configuration ###################################
-settings(batch_size = 1000,
+settings(batch_size = 128,
          learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
 ################################### Network Configuration ###################################
 data = data_layer(name ="input", size=784)
diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py
index 248da4ae8d1fb24652625ae8fc9ef314a028b912..05635833bf1645f78f5ba15caee3e9b8da9f5544 100644
--- a/python/paddle/trainer/PyDataProvider2.py
+++ b/python/paddle/trainer/PyDataProvider2.py
@@ -175,7 +175,7 @@ def index_slot(value_range, seq_type=SequenceType.NO_SEQUENCE):
 
 dense_vector = dense_slot
 sparse_binary_vector = sparse_non_value_slot
-sparse_vector = sparse_value_slot
+sparse_float_vector = sparse_value_slot
 integer_value = index_slot
 
 # dense_array can be used for variable-length input feature.
@@ -216,7 +216,7 @@ def sparse_binary_vector_sub_sequence(dim):
     return sparse_binary_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
 
 
-def sparse_vector_sequence(dim):
+def sparse_float_vector_sequence(dim):
     """
     Data type of a sequence of sparse vector, which most elements are zero,
     others could be any float value.
@@ -226,11 +226,11 @@ def sparse_vector_sequence(dim):
     :return: An input type object
     :rtype: InputType
     """
-    return sparse_vector(dim, seq_type=SequenceType.SEQUENCE)
+    return sparse_float_vector(dim, seq_type=SequenceType.SEQUENCE)
 
 
-def sparse_vector_sub_sequence(dim):
-    return sparse_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
+def sparse_float_vector_sub_sequence(dim):
+    return sparse_float_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
 
 
 def integer_value_sequence(value_range):
diff --git a/python/paddle/v2/framework/executor.py b/python/paddle/v2/framework/executor.py
new file mode 100644
index 0000000000000000000000000000000000000000..82b83d4bb6ac9d4c6a67d925db290c7c5e2d933f
--- /dev/null
+++ b/python/paddle/v2/framework/executor.py
@@ -0,0 +1,85 @@
+import paddle.v2.framework.core as core
+from paddle.v2.framework.framework import Block, Program
+
+# Default scope used by Executor.run() when the caller passes scope=None.
+g_scope = core.Scope()
+
+
+class Executor(object):
+    """Python-side wrapper that adds feed/fetch ops and runs core.Executor."""
+
+    def __init__(self, places):
+        # A single place is accepted as shorthand for a one-element list.
+        if not isinstance(places, (list, tuple)):
+            places = [places]
+
+        # core.Executor is given core.Place objects, so wrap each input place.
+        act_places = []
+        for each in places:
+            p = core.Place()
+            p.set_place(each)
+            act_places.append(p)
+
+        self.executor = core.Executor(act_places)
+
+    def run(self,
+            program,
+            feed,
+            fetch_list,
+            feed_var_name='feed',
+            fetch_var_name='fetch',
+            scope=None):
+        """Run block 0 of `program.clone()` and return the fetched values.
+
+        :param program: the Program to execute; feed/fetch ops are added to
+            the object returned by `program.clone()`.
+        :param feed: dict mapping variable name -> value to feed.
+        :param fetch_list: variables whose values should be returned.
+        :param feed_var_name: name of the variable holding the fed values.
+        :param fetch_var_name: name of the variable collecting the results.
+        :param scope: scope to run in; defaults to the module-level g_scope.
+        :return: list with one fetched value per entry of `fetch_list`.
+        :raises TypeError: if `program` is not a Program.
+        """
+        if not isinstance(program, Program):
+            raise TypeError("program must be a Program, got %s" %
+                            type(program).__name__)
+
+        if scope is None:
+            scope = g_scope
+
+        # The feed/fetch ops below are added to the clone, not the argument.
+        program = program.clone()
+        global_block = program.global_block()
+        feed_var = global_block.create_var(
+            name=feed_var_name,
+            type=core.VarDesc.VarType.FEED_MINIBATCH,
+            persistable=True)
+
+        # One 'feed' op per input: column i of feed_var -> variable `name`.
+        for i, name in enumerate(feed):
+            out = global_block.var(name)
+            global_block.prepend_op(
+                'feed',
+                inputs={'X': [feed_var]},
+                outputs={'Out': [out]},
+                attrs={'col': i})
+            core.set_feed_variable(scope, feed[name], feed_var.name, i)
+
+        fetch_var = global_block.create_var(
+            name=fetch_var_name,
+            type=core.VarDesc.VarType.FETCH_LIST,
+            persistable=True)
+        # One 'fetch' op per requested variable, written to column i.
+        for i, var in enumerate(fetch_list):
+            global_block.append_op(
+                type='fetch',
+                inputs={'X': [var]},
+                outputs={'Out': [fetch_var]},
+                attrs={'col': i})
+
+        self.executor.run(program.desc, scope, 0)
+        return [
+            core.get_fetch_variable(scope, fetch_var_name, i)
+            for i in xrange(len(fetch_list))
+        ]
diff --git a/python/paddle/v2/framework/framework.py b/python/paddle/v2/framework/framework.py
index a24c78171e07249beeb8131d2d338b01c30e5c1f..03a3dacf25c2ad5514e914d2f6e9637493ba80f4 100644
--- a/python/paddle/v2/framework/framework.py
+++ b/python/paddle/v2/framework/framework.py
@@ -15,7 +15,7 @@ class Variable(object):
                  shape=None,
                  dtype=None,
                  lod_level=None,
-                 persistable=False,
+                 persistable=None,
                  **kwargs):
         self.block = block
 
@@ -256,7 +256,8 @@ class Operator(object):
                     self.desc.set_block_attr(attr_name, attrs[attr_name].desc)
 
         self.desc.check_attrs()
-        self.desc.infer_shape(self.block.desc)
+        if type not in {'feed', 'fetch'}:
+            self.desc.infer_shape(self.block.desc)
 
     def __str__(self):
         protostr = self.desc.serialize_to_string()
@@ -323,9 +324,12 @@ class Block(object):
         return self.desc.id
 
     def var(self, name):
-        if name not in self.vars:
+        if not isinstance(name, basestring):
+            raise TypeError()
+        v = self.vars.get(name, None)
+        if v is None:
             raise ValueError("var %s not in this block" % name)
-        return self.vars[name]
+        return v
 
     def all_parameters(self):
         return {v for k, v in self.vars.iteritems() if isinstance(v, Parameter)}
@@ -339,6 +343,8 @@ class Block(object):
     def create_parameter(self, *args, **kwargs):
         global_block = self.program.global_block()
         param = Parameter(global_block, *args, **kwargs)
+        if 'init_attr' in kwargs:
+            self._prepend_initialize_ops_(param, kwargs['init_attr'])
         return param
 
     def append_op(self, *args, **kwargs):
@@ -397,6 +403,17 @@ class Block(object):
         for index in range(len(self.ops)):
             assert self.ops[index].desc == ops_in_cpp[index]
 
+    def _prepend_initialize_ops_(self, param, init_attr):
+        op_type = init_attr['type']
+        init_attr['shape'] = param.shape
+        init_attr['data_type'] = int(param.data_type)
+        op = self.prepend_op(
+            type=op_type,
+            inputs=None,
+            outputs={'Out': [param]},
+            attrs=init_attr)
+        param.op = op
+
 
 class Program(object):
     def __init__(self):
@@ -428,11 +445,13 @@ class Program(object):
     def current_block(self):
         return self.blocks[self.current_block_idx]
 
-    def append_backward(self, target, no_grad_set):
+    def append_backward(self, target, no_grad_set=None):
         """
         return map(param_name -> (grad_name, block_index, op_index))
         """
         assert isinstance(target, Variable)
+        if no_grad_set is None:
+            no_grad_set = set()
         param_to_grad_info = self.desc.append_backward(target.desc, no_grad_set)
         self.sync_with_cpp()
         return param_to_grad_info
@@ -469,27 +488,10 @@ class Parameter(Variable):
         Variable.__init__(
             self, block, persistable=True, shape=shape, dtype=dtype, **kwargs)
         self.trainable = kwargs.get('trainable', True)
-        self.init_attr = kwargs.get('initialize_attr', {
-            'type': 'uniform_random',
-            'min': -1.0,
-            'max': 1.0
-        })
 
         self.optimize_attr = kwargs.get('optimize_attr', {'learning_rate': 1.0})
-        self._append_initialize_ops_()
-
-    def _append_initialize_ops_(self):
-        attr = self.init_attr
-        op_type = attr.pop('type', None)
-        block = self.block
-        assert isinstance(block, Block)
-        shape = self.shape
-        attr['dims'] = shape
-        attr['data_type'] = int(self.data_type)
-        op = block.prepend_op(
-            type=op_type, inputs=None, outputs={'Out': [self]}, attrs=attr)
-        self.op = op
 
 
 # program is a global instance.
 g_program = Program()
+g_init_program = Program()
diff --git a/python/paddle/v2/framework/layer_helper.py b/python/paddle/v2/framework/layer_helper.py
index 6615bdcd3b1afa493c9ad05c789664818e64d2f2..849a6f43065ae95e908e449e9ef9300b64692e5e 100644
--- a/python/paddle/v2/framework/layer_helper.py
+++ b/python/paddle/v2/framework/layer_helper.py
@@ -1,4 +1,4 @@
-from paddle.v2.framework.framework import Variable, OpProtoHolder, g_program
+from paddle.v2.framework.framework import Variable, OpProtoHolder, g_program, g_init_program
 import paddle.v2.framework.core as core
 import copy
 import itertools
@@ -29,6 +29,14 @@ class LayerHelper(object):
         else:
             return prog
 
+    @property
+    def init_program(self):
+        prog = self.kwargs.get('init_program', None)
+        if prog is None:
+            return g_init_program
+        else:
+            return prog
+
     def append_op(self, *args, **kwargs):
         return self.program.current_block().append_op(*args, **kwargs)
 
@@ -66,16 +74,14 @@ class LayerHelper(object):
         actual = self.kwargs.get('param_attr', None)
         return actual if actual is not None else default
 
-    def bias_attr(self, shape, dtype):
+    def bias_attr(self):
         bias_attr = self.kwargs.get('bias_attr', None)
         if bias_attr is True:
             bias_attr = {
                 'name': None,
                 'init_attr': {
                     'type': 'fill_constant',
-                    'value': 0.0,
-                    'shape': shape,
-                    'dataType': dtype
+                    'value': 0.0
                 }
             }
         return bias_attr
@@ -113,22 +119,27 @@ class LayerHelper(object):
     def create_parameter(self, attr, shape, dtype, suffix='w'):
         if attr['name'] is None:
             attr['name'] = unique_name(".".join([self.name, suffix]))
-        return self.program.global_block().create_parameter(
+        self.init_program.global_block().create_parameter(
             name=attr['name'],
             dtype=dtype,
             shape=shape,
-            initialize_attr=attr['init_attr'])
+            init_attr=attr['init_attr'])
+        return self.program.global_block().create_parameter(
+            name=attr['name'], dtype=dtype, shape=shape)
 
     def create_tmp_variable(self, dtype):
         return self.program.current_block().create_var(
-            name=unique_name(".".join([self.name, 'tmp'])), dtype=dtype)
+            name=unique_name(".".join([self.name, 'tmp'])),
+            dtype=dtype,
+            persistable=False)
 
     def create_global_variable(self, *args, **kwargs):
-        return self.program.global_block().create_var(*args, **kwargs)
+        return self.program.global_block().create_var(
+            *args, persistable=False, **kwargs)
 
     def append_bias_op(self, input_var):
         size = list(input_var.shape[1:])
-        bias_attr = self.bias_attr(size, dtype=input_var.data_type)
+        bias_attr = self.bias_attr()
         if not bias_attr:
             return input_var
 
diff --git a/python/paddle/v2/framework/layers.py b/python/paddle/v2/framework/layers.py
index c7397716c47dd7088d840edb00d96dda2fe88f1d..ac77aefa15333b06f9803ce1d91071df803483d1 100644
--- a/python/paddle/v2/framework/layers.py
+++ b/python/paddle/v2/framework/layers.py
@@ -3,7 +3,7 @@ import paddle.v2.framework.core as core
 from paddle.v2.framework.framework import OpProtoHolder, Variable
 import re
 
-__all__ = ['fc', 'data', 'cross_entropy', 'conv2d']
+__all__ = ['fc', 'data', 'cross_entropy', 'conv2d', 'pool2d']
 
 
 def fc(input,
@@ -13,7 +13,8 @@ def fc(input,
        name=None,
        act=None,
        num_flatten_dims=1,
-       program=None):
+       program=None,
+       init_program=None):
     # create helper
     helper = LayerHelper('fc', **locals())
 
@@ -35,7 +36,10 @@ def fc(input,
                 "Y": w,
             },
             outputs={"Out": tmp},
-            attrs={'x_num_col_dims': num_flatten_dims})
+            attrs={
+                'x_num_col_dims': num_flatten_dims,
+                'y_num_col_dims': len(input_shape) - num_flatten_dims
+            })
         mul_results.append(tmp)
 
     # sum
@@ -55,9 +59,12 @@ def data(name,
          shape,
          data_type='float32',
          type=core.VarDesc.VarType.LOD_TENSOR,
-         program=None):
+         append_batch_size=True,
+         program=None,
+         init_program=None):
     helper = LayerHelper('data', **locals())
-    shape = [-1] + shape  # append batch size as -1
+    if append_batch_size:
+        shape = [-1] + shape  # append batch size as -1
     return helper.create_global_variable(
         name=name, shape=shape, dtype=data_type, type=type)
 
@@ -112,7 +119,7 @@ def _create_op_func_(op_type):
 
 
 _create_op_func_('mean')
-_create_op_func_('pool2d')
+_create_op_func_('mul')
 
 
 def cross_entropy(input, label, **kwargs):
@@ -155,7 +162,8 @@ def conv2d(input,
            padding=None,
            bias_attr=None,
            param_attr=None,
-           program=None):
+           program=None,
+           init_program=None):
     helper = LayerHelper('conv2d', **locals())
     dtype = helper.input_dtype()
 
@@ -167,6 +175,13 @@ def conv2d(input,
             raise ValueError("num_channels must be divisible by groups.")
         num_filter_channels = num_channels / groups
 
+    if isinstance(filter_size, int):
+        filter_size = [filter_size, filter_size]
+    if isinstance(stride, int):
+        stride = [stride, stride]
+    if isinstance(padding, int):
+        padding = [padding, padding]
+
     input_shape = input.shape
     filter_shape = [num_filters, num_filter_channels] + filter_size
     filter = helper.create_parameter(
@@ -187,3 +202,41 @@ def conv2d(input,
     pre_act = helper.append_bias_op(pre_bias)
 
     return helper.append_activation(pre_act)
+
+
+def pool2d(input,
+           pool_size,
+           pool_type,
+           pool_stride=[1, 1],
+           pool_padding=[0, 0],
+           global_pooling=False,
+           program=None,
+           init_program=None):
+    if pool_type not in ["max", "avg"]:
+        raise ValueError(
+            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'." %
+            str(pool_type))
+    if isinstance(pool_size, int):
+        pool_size = [pool_size, pool_size]
+    if isinstance(pool_stride, int):
+        pool_stride = [pool_stride, pool_stride]
+    if isinstance(pool_padding, int):
+        pool_padding = [pool_padding, pool_padding]
+
+    helper = LayerHelper('pool2d', **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type="pool2d",
+        inputs={"X": input},
+        outputs={"Out": pool_out},
+        attrs={
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "global_pooling": global_pooling,
+            "strides": pool_stride,
+            "paddings": pool_padding
+        })
+
+    return pool_out
diff --git a/python/paddle/v2/framework/nets.py b/python/paddle/v2/framework/nets.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a83ebfb9639f6fae6344b68509a80580881dab0
--- /dev/null
+++ b/python/paddle/v2/framework/nets.py
@@ -0,0 +1,27 @@
+import paddle.v2.framework.layers as layers
+
+
+def simple_img_conv_pool(input,
+                         filter_size,
+                         num_filters,
+                         pool_size,
+                         pool_stride,
+                         act,
+                         program=None,
+                         init_program=None):
+    conv_out = layers.conv2d(
+        input=input,
+        num_filters=num_filters,
+        filter_size=filter_size,
+        act=act,
+        program=program,
+        init_program=init_program)
+
+    pool_out = layers.pool2d(
+        input=conv_out,
+        pool_size=pool_size,
+        pool_type='max',
+        pool_stride=pool_stride,
+        program=program,
+        init_program=init_program)
+    return pool_out
diff --git a/python/paddle/v2/framework/optimizer.py b/python/paddle/v2/framework/optimizer.py
index e356a7aadb8d6a87d0fe54a5dd2a11fea0d80a74..ba2713e34dbfeca6990c49d0388e0886426b921a 100644
--- a/python/paddle/v2/framework/optimizer.py
+++ b/python/paddle/v2/framework/optimizer.py
@@ -1,32 +1,104 @@
 import paddle.v2.framework.framework as framework
+from collections import defaultdict
 
-__all__ = ['SGDOptimizer']
+__all__ = ['SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer']
 
 
 class Optimizer(object):
     """Optimizer Base class.
 
     Define the common interface of an optimizer.
-    User should not use this class directly, but need to use one of it's implementation.
+    Users should not use this class directly,
+    but should use one of its implementations.
     """
 
     def __init__(self):
-        pass
+        # Dictionary of accumulators. Some optimizer subclasses need to
+        # allocate and manage extra variables associated with the parameters
+        # to train. These variables are called accumulators.
+        # {accum_name : {parameter_name : accumulator_for_parameter, ...}, ...}
+        self._accumulators = defaultdict(lambda: dict())
 
     def _append_optimize_op(self, block, param_and_grad):
         """ append optimize operator to block and return all the added optimize_op
         """
         raise NotImplementedError()
 
-    def create_backward_pass(self, loss, parameter_list=None, no_grad_set=None):
+    def _initialize_tensors(self, block):
+        """Create all necessary tensors, that will be shared for all parameter updates.
+
+        Tensors like learning rate should be initialized here.
+
+        Args:
+            block: the block in which the loss variable is present
+        """
+        pass
+
+    def _create_accumulators(self, block, parameters):
+        """Create all accumulators needed by the parameters
+
+        Args:
+            block: the block in which the loss variable is present
+            parameters: list of parameter variables for the optimizer
         """
-        create and add gradient Operators in BlockDesc to Compute gradients of `loss`
-        for parameters in parameter_list
+        pass
+
+    def _add_accumulator(self, block, name, param, dtype=None, fill_value=0.0):
+        """Utility function to add an accumulator for a parameter
+
+        Args:
+            block: the block in which the loss variable is present
+            name: name of the accumulator
+            param: parameter variable for which accumulator is to be added
+            dtype: data type of the accumulator variable
+            fill_value: value to initialize the accumulator variable
+        """
+        if (name in self._accumulators and
+                param.name in self._accumulators[name]):
+            raise Exception("Accumulator {} already exists for parameter {}".
+                            format(name, param.name))
+        global_block = block.program.global_block()
+        param_shape = list(param.shape)
+        param_acc = global_block.create_var(
+            dtype=dtype, shape=param_shape, lod_level=0)
+
+        # Initialize the accumulator with fill_value
+        # FIXME: Fix when Initialization design has been implemented
+        # https://github.com/PaddlePaddle/Paddle/pull/4852
+        global_block.append_op(
+            type="fill_constant",
+            outputs={"Out": param_acc},
+            attrs={"shape": param_shape,
+                   "value": fill_value})
+
+        # Add to accumulators dict
+        self._accumulators[name][param.name] = param_acc
+
+    def _get_accumulator(self, name, param):
+        """Utility function to fetch an accumulator for a parameter
+
+        Args:
+            name: name of the accumulator
+            param: parameter variable for which accumulator is to be fetched
+
+        Returns:
+            accumulator variable for the parameter
+        """
+        if (name not in self._accumulators or
+                param.name not in self._accumulators[name]):
+            raise Exception("Accumulator {} does not exist for parameter {}".
+                            format(name, param.name))
+        return self._accumulators[name][param.name]
+
+    def create_backward_pass(self, loss, parameter_list=None, no_grad_set=None):
+        """Create and add gradient Operators in BlockDesc to compute
+        gradients of `loss` for parameters in parameter_list
 
         Args:
           loss: an variable generated by cost function.
           no_grad_set: variable that should not create gradient
-          parameter_list: parameters that need to compute gradient and update to optimize the lost.
+          parameter_list: parameters that need to compute gradient and
+          update to optimize the loss.
 
         Returns:
           list of (parameters, gradients) pair.
@@ -48,7 +120,8 @@ class Optimizer(object):
             if not grad_block.has_var(grad_info[0]):
                 raise Exception("grad block[%d] did not have grad var %s" %
                                 grad_info[1], grad_info[0])
-            param_var = loss.block.var(param)
+            # Get the param var from the global block
+            param_var = loss.block.program.global_block().var(param)
             grad_var = grad_block.var(grad_info[0])
             if loss.block.has_var(grad_info[0]):
                 params_and_grads.append((param_var, grad_var))
@@ -64,14 +137,29 @@ class Optimizer(object):
           parameters_and_grads: a list of (variable, gradient) pair to update.
 
         Returns:
-          optmization_op_list: a list of optimization operator that will update parameter using gradient.
+          optimization_op_list: a list of optimization operators that will
+          update parameter using gradient.
         """
+        # This is a default implementation of create_optimization_pass that
+        # can be shared by most optimizers. This implementation assumes that
+        # the subclass will implement the _append_optimize_op method and the
+        #  _initialize_tensors method. The subclass can extend the
+        # _create_accumulators method if it needs to create accumulators
+        # for parameters.
+
+        # Create any accumulators
+        self._create_accumulators(loss.block,
+                                  [p[0] for p in parameters_and_grads])
+        # Create any necessary tensors
+        self._initialize_tensors(loss.block)
+
         optimize_ops = []
         for param_and_grad in parameters_and_grads:
             if param_and_grad[1] is not None:
                 optimize_op = self._append_optimize_op(loss.block,
                                                        param_and_grad)
                 optimize_ops.append(optimize_op)
+
         return optimize_ops
 
     def minimize(self, loss, parameter_list=None, no_grad_set=None):
@@ -92,33 +180,152 @@ class SGDOptimizer(Optimizer):
 
     def __init__(self, learning_rate):
         assert learning_rate is not None
-        super(Optimizer, self).__init__()
+        super(SGDOptimizer, self).__init__()
         self.type = "sgd"
         self._learning_rate = learning_rate
 
-    def _append_optimize_op(self, block, param_and_grad):
+    def _initialize_tensors(self, block):
         assert isinstance(block, framework.Block)
         lr_shape = [1]
-        # create a var for learning_rate
-        lr = block.create_var(dtype="float32", shape=lr_shape, lod_level=0)
+        # create a variable for learning_rate
+        self._lr = block.create_var(
+            dtype="float32", shape=lr_shape, lod_level=0)
 
         # create an op to init the learning_rate
-        init_op = block.append_op(
+        # FIXME: Fix when Initialization design has been implemented
+        # https://github.com/PaddlePaddle/Paddle/pull/4852
+        block.append_op(
             type="fill_constant",
-            outputs={"Out": lr},
+            outputs={"Out": self._lr},
             attrs={"shape": lr_shape,
                    "value": self._learning_rate})
 
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
         # create the optimize op
         sgd_op = block.append_op(
             type=self.type,
             inputs={
                 "Param": param_and_grad[0],
                 "Grad": param_and_grad[1],
-                "LearningRate": lr
+                "LearningRate": self._lr
             },
-            outputs={"ParamOut": param_and_grad[0]},
-            attrs={"shape": [1],
-                   "value": self._learning_rate})
+            outputs={"ParamOut": param_and_grad[0]})
 
         return sgd_op
+
+
+class MomentumOptimizer(Optimizer):
+    """Simple Momentum optimizer with velocity state
+    """
+    _velocity_acc_str = "velocity"
+
+    def __init__(self, learning_rate, momentum):
+        assert learning_rate is not None
+        assert momentum is not None
+        super(MomentumOptimizer, self).__init__()
+        self.type = "momentum"
+        self._learning_rate = learning_rate
+        self._momentum = momentum
+
+    def _initialize_tensors(self, block):
+        assert isinstance(block, framework.Block)
+        lr_shape = [1]
+        # create a variable for learning_rate
+        self._lr = block.create_var(
+            dtype="float32", shape=lr_shape, lod_level=0)
+
+        # create an op to init the learning_rate
+        # FIXME: Fix when Initialization design has been implemented
+        # https://github.com/PaddlePaddle/Paddle/pull/4852
+        block.append_op(
+            type="fill_constant",
+            outputs={"Out": self._lr},
+            attrs={"shape": lr_shape,
+                   "value": self._learning_rate})
+
+    def _create_accumulators(self, block, parameters):
+        assert isinstance(block, framework.Block)
+
+        for p in parameters:
+            self._add_accumulator(block, self._velocity_acc_str, p, 'float32')
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        velocity_acc = self._get_accumulator(self._velocity_acc_str,
+                                             param_and_grad[0])
+        # create the momentum optimize op
+        momentum_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Velocity": velocity_acc,
+                "LearningRate": self._lr
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "VelocityOut": velocity_acc
+            },
+            attrs={"mu": self._momentum})
+
+        return momentum_op
+
+
+class AdagradOptimizer(Optimizer):
+    """Simple Adagrad optimizer with moment state
+    """
+    _moment_acc_str = "moment"
+
+    def __init__(self, learning_rate, epsilon=1.0e-6):
+        assert learning_rate is not None
+        assert epsilon is not None
+        super(AdagradOptimizer, self).__init__()
+        self.type = "adagrad"
+        self._learning_rate = learning_rate
+        self._epsilon = epsilon
+
+    def _initialize_tensors(self, block):
+        assert isinstance(block, framework.Block)
+        lr_shape = [1]
+        # create a variable for learning_rate
+        self._lr = block.create_var(
+            dtype="float32", shape=lr_shape, lod_level=0)
+
+        # create an op to init the learning_rate
+        # FIXME: Fix when Initialization design has been implemented
+        # https://github.com/PaddlePaddle/Paddle/pull/4852
+        block.append_op(
+            type="fill_constant",
+            outputs={"Out": self._lr},
+            attrs={"shape": lr_shape,
+                   "value": self._learning_rate})
+
+    def _create_accumulators(self, block, parameters):
+        assert isinstance(block, framework.Block)
+
+        for p in parameters:
+            self._add_accumulator(block, self._moment_acc_str, p, 'float32')
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        moment_acc = self._get_accumulator(self._moment_acc_str,
+                                           param_and_grad[0])
+
+        # create the adagrad optimizer op
+        adagrad_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Moment": moment_acc,
+                "LearningRate": self._lr
+            },
+            outputs={"ParamOut": param_and_grad[0],
+                     "MomentOut": moment_acc},
+            attrs={"epsilon": self._epsilon})
+
+        return adagrad_op
diff --git a/python/paddle/v2/framework/tests/.gitignore b/python/paddle/v2/framework/tests/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..28433306d49112cc860f4ace9efca2b2d70deb3f
--- /dev/null
+++ b/python/paddle/v2/framework/tests/.gitignore
@@ -0,0 +1 @@
+image/
diff --git a/python/paddle/v2/framework/tests/test_adam_op.py b/python/paddle/v2/framework/tests/test_adam_op.py
index ff6faafa6e2119fde11b9eb6cd2a65a75334ebe6..a0d6655d4cbcff8ed3d55df0f4e68fc6591fbb11 100644
--- a/python/paddle/v2/framework/tests/test_adam_op.py
+++ b/python/paddle/v2/framework/tests/test_adam_op.py
@@ -33,14 +33,12 @@ class TestAdamOp1(OpTest):
 
         self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
 
-        param_out, moment1_out, moment2_out, beta1_pow_out, \
-            beta2_pow_out = adam_step(self.inputs, self.attrs)
+        param_out, moment1_out, \
+            moment2_out = adam_step(self.inputs, self.attrs)
 
         self.outputs = {
             'Moment1Out': moment1_out,
             'Moment2Out': moment2_out,
-            'Beta1PowOut': beta1_pow_out,
-            'Beta2PowOut': beta2_pow_out,
             'ParamOut': param_out
         }
 
@@ -78,14 +76,12 @@ class TestAdamOp2(OpTest):
 
         attributes = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
 
-        param_out, moment1_out, moment2_out, beta1_pow_out, \
-            beta2_pow_out = adam_step(self.inputs, attributes)
+        param_out, moment1_out, \
+            moment2_out = adam_step(self.inputs, attributes)
 
         self.outputs = {
             'Moment1Out': moment1_out,
             'Moment2Out': moment2_out,
-            'Beta1PowOut': beta1_pow_out,
-            'Beta2PowOut': beta2_pow_out,
             'ParamOut': param_out
         }
 
@@ -127,14 +123,12 @@ class TestAdamOpMultipleSteps(OpTest):
 
     def test_check_output(self):
         for _ in range(self.num_steps):
-            param_out, moment1_out, moment2_out, beta1_pow_out, \
-                beta2_pow_out = adam_step(self.inputs, self.attrs)
+            param_out, moment1_out, \
+                moment2_out = adam_step(self.inputs, self.attrs)
 
             self.outputs = {
                 'Moment1Out': moment1_out,
                 'Moment2Out': moment2_out,
-                'Beta1PowOut': beta1_pow_out,
-                'Beta2PowOut': beta2_pow_out,
                 'ParamOut': param_out
             }
 
@@ -145,8 +139,10 @@ class TestAdamOpMultipleSteps(OpTest):
             self.inputs['Param'] = param_out
             self.inputs['Moment1'] = moment1_out
             self.inputs['Moment2'] = moment2_out
-            self.inputs['Beta1Pow'] = beta1_pow_out
-            self.inputs['Beta2Pow'] = beta2_pow_out
+
+            # Update powers of Beta1 and Beta2 for next time step
+            self.inputs['Beta1Pow'] *= self.attrs['beta1']
+            self.inputs['Beta2Pow'] *= self.attrs['beta2']
 
             # Randomize gradient for next step
             self.inputs['Grad'] = np.random.uniform(
@@ -175,11 +171,9 @@ def adam_step(inputs, attributes):
 
     moment1_out = beta1 * moment1 + (1 - beta1) * grad
     moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
-    beta1_pow_out = beta1_pow * beta1
-    beta2_pow_out = beta2_pow * beta2
-    lr_t = lr * np.sqrt(1 - beta2_pow_out) / (1 - beta1_pow_out)
+    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
     param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon))
-    return param_out, moment1_out, moment2_out, beta1_pow_out, beta2_pow_out
+    return param_out, moment1_out, moment2_out
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/v2/framework/tests/test_adamax_op.py b/python/paddle/v2/framework/tests/test_adamax_op.py
index af81075d6ad508dcd473ed596b00b036d87d894f..8e5a15aa3d12bbaae99cae6fcb627a336e48f684 100644
--- a/python/paddle/v2/framework/tests/test_adamax_op.py
+++ b/python/paddle/v2/framework/tests/test_adamax_op.py
@@ -31,14 +31,13 @@ class TestAdamaxOp1(OpTest):
 
         self.attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
 
-        param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step(
-            self.inputs, self.attrs)
+        param_out, moment_out, inf_norm_out = adamax_step(self.inputs,
+                                                          self.attrs)
 
         self.outputs = {
             'ParamOut': param_out,
             'MomentOut': moment_out,
-            'InfNormOut': inf_norm_out,
-            'Beta1PowOut': beta1_pow_out
+            'InfNormOut': inf_norm_out
         }
 
     def test_check_output(self):
@@ -73,14 +72,12 @@ class TestAdamaxOp2(OpTest):
         }
 
         attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
-        param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step(
-            self.inputs, attrs)
+        param_out, moment_out, inf_norm_out = adamax_step(self.inputs, attrs)
 
         self.outputs = {
             'ParamOut': param_out,
             'MomentOut': moment_out,
-            'InfNormOut': inf_norm_out,
-            'Beta1PowOut': beta1_pow_out
+            'InfNormOut': inf_norm_out
         }
 
     def test_check_output(self):
@@ -117,19 +114,15 @@ class TestAdamaxOpMultipleSteps(OpTest):
 
         self.attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
 
-        param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step(
-            self.inputs, self.attrs)
-
     def test_check_output(self):
         for _ in range(self.num_steps):
-            param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step(
-                self.inputs, self.attrs)
+            param_out, moment_out, inf_norm_out = adamax_step(self.inputs,
+                                                              self.attrs)
 
             self.outputs = {
                 'ParamOut': param_out,
                 'MomentOut': moment_out,
-                'InfNormOut': inf_norm_out,
-                'Beta1PowOut': beta1_pow_out
+                'InfNormOut': inf_norm_out
             }
 
             # Verify output for this step
@@ -139,7 +132,9 @@ class TestAdamaxOpMultipleSteps(OpTest):
             self.inputs['Param'] = param_out
             self.inputs['Moment'] = moment_out
             self.inputs['InfNorm'] = inf_norm_out
-            self.inputs['Beta1Pow'] = beta1_pow_out
+
+            # Update Beta1 Power accumulator for next step
+            self.inputs['Beta1Pow'] *= self.attrs['beta1']
 
             # Randomize gradient for next step
             self.inputs['Grad'] = np.random.uniform(
@@ -167,11 +162,10 @@ def adamax_step(inputs, attributes):
 
     moment_out = beta1 * moment + (1 - beta1) * grad
     inf_norm_out = np.maximum(beta2 * inf_norm + epsilon, np.abs(grad))
-    beta1_pow_out = beta1_pow * beta1
-    lr_t = (lr / (1 - beta1_pow_out))
+    lr_t = (lr / (1 - beta1_pow))
     param_out = param - lr_t * np.divide(moment_out, inf_norm_out)
 
-    return param_out, moment_out, inf_norm_out, beta1_pow_out
+    return param_out, moment_out, inf_norm_out
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
index 919b6c3f6745a9c6115e7af857c1a30354305f89..e1c45c2674ee9cc7c7240bdd67de05cb218ac287 100644
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
@@ -21,7 +21,7 @@ class TestCrossEntropyOp1(OpTest):
 
         self.inputs = {"X": X, "Label": label}
         self.outputs = {"Y": cross_entropy}
-        self.attrs = {"softLabel": False}
+        self.attrs = {"soft_label": False}
 
     def test_check_output(self):
         self.check_output()
diff --git a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py b/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
index 2b01e43454e70c12b423db9925837cf336f79935..fa2ccd0c3b74a2ee8b8fd9eb8986cb79ff07c98e 100644
--- a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
@@ -4,6 +4,12 @@ import unittest
 from paddle.v2.framework.op import Operator, DynamicRecurrentOp
 import numpy as np
 
+# for simplicity, just one level LoD
+lod_py = [[0, 4, 7, 9, 10]]
+input_dim = 30
+num_sents = len(lod_py[0]) - 1
+weight_dim = 15
+
 
 def create_tensor(scope, name, shape, np_data):
     tensor = scope.var(name).get_tensor()
@@ -12,6 +18,17 @@ def create_tensor(scope, name, shape, np_data):
     return tensor
 
 
+class PyRNNStep(object):
+    def __init__(self):
+
+        self.x = np.random.normal(size=(lod_py[0][-1],
+                                        input_dim)).astype("float32")
+        self.W = np.random.normal(size=(input_dim, input_dim)).astype("float32")
+        self.U = np.random.normal(size=(input_dim, input_dim)).astype("float32")
+        self.h_boot = np.random.normal(size=(num_sents,
+                                             input_dim)).astype("float32")
+
+
 class DynamicRecurrentOpTest(unittest.TestCase):
     '''
     Test RNNOp
@@ -23,17 +40,13 @@ class DynamicRecurrentOpTest(unittest.TestCase):
         - U
     vars:
         - x
-    memories:
+    states:
         - h
     outputs:
        - h
     '''
 
-    # for siplicity, just one level LoD
-    lod_py = [[0, 4, 7, 9, 10]]
-    input_dim = 30
-    num_sents = len(lod_py[0]) - 1
-    weight_dim = 15
+    py = PyRNNStep()
 
     def forward(self):
         self.scope = core.Scope()
@@ -42,64 +55,55 @@ class DynamicRecurrentOpTest(unittest.TestCase):
         self.create_step_net()
         ctx = core.DeviceContext.create(core.CPUPlace())
         self.rnnop.run(self.scope, ctx)
-        state = self.rnnop.get_state("h@mem")
+        state = self.rnnop.get_state("h@state")
         print 'state size: ', state.size()
 
         step_inputs = self.rnnop.get_step_input("x")
         print "x size ", step_inputs.size()
         for i in range(step_inputs.size()):
             print "x %d" % i, np.array(step_inputs.read(i).get_dims())
-        step_outputs = self.rnnop.get_step_output('h@mem')
+        step_outputs = self.rnnop.get_step_output('h@state')
         print 'step_outputs.size ', step_outputs.size()
-        output = self.scope.find_var("h@mem").get_tensor()
-
+        output = self.scope.find_var("h@state").get_tensor()
         print 'output', np.array(output).shape
 
     def create_global_variables(self):
-        x = np.random.normal(size=(self.lod_py[0][-1],
-                                   self.input_dim)).astype("float32")
-        W = np.random.normal(size=(self.input_dim,
-                                   self.input_dim)).astype("float32")
-        U = np.random.normal(size=(self.input_dim,
-                                   self.input_dim)).astype("float32")
-        h_boot = np.random.normal(size=(self.num_sents,
-                                        self.input_dim)).astype("float32")
         # create inlink
-        x_tensor = create_tensor(self.scope, "x",
-                                 [self.num_sents, self.input_dim], x)
-        x_tensor.set_lod(self.lod_py)
-        create_tensor(self.scope, "W", [self.input_dim, self.input_dim], W)
-        create_tensor(self.scope, "U", [self.input_dim, self.input_dim], U)
-        create_tensor(self.scope, "h_boot", [self.num_sents, self.input_dim],
-                      h_boot)
+        x_tensor = create_tensor(self.scope, "x", [num_sents, input_dim],
+                                 self.py.x)
+        x_tensor.set_lod(lod_py)
+        create_tensor(self.scope, "W", [input_dim, input_dim], self.py.W)
+        create_tensor(self.scope, "U", [input_dim, input_dim], self.py.U)
+        create_tensor(self.scope, "h_boot", [num_sents, input_dim],
+                      self.py.h_boot)
         self.scope.var("step_scopes")
-        self.scope.var("h@mem")
+        self.scope.var("h@state")
 
     def create_rnn_op(self):
         # create RNNOp
         self.rnnop = DynamicRecurrentOp(
             # inputs
-            inlinks=["x"],
-            boot_memories=["h_boot"],
-            step_net="stepnet",
+            inputs=["x"],
+            initial_states=["h_boot"],
+            step_net="step_unit",
             # outputs
-            outlinks=["h@mem"],
+            outputs=["h@state"],
             step_scopes="step_scopes",
             # attributes
-            pre_memories=["h@pre"],
-            memories=["h@mem"])
+            ex_states=["h@pre"],
+            states=["h@state"])
 
     def create_step_net(self):
-        stepnet = core.Net.create()
+        step_unit = core.Net.create()
         x_fc_op = Operator("mul", X="x", Y="W", Out="Wx")
         h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
         sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum")
-        sig_op = Operator("sigmoid", X="sum", Y="h@mem")
+        sig_op = Operator("sigmoid", X="sum", Y="h@state")
 
         for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
-            stepnet.append_op(op)
-        stepnet.complete_add_op(True)
-        self.rnnop.set_stepnet(stepnet)
+            step_unit.append_op(op)
+        step_unit.complete_add_op(True)
+        self.rnnop.set_step_unit(step_unit)
 
     def test_forward(self):
         print 'test recurrent op forward'
@@ -107,5 +111,58 @@ class DynamicRecurrentOpTest(unittest.TestCase):
         print 'pd_output', pd_output
 
 
+class RecurrentGradientOpTest(unittest.TestCase):
+    py = PyRNNStep()
+
+    def create_forward_op(self):
+        # create RNNOp
+        self.forward_op = DynamicRecurrentOp(
+            # inputs
+            inputs=["x"],
+            initial_states=["h_boot"],
+            step_net="step_unit",
+            # outputs
+            outputs=["h@state"],
+            step_scopes="step_scopes",
+            # attributes
+            ex_states=["h@pre"],
+            states=["h@state"])
+
+    def create_gradient_op(self):
+        a = set()
+        backward_op = core.DynamicRecurrentOp.backward(self.forward_op, a)
+
+    def create_step_net(self):
+        step_unit = core.Net.create()
+        x_fc_op = Operator("mul", X="x", Y="W", Out="Wx")
+        h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
+        sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum")
+        sig_op = Operator("sigmoid", X="sum", Y="h@state")
+
+        for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
+            step_unit.append_op(op)
+        step_unit.complete_add_op(True)
+        self.forward_op.set_step_unit(step_unit)
+
+    def create_global_variables(self):
+        # create inlink
+        x_tensor = create_tensor(self.scope, "x", [num_sents, input_dim],
+                                 self.py.x)
+        x_tensor.set_lod(lod_py)
+        create_tensor(self.scope, "W", [input_dim, input_dim], self.py.W)
+        create_tensor(self.scope, "U", [input_dim, input_dim], self.py.U)
+        create_tensor(self.scope, "h_boot", [num_sents, input_dim],
+                      self.py.h_boot)
+        self.scope.var("step_scopes")
+        self.scope.var("h@state")
+
+    def test_grad(self):
+        self.scope = core.Scope()
+        self.create_forward_op()
+        self.create_global_variables()
+        self.create_step_net()
+        self.create_gradient_op()
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_elementwise_add_op.py b/python/paddle/v2/framework/tests/test_elementwise_add_op.py
index f3101a709b8bcf58e8682ab3d0ca5217a7f3572d..57daddd5698f77527bc5b78c436065a851867ae0 100644
--- a/python/paddle/v2/framework/tests/test_elementwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_add_op.py
@@ -92,5 +92,33 @@ class TestElementwiseAddOp_broadcast_3(TestElementwiseOp):
         }
 
 
+class TestElementwiseAddOp_rowwise_add_0(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(3, 4).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 4)
+        }
+
+
+class TestElementwiseAddOp_rowwise_add_1(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 1).astype(np.float32),
+            'Y': np.random.rand(1).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 1)
+        }
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_executor_and_mul.py b/python/paddle/v2/framework/tests/test_executor_and_mul.py
new file mode 100644
index 0000000000000000000000000000000000000000..35f775711167ce0d210044ab4cb382db802f39a5
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_executor_and_mul.py
@@ -0,0 +1,36 @@
+import unittest
+from paddle.v2.framework.layers import mul, data
+import paddle.v2.framework.core as core
+from paddle.v2.framework.executor import Executor
+from paddle.v2.framework.framework import g_program
+import numpy
+
+
+class TestExecutor(unittest.TestCase):
+    def test_mul(self):
+        a = data(name='a', shape=[784], data_type='float32')
+        b = data(
+            name='b',
+            shape=[784, 100],
+            data_type='float32',
+            append_batch_size=False)
+        out = mul(x=a, y=b)
+        place = core.CPUPlace()
+        a_np = numpy.random.random((100, 784)).astype('float32')
+        tensor_a = core.LoDTensor()
+        tensor_a.set(a_np, place)
+        b_np = numpy.random.random((784, 100)).astype('float32')
+        tensor_b = core.LoDTensor()
+        tensor_b.set(b_np, place)
+        exe = Executor(place)
+        outs = exe.run(g_program,
+                       feed={'a': tensor_a,
+                             'b': tensor_b},
+                       fetch_list=[out])
+        out = numpy.array(outs[0])
+        self.assertEqual((100, 100), out.shape)
+        self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_feed_fetch_method.py b/python/paddle/v2/framework/tests/test_feed_fetch_method.py
index 47eedddcb6f47927ea3918d7f6c379c5710592c6..fbd659ece0188140e197982ea818d7c3897daf4e 100644
--- a/python/paddle/v2/framework/tests/test_feed_fetch_method.py
+++ b/python/paddle/v2/framework/tests/test_feed_fetch_method.py
@@ -5,6 +5,7 @@ import numpy as np
 
 class TestFeedFetch(unittest.TestCase):
     def test_feed_fetch(self):
+        scope = core.Scope()
         place = core.CPUPlace()
         input_array = np.ones((4, 4, 6)).astype("float32")
         input_array[0, 0, 0] = 3
@@ -12,9 +13,9 @@ class TestFeedFetch(unittest.TestCase):
         input_tensor = core.LoDTensor([[0, 2, 4]])
         input_tensor.set(input_array, place)
 
-        core.set_feed_variable_float(input_tensor, "feed", 0)
+        core.set_feed_variable(scope, input_tensor, "feed", 0)
 
-        output_tensor = core.get_fetch_variable("feed", 0)
+        output_tensor = core.get_fetch_variable(scope, "feed", 0)
 
         output_lod = output_tensor.lod()
         self.assertEqual(0, output_lod[0][0])
diff --git a/python/paddle/v2/framework/tests/test_fit_a_line.py b/python/paddle/v2/framework/tests/test_fit_a_line.py
new file mode 100644
index 0000000000000000000000000000000000000000..b20e3357894c2bacad83f0a99632710c586602de
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_fit_a_line.py
@@ -0,0 +1,73 @@
+import paddle.v2 as paddle
+import paddle.v2.framework.layers as layers
+import paddle.v2.framework.core as core
+import paddle.v2.framework.optimizer as optimizer
+
+from paddle.v2.framework.framework import Program, g_program
+from paddle.v2.framework.executor import Executor
+
+import numpy as np
+
+init_program = Program()
+program = Program()
+x = layers.data(
+    name='x',
+    shape=[13],
+    data_type='float32',
+    program=program,
+    init_program=init_program)
+
+y_predict = layers.fc(input=x,
+                      size=1,
+                      act=None,
+                      program=program,
+                      init_program=init_program)
+
+y = layers.data(
+    name='y',
+    shape=[1],
+    data_type='float32',
+    program=program,
+    init_program=init_program)
+
+cost = layers.square_error_cost(
+    input=y_predict, label=y, program=program, init_program=init_program)
+avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
+
+sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
+opts = sgd_optimizer.minimize(avg_cost)
+
+BATCH_SIZE = 20
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = core.CPUPlace()
+exe = Executor(place)
+
+exe.run(init_program, feed={}, fetch_list=[])
+
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
+    for data in train_reader():
+        x_data = np.array(map(lambda x: x[0], data)).astype("float32")
+        y_data = np.array(map(lambda x: x[1], data)).astype("float32")
+
+        tensor_x = core.LoDTensor()
+        tensor_x.set(x_data, place)
+        # print tensor_x.get_dims()
+
+        tensor_y = core.LoDTensor()
+        tensor_y.set(y_data, place)
+        # print tensor_y.get_dims()
+        outs = exe.run(program,
+                       feed={'x': tensor_x,
+                             'y': tensor_y},
+                       fetch_list=[avg_cost])
+        out = np.array(outs[0])
+
+        if out[0] < 10.0:
+            exit(0)  # if avg cost is less than 10.0, we think our code is good.
+exit(1)
diff --git a/python/paddle/v2/framework/tests/test_increment_op.py b/python/paddle/v2/framework/tests/test_increment_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..e174272b05b9413cc2bc1e099c4dd17899829e76
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_increment_op.py
@@ -0,0 +1,41 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestIncrementOpPositiveStep(OpTest):
+    """Test increment op with positive step
+    """
+
+    def setUp(self):
+        self.op_type = "increment"
+        self.inputs = {'X': np.random.random((10, 10)).astype("float32")}
+        self.attrs = {'step': 14.8}
+        self.outputs = {'Out': self.inputs['X'] + self.attrs['step']}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestIncrementOpNegativeStep(OpTest):
+    """Test increment op with negative step
+    """
+
+    def setUp(self):
+        self.op_type = "increment"
+        self.inputs = {'X': np.random.random((10, 10)).astype("float32")}
+        self.attrs = {'step': -3.8}
+        self.outputs = {'Out': self.inputs['X'] + self.attrs['step']}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_layers.py b/python/paddle/v2/framework/tests/test_layers.py
index dbbb6535389e2336d156734cc672e0cc7bba175c..4ecc02b12d8db53e897dea10186bc053d05be303 100644
--- a/python/paddle/v2/framework/tests/test_layers.py
+++ b/python/paddle/v2/framework/tests/test_layers.py
@@ -1,4 +1,5 @@
 import paddle.v2.framework.layers as layers
+import paddle.v2.framework.nets as nets
 from paddle.v2.framework.framework import Program, g_program
 import paddle.v2.framework.core as core
 import unittest
@@ -18,7 +19,7 @@ class TestBook(unittest.TestCase):
 
         avg_cost = layers.mean(x=cost, program=program)
         self.assertIsNotNone(avg_cost)
-        program.append_backward(avg_cost, set())
+        program.append_backward(avg_cost)
         print str(program)
 
     def test_recognize_digits_mlp(self):
@@ -38,24 +39,52 @@ class TestBook(unittest.TestCase):
         cost = layers.cross_entropy(input=predict, label=label, program=program)
         avg_cost = layers.mean(x=cost, program=program)
         self.assertIsNotNone(avg_cost)
-        # print str(program)
+        print str(program)
 
     def test_simple_conv2d(self):
-        pd = core.ProgramDesc.__create_program_desc__()
-        program = Program(desc=pd)
-        images = data_layer(
+        program = Program()
+        images = layers.data(
             name='pixel', shape=[3, 48, 48], data_type='int32', program=program)
-        conv2d_layer(
+        layers.conv2d(
             input=images, num_filters=3, filter_size=[4, 4], program=program)
 
-        # print str(program)
+        print str(program)
 
-    def test_simple_conv2d(self):
+    def test_recognize_digits_conv(self):
         program = Program()
+
         images = layers.data(
-            name='pixel', shape=[3, 48, 48], data_type='int32', program=program)
-        layers.conv2d(
-            input=images, num_filters=3, filter_size=[4, 4], program=program)
+            name='pixel',
+            shape=[1, 28, 28],
+            data_type='float32',
+            program=program)
+        label = layers.data(
+            name='label', shape=[1], data_type='int32', program=program)
+        conv_pool_1 = nets.simple_img_conv_pool(
+            input=images,
+            filter_size=5,
+            num_filters=2,
+            pool_size=2,
+            pool_stride=2,
+            act="relu",
+            program=program)
+        conv_pool_2 = nets.simple_img_conv_pool(
+            input=conv_pool_1,
+            filter_size=5,
+            num_filters=4,
+            pool_size=2,
+            pool_stride=2,
+            act="relu",
+            program=program)
+
+        predict = layers.fc(input=conv_pool_2,
+                            size=10,
+                            act="softmax",
+                            program=program)
+        cost = layers.cross_entropy(input=predict, label=label, program=program)
+        avg_cost = layers.mean(x=cost, program=program)
+
+        program.append_backward(avg_cost)
 
         print str(program)
 
diff --git a/python/paddle/v2/framework/tests/test_momentum_op.py b/python/paddle/v2/framework/tests/test_momentum_op.py
index d3353ff6e4f4da32eaefdd4e816a621ddac8bece..654d31975aab4578055e7e70ade202bd2c3d93cb 100644
--- a/python/paddle/v2/framework/tests/test_momentum_op.py
+++ b/python/paddle/v2/framework/tests/test_momentum_op.py
@@ -3,7 +3,7 @@ import numpy as np
 from op_test import OpTest
 
 
-class TestMomentumOp(OpTest):
+class TestMomentumOp1(OpTest):
     def setUp(self):
         self.op_type = "momentum"
 
@@ -12,6 +12,7 @@ class TestMomentumOp(OpTest):
         velocity = np.zeros((123, 321)).astype("float32")
         learning_rate = np.array([0.001]).astype("float32")
         mu = 0.0001
+        use_nesterov = False
 
         self.inputs = {
             'Param': param,
@@ -23,7 +24,47 @@ class TestMomentumOp(OpTest):
         self.attrs = {'mu': mu}
 
         velocity_out = mu * velocity + grad
-        param_out = param - learning_rate * velocity_out
+        if use_nesterov:
+            param_out = param - grad * learning_rate + \
+                        velocity_out * mu * learning_rate
+        else:
+            param_out = param - learning_rate * velocity_out
+
+        self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestMomentumOp2(OpTest):
+    '''Test Momentum with Nesterov momentum enabled (useNesterov = True)
+    '''
+
+    def setUp(self):
+        self.op_type = "momentum"
+
+        param = np.random.random((123, 321)).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        velocity = np.zeros((123, 321)).astype("float32")
+        learning_rate = np.array([0.001]).astype("float32")
+        mu = 0.0001
+        use_nesterov = True
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Velocity': velocity,
+            'LearningRate': learning_rate
+        }
+
+        self.attrs = {'mu': mu, 'useNesterov': use_nesterov}
+
+        velocity_out = mu * velocity + grad
+        if use_nesterov:
+            param_out = param - grad * learning_rate + \
+                        velocity_out * mu * learning_rate
+        else:
+            param_out = param - learning_rate * velocity_out
 
         self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
 
diff --git a/python/paddle/v2/framework/tests/test_op_support_gpu.py b/python/paddle/v2/framework/tests/test_op_support_gpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd36c666c440a5c378dfceac4502cd8277417412
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_op_support_gpu.py
@@ -0,0 +1,11 @@
+import unittest
+import paddle.v2.framework.core as core
+
+
+class TestOpSupportGPU(unittest.TestCase):
+    def test_case(self):
+        self.assertEqual(core.is_compile_gpu(), core.op_support_gpu("sum"))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_optimizer.py b/python/paddle/v2/framework/tests/test_optimizer.py
index 3d6fa70737bf360df53785dc602feceda471ee70..3d1715bf627fc018856b80e0e8ff962a7922f193 100644
--- a/python/paddle/v2/framework/tests/test_optimizer.py
+++ b/python/paddle/v2/framework/tests/test_optimizer.py
@@ -6,7 +6,7 @@ import paddle.v2.framework.optimizer as optimizer
 
 class TestOptimizer(unittest.TestCase):
     def test_sgd_optimizer(self):
-        program = framework.g_program
+        program = framework.Program()
         block = program.global_block()
         mul_x = block.create_parameter(
             dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
@@ -14,7 +14,7 @@ class TestOptimizer(unittest.TestCase):
             dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
         mul_out = block.create_var(
             dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        mul_op = block.append_op(
+        block.append_op(
             type="mul",
             inputs={"X": mul_x,
                     "Y": mul_y},
@@ -27,5 +27,88 @@ class TestOptimizer(unittest.TestCase):
         self.assertEqual(sgd_op.type, "sgd")
 
 
+class TestMomentumOptimizer(unittest.TestCase):
+    class MockMomentum(optimizer.MomentumOptimizer):
+        def get_accumulators(self):
+            return self._accumulators
+
+        def get_velocity_str(self):
+            return self._velocity_acc_str
+
+    def test_momentum_optimizer(self):
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        momentum_optimizer = self.MockMomentum(learning_rate=0.01, momentum=0.2)
+        params_grads = momentum_optimizer.create_backward_pass(mul_out)
+        self.assertEqual(len(params_grads), 1)
+        self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
+        opts = momentum_optimizer.create_optimization_pass(params_grads,
+                                                           mul_out)
+        self.assertEqual(len(opts), 1)
+        sgd_op = opts[0]
+        self.assertEqual(sgd_op.type, "momentum")
+
+        # Check accumulators
+        accumulators = momentum_optimizer.get_accumulators()
+        self.assertEqual(len(accumulators), 1)
+        self.assertTrue(momentum_optimizer.get_velocity_str() in accumulators)
+        velocity_acc = accumulators[momentum_optimizer.get_velocity_str()]
+        self.assertEqual(len(velocity_acc), 1)
+        self.assertTrue(mul_x.name in velocity_acc)
+
+
+class TestAdagradOptimizer(unittest.TestCase):
+    class MockAdagrad(optimizer.AdagradOptimizer):
+        def get_accumulators(self):
+            return self._accumulators
+
+        def get_moment_str(self):
+            return self._moment_acc_str
+
+    def test_adagrad_optimizer(self):
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        adagrad_optimizer = self.MockAdagrad(learning_rate=0.01, epsilon=1.0e-6)
+        params_grads = adagrad_optimizer.create_backward_pass(mul_out)
+        self.assertEqual(len(params_grads), 1)
+        self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
+        opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out)
+        self.assertEqual(len(opts), 1)
+        adagrad_op = opts[0]
+        self.assertEqual(adagrad_op.type, "adagrad")
+
+        # check accumulators
+        accumulators = adagrad_optimizer.get_accumulators()
+        self.assertEqual(len(accumulators), 1)
+        self.assertTrue(adagrad_optimizer.get_moment_str() in accumulators)
+        moment_acc = accumulators[adagrad_optimizer.get_moment_str()]
+        self.assertEqual(len(moment_acc), 1)
+        self.assertTrue(mul_x.name in moment_acc)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py b/python/paddle/v2/framework/tests/test_recognize_digits_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b305213df424dd097bf4238aa14320a2f7da45d
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_recognize_digits_conv.py
@@ -0,0 +1,92 @@
+import paddle.v2 as paddle
+import paddle.v2.framework.layers as layers
+import paddle.v2.framework.nets as nets
+import paddle.v2.framework.core as core
+import paddle.v2.framework.optimizer as optimizer
+
+from paddle.v2.framework.framework import Program, g_program
+from paddle.v2.framework.executor import Executor
+
+import numpy as np
+
+init_program = Program()
+program = Program()
+
+images = layers.data(
+    name='pixel',
+    shape=[1, 28, 28],
+    data_type='float32',
+    program=program,
+    init_program=init_program)
+label = layers.data(
+    name='label',
+    shape=[1],
+    data_type='int32',
+    program=program,
+    init_program=init_program)
+conv_pool_1 = nets.simple_img_conv_pool(
+    input=images,
+    filter_size=5,
+    num_filters=20,
+    pool_size=2,
+    pool_stride=2,
+    act="relu",
+    program=program,
+    init_program=init_program)
+conv_pool_2 = nets.simple_img_conv_pool(
+    input=conv_pool_1,
+    filter_size=5,
+    num_filters=50,
+    pool_size=2,
+    pool_stride=2,
+    act="relu",
+    program=program,
+    init_program=init_program)
+
+predict = layers.fc(input=conv_pool_2,
+                    size=10,
+                    act="softmax",
+                    program=program,
+                    init_program=init_program)
+cost = layers.cross_entropy(
+    input=predict, label=label, program=program, init_program=init_program)
+avg_cost = layers.mean(x=cost, program=program)
+
+sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
+opts = sgd_optimizer.minimize(avg_cost)
+
+BATCH_SIZE = 50
+PASS_NUM = 1
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = core.CPUPlace()
+exe = Executor(place)
+
+exe.run(init_program, feed={}, fetch_list=[])
+
+for pass_id in range(PASS_NUM):
+    count = 0
+    for data in train_reader():
+        img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
+                                data)).astype("float32")
+        y_data = np.array(map(lambda x: x[1], data)).astype("int32")
+        y_data = y_data.reshape([BATCH_SIZE, 1])
+
+        tensor_img = core.LoDTensor()
+        tensor_y = core.LoDTensor()
+        tensor_img.set(img_data, place)
+        tensor_y.set(y_data, place)
+
+        outs = exe.run(program,
+                       feed={"pixel": tensor_img,
+                             "label": tensor_y},
+                       fetch_list=[avg_cost])
+
+        loss = np.array(outs[0])
+
+        if loss < 10.0:
+            exit(0)  # if avg cost is less than 10.0, we think our code is good.
+exit(1)
diff --git a/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py b/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..a985d1f3d38fcaa8372a70edd519b873d47f554a
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
@@ -0,0 +1,83 @@
+import paddle.v2 as paddle
+import paddle.v2.framework.layers as layers
+import paddle.v2.framework.core as core
+import paddle.v2.framework.optimizer as optimizer
+
+from paddle.v2.framework.framework import Program, g_program
+from paddle.v2.framework.executor import Executor
+
+import numpy as np
+
+init_program = Program()
+program = Program()
+image = layers.data(
+    name='x',
+    shape=[784],
+    data_type='float32',
+    program=program,
+    init_program=init_program)
+
+hidden1 = layers.fc(input=image,
+                    size=128,
+                    act='relu',
+                    program=program,
+                    init_program=init_program)
+hidden2 = layers.fc(input=hidden1,
+                    size=64,
+                    act='relu',
+                    program=program,
+                    init_program=init_program)
+
+predict = layers.fc(input=hidden2,
+                    size=10,
+                    act='softmax',
+                    program=program,
+                    init_program=init_program)
+
+label = layers.data(
+    name='y',
+    shape=[1],
+    data_type='int32',
+    program=program,
+    init_program=init_program)
+
+cost = layers.cross_entropy(
+    input=predict, label=label, program=program, init_program=init_program)
+avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
+
+sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
+opts = sgd_optimizer.minimize(avg_cost)
+
+BATCH_SIZE = 128
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=8192),
+    batch_size=BATCH_SIZE)
+
+place = core.CPUPlace()
+exe = Executor(place)
+
+exe.run(init_program, feed={}, fetch_list=[])
+
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
+    for data in train_reader():
+        x_data = np.array(map(lambda x: x[0], data)).astype("float32")
+        y_data = np.array(map(lambda x: x[1], data)).astype("int32")
+        y_data = np.expand_dims(y_data, axis=1)
+
+        tensor_x = core.LoDTensor()
+        tensor_x.set(x_data, place)
+
+        tensor_y = core.LoDTensor()
+        tensor_y.set(y_data, place)
+
+        outs = exe.run(program,
+                       feed={'x': tensor_x,
+                             'y': tensor_y},
+                       fetch_list=[avg_cost])
+        out = np.array(outs[0])
+        if out[0] < 5.0:
+            exit(0)  # if avg cost is less than 5.0, we think our code is good.
+exit(1)
diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py
index 191ce0b0c8d5fb6c4d8037a6c1bfda57c394489e..cc4008c0d8e73a3f7d9a9be2a4aacfd120ecd522 100644
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
@@ -132,15 +132,15 @@ class RecurrentOpTest(unittest.TestCase):
         # create RNNOp
         self.rnnop = RecurrentOp(
             # inputs
-            inlinks=["x"],
-            boot_memories=["h_boot"],
+            inputs=["x"],
+            initial_states=["h_boot"],
             step_net="stepnet",
             # outputs
-            outlinks=["h@mem"],
+            outputs=["h@mem"],
             step_scopes="step_scopes",
             # attributes
-            pre_memories=["h@pre"],
-            memories=["h@mem"])
+            ex_states=["h@pre"],
+            states=["h@mem"])
 
     def create_step_net(self):
         stepnet = core.Net.create()
@@ -169,15 +169,15 @@ class RecurrentGradientOpTest(unittest.TestCase):
     def create_forward_op(self):
         self.forward_op = RecurrentOp(
             # inputs
-            inlinks=["x"],
-            boot_memories=["h_boot"],
+            inputs=["x"],
+            initial_states=["h_boot"],
             step_net="stepnet",
             # outputs
-            outlinks=["h"],
+            outputs=["h"],
             step_scopes="step_scopes",
             # attributes
-            pre_memories=["h@pre"],
-            memories=["h@alias"])
+            ex_states=["h@pre"],
+            states=["h@alias"])
 
         # create a stepnet for RNN
         stepnet = core.Net.create()
diff --git a/python/paddle/v2/framework/tests/test_reduce_op.py b/python/paddle/v2/framework/tests/test_reduce_op.py
index 70359d60cbe656150877673c63e81eae92d8ab9a..0fec31c2e22e1eda2c62aa9b38487d703815f414 100644
--- a/python/paddle/v2/framework/tests/test_reduce_op.py
+++ b/python/paddle/v2/framework/tests/test_reduce_op.py
@@ -85,5 +85,33 @@ class Test1DReduce(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestNorm(OpTest):  # Test case for the "norm" op: p-norm (p=2) of X reduced along dim 1.
+    def setUp(self):  # Builds the input, the attributes and the NumPy reference outputs.
+        # Keep x away from 0 (values lie in [0.2, 1.2)) so the numerical gradient stays reliable where the gradient is near 0.
+        x = np.random.random((5, 6, 10)).astype("float32") + 0.2
+        p = 2  # order of the norm
+        dim = 1  # axis that is reduced
+        keep_dim = False  # the reduced axis is dropped from the reference outputs
+        abs_out = np.absolute(x)  # reference for the "AbsOut" output
+        pow_out = np.power(x, p)  # computed from x rather than abs_out; same values here since x > 0
+        sum_out = np.sum(pow_out, axis=dim, keepdims=keep_dim)  # shape (5, 10) because keep_dim is False
+        out = np.power(sum_out, 1. / p)  # p-th root of the summed powers, i.e. the p-norm
+        self.op_type = "norm"
+        self.inputs = {'X': x}
+        self.attrs = {"p": p, "dim": dim, "keep_dim": keep_dim}
+        self.outputs = {  # expected values, keyed by output name
+            "AbsOut": abs_out,
+            "PowOut": pow_out,
+            "SumOut": sum_out,
+            "Out": out
+        }
+
+    def test_check_output(self):  # presumably compares the op outputs with self.outputs (OpTest is defined elsewhere)
+        self.check_output()
+
+    def test_check_grad(self):  # presumably checks the gradient of 'Out' w.r.t. 'X' within 1% relative error -- confirm in OpTest
+        self.check_grad(['X'], 'Out', max_relative_error=0.01)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_rmsprop_op.py b/python/paddle/v2/framework/tests/test_rmsprop_op.py
index 3e5ff733e9b55fe8c9727e9721e25083a494be15..237bcfccceee89f62fc05e4c6c972a76d1875367 100644
--- a/python/paddle/v2/framework/tests/test_rmsprop_op.py
+++ b/python/paddle/v2/framework/tests/test_rmsprop_op.py
@@ -46,7 +46,7 @@ class TestRmspropOp1(OpTest):
 
 
 class TestRmspropOp2(OpTest):
-    '''Test RMSProp with defaukt values for attributes
+    '''Test RMSProp with default values for attributes
     '''
 
     def setUp(self):
diff --git a/python/paddle/v2/framework/tests/test_uniform_random_op.py b/python/paddle/v2/framework/tests/test_uniform_random_op.py
index a2d28a65a67b03a6c74348b19ba99cffc55738e9..ded777105e0fc64eb82bf4013bfba7ba9d0ddefa 100644
--- a/python/paddle/v2/framework/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/framework/tests/test_uniform_random_op.py
@@ -19,7 +19,7 @@ class TestUniformRandomOp(unittest.TestCase):
         op = Operator(
             "uniform_random",
             Out='X',
-            dims=[1000, 784],
+            shape=[1000, 784],
             min=-5.0,
             max=10.0,
             seed=10)
diff --git a/python/paddle/v2/tests/test_data_feeder.py b/python/paddle/v2/tests/test_data_feeder.py
index 83da678da387ed1c86868847f140c6c09fbec3b5..63905c04cf737d0f1d226a4a5a27777351dbf5a3 100644
--- a/python/paddle/v2/tests/test_data_feeder.py
+++ b/python/paddle/v2/tests/test_data_feeder.py
@@ -97,7 +97,7 @@ class DataFeederTest(unittest.TestCase):
             each_sample.append(zip(a, b))
             data.append(each_sample)
 
-        feeder = DataFeeder([('input', data_type.sparse_vector(dim))],
+        feeder = DataFeeder([('input', data_type.sparse_float_vector(dim))],
                             {'input': 0})
         arg = feeder(data)
         output = arg.getSlotValue(0)