diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index 1329b77bb44f52c66a703740715b890c47234e72..c94627a72806fa2eca77c79da24f7f3ca18f0259 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -434,9 +434,9 @@ lambda_cost .. autoclass:: paddle.v2.layer.lambda_cost :noindex: -mse_cost +square_error_cost -------- -.. autoclass:: paddle.v2.layer.mse_cost +.. autoclass:: paddle.v2.layer.square_error_cost :noindex: rank_cost diff --git a/doc/design/graph.md b/doc/design/graph.md new file mode 100644 index 0000000000000000000000000000000000000000..51b7f87638f8ddff752328a562fe0dd0fe56cfd1 --- /dev/null +++ b/doc/design/graph.md @@ -0,0 +1,70 @@ +# Design Doc: Computations as a Graph + +A primary goal of the refactorization of PaddlePaddle is a more flexible representation of deep learning computation, in particular, a graph of operators and variables, instead of sequences of layers as before. + +This document explains the construction of a graph in three steps: + +- construct the forward part +- construct the backward part +- construct the optimization part + +## The Construction of a Graph + +Let us take the problem of image classification as a simple example. The application program that trains the model looks like: + +```python +x = layer.data("images") +l = layer.data("label") +y = layer.fc(x) +cost = layer.mse(y, l) +optimize(cost) +train(cost, reader=mnist.train()) +``` + +### Forward Part + +The first four lines of the above program build the forward part of the graph. + +![](images/graph_construction_example_forward_only.png) + +In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x. `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b, and the initialization operators. + +Initialization operators are a kind of "run-once" operator -- the `Run` method increments a class data member counter so that it runs at most once. By doing so, a parameter wouldn't be initialized repeatedly, say, in every minibatch. + +In this example, all operators are created as `OpDesc` protobuf messages, and all variables as `VarDesc` messages. These protobuf messages are saved in a `BlockDesc` protobuf message. + +### Backward Part + +The fifth line `optimize(cost)` calls two functions, `ConstructBackwardGraph` and `ConstructOptimizationGraph`. + +`ConstructBackwardGraph` traverses the forward graph in the `BlockDesc` protobuf message and builds the backward part. + +![](images/graph_construction_example_forward_backward.png) + +According to the chain rule of gradient computation, `ConstructBackwardGraph` would + +1. create a gradient operator G for each operator F, +1. make all inputs, outputs, and output gradients of F the inputs of G, +1. create gradients for all inputs of F, except for those that don't have gradients, like x and l, and +1. make all these gradients the outputs of G. + +### Optimization Part + +For each parameter, like W and b created by `layer.fc`, marked as double circles in the above graphs, `ConstructOptimizationGraph` creates an optimization operator to apply its gradient. This results in the complete graph: + +![](images/graph_construction_example_all.png) + +## Block and Graph + +The words block and graph are interchangeable in the design of PaddlePaddle. A [Block](https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphor of the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions. A graph of operators and variables is a representation of the block. + +A Block keeps operators in an array `BlockDesc::ops` + +```protobuf +message BlockDesc { + repeated OpDesc ops = 1; + repeated VarDesc vars = 2; +} +``` + +in the order in which they appear in user programs, like the Python program at the beginning of this article. We can imagine that in `ops`, we have some forward operators, followed by some gradient operators, and then some optimization operators.
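To make the backward construction above concrete, here is a minimal, self-contained Python sketch of the four chain-rule steps. The classes and helpers below (`OpDesc`, `BlockDesc`, `grad`, `construct_backward_graph`, `no_grad`) are simplified stand-ins for illustration only, not the actual PaddlePaddle protobuf messages or API.

```python
# Illustrative sketch only -- plain Python stand-ins for OpDesc/BlockDesc,
# not the real PaddlePaddle implementation.
from dataclasses import dataclass, field
from typing import List, Set


@dataclass
class OpDesc:
    type: str
    inputs: List[str]
    outputs: List[str]


@dataclass
class BlockDesc:
    ops: List[OpDesc] = field(default_factory=list)


def grad(name: str) -> str:
    # hypothetical naming convention for gradient variables
    return "d_" + name


def construct_backward_graph(block: BlockDesc, no_grad: Set[str]) -> None:
    # Append one gradient operator G for each forward operator F, in reverse order.
    for fwd in reversed(list(block.ops)):
        # step 2: all inputs, outputs, and output gradients of F become inputs of G
        g_inputs = fwd.inputs + fwd.outputs + [grad(o) for o in fwd.outputs]
        # steps 3-4: gradients of F's inputs, except no-grad variables, become outputs of G
        g_outputs = [grad(i) for i in fwd.inputs if i not in no_grad]
        block.ops.append(OpDesc(fwd.type + "_grad", g_inputs, g_outputs))


# The forward part of the example graph; the data variables x and l have no gradient.
block = BlockDesc(ops=[OpDesc("fc", ["x", "W", "b"], ["y"]),
                       OpDesc("mse", ["y", "l"], ["cost"])])
construct_backward_graph(block, no_grad={"x", "l"})
print([op.type for op in block.ops])  # ['fc', 'mse', 'mse_grad', 'fc_grad']
```

After this step, `ops` holds the forward operators followed by their gradient operators, which is exactly the ordering described above.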
diff --git a/doc/design/images/graph_construction_example.bash b/doc/design/images/graph_construction_example.bash new file mode 100755 index 0000000000000000000000000000000000000000..35e6997abd17588e17a82d448918fc1b3bd7220e --- /dev/null +++ b/doc/design/images/graph_construction_example.bash @@ -0,0 +1,11 @@ +cat ./graph_construction_example.dot | \ + sed 's/color=red/color=red, style=invis/g' | \ + sed 's/color=green/color=green, style=invis/g' | \ + dot -Tpng > graph_construction_example_forward_only.png + +cat ./graph_construction_example.dot | \ + sed 's/color=green/color=green, style=invis/g' | \ + dot -Tpng > graph_construction_example_forward_backward.png + +cat ./graph_construction_example.dot | \ + dot -Tpng > graph_construction_example_all.png diff --git a/doc/design/images/graph_construction_example.dot b/doc/design/images/graph_construction_example.dot new file mode 100644 index 0000000000000000000000000000000000000000..8d1b673abf6b78c851676fa379dc850c4818f0e5 --- /dev/null +++ b/doc/design/images/graph_construction_example.dot @@ -0,0 +1,69 @@ +digraph ImageClassificationGraph { + ///////// The forward part ///////// + FeedX [label="Feed", color=blue, shape=box]; + FeedY [label="Feed", color=blue, shape=box]; + InitW [label="Init", color=blue, shape=diamond]; + Initb [label="Init", color=blue, shape=diamond]; + FC [label="FC", color=blue, shape=box]; + MSE [label="MSE", color=blue, shape=box]; + + x [label="x", color=blue, shape=oval]; + l [label="l", color=blue, shape=oval]; + y [label="y", color=blue, shape=oval]; + W [label="W", color=blue, shape=doublecircle]; + b [label="b", color=blue, shape=doublecircle]; + cost [label="cost", color=blue, shape=oval]; + + FeedX -> x -> FC -> y -> MSE -> cost [color=blue]; + FeedY -> l [color=blue]; + InitW -> W [color=blue]; + Initb -> b [color=blue]; + W -> FC [color=blue]; + b -> FC [color=blue]; + l -> MSE [color=blue]; + + ////////// The backward part ///////// + MSE_Grad [label="MSE_grad", color=red, shape=box]; + FC_Grad [label="FC_grad", color=red, shape=box]; + + d_cost [label="d cost", color=red, shape=oval]; + d_y [label="d y", color=red, shape=oval]; + d_b [label="d b", color=red, shape=oval]; + d_W [label="d W", color=red, shape=oval]; + + cost -> MSE_Grad [color=red]; + d_cost -> MSE_Grad [color=red]; + x -> MSE_Grad [color=red]; + l -> MSE_Grad [color=red]; + y -> MSE_Grad -> d_y [color=red]; + + x -> FC_Grad [color=red]; + y -> FC_Grad [color=red]; + d_y -> FC_Grad [color=red]; + W -> FC_Grad -> d_W [color=red]; + b -> FC_Grad -> d_b [color=red]; + + ////////// The optimization part ////////// + + OPT_W [label="SGD", color=green, shape=box]; + OPT_b [label="SGD", color=green, shape=box]; + + W -> OPT_W [color=green]; + b -> OPT_b [color=green]; + d_W -> OPT_W -> W [color=green]; + d_b -> OPT_b -> b [color=green]; + + ////////// Groupings //////////
+ + subgraph clusterMSE { + style=invis; + MSE; + MSE_Grad; + } + + subgraph clusterFC { + style=invis; + FC; + FC_Grad; + } +} diff --git a/doc/design/images/graph_construction_example_all.png b/doc/design/images/graph_construction_example_all.png new file mode 100644 index 0000000000000000000000000000000000000000..181187503472d15779b87284105841168b3945c4 Binary files /dev/null and b/doc/design/images/graph_construction_example_all.png differ diff --git a/doc/design/images/graph_construction_example_forward_backward.png b/doc/design/images/graph_construction_example_forward_backward.png new file mode 100644 index 0000000000000000000000000000000000000000..3049a9315fd616464dec54e33064cb75598ca536 Binary files /dev/null and b/doc/design/images/graph_construction_example_forward_backward.png differ diff --git a/doc/design/images/graph_construction_example_forward_only.png b/doc/design/images/graph_construction_example_forward_only.png new file mode 100644 index 0000000000000000000000000000000000000000..25d19088cbf0b5f68cf734f2ff21eba8af4a2860 Binary files /dev/null and b/doc/design/images/graph_construction_example_forward_only.png differ diff --git a/doc/design/simple_op_design.md b/doc/design/simple_op_design.md index 5e07c29c56d21728599195d420d3222213d77e7c..fded4a68612396a262121a5a886a8ae573dfa662 100644 --- a/doc/design/simple_op_design.md +++ b/doc/design/simple_op_design.md @@ -147,7 +147,7 @@ class CosineOp { struct CosineOpProtoMaker : public OpProtoMaker { CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) { AddInput("input", "input of cosine op"); - AddAttr("scale", "scale of cosine op", float).Default(1.0).LargerThan(0.0); + AddAttr("scale", "scale of cosine op", float).Default(1.0).GreaterThan(0.0); AddType("cos"); AddComment("This is cos op"); } diff --git a/doc/getstarted/basic_usage/index_cn.rst b/doc/getstarted/basic_usage/index_cn.rst index 428f58830e0b10c024f31238b7404c6df193eecd..b473944fc7fb89d3e0a0b330933f2226734bb5bd 100644 --- a/doc/getstarted/basic_usage/index_cn.rst +++ b/doc/getstarted/basic_usage/index_cn.rst @@ -55,7 +55,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍 # 线性计算网络层: ȳ = wx + b ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b')) # 计算误差函数,即 ȳ 和真实 y 之间的距离 - cost = mse_cost(input= ȳ, label=y) + cost = square_error_cost(input= ȳ, label=y) outputs(cost) @@ -69,7 +69,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍 - **数据层**:数据层 `data_layer` 是神经网络的入口,它读入数据并将它们传输到接下来的网络层。这里数据层有两个,分别对应于变量 `x` 和 `y`。 - **全连接层**:全连接层 `fc_layer` 是基础的计算单元,这里利用它建模变量之间的线性关系。计算单元是神经网络的核心,PaddlePaddle支持大量的计算单元和任意深度的网络连接,从而可以拟合任意的函数来学习复杂的数据关系。 - - **回归误差代价层**:回归误差代价层 `mse_cost` 是众多误差代价函数层的一种,它们在训练过程作为网络的出口,用来计算模型的误差,是模型参数优化的目标函数。 + - **回归误差代价层**:回归误差代价层 `square_error_cost` 是众多误差代价函数层的一种,它们在训练过程作为网络的出口,用来计算模型的误差,是模型参数优化的目标函数。 定义了网络结构并保存为 `trainer_config.py` 之后,运行以下训练命令: diff --git a/doc/getstarted/basic_usage/index_en.rst b/doc/getstarted/basic_usage/index_en.rst index 6775da20c2f51000f305b095d40abd27b8fa6c0e..2cc438ebbe0f97345d25354b93b4ebbd43502415 100644 --- a/doc/getstarted/basic_usage/index_en.rst +++ b/doc/getstarted/basic_usage/index_en.rst @@ -49,7 +49,7 @@ To recover this relationship between ``X`` and ``Y``, we use a neural network wi x = data_layer(name='x', size=1) y = data_layer(name='y', size=1) y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b')) - cost = mse_cost(input=y_predict, label=y) + cost = square_error_cost(input=y_predict, label=y) 
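    # Annotation (not part of the original example): square_error_cost is the new
    # name of the layer formerly exposed as mse_cost; it computes the squared-error
    # training cost between the prediction y_predict and the label y.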
outputs(cost) Some of the most fundamental usages of PaddlePaddle are demonstrated: diff --git a/doc/getstarted/concepts/src/train.py b/doc/getstarted/concepts/src/train.py index 7e604f23de38543a00f305d508af0791193f78ba..8aceb23406a476f08639cc6223cdf730b728a705 100644 --- a/doc/getstarted/concepts/src/train.py +++ b/doc/getstarted/concepts/src/train.py @@ -8,7 +8,7 @@ paddle.init(use_gpu=False) x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2)) y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) -cost = paddle.layer.mse_cost(input=y_predict, label=y) +cost = paddle.layer.square_error_cost(input=y_predict, label=y) # create parameters parameters = paddle.parameters.create(cost) diff --git a/doc/getstarted/concepts/use_concepts_cn.rst b/doc/getstarted/concepts/use_concepts_cn.rst index f15b11bd780402a3ec1755900e8c648f5d2a7bc5..c243083794bb3c4659242de99b3b2715af9d7c24 100644 --- a/doc/getstarted/concepts/use_concepts_cn.rst +++ b/doc/getstarted/concepts/use_concepts_cn.rst @@ -81,9 +81,9 @@ PaddlePaddle支持不同类型的输入数据,主要包括四种类型,和 .. code-block:: bash y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) - cost = paddle.layer.mse_cost(input=y_predict, label=y) + cost = paddle.layer.square_error_cost(input=y_predict, label=y) -其中,x与y为之前描述的输入层;而y_predict是接收x作为输入,接上一个全连接层;cost接收y_predict与y作为输入,接上均方误差层。 +其中,x与y为之前描述的输入层;而y_predict是接收x作为输入,接上一个全连接层;cost接收y_predict与y作为输入,接上平方误差层。 最后一层cost中记录了神经网络的所有拓扑结构,通过组合不同的layer,我们即可完成神经网络的搭建。 @@ -147,4 +147,4 @@ PaddlePaddle支持不同类型的输入数据,主要包括四种类型,和 .. literalinclude:: src/train.py :linenos: -有关线性回归的实际应用,可以参考PaddlePaddle book的 `第一章节 `_。 \ No newline at end of file +有关线性回归的实际应用,可以参考PaddlePaddle book的 `第一章节 `_。 diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md index ec79b7f42b2d70df8fcb25faca5bc3a4759e177c..58665e9f2b6299ec3959ed6858ab01d459f64dd8 100644 --- a/doc/howto/dev/new_op_cn.md +++ b/doc/howto/dev/new_op_cn.md @@ -23,17 +23,20 @@ - `framework::OperatorWithKernel`:继承自OperatorBase,Op有计算函数,称作有Kernel。 - `class OpProtoAndCheckerMaker`:描述该Op的输入、输出、属性、注释,主要用于Python API接口生成 -依据是否包含kernel,将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorBase`,后者继承自`OperatorWithKernel`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下: +依据是否包含kernel,可以将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorBase`,后者继承自`OperatorWithKernel`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下: - - 内容 | 定义位置 --------------- | :---------------------- + + 内容 | 定义位置 +-------------- | :---------------------- OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake -Op定义 | `.cc`文件 -Kernel实现 | CPU、GPU共享Kernel在`.h`文件,否则,CPU可以在`.cc`文件,GPU可在`.cu`文件。 -注册Op | Op注册在`.cc`文件;Kernel注册CPU在`.cc`文件,GPU在`.cu`文件 - - +Op定义 | `.cc`文件 +Kernel实现 | CPU、GPU共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,GPU 实现在`.cu`文件中。 +注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,GPU实现在`.cu`文件中 + + +实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。 + + 下面以矩阵乘操作,即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。 @@ -42,9 +45,11 @@ Kernel实现 | CPU、GPU共享Kernel在`.h`文件,否则,CPU可以在` ### 1. 
定义ProtoMaker类 -矩阵乘的公式:$Out = X * Y$, 可见该计算由两个输入,一个输出组成。首先定义`ProtoMaker`来描述该Op的输入、输出及注释: - -``` +矩阵乘法的公式:$Out = X * Y$, 可见该计算由两个输入,一个输出组成。 + +首先定义`ProtoMaker`来描述该Op的输入、输出,并添加注释: + +```cpp class MulOpMaker : public framework::OpProtoAndCheckerMaker { public: MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) @@ -59,20 +64,20 @@ The equation is: Out = X * Y } }; ``` - -[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`,构造函数包括2个: + +[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`,构造函数含有2个参数: - `framework::OpProto` : 前者存储Op的输入输出和参数属性,将用于Python API接口的生成。 - `framework::OpAttrChecker` :后者用于检查参数属性的合法性。 - -构造函数里通过`AddInput`添加输入参数,通过`AddOutput`添加输出参数,通过`AddComment`添加该Op的注释,这些函数会将对应内容添加到`OpProto`中。 -在`MulOp`中添加两个输入`X`和`Y`,添加了一个输出`Out`,并解释了各自含义,该命名尽可能的规范。 +构造函数里通过`AddInput`添加输入参数,通过`AddOutput`添加输出参数,通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`中。 - -再举个[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)的例子: - -``` +上面的代码在`MulOp`中添加两个输入`X`和`Y`,添加了一个输出`Out`,并解释了各自含义,命名请遵守命名规范。 + + +再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)为例: + +```cpp template class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { public: @@ -87,17 +92,19 @@ The equation is: Out = scale*X } }; ``` - - 在这个例子里,两处不同: - - - `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中。 - - `AddAttr("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。 - + +这个例子有两处不同: + +- `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中,如果Op的某个输入不参与反向梯度的计算,请显示地调用`.NotInGradient()`进行设置。 + +- `AddAttr("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。 + ### 2. 定义Operator类 +下面的点实现了MulOp的定义: -```c++ +```cpp class MulOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -121,33 +128,46 @@ class MulOp : public framework::OperatorWithKernel { ``` [`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员: - -```c++ + +```cpp using framework::OperatorWithKernel::OperatorWithKernel; ``` 这句表示使用基类`OperatorWithKernel`的构造函数,也可写成: - -```c++ + +```cpp MulOp(const std::string &type, const framework::VariableNameMap &inputs, const framework::VariableNameMap &outputs, const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} -``` - +``` + 还需要重写`InferShape`接口。`InferShape`为const函数,不能修改Op的成员变量,参数为`const framework::InferShapeContext &ctx`,通过该参数可获取到输入输出以及属性。它的功能是: - 1). 做检查, 尽早报错:检查输入数据维度、类型等是否合法。 - 2). 设置输出Tensor的形状。 -通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中,和要讲到的注册函数一起放在`.cc`中 +通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中,和下面将要介绍的注册函数一起放在`.cc`中 ### 3. 
定义OpKernel类 -```C++ -template -class MulKernel : public framework::OpKernel { - public: +`MulKernel`继承自`framework::OpKernel`,带有下面两个模板参数: + +- `typename Place`: 表示设备类型,不同设备(CPU、GPU)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 + +- `typename T` : 表示数据类型,如`float`, `double`等。 + +需要为`MulKernel`类重写`Compute`接口。 +- `Compute`接受一个输入参数:`const framework::ExecutionContext& context`。 +- 与`InferShapeContext`相比,`ExecutionContext`增加了设备类型,同样可获取到输入输出和属性参数。 +- `Compute`函数里实现`OpKernel`的具体计算逻辑。 + +下面是 `MulKernel` `Compute`的实现: + + ```cpp + template + class MulKernel : public framework::OpKernel { + public: void Compute(const framework::ExecutionContext& context) const override { auto* X = context.Input("X"); auto* Y = context.Input("Y"); @@ -157,168 +177,197 @@ class MulKernel : public framework::OpKernel { const_cast(context.device_context_); math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); } -}; -``` + }; + ``` + +需要注意:**不同设备(CPU、GPU)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。** + +`MulOp`的CPU、GPU实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 + +为了使`OpKernel`的计算过程书写更加简单,并且CPU、GPU的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。 + + +到此,前向Op实现完成。接下来,需要在`.cc`文件中注册该op和kernel。 +反向Op类的定义,反向OpKernel的定义与前向Op类似,这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。 -`MulKernel`继承自`framework::OpKernel`,带有模板参数: - - - `typename Place`: 表示设备类型,不同设备(CPU、GPU)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 - - - `typename T` : 表示数据类型,如`float`, `double`等。 - -`MulKernel`需要重写`Compute`接口,该接口参数为`const framework::ExecutionContext& context`, `ExecutionContext`相比`InferShapeContext`增加了设备类型,同样可获取到输入输出和属性参数,`Compute`函数里写具体实现时。 - -注意,不同设备(CPU、GPU)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。`MulOp`的CPU、GPU实现共享同一个`Kernel`,`OpKernel`不共享的例子可以参考[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 - -到此前向Op实现完成,需要在`.cc`文件中注册该op和kernel。反向Op类的定义和Kernel定义与前向Op类似,这里不再重复。但注意,反向Op没有`ProtoMaker`。 - ### 4. 
注册Operator -在`.cc`文件中注册前向、反向Op类,注册CPU Kernel。 +- 在`.cc`文件中注册前向、反向Op类,注册CPU Kernel。 -```c++ -namespace ops = paddle::operators; -REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpGrad); -REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); -REGISTER_OP_CPU_KERNEL(mul_grad, - ops::MulGradKernel); -``` - - - `REGISTER_OP` : 注册`ops::MulOp`类,类型名为`mul`,该类的`ProtoMaker`为`ops::MulOpMaker`,并且注册`ops::MulOpGrad`为其反向Op。 - - `REGISTER_OP_WITHOUT_GRADIENT` : 用于注册没有反向的Op。 - - `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::MulKernel`类。 - -在 `.cu`文件中注册GPU Kernel。 - -```c++ -namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); -REGISTER_OP_GPU_KERNEL(mul_grad, - ops::MulGradKernel); -``` + ```cpp + namespace ops = paddle::operators; + REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); + REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CPU_KERNEL(mul_grad, + ops::MulGradKernel); + ``` + + 在上面的代码中: + + - `REGISTER_OP` : 注册`ops::MulOp`类,类型名为`mul`,该类的`ProtoMaker`为`ops::MulOpMaker`,注册`ops::MulOpGrad`,类型名为`mul_grad`。 + - `REGISTER_OP_WITHOUT_GRADIENT` : 用于注册没有反向的Op。 + - `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::MulKernel`类。 + + +- 在 `.cu`文件中注册GPU Kernel。 + - 请注意,如果GPU Kernel的实现基于Eigen unsupported模块,那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`,代码示例如下: + + ```cpp + // if use Eigen unsupported module before include head files + #define EIGEN_USE_GPU + + namespace ops = paddle::operators; + REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_GPU_KERNEL(mul_grad, + ops::MulGradKernel); + ``` ### 5. 编译 -在[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt)文件中添加编译。 - -``` -op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function) -``` - -下面命令可以编译: - -``` -make mul_op -``` +- 简单**无特殊依赖**的OP无需修改CMakeList.txt文件。[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt) 会自动将 `paddle/operators` 目录下新增的 `*_op.cc` 文件加入编译。 +- 较为复杂、**有额外依赖** 的operator仍需要修改[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt)。如,`mul_op` 依赖 `math_function`,需要在`CMakeLists.txt`中添加如下内容: + + ``` + op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function) + + ``` + +- 运行下面命令可以进行编译: + + ``` + make mul_op + ``` ## 绑定Python -- 绑定Python - - 在 [`paddle/pybind/pybind.cc -`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc)文件中添加该类: +- 绑定Python + + 在 [`paddle/pybind/pybind.cc +`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) 使用`USE_OP`告知编译器需要链接的Op,具体解释参考[代码注释](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h#L81)。 ``` USE_OP(mul); ``` 如果只实现了CPU版本,则使用`USE_CPU_ONLY_OP`: - + ``` USE_CPU_ONLY_OP(gather); ``` - - 使用`USE_OP`告知编译器需要链接该Op的目标文件,具体解释参考[代码注释](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h#L81)。 - - + + 如果OP不带Kernel,则使用`USE_NO_KENREL_OP`: + + ``` + USE_NO_KENREL_OP(recurrent); + ``` + + - 生成库 - 在 [`paddle/pybind/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/CMakeLists.txt)文件添加类到`DEPS`中,使得该Op可以链接到生成的lib库中。 - - ``` - if(WITH_PYTHON) - cc_library(paddle_pybind SHARED - SRCS pybind.cc - DEPS pybind python backward - mul_op - minus_op) - endif(WITH_PYTHON) - ``` + 无需修改 
[`paddle/pybind/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/CMakeLists.txt)文件,`paddle/operators` 目录下新增的 `*_op.cc` 文件会被自动添加链接到生成的lib库中。 ## 实现单元测试 -单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单测](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。 +单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。 -### 前向Operator单测 +### 前向Operator单元测试 -前向Op单测继承自`unittest.TestCase`,并定义元类`__metaclass__ = OpTestMeta`,具体单测流程在`OpTestMeta`里完成。需在`setUp`函数定义输入输出和属性参数,以及Python对比的输出值。 +前向Op单元测试继承自`unittest.TestCase`,并定义元类`__metaclass__ = OpTestMeta`。各项更加具体的单元测试在`OpTestMeta`里完成。测试前向Operator,需要: -``` -import unittest -import numpy as np -from gradient_checker import GradientChecker, create_op -from op_test_util import OpTestMeta +1. 在`setUp`函数定义输入、输出,以及相关的属性参数。 +2. 生成随机的输入数据。 +3. 在Python脚本中实现与前向operator相同的计算逻辑,得到输出值,与operator前向计算的输出进行对比。 + + + ```python + import unittest + import numpy as np + from gradient_checker import GradientChecker, create_op + from op_test_util import OpTestMeta -class TestMulOp(unittest.TestCase): - __metaclass__ = OpTestMeta + class TestMulOp(unittest.TestCase): + __metaclass__ = OpTestMeta + def setUp(self): + self.type = "mul" + self.inputs = { + 'X': np.random.random((32, 84)).astype("float32"), + 'Y': np.random.random((84, 100)).astype("float32") + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + ``` + +上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释: + +- `self.type = "mul" ` : 定义类型,与operator注册时注册的类型一致。 +- `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。 +- `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。 + + +### 反向Operator单元测试 + +反向Op单元测试继承自`GradientChecker`,而`GradientChecker`继承自`unittest.TestCase`,因此,**反向单元测试函数需要以`test_`开头**。 + +```python +class TestMulGradOp(GradientChecker): def setUp(self): - self.type = "mul" + self.op = create_op("mul") self.inputs = { 'X': np.random.random((32, 84)).astype("float32"), 'Y': np.random.random((84, 100)).astype("float32") } - self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} -``` - 首先需要`import`必要的包,下面详细解释其他值: - - - `self.type = "mul" ` : 定义类型,和注册的类型一致。 - - `self.inputs` : 定义输入,类型为Numpy.array,并初始化。 - - `self.outputs` : 定义输出,并得到Python结算结果。 - - -### 反向Operator单测 - -反向Op单测继承自`GradientChecker`,而`GradientChecker`集成自`unittest.TestCase`,所以反向单测函数需要`test_`开头。 - - ``` - class MulGradOpTest(GradientChecker): - def test_mul(self): - op = create_op("mul") - inputs = { - 'X': np.random.random((32, 84)).astype("float32"), - 'Y': np.random.random((84, 100)).astype("float32") - } - self.compare_grad(op, inputs) + + def test_cpu_gpu_compare(self): + self.compare_grad(self.op, self.inputs) + + def test_normal(self): # mul op will enlarge the relative error self.check_grad( - op, inputs, set(["X", "Y"]), "Out", max_relative_error=0.5) - ``` + self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5) + + def test_ignore_x(self): + self.check_grad( + self.op, + self.inputs, ["Y"], + "Out", + max_relative_error=0.5, + no_grad_set={"X"}) + + def test_ignore_y(self): + self.check_grad( + self.op, + self.inputs, ["X"], + "Out", + max_relative_error=0.5, + no_grad_set={"Y"}) +``` + +下面解释代码中一些关键的地方: - - 调用`create_op("mul")`创建反向Op对应的前向Op。 - - 定义输入`inputs`。 - - 调用`compare_grad`函数对比CPU、GPU计算结果。 - - 调用`check_grad`检查梯度稳定性,这里采用数值法检测梯度正确性。 - - 第一个参数`op` : 前向op。 - - 第二个参数`inputs` : 
输入词典,词典的Key和`ProtoMaker`定义保持一致。 - - 第三个参数`set(["X", "Y"])` : 指定对输入变量`X`、`Y`做梯度检测。 - - 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out` +- 调用`create_op("mul")`创建反向Op对应的前向Op。 +- 调用`compare_grad`函数对比CPU、GPU计算结果。 +- `test_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。 + - 第一个参数`self.op` : 前向Op。 + - 第二个参数`self.inputs` : 输入词典,词典的Key和`ProtoMaker`定义保持一致。 + - 第三个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。 + - 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out` +- `test_ignore_x`和`test_ignore_y`分支用来测试只需要计算一个输入梯度的情况。 -### 编译和执行 +### 编译和执行单元测试 -单测完成之后,在[`python/paddle/v2/framework/tests/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/CMakeLists.txt)里添加编译: +单元测试编写完成之后,在[`python/paddle/v2/framework/tests/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/CMakeLists.txt)中添加以下内容,将单元测试加入工程: ``` py_test(test_mul_op SRCS test_mul_op.py) ``` -编译时需要打开`WITH_TESTING`, 即 `cmake paddle_dir -DWITH_TESTING=ON`,编译成功之后执行单测命令为: +请注意,**不同于Op的编译测试,运行单元测试测时需要编译整个工程**,并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON`。编译成功后,执行下面的命令来运行单元测试: -``` +```bash make test ARGS="-R test_mul_op -V" ``` + 或者: -``` +```bash ctest -R test_mul_op ``` diff --git a/doc/howto/dev/use_eigen_cn.md b/doc/howto/dev/use_eigen_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..1367323b71277984834d9d4f0d9bea0f69478479 --- /dev/null +++ b/doc/howto/dev/use_eigen_cn.md @@ -0,0 +1,146 @@ +## 在Paddle中如何使用Eigen + +神经网络本质上是一个计算图,计算需要的数据存放在`Tensor`中,而计算过程是由`Operartor`来描述的。在执行时,`Operator`调用对应`OpKernel`中的`Compute`接口,实现对`Tensor`的操作。 + + +### Eigen Tensor模块 + +Eigen Tensor模块对element-wise计算提供了强大的支持,并且书写一份代码,可以同时在CPU、GPU执行。但Eigen Tensor是一个正在开发中的模块,因此可能测试不够完备,文档较少。 + +关于Eigen Tensor模块的详细介绍请参考[文档1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) 和[文档2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md) + + +### paddle::framework::Tensor + +Paddle Tensor定义在framework目录下,其主要接口如下: + +```cpp +class Tensor { + public: + /*! Return a pointer to mutable memory block. */ + template + inline T* data(); + + /** + * @brief Return a pointer to mutable memory block. + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(platform::Place place); + + /** + * @brief Return a pointer to mutable memory block. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(DDim dims, platform::Place place); + + /*! Resize the dimensions of the memory block. */ + inline Tensor& Resize(const DDim& dims); + + /*! Return the dimensions of the memory block. */ + inline const DDim& dims() const; + + private: + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + + /*! points to dimensions of memory block. 
*/ + DDim dim_; +}; +``` + +`Placeholder`的作用是延迟分配内存,即我们可以先定义一个Tensor,然后使用Resize接口设置Tensor的大小,最后再调用mutable_data接口分配实际的内存。 + +```cpp +paddle::framework::Tensor t; +paddle::platform::CPUPlace place; +// set size first +t.Resize({2, 3}); +// allocate memory on CPU later +t.mutable_data(place); +``` + +### paddle::framework::Tensor使用样例 +下面以AddOp为例说明Tensor的使用过程: + +- InferShape + +在运行神经网络计算图时,我们先调用每个`Operator`的`InferShape`接口,根据输入Tensor的大小来设置输出Tensor的大小,`Resize`接口会被调用。 + +```cpp +void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_EQ(ctx.Input("X")->dims(), + ctx.Input("Y")->dims(), + "Two input of Add Op's dimension must be same."); + ctx.Output("Out")->Resize(ctx.Input("X")->dims()); +} +``` + + +- Run + +`Operator`的`Run`接口最终会调用对应`OpKernel`的`Compute`接口,在这时真正的分配内存,`mutable_data`接口会被调用。 + +```cpp +void Compute(const framework::ExecutionContext& context) const override { + auto* input0 = context.Input("X"); + auto* input1 = context.Input("Y"); + auto* output = context.Output("Out"); + + output->mutable_data(context.GetPlace()); + + auto x = EigenVector::Flatten(*input0); + auto y = EigenVector::Flatten(*input1); + auto z = EigenVector::Flatten(*output); + + auto place = context.GetEigenDevice(); + + z.device(place) = x + y; +} +``` + + +### paddle::framework::Tensor到EigenTensor的转换 + +如上一小节所示,在具体的计算中,我们需要先把输入Tensor和输出Tensor转换为Eigen支持的格式。我们在[eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h)中提供了一些全局函数用来实现paddle::framework::Tensor到EigenTensor/EigenMatrix/EigenVector/EigenScalar的转换。 + +以EigenTensor为例,做一个介绍 + +```cpp +Tensor t; +float* p = t.mutable_data(make_ddim({1, 2, 3}), platform::CPUPlace()); +for (int i = 0; i < 1 * 2 * 3; i++) { + p[i] = static_cast(i); +} + +EigenTensor::Type et = EigenTensor::From(t); +``` + +From是EigenTensor模板提供的一个接口,可以实现从paddle::framework::Tensor到对EigenTensor的转换。由于Tensor的rank是模板参数,因此在转换时需要显示的指定。 + +在Eigen中,不同rank的Tensor是不同类型,Vector是rank为1的Tensor。需要额外注意的是,EigenVector::From方法是把paddle中的一维Tensor转为Eigen的一维Tensor,在这里用EigenVector来表示;而EigenVector::Flatten方法是把paddle中的一个Tensor进行reshape操作,压扁成为Eigen的一维Tensor,类型仍然为EigenVector。 + +更多的转换方法请参考eigen_test.cc中的[单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc)。 + + + +### 实现计算 + +当需要完成计算时,我们需要等式左边的EigenTensor调用device接口。在这里需要注意的是,这里的EigenTensor之间的运算只是改变了原有Tensor中的数据,而不会改变原有Tensor的shape信息。 + +```cpp +auto x = EigenVector::Flatten(*input0); +auto y = EigenVector::Flatten(*input1); +auto z = EigenVector::Flatten(*output); +auto place = context.GetEigenDevice(); +z.device(place) = x + y; +``` + +在这段代码中,input0/input1/output可以是任意维度的Tensor。我们调用了EigenVector的Flatten接口,把任意维度的Tensor转为了一维的EigenVector。而在计算结束之后,input0/input1/output的原有shape信息不变。如果想改变原有Tensor的shape信息,可以调用Resize接口进行改变。 + +由于Eigen Tensor模块的文档较少,我们可以参考TensorFlow的[kernels](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/kernels)模块下的相关`OpKernel`的计算代码。 diff --git a/doc/howto/usage/k8s/k8s_distributed_cn.md b/doc/howto/usage/k8s/k8s_distributed_cn.md index 3121b3f59df650c0a22d0bd305a6f793b202d30e..a9bebf09558b06993119803458977abedbbfbdd0 100644 --- a/doc/howto/usage/k8s/k8s_distributed_cn.md +++ b/doc/howto/usage/k8s/k8s_distributed_cn.md @@ -213,7 +213,7 @@ I1116 09:10:17.123440 50 Util.cpp:130] Calling runInitFunctions I1116 09:10:17.123764 50 Util.cpp:143] Call runInitFunctions done. [WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config. 
[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating] -[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__mse_cost_0__] +[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__square_error_cost_0__] I1116 09:10:17.392917 50 Trainer.cpp:170] trainer mode: Normal I1116 09:10:17.613910 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process I1116 09:10:17.680917 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process diff --git a/paddle/framework/attribute.cc b/paddle/framework/attribute.cc index 9eb07acdff1d00dd926f1cee9c24f9f151006d7e..27132eaa0b3b0666fc042faf052dac2e169ba9e7 100644 --- a/paddle/framework/attribute.cc +++ b/paddle/framework/attribute.cc @@ -43,6 +43,10 @@ template <> AttrType AttrTypeID>() { return STRINGS; } +template <> +AttrType AttrTypeID>>() { + return INT_PAIRS; +} Attribute GetAttrValue(const OpDesc::Attr& attr_desc) { switch (attr_desc.type()) { @@ -76,6 +80,14 @@ Attribute GetAttrValue(const OpDesc::Attr& attr_desc) { } return val; } + case paddle::framework::AttrType::INT_PAIRS: { + std::vector> val(attr_desc.int_pairs_size()); + for (int i = 0; i < attr_desc.int_pairs_size(); ++i) { + val[i].first = attr_desc.int_pairs(i).first(); + val[i].second = attr_desc.int_pairs(i).second(); + } + return val; + } } PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !"); return boost::blank(); diff --git a/paddle/framework/attribute.h b/paddle/framework/attribute.h index 08b47cabd4c2225c50022bd35734dcc2663324d6..cde3dfa1d3d19b1bee9fd23dad52ecbbe628c3a9 100644 --- a/paddle/framework/attribute.h +++ b/paddle/framework/attribute.h @@ -28,7 +28,8 @@ namespace paddle { namespace framework { typedef boost::variant, - std::vector, std::vector> + std::vector, std::vector, + std::vector>> Attribute; typedef std::unordered_map AttributeMap; @@ -40,9 +41,9 @@ Attribute GetAttrValue(const OpDesc::Attr& attr_desc); // check whether a value(attribute) fit a certain limit template -class LargerThanChecker { +class GreaterThanChecker { public: - explicit LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {} + explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} void operator()(T& value) const { PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail"); } @@ -109,8 +110,8 @@ class TypedAttrChecker { return *this; } - TypedAttrChecker& LargerThan(const T& lower_bound) { - value_checkers_.push_back(LargerThanChecker(lower_bound)); + TypedAttrChecker& GreaterThan(const T& lower_bound) { + value_checkers_.push_back(GreaterThanChecker(lower_bound)); return *this; } diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index 6b4c612cd8d9263258e3987914c44002e7bca92c..c5d46622156c56acb98fb77e7db5ee7bca8c937a 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -182,7 +182,7 @@ static std::unique_ptr BackwardRecursive( }); // process recurrent gradient op as a special operator. - if (forwardOp.Type() == "recurrent_op") { + if (forwardOp.Type() == "recurrent") { // NOTE clean up cycle call somewhere (RNN's stepnet constains itself), or // this will result in infinite loop. 
const auto& rnnop = diff --git a/paddle/framework/backward.md b/paddle/framework/backward.md index 9500c92a265d60a696e1e2c422d0f2bd1621ef71..8aa6728a95bc464ab8884986f0cec6c817d3303b 100644 --- a/paddle/framework/backward.md +++ b/paddle/framework/backward.md @@ -18,7 +18,7 @@ A backward network is built up with several backward operators. Backward operato For example, we have got a `mul_op`, and we can register it's information and corresponding backward operator by the following macro: ```cpp -REGISTER_OP(mul, MulOp, MulOpMaker, MulOpGrad); +REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad); ``` `mul` is the operator's type. `MulOp` and `MulOpMaker` are the operator class and the operator maker class respectively. diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc index bf8b11e5f5ae801621f84bdbeffb5c4cf2dd8905..ad8003420dc14538d0dae9a1cb19d6459b154576 100644 --- a/paddle/framework/backward_test.cc +++ b/paddle/framework/backward_test.cc @@ -148,14 +148,16 @@ class AddOpMaker : public OpProtoAndCheckerMaker { namespace f = paddle::framework; namespace ops = paddle::operators; using EnforceNotMet = paddle::platform::EnforceNotMet; -REGISTER_OP(rowwise_add, f::NOP, f::RowWiseAddOpMaker, f::NOP); -REGISTER_OP(mul, f::NOP, f::MulOpMaker, f::NOP); -REGISTER_OP(sigmoid, f::NOP, f::SigmoidOpMaker, f::NOP); +REGISTER_OP(rowwise_add, f::NOP, f::RowWiseAddOpMaker, rowwise_add_grad, + f::NOP); +REGISTER_OP(mul, f::NOP, f::MulOpMaker, mul_grad, f::NOP); +REGISTER_OP(sigmoid, f::NOP, f::SigmoidOpMaker, sigmoid_grad, f::NOP); REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NOP, f::NoGradOpMaker); REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NOP, f::FillZeroOpMaker); -REGISTER_OP(add, f::NOP, f::AddOpMaker, f::NOP); +REGISTER_OP(add, f::NOP, f::AddOpMaker, add_grad, f::NOP); REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker); -REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, f::NOP); +REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, many_output_op_grad, + f::NOP); TEST(Backward, simple_op_grad) { auto fwd = f::OpRegistry::CreateOp( diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc index cfd3e8dfdec0e92620aef5cd246b4622b779ce19..85b7de79743bb0390d66b8999f2e8342a51d14a9 100644 --- a/paddle/framework/ddim.cc +++ b/paddle/framework/ddim.cc @@ -21,16 +21,16 @@ namespace framework { /// @cond HIDDEN template -Dim make_dim(const int* d) { +Dim make_dim(const int64_t* d) { return Dim(*d, make_dim(d + 1)); } template <> -Dim<1> make_dim<1>(const int* d) { +Dim<1> make_dim<1>(const int64_t* d) { return Dim<1>(*d); } -void make_ddim(DDim& ddim, const int* dims, int n) { +void make_ddim(DDim& ddim, const int64_t* dims, int n) { switch (n) { case 1: ddim = make_dim<1>(dims); @@ -67,13 +67,13 @@ void make_ddim(DDim& ddim, const int* dims, int n) { /// @endcond -DDim make_ddim(std::initializer_list dims) { +DDim make_ddim(std::initializer_list dims) { DDim result(make_dim(0)); make_ddim(result, dims.begin(), dims.size()); return result; } -DDim make_ddim(const std::vector& dims) { +DDim make_ddim(const std::vector& dims) { DDim result(make_dim(0)); make_ddim(result, &dims[0], dims.size()); return result; @@ -81,12 +81,12 @@ DDim make_ddim(const std::vector& dims) { /// @cond HIDDEN // XXX For some reason, putting this in an anonymous namespace causes errors -class DynamicMutableIndexer : public boost::static_visitor { +class DynamicMutableIndexer : public boost::static_visitor { public: explicit DynamicMutableIndexer(int idx) : idx_(idx) {} 
template - int& operator()(Dim& dim) const { + int64_t& operator()(Dim& dim) const { return dim[idx_]; } @@ -94,12 +94,12 @@ class DynamicMutableIndexer : public boost::static_visitor { int idx_; }; -class DynamicConstIndexer : public boost::static_visitor { +class DynamicConstIndexer : public boost::static_visitor { public: explicit DynamicConstIndexer(int idx) : idx_(idx) {} template - int operator()(const Dim& dim) const { + int64_t operator()(const Dim& dim) const { return dim[idx_]; } @@ -109,22 +109,22 @@ class DynamicConstIndexer : public boost::static_visitor { /// @endcond -int& DDim::operator[](int idx) { +int64_t& DDim::operator[](int idx) { return boost::apply_visitor(DynamicMutableIndexer(idx), var); } -int DDim::operator[](int idx) const { +int64_t DDim::operator[](int idx) const { return boost::apply_visitor(DynamicConstIndexer(idx), var); } -ssize_t DDim::size() const { return arity(*this); } +int64_t DDim::size() const { return arity(*this); } bool DDim::operator==(DDim d) const { if (var.which() != d.getVar().which()) { return false; } else { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); + std::vector v1 = vectorize(*this); + std::vector v2 = vectorize(d); for (unsigned int i = 0; i < v1.size(); i++) { if (v1[i] != v2[i]) { @@ -139,10 +139,10 @@ bool DDim::operator==(DDim d) const { bool DDim::operator!=(DDim d) const { return !(*this == d); } DDim DDim::operator+(DDim d) const { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); + std::vector v1 = vectorize(*this); + std::vector v2 = vectorize(d); - std::vector v3; + std::vector v3; assert(v1.size() == v2.size()); @@ -154,10 +154,10 @@ DDim DDim::operator+(DDim d) const { } DDim DDim::operator*(DDim d) const { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); + std::vector v1 = vectorize(*this); + std::vector v2 = vectorize(d); - std::vector v3; + std::vector v3; assert(v1.size() == v2.size()); @@ -168,15 +168,15 @@ DDim DDim::operator*(DDim d) const { return make_ddim(v3); } -int get(const DDim& ddim, int idx) { return ddim[idx]; } +int64_t get(const DDim& ddim, int idx) { return ddim[idx]; } void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } /// @cond HIDDEN struct VectorizeVisitor : public boost::static_visitor<> { - std::vector& vector; + std::vector& vector; - explicit VectorizeVisitor(std::vector& v) : vector(v) {} + explicit VectorizeVisitor(std::vector& v) : vector(v) {} template void operator()(const T& t) { @@ -188,31 +188,31 @@ struct VectorizeVisitor : public boost::static_visitor<> { }; /// @endcond -std::vector vectorize(const DDim& ddim) { - std::vector result; +std::vector vectorize(const DDim& ddim) { + std::vector result; VectorizeVisitor visitor(result); boost::apply_visitor(visitor, ddim); return result; } -struct ProductVisitor : public boost::static_visitor { +struct ProductVisitor : public boost::static_visitor { template - ssize_t operator()(const Dim& dim) { + int64_t operator()(const Dim& dim) { return product(dim); } }; -ssize_t product(const DDim& ddim) { +int64_t product(const DDim& ddim) { ProductVisitor visitor; return boost::apply_visitor(visitor, ddim); } struct SliceVectorizeVisitor : public boost::static_visitor<> { - std::vector& vector; + std::vector& vector; int begin; int end; - SliceVectorizeVisitor(std::vector& v, int b, int e) + SliceVectorizeVisitor(std::vector& v, int b, int e) : vector(v), begin(b), end(e) { PADDLE_ENFORCE(begin < end, "Begin index must be less than end index in ddim slice."); @@ 
-240,7 +240,7 @@ struct SliceVectorizeVisitor : public boost::static_visitor<> { }; DDim slice_ddim(const DDim& dim, int begin, int end) { - std::vector vec; + std::vector vec; vec.reserve(end - begin); SliceVectorizeVisitor visitor(vec, begin, end); boost::apply_visitor(visitor, dim); @@ -280,7 +280,7 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) { return os; } -DDim::DDim(std::initializer_list init_list) { +DDim::DDim(std::initializer_list init_list) { *this = make_ddim(init_list); } } // namespace framework diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h index 95f294b62737be5c3eac39303148ac35da29fe7d..db30c523948b1d437615aa0e9bfecb5e25569296 100644 --- a/paddle/framework/ddim.h +++ b/paddle/framework/ddim.h @@ -40,7 +40,7 @@ struct DDim { template explicit DDim(const Dim& in) : var(in) {} - /*implicit*/ DDim(std::initializer_list init_list); + /*implicit*/ DDim(std::initializer_list init_list); template DDim& operator=(const Dim& in) { @@ -48,8 +48,8 @@ struct DDim { return *this; } - int& operator[](int idx); - int operator[](int idx) const; + int64_t& operator[](int idx); + int64_t operator[](int idx) const; template typename Visitor::result_type apply_visitor(Visitor& visitor) { @@ -71,15 +71,15 @@ struct DDim { DDim operator*(DDim d) const; - ssize_t size() const; + int64_t size() const; }; /** - * \brief Make a DDim from std::vector + * \brief Make a DDim from std::vector * * \param dims An vector of ints. Must be sized between [1, 9] */ -DDim make_ddim(const std::vector& dims); +DDim make_ddim(const std::vector& dims); /** * \brief Make a DDim from an initializer list @@ -87,14 +87,14 @@ DDim make_ddim(const std::vector& dims); * \param dims An initializer list of ints. Must be sized between [1, 9] * */ -DDim make_ddim(std::initializer_list dims); +DDim make_ddim(std::initializer_list dims); -int get(const DDim& dim, int idx); +int64_t get(const DDim& dim, int idx); void set(DDim& dim, int idx, int val); -std::vector vectorize(const DDim& ddim); +std::vector vectorize(const DDim& ddim); -ssize_t product(const DDim& ddim); +int64_t product(const DDim& ddim); /** * \brief Slice a ddim diff --git a/paddle/framework/ddim_test.cc b/paddle/framework/ddim_test.cc index 9d18a2972ce62139430b240b4599854b14290a32..756232b1b56a49d2c91cc2cac950ca508c54fb3f 100644 --- a/paddle/framework/ddim_test.cc +++ b/paddle/framework/ddim_test.cc @@ -12,7 +12,7 @@ TEST(DDim, Equality) { EXPECT_EQ(ddim[2], 5); // construct a DDim from a vector - std::vector vec({9, 1, 5}); + std::vector vec({9, 1, 5}); paddle::framework::DDim vddim = paddle::framework::make_ddim(vec); EXPECT_EQ(ddim[0], 9); EXPECT_EQ(ddim[1], 1); @@ -25,7 +25,7 @@ TEST(DDim, Equality) { EXPECT_EQ(paddle::framework::get(ddim, 0), 6); // vectorize a DDim - std::vector res_vec = paddle::framework::vectorize(vddim); + std::vector res_vec = paddle::framework::vectorize(vddim); EXPECT_EQ(res_vec[0], 9); EXPECT_EQ(res_vec[1], 1); EXPECT_EQ(res_vec[2], 5); diff --git a/paddle/framework/dim.h b/paddle/framework/dim.h index 883fdc55eb929ebc51e8ae05938e9d07374406ce..04d4b0e604e6f73ad94e0ca79d6b69f663bd4076 100644 --- a/paddle/framework/dim.h +++ b/paddle/framework/dim.h @@ -17,13 +17,13 @@ struct Dim { static constexpr int dimensions = i; template - HOSTDEVICE Dim(int _head, Args... _tail) : head(_head), tail(_tail...) { + HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) 
{ static_assert(sizeof...(_tail) == i - 1, "Dim initialized with the wrong number of parameters"); } HOSTDEVICE - Dim(int _head, const Dim& _tail) : head(_head), tail(_tail) {} + Dim(int64_t _head, const Dim& _tail) : head(_head), tail(_tail) {} HOSTDEVICE Dim() : head(0), tail() {} @@ -31,12 +31,12 @@ struct Dim { /** Construct a Dim from a linear index and size. Uses Fortran order * indexing. */ HOSTDEVICE - Dim(int idx, const Dim& size) + Dim(int64_t idx, const Dim& size) : head(idx % size.head), tail(idx / size.head, size.tail) {} /** Construct a Dim with each dimension set to the given index */ HOSTDEVICE - Dim(int idx) : head(idx), tail(idx) {} + Dim(int64_t idx) : head(idx), tail(idx) {} HOSTDEVICE bool operator==(const Dim& o) const { @@ -47,13 +47,13 @@ struct Dim { bool operator!=(const Dim& o) const { return !(*this == o); } HOSTDEVICE - int& operator[](int idx); + int64_t& operator[](int idx); HOSTDEVICE - int operator[](int idx) const; + int64_t operator[](int idx) const; HOST std::string to_string() const; - int head; + int64_t head; Dim tail; }; @@ -63,7 +63,7 @@ struct Dim<1> { static constexpr int dimensions = 1; HOSTDEVICE - Dim(int _head) : head(_head) {} + Dim(int64_t _head) : head(_head) {} HOSTDEVICE Dim() : head(0) {} @@ -86,11 +86,11 @@ struct Dim<1> { bool operator!=(const Dim<1>& o) const { return !(*this == o); } HOSTDEVICE - int& operator[](int idx); + int64_t& operator[](int idx); HOSTDEVICE - int operator[](int idx) const; + int64_t operator[](int idx) const; - int head; + int64_t head; }; namespace { @@ -100,12 +100,12 @@ template struct DimGetter { // Return a copy if Dim is const template - HOSTDEVICE static int impl(const D& d) { + HOSTDEVICE static int64_t impl(const D& d) { return DimGetter::impl(d.tail); } // Return a reference if Dim is mutable template - HOSTDEVICE static int& impl(D& d) { + HOSTDEVICE static int64_t& impl(D& d) { return DimGetter::impl(d.tail); } }; @@ -115,18 +115,18 @@ template <> struct DimGetter<0> { // Return a copy if Dim is const template - HOSTDEVICE static int impl(const D& d) { + HOSTDEVICE static int64_t impl(const D& d) { return d.head; } // Return a reference if Dim is mutable template - HOSTDEVICE static int& impl(D& d) { + HOSTDEVICE static int64_t& impl(D& d) { return d.head; } }; template -HOSTDEVICE int& indexer(Dim& dim, int idx) { +HOSTDEVICE int64_t& indexer(Dim& dim, int idx) { #ifndef __CUDA_ARCH__ if (idx < 0) { throw std::invalid_argument("Tried to access a negative dimension"); @@ -141,7 +141,7 @@ HOSTDEVICE int& indexer(Dim& dim, int idx) { } template <> -HOSTDEVICE int& indexer<1>(Dim<1>& dim, int idx) { +HOSTDEVICE int64_t& indexer<1>(Dim<1>& dim, int idx) { #ifndef __CUDA_ARCH__ if (idx != 0) { throw std::invalid_argument("Invalid index"); @@ -153,7 +153,7 @@ HOSTDEVICE int& indexer<1>(Dim<1>& dim, int idx) { } template -HOSTDEVICE int indexer(const Dim& dim, int idx) { +HOSTDEVICE int64_t indexer(const Dim& dim, int idx) { #ifndef __CUDA_ARCH__ if (idx < 0) { throw std::invalid_argument("Tried to access a negative dimension"); @@ -168,7 +168,7 @@ HOSTDEVICE int indexer(const Dim& dim, int idx) { } template <> -HOSTDEVICE int indexer<1>(const Dim<1>& dim, int idx) { +HOSTDEVICE int64_t indexer<1>(const Dim<1>& dim, int idx) { #ifndef __CUDA_ARCH__ if (idx != 0) { throw std::invalid_argument("Invalid index"); @@ -182,73 +182,76 @@ HOSTDEVICE int indexer<1>(const Dim<1>& dim, int idx) { } // namespace // Static access to constant Dim template -HOSTDEVICE int get(const Dim& d) { +HOSTDEVICE int64_t 
get(const Dim& d) { return DimGetter::impl(d); } // Static access to mutable Dim template -HOSTDEVICE int& get(Dim& d) { +HOSTDEVICE int64_t& get(Dim& d) { return DimGetter::impl(d); } // Dynamic access to constant Dim template -HOSTDEVICE int Dim::operator[](int i) const { +HOSTDEVICE int64_t Dim::operator[](int i) const { return indexer(*this, i); } // Dynamic access to mutable Dim template -HOSTDEVICE int& Dim::operator[](int i) { +HOSTDEVICE int64_t& Dim::operator[](int i) { return indexer(*this, i); } // Dynamic access to constant Dim -inline HOSTDEVICE int Dim<1>::operator[](int i) const { +inline HOSTDEVICE int64_t Dim<1>::operator[](int i) const { return indexer(*this, i); } // Dynamic access to mutable Dim -inline HOSTDEVICE int& Dim<1>::operator[](int i) { return indexer(*this, i); } +inline HOSTDEVICE int64_t& Dim<1>::operator[](int i) { + return indexer(*this, i); +} // Dynamic access to constant Dim // without std::enable_if will try to instantiate this on get<0>(d) template -HOSTDEVICE typename std::enable_if<(l > 0), int>::type get(const Dim& d, - int i) { +HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim& d, + int i) { return d[i]; } // Dynamic access to mutable Dim template -HOSTDEVICE typename std::enable_if<(l > 0), int&>::type get(Dim& d, int i) { +HOSTDEVICE typename std::enable_if<(l > 0), int64_t&>::type get(Dim& d, + int i) { return d[i]; } // Dot product of two dims template -HOSTDEVICE int linearize(const Dim& a, const Dim& b) { +HOSTDEVICE int64_t linearize(const Dim& a, const Dim& b) { return a.head * b.head + linearize(a.tail, b.tail); } // Base case dot product of two Dims // Notice it is inline because it is no longer a template template <> -HOSTDEVICE inline int linearize(const Dim<1>& a, const Dim<1>& b) { +HOSTDEVICE inline int64_t linearize(const Dim<1>& a, const Dim<1>& b) { return a.head * b.head; } // Product of a Dim template -HOSTDEVICE int product(const Dim& a, int prod = 1) { +HOSTDEVICE int64_t product(const Dim& a, int prod = 1) { return prod * a.head * product(a.tail); } // Base case product of a Dim // Notice it is inline because it is no longer a template template <> -HOSTDEVICE inline int product(const Dim<1>& a, int prod) { +HOSTDEVICE inline int64_t product(const Dim<1>& a, int prod) { return prod * a.head; } diff --git a/paddle/framework/dim_test.cu b/paddle/framework/dim_test.cu index 3898d0a447aa502813b3cb5e86c29eebb814ff84..0a6a87669c900de6cb507dd48f0cfc871defe279 100644 --- a/paddle/framework/dim_test.cu +++ b/paddle/framework/dim_test.cu @@ -8,7 +8,7 @@ __global__ void test(paddle::framework::Dim<2>* o) { o[0] = paddle::framework::make_dim(5, 6); } -__global__ void dyn_idx_gpu(int* o) { +__global__ void dyn_idx_gpu(int64_t* o) { auto d = paddle::framework::make_dim(5, 6); o[0] = d[1]; } @@ -47,9 +47,9 @@ TEST(Dim, Equality) { EXPECT_EQ(b[1], 11); // dynamic access on GPU - thrust::device_vector r(1); + thrust::device_vector r(1); dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data())); - int res = r[0]; + int64_t res = r[0]; EXPECT_EQ(res, 6); // ex_prefix_mul diff --git a/paddle/framework/eigen.h b/paddle/framework/eigen.h index a4667cc51fadfc020d3211b7a82356db386fced1..2d8d9ae10c56e0632414a5bbc754d35bfa9ce6a5 100644 --- a/paddle/framework/eigen.h +++ b/paddle/framework/eigen.h @@ -28,7 +28,7 @@ struct EigenDim { static Type From(const DDim& dims) { PADDLE_ENFORCE(arity(dims) == D, "D must match arity(DDim)"); Type ret; - for (int d = 0; d < arity(dims); d++) { + for (int64_t d = 0; d < arity(dims); d++) { 
ret[d] = dims[d]; } return ret; diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto index ae44a1ffd45dacdc44a72edc630e771e7a2f2990..dfcb5fb6210a08f35193b83e3b5f7cee92f618d7 100644 --- a/paddle/framework/framework.proto +++ b/paddle/framework/framework.proto @@ -22,8 +22,14 @@ enum AttrType { INTS = 3; FLOATS = 4; STRINGS = 5; + INT_PAIRS = 6; } +message IntPair { + required int32 first = 1; + required int32 second = 2; +}; + // OpDesc describes an instance of a C++ framework::OperatorBase // derived class type. message OpDesc { @@ -37,6 +43,7 @@ message OpDesc { repeated int32 ints = 6; repeated float floats = 7; repeated string strings = 8; + repeated IntPair int_pairs = 9; }; message Var { @@ -80,3 +87,24 @@ message OpProto { repeated Attr attrs = 4; required string comment = 5; } + +enum DataType { + BOOL = 0; + INT16 = 1; + INT32 = 2; + INT64 = 3; + FP16 = 4; + FP32 = 5; + FP64 = 6; +} + +message LoDTensorDesc { + required DataType data_type = 1; + repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] + optional int32 lod_level = 3 [ default = 0 ]; +} + +message VarDesc { + required string name = 1; + optional LoDTensorDesc lod_tensor = 2; +} diff --git a/paddle/framework/grad_op_builder_test.cc b/paddle/framework/grad_op_builder_test.cc index 8a817a3e13ca64d6f8df566891a1059995e041ae..9e3ca563c6765637f8471d142d32cec447f0b977 100644 --- a/paddle/framework/grad_op_builder_test.cc +++ b/paddle/framework/grad_op_builder_test.cc @@ -3,7 +3,7 @@ #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" -USE_OP(add_two); +USE_OP(add); namespace paddle { namespace framework { @@ -41,7 +41,7 @@ namespace f = paddle::framework; TEST(GradOpBuilder, AddTwo) { std::shared_ptr add_op(f::OpRegistry::CreateOp( - "add_two", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {})); + "add", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {})); std::shared_ptr grad_add_op = f::OpRegistry::CreateGradOp(*add_op); EXPECT_EQ(grad_add_op->Inputs().size(), 4UL); @@ -54,8 +54,8 @@ TEST(GradOpBuilder, AddTwo) { EXPECT_EQ(grad_add_op->Output(f::GradVarName("Y")), f::GradVarName("y")); } -REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker, f::NOP); -REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker, f::NOP); +REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker, mult_io_grad, f::NOP); +REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker, io_ignored_grad, f::NOP); TEST(GradOpBuilder, MutiInOut) { std::shared_ptr test_op(f::OpRegistry::CreateOp( diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index 71eac4a10b34c3010a2758120c25754af58f669d..908a1f2fd0abe0aa4016c72dbcbc18dcc144232c 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -19,8 +19,8 @@ namespace paddle { namespace framework { -LOD SliceLevels(const LOD& in, size_t level_begin, size_t level_end) { - LOD new_lod; +LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) { + LoD new_lod; new_lod.reserve(level_end - level_begin); for (size_t i = level_begin; i < level_end; i++) { new_lod.emplace_back(in.at(i)); @@ -28,10 +28,10 @@ LOD SliceLevels(const LOD& in, size_t level_begin, size_t level_end) { return new_lod; } -LOD SliceInLevel(const LOD& in, size_t level, size_t elem_begin, +LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, size_t elem_end) { // slice the lod. 
- LOD new_lod; + LoD new_lod; new_lod.reserve(in.size() - level); auto start = in.at(level)[elem_begin]; auto end = in.at(level)[elem_end]; @@ -46,13 +46,13 @@ LOD SliceInLevel(const LOD& in, size_t level, size_t elem_begin, std::transform(new_lod.back().begin(), new_lod.back().end(), new_lod.back().begin(), [start](int v) { return v - start; }); - PADDLE_ENFORCE_EQ(new_lod.back().front(), 0, "error in slice LOD"); + PADDLE_ENFORCE_EQ(new_lod.back().front(), 0, "error in slice LoD"); } PADDLE_ENFORCE_LE(new_lod.size(), in.size()); return new_lod; } -bool operator==(const LOD& a, const LOD& b) { +bool operator==(const LoD& a, const LoD& b) { if (a.size() != b.size()) { return false; } @@ -72,12 +72,12 @@ bool operator==(const LOD& a, const LOD& b) { return true; } -void LODTensor::SliceLevels(size_t level_begin, size_t level_end) { +void LoDTensor::SliceLevels(size_t level_begin, size_t level_end) { auto new_lod = framework::SliceLevels(lod_, level_begin, level_end); lod_ = new_lod; } -void LODTensor::SliceInLevel(size_t level, size_t elem_begin, size_t elem_end) { +void LoDTensor::SliceInLevel(size_t level, size_t elem_begin, size_t elem_end) { PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level, NumLevels()); PADDLE_ENFORCE(elem_begin < NumElements(level), diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h index 9e6b6b4aca41ed464292b56bf6f2d27514f874f7..154068fef69bc96edbd85b731fe8091b3b1ff823 100644 --- a/paddle/framework/lod_tensor.h +++ b/paddle/framework/lod_tensor.h @@ -35,34 +35,34 @@ template using Vector = thrust::host_vector; #endif -using LOD = std::vector>; +using LoD = std::vector>; -LOD SliceLevels(const LOD& in, size_t level_begin, size_t level_end); +LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end); -LOD SliceInLevel(const LOD& in, size_t level, size_t elem_begin, +LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, size_t elem_end); -bool operator==(const LOD& a, const LOD& b); +bool operator==(const LoD& a, const LoD& b); /* - * LODTensor (Level of details Tensor) + * LoDTensor (Level of details Tensor) * see https://en.wikipedia.org/wiki/Level_of_details for reference. */ -class LODTensor { +class LoDTensor { public: - LODTensor() {} - LODTensor(const LOD& lod, Tensor* t) : lod_(lod), tensor_(t) {} + LoDTensor() {} + LoDTensor(const LoD& lod, Tensor* t) : lod_(lod), tensor_(t) {} - void set_lod(const LOD& lod) { lod_ = lod; } + void set_lod(const LoD& lod) { lod_ = lod; } void set_tensor(Tensor* tensor) { tensor_ = tensor; } Tensor& tensor() { return *tensor_; } - LOD lod() { return lod_; } + LoD lod() { return lod_; } /* - * Get a element from LOD. + * Get a element from LoD. */ size_t lod_element(size_t level, size_t elem) const { PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level, @@ -74,7 +74,7 @@ class LODTensor { } /* - * Number of LODTensor's levels, each level has units of data, for example, + * Number of LoDTensor's levels, each level has units of data, for example, * in the sentence's view, article, paragraph, sentence are 3 levels. 
*/ size_t NumLevels() const { return lod_.size(); } @@ -100,7 +100,7 @@ class LODTensor { void SliceInLevel(size_t level, size_t elem_begin, size_t elem_end); private: - LOD lod_; + LoD lod_; Tensor* tensor_; // not owned }; } // namespace framework diff --git a/paddle/framework/lod_tensor.md b/paddle/framework/lod_tensor.md index 8dfe3ee823084cb8c38550a82e761a741eabe135..769b61f175a2f462258c1242d027c04c0abd12a9 100644 --- a/paddle/framework/lod_tensor.md +++ b/paddle/framework/lod_tensor.md @@ -94,7 +94,7 @@ Let's go on slicing this slice. Its <1,1>-slice is ||| ``` -### The General Slicing Algorithm +### The Slicing Algorithm The algorithm, with over-simplified data structure, is defined as @@ -106,17 +106,41 @@ struct LoDTensor { float* tensor_; }; -LoDTensor Slice(const LoDTensor& lodt, int level, int sequence) { +LoDTensor Slice(const LoDTensor& lodt, int level, int sequence); +``` + +Let us revisit the example above -} +``` + 3 +3 1 2 +3 2 4 1 2 3 +||| || |||| | || ||| ``` -### Slicing the Top Level +Suppose that we want to retrieve the <1,2>-slice -Please be aware that an RNN operator only slices the top level of a LoD Tensor to get the step inputs. +``` +2 +2 3 +|| ||| +``` -```c++ -LoDTensor Slice(const LoDTensor& lodt, int sequence) { +we will need to find out the starting position of this slice by summing over all leaf nodes in `LoD` to the left of the slice, i.e., 3 + 2 + 4 + 1 = 10. + +To avoid the traversal of the LoD tree at slcing time, we can do it at the construction time -- instead of saving the lengths of the next level in the LoD tree, we can save the starting offset of the next level. For example, above LoD Tensor can be transformed into + +``` + 0 +0 9 10 +0 3 5 9 10 12 +||| || |||| | || ||| +``` + +We don't really need the 0 on top, so the LoD Tensor could be -} +``` +0 9 10 +0 3 5 9 10 12 +||| || |||| | || ||| ``` diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc index 9a351605edb5013bdab2c6193bdd9ce401acc937..1da8553134f377f7a4fbe8008d12fe8d4a0e47f4 100644 --- a/paddle/framework/lod_tensor_test.cc +++ b/paddle/framework/lod_tensor_test.cc @@ -21,7 +21,7 @@ namespace paddle { namespace framework { -class LODTensorTester : public ::testing::Test { +class LoDTensorTester : public ::testing::Test { public: virtual void SetUp() override { // tensor's batch_size: 30 @@ -29,7 +29,7 @@ class LODTensorTester : public ::testing::Test { // 0 10 20 // 0 5 10 15 20 // 0 2 5 7 10 12 15 20 - LOD lod; + LoD lod; lod.push_back(std::vector{0, 10, 20}); lod.push_back(std::vector{0, 5, 10, 15, 20}); lod.push_back(std::vector{0, 2, 5, 7, 10, 12, 15, 17, 20}); @@ -47,21 +47,21 @@ class LODTensorTester : public ::testing::Test { protected: platform::CPUPlace place; Tensor tensor; - LODTensor lod_tensor; + LoDTensor lod_tensor; }; -TEST_F(LODTensorTester, NumLevels) { ASSERT_EQ(lod_tensor.NumLevels(), 3UL); } +TEST_F(LoDTensorTester, NumLevels) { ASSERT_EQ(lod_tensor.NumLevels(), 3UL); } -TEST_F(LODTensorTester, NumElements) { +TEST_F(LoDTensorTester, NumElements) { ASSERT_EQ(lod_tensor.NumElements(0), 2UL); ASSERT_EQ(lod_tensor.NumElements(1), 4UL); ASSERT_EQ(lod_tensor.NumElements(2), 8UL); } -TEST_F(LODTensorTester, SliceLevels) { +TEST_F(LoDTensorTester, SliceLevels) { // slice 1 level for (size_t level = 0; level < 3UL; ++level) { - LODTensor new_lod_tensor = lod_tensor; + LoDTensor new_lod_tensor = lod_tensor; new_lod_tensor.SliceLevels(level, level + 1); ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL); ASSERT_EQ(new_lod_tensor.NumElements(0), 
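The offset representation described in `lod_tensor.md` above is cheap to build once at construction time. Below is a minimal sketch, assuming `LoD` is a vector of `size_t` vectors as declared in `lod_tensor.h`; the helper name is invented for illustration. With offsets stored, locating the start of the `<1,2>`-slice is a single lookup instead of the 3 + 2 + 4 + 1 summation shown above.

```c++
#include <vector>

using LoD = std::vector<std::vector<size_t>>;

// Turn per-level sequence *lengths* into the *offset* form discussed above,
// e.g. the leaf lengths {3, 2, 4, 1, 2, 3} become {0, 3, 5, 9, 10, 12}.
LoD LengthsToOffsets(const std::vector<std::vector<size_t>>& lengths) {
  LoD lod;
  lod.reserve(lengths.size());
  for (const auto& level : lengths) {
    std::vector<size_t> offsets(level.size() + 1, 0);
    for (size_t i = 0; i < level.size(); ++i) {
      offsets[i + 1] = offsets[i] + level[i];  // running prefix sum
    }
    lod.emplace_back(std::move(offsets));
  }
  return lod;
}
```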
lod_tensor.NumElements(level)); @@ -70,7 +70,7 @@ TEST_F(LODTensorTester, SliceLevels) { } // slice 2 level for (size_t level = 0; level < 2UL; ++level) { - LODTensor new_lod_tensor = lod_tensor; + LoDTensor new_lod_tensor = lod_tensor; new_lod_tensor.SliceLevels(level, level + 2); ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL); ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor.NumElements(level)); @@ -80,9 +80,9 @@ TEST_F(LODTensorTester, SliceLevels) { } } -TEST_F(LODTensorTester, SliceInLevel) { +TEST_F(LoDTensorTester, SliceInLevel) { size_t level = 0; - LODTensor new_lod_tensor = lod_tensor; + LoDTensor new_lod_tensor = lod_tensor; new_lod_tensor.SliceInLevel(level, 0, 2); EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL); EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL); diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 64c7f23ab6b79bad9533f566ca39db3cfd5ac5c5..572dff860a306bb03ba9e6702fec85e4a2ea1b54 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -33,7 +33,8 @@ namespace framework { class OpRegistry { public: template - static void RegisterOp(const std::string& op_type) { + static void RegisterOp(const std::string& op_type, + const std::string& grad_op_type) { PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type), "'%s' is registered more than once.", op_type); OpInfo op_info; @@ -42,9 +43,9 @@ class OpRegistry { const VariableNameMap& outputs, const AttributeMap& attrs) { return new OpType(type, inputs, outputs, attrs); }; + op_info.grad_op_type_ = grad_op_type; if (std::type_index(typeid(ProtoMakerType)) != std::type_index(typeid(NOPMaker))) { - op_info.grad_op_type_ = op_type + "_grad"; op_info.proto_ = new OpProto; op_info.checker_ = new OpAttrChecker; auto maker = ProtoMakerType(op_info.proto_, op_info.checker_); @@ -54,14 +55,15 @@ class OpRegistry { op_info.proto_->IsInitialized(), "Fail to initialize %s's OpProto, because %s is not initialized", op_type, op_info.proto_->InitializationErrorString()); - // register gradient op - RegisterOp(op_info.grad_op_type_); } else { - op_info.grad_op_type_ = ""; op_info.proto_ = nullptr; op_info.checker_ = nullptr; } OpInfoMap::Instance().Insert(op_type, op_info); + // register gradient op + if (!grad_op_type.empty()) { + RegisterOp(grad_op_type, ""); + } } static std::unique_ptr CreateOp(const std::string& type, @@ -90,8 +92,10 @@ class Registrar { template class OpRegistrar : public Registrar { public: - explicit OpRegistrar(const char* op_type) { - OpRegistry::RegisterOp(op_type); + explicit OpRegistrar(const char* op_type) { OpRegistrar(op_type, ""); } + OpRegistrar(const char* op_type, const char* grad_op_type) { + OpRegistry::RegisterOp(op_type, + grad_op_type); } }; @@ -117,7 +121,8 @@ class OpKernelRegistrar : public Registrar { /** * Macro to register Operator. 
*/ -#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_class) \ +#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class) \ STATIC_ASSERT_GLOBAL_NAMESPACE( \ __reg_op__##op_type, "REGISTER_OP must be called in global namespace"); \ class _OpClass_##op_type##_ : public op_class { \ @@ -132,14 +137,14 @@ class OpKernelRegistrar : public Registrar { }; \ static ::paddle::framework::OpRegistrar< \ _OpClass_##op_type##_, op_maker_class, _OpGradClass_##op_type##_> \ - __op_registrar_##op_type##__(#op_type); \ + __op_registrar_##op_type##__(#op_type, #grad_op_type); \ int TouchOpRegistrar_##op_type() { \ __op_registrar_##op_type##__.Touch(); \ return 0; \ } #define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \ - REGISTER_OP(op_type, op_class, op_maker_class, ::paddle::framework::NOP) + REGISTER_OP(op_type, op_class, op_maker_class, , ::paddle::framework::NOP) /** * Macro to register OperatorKernel. @@ -194,6 +199,8 @@ class OpKernelRegistrar : public Registrar { USE_OP_DEVICE_KERNEL(op_type, GPU) #endif +#define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type); + #define USE_CPU_ONLY_OP(op_type) \ USE_OP_ITSELF(op_type); \ USE_OP_DEVICE_KERNEL(op_type, CPU); diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index 50c45919c53af22665feeeebe753da283ded2b0c..e00c6e8d904508ec9985537fc703c7c61a14e0de 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -21,7 +21,7 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { AddOutput("output", "output of cosine op"); AddAttr("scale", "scale of cosine op") .SetDefault(1.0) - .LargerThan(0.0); + .GreaterThan(0.0); AddComment("This is cos op"); } }; @@ -80,7 +80,7 @@ TEST(OpRegistry, CreateOp) { paddle::framework::Scope scope; paddle::platform::CPUDeviceContext dev_ctx; op->Run(scope, dev_ctx); - float scale_get = op->GetAttr("scale"); + float scale_get = op->Attr("scale"); ASSERT_EQ(scale_get, scale); } @@ -121,7 +121,7 @@ TEST(OpRegistry, DefaultValue) { paddle::framework::Scope scope; paddle::platform::CPUDeviceContext dev_ctx; op->Run(scope, dev_ctx); - ASSERT_EQ(op->GetAttr("scale"), 1.0); + ASSERT_EQ(op->Attr("scale"), 1.0); } TEST(OpRegistry, CustomChecker) { @@ -172,38 +172,6 @@ TEST(OpRegistry, CustomChecker) { paddle::platform::CPUDeviceContext dev_ctx; paddle::framework::Scope scope; op->Run(scope, dev_ctx); - int test_attr = op->GetAttr("test_attr"); + int test_attr = op->Attr("test_attr"); ASSERT_EQ(test_attr, 4); -} - -class TestAttrProtoMaker : public pd::OpProtoAndCheckerMaker { - public: - TestAttrProtoMaker(pd::OpProto* proto, pd::OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddAttr("scale", "scale of test op"); - AddAttr("scale", "scale of test op"); - } -}; - -TEST(ProtoMaker, DuplicatedAttr) { - pd::OpProto op_proto; - pd::OpAttrChecker op_checker; - auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker); - ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); -} - -class TestInOutProtoMaker : public pd::OpProtoAndCheckerMaker { - public: - TestInOutProtoMaker(pd::OpProto* proto, pd::OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("input", "input of test op"); - AddInput("input", "input of test op"); - } -}; - -TEST(ProtoMaker, DuplicatedInOut) { - pd::OpProto op_proto; - pd::OpAttrChecker op_checker; - auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker); - 
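To summarize the registry change above: `REGISTER_OP` now takes the gradient operator's type name explicitly instead of deriving `<op>_grad` inside `RegisterOp`. The snippet below is a usage sketch with made-up operator names; the concrete registrations later in this patch (`add`, `mul`, `scale`, and so on) follow the same pattern.

```c++
// Usage sketch only; MyOp, MyOpMaker and MyOpGrad are hypothetical classes.
// The fourth argument names the gradient operator registered alongside.
REGISTER_OP(my_op, ops::MyOp, ops::MyOpMaker, my_op_grad, ops::MyOpGrad);

// Operators without a gradient pass an empty gradient type through the
// convenience macro, which forwards an empty name to REGISTER_OP.
REGISTER_OP_WITHOUT_GRADIENT(my_fwd_only_op, ops::MyFwdOnlyOp,
                             ops::MyFwdOnlyOpMaker);

// Consumers of a kernel-less operator (for example the NetOp-based identity
// alias below) pull it in with the new macro:
USE_NO_KERNEL_OP(identity);
```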
ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); -} +} \ No newline at end of file diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 590e335fdc8843ed9edd01a09605163de93f52d9..9a98d4d3be0d1cb875d614b263f1e4365ede4113 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -69,7 +69,7 @@ class OperatorBase { virtual ~OperatorBase() {} template - inline const T& GetAttr(const std::string& name) const { + inline const T& Attr(const std::string& name) const { PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", name); return boost::get(attrs_.at(name)); @@ -233,6 +233,15 @@ class InferShapeContext { InferShapeContext(const OperatorBase& op, const Scope& scope) : op_(op), scope_(scope) {} + const OperatorBase& op() const { return op_; } + + const Scope& scope() const { return scope_; } + + template + inline const T& Attr(const std::string& name) const { + return op_.Attr(name); + } + size_t InputSize(const std::string& name) const { return op_.Inputs(name).size(); } @@ -314,6 +323,7 @@ class InferShapeContext { return res; } + private: const OperatorBase& op_; const Scope& scope_; }; diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 1d7efb7b9403f7c1c6bdbb27a0258f79ae032f43..20bbb11896a4c6f11079669f0b25773f6460594d 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -102,7 +102,7 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { AddOutput("y", "output of test op"); AddAttr("scale", "scale of cosine op") .SetDefault(1.0) - .LargerThan(0.0); + .GreaterThan(0.0); AddComment("This is test op"); } }; @@ -122,10 +122,10 @@ class CPUKernelTest : public OpKernel { public: void Compute(const ExecutionContext& ctx) const { std::cout << "this is cpu kernel" << std::endl; - std::cout << ctx.op_.DebugString() << std::endl; + std::cout << ctx.op().DebugString() << std::endl; cpu_kernel_run_num++; - ASSERT_EQ(ctx.op_.Input("x"), "IN1"); - ASSERT_EQ(ctx.op_.Output("y"), "OUT1"); + ASSERT_EQ(ctx.op().Input("x"), "IN1"); + ASSERT_EQ(ctx.op().Output("y"), "OUT1"); } }; @@ -140,7 +140,7 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker AddOutput("ys", "outputs of test op").AsDuplicable(); AddAttr("scale", "scale of cosine op") .SetDefault(1.0) - .LargerThan(0.0); + .GreaterThan(0.0); AddComment("This is test op"); } }; @@ -148,7 +148,7 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker class CPUKernalMultiInputsTest : public OpKernel { public: void Compute(const ExecutionContext& ctx) const { - auto xs = ctx.op_.Inputs("xs"); + auto xs = ctx.op().Inputs("xs"); ASSERT_EQ(xs.size(), 3UL); ASSERT_EQ(xs[0], "x0"); ASSERT_EQ(xs[1], "x1"); @@ -172,10 +172,10 @@ class CPUKernalMultiInputsTest : public OpKernel { auto outTensor0 = ctx.MultiOutput("ys"); ASSERT_EQ(outTensor0.size(), 2U); - auto k = ctx.op_.Input("k"); + auto k = ctx.op().Input("k"); ASSERT_EQ(k, "k0"); - auto ys = ctx.op_.Outputs("ys"); + auto ys = ctx.op().Outputs("ys"); ASSERT_EQ(ys.size(), 2UL); ASSERT_EQ(ys[0], "y0"); ASSERT_EQ(ys[1], "y1"); @@ -263,4 +263,38 @@ TEST(Operator, Clone) { OperatorClone a("ABC", {}, {}, {}); auto b = a.Clone(); ASSERT_EQ(a.Type(), b->Type()); +} + +class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + TestAttrProtoMaker(paddle::framework::OpProto* proto, + paddle::framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("scale", "scale of test op"); + 
AddAttr("scale", "scale of test op"); + } +}; + +TEST(ProtoMaker, DuplicatedAttr) { + paddle::framework::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; + auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker); + ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); +} + +class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + TestInOutProtoMaker(paddle::framework::OpProto* proto, + paddle::framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of test op"); + AddInput("input", "input of test op"); + } +}; + +TEST(ProtoMaker, DuplicatedInOut) { + paddle::framework::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; + auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker); + ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); } \ No newline at end of file diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index 7893e233b776425a61d9e3edd43d944a27743188..94f436294f350e2a39785a09959efb3b17bd00a5 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -58,7 +58,7 @@ inline T* Tensor::mutable_data(platform::Place place) { "Tensor's numel must be larger than zero to call " "Tensor::mutable_data. Call Tensor::set_dim first."); /* some versions of boost::variant don't have operator!= */ - size_t size = product(dims_) * sizeof(T); + int64_t size = product(dims_) * sizeof(T); if (holder_ == nullptr || !(holder_->place() == place) || holder_->size() < size + offset_) { if (platform::is_cpu_place(place)) { @@ -131,7 +131,7 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const { PADDLE_ENFORCE_LT(begin_idx, end_idx, "Begin index must be less than end index."); PADDLE_ENFORCE_NE(dims_[0], 1, "Can not slice a tensor with dims_[0] = 1."); - int base = product(dims_) / dims_[0]; + size_t base = product(dims_) / dims_[0]; Tensor dst; dst.holder_ = holder_; DDim dst_dims = dims_; diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp index 132119015f967c6e8d055792de8afe8450df5ec6..92087fa32b1e48b50fbf447ec6f3c43e2a510220 100644 --- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp +++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp @@ -14,18 +14,20 @@ limitations under the License. */ #include "Evaluator.h" #include "paddle/gserver/gradientmachines/NeuralNetwork.h" +#include "paddle/utils/StringUtil.h" namespace paddle { /** * calculate sequence-to-sequence edit distance */ -class CTCErrorEvaluator : public NotGetableEvaluator { +class CTCErrorEvaluator : public Evaluator { private: MatrixPtr outActivations_; int numTimes_, numClasses_, numSequences_, blank_; real deletions_, insertions_, substitutions_; int seqClassficationError_; + mutable std::unordered_map evalResults_; std::vector path2String(const std::vector& path) { std::vector str; @@ -183,6 +185,18 @@ private: return stringAlignment(gtStr, recogStr); } + void storeLocalValues() const { + evalResults_["error"] = numSequences_ ? totalScore_ / numSequences_ : 0; + evalResults_["deletion_error"] = + numSequences_ ? deletions_ / numSequences_ : 0; + evalResults_["insertion_error"] = + numSequences_ ? insertions_ / numSequences_ : 0; + evalResults_["substitution_error"] = + numSequences_ ? 
substitutions_ / numSequences_ : 0; + evalResults_["sequence_error"] = + (real)seqClassficationError_ / numSequences_; + } + public: CTCErrorEvaluator() : numTimes_(0), @@ -245,16 +259,12 @@ public: } virtual void printStats(std::ostream& os) const { - os << config_.name() << "=" - << (numSequences_ ? totalScore_ / numSequences_ : 0); - os << " deletions error" - << "=" << (numSequences_ ? deletions_ / numSequences_ : 0); - os << " insertions error" - << "=" << (numSequences_ ? insertions_ / numSequences_ : 0); - os << " substitutions error" - << "=" << (numSequences_ ? substitutions_ / numSequences_ : 0); - os << " sequences error" - << "=" << (real)seqClassficationError_ / numSequences_; + storeLocalValues(); + os << config_.name() << " error = " << evalResults_["error"]; + os << " deletions error = " << evalResults_["deletion_error"]; + os << " insertions error = " << evalResults_["insertion_error"]; + os << " substitution error = " << evalResults_["substitution_error"]; + os << " sequence error = " << evalResults_["sequence_error"]; } virtual void distributeEval(ParameterClient2* client) { @@ -272,6 +282,37 @@ public: seqClassficationError_ = (int)buf[4]; numSequences_ = (int)buf[5]; } + + void getNames(std::vector* names) { + storeLocalValues(); + names->reserve(names->size() + evalResults_.size()); + for (auto it = evalResults_.begin(); it != evalResults_.end(); ++it) { + names->push_back(config_.name() + "." + it->first); + } + } + + real getValue(const std::string& name, Error* err) const { + storeLocalValues(); + + std::vector buffers; + paddle::str::split(name, '.', &buffers); + auto it = evalResults_.find(buffers[buffers.size() - 1]); + + if (it == evalResults_.end()) { + *err = Error("Evaluator does not have the key %s", name.c_str()); + return 0.0f; + } + + return it->second; + } + + std::string getType(const std::string& name, Error* err) const { + this->getValue(name, err); + if (!err->isOK()) { + return ""; + } + return "ctc_edit_distance"; + } }; REGISTER_EVALUATOR(ctc_edit_distance, CTCErrorEvaluator); diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp index 1658282f3a5f79b128ce8685e92fd5cf9db2e41a..a2ab15eedee4aaa7b47504d50e25300359f18173 100644 --- a/paddle/gserver/evaluators/ChunkEvaluator.cpp +++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp @@ -268,7 +268,13 @@ public: } // get type of evaluator - std::string getTypeImpl() const { return "chunk"; } + std::string getType(const std::string& name, Error* err) const { + this->getValue(name, err); + if (!err->isOK()) { + return ""; + } + return "chunk"; + } private: void storeLocalValues() const { diff --git a/paddle/gserver/evaluators/Evaluator.h b/paddle/gserver/evaluators/Evaluator.h index b114500e2b7c1e460a02c78b99b5f1a8fb63b8c3..90203553e0a5fe8cc8183274f374da178bae30d0 100644 --- a/paddle/gserver/evaluators/Evaluator.h +++ b/paddle/gserver/evaluators/Evaluator.h @@ -211,6 +211,7 @@ public: *err = Error("Not implemented"); return .0f; } + std::string getType(const std::string& name, Error* err) const { *err = Error("Not implemented"); return ""; @@ -331,6 +332,7 @@ private: protected: std::string getTypeImpl() const; }; + /** * @brief precision, recall and f1 score Evaluator * \f[ @@ -358,6 +360,12 @@ public: virtual void distributeEval(ParameterClient2* client); + void getNames(std::vector* names); + + real getValue(const std::string& name, Error* err) const; + + std::string getType(const std::string& name, Error* err) const; + struct StatsInfo { /// numbers of 
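With `ctc_edit_distance` now exposing its sub-metrics through the generic `getNames`/`getValue` interface rather than only `printStats`, callers can read individual fields by dotted name; `getValue` splits on `.` and looks up the last component in `evalResults_`. The snippet is an illustrative sketch, with the evaluator name `"ctc"` and the surrounding variables assumed.

```c++
// Illustrative usage of the new per-field interface; "ctc" stands for
// whatever name the evaluator was configured with.
std::vector<std::string> names;
evaluator->getNames(&names);  // e.g. "ctc.error", "ctc.sequence_error", ...

paddle::Error err;
real sequenceError = evaluator->getValue("ctc.sequence_error", &err);
if (err.isOK()) {
  LOG(INFO) << "sequence error = " << sequenceError;
}
```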
true positives double TP; @@ -428,11 +436,6 @@ private: mutable std::unordered_map values_; void storeLocalValues() const; - // Evaluator interface -public: - void getNames(std::vector* names); - real getValue(const std::string& name, Error* err) const; - std::string getType(const std::string& name, Error* err) const; }; /* diff --git a/paddle/gserver/layers/Conv3DLayer.cpp b/paddle/gserver/layers/Conv3DLayer.cpp index 7cc9937cce37cbbc4640fbb88312841c23b757c0..9deda2de989a55d34510560c49b213ea1a52fd07 100644 --- a/paddle/gserver/layers/Conv3DLayer.cpp +++ b/paddle/gserver/layers/Conv3DLayer.cpp @@ -42,10 +42,10 @@ bool Conv3DLayer::init(const LayerMap &layerMap, if (sharedBiases_) { CHECK_EQ((size_t)numFilters_, biasParameter_->getSize()); biases_ = - std::unique_ptr(new Weight(1, numFilters_, biasParameter_)); + std::unique_ptr(new Weight(numFilters_, 1, biasParameter_)); } else { biases_ = - std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + std::unique_ptr(new Weight(getSize(), 1, biasParameter_)); } } return true; @@ -83,8 +83,8 @@ void Conv3DLayer::forward(PassType passType) { int outWidth = getSize(); resetOutput(batchSize, outWidth); + REGISTER_TIMER_INFO("FwdConv3D", getName().c_str()); for (size_t i = 0; i != inputLayers_.size(); ++i) { - REGISTER_TIMER_INFO("FwdConv3D", getName().c_str()); const MatrixPtr &inMat = getInputValue(i); const MatrixPtr &outMat = getOutputValue(); int M = M_[i]; @@ -120,7 +120,6 @@ void Conv3DLayer::forward(PassType passType) { } } if (nullptr != this->biasParameter_) { - REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str()); this->addBias(); } forwardActivation(); @@ -134,15 +133,14 @@ void Conv3DLayer::backward(const UpdateCallback &callback) { biases_->getParameterPtr()->incUpdate(callback); } + REGISTER_TIMER_INFO("BwdConv3D", getName().c_str()); for (size_t i = 0; i != inputLayers_.size(); ++i) { - REGISTER_TIMER_INFO("BwdConv3D", getName().c_str()); if (weights_[i]->getWGrad()) { bpropWeights(i); } if (getInputGrad(i)) { bpropData(i); } - REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); weights_[i]->getParameterPtr()->incUpdate(callback); } } @@ -224,20 +222,31 @@ void Conv3DLayer::bpropData(int i) { } void Conv3DLayer::bpropBiases() { + MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(), + 1, + biases_->getWGrad()->getElementCnt(), + false, + useGpu_); MatrixPtr outGradMat = getOutputGrad(); + if (this->sharedBiases_) { - biases_->getWGrad()->collectSharedBias(*outGradMat, 1.0f); + biases->collectSharedBias(*outGradMat, 1.0f); } else { - biases_->getWGrad()->collectBias(*outGradMat, 1.0f); + biases->collectBias(*outGradMat, 1.0f); } } void Conv3DLayer::addBias() { MatrixPtr outMat = getOutputValue(); + MatrixPtr bias = Matrix::create(biases_->getW()->getData(), + 1, + biases_->getW()->getElementCnt(), + false, + useGpu_); if (this->sharedBiases_) { - outMat->addSharedBias(*(biases_->getW()), 1.0f); + outMat->addSharedBias(*(bias), 1.0f); } else { - outMat->addBias(*(biases_->getW()), 1.0f); + outMat->addBias(*(bias), 1.0f); } } diff --git a/paddle/gserver/layers/DeConv3DLayer.cpp b/paddle/gserver/layers/DeConv3DLayer.cpp index 7d5c772c89d260264a59f4cc4439bb8a44c605a4..1b59ed60c57fe3bbfa814befa8a63408a2621715 100644 --- a/paddle/gserver/layers/DeConv3DLayer.cpp +++ b/paddle/gserver/layers/DeConv3DLayer.cpp @@ -42,10 +42,10 @@ bool DeConv3DLayer::init(const LayerMap &layerMap, if (sharedBiases_) { CHECK_EQ((size_t)numFilters_, biasParameter_->getSize()); biases_ = - std::unique_ptr(new Weight(1, numFilters_, 
biasParameter_)); + std::unique_ptr(new Weight(numFilters_, 1, biasParameter_)); } else { biases_ = - std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + std::unique_ptr(new Weight(getSize(), 1, biasParameter_)); } } return true; @@ -84,8 +84,8 @@ void DeConv3DLayer::forward(PassType passType) { resetOutput(batchSize, outWidth); const MatrixPtr outMat = getOutputValue(); + REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str()); for (size_t i = 0; i != inputLayers_.size(); ++i) { - REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str()); const MatrixPtr &inMat = getInputValue(i); int M = M_[i]; int N = N_[i]; @@ -120,7 +120,6 @@ void DeConv3DLayer::forward(PassType passType) { } } if (nullptr != this->biasParameter_) { - REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str()); this->addBias(); } forwardActivation(); @@ -133,12 +132,12 @@ void DeConv3DLayer::backward(const UpdateCallback &callback) { bpropBiases(); biases_->getParameterPtr()->incUpdate(callback); } + REGISTER_TIMER_INFO("BwdDeConv3D", getName().c_str()); for (size_t i = 0; i < inputLayers_.size(); ++i) { if (weights_[i]->getWGrad() || this->needGradient_) { int M = M_[i]; int N = N_[i]; int K = K_[i]; - REGISTER_TIMER_INFO("BwdDeConv3D", getName().c_str()); Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_); const MatrixPtr &inMat = getInputValue(i); for (int n = 0; n < batchSize; ++n) { @@ -182,7 +181,6 @@ void DeConv3DLayer::backward(const UpdateCallback &callback) { } } } - REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); weights_[i]->getParameterPtr()->incUpdate(callback); } } @@ -191,21 +189,31 @@ void DeConv3DLayer::bpropWeights(int i) {} void DeConv3DLayer::bpropData(int i) {} void DeConv3DLayer::bpropBiases() { + MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(), + 1, + biases_->getWGrad()->getElementCnt(), + false, + useGpu_); const MatrixPtr &outGradMat = getOutputGrad(); if (this->sharedBiases_) { - biases_->getWGrad()->collectSharedBias(*outGradMat, 1.0f); + biases->collectSharedBias(*outGradMat, 1.0f); } else { - biases_->getWGrad()->collectBias(*outGradMat, 1.0f); + biases->collectBias(*outGradMat, 1.0f); } } void DeConv3DLayer::addBias() { MatrixPtr outMat = getOutputValue(); + MatrixPtr bias = Matrix::create(biases_->getW()->getData(), + 1, + biases_->getW()->getElementCnt(), + false, + useGpu_); if (this->sharedBiases_) { - outMat->addSharedBias(*(biases_->getW()), 1.0f); + outMat->addSharedBias(*(bias), 1.0f); } else { - outMat->addBias(*(biases_->getW()), 1.0f); + outMat->addBias(*(bias), 1.0f); } } diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index e5efcccb0e219a1c9df888cfec7f8902806676d4..f9ea25ab045a02be5ab9ed81ef9c679126d3a188 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -14,27 +14,31 @@ function(op_library TARGET) cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - foreach(src ${op_library_SRCS}) - if (${src} MATCHES ".*\\.cu$") - list(APPEND cu_srcs ${src}) - elseif(${src} MATCHES ".*\\.cc$") - list(APPEND cc_srcs ${src}) - else() - message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu") + list(LENGTH op_library_SRCS op_library_SRCS_len) + if (${op_library_SRCS_len} EQUAL 0) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) + list(APPEND cc_srcs ${TARGET}.cc) endif() - endforeach() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) + list(APPEND cu_srcs ${TARGET}.cu) + endif() + else() + foreach(src 
${op_library_SRCS}) + if (${src} MATCHES ".*\\.cu$") + list(APPEND cu_srcs ${src}) + elseif(${src} MATCHES ".*\\.cc$") + list(APPEND cc_srcs ${src}) + else() + message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu") + endif() + endforeach() + endif() list(LENGTH cc_srcs cc_srcs_len) if (${cc_srcs_len} EQUAL 0) message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") endif() - list(LENGTH cu_srcs cu_srcs_len) - list(LENGTH op_library_DEPS dep_len) - if (${cu_srcs_len} EQUAL 0 AND ${dep_len} EQUAL 0) - message(WARNING "The op library ${TARGET} not support GPU!") - endif() - if (WITH_GPU) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) @@ -46,22 +50,22 @@ endfunction() add_subdirectory(math) -list(REMOVE_ITEM GENERAL_OPS - net_op - minus_op - mul_op - recurrent_op - scale_op) - -op_library(net_op SRCS net_op.cc) -op_library(minus_op SRCS minus_op.cc minus_op.cu DEPS scale_op) -op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function) -op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc +set(DEPS_OPS + identity_op + minus_op + mul_op + recurrent_op + scale_op) +op_library(identity_op DEPS scale_op) +op_library(minus_op DEPS scale_op) +op_library(mul_op DEPS math_function) +op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc DEPS framework_proto tensor operator net_op) -op_library(scale_op SRCS scale_op.cc scale_op.cu DEPS net_op) +op_library(scale_op DEPS net_op) +list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) - op_library(${src} SRCS ${src}.cc ${src}.cu) + op_library(${src}) endforeach() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc index 6384d8c8ce13dae8b58ed1069d496dd8e93eaa8a..8dbd47cf0dfbc265032a9966343eed5c7bd8692e 100644 --- a/paddle/operators/add_op.cc +++ b/paddle/operators/add_op.cc @@ -57,7 +57,6 @@ class AddOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(add_two, ops::AddOp, ops::AddOpMaker, ops::AddOpGrad); +REGISTER_OP(add, ops::AddOp, ops::AddOpMaker, add_grad, ops::AddOpGrad); -REGISTER_OP_CPU_KERNEL(add_two, - ops::AddKernel); +REGISTER_OP_CPU_KERNEL(add, ops::AddKernel); diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu index cec5f558cbc161124620ad4241d6bd8a5324277c..d9c6d20a6c320b59e57ed25da3dd8b093833f8c7 100644 --- a/paddle/operators/add_op.cu +++ b/paddle/operators/add_op.cu @@ -12,10 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -#define EIGEN_USE_GPU -#include "paddle/framework/op_registry.h" #include "paddle/operators/add_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(add_two, - ops::AddKernel); +REGISTER_OP_GPU_KERNEL(add, ops::AddKernel); diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c033af3b741ae26ad9d37b2164f87aa6e8651c6e --- /dev/null +++ b/paddle/operators/cos_sim_op.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/cos_sim_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class CosSimOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) must not be null."); + PADDLE_ENFORCE_EQ(ctx.Input("X")->dims(), + ctx.Input("Y")->dims(), + "Dimensions of Input(X) and Input(Y) must be the same."); + + auto dims = ctx.Input("X")->dims(); + ctx.Output("Out")->Resize({dims[0], 1}); + ctx.Output("XNorm")->Resize({dims[0], 1}); + ctx.Output("YNorm")->Resize({dims[0], 1}); + } +}; + +class CosSimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CosSimOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The first input of cos_sim op."); + AddInput("Y", "The second input of cos_sim op."); + AddOutput("Out", "The output of cos_sim op."); + AddOutput("XNorm", "Row norm of the first input.").AsIntermediate(); + AddOutput("YNorm", "Row norm of the second input.").AsIntermediate(); + + AddComment(R"DOC( +Cosine Similarity Operator. + +The equation is: Out = X^T * Y / (sqrt(X^T * X) * sqrt(Y^T * Y)) +)DOC"); + } +}; + +class CosSimOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) must not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("XNorm"), + "Input(XNorm) must not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("YNorm"), + "Input(YNorm) must not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), + "Input(Out@GRAD) must not be null."); + + auto x_dims = ctx.Input("X")->dims(); + auto y_dims = ctx.Input("Y")->dims(); + auto xnorm_dims = ctx.Input("XNorm")->dims(); + auto ynorm_dims = ctx.Input("YNorm")->dims(); + auto out_dims = ctx.Input(framework::GradVarName("Out"))->dims(); + PADDLE_ENFORCE_EQ(x_dims, y_dims, + "Dimensions of Input(X) and Input(Y) must be the same."); + PADDLE_ENFORCE_EQ(xnorm_dims[0], x_dims[0], + "1st dimension of XNorm must equal that of Input(X)."); + PADDLE_ENFORCE_EQ(xnorm_dims[1], 1, "2st dimension of XNorm must be one."); + PADDLE_ENFORCE_EQ(ynorm_dims[0], y_dims[0], + "1st dimension of YNorm must equal that of Input(Y)."); + PADDLE_ENFORCE_EQ(ynorm_dims[1], 1, "2st dimension of YNorm must be one."); + PADDLE_ENFORCE_EQ(out_dims[0], x_dims[0], + "1st dimension of Out@GRAD must equal that of Input(X)"); + PADDLE_ENFORCE_EQ(out_dims[1], 1, "1st dimension of Out@GRAD must be one."); + + auto *x_grad = ctx.Output(framework::GradVarName("X")); + auto *y_grad = ctx.Output(framework::GradVarName("Y")); + if (x_grad) x_grad->Resize(x_dims); + if 
(y_grad) y_grad->Resize(y_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad, + ops::CosSimOpGrad); +REGISTER_OP_CPU_KERNEL(cos_sim, + ops::CosSimKernel); +REGISTER_OP_CPU_KERNEL( + cos_sim_grad, ops::CosSimGradKernel); diff --git a/paddle/operators/gather_op.cu b/paddle/operators/cos_sim_op.cu similarity index 72% rename from paddle/operators/gather_op.cu rename to paddle/operators/cos_sim_op.cu index 3f04a7b3f8142106917975cd1e0413fa1633a298..0cb8fd26de47a4a464db98664263544e3e503d63 100644 --- a/paddle/operators/gather_op.cu +++ b/paddle/operators/cos_sim_op.cu @@ -13,8 +13,10 @@ limitations under the License. */ #define EIGEN_USE_GPU -#include "paddle/operators/gather_op.h" +#include "paddle/operators/cos_sim_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(gather, - ops::GatherOpKernel); +REGISTER_OP_GPU_KERNEL(cos_sim, + ops::CosSimKernel); +REGISTER_OP_GPU_KERNEL( + cos_sim_grad, ops::CosSimGradKernel); diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9e2bcebe3b5432c157fac895a9bbab5164193dbb --- /dev/null +++ b/paddle/operators/cos_sim_op.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; +template +using EigenVector = framework::EigenVector; + +template +class CosSimKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input_x = context.Input("X"); + auto* input_y = context.Input("Y"); + auto* output_z = context.Output("Out"); + auto* output_x_norm = context.Output("XNorm"); + auto* output_y_norm = context.Output("YNorm"); + + output_z->mutable_data(context.GetPlace()); + output_x_norm->mutable_data(context.GetPlace()); + output_y_norm->mutable_data(context.GetPlace()); + + auto dims = input_x->dims(); + int size = static_cast(framework::product(dims)); + auto new_dims = framework::make_ddim({dims[0], size / dims[0]}); + auto x = EigenMatrix::From(*input_x, new_dims); + auto y = EigenMatrix::From(*input_y, new_dims); + auto z = EigenVector::Flatten(*output_z); + auto x_norm = EigenVector::Flatten(*output_x_norm); + auto y_norm = EigenVector::Flatten(*output_y_norm); + + auto place = context.GetEigenDevice(); + auto xy = (x * y).sum(Eigen::array({{1}})); + x_norm.device(place) = x.square().sum(Eigen::array({{1}})).sqrt(); + y_norm.device(place) = y.square().sum(Eigen::array({{1}})).sqrt(); + z.device(place) = xy / x_norm / y_norm; + } +}; + +template +class CosSimGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input_x = context.Input("X"); + auto* input_y = context.Input("Y"); + auto* input_z = context.Input("Out"); + auto* input_x_norm = context.Input("XNorm"); + auto* input_y_norm = context.Input("YNorm"); + auto* output_grad_x = context.Output(framework::GradVarName("X")); + auto* output_grad_y = context.Output(framework::GradVarName("Y")); + auto* input_grad_z = context.Input(framework::GradVarName("Out")); + + auto dims = input_x->dims(); + int size = static_cast(framework::product(dims)); + auto new_dims = framework::make_ddim({dims[0], size / dims[0]}); + auto x = EigenMatrix::From(*input_x, new_dims); + auto y = EigenMatrix::From(*input_y, new_dims); + auto z = EigenMatrix::From(*input_z); + auto x_norm = EigenMatrix::From(*input_x_norm); + auto y_norm = EigenMatrix::From(*input_y_norm); + auto dz = EigenMatrix::From(*input_grad_z); + + Eigen::DSizes bcast(1, new_dims[1]); + auto z_bcast = z.broadcast(bcast); + auto dz_bcast = dz.broadcast(bcast); + auto place = context.GetEigenDevice(); + auto x_snorm_bcast = x_norm.square().eval().broadcast(bcast); + auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast); + auto norm_prod_bcast = (x_norm * y_norm).eval().broadcast(bcast); + if (output_grad_x) { + output_grad_x->mutable_data(context.GetPlace()); + auto dx = EigenMatrix::From(*output_grad_x, new_dims); + dx.device(place) = + dz_bcast * (y / norm_prod_bcast - z_bcast * x / x_snorm_bcast); + } + if (output_grad_y) { + output_grad_y->mutable_data(context.GetPlace()); + auto dy = EigenMatrix::From(*output_grad_y, new_dims); + dy.device(place) = + dz_bcast * (x / norm_prod_bcast - z_bcast * y / y_snorm_bcast); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index ac76326262c88e2014cf64f7fb73b5a7338ab3e9..ab1e1c101a10e09a81f7785d2f1514822e3bdf15 100644 --- 
a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -67,7 +67,8 @@ OnehotCrossEntropy Operator. namespace ops = paddle::operators; REGISTER_OP(onehot_cross_entropy, ops::OnehotCrossEntropyOp, - ops::OnehotCrossEntropyOpMaker, ops::OnehotCrossEntropyGradientOp); + ops::OnehotCrossEntropyOpMaker, onehot_cross_entropy_grad, + ops::OnehotCrossEntropyGradientOp); REGISTER_OP_CPU_KERNEL(onehot_cross_entropy, ops::OnehotCrossEntropyOpKernel); REGISTER_OP_CPU_KERNEL(onehot_cross_entropy_grad, diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc index 07fa704824174f939e459093b245036771d9cd4f..123bed296c462c30bddd3bfbd530098fdbfe4856 100644 --- a/paddle/operators/gather_op.cc +++ b/paddle/operators/gather_op.cc @@ -63,7 +63,8 @@ Out = X[Index] } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, ops::GatherGradOp); +REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad, + ops::GatherGradOp); REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc index a85363ad81d2a23e7267026c067f74f8c94c4786..6574880c0eb6324b2dd175e39a364d2ef46e735e 100644 --- a/paddle/operators/gaussian_random_op.cc +++ b/paddle/operators/gaussian_random_op.cc @@ -19,21 +19,20 @@ template class CPUGaussianRandomKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - float mean = context.op_.GetAttr("mean"); - float std = context.op_.GetAttr("std"); + float mean = context.Attr("mean"); + float std = context.Attr("std"); auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); - unsigned int seed = - static_cast(context.op_.GetAttr("seed")); + unsigned int seed = static_cast(context.Attr("seed")); std::minstd_rand engine; if (seed == 0) { seed = std::random_device()(); } engine.seed(seed); std::normal_distribution dist(mean, std); - ssize_t size = framework::product(tensor->dims()); - for (ssize_t i = 0; i < size; ++i) { + int64_t size = framework::product(tensor->dims()); + for (int64_t i = 0; i < size; ++i) { data[i] = dist(engine); } } @@ -46,10 +45,15 @@ class GaussianRandomOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext& context) const override { auto* tensor = context.Output("Out"); - auto dims = GetAttr>("dims"); + auto dims = Attr>("dims"); + std::vector temp; + temp.reserve(dims.size()); + for (auto dim : dims) { + temp.push_back(static_cast(dim)); + } PADDLE_ENFORCE(dims.size() > 0UL, "dims can be one int or array. 
dims must be set."); - tensor->Resize(framework::make_ddim(dims)); + tensor->Resize(framework::make_ddim(temp)); } }; diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/operators/gaussian_random_op.cu index 018a4bfcb26b9008c054000c91edf01e371fd82b..d9dbc1dcfe6a6676938d64be93c879ea69148018 100644 --- a/paddle/operators/gaussian_random_op.cu +++ b/paddle/operators/gaussian_random_op.cu @@ -42,14 +42,13 @@ class GPUGaussianRandomKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); - unsigned int seed = - static_cast(context.op_.GetAttr("seed")); + unsigned int seed = static_cast(context.Attr("seed")); if (seed == 0) { std::random_device rd; seed = rd(); } - T mean = static_cast(context.op_.GetAttr("mean")); - T std = static_cast(context.op_.GetAttr("std")); + T mean = static_cast(context.Attr("mean")); + T std = static_cast(context.Attr("std")); thrust::counting_iterator index_sequence_begin(0); ssize_t N = framework::product(tensor->dims()); thrust::transform(index_sequence_begin, index_sequence_begin + N, diff --git a/paddle/operators/identity_op.cc b/paddle/operators/identity_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..be956bf3b320d6beacdb0d2ca742c3e854194b19 --- /dev/null +++ b/paddle/operators/identity_op.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/net_op.h" +#include "paddle/operators/scale_op.h" + +namespace paddle { +namespace operators { + +// identity is a alias of scale op. This is also a example for creating a alias +// operator. +template +class IdentityOpMaker : public framework::OpProtoAndCheckerMaker { + public: + IdentityOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "input tensor of identity op"); + AddOutput("Out", "output tensor of identity op"); + AddComment("identity operator. 
Just a alias of scale op which scale = 1.0"); + } +}; + +template +class IdentityOp : public NetOp { + public: + IdentityOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : NetOp(type, inputs, outputs, attrs) { + AppendOp(framework::OpRegistry::CreateOp( + "scale", {{"X", {Input("X")}}}, {{"Out", {Output("Out")}}}, + {{"scale", static_cast(1)}})); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_WITHOUT_GRADIENT(identity, ops::IdentityOp, + ops::IdentityOpMaker); diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc index c3108ba8ec7ad85bd3485c135bf03e514bc66cd1..94d40890a765413e88a35a6ad995ca97ac84dcda 100644 --- a/paddle/operators/lookup_table_op.cc +++ b/paddle/operators/lookup_table_op.cc @@ -66,7 +66,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(lookup_table, ops::LookupTableOp, ops::LookupTableOpMaker, - ops::LookupTableOpGrad); + lookup_table_grad, ops::LookupTableOpGrad); REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel); REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel); diff --git a/paddle/operators/lookup_table_op.h b/paddle/operators/lookup_table_op.h index 4da8079b91624c3510cae89fd599a7035a4c7477..877b36cef4ea9cdaaaf37c97d5e5bfce55b91436 100644 --- a/paddle/operators/lookup_table_op.h +++ b/paddle/operators/lookup_table_op.h @@ -30,12 +30,12 @@ class LookupTableKernel : public framework::OpKernel { auto ids_t = context.Input("Ids"); // int tensor auto output_t = context.Output("Out"); // float tensor - size_t N = table_t->dims()[0]; - size_t D = table_t->dims()[1]; + int N = table_t->dims()[0]; + int D = table_t->dims()[1]; auto ids = ids_t->data(); auto table = table_t->data(); auto output = output_t->mutable_data(context.GetPlace()); - for (size_t i = 0; i < product(ids_t->dims()); ++i) { + for (ssize_t i = 0; i < product(ids_t->dims()); ++i) { PADDLE_ENFORCE_LT(ids[i], N); PADDLE_ENFORCE_GE(ids[i], 0); memcpy(output + i * D, table + ids[i] * D, D * sizeof(T)); @@ -51,8 +51,8 @@ class LookupTableGradKernel : public framework::OpKernel { auto d_output_t = context.Input(framework::GradVarName("Out")); auto d_table_t = context.Output(framework::GradVarName("W")); - size_t N = d_table_t->dims()[0]; - size_t D = d_table_t->dims()[1]; + int N = d_table_t->dims()[0]; + int D = d_table_t->dims()[1]; auto ids = ids_t->data(); const T* d_output = d_output_t->data(); T* d_table = d_table_t->mutable_data(context.GetPlace()); @@ -61,10 +61,10 @@ class LookupTableGradKernel : public framework::OpKernel { t.device(context.GetEigenDevice()) = t.constant(static_cast(0)); - for (size_t i = 0; i < product(ids_t->dims()); ++i) { + for (ssize_t i = 0; i < product(ids_t->dims()); ++i) { PADDLE_ENFORCE_LT(ids[i], N); PADDLE_ENFORCE_GE(ids[i], 0); - for (size_t j = 0; j < D; ++j) { + for (int j = 0; j < D; ++j) { d_table[ids[i] * D + j] += d_output[i * D + j]; } } diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc index e66e0abb25f9b933025a6d098ed9dd9eb18a47a5..d3d0e55a674587fb04f43f24d0790de4358f035a 100644 --- a/paddle/operators/mean_op.cc +++ b/paddle/operators/mean_op.cc @@ -54,7 +54,7 @@ class MeanGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradOp); 
+REGISTER_OP(mean, ops::MeanOp, ops::MeanOpMaker, mean_grad, ops::MeanGradOp); REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel); REGISTER_OP_CPU_KERNEL(mean_grad, diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc index b4afebcd97a8efff70aaaa85bc2ec5455ddd05c5..069fb5e1abc657aa02a50fde352ce88d078c36e1 100644 --- a/paddle/operators/minus_op.cc +++ b/paddle/operators/minus_op.cc @@ -79,8 +79,9 @@ class MinusGradOp : public NetOp { } // namespace paddle USE_OP(scale); -USE_OP_ITSELF(identity); +USE_NO_KERNEL_OP(identity); namespace ops = paddle::operators; -REGISTER_OP(minus, ops::MinusOp, ops::MinusOpMaker, ops::MinusGradOp); +REGISTER_OP(minus, ops::MinusOp, ops::MinusOpMaker, minus_grad, + ops::MinusGradOp); REGISTER_OP_CPU_KERNEL(minus, ops::MinusKernel); diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index 559d19e6bdc083fffebe1c82a0bebbb18dd134fd..28a47cdff2e9b7a965ff9f99e787bb8315010823 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -29,10 +29,10 @@ class MulOp : public framework::OperatorWithKernel { auto dim1 = ctx.Input("Y")->dims(); PADDLE_ENFORCE_EQ(dim0.size(), 2, "input X(%s) should be a tensor with 2 dims, a matrix", - ctx.op_.Input("X")); + ctx.op().Input("X")); PADDLE_ENFORCE_EQ(dim1.size(), 2, "input Y(%s) should be a tensor with 2 dims, a matrix", - ctx.op_.Input("Y")); + ctx.op().Input("Y")); PADDLE_ENFORCE_EQ( dim0[1], dim1[0], "First matrix's width must be equal with second matrix's height."); @@ -75,8 +75,8 @@ class MulOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE(y_dims[1] == out_dims[1], "Out@GRAD M X N must equal to Y dims 1, N "); - x_grad->Resize(x_dims); - y_grad->Resize(y_dims); + if (x_grad) x_grad->Resize(x_dims); + if (y_grad) y_grad->Resize(y_dims); } }; @@ -84,7 +84,7 @@ class MulOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpGrad); +REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); REGISTER_OP_CPU_KERNEL(mul_grad, ops::MulGradKernel); diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index 8facc0281449785bf40726f23ca2fd5d166ff272..05a79e13b3470e39a5ebd0394ba05629553a5075 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -31,13 +31,13 @@ template class MulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Y = context.Input("Y"); - auto* Z = context.Output("Out"); - Z->mutable_data(context.GetPlace()); + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* z = context.Output("Out"); + z->mutable_data(context.GetPlace()); auto* device_context = const_cast(context.device_context_); - math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); + math::matmul(*x, false, *y, false, 1, z, 0, device_context); } }; @@ -45,20 +45,24 @@ template class MulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dY = ctx.Output(framework::GradVarName("Y")); - dX->mutable_data(ctx.GetPlace()); - 
dY->mutable_data(ctx.GetPlace()); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); auto* device_context = const_cast(ctx.device_context_); - // dX = dOut * Y'. dX: M x K, dOut : M x N, Y : K x N - math::matmul(*dOut, false, *Y, true, 1, dX, 0, device_context); - // dY = X' * dOut. dY: K x N, dOut : M x N, X : M x K - math::matmul(*X, true, *dOut, false, 1, dY, 0, device_context); + if (dx) { + dx->mutable_data(ctx.GetPlace()); + // dx = dout * y'. dx: M x K, dout : M x N, y : K x N + math::matmul(*dout, false, *y, true, 1, dx, 0, device_context); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + // dy = x' * dout. dy K x N, dout : M x N, x : M x K + math::matmul(*x, true, *dout, false, 1, dy, 0, device_context); + } } }; diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index 16bd249cb3d989c695ec9378f09d48833d70be58..e826703c60ca82e1fe690eb78c3d4f92981ef3a2 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -235,5 +235,5 @@ RecurrentGradientOp::RecurrentGradientOp( } // namespace paddle REGISTER_OP_WITHOUT_GRADIENT( - recurrent_op, paddle::operators::RecurrentOp, + recurrent, paddle::operators::RecurrentOp, paddle::operators::RecurrentAlgorithmProtoAndCheckerMaker); diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc index a9b65c30f25554e54e9fd7103f240946a93566e2..97872c67ac99fbf6c9c177d52f1d4069163e8548 100644 --- a/paddle/operators/rnn/recurrent_op_utils.cc +++ b/paddle/operators/rnn/recurrent_op_utils.cc @@ -61,7 +61,7 @@ void ConcatOutputs(const std::vector& step_scopes, PADDLE_ENFORCE(step_scope_var != nullptr, "%s not in scope", outlinks[i].internal); f::DDim step_dims = step_scope_var->template GetMutable()->dims(); - std::vector dims_vec = vectorize(step_dims); + std::vector dims_vec = vectorize(step_dims); dims_vec.insert(dims_vec.begin(), seq_len); output->Resize(f::make_ddim(dims_vec)); } else { @@ -109,7 +109,7 @@ void InitArgument(const ArgumentName& name, Argument* arg, arg->step_scopes = op.Output(name.step_scopes); auto inlinks = op.Inputs(name.inlinks); - auto inlink_alias = op.GetAttr>(name.inlink_alias); + auto inlink_alias = op.Attr>(name.inlink_alias); PADDLE_ENFORCE(inlinks.size() == inlink_alias.size(), "the size of inlinks and inlink_alias don't match:%d,%d", inlinks.size(), inlink_alias.size()); @@ -121,7 +121,7 @@ void InitArgument(const ArgumentName& name, Argument* arg, } auto outlinks = op.Outputs(name.outlinks); - auto outlink_alias = op.GetAttr>(name.outlink_alias); + auto outlink_alias = op.Attr>(name.outlink_alias); PADDLE_ENFORCE(outlinks.size() == outlink_alias.size(), "the size of outlinks and outlink_alias don't match:%d,%d", outlinks.size(), outlink_alias.size()); @@ -135,8 +135,8 @@ void InitArgument(const ArgumentName& name, Argument* arg, auto boot_memories = op.Inputs(name.boot_memories); // attributes - auto memories = op.GetAttr>(name.memories); - auto pre_memories = op.GetAttr>(name.pre_memories); + auto memories = op.Attr>(name.memories); + auto pre_memories = op.Attr>(name.pre_memories); PADDLE_ENFORCE(memories.size() == boot_memories.size(), "the size of memories, boot_memories don't match:%d,%d", diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc index 63de91254f4b75587cb2fb29aeb8ff7358ba8e76..30b4b404315a9f041e21d79b75fd06307e33f7f9 100644 --- a/paddle/operators/rowwise_add_op.cc +++ b/paddle/operators/rowwise_add_op.cc @@ -64,8 
+64,10 @@ class RowwiseAddGradOp : public framework::OperatorWithKernel { auto dims0 = ctx.Input("X")->dims(); auto dims1 = ctx.Input("b")->dims(); PADDLE_ENFORCE_EQ(1, dims1.size(), "b dims should be 1") - ctx.Output(framework::GradVarName("X"))->Resize(dims0); - ctx.Output(framework::GradVarName("b"))->Resize(dims1); + auto *dx = ctx.Output(framework::GradVarName("X")); + auto *db = ctx.Output(framework::GradVarName("b")); + if (dx) dx->Resize(dims0); + if (db) db->Resize(dims1); } }; @@ -74,7 +76,7 @@ class RowwiseAddGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(rowwise_add, ops::RowwiseAddOp, ops::RowwiseAddOpMaker, - ops::RowwiseAddGradOp); + rowwise_add_grad, ops::RowwiseAddGradOp); REGISTER_OP_CPU_KERNEL( rowwise_add, ops::RowwiseAddKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h index 1cbd8bb31ad90a32d8a4e3bb59617d0b5384e470..4e926d9f2947f37b71e81c0fa592b0c66b19c640 100644 --- a/paddle/operators/rowwise_add_op.h +++ b/paddle/operators/rowwise_add_op.h @@ -51,20 +51,24 @@ template class RowwiseAddGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* dOut = context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); + auto* dout = context.Input(framework::GradVarName("Out")); + auto* dx = context.Output(framework::GradVarName("X")); auto* db = context.Output(framework::GradVarName("b")); - dX->mutable_data(context.GetPlace()); - db->mutable_data(context.GetPlace()); - auto OutGrad = EigenMatrix::From(*dOut); + auto out_grad = EigenMatrix::From(*dout); auto place = context.GetEigenDevice(); - EigenMatrix::From(*dX).device(place) = OutGrad; + if (dx) { + dx->mutable_data(context.GetPlace()); + EigenMatrix::From(*dx).device(place) = out_grad; + } - // https://eigen.tuxfamily.org/dox/unsupported/TensorBase_8h_source.html - // colwise add - Eigen::array dims{{0}}; /* dimension to reduce */ - EigenVector::Flatten(*db).device(place) = OutGrad.sum(dims); + if (db) { + db->mutable_data(context.GetPlace()); + // https://eigen.tuxfamily.org/dox/unsupported/TensorBase_8h_source.html + // colwise add + Eigen::array dims{{0}}; /* dimension to reduce */ + EigenVector::Flatten(*db).device(place) = out_grad.sum(dims); + } } }; } // namespace operators diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc index 4e039688d4d74f2a101fc91c747bd1e6ebec7ad2..3d82b345829b0a554a204ada91c807e42b71dc58 100644 --- a/paddle/operators/scale_op.cc +++ b/paddle/operators/scale_op.cc @@ -48,7 +48,7 @@ The equation is: Out = scale*X } }; -// Identity Op's gradient is identity op, too. +// Scale Op's gradient is scale op, too. // Grad(Out=scale(X)) => Grad(X) = scale(Grad(Out)) template class ScaleGradOp : public NetOp { @@ -60,46 +60,17 @@ class ScaleGradOp : public NetOp { AppendOp(framework::OpRegistry::CreateOp( "scale", {{"X", {Input(framework::GradVarName("Out"))}}}, {{"Out", {Output(framework::GradVarName("X"))}}}, - {{"scale", GetAttr("scale")}})); + {{"scale", Attr("scale")}})); CompleteAddOp(false); } }; -// identity is a alias of scale op. This is also a example for creating a alias -// operator. 
-template -class IdentityOpMaker : public framework::OpProtoAndCheckerMaker { - public: - IdentityOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "input tensor of identity op"); - AddOutput("Out", "output tensor of identity op"); - AddComment("identity operator. Just a alias of scale op which scale = 1.0"); - } -}; - -template -class IdentityOp : public NetOp { - public: - IdentityOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : NetOp(type, inputs, outputs, attrs) { - AppendOp(framework::OpRegistry::CreateOp( - "scale", {{"X", {Input("X")}}}, {{"Out", {Output("Out")}}}, - {{"scale", static_cast(1)}})); - } -}; - } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(scale, ops::ScaleOp, ops::ScaleOpMaker, +REGISTER_OP(scale, ops::ScaleOp, ops::ScaleOpMaker, scale_grad, ops::ScaleGradOp); REGISTER_OP_CPU_KERNEL(scale, ops::ScaleKernel); -REGISTER_OP_WITHOUT_GRADIENT(identity, ops::IdentityOp, - ops::IdentityOpMaker); diff --git a/paddle/operators/scale_op.h b/paddle/operators/scale_op.h index aea64f1b0428ffe79ba8d90cf79dbfd2b5ef36f4..02fbdc52bbf89c9f2acc5eeaa1197e4ccbca9d31 100644 --- a/paddle/operators/scale_op.h +++ b/paddle/operators/scale_op.h @@ -27,7 +27,7 @@ class ScaleKernel : public framework::OpKernel { auto* in = context.Input("X"); tensor->mutable_data(in->place()); - auto scale = static_cast(context.op_.GetAttr("scale")); + auto scale = static_cast(context.Attr("scale")); auto eigen_out = framework::EigenVector::Flatten(*tensor); auto eigen_in = framework::EigenVector::Flatten(*in); diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc index 35c185ad80f93d1005c1616dcffd2e61bcd54222..f901edefa22dc9a252e87116df756d04767a7162 100644 --- a/paddle/operators/scatter_op.cc +++ b/paddle/operators/scatter_op.cc @@ -77,7 +77,8 @@ Out[Index] = Ref[Index] + Updates } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, ops::ScatterGradOp); +REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad, + ops::ScatterGradOp); REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h index a0b5000ffbf54364e15f87870913926a071fa972..f8888f9c362e1c39af42236bb3a23be37aa3ae15 100644 --- a/paddle/operators/sgd_op.h +++ b/paddle/operators/sgd_op.h @@ -31,7 +31,7 @@ class SGDOpKernel : public framework::OpKernel { auto param = ctx.Input("param"); auto grad = ctx.Input("grad"); auto param_out = ctx.Output("param_out"); - float lr = ctx.op_.GetAttr("learning_rate"); + float lr = ctx.Attr("learning_rate"); param_out->mutable_data(ctx.GetPlace()); diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc index f35b7023845bac52887d81a8f5c496cb5e7193aa..761c6de8d4d2150b30b97b58da95da3d5f33db63 100644 --- a/paddle/operators/sigmoid_op.cc +++ b/paddle/operators/sigmoid_op.cc @@ -53,7 +53,8 @@ class SigmoidOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker, ops::SigmoidOpGrad); +REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker, sigmoid_grad, + ops::SigmoidOpGrad); REGISTER_OP_CPU_KERNEL(sigmoid, ops::SigmoidKernel); REGISTER_OP_CPU_KERNEL( diff --git 
a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index 471bb288fb20f113aefb2a9e13eb805b161b0631..7d062ad67c048bc6bef68121f86334eb3f1efe92 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -24,7 +24,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { PADDLE_ENFORCE(ctx.Input("X")->dims().size() == 2UL, - "The input of softmax op must be matrix"); + "The input of softmax op must be a matrix."); ctx.Output("Y")->Resize(ctx.Input("X")->dims()); } }; @@ -34,9 +34,27 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { SoftmaxOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "input of softmax"); - AddOutput("Y", "output of softmax"); - AddComment("Softmax Op"); + AddInput("X", + "The input tensor of softmax. " + "2-D with shape [batch_size, input_feature_dimensions]."); + AddOutput("Y", "The normalized values with the same shape as X."); + AddComment(R"DOC( +The input of softmax operator is a 2-D tensor with shape N x K (N is the +batch_size, K is the dimension of input feature). The output tensor has the +same shape as the input tensor. + +For each row of the input tensor, the softmax operator squashes the +K-dimensional vector of arbitrary real values to a K-dimensional vector of real +values in the range [0, 1] that add up to 1. Specifically, it computes the +exponential of each dimension and the sum of exponential values over all +dimensions of the K-dimensional vector input. The ratio of the exponential of +a given dimension to that sum is the corresponding output of the softmax +operator. + +For each row `i` and each column `j` in X, we have: + Y[i, j] = exp(X[i, j]) / sum_j(exp(X[i, j])) + +)DOC"); } }; @@ -62,7 +80,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; -REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, ops::SoftmaxOpGrad); +REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, softmax_grad, + ops::SoftmaxOpGrad); REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/operators/squared_l2_distance_op.cc b/paddle/operators/squared_l2_distance_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..dc30644a5e7e33d4289e48cac093aa5fde7e75e7 --- /dev/null +++ b/paddle/operators/squared_l2_distance_op.cc @@ -0,0 +1,118 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/operators/squared_l2_distance_op.h" + +namespace paddle { +namespace operators { + +class SquaredL2DistanceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext& ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), + "Input of SquaredL2DistanceOp " + "must be initialized."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), + "Target of SquaredL2DistanceOp " + "must be initialized."); + + auto* x = ctx.Input("X"); + auto x_dims = x->dims(); + auto* y = ctx.Input("Y"); + auto y_dims = y->dims(); + + PADDLE_ENFORCE_EQ(framework::arity(x_dims), framework::arity(y_dims), + "Tensor rank of both SquaredL2DistanceOp's " + "inputs must be the same."); + + int rank = framework::arity(x_dims); + PADDLE_ENFORCE_GE(rank, 2, "Tensor rank should be at least equal to 2."); + PADDLE_ENFORCE_EQ(framework::product(x_dims) / x_dims[0], + framework::product(y_dims) / y_dims[0], + "Product of dimensions except the first dimension of " + "input and target must be equal."); + PADDLE_ENFORCE(y_dims[0] == 1 || y_dims[0] == x_dims[0], + "First dimension of target must be equal to input " + "or to 1."); + + ctx.Output("sub_result") + ->Resize({static_cast(x_dims[0]), + static_cast(framework::product(x_dims) / x_dims[0])}); + ctx.Output("Out")->Resize({x_dims[0], 1}); + } +}; + +class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SquaredL2DistanceOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of SquaredL2DistanceOp."); + AddInput("Y", "Target of SquaredL2DistanceOp."); + AddOutput("sub_result", + "Buffering subtraction result which " + "will be reused in backward.") + .AsIntermediate(); + AddOutput("Out", "Squared L2 distance between input and target."); + AddComment(R"DOC( + SquaredL2DistanceOp will calculate the squared L2 distance between + input and target. The number of distance values equals the + first dimension of input. The first dimension of target can be equal + to that of input or to 1. If the first dimension of target is 1, + SquaredL2DistanceOp will broadcast target's first dimension to input's + first dimension. + You can decide whether to calculate the gradient of input and target.
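As an aside for readers of this patch, the behaviour described in the comment above (one squared L2 distance per row, with an optional row-broadcast target and a buffered `sub_result`) can be summarized with a small NumPy sketch. The function name and shapes below are illustrative only; they mirror what the Python tests later in this diff compute and are not part of the framework's API:

```python
import numpy as np

def squared_l2_distance_reference(x, y):
    """Reference forward pass: one squared L2 distance per row of x.

    x: (N, ...) array; y: (N, ...) or (1, ...) array (rows are broadcast).
    Returns (sub_result, out), matching the op's "sub_result" and "Out".
    """
    n = x.shape[0]
    x2 = x.reshape(n, -1)            # flatten every dimension except the first
    y2 = y.reshape(y.shape[0], -1)   # target may have first dimension 1 or N
    sub_result = x2 - y2             # NumPy broadcasts when y2 has one row
    out = (sub_result * sub_result).sum(axis=1, keepdims=True)  # shape (N, 1)
    return sub_result, out

# e.g. x of shape (32, 64) and y of shape (1, 64) give out of shape (32, 1)
```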
+ )DOC"); + } +}; + +class SquaredL2DistanceGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext& ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), + "Gradient of Out should not be null"); + auto out_dims = ctx.Input(framework::GradVarName("Out"))->dims(); + auto x_dims = ctx.Input("X")->dims(); + auto y_dims = ctx.Input("Y")->dims(); + PADDLE_ENFORCE_EQ(out_dims[0], x_dims[0], + "First dimension of output gradient and " + "input value must be equal."); + PADDLE_ENFORCE_EQ(out_dims[1], 1, + "Second dimension of output gradient " + "must be 1."); + auto* x_grad = ctx.Output(framework::GradVarName("X")); + auto* y_grad = ctx.Output(framework::GradVarName("Y")); + if (x_grad) x_grad->Resize(x_dims); + if (y_grad) y_grad->Resize(y_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(squared_l2_distance, ops::SquaredL2DistanceOp, + ops::SquaredL2DistanceOpMaker, squared_l2_distance_grad, + ops::SquaredL2DistanceGradOp); +REGISTER_OP_CPU_KERNEL( + squared_l2_distance, + ops::SquaredL2DistanceKernel); +REGISTER_OP_CPU_KERNEL( + squared_l2_distance_grad, + ops::SquaredL2DistanceGradKernel); diff --git a/paddle/operators/squared_l2_distance_op.cu b/paddle/operators/squared_l2_distance_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..3fe62f1a9cb56722ea544b0fed052ac384e799aa --- /dev/null +++ b/paddle/operators/squared_l2_distance_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/operators/squared_l2_distance_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL( + squared_l2_distance, + ops::SquaredL2DistanceKernel); +REGISTER_OP_GPU_KERNEL( + squared_l2_distance_grad, + ops::SquaredL2DistanceGradKernel); diff --git a/paddle/operators/squared_l2_distance_op.h b/paddle/operators/squared_l2_distance_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ad3347a0b35f3385c5adbcd7ceaa94fe134105e3 --- /dev/null +++ b/paddle/operators/squared_l2_distance_op.h @@ -0,0 +1,123 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; + +template +class SquaredL2DistanceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("X"); + auto* in1 = context.Input("Y"); + auto* out0 = context.Output("sub_result"); + auto* out1 = context.Output("Out"); + + auto in0_dims = in0->dims(); + auto in1_dims = in1->dims(); + + int cols = framework::product(in0_dims) / in0_dims[0]; + // reduce dimensions except the first + auto x = + EigenMatrix::From(*in0, framework::make_ddim({in0_dims[0], cols})); + auto y = + EigenMatrix::From(*in1, framework::make_ddim({in1_dims[0], cols})); + + out0->mutable_data(context.GetPlace()); + out1->mutable_data(context.GetPlace()); + auto sub_result = EigenMatrix::From(*out0); + auto z = EigenVector::Flatten(*out1); + + auto place = context.GetEigenDevice(); + auto x_dims = x.dimensions(); + auto y_dims = y.dimensions(); + // buffer the subtraction result + if (y_dims[0] == 1 && x_dims[0] > y_dims[0]) { + sub_result.device(place) = + x - + y.broadcast(Eigen::array({{static_cast(x_dims[0]), 1}})); + } else { + sub_result.device(place) = x - y; + } + auto sub_res_pow2 = sub_result * sub_result; + z.device(place) = sub_res_pow2.sum(Eigen::array({{1}})); + } +}; + +template +class SquaredL2DistanceGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("sub_result"); + auto* in1 = context.Input(framework::GradVarName("Out")); + auto* x_g = context.Output(framework::GradVarName("X")); + auto* y_g = context.Output(framework::GradVarName("Y")); + + auto sub_result = EigenMatrix::From(*in0); + auto out_grad = EigenMatrix::From(*in1); + + auto x_dims = x_g->dims(); + auto y_dims = y_g->dims(); + + int cols = framework::product(x_dims) / x_dims[0]; + // calculate gradient + auto grad_mat = 2 * + (out_grad.broadcast(Eigen::array({{1, cols}}))) * + sub_result; + + // propagate back to input + auto eigen_place = context.GetEigenDevice(); + if (x_g) { + x_g->mutable_data(context.GetPlace()); + // eigen matrix + auto x_grad = + EigenMatrix::From(*x_g, framework::make_ddim({x_dims[0], cols})); + // dimensions are the same as sub_result + x_grad.device(eigen_place) = grad_mat; + } + + if (y_g) { + y_g->mutable_data(context.GetPlace()); + + PADDLE_ENFORCE_GE(sub_result.dimensions()[0], y_dims[0], + "First dimension of gradient must be greater than " + "or equal to the first dimension of target."); + + if (sub_result.dimensions()[0] == y_dims[0]) { + auto y_grad = + EigenMatrix::From(*y_g, framework::make_ddim({y_dims[0], cols})); + y_grad.device(eigen_place) = -1 * grad_mat; + } else { + auto col_sum_res = -1 * (grad_mat.sum(Eigen::array({{0}}))); + auto y_grad = EigenVector::Flatten(*y_g); + y_grad.device(eigen_place) = col_sum_res; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc index 29491137e6d8b4bfa2d0d07d48ffed1212a6131f..f2aeef6c310df8535e67fa3906301a87f8ec4694 100644 --- a/paddle/operators/uniform_random_op.cc +++ b/paddle/operators/uniform_random_op.cc @@ -26,18 +26,17 @@ class CPUUniformRandomKernel : public framework::OpKernel { void
Compute(const framework::ExecutionContext& context) const override { auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); - unsigned int seed = - static_cast(context.op_.GetAttr("seed")); + unsigned int seed = static_cast(context.Attr("seed")); std::minstd_rand engine; if (seed == 0) { seed = std::random_device()(); } engine.seed(seed); std::uniform_real_distribution dist( - static_cast(context.op_.GetAttr("min")), - static_cast(context.op_.GetAttr("max"))); - ssize_t size = framework::product(tensor->dims()); - for (ssize_t i = 0; i < size; ++i) { + static_cast(context.Attr("min")), + static_cast(context.Attr("max"))); + int64_t size = framework::product(tensor->dims()); + for (int64_t i = 0; i < size; ++i) { data[i] = dist(engine); } } @@ -49,11 +48,16 @@ class UniformRandomOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext& ctx) const override { - PADDLE_ENFORCE(GetAttr("min") < GetAttr("max"), + PADDLE_ENFORCE(Attr("min") < Attr("max"), "uniform_random's min must be less than max"); auto* tensor = ctx.Output("Out"); - auto dims = GetAttr>("dims"); - tensor->Resize(framework::make_ddim(dims)); + auto dims = Attr>("dims"); + std::vector temp; + temp.reserve(dims.size()); + for (auto dim : dims) { + temp.push_back(static_cast(dim)); + } + tensor->Resize(framework::make_ddim(temp)); } }; diff --git a/paddle/operators/uniform_random_op.cu b/paddle/operators/uniform_random_op.cu index 1d6709934cbbcf50265eabef87c857654f783ed8..c2c041b144b6ca1f019f972e1301b756ec1c9301 100644 --- a/paddle/operators/uniform_random_op.cu +++ b/paddle/operators/uniform_random_op.cu @@ -45,14 +45,13 @@ class GPUUniformRandomKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); - unsigned int seed = - static_cast(context.op_.GetAttr("seed")); + unsigned int seed = static_cast(context.Attr("seed")); if (seed == 0) { std::random_device rd; seed = rd(); } - T min = static_cast(context.op_.GetAttr("min")); - T max = static_cast(context.op_.GetAttr("max")); + T min = static_cast(context.Attr("min")); + T max = static_cast(context.Attr("max")); thrust::counting_iterator index_sequence_begin(0); ssize_t N = framework::product(tensor->dims()); thrust::transform(index_sequence_begin, index_sequence_begin + N, diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 120eb1e4af9cef43e76e27d4ad66acfbbd597a36..17bdac8749e31565b119b2cb84aed199fac0f441 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -22,3 +22,5 @@ ENDIF() cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS}) nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info) + +nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) diff --git a/paddle/platform/cudnn_helper.h b/paddle/platform/cudnn_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..2841d2a2dbec5c17ef098a06c976ca01247820f5 --- /dev/null +++ b/paddle/platform/cudnn_helper.h @@ -0,0 +1,201 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/platform/dynload/cudnn.h" +#include "paddle/platform/enforce.h" +#include "paddle/platform/macros.h" + +namespace paddle { +namespace platform { + +enum class DataLayout { + kNHWC, + kNCHW, + kNCHW_VECT_C, +}; + +enum class PoolingMode { + kMaximum, + kAverage, +}; + +template +class CudnnDataType; + +template <> +class CudnnDataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_FLOAT; +}; + +template <> +class CudnnDataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_DOUBLE; +}; + +inline cudnnTensorFormat_t GetCudnnTensorFormat(const DataLayout& order) { + switch (order) { + case DataLayout::kNHWC: + return CUDNN_TENSOR_NHWC; + case DataLayout::kNCHW: + return CUDNN_TENSOR_NCHW; + default: + PADDLE_THROW("Unknown cudnn equivalent for order"); + } + return CUDNN_TENSOR_NCHW; +} + +class ScopedTensorDescriptor { + public: + ScopedTensorDescriptor() { + PADDLE_ENFORCE(dynload::cudnnCreateTensorDescriptor(&desc_)); + } + ~ScopedTensorDescriptor() { + PADDLE_ENFORCE(dynload::cudnnDestroyTensorDescriptor(desc_)); + } + + inline cudnnTensorDescriptor_t descriptor(const cudnnTensorFormat_t format, + const cudnnDataType_t type, + const std::vector& dims) { + // the format is not used now, but it may be a useful feature later + std::vector strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + PADDLE_ENFORCE(dynload::cudnnSetTensorNdDescriptor( + desc_, type, dims.size(), dims.data(), strides.data())); + return desc_; + } + + template + inline cudnnTensorDescriptor_t descriptor(const DataLayout& order, + const std::vector& dims) { + return descriptor(GetCudnnTensorFormat(order), CudnnDataType::type, + dims); + } + + private: + cudnnTensorDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor); +}; + +class ScopedFilterDescriptor { + public: + ScopedFilterDescriptor() { + PADDLE_ENFORCE(dynload::cudnnCreateFilterDescriptor(&desc_)); + } + ~ScopedFilterDescriptor() { + PADDLE_ENFORCE(dynload::cudnnDestroyFilterDescriptor(desc_)); + } + + inline cudnnFilterDescriptor_t descriptor(const cudnnTensorFormat_t format, + const cudnnDataType_t type, + const std::vector& kernel) { + // filter layout: output input spatial_dim_y spatial_dim_x + PADDLE_ENFORCE(dynload::cudnnSetFilterNdDescriptor( + desc_, type, format, kernel.size(), kernel.data())); + return desc_; + } + + template + inline cudnnFilterDescriptor_t descriptor(const DataLayout& order, + const std::vector& kernel) { + return descriptor(GetCudnnTensorFormat(order), CudnnDataType::type, + kernel); + } + + private: + cudnnFilterDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedFilterDescriptor); +}; + +class ScopedConvolutionDescriptor { + public: + ScopedConvolutionDescriptor() { + PADDLE_ENFORCE(dynload::cudnnCreateConvolutionDescriptor(&desc_)); + } + ~ScopedConvolutionDescriptor() { + PADDLE_ENFORCE(dynload::cudnnDestroyConvolutionDescriptor(desc_)); + } + + inline cudnnConvolutionDescriptor_t descriptor( + cudnnDataType_t type, const std::vector& pads, + const
std::vector& strides, const std::vector& dilations) { + PADDLE_ENFORCE_EQ(pads.size(), strides.size()); + PADDLE_ENFORCE_EQ(pads.size(), dilations.size()); + +#if CUDNN_VERSION < 6000 + // cudnn v5 does not support dilated convolution; the argument is called + // upscale instead of dilations and it must be one. + for (size_t i = 0; i < dilations.size(); ++i) { + PADDLE_ENFORCE_EQ( + dilations[i], 1, + "Dilated convolution is not supported in this cuDNN version"); + } +#endif + + PADDLE_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor( + desc_, pads.size(), pads.data(), strides.data(), dilations.data(), + CUDNN_CROSS_CORRELATION, type)); + return desc_; + } + + template + inline cudnnConvolutionDescriptor_t descriptor( + const std::vector& pads, const std::vector& strides, + const std::vector& dilations) { + return descriptor(CudnnDataType::type, pads, strides, dilations); + } + + private: + cudnnConvolutionDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedConvolutionDescriptor); +}; + +class ScopedPoolingDescriptor { + public: + ScopedPoolingDescriptor() { + PADDLE_ENFORCE(dynload::cudnnCreatePoolingDescriptor(&desc_)); + } + ~ScopedPoolingDescriptor() { + PADDLE_ENFORCE(dynload::cudnnDestroyPoolingDescriptor(desc_)); + } + + inline cudnnPoolingDescriptor_t descriptor(const PoolingMode& mode, + const std::vector& kernel, + const std::vector& pads, + const std::vector& strides) { + PADDLE_ENFORCE_EQ(kernel.size(), pads.size()); + PADDLE_ENFORCE_EQ(kernel.size(), strides.size()); + PADDLE_ENFORCE(dynload::cudnnSetPoolingNdDescriptor( + desc_, (mode == PoolingMode::kMaximum + ? CUDNN_POOLING_MAX + : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING), + CUDNN_PROPAGATE_NAN, // Always propagate nans. + kernel.size(), kernel.data(), pads.data(), strides.data())); + return desc_; + } + + private: + cudnnPoolingDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/cudnn_helper_test.cc b/paddle/platform/cudnn_helper_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6bd85ae1ca8b47b203e0321e9d9224d5cfd3a586 --- /dev/null +++ b/paddle/platform/cudnn_helper_test.cc @@ -0,0 +1,120 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/platform/cudnn_helper.h" +#include + +TEST(CudnnHelper, ScopedTensorDescriptor) { + using paddle::platform::ScopedTensorDescriptor; + using paddle::platform::DataLayout; + + ScopedTensorDescriptor tensor_desc; + std::vector shape = {2, 4, 6, 6}; + auto desc = tensor_desc.descriptor(DataLayout::kNCHW, shape); + + cudnnDataType_t type; + int nd; + std::vector dims(4); + std::vector strides(4); + paddle::platform::dynload::cudnnGetTensorNdDescriptor( + desc, 4, &type, &nd, dims.data(), strides.data()); + + EXPECT_EQ(nd, 4); + for (size_t i = 0; i < dims.size(); ++i) { + EXPECT_EQ(dims[i], shape[i]); + } + EXPECT_EQ(strides[3], 1); + EXPECT_EQ(strides[2], 6); + EXPECT_EQ(strides[1], 36); + EXPECT_EQ(strides[0], 144); +} + +TEST(CudnnHelper, ScopedFilterDescriptor) { + using paddle::platform::ScopedFilterDescriptor; + using paddle::platform::DataLayout; + + ScopedFilterDescriptor filter_desc; + std::vector shape = {2, 3, 3}; + auto desc = filter_desc.descriptor(DataLayout::kNCHW, shape); + + cudnnDataType_t type; + int nd; + cudnnTensorFormat_t format; + std::vector kernel(3); + paddle::platform::dynload::cudnnGetFilterNdDescriptor(desc, 3, &type, &format, + &nd, kernel.data()); + + EXPECT_EQ(GetCudnnTensorFormat(DataLayout::kNCHW), format); + EXPECT_EQ(nd, 3); + for (size_t i = 0; i < shape.size(); ++i) { + EXPECT_EQ(kernel[i], shape[i]); + } +} + +TEST(CudnnHelper, ScopedConvolutionDescriptor) { + using paddle::platform::ScopedConvolutionDescriptor; + + ScopedConvolutionDescriptor conv_desc; + std::vector src_pads = {2, 2, 2}; + std::vector src_strides = {1, 1, 1}; + std::vector src_dilations = {1, 1, 1}; + auto desc = conv_desc.descriptor(src_pads, src_strides, src_dilations); + + cudnnDataType_t type; + cudnnConvolutionMode_t mode; + int nd; + std::vector pads(3); + std::vector strides(3); + std::vector dilations(3); + paddle::platform::dynload::cudnnGetConvolutionNdDescriptor( + desc, 3, &nd, pads.data(), strides.data(), dilations.data(), &mode, + &type); + + EXPECT_EQ(nd, 3); + for (size_t i = 0; i < src_pads.size(); ++i) { + EXPECT_EQ(pads[i], src_pads[i]); + EXPECT_EQ(strides[i], src_strides[i]); + EXPECT_EQ(dilations[i], src_dilations[i]); + } + EXPECT_EQ(mode, CUDNN_CROSS_CORRELATION); +} + +TEST(CudnnHelper, ScopedPoolingDescriptor) { + using paddle::platform::ScopedPoolingDescriptor; + using paddle::platform::PoolingMode; + + ScopedPoolingDescriptor pool_desc; + std::vector src_kernel = {2, 2, 5}; + std::vector src_pads = {1, 1, 2}; + std::vector src_strides = {2, 2, 3}; + auto desc = pool_desc.descriptor(PoolingMode::kMaximum, src_kernel, src_pads, + src_strides); + + cudnnPoolingMode_t mode; + cudnnNanPropagation_t nan_t = CUDNN_PROPAGATE_NAN; + int nd; + std::vector kernel(3); + std::vector pads(3); + std::vector strides(3); + paddle::platform::dynload::cudnnGetPoolingNdDescriptor( + desc, 3, &mode, &nan_t, &nd, kernel.data(), pads.data(), strides.data()); + + EXPECT_EQ(nd, 3); + for (size_t i = 0; i < src_pads.size(); ++i) { + EXPECT_EQ(kernel[i], src_kernel[i]); + EXPECT_EQ(pads[i], src_pads[i]); + EXPECT_EQ(strides[i], src_strides[i]); + } + EXPECT_EQ(mode, CUDNN_POOLING_MAX); +} diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt index d205ead84598e04eea523be32139959a02e0dd83..ceb66f84b6b01892cbaf61c79a47ae60d2589164 100644 --- a/paddle/platform/dynload/CMakeLists.txt +++ b/paddle/platform/dynload/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) 
-nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc) +nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc DEPS dynamic_loader) diff --git a/paddle/platform/dynload/cudnn.h b/paddle/platform/dynload/cudnn.h index ef0dd85b083dc2335dd5c70d3dc5f59eda25daeb..0120625b7c14448f1b8deb88c24a3ee06eaf4f01 100644 --- a/paddle/platform/dynload/cudnn.h +++ b/paddle/platform/dynload/cudnn.h @@ -62,19 +62,27 @@ extern void* cudnn_dso_handle; #define CUDNN_DNN_ROUTINE_EACH(__macro) \ __macro(cudnnSetTensor4dDescriptor); \ __macro(cudnnSetTensor4dDescriptorEx); \ + __macro(cudnnSetTensorNdDescriptor); \ + __macro(cudnnGetTensorNdDescriptor); \ __macro(cudnnGetConvolutionNdForwardOutputDim); \ __macro(cudnnGetConvolutionForwardAlgorithm); \ __macro(cudnnCreateTensorDescriptor); \ __macro(cudnnDestroyTensorDescriptor); \ __macro(cudnnCreateFilterDescriptor); \ __macro(cudnnSetFilter4dDescriptor); \ + __macro(cudnnSetFilterNdDescriptor); \ + __macro(cudnnGetFilterNdDescriptor); \ __macro(cudnnSetPooling2dDescriptor); \ + __macro(cudnnSetPoolingNdDescriptor); \ + __macro(cudnnGetPoolingNdDescriptor); \ __macro(cudnnDestroyFilterDescriptor); \ __macro(cudnnCreateConvolutionDescriptor); \ __macro(cudnnCreatePoolingDescriptor); \ __macro(cudnnDestroyPoolingDescriptor); \ __macro(cudnnSetConvolution2dDescriptor); \ __macro(cudnnDestroyConvolutionDescriptor); \ + __macro(cudnnSetConvolutionNdDescriptor); \ + __macro(cudnnGetConvolutionNdDescriptor); \ __macro(cudnnCreate); \ __macro(cudnnDestroy); \ __macro(cudnnSetStream); \ diff --git a/paddle/operators/scatter_op.cu b/paddle/platform/macros.h similarity index 66% rename from paddle/operators/scatter_op.cu rename to paddle/platform/macros.h index 6716b478833ff3adb6112cdb1ee25b7f1744ea1f..4a04a38c0c6f905639004dea2f4416ecc57c8620 100644 --- a/paddle/operators/scatter_op.cu +++ b/paddle/platform/macros.h @@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#define EIGEN_USE_GPU -#include "paddle/operators/scatter_op.h" +#pragma once -namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(scatter, - ops::ScatterOpKernel); +// Disable the copy and assignment operator for a class. +#ifndef DISABLE_COPY_AND_ASSIGN +#define DISABLE_COPY_AND_ASSIGN(classname) \ + private: \ + classname(const classname&) = delete; \ + classname& operator=(const classname&) = delete +#endif diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 5aaa3726640f619d4c632937a319e5b1e306f880..d11355a2caeed4ad91717f41edc14a8c4c718ef4 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -30,7 +30,7 @@ limitations under the License. 
*/ namespace py = pybind11; -USE_OP(add_two); +USE_OP(add); USE_OP(onehot_cross_entropy); USE_OP(sgd); USE_OP(mul); @@ -39,15 +39,17 @@ USE_OP(sigmoid); USE_OP(softmax); USE_OP(rowwise_add); USE_OP(fill_zeros_like); -USE_OP_ITSELF(recurrent_op); +USE_NO_KERNEL_OP(recurrent); USE_OP(gaussian_random); USE_OP(uniform_random); USE_OP(lookup_table); USE_OP(scale); -USE_OP_ITSELF(identity); +USE_NO_KERNEL_OP(identity); USE_OP(minus); +USE_OP(cos_sim); USE_CPU_ONLY_OP(gather); USE_CPU_ONLY_OP(scatter); +USE_OP(squared_l2_distance); USE_OP(smooth_l1_loss); namespace paddle { @@ -77,7 +79,7 @@ PYBIND11_PLUGIN(core) { .def("get_dims", [](const Tensor &self) { return vectorize(self.dims()); }) .def("set_dims", - [](Tensor &self, const std::vector &dim) { + [](Tensor &self, const std::vector &dim) { self.Resize(make_ddim(dim)); }) .def("alloc_float", diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h index 39ba60b4dc7ebe3f39a0aa4023b34540b340a841..95171acf729a513e5c92d1e0cba15cb12b38561a 100644 --- a/paddle/pybind/tensor_py.h +++ b/paddle/pybind/tensor_py.h @@ -85,7 +85,7 @@ void PyCPUTensorSetFromArray( framework::Tensor &self, py::array_t array, paddle::platform::CPUPlace &place) { - std::vector dims; + std::vector dims; dims.reserve(array.ndim()); for (size_t i = 0; i < array.ndim(); ++i) { dims.push_back((int)array.shape()[i]); @@ -102,7 +102,7 @@ void PyCUDATensorSetFromArray( framework::Tensor &self, py::array_t array, paddle::platform::GPUPlace &place) { - std::vector dims; + std::vector dims; dims.reserve(array.ndim()); for (size_t i = 0; i < array.ndim(); ++i) { dims.push_back((int)array.shape()[i]); diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py index 7e305e2cd9fbe306368a44d08f7f66b4185ae2d2..248da4ae8d1fb24652625ae8fc9ef314a028b912 100644 --- a/python/paddle/trainer/PyDataProvider2.py +++ b/python/paddle/trainer/PyDataProvider2.py @@ -27,6 +27,14 @@ class SequenceType(object): SEQUENCE = 1 SUB_SEQUENCE = 2 + @classmethod + def tostring(cls, value): + for k in cls.__dict__: + if not k.startswith('__'): + if getattr(cls, k) == value: + return cls.__name__ + '.' + k + return 'INVALID(' + str(value) + ')' + # TODO(yuyang18): Add string data type here. class DataType(object): @@ -35,6 +43,14 @@ class DataType(object): SparseValue = 2 Index = 3 + @classmethod + def tostring(cls, value): + for k in cls.__dict__: + if not k.startswith('__'): + if getattr(cls, k) == value: + return cls.__name__ + '.' 
+ k + return 'INVALID(' + str(value) + ')' + class CacheType(object): NO_CACHE = 0 # No cache at all @@ -69,6 +85,26 @@ class InputType(object): self.seq_type = seq_type self.type = tp + def __repr__(self): + """ + Return a human readable representation like 'InputType(dim=25921, + seq_type=SequenceType.NO_SEQUENCE, type=DataType.Dense)' + """ + repr_str = type(self).__name__ + repr_str += '(' + serialize_func_map = { + 'dim': repr, + 'seq_type': SequenceType.tostring, + 'type': DataType.tostring + } + for idx, k in enumerate(self.__slots__): + if idx != 0: + repr_str += ', ' + repr_str += ( + k + '=' + serialize_func_map.get(k, repr)(getattr(self, k))) + repr_str += ')' + return repr_str + def dense_slot(dim, seq_type=SequenceType.NO_SEQUENCE): """ diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 2bd274fad2ab7eed0902ffe944c6e0670f963233..47ac601e678013aceb62005d6f25595f49673d2c 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -53,7 +53,7 @@ __all__ = [ 'cos_sim', 'hsigmoid', 'conv_projection', - 'mse_cost', + 'square_error_cost', 'regression_cost', 'classification_cost', 'LayerOutput', @@ -4238,13 +4238,18 @@ def __cost_input__(input, label, weight=None): @wrap_name_default() @layer_support() -def mse_cost(input, label, weight=None, name=None, coeff=1.0, layer_attr=None): +def square_error_cost(input, + label, + weight=None, + name=None, + coeff=1.0, + layer_attr=None): """ - mean squared error cost: + sum of square error cost: .. math:: - \\frac{1}{N}\sum_{i=1}^N(t_i-y_i)^2 + cost = \\sum_{i=1}^N(t_i-y_i)^2 :param name: layer name. :type name: basestring @@ -4273,7 +4278,7 @@ def mse_cost(input, label, weight=None, name=None, coeff=1.0, layer_attr=None): return LayerOutput(name, LayerType.COST, parents=parents, size=1) -regression_cost = mse_cost +regression_cost = square_error_cost @wrap_name_default("cost") @@ -5798,9 +5803,9 @@ def huber_regression_cost(input, coeff=1.0, layer_attr=None): """ - In statistics, the Huber loss is a loss function used in robust regression, - that is less sensitive to outliers in data than the squared error loss. - Given a prediction f(x), a label y and :math:`\delta`, the loss function + In statistics, the Huber loss is a loss function used in robust regression, + that is less sensitive to outliers in data than the squared error loss. + Given a prediction f(x), a label y and :math:`\delta`, the loss function is defined as: .. math: @@ -5848,13 +5853,13 @@ def huber_classification_cost(input, coeff=1.0, layer_attr=None): """ - For classification purposes, a variant of the Huber loss called modified Huber - is sometimes used. Given a prediction f(x) (a real-valued classifier score) and - a true binary class label :math:`y\in \left \{-1, 1 \right \}`, the modified Huber + For classification purposes, a variant of the Huber loss called modified Huber + is sometimes used. Given a prediction f(x) (a real-valued classifier score) and + a true binary class label :math:`y\in \left \{-1, 1 \right \}`, the modified Huber loss is defined as: .. 
math: - loss = \max \left ( 0, 1-yf(x) \right )^2, yf(x)\geq 1 + loss = \max \left ( 0, 1-yf(x) \right )^2, yf(x)\geq 1 loss = -4yf(x), \text{otherwise} The example usage is: diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr index 96fb1d4ebde08b1bca2ffd09e8db0895842cbfd3..cec8a73db66f6091ec971527b3a42aa9e08154eb 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr @@ -45,7 +45,7 @@ layers { coeff: 1.0 } layers { - name: "__mse_cost_0__" + name: "__square_error_cost_0__" type: "square_error" size: 1 active_type: "" @@ -130,7 +130,7 @@ input_layer_names: "label" input_layer_names: "weight" input_layer_names: "multi_class_label" output_layer_names: "__cost_0__" -output_layer_names: "__mse_cost_0__" +output_layer_names: "__square_error_cost_0__" output_layer_names: "__nce_layer_0__" evaluators { name: "classification_error_evaluator" @@ -146,7 +146,7 @@ sub_models { layer_names: "weight" layer_names: "__fc_layer_0__" layer_names: "__cost_0__" - layer_names: "__mse_cost_0__" + layer_names: "__square_error_cost_0__" layer_names: "multi_class_label" layer_names: "__nce_layer_0__" input_layer_names: "input" @@ -154,7 +154,7 @@ sub_models { input_layer_names: "weight" input_layer_names: "multi_class_label" output_layer_names: "__cost_0__" - output_layer_names: "__mse_cost_0__" + output_layer_names: "__square_error_cost_0__" output_layer_names: "__nce_layer_0__" evaluator_names: "classification_error_evaluator" is_recurrent_layer_group: false diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_select_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sub_nested_seq_select_layer.protostr similarity index 100% rename from python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_select_layers.protostr rename to python/paddle/trainer_config_helpers/tests/configs/protostr/test_sub_nested_seq_select_layer.protostr diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py index c369062930e2b067ceab0dc3b25ba6c1eabe2450..caa6aaa9430ffaee7ade93ee04ec90103bf8cf43 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py @@ -10,7 +10,7 @@ fc = fc_layer(input=data, size=10, act=SoftmaxActivation()) outputs( classification_cost( input=fc, label=lbl, weight=wt), - mse_cost( + square_error_cost( input=fc, label=lbl, weight=wt), nce_layer( input=fc, diff --git a/python/paddle/v2/framework/op.py b/python/paddle/v2/framework/op.py index 6ac656321e72f5b0c91008091753ee50ac8200a6..0349407a851ebb48f69d7daef7a318cf348aad5d 100644 --- a/python/paddle/v2/framework/op.py +++ b/python/paddle/v2/framework/op.py @@ -94,9 +94,14 @@ class OpDescCreationMethod(object): new_attr.floats.extend(user_defined_attr) elif attr.type == framework_pb2.STRINGS: new_attr.strings.extend(user_defined_attr) + elif attr.type == framework_pb2.INT_PAIRS: + for p in user_defined_attr: + pair = new_attr.pairs.add() + pair.first = p[0] + pair.second = p[1] else: raise NotImplementedError("Not support attribute type " 
+ - attr.type) + str(attr.type)) return op_desc @@ -179,7 +184,7 @@ class OperatorFactory(object): class __RecurrentOp__(object): __proto__ = None - type = 'recurrent_op' + type = 'recurrent' def __init__(self): # cache recurrent_op's proto diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index 763f3a9f95702401b80679c59f66baa1efcba98b..c420da7dd9328b1c2654694eb7272612b99a7a83 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -4,6 +4,7 @@ py_test(test_scope SRCS test_scope.py) py_test(test_tensor SRCS test_tensor.py) py_test(test_mul_op SRCS test_mul_op.py) +py_test(test_cos_sim_op SRCS test_cos_sim_op.py) py_test(test_mean_op SRCS test_mean_op.py) @@ -32,4 +33,5 @@ py_test(test_gradient_checker SRCS test_gradient_checker.py) py_test(test_lookup_table SRCS test_lookup_table.py) py_test(test_scale_and_identity_op SRCS test_scale_and_identity_op.py) py_test(mnist SRCS mnist.py) +py_test(test_squared_l2_distance_op SRCS test_squared_l2_distance_op.py) py_test(test_smooth_l1_loss_op SRCS test_smooth_l1_loss_op.py) diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py index 518f828bacd60e7cb8375b22c6c3296f9bfeb5ea..fdb06b7988935ebbe53f72f4eba89d75ac2502d4 100644 --- a/python/paddle/v2/framework/tests/gradient_checker.py +++ b/python/paddle/v2/framework/tests/gradient_checker.py @@ -36,13 +36,13 @@ def get_numeric_gradient(op, in_place=False): """ Get Numeric Gradient for an operator's input. - - :param op: C++ operator instance, could be an network - :param input_values: The input variables. Should be an dictionary, key is + + :param op: C++ operator instance, could be an network + :param input_values: The input variables. Should be an dictionary, key is variable name. Value is numpy array. - :param output_name: The final output variable name. + :param output_name: The final output variable name. :param input_to_check: The input variable need to get gradient. - :param delta: The perturbation value for numeric gradient method. The + :param delta: The perturbation value for numeric gradient method. The smaller delta is, the more accurate result will get. But if that delta is too small, it could occur numerical stability problem. :param local_scope: The local scope used for get_numeric_gradient. @@ -229,9 +229,9 @@ class GradientChecker(unittest.TestCase): """Use relative error for the comparison. :param numeric_grads: the numerical graidents. - :type numeric_grads: a list of numpy.array + :type numeric_grads: a list of numpy.array :param analytic_grads: the analytical graidents. - :type analytic_grads: a list of numpy.array + :type analytic_grads: a list of numpy.array :param name: the names of gradients, used to print for debug. :type names: a list of string :param msg_prefix: string info, used to print for debug. 
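To make the `delta` and relative-error discussion in the docstrings above concrete, here is a minimal sketch of the finite-difference check that a gradient checker of this kind performs. It is standalone NumPy; the function names, the central-difference formula, and the tolerance handling are illustrative assumptions, not the module's actual implementation:

```python
import numpy as np

def numeric_gradient(f, x, delta=1e-2):
    """Estimate df/dx by perturbing one element of x at a time.

    f maps the array x to a scalar (e.g. the sum of an operator's output).
    """
    grad = np.zeros_like(x)
    for i in range(x.size):
        orig = x.flat[i]
        x.flat[i] = orig + delta
        f_pos = f(x)
        x.flat[i] = orig - delta
        f_neg = f(x)
        x.flat[i] = orig                        # restore the perturbed element
        grad.flat[i] = (f_pos - f_neg) / (2.0 * delta)
    return grad

def assert_is_close(numeric, analytic, max_relative_error=0.005):
    # compare with a relative error, since gradient magnitudes vary widely
    abs_err = np.abs(numeric - analytic)
    rel_err = abs_err / np.maximum(np.abs(numeric), 1e-8)
    assert (rel_err <= max_relative_error).all()
```

A smaller `delta` reduces the truncation error of the difference quotient but amplifies floating-point cancellation, which is why the docstring above warns that an overly small `delta` can cause numerical-stability problems.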
@@ -286,6 +286,9 @@ class GradientChecker(unittest.TestCase): for no_grad in no_grad_set: if no_grad not in in_names: raise ValueError("no_grad should be in in_names") + if no_grad in inputs_to_check: + raise ValueError("no_grad should not be in inputs_to_check") + backward_op = core.Operator.backward(forward_op, no_grad_set) places = [core.CPUPlace()] @@ -301,7 +304,6 @@ class GradientChecker(unittest.TestCase): check_names = [grad_var_name(name) for name in inputs_to_check] for place in places: - # get analytical gradients according to different device analytic_grads = self.__get_gradient(forward_op, backward_op, input_vars, check_names, place) self.__assert_is_close(numeric_grads, analytic_grads, check_names, diff --git a/python/paddle/v2/framework/tests/op_test_util.py b/python/paddle/v2/framework/tests/op_test_util.py index 3bc05a0feccbbd3d5e7852d85bd3dc8edaccfd07..370f27eaf658dadbf7e82262c118140a10d15c41 100644 --- a/python/paddle/v2/framework/tests/op_test_util.py +++ b/python/paddle/v2/framework/tests/op_test_util.py @@ -6,13 +6,13 @@ from paddle.v2.framework.op import Operator class OpTestMeta(type): """ Operator Test ClassMeta. - - It injects `test_all` method into user's OperatorTest class, to make Python + + It injects `test_all` method into user's OperatorTest class, to make Python unittest module run that method. - + The `test_all` read what value is stored in `self`. It use self's values to create and run a operator, and check whether that op is OK or not. - + See `test_add_two_op` for example usage. """ @@ -66,7 +66,7 @@ class OpTestMeta(type): self.assertTrue( numpy.allclose( actual, expect, atol=1e-05), - "output name: " + out_name + "has diff") + "output name: " + out_name + " has diff") obj.test_all = test_all return obj diff --git a/python/paddle/v2/framework/tests/test_add_two_op.py b/python/paddle/v2/framework/tests/test_add_two_op.py index 0def484eddb88604398ee10390d3f28058714a57..a578e74eca9a3c4327a4881f853028e2347c98ad 100644 --- a/python/paddle/v2/framework/tests/test_add_two_op.py +++ b/python/paddle/v2/framework/tests/test_add_two_op.py @@ -11,7 +11,7 @@ class TestAddOp(unittest.TestCase): __metaclass__ = OpTestMeta def setUp(self): - self.type = "add_two" + self.type = "add" self.inputs = { 'X': numpy.random.random((102, 105)).astype("float32"), 'Y': numpy.random.random((102, 105)).astype("float32") diff --git a/python/paddle/v2/framework/tests/test_cos_sim_op.py b/python/paddle/v2/framework/tests/test_cos_sim_op.py new file mode 100644 index 0000000000000000000000000000000000000000..32013a7999a4be42e5974b9ac751d5d911730994 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_cos_sim_op.py @@ -0,0 +1,60 @@ +import unittest +import numpy as np +from gradient_checker import GradientChecker, create_op +from op_test_util import OpTestMeta + + +class TestCosSimOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "cos_sim" + self.inputs = { + 'X': np.random.random((32, 64)).astype("float32"), + 'Y': np.random.random((32, 64)).astype("float32") + } + expect_x_norm = np.linalg.norm(self.inputs['X'], axis=1) + expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=1) + expect_out = (self.inputs['X'] * self.inputs['Y']).sum(axis=1) / \ + expect_x_norm / expect_y_norm + self.outputs = { + 'XNorm': np.expand_dims(expect_x_norm, 1), + 'YNorm': np.expand_dims(expect_y_norm, 1), + 'Out': np.expand_dims(expect_out, 1) + } + + +class TestCosSimGradOp(GradientChecker): + def setUp(self): + self.op = create_op("cos_sim") + self.inputs = { + 
'X': np.random.random((10, 5)).astype("float32"), + 'Y': np.random.random((10, 5)).astype("float32") + } + + def test_cpu_gpu_compare(self): + self.compare_grad(self.op, self.inputs) + + def test_normal(self): + self.check_grad( + self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.05) + + def test_ignore_x(self): + self.check_grad( + self.op, + self.inputs, ["Y"], + "Out", + max_relative_error=0.05, + no_grad_set={"X"}) + + def test_ignore_y(self): + self.check_grad( + self.op, + self.inputs, ["X"], + "Out", + max_relative_error=0.05, + no_grad_set={"Y"}) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_gradient_checker.py b/python/paddle/v2/framework/tests/test_gradient_checker.py index e0b315120862bea284e067070492dcdfbb661081..857427cdfbb4374957e249f0faa4cfc46ac0e8c7 100644 --- a/python/paddle/v2/framework/tests/test_gradient_checker.py +++ b/python/paddle/v2/framework/tests/test_gradient_checker.py @@ -7,7 +7,7 @@ from gradient_checker import get_numeric_gradient class GetNumericGradientTest(unittest.TestCase): def test_add_op(self): - add_op = Operator('add_two', X="X", Y="Y", Out="Z") + add_op = Operator('add', X="X", Y="Y", Out="Z") x = numpy.random.random((10, 1)).astype("float32") y = numpy.random.random((10, 1)).astype("float32") diff --git a/python/paddle/v2/framework/tests/test_mul_op.py b/python/paddle/v2/framework/tests/test_mul_op.py index ee0d81a64efcb81bae8b11b856c201a86da274e9..b58e4266d1588a4b6151f5f896537ded6ddd3896 100644 --- a/python/paddle/v2/framework/tests/test_mul_op.py +++ b/python/paddle/v2/framework/tests/test_mul_op.py @@ -16,16 +16,37 @@ class TestMulOp(unittest.TestCase): self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} -class MulGradOpTest(GradientChecker): - def test_mul(self): - op = create_op("mul") - inputs = { +class TestMulGradOp(GradientChecker): + def setUp(self): + self.op = create_op("mul") + self.inputs = { 'X': np.random.random((32, 84)).astype("float32"), 'Y': np.random.random((84, 100)).astype("float32") } + + def test_cpu_gpu_compare(self): + self.compare_grad(self.op, self.inputs) + + def test_normal(self): # mul op will enlarge the relative error self.check_grad( - op, inputs, set(["X", "Y"]), "Out", max_relative_error=0.5) + self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5) + + def test_ignore_x(self): + self.check_grad( + self.op, + self.inputs, ["Y"], + "Out", + max_relative_error=0.5, + no_grad_set={"X"}) + + def test_ignore_y(self): + self.check_grad( + self.op, + self.inputs, ["X"], + "Out", + max_relative_error=0.5, + no_grad_set={"Y"}) # TODO(dzh,qijun) : mulgrad test case need transpose feature of blas library diff --git a/python/paddle/v2/framework/tests/test_net.py b/python/paddle/v2/framework/tests/test_net.py index 9339cf28dabc95b46b958777200fb1db9dcf284f..e4b7cd480cb36249bb64ba3cab9a4b220d812346 100644 --- a/python/paddle/v2/framework/tests/test_net.py +++ b/python/paddle/v2/framework/tests/test_net.py @@ -15,7 +15,7 @@ def fc(X, W, Y): class TestNet(unittest.TestCase): def test_net_all(self): net = core.Net.create() - op1 = Operator("add_two", X="X", Y="Y", Out="Out") + op1 = Operator("add", X="X", Y="Y", Out="Out") net.append_op(op1) net2 = core.Net.create() @@ -26,7 +26,7 @@ class TestNet(unittest.TestCase): expected = ''' Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}. - Op(add_two), inputs:{X[X], Y[Y]}, outputs:{Out[Out]}. + Op(add), inputs:{X[X], Y[Y]}, outputs:{Out[Out]}. 
Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}. Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}. Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}. diff --git a/python/paddle/v2/framework/tests/test_operator.py b/python/paddle/v2/framework/tests/test_operator.py index 1abc4eeb57bcedc81e34b0e156048ee4f5cfdc2d..040556322d79cbb594eb9af585a5b9920d7ab625 100644 --- a/python/paddle/v2/framework/tests/test_operator.py +++ b/python/paddle/v2/framework/tests/test_operator.py @@ -193,10 +193,10 @@ class TestOpDescCreationMethod(unittest.TestCase): class TestOpCreations(unittest.TestCase): def test_all(self): - add_op = op.Operator("add_two", X="a", Y="b", Out="z") + add_op = op.Operator("add", X="a", Y="b", Out="z") self.assertIsNotNone(add_op) # Invoke C++ DebugString() - self.assertEqual('Op(add_two), inputs:{X[a], Y[b]}, outputs:{Out[z]}.', + self.assertEqual('Op(add), inputs:{X[a], Y[b]}, outputs:{Out[z]}.', str(add_op)) diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py index d6000ab9f9d5b969f96128b183f48d49000c8a5e..22e680fd783ec681e95326fb84db34570265cffc 100644 --- a/python/paddle/v2/framework/tests/test_recurrent_op.py +++ b/python/paddle/v2/framework/tests/test_recurrent_op.py @@ -146,7 +146,7 @@ class TestRecurrentOp(unittest.TestCase): stepnet = core.Net.create() x_fc_op = Operator("mul", X="x@alias", Y="W", Out="Wx") h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh") - sum_op = Operator("add_two", X="Wx", Y="Uh", Out="sum") + sum_op = Operator("add", X="Wx", Y="Uh", Out="sum") sig_op = Operator("sigmoid", X="sum", Y="h@alias") for op in [x_fc_op, h_fc_op, sum_op, sig_op]: diff --git a/python/paddle/v2/framework/tests/test_rowwise_add_op.py b/python/paddle/v2/framework/tests/test_rowwise_add_op.py index 45d569da29d13cf8e2a3cb9d67c2d01e8b365453..2ddb85e2e7a98a08bd1d6e24e6f812f6021142e8 100644 --- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py +++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py @@ -16,14 +16,22 @@ class TestRowwiseAddOp(unittest.TestCase): self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])} -class RowwiseAddGradOpTest(GradientChecker): - def test_rowwise_add(self): - op = create_op("rowwise_add") - inputs = { +class TestRowwiseAddGradOp(GradientChecker): + def setUp(self): + self.op = create_op("rowwise_add") + self.inputs = { "X": np.random.uniform(0.1, 1, [5, 10]).astype("float32"), "b": np.random.uniform(0.1, 1, [10]).astype("float32") } - self.check_grad(op, inputs, set(["X", "b"]), "Out") + + def test_normal(self): + self.check_grad(self.op, self.inputs, ["X", "b"], "Out") + + def test_ignore_b(self): + self.check_grad(self.op, self.inputs, ["X"], "Out", no_grad_set={"b"}) + + def test_ignore_x(self): + self.check_grad(self.op, self.inputs, ["b"], "Out", no_grad_set={"X"}) if __name__ == '__main__': diff --git a/python/paddle/v2/framework/tests/test_squared_l2_distance_op.py b/python/paddle/v2/framework/tests/test_squared_l2_distance_op.py new file mode 100644 index 0000000000000000000000000000000000000000..2bcdf37df434c9a089d75438d876114156261a5c --- /dev/null +++ b/python/paddle/v2/framework/tests/test_squared_l2_distance_op.py @@ -0,0 +1,89 @@ +import unittest +from op_test_util import OpTestMeta +from gradient_checker import GradientChecker, create_op +import numpy as np + + +class TestSquaredL2DistanceOp_f0(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = 
'squared_l2_distance' + self.inputs = { + 'X': np.random.uniform(0.1, 1., (32, 64)).astype('float32'), + 'Y': np.random.uniform(0.1, 1., (32, 64)).astype('float32') + } + sub_res = self.inputs['X'] - self.inputs['Y'] + output = sub_res * sub_res + self.outputs = { + 'sub_result': sub_res, + 'Out': np.expand_dims(output.sum(1), 1) + } + + +class TestSquaredL2DistanceOp_f1(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = 'squared_l2_distance' + self.inputs = { + 'X': np.random.uniform(0.1, 1., (32, 64)).astype('float32'), + 'Y': np.random.uniform(0.1, 1., (1, 64)).astype('float32') + } + sub_res = self.inputs['X'] - self.inputs['Y'] + output = sub_res * sub_res + self.outputs = { + 'sub_result': sub_res, + 'Out': np.expand_dims(output.sum(1), 1) + } + + +class TestSquaredL2DistanceOp_f2(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = 'squared_l2_distance' + self.inputs = { + 'X': np.random.uniform(0.1, 1., (32, 64, 128)).astype('float32'), + 'Y': np.random.uniform(0.1, 1., (1, 64, 128)).astype('float32') + } + sub_res = self.inputs['X'] - self.inputs['Y'] + sub_res = sub_res.reshape((32, 64 * 128)) + output = sub_res * sub_res + self.outputs = { + 'sub_result': sub_res, + 'Out': np.expand_dims(output.sum(1), 1) + } + + +class TestSquaredL2DistanceGradOp(GradientChecker): + def test_squared_l2_distance_b0(self): + op = create_op("squared_l2_distance") + inputs = { + 'X': np.random.uniform(0.1, .6, (2, 3)).astype('float32'), + 'Y': np.random.uniform(0.1, .6, (2, 3)).astype('float32') + } + self.compare_grad(op, inputs) + self.check_grad(op, inputs, set(["X", "Y"]), "Out") + + def test_squared_l2_distance_b1(self): + op = create_op("squared_l2_distance") + inputs = { + 'X': np.random.uniform(0.1, .6, (2, 3)).astype('float32'), + 'Y': np.random.uniform(0.1, .6, (1, 3)).astype('float32') + } + self.compare_grad(op, inputs) + self.check_grad(op, inputs, set(["X", "Y"]), "Out") + + def test_squared_l2_distance_b2(self): + op = create_op("squared_l2_distance") + inputs = { + 'X': np.random.uniform(0.1, .6, (2, 3, 4)).astype('float32'), + 'Y': np.random.uniform(0.1, .6, (1, 3, 4)).astype('float32') + } + self.compare_grad(op, inputs) + self.check_grad(op, inputs, set(["X", "Y"]), "Out") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py index 783a0ca85dc61b9f00ac8126e03788884dfb44cb..de932ad715bea8db158393c3c192ef67502e2fa3 100644 --- a/python/paddle/v2/tests/test_layer.py +++ b/python/paddle/v2/tests/test_layer.py @@ -134,8 +134,9 @@ class CostLayerTest(unittest.TestCase): cost3 = layer.cross_entropy_cost(input=inference, label=label) cost4 = layer.cross_entropy_with_selfnorm_cost( input=inference, label=label) - cost5 = layer.mse_cost(input=inference, label=label) - cost6 = layer.mse_cost(input=inference, label=label, weight=weight) + cost5 = layer.square_error_cost(input=inference, label=label) + cost6 = layer.square_error_cost( + input=inference, label=label, weight=weight) cost7 = layer.multi_binary_label_cross_entropy_cost( input=inference, label=label) cost8 = layer.rank_cost(left=score, right=score, label=score)
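Given the `mse_cost` to `square_error_cost` rename exercised in the tests above, a short v2-API usage sketch may help when migrating existing configs. The input dimension, layer names, and activation below are illustrative assumptions, not taken from this patch:

```python
import paddle.v2 as paddle

# a simple regression head; 13 is an arbitrary input dimension for illustration
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())

# formerly paddle.layer.mse_cost; the underlying square_error layer is unchanged
cost = paddle.layer.square_error_cost(input=y_predict, label=y)
```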