remove conflict

3db3a106 · chengduoZH · ba791f7b · c3b46d16 · 3db3a106 · 3db3a106
327 changed file
--- a/README.md
+++ b/README.md
@@ -51,19 +51,19 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
 - **Connected to Products**
    In addition, PaddlePaddle is also designed to be easily deployable. At Baidu,
-    PaddlePaddle has been deployed into products or service with a vast number
+    PaddlePaddle has been deployed into products and services with a vast number
    of users, including ad click-through rate (CTR) prediction, large-scale image
    classification, optical character recognition(OCR), search ranking, computer
    virus detection, recommendation, etc. It is widely utilized in products at
-    Baidu and it has achieved a significant impact. We hope you can also exploit
+    Baidu and it has achieved a significant impact. We hope you can also explore
-    the capability of PaddlePaddle to make a huge impact for your product.
+    the capability of PaddlePaddle to make an impact on your product.
 ## Installation
 It is recommended to check out the
 [Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
 before looking into the
-[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html)
+[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html).
 ## Documentation
@@ -72,7 +72,7 @@ We provide [English](http://doc.paddlepaddle.org/develop/doc/) and
 - [Deep Learning 101](http://book.paddlepaddle.org/index.html)
-  You might want to start from this online interactive book that can run in Jupyter Notebook.
+  You might want to start from this online interactive book that can run in a Jupyter Notebook.
 - [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)

--- a/benchmark/paddle/image/run_mkldnn.sh
+++ b/benchmark/paddle/image/run_mkldnn.sh
 set -e
-unset OMP_NUM_THREADS MKL_NUM_THREADS
-export OMP_DYNAMIC="FALSE"
-export KMP_AFFINITY="granularity=fine,compact,0,0"
 function train() {
+  unset OMP_NUM_THREADS MKL_NUM_THREADS
+  export OMP_DYNAMIC="FALSE"
+  export KMP_AFFINITY="granularity=fine,compact,0,0"
  topology=$1
  bs=$2
  use_mkldnn=$3

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -49,11 +49,12 @@ if(NOT WITH_GOLANG)
 endif(NOT WITH_GOLANG)
 if(NOT WITH_GPU)
-    add_definitions(-DPADDLE_ONLY_CPU)
    add_definitions(-DHPPL_STUB_FUNC)
    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
+    add_definitions(-DPADDLE_WITH_CUDA)
    FIND_PACKAGE(CUDA REQUIRED)
    if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)

--- a/doc/api/v1/index_cn.rst
+++ b/doc/api/v1/index_cn.rst
@@ -21,7 +21,7 @@ Model Config API
    trainer_config_helpers/optimizers.rst
    trainer_config_helpers/data_sources.rst
    trainer_config_helpers/layers.rst
-    trainer_config_helpers/activations.rst 
+    trainer_config_helpers/activations.rst
    trainer_config_helpers/poolings.rst
    trainer_config_helpers/networks.rst
    trainer_config_helpers/evaluators.rst

--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -345,6 +345,11 @@ clip
 ..  autoclass:: paddle.v2.layer.clip
    :noindex:
+resize
+------
+..  autoclass:: paddle.v2.layer.resize
+    :noindex:
 slope_intercept
 ---------------
 ..  autoclass:: paddle.v2.layer.slope_intercept

--- a/doc/design/block.md
+++ b/doc/design/block.md
@@ -55,17 +55,23 @@ Let us consolidate the discussion by presenting some examples.
 The following C++ programs shows how blocks are used with the `if-else` structure:
 ```c++
+namespace pd = paddle;
 int x = 10;
-int y = 20;
+int y = 1;
-int out;
+int z = 10;
 bool cond = false;
+int o1, o2;
 if (cond) {
  int z = x + y;
-  out = softmax(z);
+  o1 = z;
+  o2 = pd::layer::softmax(z);
 } else {
-  int z = fc(x);
+  int d = pd::layer::fc(z);
-  out = z;
+  o1 = d;
+  o2 = d+1;
 }
 ```
 An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](./if_else_op.md) is as follows:
@@ -73,57 +79,55 @@ An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator
 ```python
 import paddle as pd
-x = var(10)
+x = minibatch([10, 20, 30]) # shape=[None, 1]
-y = var(20)
+y = var(1) # shape=[1], value=1
-cond = var(false)
+z = minibatch([10, 20, 30]) # shape=[None, 1]
-ie = pd.create_ifelseop(inputs=[x], output_num=1)
+cond = larger_than(x, 15) # [false, true, true]
+ie = pd.ifelse()
 with ie.true_block():
-    x = ie.inputs(true, 0)
+    d = pd.layer.add_scalar(x, y)
-    z = operator.add(x, y)
+    ie.output(d, pd.layer.softmax(d))
-    ie.set_output(true, 0, operator.softmax(z))
 with ie.false_block():
-    x = ie.inputs(false, 0)
+    d = pd.layer.fc(z)
-    z = layer.fc(x)
+    ie.output(d, d+1)
-    ie.set_output(true, 0, operator.softmax(z))
+o1, o2 = ie(cond)
-out = b(cond)
 ```
-In both examples, the left branch computes `softmax(x+y)` and the right branch computes `fc(x)`.
+In both examples, the left branch computes `x+y` and `softmax(x+y)`, and the right branch computes `fc(z)` and `fc(z)+1`.
 A difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances.  The `ie.input(true, 0)` invocation returns instances in the 0-th input, `x`, that corresponds to true values in `cond` as the local variable `x`, where `ie.input(false, 0)` returns instances corresponding to false values.
 ### Blocks with `for` and `RNNOp`
 The following RNN model from the [RNN design doc](./rnn.md)
 ```python
-x = sequence([10, 20, 30])
+x = sequence([10, 20, 30]) # shape=[None, 1]
-m = var(0)
+m = var(0) # shape=[1]
-W = tensor()
+W = var(0.314, param=true) # shape=[1]
-U = tensor()
+U = var(0.375, param=true) # shape=[1]
-rnn = create_rnn(inputs=[input])
+rnn = pd.rnn()
-with rnn.stepnet() as net:
+with rnn.step():
-  x = net.set_inputs(0)
+  h = rnn.memory(init = m)
-  h = net.add_memory(init=m)
+  hh = rnn.previous_memory(h)
-  fc_out = pd.matmul(W, x)
+  a = layer.fc(W, x)
-  hidden_out = pd.matmul(U, h.pre(n=1))
+  b = layer.fc(U, hh)  
-  sum = pd.add_two(fc_out, hidden_out)
+  s = pd.add(a, b)
-  act = pd.sigmoid(sum)
+  act = pd.sigmoid(s)
-  h.update(act)                       # update memory with act
+  rnn.update_memory(h, act)
-  net.set_outputs(0, act, hidden_out) # two outputs
+  rnn.output(a, b)
 o1, o2 = rnn()
-print o1, o2
 ```
 has its equivalent C++ program as follows
 ```c++
 int* x = {10, 20, 30};
-int m = 0;
+int* m = {0};
-int W = some_value();
+int* W = {0.314};
-int U = some_other_value();
+int* U = {0.375};
 int mem[sizeof(x) / sizeof(x[0]) + 1];
 int o1[sizeof(x) / sizeof(x[0]) + 1];
@@ -131,20 +135,16 @@ int o2[sizeof(x) / sizeof(x[0]) + 1];
 for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) {
  int x = x[i-1];
  if (i == 1) mem[0] = m;
-  int fc_out = W * x;
+  int a = W * x;
-  int hidden_out = Y * mem[i-1];
+  int b = Y * mem[i-1];
-  int sum = fc_out + hidden_out;
+  int s = fc_out + hidden_out;
  int act = sigmoid(sum);
  mem[i] = act;
  o1[i] = act;
  o2[i] = hidden_out;
 }
-print_array(o1);
-print_array(o2);
 ```
 ## Compilation and Execution
 Like TensorFlow programs, a PaddlePaddle program is written in Python.  The first part describes a neural network as a protobuf message, and the rest part executes the message for training or inference.
@@ -210,11 +210,11 @@ a = pd.Varaible(shape=[20, 20])
 b = pd.fc(a, params=["fc.w", "fc.b"])
 rnn = pd.create_rnn()
-with rnn.stepnet() as net:
+with rnn.stepnet():
-    x = net.set_inputs(a)
+    x = a.as_step_input()
    # reuse fc's parameter
    fc_without_b = pd.get_variable("fc.w")
-    net.set_outputs(fc_without_b)
+    rnn.output(fc_without_b)
 out = rnn()
 ```

--- a/doc/design/if_else_op.md
+++ b/doc/design/if_else_op.md
-IfOp should have only one branch. An IfOp operator takes a `cond` variable whose value must be a vector of N boolean elements. Its return value has N instances. If cond[i] == True, input instance input[i] will go through true_block() and generate output[i]; otherwise it will produce output from false_bloack().
+# The `IfElse` Operator
-```python
+PaddlePaddle's `IfElse` operator differs from TensorFlow's:
-import paddle as pd
-x = var()
+- the TensorFlow version takes a scalar boolean value as the condition so that the whole mini-batch goes to either the true or the false branch, whereas
-y = var()
+- the PaddlePaddle version takes a vector of boolean values as the condition, and instances corresponding to true values go to the true branch, while those corresponding to false values go to the false branch.
-cond = var()
-default_value = var()
+## Example
-b = pd.create_ifelseop(inputs=[x], output_num=1)
-with b.true_block():
+The following PaddlePaddle program shows the usage of the IfElse operator:
-    x = b.inputs(0)
-    z = operator.add(x, y)
-    b.set_output(0, operator.softmax(z))
-with b.false_block():
-    x = b.inputs(0)
-    z = layer.fc(x)
-    b.set_output(0, operator.softmax(z))
-out = b(cond)
-```
-If only true_block is set in an IfElseOp, a special case is that we can have a default value for false as:
 ```python
 import paddle as pd
-x = var()
+x = minibatch([10, 20, 30]) # shape=[None, 1]
-y = var()
+y = var(1) # shape=[1], value=1
-cond = var()
+z = minibatch([10, 20, 30]) # shape=[None, 1]
-default_value = var()
+cond = larger_than(x, 15) # [false, true, true]
-b = pd.create_ifelseop(inputs=[x], output_num=1, default_value)
+ie = pd.ifelse()
-with b.true_block():
+with ie.true_block():
-    x = b.inputs(0)
+    d = pd.layer.add(x, y)
-    z = operator.add(x, y)
+    ie.output(d, pd.layer.softmax(d))
-    b.set_output(0, operator.softmax(z))
+with ie.false_block():
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
-out = b(cond)
+A challenge to implement the `IfElse` operator is to infer those variables to be split, or, say, to identify the variable of the mini-batch or those derived from the mini-batch.
+An equivalent C++ program is as follows:
+```c++
+namespace pd = paddle;
+int x = 10;
+int y = 1;
+int z = 10;
+bool cond = false;
+int o1, o2;
+if (cond) {
+  int d = x + y;
+  o1 = d;
+  o2 = pd::layer::softmax(d);
+} else {
+  int d = pd::layer::fc(z);
+  o1 = d;
+  o2 = d+1;
+}
 ```
-where default_value is a list of vars for `cond` == False.
--- a/doc/design/program.md
+++ b/doc/design/program.md
-# Design Doc: ProgramDesc
+# Design Doc: PaddlePaddle Programs
-The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program.
+## Compile and Execution
+A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`.
-As described in [graph.md](./graph.md), the first five lines of the following PaddlePaddle program
+A simple example PaddlePaddle program can be found in [graph.md](./graph.md):
 ```python
 x = layer.data("images")
@@ -13,36 +15,112 @@ optimize(cost)
 train(cost, reader=mnist.train())
 ```
-generates, or compiles, a PaddelPaddle program, which is represented by the following protobuf message:
+The first five lines of the above PaddlePaddle program generate, or compile, the `ProgramDesc` message.  The last line runs it.
-```protobuf
+## Programs and Blocks
-message ProgramDesc {
-  repeated BlockDesc blocks = 1;
+The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program.
+- program: some nested blocks
+- [block](./block.md):
+  - some local variable definitions, and
+  - a sequence of operators
+The concept of block comes from usual programs.  For example, the following C++ program has three blocks:
+```c++
+int main() { // block 0
+  int i = 0;
+  if (i < 10) { // block 1
+    for (int j = 0; j < 10; j++) { // block 2
+    }
+  }
+  return 0;
 }
+```
+The following PaddlePaddle program has three blocks:
+```python
+import paddle as pd  # block 0
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+ie = pd.ifelse()
+with ie.true_block():  # block 1
+    d = pd.layer.add_scalar(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():  # block 2
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+## `BlockDesc` and `ProgramDesc`
+All protobuf messages are defined in `framework.proto`.
+`BlockDesc` is straight-forward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`.
+```protobuf
 message BlockDesc {
  required int32 parent = 1;
  repeated VarDesc vars = 2;
  repeated OpDesc ops = 3;
 }
+```
+The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks.
+All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array.
+```protobuf
+message ProgramDesc {
+  repeated BlockDesc blocks = 1;
+}
+```
+### Global Block
+The global block is the first one in the above array.
+## Operators that Use Blocks
+In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch.
+The definition of `OpDesc` shows that an operator could have some attributes:
+```protobuf
 message OpDesc {
  AttrDesc attrs = 1;
  ...
 }
+```
+and an attribute could be of type block, which is, in fact, a block ID as described above:
+```
 message AttrDesc {
-  required AttrType type = 1;
+  required string name = 1;
-  // index into ProgramDesc::blocks when type==BLOCK
+  enum AttrType {
-  optional int32 block = 2;
+    INT = 1,
+    STRING = 2,
+    ...
+    BLOCK = ...
+  }
+  required AttrType type = 2;
+  optional int32 block = 10; // when type == BLOCK
  ...
 }
 ```
-When each of the first five lines runs, related Python function, e.g., `layer.fc`, calls C++ InferShape functions.  This InferShape function needs to access the properties of VarDesc's accessed by the current OpDesc. These VarDesc's might not be defined in the current block, but in some ancestor blocks.  This requires that we can trace the parent of a block.
+## InferShape
-A nested block is often an attribute of an operator, most likely, an IfElseOp or a WhileOp.  In above solution, all blocks are in `ProgramDesc::blocks`, this implicitly assigns a zero-based ID to each block -- the index of the block in `ProgramDesc::blocks`.  So that `AttrDesc::block` could be an integer block ID.
 With this design, the InferShape function should take the following parameters:

--- a/doc/design/python_api.md
+++ b/doc/design/python_api.md
+# Design Doc: Python API
+Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
+| Python classes | Protobuf messages |
+| --- | --- |
+| Program | ProgramDesc |
+| Block | BlockDesc |
+| Operator | OpDesc |
+| Variable | VarDesc |
+Please be aware that these Python classes need to maintain some construction-time information, which is not part of the protobuf messages.
+## Core Concepts
+### Program
+A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s.  The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` only stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array.  For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks.
+Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`.
+```python
+class Program(objects):
+    def __init__(self):
+        self.proto = core.NewProgram() # a C++ ProgramDesc pointer.
+        self.blocks = vector<Block>()
+        self.blocks.append(Block(self, -1)) # the global block
+        self.current_block = 0          # initialized to the global block
+    def global_block():
+        return self.blocks[0]
+    def current_block():
+        return self.get_block(self.current_block)
+    def rollback():
+        self.current_block = self.current_block().parent_idx
+    def create_block():
+        new_block_idx = len(self.block)
+        self.blocks.append(Block(self, self.current_block))
+        self.current_block = new_block_idx
+        return current_block()
+```
+`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`.
+`Program` creates the first block as the global block in its constructor.  All parameters and their initializer operators are in the global block.
+### Block
+A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) includes
+1. a map from variable names to an instance of the Python `Variable` class, and
+1. a list of `Operator` instances.
+```python
+class Block(objects):
+    def __init__(self, program, parent_idx):
+        self.proto = core.NewBlock(program.proto)
+        self.program = program
+        self.vars = map<string, Variable>()
+        self.ops = vector<Operator>()
+        self.parent_idx = parent_idx
+    def create_var(self, ...):
+        return Variable(self, ...)
+    def _create_global_var(self, ...):
+        program.global_block().create_var(...)
+    def create_parameter(self, name, ...):
+        # Parameter is a subclass of variable. See Parameter section for details.
+        self.vars[name] = Parameter(self._create_global_var(...), ...)
+        return self.vars[name]
+    def append_operator(self, ...):
+        self.ops.append(Operator(self, ...))
+    def prepend_operator(self, ...): # Parameter's ctor prepends initialize operators.
+       self.ops.prepend(Operator(self, ...))
+```
+`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator.
+`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
+### Operator
+The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes.
+```python
+class Operator(object):
+    def __init__(self,
+                 block,  # Block
+                 type,   # string
+                 inputs, # dict<string, Variable>
+                 outputs,# dict<string, Variable>
+                 attrs   # dict<string, Any>
+                 ):
+        self.proto = core.NewOpDesc(block.proto, type, inputs, outputs, attrs)
+        core.infer_shape(self.proto, inputs, outputs)
+    def type(self):
+        return self.proto.type()
+```
+`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++.
+### Variable
+Operators take Variables as their inputs and outputs.
+```python
+class Variable(object):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape,           # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 ):
+        if name is None:
+            name = unique_name_generator()
+        self.name = name
+        self.block = block
+        self.proto = core.NewVarDesc(block.proto, name, shape, lod_level)
+        self.writer = None
+```
+Please be aware of `self.writer`, which tracks the operator that creates the variable.  It is possible that more than one operator writes a variable, but in Python space, each write to a variable is represented by a Variable class.  This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**.
+### Parameter
+A parameter is a global variable with an initializer (or load) operator.
+```python
+class Parameter(Variable):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape,           # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 trainable,       # bool
+                 initialize_op_attrs,
+                 optimize_op_attrs):
+        super(Parameter, self).__init__(block, name, shape, dtype, lod_level)
+        self.trainable = trainable
+        self.optimize_op_attrs = optimize_op_attrs
+        block.prepend(Operator(block,  # Block
+                               initialize_op_attrs['type'],   # string
+                               None,   # no inputs
+                               self,   # output is the parameter
+                               initialize_op_attrs)
+```
+When users create a parameter, they can call
+```python
+program.create_parameter(
+  ...,
+  init_attr={
+    type: "uniform_random",
+    min: -1.0,
+    max: 1.0,
+  })
+)
+```
+In the above example, `init_attr.type` names an initialize operator.  It can also name the load operator:
+```python
+init_attr={
+ type: "load",
+ filename: "something.numpy",
+}
+```
+`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message.
+## Layer Functions
+A layer is a Python function that creates some operators and variables.  Layers simplify the work of application programmers.
+### Data Layer
+```python
+def data_layer(name, type, column_name):
+    block = the_current_program.global_block()
+    var = block.create_global_var(
+            name=name,
+            shape=[None] + type.dims(),
+            dtype=type.dtype)
+    block.prepend_operator(block,
+                           type="Feed",
+                           inputs = None,
+                           outputs = [var],
+                           {column_name: column_name})
+    return var
+```
+The input to the feed operator is a special variable in the global scope, which is the output of [Python readers](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md).
+### FC Layer
+```python
+def fc_layer(input, size, ...):
+    block = program.current_block()
+    w = block.create_parameter(...)
+    b = block.create_parameter(...)
+    out = block.create_var()
+    op = block.append_operator("FC", X=input, W=w, b=b, out=out)
+    out.writer = op
+    return out
+```
--- a/doc/design/refactor/session.md
+++ b/doc/design/refactor/session.md
+# Design Doc: Session
+## Abstract
+The *session* object encapsulates the environment in which the
+computation graph is executed.
+We will have the *local* session and *remote* session, they offer the
+same [interface](#interface). The local session encapsulates the local
+runtime environment and the remote session encapsulates the cluster
+runtime environment.
+The local runtime environment contains:
+1. computation devices (i.e., CPU, GPU) handles, and
+1. the [scope](../scope.md) which holds all variables.
+The remote runtime environment contains:
+1. computation devices (i.e., CPU and GPU on node 0, 1) in a cluster,
+   and
+1. the distributed [scope](../scope.md) in a cluster which holds all
+   variables.
+The user can create a remote session on Paddle Cloud and evaluate the
+computation graph with it. In this way, the user can control the
+remote computation resource in a cluster from his local computer.
+## Background
+The current design has an implicit global session in which
+`paddle.eval()` is executed. The pain point is:
+Since the user is not able to explicitly switch between runtime
+environments, the user cannot run a topology in two independent
+environments.
+For example, in reinforcement learning, the user may want to have a
+stale model for inference and a fresh model for training, and only
+replace the stale model with the fresh model periodically.
+Furthermore, we have no concept that encapsulates a remote environment
+that executes a computation graph.
+We need the session object to address the above issues.
+## Session
+A session is an object that owns the runtime environment. All
+computations are executed through `session.eval()`.
+### Interface
+```python
+eval(
+    targets,
+    feed_dict=None,
+)
+```
+Evaluates the target Operations or Variables in `targets`.
+- *targets*: the evaluation targets. Can be a single Operation or
+  Variable, or a list with the Operations or Variables as
+  elements. The value returned by `eval()` has the same shape as the
+  `target` argument.
+  The PaddlePaddle program is represented by
+  the [ProgramDesc](../program.md), `eval()` will infer the
+  ProgramDesc from the given targets and run the PaddlePaddle
+  program. Please
+  see
+  [this graph](./distributed_architecture.md#local-training-architecture) for
+  the detailed illustration for the local session
+  and
+  [this graph](./distributed_architecture.md#distributed-training-architecture) for
+  the detailed illustration for the remote session.
+- *feed_dict*: a dictionary that contains the tensors which override
+  the edges of the computation graph.
+  feed_dict not only can provide the input data, it can override any
+  OP's input as well:
+  ```python
+  a = pd.constant(2.0, name="a")
+  b = pd.variable(name="b")
+  c = pd.mul(a,b)
+  sess.eval(targets=c, feed_dict={"b":3.0}) # returns 6.0
+  ```
+```python
+close()
+```
+Closes the session and releases the scope that the session owns.
+### Create a Local Session
+```python
+session(
+    devices=None
+)
+```
+Creates a new session. One session owns one global scope, so creating
+multiple sessions will create different scopes.
+- *devices*: a single `string` or a list of `string` of device names,
+  the corresponding devices will be the computation devices for
+  `eval()`. If not specified, all available devices (e.g., all GPUs)
+  will be used. The user doesn't need to specify the CPU device since
+  it will be always used. Multiple sessions can use the same device.
+#### Example
+```Python
+a = paddle.constant(1.0)
+b = paddle.constant(2.0)
+c = a + b
+sess = paddle.session(devices=["gpu:0", "gpu:1", "fpga:0"])
+sess.eval(c)
+sess.close()
+```
+### Create a Remote Session
+```python
+create_cloud_job(
+    name,
+    num_trainer,
+    mem_per_trainer,
+    gpu_per_trainer,
+    cpu_per_trainer,
+    num_ps,
+    mem_per_ps,
+    cpu_per_ps,
+)
+```
+Creates a Paddle Cloud job. Fails if the job name exists.
+```python
+get_cloud_job(
+    name
+)
+```
+Gets a Paddle Cloud job.
+```python
+remote_session(
+    job
+)
+```
+- *job*: the Paddle Cloud job.
+#### Example
+```Python
+reader = paddle.reader.recordio("/pfs/home/peter/mnist-train-*") # data stored on Paddle Cloud
+image = reader.column(0)
+label = reader.column(1)
+fc1 = paddle.op.fc(image, size=256, act="sigmoid")
+fc2 = paddle.op.fc(fc1, size=10, act="softmax")
+cost = paddle.op.cross_entropy(fc2, label)
+opt = paddle.optimizer.sgd(cost)
+job = paddle.create_cloud_job("test", 3, "1G", 1, 1, 2, "1G", 1)
+sess = paddle.remote_session(job)
+for i in range(1000):
+    sess.eval(opt)
+sess.close()
+```
--- a/doc/design/refactorization.md
+++ b/doc/design/refactorization.md
 # Design Doc: Refactorization Overview
-The goal of refactorizaiton include:
+The goals of refactoring include:
-1. Make it easy for external contributors to write new elementory computaiton operations.
+1. Making it easy for external contributors to write new elementary computation operations.
-1. Make the codebase clean and readable.
+1. Making the codebase clean and readable.
-1. Introduce a new design of computation representation -- a computation graph of operators and variables.
+1. Designing a new computation representation -- a computation graph of operators and variables.
-1. The graph representation helps implementing auto-scalable and auto fault recoverable distributed computing.
+1. Implementing auto-scalability and auto fault recoverable distributed computing with the help of computation graphs.
 ## Computation Graphs
-1. PaddlePaddle represent the computation, training and inference of DL models, by computation graphs.
+1. PaddlePaddle represents the computation, training and inference of Deep Learning models, by computation graphs.
-  1. Please dig into [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a solid example.
+  1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a concrete example.
-1. Users write Python programs to describe the graphs and run it (locally or remotely).
+1. Users write Python programs to describe the graphs and run them (locally or remotely).
 1. A graph is composed of *variables* and *operators*.
-1. The description of graphs must be able to be serialized/deserialized, so it
+1. The description of graphs must be capable of being serialized/deserialized, so that:
-   1. could to be sent to the cloud for distributed execution, and
+   1. It can be sent to the cloud for distributed execution, and
-   1. be sent to clients for mobile or enterprise deployment.
+   1. It can be sent to clients for mobile or enterprise deployment.
-1. The Python program do
+1. The Python program does the following steps
-   1. *compilation*: runs a Python program to generate a protobuf message representation of the graph and send it to
+   1. *compilation*: run a Python program to generate a protobuf message representation of the graph and send it to
      1. the C++ library `libpaddle.so` for local execution,
      1. the master process of a distributed training job for training, or
      1. the server process of a Kubernetes serving job for distributed serving.
-   1. *execution*: according to the protobuf message, constructs instances of class `Variable` and `OperatorBase`, and run them.
+   1. *execution*: execute the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message.
-## Description and Realization
+## Description and Realization of Computation Graph
-At compile time, the Python program generates protobuf message representation of the graph, or the description of the graph.
+At compile time, the Python program generates a protobuf message representation of the graph, or the description of the graph.
-At runtime, the C++ program realizes the graph and run it.
+At runtime, the C++ program realizes the graph and runs it.
 | | Representation (protobuf messages) | Realization (C++ class objects) |
 |---|---|---|
@@ -42,30 +42,31 @@ At runtime, the C++ program realizes the graph and run it.
 |Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
 |Block|BlockDesc|Block|
-The word *graph* is exchangable with *block* in this document.  A graph represent computation steps and local variables as a C++/Java program block, or a pair of { and }.
+The word *graph* is interchangeable with *block* in this document.  A graph represents computation steps and local variables similar to a C++/Java program block, or a pair of curly braces (`{` and `}`).
 ## Compilation and Execution
-1. Run an applicaton Python program to describe the graph.  In particular,
+1. Run an application Python program to describe the graph.  In particular, the Python application program does the following:
-   1. create VarDesc to represent local/intermediate variables,
+   1. Create `VarDesc` to represent local/intermediate variables,
-   1. create operators and set attributes,
+   1. Create operators and set attributes,
-   1. validate attribute values,
+   1. Validate attribute values,
-   1. inference the type and the shape of variables,
+   1. Infer the type and the shape of variables,
-   1. plan for memory-reuse for variables,
+   1. Plan memory-reuse for variables,
-   1. generate backward and optimization part of the Graph.
+   1. Generate the backward graph
-   1. possiblly split the graph for distributed training.
+   1. Optimize the computation graph.
+   1. Potentially, split the graph for distributed training.
-1. The invocation of `train` or `infer` in the application Python program:
+1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the application Python program does the following:
-   1. create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block,
+   1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block,
      1. realize local variables defined in the BlockDesc message in the new scope,
      1. a scope is similar to the stack frame in programming languages,
-   1. create an instance of class `Block`, in which,
+   1. Create an instance of class `Block`, in which,
      1. realize operators in the BlockDesc message,
-   1. run the Block by calling
+   1. Run the Block by calling
      1. `Block::Eval(vector<Variable>* targets)` for forward and backward computations, or
      1. `Block::Eval(vector<Operator>* targets)` for optimization.
@@ -76,14 +77,14 @@ The word *graph* is exchangable with *block* in this document.  A graph represen
 Compile Time -> IR -> Runtime
 ```
-### Benefit
+### Benefits of IR
 - Optimization
  ```text
  Compile Time -> IR -> Optimized IR -> Runtime
  ```
- Send automatically partitioned IR to different nodes.
+- Automatically send partitioned IR to different nodes.
-  - Automatic data parallel
+  - Automatic Data Parallelism
    ```text
    Compile Time
    |-> Single GPU IR
@@ -92,7 +93,7 @@ Compile Time -> IR -> Runtime
            |-> Node-1 (runs trainer-IR-1)
            |-> Node-2 (runs pserver-IR)
    ```
-  - Automatic model parallel (planned for future)
+  - Automatic Model Parallelism (planned for future)
 ---
@@ -105,10 +106,10 @@ Compile Time -> IR -> Runtime
 # Operator
 ![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot)
-* `Operator` is the fundamental building block as the user interface.
+* `Operator` is the fundamental building block of the user interface.
-    * Operator stores input/output variable name, and attributes.
+    * Operator stores input/output variable names, and attributes.
-    * The `InferShape` interface is used to infer output variable shapes by its input shapes.
+    * The `InferShape` interface is used to infer the shapes of the output variables based on the shapes of the input variables.
-    * Use `Run` to compute `input variables` to `output variables`.
+    * Use `Run` to compute the `output` variables from the `input` variables.
 ---
@@ -126,30 +127,29 @@ Compile Time -> IR -> Runtime
 # Why separate Kernel and Operator
 * Separate GPU and CPU code.
-    * Make Paddle can run without GPU.
+    * Make Paddle capable of running without GPU.
-* Make one operator (which is user interface) can contain many implementations.
+* Make it possible for one operator (which is a user interface) to have many implementations.
-    * Same mul op, different FP16, FP32 Kernel. different MKL, eigen kernel.
+    * For example, the same multiplication op can have different kernel implementations, such as an FP16 kernel, an FP32 kernel, an MKL kernel, or an Eigen kernel.
 ---
 # Libraries for Kernel development
 * `Eigen::Tensor` contains basic math and element-wise functions.
    * Note that `Eigen::Tensor` has broadcast implementation.
-    * Limit number of `tensor.device(dev) = ` in your code.
+    * Limit the number of `tensor.device(dev) = ` in your code.
-* `thrust::tranform` and `std::transform`.
+* `thrust::transform` and `std::transform`.
-    * `thrust` has the same API as C++ standard library. Using `transform` can quickly implement a customized elementwise kernel.
+    * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels.
-    * `thrust` has more complex API, like `scan`, `reduce`, `reduce_by_key`.
+    * `thrust` also has more complex APIs, like `scan`, `reduce`, `reduce_by_key`.
 * Hand-writing `GPUKernel` and `CPU` code
-    * Do not write `.h`. CPU Kernel should be in `.cc`. GPU kernel should be in `.cu`. (`GCC` cannot compile GPU code.)
+    * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.)
 ---
-# Operator Register
+# Operator Registration
-## Why register is necessary?
+## Why is registration necessary?
 We need a method to build mappings between Op type names and Op classes.
-## How to do the register?
+## How is registration implemented?
+Maintaining a map, whose key is the type name and the value is the corresponding Op constructor.
-Maintain a map, whose key is the type name and value is corresponding Op constructor.
 ---
 # The Registry Map
@@ -169,7 +169,7 @@ Maintain a map, whose key is the type name and value is corresponding Op constru
 # Related Concepts
 ### Op_Maker
-It's constructor takes `proto` and `checker`. They are compeleted during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
+Its constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
 ### Register Macros
 ```cpp
@@ -177,34 +177,34 @@ REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class)
 REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
 ```
-### `USE` Macros
+### USE Macros
-make sure the registration process is executed and linked.
+Make sure the registration process is executed and linked.
 ---
-# Register Process
+# Registration Process
-1. Write Op class, as well as its gradient Op class if there is.
+1. Write an Op class and its gradient Op class, if required.
-2. Write Op maker class. In the constructor, describe its inputs, outputs, and attributes.
+2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator.
-3. Invoke macro `REGISTER_OP`. The macro will
+3. Invoke the macro `REGISTER_OP`. This macro will
-	1. call maker class to complete `proto` and `checker`
+	1. Call maker class to complete the `proto` and the `checker`
-	2. with the completed `proto` and `checker`, build a new key-value pair in the `OpInfoMap`
+	2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap`
-4. Invoke `USE` macro in where the Op is used to make sure it is linked.
+4. Invoke the `USE` macro where the Op is used, to make sure that it is linked.
 ---
 # Backward Module (1/2)
 ### Create Backward Operator
- Mapping from forwarding Op to backward Op
+- Mapping from forward Op to backward Op
 ![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png)
 ---
 # Backward Module (2/2)
 ### Build Backward Network
- **Input** graph of forwarding operators
+- **Input**: graph of forward operators
- **Output** graph of backward operators
+- **Output**: graph of backward operators
- **corner case in construction**
+- **Corner cases in construction**
-	- shared variable => insert `Add` operator
+	- Shared Variables => insert an `Add` operator to combine gradients
-	- no gradient => insert `fill_zero_grad` operator
+	- No Gradient => insert a `fill_zero_grad` operator
-	- recursive netOp => call `Backward` recursively
+	- Recursive NetOp => call `Backward` recursively
 	- RNN Op => recursively call `Backward` on stepnet
@@ -213,41 +213,41 @@ make sure the registration process is executed and linked.
 * `Tensor` is an n-dimension array with type.
 	* Only dims and data pointers are stored in `Tensor`.
-	* All operators on `Tensor` is written in `Operator` or global functions.
+	* All operations on `Tensor` are written in `Operator` or global functions.
-	* variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
+	* Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
-* `Variable` is the inputs and outputs of an operator. Not just `Tensor`.
+* `Variable` instances are the inputs and the outputs of an operator. Not just `Tensor`.
-	* step_scopes in RNN is a variable and not a tensor.
+	* `step_scopes` in RNN is a variable and not a tensor.
-* `Scope` is where variables store at.
+* `Scope` is where variables are stored.
-	* map<string/*var name */, Variable>
+	* map<string `variable_name`, Variable>
-	* `Scope` has a hierarchical structure. The local scope can get variable from its parent scope.
+	* `Scope` has a hierarchical structure. The local scope can get variables from its parent scope.
 ---
 # Block (in design)
-## the difference with original RNNOp
+## the difference between original RNNOp and Block
- as an operator is more intuitive than `RNNOp`,
+- As an operator is more intuitive than `RNNOp`,
- offers new interface `Eval(targets)` to deduce the minimal block to `Run`,
+- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
- fits the compile-time/ runtime separation design.
+- Fits the compile-time/ runtime separation design paradigm.
-  - during the compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serialize to a `BlockDesc`
+  - During the compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serializes them to a `BlockDesc`
-  - when graph executes, a Block with `BlockDesc` passed in creates `Op` and `Var` then `Run`
+  - When graph executes, a Block with `BlockDesc` is passed. It then creates `Op` and `Var` instances and then invokes `Run`.
 ---
 # Milestone
- take Paddle/books as the main line, the requirement of the models motivates framework refactoring,
+- Take Paddle/books as the main line, the requirement of the models motivates framework refactoring,
- model migration
+- Model migration
-  - framework development gives **priority support** to model migration, for example,
+  - Framework development gives **priority support** to model migration, for example,
    - the MNIST demo needs a Python interface,
    - the RNN models require the framework to support `LoDTensor`.
-  - determine some timelines,
+  - Determine some timelines,
-  - heavily-relied Ops need to be migrated first,
+  - Frequently used Ops need to be migrated first,
-  - different models can be migrated parallelly.
+  - Different models can be migrated in parallel.
- improve the framework at the same time
+- Improve the framework at the same time
- accept imperfection, concentrated on solving the specific problem at the right price.
+- Accept imperfection, concentrate on solving the specific problem at the right price.
 ---
 # Control the migration quality
- compare the performance of migrated models with old ones.
+- Compare the performance of migrated models with old ones.
- follow google C style
+- Follow the google C++ style
- build the automatic workflow of generating Python/C++ documentations
+- Build the automatic workflow of generating Python/C++ documentations.
-  - the documentation of layers and ops should be written inside the code
+  - The documentation of layers and ops should be written inside the code.
-  - take the documentation quality into account when doing PR
+  - Take the documentation quality into account when submitting pull requests.
-  - preview the documentations, read and improve them from users' perspective
+  - Preview the documentations, read and improve them from a user's perspective.
--- a/doc/design/register_grad_op.md
+++ b/doc/design/register_grad_op.md
+# Design Doc: Gradient Operators Registration
+## The Problem Posed
+In our current operator registration mechanism, for each operator, the programmer should register a *gradient operator creator* function, which takes a C++ operator instance, and returns the corresponding gradient instance.
+However, as we decided to separate the *compilation* and *execution* of DL models, we need to reshape the creator to take a protobuf `OpDesc` message and return a corresponding message.
+More than that, the new registration mechanism needs to support the fact that an operator's gradient computation might be a composition of operators.
+## Current Implementation
+`OpInfo`s are stored in an associative map whose key is the operator type. The `grad_op_type_` field indicates the associated gradient operator type. An operator can create its gradient operator via the `OpInfo::creator_` of the gradient operator. The pseudo code is
+```cpp
+struct OpInfo {
+  std::function<OperatorBase*(...)> creator_;
+  std::string grad_op_type_;
+  ...
+};
+map<string, OpInfo> OpInfoMap;
+OperatorBase* CreateGradientOperator(const OperatorBase& op) {
+  return OpInfoMap.at(op.Type()).creator_(...);
+}
+```
+## Proposed Solution
+The mapping relationship between an operator and its gradient operators is a function. The interface of that function is:
+```cpp
+// (OpDesc) --> vector<OpDesc>
+std::function<std::vector<OpDescBind>(const OpDescBind&)>;
+```
+The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper for protobuf message `OpDesc` to manipulate `OpDesc` fast.
+The `GradOpDescMaker` will be registered in `OpInfo`, to replace `grad_op_type_` field. The `OpInfo` should be
+```cpp
+struct OpInfo {
+  std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>  grad_op_maker_;
+  ...
+};
+```
+The `grad_op_maker_ ` is `nullptr` if the operator does not have associated gradient operators.
+We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is
+```cpp
+class GradOpDescMakerBase {
+public:
+  GradOpDescMakerBase(const OpDescBind& );
+  virtual std::vector<std::unique_ptr<OpDescBind>> operator()()const = 0;
+};
+```
+We can convert `GradOpDescMakerBase` to `std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>` by
+```cpp
+using GradOpMaker = ...;
+std::function<std::vector<OpDescBind>(const OpDescBind&)> func;
+func = [] (const OpDescBind& fwd_op) {
+  GradOpMaker maker(fwd_op);
+  return maker();
+};
+```
+We can write many helper functions since the `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forwarding operator.
+We should change the register macros at the same time. In the current solution, there is no difference between forwarding operators and backward operators. So `REGISTER_OP` just registers one operator. If the `REGISTER_OPERATOR` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. It can be done by a macro that contains `__VA_ARGS__`.
+The user interface should be
+```cpp
+vector<OpDesc> MinusOpGradMaker(OpDesc) {...}
+REGISTER_OPERATOR(minus, MinusOp, MinusOpProtoAndCheckerMaker, MinusOpGradMaker);
+// Developers can still manually implement gradient operator.
+REGISTER_OPERATOR(minus_grad, MinusGradOp);
+```
+The interface of current `REGISTER_OP` macro could not be changed. In `REGISTER_OP`, it will invoke `REGISTER_OPERATOR` two times and generate GradOpDescMaker inside.
+```cpp
+REGISTER_OP(minus, MinusOp, MinusOpProtoAndCheckerMaker, minus_grad, MinusGradOp);
+```
--- a/doc/design/tensor_array.md
+++ b/doc/design/tensor_array.md
+# Design for TensorArray
+This design doc presents the necessity of a new C++ class `TensorArray`.
+In addition to the very simple C++ implementation
+```c++
+class TensorArray {
+ public:
+  explicit TensorArray(const LoDTensor&);
+  explicit TensorArray(size_t size);
+ private:
+  vector<LoDTensor> values_;
+};
+```
+We also need to expose it to PaddlePaddle's Python API,
+because users would want to use it with our very flexible operators `WhileLoop`.
+An example for a RNN based on dynamic operators is 
+```python
+input = pd.data(...)
+num_steps = Var(12)
+TensorArray states(size=num_steps)
+TensorArray step_inputs(unstack_from=input)
+TensorArray step_outputs(size=num_steps)
+W = Tensor(...)
+U = Tensor(...)
+default_state = some_op()
+step = Var(1)
+wloop = paddle.create_whileloop(loop_vars=[step])
+with wloop.frame():
+    wloop.break_if(pd.equal(step, num_steps))
+    pre_state = states.read(step-1, default_state)
+    step_input = step_inputs.read(step)
+    state = pd.sigmoid(pd.matmul(U, pre_state) + pd.matmul(W, step_input))
+    states.write(step, state)
+    step_outputs.write(step, state) # output state
+    step.update(step+1)
+output = step_outputs.stack()
+```
+## Background
+Steps are one of the core concepts of RNN. In each time step of RNN, there should be several input segments, states, and output segments; all these components act like arrays, for example, calling `states[step_id]` will get the state in the `step_id`th time step.
+An RNN can be implemented with the following pseudocode
+```c++
+Array states;
+Array input_segments;
+Array output_segments;
+Parameter W, U;
+step = 1
+seq_len = 12
+while_loop {
+   if (step == seq_len) break;
+    states[step] = sigmoid(W * states[step-1] + U * input_segments[step]);
+    output_segments[step] = states[step] // take state as output
+   step++;
+}
+```
+According to the [RNN roadmap](https://github.com/PaddlePaddle/Paddle/issues/4561), there are several different RNNs that PaddlePaddle will eventually support.
+Currently, the basic RNN implementation supported by PaddlePaddle is the `recurrent_op` which takes tensors as input and splits them into `input_segments`.
+Since a tensor cannot store variable-length sequences directly, PaddlePaddle implements the tensor with level of details (`LoDTensor` for short).
+Segmenting the `LoDTensor` is much more complicated than splitting a tensor, which makes it necessary to refactor the `recurrent_op` with `LoDTensor` segmenting support.
+As the next step in RNN support, `dynamic_recurrent_op` should be introduced to handle inputs with variable-length sequences.
+The implementation is similar to `recurrent_op`. 
+The key difference is the way **the original input `LoDTensors` and outputs are split to get the `input_segments` and the `output_segments`.**
+Though it can't be built over `recurrent_op` or `dynamic_recurrent_op` directly,
+the logic behind splitting a tensor or a LoD tensor into `input_segments` remains the same.
+## Why `TensorArray`
+The logic behind splitting the inputs to segments, states and outputs is similar and can be shared in a separate module.
+The array of `states`, `input_segments` and `output_segments` would be exposed to users when writing a dynamic RNN model similar to the above pseudo codes. 
+So there should be an array-like container, which can store the segments of a tensor or LoD tensor.
+**This container can store an array of tensors and provides several methods to split a tensor or a LoD tensor** .
+This is where the notion of `TensorArray` comes from.
+## Introduce TensorArray to unify all the three RNNs
+TensorArray as a new concept is borrowed from TensorFlow, 
+it is meant to be used with dynamic iteration primitives such as `while_loop` and `map_fn`.
+This concept can be used to support our new design of dynamic operations, and help to refactor some existing variant-sentence-related layers, 
+such as `recurrent_op`, `RecurrentGradientMachine`.
+In [our design for dynamic RNN](https://github.com/PaddlePaddle/Paddle/pull/4401), 
+`TensorArray` is used to segment inputs and store states in all time steps.
+By providing some methods similar to a C++ array,
+the definition of some state-based dynamic models such as RNN can be more natural and highly flexible.
+## Dynamic-operations on TensorArray
+`TensorArray` will be used directly when defining dynamic models, so some operators listed below should be implemented
+```python
+# several helper operators for TensorArray
+def tensor_array_stack(ta, tensor):
+    '''
+    get a tensor array `ta`, return a packed `tensor`.
+    '''
+    pass
+def tensor_array_unstack(tensor, ta):
+    '''
+    get a `tensor`, unstack it and get a tensor array `ta`.
+    '''
+    pass
+def tensor_array_write(ta, index, tensor, data_shared):
+    '''
+    get a `tensor` and a scalar tensor `index`, write `tensor` into index-th
+    value of the tensor array `ta`.
+    `data_shared` is an attribute that specifies whether to copy or reference the tensors.
+    '''
+    pass
+def tensor_array_read(ta, index, tensor):
+    '''
+    get a tensor array `ta`, a scalar tensor `index`, read the index-th value of
+    `ta` and return as the `tensor`.
+    '''
+    pass
+def tensor_array_size(ta, tensor):
+    '''
+    get a tensor array `ta`, return the size of `ta` and return as the scalar `tensor`.
+    '''
+    pass
+```
+It is tedious for users to use so many low-level operators, so some helper methods should be proposed in the Python wrapper to make `TensorArray` easier to use,
+for example
+```python
+class TensorArray:
+    def __init__(self, name):
+        self.name = name
+        self.desc = TensorArrayDesc()
+    def stack(self, name=None):
+        '''
+        Pack the values in a `TensorArray` into a tensor with rank one higher
+        than each tensor in `values`.
+        `stack` can be used to split tensor into time steps for RNN or whileloop.
+        @name: str
+            the name of the variable to output.
+        '''
+        tensor = NewVar(name)
+        tensor_array_stack(self.name, tensor)
+        return tensor
+    def unstack(self, input):
+        '''
+        Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors.
+        `unstack` can be used to concatenate all the time steps for RNN or whileloop.
+        @input: str
+            the name of input tensor
+        '''
+        tensor_array_unstack(tensor, self.name)
+    def write(self, index, value, data_shared=True):
+        '''
+        Write value into index of the TensorArray.
+        If `data_shared` is set to True, then the index-th value in TensorArray will
+        be shared with the tensor passed in.
+        @index: str
+            name of a scalar tensor
+        @value: str
+            name of a tensor
+        @data_shared: bool
+        '''
+        tensor_array_write(self.name, index, value, data_shared)
+    def read(self, index, output):
+        '''
+        Read the value at location `index` in the `TensorArray`.
+        @index: str
+            name of a scalar tensor
+        @output:
+            name of a output variable
+        '''
+        tensor_array_read(self.name, index, output)
+    def size(self, output):
+        '''
+        Return the number of values.
+        @output: str
+            name of a scalar tensor
+        '''
+        tensor_array_size(self.name, output)
+```
+## LoDTensor-related Supports
+The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes variable-length sequences as input, and outputs sequences too.
+Since each step of RNN can only take a tensor-represented batch of data as input, 
+some preprocessing should be performed on the inputs, such as sorting the sentences by their length in descending order, cutting each word, and packing the results into new batches.
+Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`,
+these two operations are similar to `stack` and `unstack` except that they operate on variable-length sequences formatted as a LoD tensor rather than a tensor.
+Some definitions are like
+```python
+def unpack(level):
+    '''
+    Split LodTensor in some `level` and generate batches, if set `sort_by_length`,
+    will sort by length.
+    Returns:
+        - a new `TensorArray`, whose values are LodTensors and represents batches
+          of data.
+        - an int32 Tensor, which stores the map from the new batch's indices to
+          original LoDTensor
+    '''
+    pass
+def pack(level, indices_map):
+    '''
+    Recover the original LoD-arranged LoDTensor with the values in a `TensorArray`
+    and `level` and `indices_map`.
+    '''
+    pass
+```
+With these two methods, an RNN that supports variable-length sentences can be implemented like
+```c++
+// input is the variable-length data
+LodTensor sentence_input(xxx);
+TensorArray ta;
+Tensor indice_map;
+Tensor boot_state = xxx; // to initialize rnn's first state
+TensorArray::unpack(input, 1/*level*/, true/*sort_by_length*/, &ta, &indice_map);
+TensorArray step_outputs;
+TensorArray states;
+for (int step = 0; step < ta.size(); step++) {
+  auto state = states.read(step);
+  // rnnstep is a function which acts like a step of RNN
+  auto step_input = ta.read(step);
+  auto step_output = rnnstep(step_input, state);
+  step_outputs.write(step_output, true/*data_shared*/);
+}
+// rnn_output is the final output of an rnn
+LoDTensor rnn_output = ta.pack(ta, indice_map);
+```
+The code above shows that by embedding the LoDTensor-related preprocess operations into `TensorArray`,
+the implementation of an RNN that supports variable-length sentences is far more concise than `RecurrentGradientMachine`, because the latter mixes all the code together and is hard to read and extend.
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -206,7 +206,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
    - `REGISTER_OP` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulOpGrad`，类型名为`mul_grad`。
    - `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op。
-    - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulKernel`类。
+    - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulGradKernel`类。
 - 在 `.cu`文件中注册GPU Kernel。
@@ -285,41 +285,27 @@ class TestMulGradOp(GradientChecker):
            'Y': np.random.random((84, 100)).astype("float32")
        }
-    def test_cpu_gpu_compare(self):
+    def test_check_grad_normal(self):
-        self.compare_grad(self.op, self.inputs)
-    def test_normal(self):
        # mul op will enlarge the relative error
-        self.check_grad(
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
-            self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5)
-    def test_ignore_x(self):
+    def test_check_grad_ingore_x(self):
        self.check_grad(
-            self.op,
+            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-            self.inputs, ["Y"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"X"})
-    def test_ignore_y(self):
+    def test_check_grad_ingore_y(self):
        self.check_grad(
-            self.op,
+            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-            self.inputs, ["X"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"Y"})
 ```
 下面解释代码中一些关键的地方:
 - 调用`create_op("mul")`创建反向Op对应的前向Op。
- 调用`compare_grad`函数对比CPU、GPU计算结果。
+- `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
- `test_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
+  - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
-  - 第一个参数`self.op` : 前向Op。
+  - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。
-  - 第二个参数`self.inputs` : 输入词典，词典的Key和`ProtoMaker`定义保持一致。
+  - 第三个参数`max_relative_error`：指定检测梯度时能容忍的最大错误值。
-  - 第三个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
+- `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。
-  - 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`
- `test_ignore_x`和`test_ignore_y`分支用来测试只需要计算一个输入梯度的情况。
 ### 编译和执行单元测试

--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
@@ -182,7 +182,7 @@ Note that **different devices (CPU, GPU)share an Op definition; whether or not t
 `MulOp`'s CPU and GPU share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
-To ease the writing of `OpKernel` compute, and for reusing code cross-device, `Eigen unsupported Tensor` module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md).
+To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md).
 This concludes the forward implementation of an operator. Next its operation and kernel need to be registered in a `.cc` file.
@@ -205,7 +205,7 @@ The definition of its corresponding backward operator, if applicable, is similar
    - `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
    - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
-    - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulKernel`.
+    - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
 - Registering GPU Kernel in `.cu` files
@@ -293,41 +293,27 @@ class TestMulGradOp(GradientChecker):
            'Y': np.random.random((84, 100)).astype("float32")
        }
-    def test_cpu_gpu_compare(self):
+    def test_check_grad_normal(self):
-        self.compare_grad(self.op, self.inputs)
-    def test_normal(self):
        # mul op will enlarge the relative error
-        self.check_grad(
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
-            self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5)
-    def test_ignore_x(self):
+    def test_check_grad_ingore_x(self):
        self.check_grad(
-            self.op,
+            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-            self.inputs, ["Y"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"X"})
-    def test_ignore_y(self):
+    def test_check_grad_ingore_y(self):
        self.check_grad(
-            self.op,
+            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-            self.inputs, ["X"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"Y"})
 ```
 Some key points in the code above include:
 - `create_op("mul")` creates the backward operator's corresponding forward operator.
- `compare_grad` compares results between utilizing the CPU and the GPU.
 - `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods.
-  - The first variable `self.op` denotes the forward operator.
+  - The first variable `["X", "Y"]` appoints `X` and `Y` to be scale tested.
-  - The second variable `self.inputs` denotes the input dictionary, which has its key value identical to its `ProtoMaker` definitions.
+  - The second variable `"Out"` points to the network's final output target `Out`.
-  - The third variable `["X", "Y"]` appoints `X` and `Y` to be scale tested.
+  - The third variable `max_relative_error` points to the maximum relative tolerance error during scaling tests.
-  - The fourth variable `"Out"` points to the network's final output target `Out`.
+- `test_check_grad_ingore_x` and `test_check_grad_ingore_y` branches test the cases where there is only one scaling input.
- `test_ignore_x` and `test_ignore_y`branches test the cases where there is only one scaling input.
 ### Compiling and Running

--- a/doc/howto/dev/use_eigen_en.md
+++ b/doc/howto/dev/use_eigen_en.md
+## How to use Eigen in Paddle
+Essentially, a neural network is a compute graph. The data needed for the computation is stored in `Tensor`s and its computation procedure is described by `Operator`s. An `Operator` calls the `Compute` interface in its corresponding `OpKernel` and operates on the `Tensor`.
+### Eigen Tensor Module
+The Eigen Tensor module supports powerful element-wise computation. In addition, a piece of code written using it can be run on both the CPU and the GPU.
+Note that Eigen Tensor is still being actively developed, so its tests are not completely covered and its documentation may be sparse.
+For details on Eigen Tensor module, please see [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md).
+### paddle::framework::Tensor
+Paddle's Tensor is defined in the framework directory with the following interface:
+```cpp
+class Tensor {
+ public:
+  /*! Return a pointer to mutable memory block. */
+  template <typename T>
+  inline T* data();
+  /**
+   * @brief   Return a pointer to mutable memory block.
+   * @note    If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(platform::Place place);
+  /**
+   * @brief     Return a pointer to mutable memory block.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] place   The place of the memory block.
+   *
+   * @note      If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(DDim dims, platform::Place place);
+  /*! Resize the dimensions of the memory block. */
+  inline Tensor& Resize(const DDim& dims);
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+ private:
+  /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+  /*! points to dimensions of memory block. */
+  DDim dim_;
+};
+```
+`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutable_data` to allocate the actual memory.
+```cpp
+paddle::framework::Tensor t;
+paddle::platform::CPUPlace place;
+// set size first
+t.Resize({2, 3});
+// allocate memory on CPU later
+t.mutable_data(place);
+```
+### paddle::framework::Tensor Usage
+`AddOp` demonstrates Tensor's usage.
+- InferShape
+When computing a neural network's compute graph, first call every `Operator`'s `InferShape` method, and use `Resize` to configure the size of the output tensor.
+```cpp
+void InferShape(const framework::InferShapeContext &ctx) const override {
+  PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
+                    ctx.Input<Tensor>("Y")->dims(),
+                    "Two input of Add Op's dimension must be same.");
+  ctx.Output<Tensor>("Out")->Resize(ctx.Input<Tensor>("X")->dims());
+}
+```
+- Run
+```cpp
+void Compute(const framework::ExecutionContext& context) const override {
+  auto* input0 = context.Input<Tensor>("X");
+  auto* input1 = context.Input<Tensor>("Y");
+  auto* output = context.Output<Tensor>("Out");
+  output->mutable_data<T>(context.GetPlace());
+  auto x = EigenVector<T>::Flatten(*input0);
+  auto y = EigenVector<T>::Flatten(*input1);
+  auto z = EigenVector<T>::Flatten(*output);
+  auto place = context.GetEigenDevice<Place>();
+  z.device(place) = x + y;
+}
+```
+### Transformation from paddle::framework::Tensor to EigenTensor
+As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor` to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
+Using EigenTensor as an example:
+```cpp
+Tensor t;
+float* p = t.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace());
+for (int i = 0; i < 1 * 2 * 3; i++) {
+  p[i] = static_cast<float>(i);
+}
+EigenTensor<float, 3>::Type et = EigenTensor<float, 3>::From(t);
+```
+`From` is an interfacing method provided by the EigenTensor template, which implements the transformation from a `paddle::framework::Tensor` object to an EigenTensor. Since `rank` is a template parameter, it needs to be explicitly specified at the time of the transformation.
+In Eigen, tensors with different ranks are different types, with `Vector` being a rank-1 instance. Note that `EigenVector<T>::From` uses a transformation from a 1-dimensional Paddle tensor to a 1-dimensional Eigen tensor while `EigenVector<T>::Flatten` reshapes a paddle tensor and flattens it into a 1-dimensional Eigen tensor. Both resulting tensors are still typed EigenVector.
+For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc) in the `eigen_test.cc` file.
+### Implementing Computation
+While computing, the device interface is needed from the EigenTensors on the left hand side of the assignments. Note that the computation between EigenTensors only changes the data originally in the Tensor and does not change any of the shape information associated with the Tensor.
+```cpp
+auto x = EigenVector<T>::Flatten(*input0);
+auto y = EigenVector<T>::Flatten(*input1);
+auto z = EigenVector<T>::Flatten(*output);
+auto place = context.GetEigenDevice<Place>();
+z.device(place) = x + y;
+```
+In this code segment, input0/input1/output can be Tensors of arbitrary dimension. We are calling Flatten from EigenVector, transforming a tensor of any dimension into a 1-dimensional EigenVector. After completing computation, input0/input1/output will retain the same shape information, and they can be resized using the `Resize` interface.
+Because the Eigen Tensor module is under-documented, please refer to `OpKernel`'s computation code in TensorFlow's [kernel module documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/kernels).
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -47,7 +47,7 @@ bool isUsingGpu() { return FLAGS_use_gpu; }
 void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }
 bool isGpuVersion() {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
  return false;
 #else
  return true;

--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -46,7 +46,7 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
  if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
  paddle::real* buf = ptr->mat->getRowBuf(rowID);
  size_t width = ptr->mat->getWidth();
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
  hl_memcpy(buf, rowArray, sizeof(paddle::real) * width);
 #else
  std::copy(rowArray, rowArray + width, buf);

--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -19,16 +19,15 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
 proto_library(framework_proto SRCS framework.proto)
 cc_library(attribute SRCS attribute.cc DEPS framework_proto)
+cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute)
 cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
-cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
+cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto proto_desc)
-cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope)
+cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope proto_desc)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
-cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator)
+cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator)
-cc_library(op_registry SRCS op_registry.cc DEPS grad_op_builder op_proto_maker op_info)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
-cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)
 py_proto_compile(framework_py_proto SRCS framework.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
@@ -42,3 +41,6 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
+cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor)
+cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place)
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@@ -21,20 +21,12 @@ limitations under the License. */
 #include <vector>
 #include "paddle/framework/framework.pb.h"
+#include "paddle/framework/type_defs.h"
 #include "paddle/platform/enforce.h"
-#include "paddle/platform/variant.h"
 namespace paddle {
 namespace framework {
-// The order should be as same as framework.proto
-typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
-                       std::vector<float>, std::vector<std::string>, bool,
-                       std::vector<bool>, BlockDesc*>
-    Attribute;
-typedef std::unordered_map<std::string, Attribute> AttributeMap;
 ProgramDesc& GetProgramDesc();
 template <typename T>

--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -13,10 +13,13 @@
   limitations under the License. */
 #include "paddle/framework/backward.h"
+#include "paddle/operators/net_op.h"
+#include <deque>
 #include <list>
 #include <memory>
+#include "paddle/framework/block_desc.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/operators/recurrent_op.h"
@@ -24,6 +27,35 @@
 namespace paddle {
 namespace framework {
+// Builds the gradient operator of `op`.
+//
+// The forward operator is first described as an OpDescBind (inputs, outputs,
+// type and attributes). The GradOpMaker registered for that type then turns
+// the description into gradient op descriptions, each of which is
+// instantiated through OpRegistry::CreateOp.
+//
+// Returns the gradient operator itself when exactly one is produced;
+// otherwise returns a NetOp holding all of them in creation order.
+// Enforces that at least one gradient operator is produced.
+static inline std::unique_ptr<OperatorBase> CreateGradOp(
+    const OperatorBase& op) {
+  OpDescBind op_desc;
+  op_desc.SetInputMap(op.Inputs());
+  op_desc.SetOutputMap(op.Outputs());
+  op_desc.SetType(op.Type());
+  op_desc.SetAttrMap(op.Attrs());
+  auto& info = OpInfoMap::Instance().Get(op.Type());
+  auto grad_descs = info.GradOpMaker()(op_desc);
+  std::vector<std::unique_ptr<OperatorBase>> grad_ops;
+  grad_ops.reserve(grad_descs.size());
+  std::transform(grad_descs.begin(), grad_descs.end(),
+                 std::back_inserter(grad_ops),
+                 [](const std::unique_ptr<OpDescBind>& grad_desc) {
+                   return OpRegistry::CreateOp(*grad_desc);
+                 });
+  PADDLE_ENFORCE(!grad_ops.empty());
+  if (grad_ops.size() == 1) {
+    return std::move(grad_ops[0]);
+  }
+  // Own the NetOp from the moment it is created so that it is released even
+  // if AppendOp or CompleteAddOp throws.
+  std::unique_ptr<operators::NetOp> net_op(new operators::NetOp());
+  for (auto& grad_op : grad_ops) {
+    net_op->AppendOp(std::move(grad_op));
+  }
+  net_op->CompleteAddOp();
+  return std::unique_ptr<OperatorBase>(net_op.release());
+}
 template <typename Map, typename T>
 static void ForEachVarName(const Map& names, T callback) {
  for (auto& name : names) {
@@ -141,9 +173,26 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
        net->ops_[op_offset]->Rename(name, dup_outputs.back());
      }
      // collect all the offset to append `add` op for each alias
-      insert_position.push_back(
+      //
-          {dup_op.back(), OpRegistry::CreateOp("add", {{"X", {dup_outputs}}},
+      // one variable is shared between multiple operators.
-                                               {{"Out", {name}}}, {})});
+      // insert add operator one by one, then add it to output
+      for (size_t output_idx = 0; output_idx < dup_outputs.size() - 1;
+           ++output_idx) {
+        auto insert_add_x = dup_outputs[output_idx];
+        auto insert_add_y = dup_outputs[output_idx + 1];
+        auto insert_add_out = name + "@SHARED@" + std::to_string(output_idx);
+        // first add op inserted
+        if (output_idx == dup_outputs.size() - 2) {
+          insert_add_out = name;
+        }
+        if (output_idx != 0) {
+          insert_add_y = name + "@SHARED@" + std::to_string(output_idx - 1);
+        }
+        insert_position.push_back(
+            {dup_op.back(),
+             OpRegistry::CreateOp("sum", {{"X", {insert_add_x, insert_add_y}}},
+                                  {{"Out", {insert_add_out}}}, {})});
+      }
    }
    // make sure the inserted `add` ops follow the BFS order.
@@ -154,7 +203,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
      net->InsertOp(pos.first + 1, std::move(pos.second));
    }
  } else {
-    std::unique_ptr<OperatorBase> grad_op(OpRegistry::CreateGradOp(forwardOp));
+    std::unique_ptr<OperatorBase> grad_op(CreateGradOp(forwardOp));
    ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op](
                                          const std::string& grad_input) {
@@ -182,7 +231,8 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
    // process recurrent gradient op as a special operator.
    if (forwardOp.Type() == "recurrent") {
-      // NOTE clean up cycle call somewhere (RNN's stepnet constains itself), or
+      // NOTE clean up cycle call somewhere (RNN's stepnet constains itself),
+      // or
      // this will result in infinite loop.
      const auto& rnnop =
          *static_cast<const operators::RecurrentOp*>(&forwardOp);
@@ -222,5 +272,145 @@ std::unique_ptr<OperatorBase> Backward(
  return BackwardRecursive(forwardOp, no_grad_names, uid);
 }
+// ====================================  //
+// Returns true when the gradient variable name of every entry in `names` is
+// contained in `set` (trivially true for an empty `names`).
+static bool AllGradInSet(const std::vector<std::string>& names,
+                         const std::unordered_set<std::string>& set) {
+  return std::all_of(names.begin(), names.end(),
+                     [&set](const std::string& var_name) {
+                       return set.find(GradVarName(var_name)) != set.end();
+                     });
+}
+// Creates the gradient op descriptions for the forward op `op_desc`.
+//
+// `no_grad_vars` holds the names of gradient variables that do not need to
+// be computed. It is extended with the gradients of all inputs of `op_desc`
+// when none of the output gradients is needed.
+//
+// Returns an empty vector when no gradient op is required. Otherwise returns
+// the gradient ops, preceded by a "fill_zeros_like" op for every gradient op
+// input that is listed in `no_grad_vars`.
+std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
+    const std::unique_ptr<OpDescBind>& op_desc,
+    std::unordered_set<std::string>& no_grad_vars) {
+  std::vector<std::unique_ptr<OpDescBind>> grad_op_descs;
+  // None of the input gradients of the forward operator needs to be
+  // calculated.
+  const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
+  if (AllGradInSet(inputs, no_grad_vars)) {
+    return grad_op_descs;  // empty vector
+  }
+  // None of the output gradients of the forward operator needs to be
+  // calculated, so the input gradients are not needed either.
+  const std::vector<std::string>& outputs = op_desc->OutputArgumentNames();
+  if (AllGradInSet(outputs, no_grad_vars)) {
+    for (const std::string& name : inputs) {
+      no_grad_vars.insert(GradVarName(name));
+    }
+    return grad_op_descs;  // empty vector
+  }
+  grad_op_descs = OpRegistry::CreateGradOpDescs(*op_desc);
+  std::list<std::unique_ptr<OpDescBind>> pending_fill_zeros_ops;
+  for (auto& desc : grad_op_descs) {
+    for (const std::string& in_name : desc->InputArgumentNames()) {
+      if (no_grad_vars.count(in_name)) {
+        // Drop the trailing characters of the name; presumably this strips
+        // the gradient suffix and leaves the forward variable name
+        // (assumes kGradVarSuffix is a char array literal -- TODO confirm).
+        std::string prefix = in_name.substr(
+            0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
+        std::string new_name = prefix + kZeroVarSuffix;
+        desc->Rename(in_name, new_name);
+        // Queue an op that produces `new_name` from `prefix`.
+        std::unique_ptr<OpDescBind> fill_zeros_op(new OpDescBind(
+            "fill_zeros_like", {{"X", {prefix}}}, {{"Y", {new_name}}}, {}));
+        pending_fill_zeros_ops.push_back(std::move(fill_zeros_op));
+      }
+    }
+    // Outputs whose gradient is not needed are redirected to the empty
+    // variable name.
+    for (const std::string& out_name : desc->OutputArgumentNames()) {
+      if (no_grad_vars.count(out_name)) {
+        desc->Rename(out_name, kEmptyVarName);
+      }
+    }
+  }
+  // Put the fill-zeros ops in front of the gradient ops. Each one is inserted
+  // at the beginning, so they end up in reverse queue order.
+  for (auto& p : pending_fill_zeros_ops) {
+    grad_op_descs.insert(grad_op_descs.begin(), std::move(p));
+  }
+  return grad_op_descs;
+}
+// Builds the backward op descriptions for block `block_idx` of
+// `program_desc`.
+//
+// The forward ops of the block are visited in reverse order and MakeOpGrad is
+// applied to each of them. For a "recurrent" op the step block is processed
+// recursively; its backward ops are stored in a newly appended block which is
+// attached to the gradient op through the "step_block" attribute.
+//
+// When several gradient ops write the same variable, the i-th write is
+// renamed to "<name>@RENAME@<i>" and a "sum" op producing the original name
+// is inserted right after the last writer.
+//
+// `no_grad_vars` holds the gradient variable names that need no computation;
+// MakeOpGrad may add entries to it while the block is processed.
+std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
+    ProgramDescBind& program_desc, int block_idx,
+    std::unordered_set<std::string>& no_grad_vars) {
+  BlockDescBind* cur_block = program_desc.Block(block_idx);
+  std::deque<std::unique_ptr<OpDescBind>>& op_descs = cur_block->ops_;
+  // Maps an output variable name to the indices (in backward_descs) of the
+  // gradient ops that write it.
+  std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
+  size_t grad_desc_idx = 0;
+  std::vector<std::unique_ptr<OpDescBind>> backward_descs;
+  for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
+    std::vector<std::unique_ptr<OpDescBind>> op_grads =
+        MakeOpGrad(*it, no_grad_vars);
+    if ((*it)->Type() == "recurrent") {
+      PADDLE_ENFORCE_EQ(
+          op_grads.size(), size_t(1),
+          "rnn_op's gradient process should contain only one op.");
+      // Read the same attribute name that is written on the gradient op
+      // below ("step_block").
+      int step_block_idx = (*it)->GetBlockAttr("step_block");
+      auto backward_block_op_descs =
+          MakeBlockBackward(program_desc, step_block_idx, no_grad_vars);
+      BlockDescBind* backward_block = program_desc.AppendBlock(*cur_block);
+      for (auto& ptr : backward_block_op_descs) {
+        backward_block->ops_.push_back(std::move(ptr));
+      }
+      op_grads[0]->SetBlockAttr("step_block", *backward_block);
+    }
+    for (const auto& desc : op_grads) {
+      for (const std::string& out_name : desc->OutputArgumentNames()) {
+        dup_out_ops[out_name].emplace_back(grad_desc_idx);
+      }
+      ++grad_desc_idx;
+    }
+    std::transform(
+        op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs),
+        [](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); });
+  }
+  // Check whether some variables are written more than once
+  std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops;
+  for (const auto& dup : dup_out_ops) {
+    const std::string& out_name = dup.first;
+    // Bind by reference: the index list is only read here.
+    const std::vector<size_t>& dup_op = dup.second;
+    if (out_name != kEmptyVarName && dup_op.size() > 1) {
+      std::vector<std::string> sum_op_inputs;
+      for (size_t i = 0; i < dup_op.size(); ++i) {
+        std::string new_name = out_name + "@RENAME@" + std::to_string(i);
+        backward_descs[dup_op[i]]->Rename(out_name, new_name);
+        sum_op_inputs.emplace_back(new_name);
+      }
+      std::unique_ptr<OpDescBind> sum_op(new OpDescBind(
+          "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {}));
+      pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
+    }
+  }
+  // Insert from the highest position downwards so that the positions still
+  // to be processed are not shifted by earlier insertions.
+  pending_sum_ops.sort(
+      [](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a,
+         const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) {
+        return a.first > b.first;
+      });
+  for (auto& p : pending_sum_ops) {
+    backward_descs.insert(backward_descs.begin() + p.first + 1,
+                          std::move(p.second));
+  }
+  return backward_descs;
+}
+// Appends the backward ops of the root block (index 0) of `program_desc` to
+// that same block. `no_grad_vars` lists forward variables whose gradients
+// must not be computed.
+void AppendBackward(ProgramDescBind& program_desc,
+                    const std::unordered_set<std::string>& no_grad_vars) {
+  // Gradient names to skip: the gradient of the empty variable name plus the
+  // gradient name of every variable supplied by the caller.
+  std::unordered_set<std::string> skipped_grads;
+  skipped_grads.reserve(no_grad_vars.size() + 1);
+  skipped_grads.insert(std::string(kEmptyVarName) + kGradVarSuffix);
+  for (const auto& var_name : no_grad_vars) {
+    skipped_grads.insert(GradVarName(var_name));
+  }
+  const int root_idx = 0;
+  auto grad_descs = MakeBlockBackward(program_desc, root_idx, skipped_grads);
+  auto& root_ops = program_desc.Block(root_idx)->ops_;
+  for (auto& grad_desc : grad_descs) {
+    root_ops.push_back(std::move(grad_desc));
+  }
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/backward.h
+++ b/paddle/framework/backward.h
@@ -13,8 +13,11 @@
   limitations under the License. */
 #pragma once
 #include <unordered_set>
-#include "operator.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/program_desc.h"
 namespace paddle {
 namespace framework {
@@ -23,5 +26,9 @@ namespace framework {
 extern std::unique_ptr<OperatorBase> Backward(
    const OperatorBase& forwardOp,
    const std::unordered_set<std::string>& no_grad_vars);
+void AppendBackward(ProgramDescBind& program_desc,
+                    const std::unordered_set<std::string>& no_grad_vars);
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -15,30 +15,42 @@
 #include "paddle/framework/backward.h"
 #include <gtest/gtest.h>
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/op_desc.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
 namespace paddle {
 namespace framework {
-using OperatorBase = framework::OperatorBase;
-using OpProtoAndCheckerMaker = framework::OpProtoAndCheckerMaker;
-using OpProto = framework::OpProto;
-using OpAttrChecker = framework::OpAttrChecker;
-using Scope = framework::Scope;
 using DeviceContext = platform::DeviceContext;
 class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
 public:
  RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input X of Add").NotInGradient();
+    AddInput("X", "Input X of Add");
-    AddInput("b", "Bias of Add").NotInGradient();
+    AddInput("b", "Bias of Add");
-    AddOutput("Out", "Out of Add").NotInGradient();
+    AddOutput("Out", "Out of Add");
    AddComment("Add Op");
  }
 };
+// Gradient op description maker used by the tests for "rowwise_add": yields
+// one "rowwise_add_grad" op that takes the gradient of Out and outputs the
+// gradients of X and b.
+class RowWiseAddGradMaker : public SingleGradOpDescMaker {
+ public:
+  using SingleGradOpDescMaker::SingleGradOpDescMaker;
+ protected:
+  std::unique_ptr<OpDescBind> Apply() const override {
+    std::unique_ptr<OpDescBind> desc(new OpDescBind());
+    desc->SetInput(GradVarName("Out"), OutputGrad("Out"));
+    desc->SetOutput(GradVarName("X"), InputGrad("X"));
+    desc->SetOutput(GradVarName("b"), InputGrad("b"));
+    desc->SetType("rowwise_add_grad");
+    return desc;
+  }
+};
 class MulOpMaker : public OpProtoAndCheckerMaker {
 public:
  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
@@ -133,42 +145,46 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker {
  }
 };
-class AddOpMaker : public OpProtoAndCheckerMaker {
+class SumOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  SumOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "x").AsDuplicable();
+    AddInput("X", "the input tensors of sum operator.").AsDuplicable();
-    AddOutput("Out", "out");
+    AddOutput("Out", "the output tensor of sum operator.");
    AddComment("");
  }
 };
+// Proto maker for the test-only op registered as "mult_in_out": it declares
+// two inputs (X, H) and two outputs (Y, Z) and an empty comment.
+class MultInOutOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  MultInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "x");
+    AddInput("H", "h");
+    AddOutput("Y", "y");
+    AddOutput("Z", "z");
+    AddComment("");
+  }
+};
 }  // namespace framework
 }  // namespace paddle
 namespace f = paddle::framework;
 namespace ops = paddle::operators;
 using EnforceNotMet = paddle::platform::EnforceNotMet;
-REGISTER_OP(rowwise_add, f::NOP, f::RowWiseAddOpMaker, rowwise_add_grad,
+REGISTER_OPERATOR(rowwise_add, f::NOP, f::RowWiseAddOpMaker,
-            f::NOP);
+                  f::RowWiseAddGradMaker);
+REGISTER_OPERATOR(rowwise_add_grad, f::NOP);
 REGISTER_OP(mul, f::NOP, f::MulOpMaker, mul_grad, f::NOP);
 REGISTER_OP(sigmoid, f::NOP, f::SigmoidOpMaker, sigmoid_grad, f::NOP);
 REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NOP, f::NoGradOpMaker);
 REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NOP, f::FillZeroOpMaker);
-REGISTER_OP(add, f::NOP, f::AddOpMaker, add_grad, f::NOP);
+REGISTER_OP(sum, f::NOP, f::SumOpMaker, sum_grad, f::NOP);
 REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker);
 REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, many_output_op_grad,
            f::NOP);
+REGISTER_OP(mult_in_out, f::NOP, f::MultInOutOpMaker, mult_in_out_grad, f::NOP);
-TEST(Backward, simple_op_grad) {
-  auto fwd = f::OpRegistry::CreateOp(
-      "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {});
-  ASSERT_NE(fwd, nullptr);
-  auto gop = f::OpRegistry::CreateGradOp(*fwd);
-  ASSERT_EQ(1UL, gop->Inputs().size());
-  ASSERT_EQ("rowwise_add_grad", gop->Type());
-  ASSERT_EQ(f::GradVarName("x"), gop->Output(f::GradVarName("X")));
-  ASSERT_EQ(f::GradVarName("b"), gop->Output(f::GradVarName("b")));
-}
 TEST(Backward, simple_op_not_need_grad) {
  auto fwd = f::OpRegistry::CreateOp(
@@ -283,18 +299,7 @@ TEST(Backward, net_shared_weight) {
  ASSERT_TRUE(bwd->IsNetOp());
  auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
  ASSERT_EQ(3UL, bwd_net->ops_.size());
-  ASSERT_EQ("add", bwd_net->ops_[2]->Type());
+  ASSERT_EQ("sum", bwd_net->ops_[2]->Type());
-}
-TEST(Backward, op_register_grad_not_for_network) {
-  auto fwd =
-      f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {"b"}}},
-                              {{"mul_result", {"mul_out"}},
-                               {"add_result", {"add_out"}},
-                               {"Out", {"out1"}}},
-                              {{"temporary_index", std::vector<int>{0, 1}}});
-  ASSERT_THROW(f::OpRegistry::CreateGradOp(*fwd), EnforceNotMet);
 }
 TEST(Backward, op_all_input_are_not_need) {
@@ -399,3 +404,293 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
  EXPECT_EQ(bwd_net->ops_[2]->Inputs(all).size(), 0UL);
  EXPECT_EQ(bwd_net->ops_[2]->Outputs(all).size(), 0UL);
 }
+// =================================== //
+// Test helper: allocates a ProgramDesc holding a single root block (idx 0,
+// parent idx -1) and returns the raw pointer to the caller.
+f::ProgramDesc *GetNewProgramDesc() {
+  auto *prog = new f::ProgramDesc();
+  auto *root = prog->add_blocks();
+  root->set_parent_idx(-1);
+  root->set_idx(0);
+  return prog;
+}
+// One forward op (rowwise_add). After AppendBackward the block is expected to
+// hold exactly one extra op, rowwise_add_grad, which reads the gradient of
+// "out" and writes the gradients of "x" and "b".
+TEST(Backward, simple_single_op) {
+  f::ProgramDesc *program_desc = GetNewProgramDesc();
+  f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);
+  f::BlockDescBind *block = program.Block(0);
+  // Forward: out = rowwise_add(x, b)
+  f::OpDescBind *op = block->AppendOp();
+  op->SetType("rowwise_add");
+  op->SetInput("X", {"x"});
+  op->SetInput("b", {"b"});
+  op->SetOutput("Out", {"out"});
+  // Empty no-grad set: every variable may receive a gradient.
+  AppendBackward(program, {});
+  // 1 forward op + 1 gradient op.
+  ASSERT_EQ(block->AllOps().size(), 2UL);
+  f::OpDescBind *grad_op = block->AllOps()[1];
+  EXPECT_EQ(grad_op->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out")}));
+  EXPECT_EQ(grad_op->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x")}));
+  EXPECT_EQ(grad_op->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b")}));
+}
+// A linear chain of three ops: rowwise_add -> mul -> rowwise_add. The test
+// expects three gradient ops to be appended in reverse forward order, i.e.
+// indices 3, 4, 5 hold the gradients of op3, op2, op1 respectively.
+TEST(Backward, simple_mult_op) {
+  f::ProgramDesc *program_desc = GetNewProgramDesc();
+  f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);
+  f::BlockDescBind *block = program.Block(0);
+  // op1: out1 = rowwise_add(x1, b1)
+  f::OpDescBind *op1 = block->AppendOp();
+  op1->SetType("rowwise_add");
+  op1->SetInput("X", {"x1"});
+  op1->SetInput("b", {"b1"});
+  op1->SetOutput("Out", {"out1"});
+  // op2: out2 = mul(out1, y2)
+  f::OpDescBind *op2 = block->AppendOp();
+  op2->SetType("mul");
+  op2->SetInput("X", {"out1"});
+  op2->SetInput("Y", {"y2"});
+  op2->SetOutput("Out", {"out2"});
+  // op3: out3 = rowwise_add(out2, b3)
+  f::OpDescBind *op3 = block->AppendOp();
+  op3->SetType("rowwise_add");
+  op3->SetInput("X", {"out2"});
+  op3->SetInput("b", {"b3"});
+  op3->SetOutput("Out", {"out3"});
+  AppendBackward(program, {});
+  // 3 forward ops + 3 gradient ops.
+  ASSERT_EQ(block->AllOps().size(), 6UL);
+  // Gradient of op1 is the last op in the block.
+  f::OpDescBind *grad_op1 = block->AllOps()[5];
+  EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b1")}));
+  // Gradient of op2 (mul): takes X, Y, Out and the gradient of Out.
+  f::OpDescBind *grad_op2 = block->AllOps()[4];
+  EXPECT_EQ(grad_op2->Type(), "mul_grad");
+  ASSERT_EQ(grad_op2->InputNames().size(), 4UL);
+  ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op2->Input("X"), std::vector<std::string>({"out1"}));
+  EXPECT_EQ(grad_op2->Input("Y"), std::vector<std::string>({"y2"}));
+  EXPECT_EQ(grad_op2->Input("Out"), std::vector<std::string>({"out2"}));
+  EXPECT_EQ(grad_op2->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out2")}));
+  EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")),
+            std::vector<std::string>({f::GradVarName("y2")}));
+  // Gradient of op3 is the first appended gradient op.
+  f::OpDescBind *grad_op3 = block->AllOps()[3];
+  EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out3")}));
+  EXPECT_EQ(grad_op3->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out2")}));
+  EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b3")}));
+}
+// Four forward ops where the intermediate variable "out3" is placed in the
+// no-grad set. The test expects only two gradient ops to be appended (for op4
+// and op1); the gradient output of op4 that corresponds to "out3" must be
+// f::kEmptyVarName.
+TEST(Backward, intermedia_var_no_grad) {
+  f::ProgramDesc *program_desc = GetNewProgramDesc();
+  f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);
+  f::BlockDescBind *block = program.Block(0);
+  // op1: out1 = rowwise_add(x1, b1)
+  f::OpDescBind *op1 = block->AppendOp();
+  op1->SetType("rowwise_add");
+  op1->SetInput("X", {"x1"});
+  op1->SetInput("b", {"b1"});
+  op1->SetOutput("Out", {"out1"});
+  // op2: out2 = mul(x2, y2)
+  f::OpDescBind *op2 = block->AppendOp();
+  op2->SetType("mul");
+  op2->SetInput("X", {"x2"});
+  op2->SetInput("Y", {"y2"});
+  op2->SetOutput("Out", {"out2"});
+  // op3: out3 = rowwise_add(out2, b3)
+  f::OpDescBind *op3 = block->AppendOp();
+  op3->SetType("rowwise_add");
+  op3->SetInput("X", {"out2"});
+  op3->SetInput("b", {"b3"});
+  op3->SetOutput("Out", {"out3"});
+  // op4: out4 = mul(out1, out3)
+  f::OpDescBind *op4 = block->AppendOp();
+  op4->SetType("mul");
+  op4->SetInput("X", {"out1"});
+  op4->SetInput("Y", {"out3"});
+  op4->SetOutput("Out", {"out4"});
+  // "out3" must not receive a gradient.
+  AppendBackward(program, {"out3"});
+  // 4 forward ops + 2 gradient ops; op2 and op3 get no gradient op.
+  ASSERT_EQ(block->AllOps().size(), 6UL);
+  f::OpDescBind *grad_op1 = block->AllOps()[5];
+  EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b1")}));
+  f::OpDescBind *grad_op4 = block->AllOps()[4];
+  EXPECT_EQ(grad_op4->Type(), "mul_grad");
+  ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
+  ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op4->Input("X"), std::vector<std::string>({"out1"}));
+  EXPECT_EQ(grad_op4->Input("Y"), std::vector<std::string>({"out3"}));
+  EXPECT_EQ(grad_op4->Input("Out"), std::vector<std::string>({"out4"}));
+  EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out4")}));
+  EXPECT_EQ(grad_op4->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  // The Y input is "out3", which is in the no-grad set.
+  EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")),
+            std::vector<std::string>({f::kEmptyVarName}));
+}
+// Two chained mult_in_out ops where "z1" (an output of op1 and an input of
+// op2) is in the no-grad set. The test expects three appended ops:
+//   [2] gradient of op2, whose gradient output for H ("z1") is empty,
+//   [3] fill_zeros_like on "z1", producing "z1" + f::kZeroVarSuffix,
+//   [4] gradient of op1, which consumes that zero variable as the gradient
+//       of its Z output.
+TEST(Backward, var_no_grad) {
+  f::ProgramDesc *program_desc = GetNewProgramDesc();
+  f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);
+  f::BlockDescBind *block = program.Block(0);
+  // op1: (y1, z1) = mult_in_out(x1, h1)
+  f::OpDescBind *op1 = block->AppendOp();
+  op1->SetType("mult_in_out");
+  op1->SetInput("X", {"x1"});
+  op1->SetInput("H", {"h1"});
+  op1->SetOutput("Y", {"y1"});
+  op1->SetOutput("Z", {"z1"});
+  // op2: (y2, z2) = mult_in_out(y1, z1)
+  f::OpDescBind *op2 = block->AppendOp();
+  op2->SetType("mult_in_out");
+  op2->SetInput("X", {"y1"});
+  op2->SetInput("H", {"z1"});
+  op2->SetOutput("Y", {"y2"});
+  op2->SetOutput("Z", {"z2"});
+  AppendBackward(program, {"z1"});
+  // 2 forward ops + 2 gradient ops + 1 fill_zeros_like op.
+  ASSERT_EQ(block->AllOps().size(), 5UL);
+  f::OpDescBind *grad_op2 = block->AllOps()[2];
+  ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad");
+  ASSERT_EQ(grad_op2->InputNames().size(), 6UL);
+  ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op2->Input("X"), std::vector<std::string>({"y1"}));
+  EXPECT_EQ(grad_op2->Input("H"), std::vector<std::string>({"z1"}));
+  EXPECT_EQ(grad_op2->Input("Y"), std::vector<std::string>({"y2"}));
+  EXPECT_EQ(grad_op2->Input("Z"), std::vector<std::string>({"z2"}));
+  EXPECT_EQ(grad_op2->Input(f::GradVarName("Y")),
+            std::vector<std::string>({f::GradVarName("y2")}));
+  EXPECT_EQ(grad_op2->Input(f::GradVarName("Z")),
+            std::vector<std::string>({f::GradVarName("z2")}));
+  EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("y1")}));
+  // H is "z1", which is in the no-grad set.
+  EXPECT_EQ(grad_op2->Output(f::GradVarName("H")),
+            std::vector<std::string>({f::kEmptyVarName}));
+  f::OpDescBind *fill_zero_op = block->AllOps()[3];
+  ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like");
+  ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL);
+  ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL);
+  EXPECT_EQ(fill_zero_op->Input("X"), std::vector<std::string>({"z1"}));
+  EXPECT_EQ(fill_zero_op->Output("Y"),
+            std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
+  f::OpDescBind *grad_op1 = block->AllOps()[4];
+  ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad");
+  ASSERT_EQ(grad_op1->InputNames().size(), 6UL);
+  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op1->Input("X"), std::vector<std::string>({"x1"}));
+  EXPECT_EQ(grad_op1->Input("H"), std::vector<std::string>({"h1"}));
+  EXPECT_EQ(grad_op1->Input("Y"), std::vector<std::string>({"y1"}));
+  EXPECT_EQ(grad_op1->Input("Z"), std::vector<std::string>({"z1"}));
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Y")),
+            std::vector<std::string>({f::GradVarName("y1")}));
+  // The gradient of Z is replaced by the zero-filled variable.
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Z")),
+            std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("H")),
+            std::vector<std::string>({f::GradVarName("h1")}));
+}
+// "out1" is consumed by two forward ops (op2 and op3), so two gradient ops
+// write a gradient for it. The test expects each of those writes to be
+// renamed with an "@RENAME@<n>" suffix and a "sum" op to be inserted that
+// adds them into the gradient of "out1" before the gradient of op1 runs.
+// Expected layout of the appended ops:
+//   [3] gradient of op3, [4] gradient of op2, [5] sum, [6] gradient of op1.
+TEST(Backward, shared_var) {
+  f::ProgramDesc *program_desc = GetNewProgramDesc();
+  f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);
+  f::BlockDescBind *block = program.Block(0);
+  // op1: out1 = rowwise_add(x1, b1)
+  f::OpDescBind *op1 = block->AppendOp();
+  op1->SetType("rowwise_add");
+  op1->SetInput("X", {"x1"});
+  op1->SetInput("b", {"b1"});
+  op1->SetOutput("Out", {"out1"});
+  // op2: out2 = mul(out1, y2)
+  f::OpDescBind *op2 = block->AppendOp();
+  op2->SetType("mul");
+  op2->SetInput("X", {"out1"});
+  op2->SetInput("Y", {"y2"});
+  op2->SetOutput("Out", {"out2"});
+  // op3: out3 = rowwise_add(out1, b3)  (second consumer of out1)
+  f::OpDescBind *op3 = block->AppendOp();
+  op3->SetType("rowwise_add");
+  op3->SetInput("X", {"out1"});
+  op3->SetInput("b", {"b3"});
+  op3->SetOutput("Out", {"out3"});
+  AppendBackward(program, {});
+  // 3 forward ops + 3 gradient ops + 1 sum op.
+  ASSERT_EQ(block->AllOps().size(), 7UL);
+  f::OpDescBind *grad_op3 = block->AllOps()[3];
+  ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out3")}));
+  EXPECT_EQ(grad_op3->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@0"}));
+  EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b3")}));
+  // NOTE(review): despite its name, grad_op4 is the gradient of op2 (the mul
+  // op); there is no op4 in this test.
+  f::OpDescBind *grad_op4 = block->AllOps()[4];
+  ASSERT_EQ(grad_op4->Type(), "mul_grad");
+  ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
+  ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op4->Input("X"), std::vector<std::string>({"out1"}));
+  EXPECT_EQ(grad_op4->Input("Y"), std::vector<std::string>({"y2"}));
+  EXPECT_EQ(grad_op4->Input("Out"), std::vector<std::string>({"out2"}));
+  EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out2")}));
+  EXPECT_EQ(grad_op4->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@1"}));
+  EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")),
+            std::vector<std::string>({f::GradVarName("y2")}));
+  // The sum op merges both renamed partial gradients of out1.
+  f::OpDescBind *sum_op = block->AllOps()[5];
+  ASSERT_EQ(sum_op->Type(), "sum");
+  ASSERT_EQ(sum_op->InputNames().size(), 1UL);
+  ASSERT_EQ(sum_op->OutputNames().size(), 1UL);
+  EXPECT_EQ(sum_op->Input("X"),
+            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@0",
+                                      f::GradVarName("out1") + "@RENAME@1"}));
+  EXPECT_EQ(sum_op->Output("Out"),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  f::OpDescBind *grad_op1 = block->AllOps()[6];
+  ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b1")}));
+}
\ No newline at end of file
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/program_desc.h"
+namespace paddle {
+namespace framework {
+// Creates a variable description called `name` in this block and returns a
+// non-owning pointer to it; the block keeps ownership in vars_.
+// Fails a PADDLE_ENFORCE check if a variable with that name already exists.
+VarDescBind *BlockDescBind::NewVar(const std::string &name) {
+  // NOTE(review): the dirty flag is raised before the duplicate check, so it
+  // stays set even when the check below fails and nothing is added.
+  need_update_ = true;
+  auto it = vars_.find(name);
+  PADDLE_ENFORCE(it == vars_.end(), "Duplicated variable %s", name);
+  auto var = new VarDescBind(name);
+  // vars_[name] default-constructs an empty unique_ptr, which then adopts var.
+  vars_[name].reset(var);
+  return var;
+}
+// Looks up the variable called `name` in this block only and returns a
+// non-owning pointer to it. Fails a PADDLE_ENFORCE check when the name is
+// unknown.
+VarDescBind *BlockDescBind::Var(const std::string &name) const {
+  auto it = vars_.find(name);
+  PADDLE_ENFORCE(it != vars_.end(),
+                 "Can not find variable %s in current block.", name);
+  return it->second.get();
+}
+// Tells whether this block holds a variable called `name`.
+bool BlockDescBind::HasVar(const std::string &name) const {
+  return vars_.count(name) != 0;
+}
+// Returns non-owning pointers to every variable of this block, in the
+// iteration order of the underlying map.
+std::vector<VarDescBind *> BlockDescBind::AllVars() const {
+  std::vector<VarDescBind *> raw_vars;
+  for (auto it = vars_.begin(); it != vars_.end(); ++it) {
+    raw_vars.push_back(it->second.get());
+  }
+  return raw_vars;
+}
+// Adds a fresh, empty op description at the end of the block, marks the block
+// as needing synchronization and returns a non-owning pointer to the new op.
+OpDescBind *BlockDescBind::AppendOp() {
+  need_update_ = true;
+  auto *appended = new OpDescBind();
+  ops_.emplace_back(appended);
+  return appended;
+}
+// Adds a fresh, empty op description at the front of the block, marks the
+// block as needing synchronization and returns a non-owning pointer to it.
+OpDescBind *BlockDescBind::PrependOp() {
+  need_update_ = true;
+  auto *prepended = new OpDescBind();
+  ops_.emplace_front(prepended);
+  return prepended;
+}
+// Returns non-owning pointers to every op of this block, front to back.
+std::vector<OpDescBind *> BlockDescBind::AllOps() const {
+  std::vector<OpDescBind *> raw_ops;
+  for (auto it = ops_.begin(); it != ops_.end(); ++it) {
+    raw_ops.push_back(it->get());
+  }
+  return raw_ops;
+}
+// Writes the locally held op list back into the wrapped BlockDesc message.
+// Does nothing unless the block was modified since the last Sync. Only the
+// ops field is rewritten here; vars_ is not touched by this function.
+void BlockDescBind::Sync() {
+  if (need_update_) {
+    auto &op_field = *this->desc_->mutable_ops();
+    op_field.Clear();
+    op_field.Reserve(static_cast<int>(ops_.size()));
+    for (auto &op_desc : ops_) {
+      // NOTE(review): protobuf's AddAllocated passes ownership of the pointer
+      // to the repeated field. What Proto() returns is not visible here; if
+      // that storage is also owned by the OpDescBind, the message would be
+      // owned twice (and cleared by a later op_field.Clear()) -- confirm.
+      op_field.AddAllocated(op_desc->Proto());
+    }
+    need_update_ = false;
+  }
+}
+// Returns the parent block as resolved through the owning program, or nullptr
+// when this block's parent index is -1.
+BlockDescBind *BlockDescBind::ParentBlock() const {
+  const auto parent_idx = this->desc_->parent_idx();
+  if (parent_idx != -1) {
+    return prog_->Block(static_cast<size_t>(parent_idx));
+  }
+  return nullptr;
+}
+// Stores the raw BlockDesc pointer wrapped by `block` as the attribute `name`
+// of this op.
+void OpDescBind::SetBlockAttr(const std::string &name, BlockDescBind &block) {
+  this->attrs_[name] = block.RawPtr();
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <deque>
+#include <unordered_map>
+#include <vector>
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/var_desc.h"
+#include "paddle/platform/macros.h"
+namespace paddle {
+namespace framework {
+class ProgramDescBind;
+// For each Protobuf message we provide an XXXBind class that is optimized for
+// read/write speed. Local changes are synchronized to the protobuf message
+// only when the message is requested (by the `Sync` method).
+// Wrapper around one BlockDesc protobuf message. Ops and variables are kept
+// in local containers and are written back to the message by Sync().
+// NOTE(review): this header names std::unique_ptr, std::string and
+// std::unordered_set but only includes <deque>, <unordered_map> and <vector>
+// directly; presumably the rest arrive through op_desc.h / var_desc.h --
+// confirm.
+class BlockDescBind {
+ public:
+  // The backward-construction routines are granted access to the private
+  // members of the block.
+  friend std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
+      ProgramDescBind &program_desc, int block_idx,
+      std::unordered_set<std::string> &no_grad_vars);
+  friend void AppendBackward(
+      ProgramDescBind &program_desc,
+      const std::unordered_set<std::string> &no_grad_vars);
+  // Neither `prog` nor `desc` is owned; both must outlive this object.
+  BlockDescBind(ProgramDescBind *prog, BlockDesc *desc)
+      : prog_(prog), desc_(desc), need_update_(false) {}
+  // Index of this block, read from the wrapped message.
+  int32_t ID() const { return desc_->idx(); }
+  // Index of the parent block, read from the wrapped message.
+  int32_t Parent() const { return desc_->parent_idx(); }
+  // Creates a new variable; the name must not exist in this block yet.
+  VarDescBind *NewVar(const std::string &name_bytes);
+  // Returns an existing variable; the name must exist in this block.
+  VarDescBind *Var(const std::string &name_bytes) const;
+  // Whether a variable with this name exists in this block.
+  bool HasVar(const std::string &var_name) const;
+  // Non-owning pointers to all variables of this block.
+  std::vector<VarDescBind *> AllVars() const;
+  // Parent block, or nullptr when the parent index is -1.
+  BlockDescBind *ParentBlock() const;
+  // Adds an empty op at the back / front of the op list.
+  OpDescBind *AppendOp();
+  OpDescBind *PrependOp();
+  // Non-owning pointers to all ops of this block, front to back.
+  std::vector<OpDescBind *> AllOps() const;
+  // Writes the local op list into the wrapped message if it changed.
+  void Sync();
+  // Direct access to the wrapped message; no Sync() is performed here.
+  BlockDesc *RawPtr() { return desc_; }
+ private:
+  ProgramDescBind *prog_;  // not_own
+  BlockDesc *desc_;        // not_own
+  // Set by NewVar/AppendOp/PrependOp, cleared by Sync().
+  bool need_update_;
+  // Owned ops; a deque so that ops can be added at both ends.
+  std::deque<std::unique_ptr<OpDescBind>> ops_;
+  // Owned variables, keyed by name.
+  std::unordered_map<std::string, std::unique_ptr<VarDescBind>> vars_;
+  DISABLE_COPY_AND_ASSIGN(BlockDescBind);
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <typeindex>
+#include "paddle/framework/framework.pb.h"
+namespace paddle {
+namespace framework {
+// Maps a C++ type, given as std::type_index, to the DataType enum of the
+// framework proto: float -> FP32, double -> FP64, int -> INT32.
+// Any other type triggers PADDLE_THROW("Not supported").
+//
+// Types are compared through std::type_index equality rather than through
+// hash_code(): the standard only guarantees equal hashes for equal types, so
+// two different types are allowed to share a hash value.
+// NOTE(review): PADDLE_THROW is used here but no header that defines it is
+// included directly by this file -- confirm it is reachable transitively.
+inline DataType ToDataType(std::type_index type) {
+  if (type == std::type_index(typeid(float))) {
+    return DataType::FP32;
+  } else if (type == std::type_index(typeid(double))) {
+    return DataType::FP64;
+  } else if (type == std::type_index(typeid(int))) {
+    return DataType::INT32;
+  } else {
+    PADDLE_THROW("Not supported");
+    // Not reached when PADDLE_THROW throws; kept so every path returns.
+    return static_cast<DataType>(-1);
+  }
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/details/op_registry.h
+++ b/paddle/framework/details/op_registry.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include "paddle/framework/grad_op_desc_maker.h"
+#include "paddle/framework/op_info.h"
+#include "paddle/framework/op_proto_maker.h"
+#include "paddle/framework/operator.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Classifies a registration argument by the base class it derives from; the
+// value selects which OpInfoFiller specialization handles the argument.
+enum OpInfoFillType {
+  kOperator = 0,                // derives from OperatorBase
+  kOpProtoAndCheckerMaker = 1,  // derives from OpProtoAndCheckerMaker
+  kGradOpDescMaker = 2          // derives from GradOpDescMakerBase
+};
+// Compile-time classifier: ID() yields the OpInfoFillType that matches T's
+// base class. The bases are tested in the order OperatorBase,
+// OpProtoAndCheckerMaker, GradOpDescMakerBase; a T that derives from none of
+// them yields the out-of-range value static_cast<OpInfoFillType>(-1).
+template <typename T>
+struct OpInfoFillTypeID {
+  static constexpr OpInfoFillType ID() {
+    return std::is_base_of<OperatorBase, T>::value
+               ? kOperator
+               : (std::is_base_of<OpProtoAndCheckerMaker, T>::value
+                      ? kOpProtoAndCheckerMaker
+                      : (std::is_base_of<GradOpDescMakerBase, T>::value
+                             ? kGradOpDescMaker
+                             : static_cast<OpInfoFillType>(-1)));
+  }
+};
+// Primary template, declared only. The second parameter defaults to the
+// category computed for T, so OpInfoFiller<T> picks the matching partial
+// specialization; no specialization exists for the -1 category.
+template <typename T, OpInfoFillType = OpInfoFillTypeID<T>::ID()>
+struct OpInfoFiller;
+// Walks the type pack ARGS from index I upwards and lets each type fill its
+// part of the OpInfo. `at_end` becomes true once I reaches sizeof...(ARGS),
+// which selects the empty specialization and stops the recursion.
+template <size_t I, bool at_end, typename... ARGS>
+class OperatorRegistrarRecursive;
+// Recursive step: apply the filler for the I-th type, then continue with the
+// (I + 1)-th.
+template <size_t I, typename... ARGS>
+class OperatorRegistrarRecursive<I, false, ARGS...> {
+ public:
+  using T = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
+  OperatorRegistrarRecursive(const char* op_type, OpInfo* info) {
+    OpInfoFiller<T>()(op_type, info);
+    OperatorRegistrarRecursive<I + 1, I + 1 == sizeof...(ARGS), ARGS...> next(
+        op_type, info);
+    (void)(next);
+  }
+};
+// Terminal step: nothing left to fill.
+template <size_t I, typename... ARGS>
+class OperatorRegistrarRecursive<I, true, ARGS...> {
+ public:
+  OperatorRegistrarRecursive(const char* op_type, OpInfo* info) {}
+};
+// Filler for operator classes: installs a creator callback that heap-allocates
+// a T from (type, inputs, outputs, attrs). The callback returns the raw
+// pointer; `op_type` is not used by this specialization.
+template <typename T>
+struct OpInfoFiller<T, kOperator> {
+  void operator()(const char* op_type, OpInfo* info) const {
+    info->creator_ = [](const std::string& type, const VariableNameMap& inputs,
+                        const VariableNameMap& outputs,
+                        const AttributeMap& attrs) {
+      return new T(type, inputs, outputs, attrs);
+    };
+  }
+};
+// Filler for proto/checker makers: allocates a new OpProto and OpAttrChecker
+// on the OpInfo, lets a T instance populate and validate them, stamps the
+// proto with `op_type`, and enforces that the proto is fully initialized.
+template <typename T>
+struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
+  void operator()(const char* op_type, OpInfo* info) const {
+    // NOTE(review): any previous proto_/checker_ values are overwritten
+    // without being released here; who frees these objects is not visible in
+    // this file -- confirm.
+    info->proto_ = new OpProto;
+    info->checker_ = new OpAttrChecker();
+    auto maker = T(info->proto_, info->checker_);
+    maker.Validate();
+    // The type is set only after the maker has run.
+    info->proto_->set_type(op_type);
+    PADDLE_ENFORCE(
+        info->proto_->IsInitialized(),
+        "Fail to initialize %s's OpProto, because %s is not initialized",
+        op_type, info->proto_->InitializationErrorString());
+  }
+};
+// Filler for gradient-op-description makers: installs a callback that builds
+// a T from the forward op description and returns whatever invoking that
+// maker produces. `op_type` is not used by this specialization.
+template <typename T>
+struct OpInfoFiller<T, kGradOpDescMaker> {
+  void operator()(const char* op_type, OpInfo* info) const {
+    info->grad_op_maker_ = [](const OpDescBind& fwd_op) {
+      return T(fwd_op)();
+    };
+  }
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -66,7 +66,6 @@ message OpProto {
    optional bool duplicable = 3 [ default = false ];
    optional bool intermediate = 4 [ default = false ];
-    optional bool not_in_gradient = 5 [ default = false ];
  }
  // AttrProto describes the C++ type Attribute.
@@ -106,6 +105,7 @@ message LoDTensorDesc {
 message VarDesc {
  required string name = 1;
  optional LoDTensorDesc lod_tensor = 2;
+  optional bool persistable = 3 [ default = false ];
 }
 message BlockDesc {
@@ -115,4 +115,7 @@ message BlockDesc {
  repeated OpDesc ops = 4;
 }
+// Please refer to
+// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md
+// for more details.
 message ProgramDesc { repeated BlockDesc blocks = 1; }
--- a/paddle/framework/grad_op_builder.cc
+++ b/paddle/framework/grad_op_builder.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOpArgType::OUT WARRANTIES OR CONDITIONS OF ANY KOpArgType::IND, either
-express or implied. See the License for the specific language governing
-permissions and limitations under the License. */
-#include "paddle/framework/grad_op_builder.h"
-#include "paddle/framework/op_registry.h"
-namespace paddle {
-namespace framework {
-enum class OpArgType { IN, OUT };
-static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type,
-                       bool is_grad, VariableNameMap* vars) {
-  const auto& src_inout =
-      src_type == OpArgType::IN ? src_op->Inputs() : src_op->Outputs();
-  auto& dst_inout = *vars;
-  auto& proto = OpInfoMap::Instance().Get(src_op->Type()).Proto();
-  const auto& src_arg_list =
-      src_type == OpArgType::IN ? proto.inputs() : proto.outputs();
-  for (const auto& arg : src_arg_list) {
-    if (arg.not_in_gradient() && !is_grad) continue;
-    const std::string src_name = arg.name();
-    std::string dst_name = is_grad ? GradVarName(src_name) : src_name;
-    dst_inout[dst_name].reserve(src_inout.at(src_name).size());
-    for (auto& var_name : src_inout.at(src_name)) {
-      std::string s = is_grad ? GradVarName(var_name) : var_name;
-      dst_inout[dst_name].emplace_back(s);
-    }
-  }
-}
-OperatorBase* BuildGradOp(const OperatorBase* op) {
-  auto& info = OpInfoMap::Instance().Get(op->Type());
-  PADDLE_ENFORCE(info.HasGradientOp());
-  VariableNameMap inputs;
-  VariableNameMap outputs;
-  TransOpArg(op, OpArgType::IN, false, &inputs);   // I
-  TransOpArg(op, OpArgType::OUT, false, &inputs);  // O
-  TransOpArg(op, OpArgType::OUT, true, &inputs);   // OG
-  TransOpArg(op, OpArgType::IN, true, &outputs);   // IG
-  auto& grad_info = OpInfoMap::Instance().Get(info.grad_op_type_);
-  return grad_info.Creator()(info.grad_op_type_, inputs, outputs, op->Attrs());
-}
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
-#include "paddle/framework/grad_op_builder.h"
-#include <gtest/gtest.h>
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
-USE_OP(add);
-namespace paddle {
-namespace framework {
-class MutiInOutOpMaker : public OpProtoAndCheckerMaker {
- public:
-  MutiInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("In1", "a single input");
-    AddInput("In2_mult", "a multiple input").AsDuplicable();
-    AddInput("In3", "another single input");
-    AddOutput("Out1", "a single output");
-    AddOutput("Out2_mult", "a multiple output").AsDuplicable();
-    AddComment("test op with multiple inputs and outputs");
-  }
-};
-class IOIgnoredOpMaker : public OpProtoAndCheckerMaker {
- public:
-  IOIgnoredOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("In1", "a single input");
-    AddInput("In2_mult", "a multiple input").AsDuplicable().NotInGradient();
-    AddInput("In3_mult", "another multiple input").AsDuplicable();
-    AddOutput("Out1_mult", "a multiple output").AsDuplicable();
-    AddOutput("Out2", "a single output").NotInGradient();
-    AddComment("op with inputs and outputs ignored in gradient calculating");
-  }
-};
-}  // namespace framework
-}  // namespace paddle
-namespace f = paddle::framework;
-TEST(GradOpBuilder, AddTwo) {
-  std::shared_ptr<f::OperatorBase> add_op(f::OpRegistry::CreateOp(
-      "add", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {}));
-  std::shared_ptr<f::OperatorBase> grad_add_op =
-      f::OpRegistry::CreateGradOp(*add_op);
-  EXPECT_EQ(grad_add_op->Inputs().size(), 4UL);
-  EXPECT_EQ(grad_add_op->Outputs().size(), 2UL);
-  EXPECT_EQ(grad_add_op->Input("X"), "x");
-  EXPECT_EQ(grad_add_op->Input("Y"), "y");
-  EXPECT_EQ(grad_add_op->Input("Out"), "out");
-  EXPECT_EQ(grad_add_op->Input(f::GradVarName("Out")), f::GradVarName("out"));
-  EXPECT_EQ(grad_add_op->Output(f::GradVarName("X")), f::GradVarName("x"));
-  EXPECT_EQ(grad_add_op->Output(f::GradVarName("Y")), f::GradVarName("y"));
-}
-REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker, mult_io_grad, f::NOP);
-REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker, io_ignored_grad, f::NOP);
-TEST(GradOpBuilder, MutiInOut) {
-  std::shared_ptr<f::OperatorBase> test_op(f::OpRegistry::CreateOp(
-      "mult_io", {{"In1", {"in1"}},
-                  {"In2_mult", {"in2_1", "in2_2", "in2_3"}},
-                  {"In3", {"in3"}}},
-      {{"Out1", {"out1"}}, {"Out2_mult", {"out2_1", "out2_2"}}}, {}));
-  std::shared_ptr<f::OperatorBase> grad_test_op =
-      f::OpRegistry::CreateGradOp(*test_op);
-  ASSERT_EQ(grad_test_op->Inputs().size(), 3UL + 2UL + 2UL);
-  EXPECT_EQ(grad_test_op->Input("In1"), "in1");
-  EXPECT_EQ(grad_test_op->Inputs("In2_mult"),
-            std::vector<std::string>({"in2_1", "in2_2", "in2_3"}));
-  EXPECT_EQ(grad_test_op->Input("In3"), "in3");
-  EXPECT_EQ(grad_test_op->Input("Out1"), "out1");
-  EXPECT_EQ(grad_test_op->Inputs("Out2_mult"),
-            std::vector<std::string>({"out2_1", "out2_2"}));
-  EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out1")),
-            f::GradVarName("out1"));
-  EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out2_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("out2_1"), f::GradVarName("out2_2")}));
-  ASSERT_EQ(grad_test_op->Outputs().size(), 3UL);
-  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1"));
-  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")),
-            std::vector<std::string>({f::GradVarName("in2_1"),
-                                      f::GradVarName("in2_2"),
-                                      f::GradVarName("in2_3")}));
-  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In3")), f::GradVarName("in3"));
-}
-TEST(GradOpBuilder, IOIgnoredInGradient) {
-  std::shared_ptr<f::OperatorBase> test_op(f::OpRegistry::CreateOp(
-      "io_ignored", {{"In1", {"in1"}},
-                     {"In2_mult", {"in2_1", "in2_2"}},
-                     {"In3_mult", {"in3_1", "in3_2"}}},
-      {{"Out1_mult", {"out1_1", "out1_2"}}, {"Out2", {"out2"}}}, {}));
-  std::shared_ptr<f::OperatorBase> grad_test_op =
-      f::OpRegistry::CreateGradOp(*test_op);
-  // 'In2' and 'Out2' are ignored in gradient calculating
-  ASSERT_EQ(grad_test_op->Inputs().size(), 2UL + 1UL + 2UL);
-  EXPECT_EQ(grad_test_op->Input("In1"), "in1");
-  EXPECT_EQ(grad_test_op->Inputs("In3_mult"),
-            std::vector<std::string>({"in3_1", "in3_2"}));
-  EXPECT_EQ(grad_test_op->Inputs("Out1_mult"),
-            std::vector<std::string>({"out1_1", "out1_2"}));
-  EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out1_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("out1_1"), f::GradVarName("out1_2")}));
-  EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out2")),
-            f::GradVarName("out2"));
-  ASSERT_EQ(grad_test_op->Outputs().size(), 3UL);
-  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1"));
-  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("in2_1"), f::GradVarName("in2_2")}));
-  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In3_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("in3_1"), f::GradVarName("in3_2")}));
-}
--- a/paddle/framework/grad_op_desc_maker.h
+++ b/paddle/framework/grad_op_desc_maker.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/operator.h"
+namespace paddle {
+namespace framework {
+// Base class of all gradient-op-description makers. It wraps the description
+// of a forward op and offers read-only helpers over it; derived classes
+// implement operator() to produce the description(s) of the gradient op(s).
+class GradOpDescMakerBase {
+ public:
+  // Keeps a reference to `fwd_op` (no copy), so `fwd_op` must outlive the
+  // maker.
+  explicit GradOpDescMakerBase(const OpDescBind& fwd_op) : fwd_op_(fwd_op) {}
+  virtual ~GradOpDescMakerBase() = default;
+  // Builds the gradient op description(s) for the wrapped forward op.
+  virtual std::vector<std::unique_ptr<OpDescBind>> operator()() const = 0;
+ protected:
+  // Applies GradVarName to every name, preserving order.
+  static std::vector<std::string> ToGradNames(
+      const std::vector<std::string>& var_names) {
+    std::vector<std::string> ret_val;
+    ret_val.reserve(var_names.size());
+    std::transform(var_names.begin(), var_names.end(),
+                   std::back_inserter(ret_val), GradVarName);
+    return ret_val;
+  }
+  // Gradient variable names of the forward input parameter `name`.
+  std::vector<std::string> InputGrad(const std::string& name) const {
+    return ToGradNames(fwd_op_.Input(name));
+  }
+  // Gradient variable names of the forward output parameter `name`.
+  std::vector<std::string> OutputGrad(const std::string& name) const {
+    return ToGradNames(fwd_op_.Output(name));
+  }
+  // The accessors below forward directly to the forward op description.
+  std::vector<std::string> InputNames() const {
+    return this->fwd_op_.InputNames();
+  }
+  std::vector<std::string> OutputNames() const {
+    return this->fwd_op_.OutputNames();
+  }
+  std::vector<std::string> Input(const std::string& name) const {
+    return fwd_op_.Input(name);
+  }
+  std::vector<std::string> Output(const std::string& name) const {
+    return fwd_op_.Output(name);
+  }
+  const std::unordered_map<std::string, Attribute>& Attrs() const {
+    return fwd_op_.GetAttrMap();
+  }
+  // Returns the forward op's attribute `name`; fails a PADDLE_ENFORCE check
+  // if the forward op has no such attribute.
+  const Attribute& GetAttr(const std::string& name) const {
+    auto& map = fwd_op_.GetAttrMap();
+    auto it = map.find(name);
+    PADDLE_ENFORCE(it != map.end(), "Cannot find attribute %s", name);
+    return it->second;
+  }
+  std::string ForwardOpType() const { return this->fwd_op_.Type(); }
+ private:
+  const OpDescBind& fwd_op_;
+};
+// Convenience base for makers that produce exactly one gradient op: derived
+// classes implement Apply(), and operator() wraps its result in a
+// one-element vector.
+class SingleGradOpDescMaker : public GradOpDescMakerBase {
+ public:
+  using GradOpDescMakerBase::GradOpDescMakerBase;
+  // Marked `override` so that a signature drift from the base class becomes a
+  // compile error instead of silently declaring a new function.
+  std::vector<std::unique_ptr<OpDescBind>> operator()() const override {
+    std::vector<std::unique_ptr<OpDescBind>> retv;
+    retv.emplace_back(this->Apply());
+    return retv;
+  }
+ protected:
+  // Builds the single gradient op description.
+  virtual std::unique_ptr<OpDescBind> Apply() const = 0;
+};
+// Default gradient maker. It produces one op of type GradOpType() whose
+//   - inputs are every forward input, every forward output and the gradient
+//     of every forward output, and whose
+//   - outputs are the gradients of every forward input.
+// The forward op's attribute map is copied to the gradient op unchanged.
+class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
+ public:
+  using SingleGradOpDescMaker::SingleGradOpDescMaker;
+ protected:
+  std::unique_ptr<OpDescBind> Apply() const override {
+    // Hold the description in a unique_ptr from the start so that it is
+    // released if any of the calls below throws.
+    std::unique_ptr<OpDescBind> grad(new OpDescBind());
+    grad->SetType(this->GradOpType());
+    for (auto& input_param : this->InputNames()) {
+      grad->SetInput(input_param, this->Input(input_param));
+      grad->SetOutput(GradVarName(input_param), this->InputGrad(input_param));
+    }
+    for (auto& output_param : this->OutputNames()) {
+      grad->SetInput(output_param, this->Output(output_param));
+      grad->SetInput(GradVarName(output_param), this->OutputGrad(output_param));
+    }
+    grad->SetAttrMap(this->Attrs());
+    return grad;
+  }
+  // Type name of the generated op: the forward type followed by "_grad".
+  // Virtual so that derived makers can choose a different name.
+  virtual std::string GradOpType() const {
+    return this->ForwardOpType() + "_grad";
+  }
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -72,6 +72,22 @@ bool operator==(const LoD& a, const LoD& b) {
  return true;
 }
+size_t LoDTensor::NumElements(size_t level, size_t idx) const {
+  PADDLE_ENFORCE_LT(level, NumLevels());
+  PADDLE_ENFORCE_LT(idx, NumElements(level));
+  const bool is_last_level = (level + 1 == NumLevels());
+  if (is_last_level) {
+    // Last level of the LoD: the distance between two consecutive offsets is
+    // the number of records in the Tensor.
+    return lod_[level][idx + 1] - lod_[level][idx];
+  }
+  // Higher level with a lower level beneath it: slice out element `idx` and
+  // count the lower-level elements.
+  auto sliced = SliceInLevel(lod_, level, idx, idx + 1);
+  PADDLE_ENFORCE_GE(sliced.size(), 2);
+  // The LoD stores a 0 as a placeholder, so the count is size() - 1.
+  return sliced[1].size() - 1;
+}
 void LoDTensor::ShrinkLevels(size_t level_begin, size_t level_end) {
  auto new_lod = framework::SliceLevels(lod_, level_begin, level_end);
  lod_ = new_lod;

--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -15,7 +15,7 @@
 #pragma once
 #include <memory>
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #include <thrust/system/cuda/experimental/pinned_allocator.h>
@@ -29,7 +29,7 @@
 namespace paddle {
 namespace framework {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
 template <typename T>
 using Vector = std::vector<T>;
 #else
@@ -38,6 +38,18 @@ using Vector = thrust::host_vector<
    T, thrust::system::cuda::experimental::pinned_allocator<T>>;
 #endif
+/*
+ * 3-level LoD stores
+ *
+ * 0 10 20
+ * 0 5 10 15 20
+ * 0 2 5 7 10 12 15 20
+ *
+ * - in a level, each element indicates an offset in the underlying Tensor
+ * - the first element should be 0, which indicates that the first sequence
+ *   starts from offset 0
+ * - each sequence's begin and end (exclusive) are level[id] and level[id + 1]
+ */
 using LoD = std::vector<Vector<size_t>>;
 LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end);
@@ -65,11 +77,8 @@ class LoDTensor : public Tensor {
   * Get a element from LoD.
   */
  size_t lod_element(size_t level, size_t elem) const {
-    PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
+    PADDLE_ENFORCE_LT(level, NumLevels());
-                   NumLevels());
+    PADDLE_ENFORCE_LT(elem, NumElements(level));
-    PADDLE_ENFORCE(elem < NumElements(level),
-                   "element begin [%d] out of range [%d]", elem,
-                   NumElements(level));
    return (lod_)[level][elem];
  }
@@ -82,12 +91,23 @@ class LoDTensor : public Tensor {
   * Number of elements in a level.
   */
  size_t NumElements(size_t level = 0) const {
-    PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
+    PADDLE_ENFORCE_LT(level, NumLevels());
-                   NumLevels());
    // the last offset is the end of last element
    return (lod_)[level].size() - 1;
  }
+  /*
+   * Number of lower-level elements.
+   * For example, a 2-level lod-tensor
+   *
+   * 0-th level   |   |
+   * 1-th level   ||  |||
+   *
+   * NumElements(0, 0) get 2
+   * NumElements(0, 1) get 3
+   */
+  size_t NumElements(size_t level, size_t idx) const;
  /*
   * Shrink levels[level_begin:level_end]
   */

--- a/paddle/framework/lod_tensor.md
+++ b/paddle/framework/lod_tensor.md
 # Design Doc: LoD (Level-of-Detail) Tensor
-PaddlePaddle's RNN doesn't require that all instances have the same length.  To do so, we introduce an extension to Tensor, namely, LoD Tensor.
+Like other deep learning systems, PaddlePaddle supports training models from sequence data.  Also, like other systems, PaddlePaddle represents a mini-batch of sequences as a Tensor.  What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length, so there is no need to pad with zeros.
-## Challenge of Variable-length Inputs
+|                       | TensorFlow | PaddlePaddle |
+|-----------------------|------------|--------------|
+| RNN                   | Support    | Support      |
+| recursive RNN         | Support    | Support      |
+| padding zeros         | Must       | No need      |
+| blob data type        | Tensor     | LoDTensor    |
-People usually represent a mini-batch by a Tensor. For example, a mini-batch of 10 images, each of size 32x32, is a 10x32x32 Tensor.  So a transformation, T, of all images can be a matrix multiplication of the 10xOx32-dimensional tensor T and the 10x32x32 Tensor.
+PaddlePaddle achieves this flexibility by passing a new data type, *LoD Tensor*, which is a Tensor attached with a segmentation index known as *LoD*, between operators.  The LoD index doesn't only segment a tensor, but also recursively segments sub-sequences.  This document presents the design of LoD and LoDTensor.
-Another example is that each mini-batch contains 32 sentences, where each word is a D-dimensional one-hot vector.  If all sentences have the same length L, we can represent this mini-batch by a 32xLxD tensor.  However, in most cases, sentences have variable lengths, and we will need an index data structure to record these variable lengths.
-## LoD as a Solution
+## The Challenge: Variable-length Sequences
-### Mini-Batch of variable-length sentences
+Most deep learning systems represent a mini-batch as a Tensor.  For example, a mini-batch of 10 images, each of size 32x32, is a 10x32x32 Tensor.  Another example is that each mini-batch contains N sentences, where each word is a D-dimensional one-hot vector.  If all sentences have the same length L, we can represent this mini-batch by an NxLxD tensor.
-Let's imagine a mini-batch of 3 variable lengths sentences, containing 3, 1, and 2 words respectively.  We can represent it by a (3+1+2)xD tensor plus some index information:
+Both examples show that the elements of sequences are usually of the same size.  In the first example, all images are 32x32, and in the second one, all words are D-dimensional vectors.  It doesn't make sense to allow variable-sized images, as that would require transformations like convolution to handle variable-sized Tensors.
+The real challenge is that in most cases, sentences have variable lengths, and we will need an index data structure to segment the tensor into sequences.  Also, sequences might consist of sub-sequences.
+## A Solution: The LoD Index
+To understand our solution, it is best to look at some examples.
+### A Mini-Batch of Sentences
+Let's imagine a mini-batch of 3 variable-length sentences composed of 3, 1, and 2 words, respectively.  We can represent the mini-batch by a (3+1+2)xD tensor plus some index information:
 ```
-   3
 3   1 2
 ||| | ||
 ```
-Each `|` represents a D-dimensional word vectors.  The number 3 on top indicate 3 sentences, and numbers 3, 1, and 2 on the second level represent the number of words in each sentence.
+where each `|` represents a D-dimensional word vector.  The numbers, 3, 1, and 2, form a 1-level LoD.
+### Recursive Sequences
+Let's check another example of a 2-level LoD Tensor.  Consider a mini-batch of three articles with 3, 1, and 2 sentences, where each sentence consists of a variable number of words:
+```
+3           1  2
+3   2  4    1  2  3
+||| || |||| |  || |||
+```
-### Mini-Batch of variable-length videos
+### A Mini-Batch of Videos
-This approach generalizes to the case where elements are not words, but higher dimensional objects, like images.  Suppose that a mini-batch contains videos of the same frame size 640x480.  If a mini-batch contains 3 videos of 3, 1, and 2 frames respectively.  The underlying tensor is of size (3+1+2)x640x480.  The index information illustrates as:
+LoD tensors generalize to the case where elements are higher dimensional objects, like images.  Suppose that a mini-batch contains videos of the same frame size 640x480.  Here is a mini-batch of 3 videos with 3, 1, and 2 frames, respectively.
 ```
-     3
 3     1  2
 口口口 口 口口
 ```
-where each `口` represents an image.
+The underlying tensor is of size (3+1+2)x640x480, and each `口` represents a 640x480 image.
-### Mini-Batch of fixed-size images
+### A Mini-Batch of Images
-Let's get back to a typical example, image classification, where each mini-batch has M fixed-sized images.  The LoD Tensor representation is
+In traditional cases, like a mini-batch with N fixed-sized images, the LoD Tensor representation is as follows:
 ```
-     M
 1 1 1 1     1
 口口口口 ... 口
 ```
-The many 1's on the second level seem duplicated.  For this particular case of 2 levels and the second level always have length 1, we can ignore the LoD index.
+In this case, we don't lose any information by ignoring the many 1's in the index and simply considering this LoD Tensor as a usual Tensor:
-### Design and summarization
-In summary, as long as that the essential elements (words  or images) have the same size, we can represent mini-batches by a LoD Tensor:
+```
+口口口口 ... 口
+```
- The underlying tensor has size LxD1xD2x..., where D1xD2... is the size of the essential elements, and
+### Model Parameters
- The first dimension size L has an additonal property -- a LoD index as a nested vector:
-  ```c++
+A model parameter is just a usual Tensor, which, just like the above example, is a **0-level LoD Tensor**.
-  typedef std::vector<std::<vector>> LoD;
-  ```
- The LoD index is not necessary when there are only two levels and all elements of the second level have length 1.
-## Slicing of LoD Tensor
+## The LoD Tensor
-Consider that we have a network with three levels of RNN: the top level one handles articles, the second level one handles sentences, and the basic level one handles words.  This network requires that mini-batches represented by 3 level LoD Tensor, for example,
+Let us revisit the above example of the 2-level LoD Tensor:
 ```
-         3
 3           1  2
 3   2  4    1  2  3
 ||| || |||| |  || |||
 ```
-To allow each level of RNN to handle its input, we define **the slicing of a LoD Tensor is defined as getting the j-th sequence on level i, or the <i,j>-slice**
+It is indeed a tree, where leaves are elementary sequences identified by **branches**.
+For example, the third sentence in the above example is identified by branch <0,2>, where 0 indicates the first article with length 3, and 2 indicates the third sentence in this article with length 4.
+### The LoD Index
-For example, the <2,1>-slice of above slice is
+We can save the LoD index in the above example
 ```
-2
+3           1  2
-||
+3   2  4    1  2  3
 ```
-and the <1,2>-slice of above example is
+in a not-full 2D matrix:
+```c++
+typedef std::vector<std::vector<int> > LoD;
 ```
-2
-2  3
-|| |||
-```
-Let's go on slicing this slice.  Its <1,1>-slice is
+where
+- `LoD.size()` is the number of levels, or the maximum length of branches,
+- `LoD[i][j]` is the length of the j-th segment at the i-th level.
+## The Offset Representation
+To quickly access elementary sequences, we adopt an offset representation -- instead of saving the lengths, we save the beginning and ending elements of sequences.
+In the above example, we accumulate the length of elementary sequences:
 ```
-1
+3 2 4 1 2 3
-1
-|
 ```
-### The Slicing Algorithm
+into offsets
-The algorithm, with over-simplified data structure, is defined as
+```
+0  3  5   9   10  12   15
+   =  =   =   =   =    =
+   3  2+3 4+5 1+9 2+10 3+12
+```
-```c++
+so we know that the first sentence is from word 0 to word 3, and the second sentence is from word 3 to word 5.
-typedef std::vector<std::vector<int>> LoD;
-struct LoDTensor {
+Similarly, the lengths in the top level LoD
-  LoD lod_;
-  float* tensor_;
-};
-LoDTensor Slice(const LoDTensor& lodt, int level, int sequence);
+```
+3 1 2
 ```
-Let us revisit the example above
+are transformed into offsets of elements/words as follows:
 ```
-         3
+0 9     10  15
-3           1  2
+  =     =   =
-3   2  4    1  2  3
+  3+2+4 1+9 2+3+10
-||| || |||| |  || |||
 ```
-Suppose that we want to retrieve the <1,2>-slice
+so we can tell that the first article is from word 0 to word 9, and the second article is from word 9 to word 10.
+The complete offset representation is as follows:
 ```
-2
+0           9 10       15
-2  3
+0   3  5    9 10  12   15
-|| |||
+ ||| || |||| |  ||  |||
 ```
-we will need to find out the starting position of this slice by summing over all leaf nodes in `LoD` to the left of the slice, i.e., 3 + 2 + 4 + 1 = 10.
+## Slicing of LoD Tensors
+When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences.  Here we define the sequence identified by branch <i,j,...> as the **<i,j,...>-slice**.
-To avoid the traversal of the LoD tree at slicing time,  we can do it at the construction time -- instead of saving the lengths of the next level in the LoD tree, we can save the starting offset of the next level.  For example, above LoD Tensor can be transformed into
+For example, the <2>-slice of the above example is
 ```
-        0
+10      15
-0           9  10
+10  12  15
-0   3  5    9  10 12
+  || |||
-||| || |||| |  || |||
 ```
-We don't really need the 0 on top, so the LoD Tensor could be
+and the <2,0>-slice of the above slice is
 ```
-0           9  10
+10  12
-0   3  5    9  10 12
+  ||
-||| || |||| |  || |||
 ```
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -56,6 +56,12 @@ TEST_F(LoDTensorTester, NumElements) {
  ASSERT_EQ(lod_tensor_.NumElements(2), 8UL);
 }
+TEST_F(LoDTensorTester, NumElements2) {
+  // Exercises the two-argument overload NumElements(level, idx) on the
+  // fixture's LoD.
+  ASSERT_EQ(lod_tensor_.NumElements(0, 0), 2UL);
+  ASSERT_EQ(lod_tensor_.NumElements(0, 1), 2UL);
+  ASSERT_EQ(lod_tensor_.NumElements(1, 1), 2UL);
+}
 TEST_F(LoDTensorTester, ShrinkLevels) {
  // slice 1 level
  for (size_t level = 0; level < 3UL; ++level) {
@@ -65,7 +71,7 @@ TEST_F(LoDTensorTester, ShrinkLevels) {
    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level));
    ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
  }
-  // slice 2 level
+  // shrink 2 level
  for (size_t level = 0; level < 2UL; ++level) {
    LoDTensor new_lod_tensor = lod_tensor_;
    new_lod_tensor.ShrinkLevels(level, level + 2);

--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/block_desc.h"
+namespace paddle {
+namespace framework {
+// Builds a description from its parts. Only the type is written to the proto
+// here; inputs, outputs and attributes are kept in the local maps and are
+// copied into the proto by the next Sync().
+OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs,
+                       const VariableNameMap &outputs,
+                       const AttributeMap &attrs) {
+  op_desc_.set_type(type);
+  inputs_ = inputs;
+  outputs_ = outputs;
+  attrs_ = attrs;
+  // The local maps now differ from op_desc_. Without this flag Sync() is a
+  // no-op and Proto() would return an op without inputs/outputs/attrs.
+  need_update_ = true;
+}
+// Returns the underlying proto message after calling Sync() to flush pending
+// local changes into it.
+OpDesc *OpDescBind::Proto() {
+  this->Sync();
+  OpDesc *proto = &this->op_desc_;
+  return proto;
+}
+// Argument names bound to input parameter `name`. The enforce fails when the
+// op has no input with that name.
+const std::vector<std::string> &OpDescBind::Input(
+    const std::string &name) const {
+  auto found = this->inputs_.find(name);
+  PADDLE_ENFORCE(found != this->inputs_.end(),
+                 "Input %s cannot be found in Op %s", name, Type());
+  return found->second;
+}
+// Concatenates the argument names of all input parameters into one list,
+// following the iteration order of the input map.
+std::vector<std::string> OpDescBind::InputArgumentNames() const {
+  std::vector<std::string> all_args;
+  for (const auto &param : inputs_) {
+    const auto &args = param.second;
+    all_args.insert(all_args.end(), args.begin(), args.end());
+  }
+  return all_args;
+}
+// Binds `args` to input parameter `param_name`, replacing any previous
+// binding, and marks the description as out of sync with the proto.
+void OpDescBind::SetInput(const std::string &param_name,
+                          const std::vector<std::string> &args) {
+  this->need_update_ = true;
+  this->inputs_[param_name] = args;
+}
+// Argument names bound to output parameter `name`. The enforce fails when
+// the op has no output with that name.
+const std::vector<std::string> &OpDescBind::Output(
+    const std::string &name) const {
+  auto found = this->outputs_.find(name);
+  PADDLE_ENFORCE(found != this->outputs_.end(),
+                 "Output %s cannot be found in Op %s", name, Type());
+  return found->second;
+}
+// Concatenates the argument names of all output parameters into one list,
+// following the iteration order of the output map.
+std::vector<std::string> OpDescBind::OutputArgumentNames() const {
+  std::vector<std::string> all_args;
+  for (const auto &param : outputs_) {
+    const auto &args = param.second;
+    all_args.insert(all_args.end(), args.begin(), args.end());
+  }
+  return all_args;
+}
+// Binds `args` to output parameter `param_name`, replacing any previous
+// binding, and marks the description as out of sync with the proto.
+void OpDescBind::SetOutput(const std::string &param_name,
+                           const std::vector<std::string> &args) {
+  need_update_ = true;
+  outputs_[param_name] = args;
+}
+// Type tag of attribute `name`. The enforce fails when the attribute is not
+// set.
+AttrType OpDescBind::GetAttrType(const std::string &name) const {
+  auto found = attrs_.find(name);
+  PADDLE_ENFORCE(found != attrs_.end(), "Attribute %s is not found", name);
+  // The variant's which() index is shifted down by one to get the AttrType.
+  const int type_id = found->second.which() - 1;
+  return static_cast<AttrType>(type_id);
+}
+// Names of all attributes currently set, in the attribute map's iteration
+// order.
+std::vector<std::string> OpDescBind::AttrNames() const {
+  std::vector<std::string> names;
+  names.reserve(attrs_.size());
+  for (auto it = attrs_.begin(); it != attrs_.end(); ++it) {
+    names.push_back(it->first);
+  }
+  return names;
+}
+// Sets (or overwrites) attribute `name` and marks the description as out of
+// sync with the proto.
+void OpDescBind::SetAttr(const std::string &name, const Attribute &v) {
+  attrs_[name] = v;
+  this->need_update_ = true;
+}
+// Replaces the whole attribute map and marks the description as out of sync
+// with the proto.
+void OpDescBind::SetAttrMap(const AttributeMap &attr_map) {
+  this->attrs_ = attr_map;
+  this->need_update_ = true;
+}
+// Returns a copy of attribute `name`. The enforce fails when the attribute
+// is not set.
+Attribute OpDescBind::GetAttr(const std::string &name) const {
+  auto found = attrs_.find(name);
+  PADDLE_ENFORCE(found != attrs_.end(), "Attribute %s is not found", name);
+  const Attribute &value = found->second;
+  return value;
+}
+// Returns idx() of the block stored in attribute `name`. The enforce fails
+// when the attribute is not set.
+int OpDescBind::GetBlockAttr(const std::string &name) const {
+  auto found = attrs_.find(name);
+  PADDLE_ENFORCE(found != attrs_.end(), "Attribute %s is not found", name);
+  BlockDesc *block = boost::get<BlockDesc *>(found->second);
+  return block->idx();
+}
+// Read-only access to the attribute map.
+const AttributeMap &OpDescBind::GetAttrMap() const {
+  return this->attrs_;
+}
+// Replaces every occurrence of argument name `old_name` with `new_name` in
+// all inputs and outputs, then marks the description as out of sync with the
+// proto.
+void OpDescBind::Rename(const std::string &old_name,
+                        const std::string &new_name) {
+  auto rename_in = [&old_name, &new_name](VariableNameMap &vars) {
+    for (auto &param : vars) {
+      auto &args = param.second;
+      std::replace(args.begin(), args.end(), old_name, new_name);
+    }
+  };
+  rename_in(inputs_);
+  rename_in(outputs_);
+  need_update_ = true;
+}
+// Visitor over the Attribute variant: stores the held value in the matching
+// field of the OpDesc::Attr message passed to the constructor.
+struct SetAttrDescVisitor : public boost::static_visitor<void> {
+  explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
+  mutable OpDesc::Attr *attr_;
+  // Scalar alternatives: one setter each.
+  void operator()(int value) const { attr_->set_i(value); }
+  void operator()(float value) const { attr_->set_f(value); }
+  void operator()(const std::string &value) const { attr_->set_s(value); }
+  void operator()(bool value) const { attr_->set_b(value); }
+  // List alternatives: handed to VectorToRepeated with the matching
+  // repeated field.
+  void operator()(const std::vector<int> &values) const {
+    VectorToRepeated(values, attr_->mutable_ints());
+  }
+  void operator()(const std::vector<float> &values) const {
+    VectorToRepeated(values, attr_->mutable_floats());
+  }
+  void operator()(const std::vector<std::string> &values) const {
+    VectorToRepeated(values, attr_->mutable_strings());
+  }
+  void operator()(const std::vector<bool> &values) const {
+    VectorToRepeated(values, attr_->mutable_bools());
+  }
+  // A block is recorded by its idx().
+  void operator()(BlockDesc *block) const {
+    attr_->set_block_idx(block->idx());
+  }
+  // A blank attribute holds no value and is rejected.
+  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
+};
+// Rewrites the inputs, outputs and attributes of op_desc_ from the local
+// maps. Does nothing unless a local change has been recorded in
+// need_update_.
+void OpDescBind::Sync() {
+  if (!need_update_) {
+    return;
+  }
+
+  // Inputs: drop the old entries, then add one entry per parameter.
+  op_desc_.mutable_inputs()->Clear();
+  for (auto &in : inputs_) {
+    auto *in_desc = op_desc_.add_inputs();
+    in_desc->set_parameter(in.first);
+    VectorToRepeated(in.second, in_desc->mutable_arguments());
+  }
+
+  // Outputs: same procedure.
+  op_desc_.mutable_outputs()->Clear();
+  for (auto &out : outputs_) {
+    auto *out_desc = op_desc_.add_outputs();
+    out_desc->set_parameter(out.first);
+    VectorToRepeated(out.second, out_desc->mutable_arguments());
+  }
+
+  // Attributes: name, type tag (variant index minus one) and value.
+  op_desc_.mutable_attrs()->Clear();
+  for (auto &attr : attrs_) {
+    auto *attr_desc = op_desc_.add_attrs();
+    attr_desc->set_name(attr.first);
+    attr_desc->set_type(
+        static_cast<framework::AttrType>(attr.second.which() - 1));
+    SetAttrDescVisitor visitor(attr_desc);
+    boost::apply_visitor(visitor, attr.second);
+  }
+
+  need_update_ = false;
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <unordered_map>
+#include <vector>
+#include "paddle/framework/attribute.h"
+#include "paddle/framework/type_defs.h"
+#include "paddle/framework/var_desc.h"
+namespace paddle {
+namespace framework {
+class BlockDescBind;
+// Editable description of a single operator. Inputs, outputs and attributes
+// are held in local maps; Sync() copies them into the OpDesc proto.
+class OpDescBind {
+ public:
+  OpDescBind() {}
+  OpDescBind(const std::string &type, const VariableNameMap &inputs,
+             const VariableNameMap &outputs, const AttributeMap &attrs);
+  // Proto message of this op; calls Sync() first.
+  OpDesc *Proto();
+  std::string Type() const { return op_desc_.type(); }
+  void SetType(const std::string &type) { op_desc_.set_type(type); }
+  // Argument names bound to one input parameter / to all of them.
+  const std::vector<std::string> &Input(const std::string &name) const;
+  std::vector<std::string> InputArgumentNames() const;
+  void SetInput(const std::string &param_name,
+                const std::vector<std::string> &args);
+  // Argument names bound to one output parameter / to all of them.
+  const std::vector<std::string> &Output(const std::string &name) const;
+  std::vector<std::string> OutputArgumentNames() const;
+  void SetOutput(const std::string &param_name,
+                 const std::vector<std::string> &args);
+  // Not const: Proto() may have to synchronize first.
+  std::string DebugString() { return this->Proto()->DebugString(); }
+  bool HasAttr(const std::string &name) const {
+    return attrs_.count(name) != 0;
+  }
+  AttrType GetAttrType(const std::string &name) const;
+  std::vector<std::string> AttrNames() const;
+  void SetAttr(const std::string &name, const Attribute &v);
+  void SetBlockAttr(const std::string &name, BlockDescBind &block);
+  Attribute GetAttr(const std::string &name) const;
+  int GetBlockAttr(const std::string &name) const;
+  // Replaces argument name `old_name` with `new_name` in inputs and outputs.
+  void Rename(const std::string &old_name, const std::string &new_name);
+  // Only used in C++
+  const AttributeMap &GetAttrMap() const;
+  // Only used in C++
+  void SetAttrMap(const AttributeMap &attr_map);
+  // Parameter names (map keys), as opposed to the argument names above.
+  std::vector<std::string> InputNames() const { return MapKeys(inputs_); }
+  std::vector<std::string> OutputNames() const { return MapKeys(outputs_); }
+  void SetInputMap(const VariableNameMap &input) {
+    inputs_ = input;
+    need_update_ = true;
+  }
+  void SetOutputMap(const VariableNameMap &output) {
+    outputs_ = output;
+    need_update_ = true;
+  }
+  // Copies the local maps into op_desc_ if a change is pending.
+  void Sync();
+  const VariableNameMap &Inputs() const { return inputs_; }
+  const VariableNameMap &Outputs() const { return outputs_; }
+ private:
+  // Collects the keys of `map` in its iteration order.
+  template <typename MapType>
+  static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {
+    std::vector<typename MapType::key_type> keys;
+    keys.reserve(map.size());
+    for (const typename MapType::value_type &entry : map) {
+      keys.push_back(entry.first);
+    }
+    return keys;
+  }
+  OpDesc op_desc_;
+  VariableNameMap inputs_;
+  VariableNameMap outputs_;
+  AttributeMap attrs_;
+  // need_update_ indicates that there are local changes which have not been
+  // synchronized into op_desc_ yet. Every local change that has to reach the
+  // proto must set need_update_ to true.
+  bool need_update_{false};
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
@@ -19,21 +19,18 @@
 #include <unordered_map>
 #include "paddle/framework/attribute.h"
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/type_defs.h"
+#include "paddle/platform/macros.h"
 namespace paddle {
 namespace framework {
-class OperatorBase;
-using VariableNameMap = std::map<std::string, std::vector<std::string>>;
-using OpCreator = std::function<OperatorBase*(
-    const std::string& /*type*/, const VariableNameMap& /*inputs*/,
-    const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
 struct OpInfo {
  OpCreator creator_;
-  std::string grad_op_type_;
+  GradOpMakerFN grad_op_maker_;
-  OpProto* proto_;
+  OpProto* proto_{nullptr};
-  OpAttrChecker* checker_;
+  OpAttrChecker* checker_{nullptr};
  bool HasOpProtoAndChecker() const {
    return proto_ != nullptr && checker_ != nullptr;
@@ -46,30 +43,25 @@ struct OpInfo {
    return *proto_;
  }
-  const OpAttrChecker& Checker() const {
-    PADDLE_ENFORCE_NOT_NULL(checker_,
-                            "Operator Checker has not been registered");
-    return *checker_;
-  }
  const OpCreator& Creator() const {
    PADDLE_ENFORCE_NOT_NULL(creator_,
                            "Operator Creator has not been registered");
    return creator_;
  }
-  bool HasGradientOp() const { return !grad_op_type_.empty(); }
+  const GradOpMakerFN& GradOpMaker() const {
+    PADDLE_ENFORCE_NOT_NULL(grad_op_maker_,
+                            "Operator GradOpMaker has not been registered.");
+    return grad_op_maker_;
+  }
+  const OpAttrChecker* Checker() const { return checker_; }
 };
 class OpInfoMap {
 public:
  static OpInfoMap& Instance();
-  OpInfoMap(const OpInfoMap& o) = delete;
-  OpInfoMap(OpInfoMap&& o) = delete;
-  OpInfoMap& operator=(const OpInfoMap& o) = delete;
-  OpInfoMap& operator=(OpInfoMap&& o) = delete;
  bool Has(const std::string& op_type) const {
    return map_.find(op_type) != map_.end();
  }
@@ -105,6 +97,8 @@ class OpInfoMap {
 private:
  OpInfoMap() = default;
  std::unordered_map<std::string, const OpInfo> map_;
+  DISABLE_COPY_AND_ASSIGN(OpInfoMap);
 };
 }  // namespace framework

--- a/paddle/framework/op_proto_maker.h
+++ b/paddle/framework/op_proto_maker.h
@@ -44,11 +44,6 @@ class OpProtoAndCheckerMaker {
      var_->set_intermediate(true);
      return *this;
    }
-    VariableBuilder& NotInGradient() {
-      var_->set_not_in_gradient(true);
-      return *this;
-    }
  };
  VariableBuilder AddInput(const std::string& name, const std::string& comment);

--- a/paddle/framework/op_proto_maker_test.cc
+++ b/paddle/framework/op_proto_maker_test.cc
@@ -48,4 +48,4 @@ TEST(ProtoMaker, DuplicatedInOut) {
  paddle::framework::OpAttrChecker op_checker;
  auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker);
  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
 }
\ No newline at end of file
--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
@@ -23,7 +23,9 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
    const std::string& type, const VariableNameMap& inputs,
    const VariableNameMap& outputs, AttributeMap attrs) {
  auto& info = OpInfoMap::Instance().Get(type);
-  info.Checker().Check(attrs);
+  if (info.Checker() != nullptr) {
+    info.Checker()->Check(attrs);
+  }
  auto op = info.Creator()(type, inputs, outputs, attrs);
  return std::unique_ptr<OperatorBase>(op);
 }
@@ -52,9 +54,15 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
  return CreateOp(op_desc.type(), inputs, outputs, attrs);
 }
-std::unique_ptr<OperatorBase> OpRegistry::CreateGradOp(const OperatorBase& op) {
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDescBind& op_desc) {
-  PADDLE_ENFORCE(!op.IsNetOp(), "Use framework::Backward to get backward ops");
+  return CreateOp(op_desc.Type(), op_desc.Inputs(), op_desc.Outputs(),
-  return std::unique_ptr<OperatorBase>(BuildGradOp(&op));
+                  op_desc.GetAttrMap());
+}
+// Creates the gradient op descriptions for `op_desc` by invoking the grad-op
+// maker registered for its operator type.
+std::vector<std::unique_ptr<OpDescBind>> OpRegistry::CreateGradOpDescs(
+    const OpDescBind& op_desc) {
+  auto& info = OpInfoMap::Instance().Get(op_desc.Type());
+  // Use the checked accessor: an operator without a registered maker then
+  // fails with a descriptive enforce error instead of std::bad_function_call.
+  return info.GradOpMaker()(op_desc);
 }
 }  // namespace framework

--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -21,49 +21,54 @@ limitations under the License. */
 #include <unordered_map>
 #include <unordered_set>
 #include "paddle/framework/attribute.h"
+#include "paddle/framework/details/op_registry.h"
 #include "paddle/framework/framework.pb.h"
-#include "paddle/framework/grad_op_builder.h"
+#include "paddle/framework/grad_op_desc_maker.h"
-#include "paddle/framework/op_info.h"
+#include "paddle/framework/op_desc.h"
-#include "paddle/framework/op_proto_maker.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"
 namespace paddle {
 namespace framework {
+class Registrar {
+ public:
+  // Registration is performed by the constructor of a global registrar
+  // variable; operators, kernels and other kinds of classes each have their
+  // own registry and registrar. Code that calls package framework never
+  // references those globals, so the linker would remove them from the
+  // generated binary file. To prevent that, every registrar class has Touch()
+  // and the USE_OP macros call it: as long as the calling code uses USE_OP,
+  // the global registrar variable is not removed by the linker.
+  void Touch() {}
+};
+// Registers operator type `op_type` in the global OpInfoMap. `info` is filled
+// by details::OperatorRegistrarRecursive (presumably one step per template
+// argument; see details/op_registry.h) and then inserted. Registering the
+// same type twice fails the enforce.
+template <typename... ARGS>
+struct OperatorRegistrar : public Registrar {
+  explicit OperatorRegistrar(const char* op_type) : op_type(op_type) {
+    static_assert(sizeof...(ARGS) != 0,
+                  "OperatorRegistrar should be invoked at least by OpClass");
+    auto& registry = OpInfoMap::Instance();
+    PADDLE_ENFORCE(!registry.Has(op_type),
+                   "'%s' is registered more than once.", op_type);
+    details::OperatorRegistrarRecursive<0, false, ARGS...>(op_type, &info);
+    registry.Insert(op_type, info);
+  }
+  const char* op_type;
+  OpInfo info;
+};
 class OpRegistry {
 public:
  template <typename OpType, typename ProtoMakerType, typename GradOpType>
  static void RegisterOp(const std::string& op_type,
                         const std::string& grad_op_type) {
-    PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type),
+    OperatorRegistrar<OpType, ProtoMakerType> reg(op_type.c_str());
-                   "'%s' is registered more than once.", op_type);
+    reg.info.grad_op_type_ = grad_op_type;
-    OpInfo op_info;
-    op_info.creator_ = [](
-        const std::string& type, const VariableNameMap& inputs,
-        const VariableNameMap& outputs, const AttributeMap& attrs) {
-      return new OpType(type, inputs, outputs, attrs);
-    };
-    op_info.grad_op_type_ = grad_op_type;
-    if (std::type_index(typeid(ProtoMakerType)) !=
-        std::type_index(typeid(NOPMaker))) {
-      op_info.proto_ = new OpProto;
-      op_info.checker_ = new OpAttrChecker;
-      auto maker = ProtoMakerType(op_info.proto_, op_info.checker_);
-      maker.Validate();
-      op_info.proto_->set_type(op_type);
-      PADDLE_ENFORCE(
-          op_info.proto_->IsInitialized(),
-          "Fail to initialize %s's OpProto, because %s is not initialized",
-          op_type, op_info.proto_->InitializationErrorString());
-    } else {
-      op_info.proto_ = nullptr;
-      op_info.checker_ = nullptr;
-    }
-    OpInfoMap::Instance().Insert(op_type, op_info);
    // register gradient op
    if (!grad_op_type.empty()) {
-      RegisterOp<GradOpType, NOPMaker, NOP>(grad_op_type, "");
+      OperatorRegistrar<GradOpType> grad_reg(grad_op_type.c_str());
    }
  }
@@ -74,20 +79,10 @@ class OpRegistry {
  static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);
-  static std::unique_ptr<OperatorBase> CreateGradOp(const OperatorBase& op);
+  static std::vector<std::unique_ptr<OpDescBind>> CreateGradOpDescs(
-};
+      const OpDescBind& op_desc);
-class Registrar {
+  static std::unique_ptr<OperatorBase> CreateOp(const OpDescBind& op_desc);
- public:
-  // In our design, various kinds of classes, e.g., operators and kernels,
-  // have their corresponding registry and registrar. The action of
-  // registration is in the constructor of a global registrar variable, which,
-  // however, are not used in the code that calls package framework, and would
-  // be removed from the generated binary file by the linker. To avoid such
-  // removal, we add Touch to all registrar classes and make USE_OP macros to
-  // call this method. So, as long as the callee code calls USE_OP, the global
-  // registrar variable won't be removed by the linker.
-  void Touch() {}
 };
 template <typename OpType, typename ProtoMakerType, typename GradOpType>
@@ -100,13 +95,39 @@ class OpRegistrar : public Registrar {
  }
 };
-template <typename PlaceType, typename KernelType>
+// Compile-time loop over a pack of kernel types. The functor at index I
+// registers the I-th kernel for an op type and then hands over to index I + 1;
+// `at_end` picks the terminating specialization once I reaches the pack size.
+template <typename PlaceType, bool at_end, size_t I, typename... KernelType>
+struct OpKernelRegistrarFunctor;
+// Recursive step: register KernelTypes[I], then continue with I + 1.
+template <typename PlaceType, size_t I, typename... KernelTypes>
+struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
+  using KERNEL_TYPE =
+      typename std::tuple_element<I, std::tuple<KernelTypes...>>::type;
+  void operator()(const char* op_type) const {
+    // The kernel key is built from the kernel's element type and a
+    // default-constructed PlaceType.
+    using ElemT = typename KERNEL_TYPE::ELEMENT_TYPE;
+    const auto dtype = ToDataType(std::type_index(typeid(ElemT)));
+    OperatorWithKernel::OpKernelKey key(dtype, PlaceType());
+    OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
+    // Move on to the next kernel type; the `true` specialization stops the
+    // recursion when I + 1 equals the number of kernel types.
+    constexpr bool reached_end = (I + 1 == sizeof...(KernelTypes));
+    OpKernelRegistrarFunctor<PlaceType, reached_end, I + 1, KernelTypes...>()(
+        op_type);
+  }
+};
+// Terminating step: nothing left to register.
+template <typename PlaceType, size_t I, typename... KernelType>
+struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
+  void operator()(const char* op_type) const {}
+};
+// Users can register many kernels in one place. Their data types may differ.
+template <typename PlaceType, typename... KernelType>
 class OpKernelRegistrar : public Registrar {
 public:
  explicit OpKernelRegistrar(const char* op_type) {
-    OperatorWithKernel::OpKernelKey key;
+    OpKernelRegistrarFunctor<PlaceType, false, 0, KernelType...> func;
-    key.place_ = PlaceType();
+    func(op_type);
-    OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KernelType);
  }
 };
@@ -119,33 +140,41 @@ class OpKernelRegistrar : public Registrar {
                             __test_global_namespace_##uniq_name##__>::value, \
                msg)
+/**
+ * Macro to register an operator through OperatorRegistrar.
+ *
+ * Expands to:
+ *   - a static assertion that the macro is used in the global namespace;
+ *   - a class _OpClass_<op_type>_ derived from op_class, with the clone
+ *     method and constructor supplied by DEFINE_OP_CLONE_METHOD and
+ *     DEFINE_OP_CONSTRUCTOR;
+ *   - a static OperatorRegistrar instantiated with that class followed by the
+ *     variadic arguments, constructed with the stringified op_type;
+ *   - a function TouchOpRegistrar_<op_type>() that calls Touch() on the
+ *     registrar and returns 0.
+ */
+#define REGISTER_OPERATOR(op_type, op_class, ...)                      \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                      \
+      __reg_op__##op_type,                                             \
+      "REGISTER_OPERATOR must be called in global namespace");         \
+  class _OpClass_##op_type##_ : public op_class {                      \
+   public:                                                             \
+    DEFINE_OP_CLONE_METHOD(_OpClass_##op_type##_);                     \
+    DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_, op_class);            \
+  };                                                                   \
+  static ::paddle::framework::OperatorRegistrar<_OpClass_##op_type##_, \
+                                                ##__VA_ARGS__>         \
+      __op_registrar_##op_type##__(#op_type);                          \
+  int TouchOpRegistrar_##op_type() {                                   \
+    __op_registrar_##op_type##__.Touch();                              \
+    return 0;                                                          \
+  }
 /**
 * Macro to register Operator.
 */
-#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type,          \
+#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type,           \
-                    grad_op_class)                                            \
+                    grad_op_class)                                             \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                                             \
+  REGISTER_OPERATOR(grad_op_type, grad_op_class);                              \
-      __reg_op__##op_type, "REGISTER_OP must be called in global namespace"); \
+  class _GradOpDescMaker_##grad_op_type##_                                     \
-  class _OpClass_##op_type##_ : public op_class {                             \
+      : public ::paddle::framework::DefaultGradOpDescMaker {                   \
-   public:                                                                    \
+    using ::paddle::framework::DefaultGradOpDescMaker::DefaultGradOpDescMaker; \
-    DEFINE_OP_CLONE_METHOD(_OpClass_##op_type##_);                            \
+                                                                               \
-    DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_, op_class);                   \
+   protected:                                                                  \
-  };                                                                          \
+    virtual std::string GradOpType() const { return #grad_op_type; }           \
-  class _OpGradClass_##op_type##_ : public grad_op_class {                    \
+  };                                                                           \
-   public:                                                                    \
+  REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_,     \
-    DEFINE_OP_CLONE_METHOD(_OpGradClass_##op_type##_);                        \
+                    op_maker_class);
-    DEFINE_OP_CONSTRUCTOR(_OpGradClass_##op_type##_, grad_op_class);          \
-  };                                                                          \
-  static ::paddle::framework::OpRegistrar<                                    \
-      _OpClass_##op_type##_, op_maker_class, _OpGradClass_##op_type##_>       \
-      __op_registrar_##op_type##__(#op_type, #grad_op_type);                  \
-  int TouchOpRegistrar_##op_type() {                                          \
-    __op_registrar_##op_type##__.Touch();                                     \
-    return 0;                                                                 \
-  }
 #define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
-  REGISTER_OP(op_type, op_class, op_maker_class, , ::paddle::framework::NOP)
+  REGISTER_OPERATOR(op_type, op_class, op_maker_class)
 /**
 * Macro to register OperatorKernel.
@@ -192,7 +221,7 @@ class OpKernelRegistrar : public Registrar {
 // TODO(fengjiayi): The following macros
 // seems ugly, do we have better method?
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
 #define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU)
 #else
 #define USE_OP_KERNEL(op_type)        \

--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -10,7 +10,6 @@ class CosineOp : public OperatorBase {
  using OperatorBase::OperatorBase;
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {}
-  void InferShape(const Scope& scope) const override {}
 };
 class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -29,7 +28,6 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 class MyTestOp : public OperatorBase {
 public:
  using OperatorBase::OperatorBase;
-  void InferShape(const Scope& scope) const override {}
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {}
 };
@@ -175,3 +173,14 @@ TEST(OpRegistry, CustomChecker) {
  int test_attr = op->Attr<int>("test_attr");
  ASSERT_EQ(test_attr, 4);
 }
+class CosineOpComplete : public paddle::framework::CosineOp {
+ public:
+  DEFINE_OP_CONSTRUCTOR(CosineOpComplete, paddle::framework::CosineOp);
+  DEFINE_OP_CLONE_METHOD(CosineOpComplete);
+};
+TEST(OperatorRegistrar, Test) {
+  using namespace paddle::framework;
+  OperatorRegistrar<CosineOpComplete, CosineOpProtoAndCheckerMaker> reg("cos");
+}
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -22,14 +22,14 @@ namespace framework {
 template <>
 Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
    platform::CPUPlace, Eigen::DefaultDevice>() const {
-  return *device_context_.get_eigen_device<Eigen::DefaultDevice>();
+  return *device_context_.GetEigenDevice<platform::CPUPlace>();
 }
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 template <>
 Eigen::GpuDevice&
 ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-  return *device_context_.get_eigen_device<Eigen::GpuDevice>();
+  return *device_context_.GetEigenDevice<platform::GPUPlace>();
 }
 #endif
@@ -245,5 +245,12 @@ std::vector<Tensor*> InferShapeContext::MultiOutput<Tensor>(
  return res;
 }
+// Writes a kernel key as "place[<place>]:data_type[<data type>]".
+std::ostream& operator<<(std::ostream& os,
+                         const OperatorWithKernel::OpKernelKey& kernel_key) {
+  return os << "place[" << kernel_key.place_ << "]:data_type["
+            << kernel_key.data_type_ << "]";
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -15,12 +15,15 @@ limitations under the License. */
 #pragma once
 #include <algorithm>
+#include <atomic>
 #include <string>
 #include <unordered_map>
 #include <vector>
 #include "op_info.h"
 #include "paddle/framework/attribute.h"
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/data_type.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/scope.h"
@@ -82,10 +85,6 @@ class OperatorBase {
  virtual std::string DebugString() const;
-  /// InferShape infer the size of Variables used by this Operator with
-  /// information inside scope
-  virtual void InferShape(const Scope& scope) const = 0;
  /// Net will call this function to Run an op.
  virtual void Run(const Scope& scope,
                   const platform::DeviceContext& dev_ctx) const = 0;
@@ -163,7 +162,6 @@ class OperatorBase {
 class NOP : public OperatorBase {
 public:
  using OperatorBase::OperatorBase;
-  void InferShape(const Scope& scope) const override {}
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {}
  std::unique_ptr<OperatorBase> Clone() const override {
@@ -299,21 +297,6 @@ template <>
 std::vector<Tensor*> InferShapeContext::MultiOutput<Tensor>(
    const std::string& name) const;
-template <typename T>
-struct EigenDeviceConverter;
-template <>
-struct EigenDeviceConverter<platform::CPUPlace> {
-  using EigenDeviceType = Eigen::DefaultDevice;
-};
-#ifndef PADDLE_ONLY_CPU
-template <>
-struct EigenDeviceConverter<platform::GPUPlace> {
-  using EigenDeviceType = Eigen::GpuDevice;
-};
-#endif
 class ExecutionContext : public InferShapeContext {
 public:
  ExecutionContext(const OperatorBase& op, const Scope& scope,
@@ -321,8 +304,8 @@ class ExecutionContext : public InferShapeContext {
      : InferShapeContext(op, scope), device_context_(device_context) {}
  template <typename PlaceType,
-            typename DeviceType =
+            typename DeviceType = typename platform::EigenDeviceConverter<
-                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
+                PlaceType>::EigenDeviceType>
  DeviceType& GetEigenDevice() const;
  platform::Place GetPlace() const { return device_context_.GetPlace(); }
@@ -335,46 +318,170 @@ class ExecutionContext : public InferShapeContext {
  const platform::DeviceContext& device_context_;
 };
+// Shape-inference context for compile time. Input/output names and attributes
+// come from an OpDescBind; variable shapes are read from and written to the
+// variable descriptions of a BlockDescBind. Both are stored as references.
+class CompileTimeInferShapeContext : public InferShapeContextBase {
+ public:
+  CompileTimeInferShapeContext(const OpDescBind& op, const BlockDescBind& block)
+      : op_(op), block_(block) {}
+  // Returns whether the variable bound to input slot `name` exists in the
+  // block. Enforces that the slot holds exactly one variable name.
+  bool HasInput(const std::string& name) const override {
+    const std::vector<std::string>& input_names = op_.Input(name);
+    auto length = input_names.size();
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Input(%s) should have only one value, "
+                      "but it have %d now",
+                      name, length);
+    return block_.HasVar(input_names[0]);
+  }
+  // Returns whether the variable bound to output slot `name` exists in the
+  // block. Enforces that the slot holds exactly one variable name.
+  bool HasOutput(const std::string& name) const override {
+    const std::vector<std::string>& output_names = op_.Output(name);
+    auto length = output_names.size();
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Output(%s) should have only one value, "
+                      "but it have %d now",
+                      name, length);
+    return block_.HasVar(output_names[0]);
+  }
+  // Returns true only if every variable listed in input slot `name` exists in
+  // the block. Enforces that the slot is not empty.
+  bool HasInputs(const std::string& name) const override {
+    const std::vector<std::string>& input_names = op_.Input(name);
+    PADDLE_ENFORCE(!input_names.empty(), "Inputs(%s) length is 0", name);
+    for (auto& input : input_names) {
+      if (!block_.HasVar(input)) return false;
+    }
+    return true;
+  }
+  // Returns true only if every variable listed in output slot `name` exists in
+  // the block. Enforces that the slot is not empty.
+  bool HasOutputs(const std::string& name) const override {
+    const std::vector<std::string>& output_names = op_.Output(name);
+    // The message names the output slot; it previously said "Inputs", copied
+    // from HasInputs.
+    PADDLE_ENFORCE(!output_names.empty(), "Outputs(%s) length is 0", name);
+    for (auto& output : output_names) {
+      if (!block_.HasVar(output)) return false;
+    }
+    return true;
+  }
+  // Returns the shape of the single variable in input slot `name`. Enforces
+  // that the slot resolves to exactly one shape.
+  DDim GetInputDim(const std::string& name) const override {
+    std::vector<DDim> ddims = GetInputsDim(name);
+    auto length = ddims.size();
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Input(%s) should have 1 value, "
+                      "but it has %d now",
+                      name, length);
+    return ddims[0];
+  }
+  // Forwards a one-element shape list to SetInputsDim for slot `name`.
+  void SetInputDim(const std::string& name, const DDim& dim) override {
+    SetInputsDim(name, {dim});
+  }
+  // Returns the shape of the single variable in output slot `name`. Enforces
+  // that the slot resolves to exactly one shape.
+  DDim GetOutputDim(const std::string& name) const override {
+    std::vector<DDim> ddims = GetOutputsDim(name);
+    auto length = ddims.size();
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Output(%s) should have 1 value, "
+                      "but it has %d now",
+                      name, length);
+    return ddims[0];
+  }
+  // Forwards a one-element shape list to SetOutputsDim for slot `name`.
+  void SetOutputDim(const std::string& name, const DDim& dim) override {
+    SetOutputsDim(name, {dim});
+  }
+  // Attribute reader over the op description's attribute map.
+  AttrReader Attrs() const override { return AttrReader(op_.GetAttrMap()); }
+  // Variable names bound to input slot `name`.
+  const std::vector<std::string>& Inputs(
+      const std::string& name) const override {
+    return op_.Input(name);
+  }
+  // Variable names bound to output slot `name`.
+  const std::vector<std::string>& Outputs(
+      const std::string& name) const override {
+    return op_.Output(name);
+  }
+ private:
+  // Reads the shape stored in the block's description of variable `name`.
+  DDim GetDim(const std::string& name) const override {
+    return framework::make_ddim(block_.Var(name)->Shape());
+  }
+  // Stores `dim` as the shape in the block's description of variable `name`.
+  void SetDim(const std::string& name, const DDim& dim) override {
+    block_.Var(name)->SetShape(framework::vectorize(dim));
+  }
+  const OpDescBind& op_;
+  const BlockDescBind& block_;
+};
 class RuntimeInferShapeContext : public InferShapeContextBase {
 public:
  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
      : op_(op), scope_(scope) {}
-  bool HasInput(const std::string& name) const {
+  bool HasInput(const std::string& name) const override {
    auto ipt = op_.Input(name);
    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
    return var != nullptr;
  }
-  bool HasOutput(const std::string& name) const {
+  bool HasOutput(const std::string& name) const override {
    auto ipt = op_.Output(name);
    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
    return var != nullptr;
  }
-  DDim GetInputDim(const std::string& name) const {
+  // True only if input slot `name` lists at least one variable and every
+  // listed variable can be found in the scope.
+  bool HasInputs(const std::string& name) const override {
+    const auto var_names = op_.Inputs(name);
+    return !var_names.empty() &&
+           std::all_of(var_names.begin(), var_names.end(),
+                       [this](const std::string& var_name) {
+                         return scope_.FindVar(var_name) != nullptr;
+                       });
+  }
+  // True only if output slot `name` lists at least one variable and every
+  // listed variable can be found in the scope.
+  bool HasOutputs(const std::string& name) const override {
+    const auto var_names = op_.Outputs(name);
+    return !var_names.empty() &&
+           std::all_of(var_names.begin(), var_names.end(),
+                       [this](const std::string& var_name) {
+                         return scope_.FindVar(var_name) != nullptr;
+                       });
+  }
+  DDim GetInputDim(const std::string& name) const override {
    return GetDim(op_.Input(name));
  }
-  void SetInputDim(const std::string& name, const DDim& dim) {
+  void SetInputDim(const std::string& name, const DDim& dim) override {
    SetDim(op_.Input(name), dim);
  }
-  DDim GetOutputDim(const std::string& name) const {
+  DDim GetOutputDim(const std::string& name) const override {
    return GetDim(op_.Output(name));
  }
-  void SetOutputDim(const std::string& name, const DDim& dim) {
+  void SetOutputDim(const std::string& name, const DDim& dim) override {
    SetDim(op_.Output(name), dim);
  }
-  AttrReader Attrs() const { return AttrReader(op_.Attrs()); }
+  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
-  const std::vector<std::string>& Inputs(const std::string& name) const {
+  const std::vector<std::string>& Inputs(
+      const std::string& name) const override {
    return op_.Inputs(name);
  }
-  const std::vector<std::string>& Outputs(const std::string& name) const {
+  const std::vector<std::string>& Outputs(
+      const std::string& name) const override {
    return op_.Outputs(name);
  }
@@ -395,11 +502,11 @@ class RuntimeInferShapeContext : public InferShapeContextBase {
    return t;
  }
-  DDim GetDim(const std::string& name) const {
+  DDim GetDim(const std::string& name) const override {
    return GetTensor<false>(name)->dims();
  }
-  void SetDim(const std::string& name, const DDim& dim) {
+  void SetDim(const std::string& name, const DDim& dim) override {
    GetTensor<true>(name)->Resize(dim);
  }
@@ -407,7 +514,7 @@ class RuntimeInferShapeContext : public InferShapeContextBase {
  const Scope& scope_;
 };
-class OpKernel {
+class OpKernelBase {
 public:
  /**
   * ExecutionContext is the only parameter of Kernel Run function.
@@ -418,48 +525,77 @@ class OpKernel {
  virtual void Compute(const ExecutionContext& context) const = 0;
-  virtual ~OpKernel() {}
+  virtual ~OpKernelBase() = default;
+};
+template <typename T>
+class OpKernel : public OpKernelBase {
+ public:
+  using ELEMENT_TYPE = T;
 };
 class OperatorWithKernel : public OperatorBase {
 public:
  struct OpKernelKey {
    platform::Place place_;
+    DataType data_type_;
-    OpKernelKey() = default;
+    OpKernelKey(DataType data_type, platform::Place place)
-    explicit OpKernelKey(const platform::DeviceContext& dev_ctx) {
+        : place_(place), data_type_(data_type) {}
-      place_ = dev_ctx.GetPlace();
-    }
+    OpKernelKey(DataType data_type, const platform::DeviceContext& dev_ctx)
+        : place_(dev_ctx.GetPlace()), data_type_(data_type) {}
    bool operator==(const OpKernelKey& o) const {
-      return platform::places_are_same_class(place_, o.place_);
+      return platform::places_are_same_class(place_, o.place_) &&
+             data_type_ == o.data_type_;
    }
  };
  struct OpKernelHash {
-    std::hash<bool> hash_;
+    std::hash<int> hash_;
    size_t operator()(const OpKernelKey& key) const {
-      return hash_(platform::is_gpu_place(key.place_));
+      int place = key.place_.which();
+      int data_type = static_cast<int>(key.data_type_);
+      int pre_hash = data_type << NUM_PLACE_TYPE_LIMIT_IN_BIT |
+                     (place & ((1 << NUM_PLACE_TYPE_LIMIT_IN_BIT) - 1));
+      return hash_(pre_hash);
    }
  };
  using OpKernelMap =
-      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
+      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernelBase>,
+                         OpKernelHash>;
  OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
                     const VariableNameMap& outputs, const AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
-  // runtime infershape
-  void InferShape(const Scope& scope) const override {
-    auto c = RuntimeInferShapeContext(*this, scope);
-    InferShape(&c);
-  }
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const final {
-    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
+    RuntimeInferShapeContext infer_shape_ctx(*this, scope);
-    opKernel->Compute(ExecutionContext(*this, scope, dev_ctx));
+    this->InferShape(&infer_shape_ctx);
+    ExecutionContext ctx(*this, scope, dev_ctx);
+    // check if op[type] has kernel registered.
+    auto& all_op_kernels = AllOpKernels();
+    auto kernels_iter = all_op_kernels.find(type_);
+    if (kernels_iter == all_op_kernels.end()) {
+      PADDLE_THROW("op[%s] has no kernel", type_);
+    }
+    // check if op[type] have kernel for kernel_key
+    OpKernelMap& kernels = kernels_iter->second;
+    auto kernel_key = OpKernelKey(IndicateDataType(ctx), dev_ctx);
+    auto kernel_iter = kernels.find(kernel_key);
+    if (kernel_iter == kernels.end()) {
+      PADDLE_THROW("op[%s] has no kernel with kernel_key[%s]", type_,
+                   kernel_key);
+    }
+    kernel_iter->second->Compute(ctx);
  }
  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
@@ -469,14 +605,47 @@ class OperatorWithKernel : public OperatorBase {
  }
  bool SupportGPU() const override {
-    OperatorWithKernel::OpKernelKey key;
+    auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_);
-    key.place_ = platform::GPUPlace();
+    return std::any_of(op_kernels.begin(), op_kernels.end(),
-    return OperatorWithKernel::AllOpKernels().at(type_).count(key) != 0;
+                       [](OpKernelMap::const_reference kern_pair) {
+                         return platform::is_gpu_place(kern_pair.first.place_);
+                       });
  }
- protected:
  virtual void InferShape(InferShapeContextBase* ctx) const = 0;
+ protected:
+  // Determines the kernel DataType from the op's inputs. Every input variable
+  // found in the scope that holds a Tensor or LoDTensor is inspected; by
+  // default all of them must share one data type, and at least one must exist.
+  virtual DataType IndicateDataType(const ExecutionContext& ctx) const {
+    auto& scope = ctx.scope();
+    int data_type = -1;  // -1 means "no tensor seen yet"
+    for (auto& slot : this->inputs_) {
+      for (auto& var_name : slot.second) {
+        auto* var = scope.FindVar(var_name);
+        if (var == nullptr) {
+          continue;
+        }
+        const Tensor* tensor = nullptr;
+        if (var->IsType<Tensor>()) {
+          tensor = &var->Get<Tensor>();
+        } else if (var->IsType<LoDTensor>()) {
+          tensor = &var->Get<LoDTensor>();
+        }
+        // Variables holding neither tensor kind do not take part.
+        if (tensor == nullptr) {
+          continue;
+        }
+        int current = static_cast<int>(ToDataType(tensor->type()));
+        PADDLE_ENFORCE(current == data_type || data_type == -1,
+                       "DataType of Paddle Op must be same.");
+        data_type = current;
+      }
+    }
+    PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
+    return static_cast<DataType>(data_type);
+  }
 };
+std::ostream& operator<<(std::ostream& os,
+                         const OperatorWithKernel::OpKernelKey& kernel_key);
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -27,7 +27,6 @@ class OpWithoutKernelTest : public OperatorBase {
  OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs,
                      const VariableNameMap& outputs, const AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs), x(1) {}
-  void InferShape(const Scope& scope) const override {}
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {
    ++op_run_num;
@@ -87,7 +86,6 @@ TEST(OperatorBase, all) {
  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  scope.NewVar("OUT1");
  ASSERT_EQ(paddle::framework::op_run_num, 0);
-  op->InferShape(scope);
  op->Run(scope, device_context);
  ASSERT_EQ(paddle::framework::op_run_num, 1);
 }
@@ -116,10 +114,13 @@ class OpWithKernelTest : public OperatorWithKernel {
 protected:
  void InferShape(framework::InferShapeContextBase* ctx) const override {}
+  DataType IndicateDataType(const ExecutionContext& ctx) const override {
+    return DataType::FP32;
+  }
 };
 template <typename T1, typename T2>
-class CPUKernelTest : public OpKernel {
+class CPUKernelTest : public OpKernel<float> {
 public:
  void Compute(const ExecutionContext& ctx) const {
    std::cout << "this is cpu kernel" << std::endl;
@@ -146,7 +147,7 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker
  }
 };
-class CPUKernalMultiInputsTest : public OpKernel {
+class CPUKernalMultiInputsTest : public OpKernel<float> {
 public:
  void Compute(const ExecutionContext& ctx) const {
    auto xs = ctx.op().Inputs("xs");
@@ -255,7 +256,6 @@ class OperatorClone : public paddle::framework::OperatorBase {
                const paddle::framework::VariableNameMap& outputs,
                const paddle::framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
-  void InferShape(const paddle::framework::Scope& scope) const override {}
  void Run(const paddle::framework::Scope& scope,
           const paddle::platform::DeviceContext& dev_ctx) const override {}
 };

--- a/paddle/framework/program_desc.cc
+++ b/paddle/framework/program_desc.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/program_desc.h"
+#include "paddle/framework/block_desc.h"
+namespace paddle {
+namespace framework {
+using ProgDescMap =
+    std::unordered_map<ProgramDesc *, std::unique_ptr<ProgramDescBind>>;
+static ProgDescMap *g_bind_map = nullptr;
+// Returns the binding associated with `prog`, creating it on first request.
+ProgramDescBind &ProgramDescBind::Instance(ProgramDesc *prog) {
+  // The registry is heap-allocated on first use; nothing in this file frees it.
+  if (!g_bind_map) {
+    g_bind_map = new ProgDescMap();
+  }
+  // operator[] yields an empty slot the first time `prog` is looked up.
+  std::unique_ptr<ProgramDescBind> &slot = (*g_bind_map)[prog];
+  if (!slot) {
+    slot.reset(new ProgramDescBind(prog));
+  }
+  return *slot;
+}
+// Adds a block to the program proto, records `parent` as its parent, wraps it
+// in a BlockDescBind and returns a pointer to that wrapper.
+BlockDescBind *ProgramDescBind::AppendBlock(const BlockDescBind &parent) {
+  auto *block_proto = prog_->add_blocks();
+  block_proto->set_parent_idx(parent.ID());
+  // The block was just appended, so its index is the proto's block count - 1.
+  const auto new_idx = prog_->blocks_size() - 1;
+  block_proto->set_idx(new_idx);
+  blocks_.emplace_back(new BlockDescBind(this, block_proto));
+  return blocks_.back().get();
+}
+// Calls Sync() on every block binding, then returns the underlying proto.
+ProgramDesc *ProgramDescBind::Proto() {
+  for (auto it = blocks_.begin(); it != blocks_.end(); ++it) {
+    (*it)->Sync();
+  }
+  return prog_;
+}
+// Binds to `prog` (not owned) and wraps every block already in the proto.
+ProgramDescBind::ProgramDescBind(ProgramDesc *prog) : prog_(prog) {
+  auto *proto_blocks = prog_->mutable_blocks();
+  for (auto &block_proto : *proto_blocks) {
+    blocks_.emplace_back(new BlockDescBind(this, &block_proto));
+  }
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <vector>
+#include "paddle/framework/framework.pb.h"
+#include "paddle/platform/macros.h"
+namespace paddle {
+namespace framework {
+class BlockDescBind;
+// Wrapper around a ProgramDesc proto that keeps one BlockDescBind per block.
+// Instances are obtained through Instance(); the constructor is private and
+// copying is disabled.
+class ProgramDescBind {
+ public:
+  // Returns the binding for `prog`, creating it on the first call for that
+  // pointer.
+  static ProgramDescBind &Instance(ProgramDesc *prog);
+  // Appends a new block whose parent index is parent.ID() and returns it.
+  BlockDescBind *AppendBlock(const BlockDescBind &parent);
+  // Returns the block at `idx`; the index is not bounds-checked.
+  BlockDescBind *Block(size_t idx) { return blocks_[idx].get(); }
+  // Debug string of the proto returned by Proto().
+  std::string DebugString() { return Proto()->DebugString(); }
+  // Number of blocks held by this binding.
+  size_t Size() const { return blocks_.size(); }
+  // Calls Sync() on every block, then returns the underlying proto.
+  ProgramDesc *Proto();
+ private:
+  explicit ProgramDescBind(ProgramDesc *prog);
+  // Not owned
+  ProgramDesc *prog_;
+  // One wrapper per block, owned by this object.
+  std::vector<std::unique_ptr<BlockDescBind>> blocks_;
+  DISABLE_COPY_AND_ASSIGN(ProgramDescBind);
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <unordered_map>
 #include "paddle/framework/variable.h"
+#include "paddle/platform/macros.h"
 namespace paddle {
 namespace framework {
@@ -38,11 +39,6 @@ class Scope {
  Scope() {}
  ~Scope();
-  // Disable Copy, Assign, Move.
-  Scope(const Scope& other) = delete;
-  Scope& operator=(const Scope& other) = delete;
-  Scope(Scope&& other) = delete;
  /// Create a sub-scope. Returns a reference other than a pointer so
  /// to prevent from manual deletion.
  /// Mark it to const because that new kid scope cannot change parent scope.
@@ -73,6 +69,8 @@ class Scope {
  std::unordered_map<std::string, Variable*> vars_;
  mutable std::list<Scope*> kids_;
  Scope const* parent_{nullptr};
+  DISABLE_COPY_AND_ASSIGN(Scope);
 };
 }  // namespace framework

--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -19,11 +19,18 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
+// TODO(longfei): Once after both CompileTimeInferShapeContext and
+// RuntimeInferShapeContext get merged, we can rename InferShapeContextBase into
+// InferShapeContext so to replace the current InferShapeContext.
 class InferShapeContextBase {
 public:
  virtual ~InferShapeContextBase() {}
  virtual bool HasInput(const std::string &name) const = 0;
  virtual bool HasOutput(const std::string &name) const = 0;
+  virtual bool HasInputs(const std::string &name) const = 0;
+  virtual bool HasOutputs(const std::string &name) const = 0;
  virtual framework::DDim GetInputDim(const std::string &name) const = 0;
  std::vector<framework::DDim> GetInputsDim(const std::string &name) const {
    const std::vector<std::string> &names = Inputs(name);

--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -29,20 +29,10 @@ limitations under the License. */
 namespace paddle {
-namespace pybind {
-namespace details {
-template <bool less, size_t i, typename... args>
-struct CastToPyBufferImpl;
-}
-}  // namespace pybind
 namespace framework {
 class Tensor {
 public:
-  template <bool less, size_t i, typename... args>
-  friend struct pybind::details::CastToPyBufferImpl;
  template <typename T, size_t D, int MajorType, typename IndexType>
  friend struct EigenTensor;
@@ -119,6 +109,8 @@ class Tensor {
    return holder_->place();
  }
+  std::type_index type() const { return holder_->type(); }
 private:
  template <typename T>
  inline void check_memory_size() const;

--- a/paddle/framework/tensor_array.cc
+++ b/paddle/framework/tensor_array.cc
--- a/paddle/framework/tensor_array.h
+++ b/paddle/framework/tensor_array.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <vector>
+#include "paddle/framework/lod_tensor.h"
+namespace paddle {
+namespace framework {
+/*
+ * DySeqMeta stores the indices of a basic element in a tensor. It is used
+ * after a lod-tensor is re-assembled; its info can be used to recover the
+ * order in the original lod-tensor.
+ */
+struct DySeqMeta {
+  DySeqMeta(size_t begin, size_t end, size_t ori_idx)
+      : begin(begin), end(end), ori_idx(ori_idx) {}
+  size_t begin;
+  size_t end;  // not included
+  size_t ori_idx;
+};
+/*
+ * TensorArray is a C-array-like array of tensors, it is meant to be used with
+ * dynamic iteration primitives such as while_loop. It is used to segment inputs
+ * and store states in all time steps.
+ *
+ * By providing some methods similar to a C++ array, the definition of some
+ * state-based dynamic models such as RNN could be more natural and highly
+ * flexible.
+ */
+class TensorArray {
+ public:
+  using value_type = float;
+  // max number of values allowed to store.
+  const size_t MAX_SIZE{100000};
+  /*
+   * Read the value at location `index` in the `TensorArray`.
+   */
+  const LoDTensor &Read(size_t index) const;
+  /*
+   * Write value into the index of the TensorArray.
+   */
+  void Write(size_t index, const LoDTensor &value);
+  /*
+   * Write value into the index of the TensorArray, with memory shared.
+   */
+  void WriteShared(size_t index, const LoDTensor &value);
+  /*
+   * Recover the original LoD-arranged LoDTensor from the `values`, using
+   * `level`, `meta` and `lod`.
+   */
+  LoDTensor Pack(size_t level, const std::vector<DySeqMeta> &meta,
+                 const LoD &lod) const;
+  /*
+   * Split the LoDTensor at some `level` and write the generated batches to
+   * `values`; if `length_desend` is set, sort by length in descending order,
+   * otherwise in ascending order.
+   */
+  std::vector<DySeqMeta> Unpack(const LoDTensor &source, int level,
+                                bool length_desend);
+  /*
+   * Pack the values into a tensor with rank one higher than each tensor in
+   * values.
+   */
+  LoDTensor Stack() const;
+  /*
+   * Unpacks the given division of a rank-`R` tensor into rank-`(R-1)` tensors.
+   */
+  void Unstack(const LoDTensor &source) const;
+  /*
+   * Unpacks the given division of a rank-`R` tensor into rank-`(R-1)` tensors,
+   * with memory of tensors shared.
+   */
+  void UnstackShared(const LoDTensor &source) const;
+  /*
+   * Return the number of values.
+   */
+  size_t size() const;
+ protected:
+  void Unstack(const LoDTensor &source, bool data_shared) const;
+ private:
+  mutable std::vector<LoDTensor> values_;
+};  // class TensorArray
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/tensor_array_test.cc
+++ b/paddle/framework/tensor_array_test.cc
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
--- a/paddle/framework/type_defs.h
+++ b/paddle/framework/type_defs.h
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
--- a/paddle/function/BlockExpandOp.cpp
+++ b/paddle/function/BlockExpandOp.cpp
@@ -194,7 +194,7 @@ public:
 REGISTER_TYPED_FUNC(BlockExpand, CPU, BlockExpandForward);
 REGISTER_TYPED_FUNC(BlockExpandGrad, CPU, BlockExpandBackward);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(BlockExpand, GPU, BlockExpandForward);
 REGISTER_TYPED_FUNC(BlockExpandGrad, GPU, BlockExpandBackward);
 #endif

--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
--- a/paddle/function/CosSimOp.cpp
+++ b/paddle/function/CosSimOp.cpp
--- a/paddle/function/CropOp.cpp
+++ b/paddle/function/CropOp.cpp
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
--- a/paddle/function/DepthwiseConvOp.cpp
+++ b/paddle/function/DepthwiseConvOp.cpp
--- a/paddle/function/DepthwiseConvOpTest.cpp
+++ b/paddle/function/DepthwiseConvOpTest.cpp
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
--- a/paddle/function/GemmConvOpTest.cpp
+++ b/paddle/function/GemmConvOpTest.cpp
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
--- a/paddle/function/MulOp.cpp
+++ b/paddle/function/MulOp.cpp
--- a/paddle/function/PadOp.cpp
+++ b/paddle/function/PadOp.cpp
--- a/paddle/function/RowConvOp.cpp
+++ b/paddle/function/RowConvOp.cpp
--- a/paddle/function/SwitchOp.cpp
+++ b/paddle/function/SwitchOp.cpp
--- a/paddle/function/neon/NeonDepthwiseConv.h
+++ b/paddle/function/neon/NeonDepthwiseConv.h
--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
--- a/paddle/gserver/layers/BatchNormalizationLayer.cpp
+++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
--- a/paddle/gserver/tests/test_BatchNorm.cpp
+++ b/paddle/gserver/tests/test_BatchNorm.cpp
--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
--- a/paddle/gserver/tests/test_DetectionOutput.cpp
+++ b/paddle/gserver/tests/test_DetectionOutput.cpp
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
--- a/paddle/gserver/tests/test_KmaxSeqScore.cpp
+++ b/paddle/gserver/tests/test_KmaxSeqScore.cpp
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
--- a/paddle/gserver/tests/test_PriorBox.cpp
+++ b/paddle/gserver/tests/test_PriorBox.cpp
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ b/paddle/gserver/tests/test_ProtoDataProvider.cpp
--- a/paddle/gserver/tests/test_PyDataProvider.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider.cpp
--- a/paddle/gserver/tests/test_SelectiveFCLayer.cpp
+++ b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
--- a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
+++ b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
--- a/paddle/math/RowBuffer.h
+++ b/paddle/math/RowBuffer.h
--- a/paddle/math/SparseMatrix.cpp
+++ b/paddle/math/SparseMatrix.cpp
--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
--- a/paddle/math/tests/test_Allocator.cpp
+++ b/paddle/math/tests/test_Allocator.cpp
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
--- a/paddle/math/tests/test_CpuGpuVector.cpp
+++ b/paddle/math/tests/test_CpuGpuVector.cpp
--- a/paddle/math/tests/test_ExecViaCpu.cpp
+++ b/paddle/math/tests/test_ExecViaCpu.cpp
--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ b/paddle/math/tests/test_GpuProfiler.cpp
--- a/paddle/math/tests/test_Matrix.cpp
+++ b/paddle/math/tests/test_Matrix.cpp
--- a/paddle/math/tests/test_SparseMatrix.cpp
+++ b/paddle/math/tests/test_SparseMatrix.cpp
--- a/paddle/math/tests/test_Tensor.cu
+++ b/paddle/math/tests/test_Tensor.cu
--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
--- a/paddle/math/tests/test_batchTranspose.cpp
+++ b/paddle/math/tests/test_batchTranspose.cpp
--- a/paddle/math/tests/test_lazyAssign.cu
+++ b/paddle/math/tests/test_lazyAssign.cu
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
--- a/paddle/math/tests/test_perturbation.cpp
+++ b/paddle/math/tests/test_perturbation.cpp
--- a/paddle/math/tests/test_sparseMatrixCompare.cpp
+++ b/paddle/math/tests/test_sparseMatrixCompare.cpp
--- a/paddle/memory/.clang-format
+++ b/paddle/memory/.clang-format
--- a/paddle/memory/.clang-format
+++ b/paddle/memory/.clang-format
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
--- a/paddle/memory/memcpy.h
+++ b/paddle/memory/memcpy.h
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
--- a/paddle/operators/.clang-format
+++ b/paddle/operators/.clang-format
--- a/paddle/operators/.clang-format
+++ b/paddle/operators/.clang-format
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
--- a/paddle/operators/accuracy_op.h
+++ b/paddle/operators/accuracy_op.h
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
--- a/paddle/operators/activation_op.cu
+++ b/paddle/operators/activation_op.cu
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
--- a/paddle/operators/adadelta_op.cc
+++ b/paddle/operators/adadelta_op.cc
--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
--- a/paddle/operators/adadelta_op.h
+++ b/paddle/operators/adadelta_op.h
--- a/paddle/operators/adagrad_op.cc
+++ b/paddle/operators/adagrad_op.cc
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
--- a/paddle/operators/clip_op.h
+++ b/paddle/operators/clip_op.h
--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
--- a/paddle/operators/concat_op.cu
+++ b/paddle/operators/concat_op.cu
--- a/paddle/operators/concat_op.h
+++ b/paddle/operators/concat_op.h
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
--- a/paddle/operators/cond_op.h
+++ b/paddle/operators/cond_op.h
--- a/paddle/operators/cos_sim_op.h
+++ b/paddle/operators/cos_sim_op.h
--- a/paddle/operators/crop_op.h
+++ b/paddle/operators/crop_op.h
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
--- a/paddle/operators/detail/strided_memcpy.h
+++ b/paddle/operators/detail/strided_memcpy.h
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
--- a/paddle/operators/elementwise_div_op.h
+++ b/paddle/operators/elementwise_div_op.h
--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
--- a/paddle/operators/elementwise_mul_op.cu
+++ b/paddle/operators/elementwise_mul_op.cu
--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
--- a/paddle/operators/elementwise_sub_op.h
+++ b/paddle/operators/elementwise_sub_op.h
--- a/paddle/operators/fc_op.cc
+++ b/paddle/operators/fc_op.cc
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
--- a/paddle/operators/gather.cu.h
+++ b/paddle/operators/gather.cu.h
--- a/paddle/operators/gather.h
+++ b/paddle/operators/gather.h
--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
--- a/paddle/operators/gather_op.cu
+++ b/paddle/operators/gather_op.cu
--- a/paddle/operators/gather_op.h
+++ b/paddle/operators/gather_op.h
--- a/paddle/operators/gather_test.cc
+++ b/paddle/operators/gather_test.cc
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
--- a/paddle/operators/gaussian_random_op.cu
+++ b/paddle/operators/gaussian_random_op.cu
--- a/paddle/operators/gemm_conv2d_op.h
+++ b/paddle/operators/gemm_conv2d_op.h
--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
--- a/paddle/operators/lookup_table_op.h
+++ b/paddle/operators/lookup_table_op.h
--- a/paddle/operators/lstm_unit_op.cc
+++ b/paddle/operators/lstm_unit_op.cc
--- a/paddle/operators/lstm_unit_op.cu
+++ b/paddle/operators/lstm_unit_op.cu
--- a/paddle/operators/lstm_unit_op.h
+++ b/paddle/operators/lstm_unit_op.h
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
--- a/paddle/operators/math/cross_entropy.cc
+++ b/paddle/operators/math/cross_entropy.cc
--- a/paddle/operators/math/cross_entropy.cu
+++ b/paddle/operators/math/cross_entropy.cu
--- a/paddle/operators/math/cross_entropy.h
+++ b/paddle/operators/math/cross_entropy.h
--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/operators/math/im2col_test.cc
--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
--- a/paddle/operators/math/pooling.cc
+++ b/paddle/operators/math/pooling.cc
--- a/paddle/operators/math/pooling.cu
+++ b/paddle/operators/math/pooling.cu
--- a/paddle/operators/math/pooling.h
+++ b/paddle/operators/math/pooling.h
--- a/paddle/operators/math/softmax.cc
+++ b/paddle/operators/math/softmax.cc
--- a/paddle/operators/math/softmax.cu
+++ b/paddle/operators/math/softmax.cu
--- a/paddle/operators/math/softmax.h
+++ b/paddle/operators/math/softmax.h
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
--- a/paddle/operators/minus_op.h
+++ b/paddle/operators/minus_op.h
--- a/paddle/operators/modified_huber_loss_op.cu
+++ b/paddle/operators/modified_huber_loss_op.cu
--- a/paddle/operators/modified_huber_loss_op.h
+++ b/paddle/operators/modified_huber_loss_op.h
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
--- a/paddle/operators/multiplex_op.h
+++ b/paddle/operators/multiplex_op.h
--- a/paddle/operators/net_op.h
+++ b/paddle/operators/net_op.h
--- a/paddle/operators/net_op_test.cc
+++ b/paddle/operators/net_op_test.cc
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
--- a/paddle/operators/pad_op.h
+++ b/paddle/operators/pad_op.h
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
--- a/paddle/operators/pool_op.cu
+++ b/paddle/operators/pool_op.cu
--- a/paddle/operators/pool_op.h
+++ b/paddle/operators/pool_op.h
--- a/paddle/operators/prelu_op.h
+++ b/paddle/operators/prelu_op.h
--- a/paddle/operators/rank_loss_op.h
+++ b/paddle/operators/rank_loss_op.h
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
--- a/paddle/operators/recurrent_op.h
+++ b/paddle/operators/recurrent_op.h
--- a/paddle/operators/reduce_op.cc
+++ b/paddle/operators/reduce_op.cc
--- a/paddle/operators/reduce_op.cu
+++ b/paddle/operators/reduce_op.cu
--- a/paddle/operators/reduce_op.h
+++ b/paddle/operators/reduce_op.h
--- a/paddle/operators/reshape_op.h
+++ b/paddle/operators/reshape_op.h
--- a/paddle/operators/rmsprop_op.cc
+++ b/paddle/operators/rmsprop_op.cc
--- a/paddle/operators/rmsprop_op.cu
+++ b/paddle/operators/rmsprop_op.cu
--- a/paddle/operators/add_op.h
+++ b/paddle/operators/add_op.h
--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
--- a/paddle/operators/rnn/recurrent_op_utils.h
+++ b/paddle/operators/rnn/recurrent_op_utils.h
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
--- a/paddle/operators/scale_op.h
+++ b/paddle/operators/scale_op.h
--- a/paddle/operators/scatter.cu.h
+++ b/paddle/operators/scatter.cu.h
--- a/paddle/operators/scatter.h
+++ b/paddle/operators/scatter.h
--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
--- a/paddle/operators/scatter_op.cu
+++ b/paddle/operators/scatter_op.cu
--- a/paddle/operators/scatter_op.h
+++ b/paddle/operators/scatter_op.h
--- a/paddle/operators/scatter_test.cc
+++ b/paddle/operators/scatter_test.cc
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
--- a/paddle/operators/sequence_softmax_op.cc
+++ b/paddle/operators/sequence_softmax_op.cc
--- a/paddle/operators/sequence_softmax_op.cu
+++ b/paddle/operators/sequence_softmax_op.cu
--- a/paddle/operators/sequence_softmax_op.h
+++ b/paddle/operators/sequence_softmax_op.h
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
--- a/paddle/operators/smooth_l1_loss_op.h
+++ b/paddle/operators/smooth_l1_loss_op.h
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
--- a/paddle/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/operators/softmax_with_cross_entropy_op.cu
--- a/paddle/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/operators/softmax_with_cross_entropy_op.h
--- a/paddle/operators/split_op.cc
+++ b/paddle/operators/split_op.cc
--- a/paddle/framework/grad_op_builder.h
+++ b/paddle/framework/grad_op_builder.h
--- a/paddle/operators/split_op.h
+++ b/paddle/operators/split_op.h
--- a/paddle/operators/squared_l2_distance_op.h
+++ b/paddle/operators/squared_l2_distance_op.h
--- a/paddle/operators/strided_memcpy_test.cc
+++ b/paddle/operators/strided_memcpy_test.cc
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
--- a/paddle/operators/sum_op.cu
+++ b/paddle/operators/sum_op.cu
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
--- a/paddle/operators/top_k_op.cu
+++ b/paddle/operators/top_k_op.cu
--- a/paddle/operators/top_k_op.h
+++ b/paddle/operators/top_k_op.h
--- a/paddle/operators/transpose_op.h
+++ b/paddle/operators/transpose_op.h
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
--- a/paddle/platform/enforce_test.cc
+++ b/paddle/platform/enforce_test.cc
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
--- a/paddle/platform/hostdevice.h
+++ b/paddle/platform/hostdevice.h
--- a/paddle/platform/macros.h
+++ b/paddle/platform/macros.h
--- a/paddle/platform/place.cc
+++ b/paddle/platform/place.cc
--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
--- a/paddle/platform/variant.h
+++ b/paddle/platform/variant.h
--- a/paddle/pserver/test/SocketTest.cpp
+++ b/paddle/pserver/test/SocketTest.cpp
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
--- a/paddle/pybind/.clang-format
+++ b/paddle/pybind/.clang-format
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
--- a/paddle/pybind/exception.cc
+++ b/paddle/pybind/exception.cc
--- a/paddle/pybind/exception.h
+++ b/paddle/pybind/exception.h
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
--- a/paddle/pybind/protobuf.h
+++ b/paddle/pybind/protobuf.h
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
--- a/paddle/string/.clang-format
+++ b/paddle/string/.clang-format
--- a/paddle/string/piece.h
+++ b/paddle/string/piece.h
--- a/paddle/string/printf_test.cc
+++ b/paddle/string/printf_test.cc
--- a/paddle/string/tinyformat/tinyformat.h
+++ b/paddle/string/tinyformat/tinyformat.h
--- a/paddle/string/to_string_test.cc
+++ b/paddle/string/to_string_test.cc
--- a/paddle/trainer/MergeModel.cpp
+++ b/paddle/trainer/MergeModel.cpp
--- a/paddle/trainer/tests/test_Compare.cpp
+++ b/paddle/trainer/tests/test_Compare.cpp
--- a/paddle/trainer/tests/test_CompareSparse.cpp
+++ b/paddle/trainer/tests/test_CompareSparse.cpp
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
--- a/paddle/trainer/tests/test_TrainerOnePass.cpp
+++ b/paddle/trainer/tests/test_TrainerOnePass.cpp
--- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp
+++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
--- a/paddle/utils/Util.h
+++ b/paddle/utils/Util.h
--- a/paddle/utils/Version.h
+++ b/paddle/utils/Version.h
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
--- a/python/paddle/v2/framework/tests/test_activation_op.py
+++ b/python/paddle/v2/framework/tests/test_activation_op.py
--- a/python/paddle/v2/framework/tests/test_adadelta_op.py
+++ b/python/paddle/v2/framework/tests/test_adadelta_op.py
--- a/python/paddle/v2/framework/tests/test_adagrad_op.py
+++ b/python/paddle/v2/framework/tests/test_adagrad_op.py
--- a/python/paddle/v2/framework/tests/test_add_op.py
+++ b/python/paddle/v2/framework/tests/test_add_op.py
--- a/python/paddle/v2/framework/tests/test_concat_op.py
+++ b/python/paddle/v2/framework/tests/test_concat_op.py
--- a/python/paddle/v2/framework/tests/test_cond_op.py
+++ b/python/paddle/v2/framework/tests/test_cond_op.py
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
--- a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
--- a/python/paddle/v2/framework/tests/test_exception.py
+++ b/python/paddle/v2/framework/tests/test_exception.py
--- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/framework/tests/test_gaussian_random_op.py
--- a/python/paddle/v2/framework/tests/test_gradient_checker.py
+++ b/python/paddle/v2/framework/tests/test_gradient_checker.py
--- a/python/paddle/v2/framework/tests/test_infer_shape.py
+++ b/python/paddle/v2/framework/tests/test_infer_shape.py
--- a/python/paddle/v2/framework/tests/test_lstm_unit_op.py
+++ b/python/paddle/v2/framework/tests/test_lstm_unit_op.py
--- a/python/paddle/v2/framework/tests/test_mnist.py
+++ b/python/paddle/v2/framework/tests/test_mnist.py
--- a/python/paddle/v2/framework/tests/test_net.py
+++ b/python/paddle/v2/framework/tests/test_net.py
--- a/python/paddle/v2/framework/tests/test_operator.py
+++ b/python/paddle/v2/framework/tests/test_operator.py
--- a/python/paddle/v2/framework/tests/test_pool2d_op.py
+++ b/python/paddle/v2/framework/tests/test_pool2d_op.py
--- a/python/paddle/v2/framework/tests/test_pool3d_op.py
+++ b/python/paddle/v2/framework/tests/test_pool3d_op.py
--- a/python/paddle/v2/framework/tests/test_prelu_op.py
+++ b/python/paddle/v2/framework/tests/test_prelu_op.py
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
--- a/python/paddle/v2/framework/tests/test_reduce_op.py
+++ b/python/paddle/v2/framework/tests/test_reduce_op.py
--- a/python/paddle/v2/framework/tests/test_rmsprop_op.py
+++ b/python/paddle/v2/framework/tests/test_rmsprop_op.py
--- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
--- a/python/paddle/v2/framework/tests/test_scatter_op.py
+++ b/python/paddle/v2/framework/tests/test_scatter_op.py
--- a/python/paddle/v2/framework/tests/test_sequence_softmax_op.py
+++ b/python/paddle/v2/framework/tests/test_sequence_softmax_op.py
--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/framework/tests/test_sgd_op.py
--- a/python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py
--- a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
--- a/python/paddle/v2/framework/tests/test_split_op.py
+++ b/python/paddle/v2/framework/tests/test_split_op.py
--- a/python/paddle/v2/framework/tests/test_tensor_array.py
+++ b/python/paddle/v2/framework/tests/test_tensor_array.py
--- a/python/paddle/v2/framework/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/framework/tests/test_uniform_random_op.py
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py