diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 51c3b918cc4ef4cf6c8052ccc14028a872309fcf..c1c93e17fd82ea048ba27b127b1527d9a8c9da41 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -49,11 +49,12 @@ if(NOT WITH_GOLANG)
 endif(NOT WITH_GOLANG)
 
 if(NOT WITH_GPU)
-    add_definitions(-DPADDLE_ONLY_CPU)
     add_definitions(-DHPPL_STUB_FUNC)
 
     list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
+    add_definitions(-DPADDLE_WITH_CUDA)
+
     FIND_PACKAGE(CUDA REQUIRED)
 
     if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
diff --git a/doc/api/v1/index_cn.rst b/doc/api/v1/index_cn.rst
index 3718cd73a2003b8ef6c406a9bd51dc68e76402dc..cf146dc088e3905a751ff55c26fd82ef0ba02c89 100644
--- a/doc/api/v1/index_cn.rst
+++ b/doc/api/v1/index_cn.rst
@@ -21,7 +21,7 @@ Model Config API
     trainer_config_helpers/optimizers.rst
     trainer_config_helpers/data_sources.rst
     trainer_config_helpers/layers.rst
-    trainer_config_helpers/activations.rst 
+    trainer_config_helpers/activations.rst
     trainer_config_helpers/poolings.rst
     trainer_config_helpers/networks.rst
     trainer_config_helpers/evaluators.rst
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index c94627a72806fa2eca77c79da24f7f3ca18f0259..d4e9d53e5c0955912a594fe8cd9cd41a4080a2d2 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -345,6 +345,11 @@ clip
 ..  autoclass:: paddle.v2.layer.clip
     :noindex:
 
+resize
+------
+..  autoclass:: paddle.v2.layer.resize
+    :noindex:
+
 slope_intercept
 ---------------
 ..  autoclass:: paddle.v2.layer.slope_intercept
diff --git a/doc/design/block.md b/doc/design/block.md
index be8800122035984df281692fc40009c397565046..4d5dd4ba95a686d18b2339c69f0316c340681909 100644
--- a/doc/design/block.md
+++ b/doc/design/block.md
@@ -55,17 +55,23 @@ Let us consolidate the discussion by presenting some examples.
 The following C++ programs shows how blocks are used with the `if-else` structure:
 
 ```c++
+namespace pd = paddle;
+
 int x = 10;
-int y = 20;
-int out;
+int y = 1;
+int z = 10;
 bool cond = false;
+int o1, o2;
 if (cond) {
   int z = x + y;
-  out = softmax(z);
+  o1 = z;
+  o2 = pd::layer::softmax(z);
 } else {
-  int z = fc(x);
-  out = z;
+  int d = pd::layer::fc(z);
+  o1 = d;
+  o2 = d+1;
 }
+
 ```
 
 An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](./if_else_op.md) is as follows:
@@ -73,57 +79,55 @@ An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator
 ```python
 import paddle as pd
 
-x = var(10)
-y = var(20)
-cond = var(false)
-ie = pd.create_ifelseop(inputs=[x], output_num=1)
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+
+ie = pd.ifelse()
 with ie.true_block():
-    x = ie.inputs(true, 0)
-    z = operator.add(x, y)
-    ie.set_output(true, 0, operator.softmax(z))
+    d = pd.layer.add_scalar(x, y)
+    ie.output(d, pd.layer.softmax(d))
 with ie.false_block():
-    x = ie.inputs(false, 0)
-    z = layer.fc(x)
-    ie.set_output(true, 0, operator.softmax(z))
-out = b(cond)
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
 ```
 
-In both examples, the left branch computes `softmax(x+y)` and the right branch computes `fc(x)`.
+In both examples, the left branch computes `x+y` and `softmax(x+y)`, and the right branch computes `fc(z)` and `fc(z)+1`.
 
 A difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances.  The `ie.input(true, 0)` invocation returns instances in the 0-th input, `x`, that corresponds to true values in `cond` as the local variable `x`, where `ie.input(false, 0)` returns instances corresponding to false values.
 
+
 ### Blocks with `for` and `RNNOp`
 
 The following RNN model from the [RNN design doc](./rnn.md)
 
 ```python
-x = sequence([10, 20, 30])
-m = var(0)
-W = tensor()
-U = tensor()
-
-rnn = create_rnn(inputs=[input])
-with rnn.stepnet() as net:
-  x = net.set_inputs(0)
-  h = net.add_memory(init=m)
-  fc_out = pd.matmul(W, x)
-  hidden_out = pd.matmul(U, h.pre(n=1))
-  sum = pd.add_two(fc_out, hidden_out)
-  act = pd.sigmoid(sum)
-  h.update(act)                       # update memory with act
-  net.set_outputs(0, act, hidden_out) # two outputs
-
+x = sequence([10, 20, 30]) # shape=[None, 1]
+m = var(0) # shape=[1]
+W = var(0.314, param=true) # shape=[1]
+U = var(0.375, param=true) # shape=[1]
+
+rnn = pd.rnn()
+with rnn.step():
+  h = rnn.memory(init = m)
+  hh = rnn.previous_memory(h)
+  a = layer.fc(W, x)
+  b = layer.fc(U, hh)  
+  s = pd.add(a, b)
+  act = pd.sigmoid(s)
+  rnn.update_memory(h, act)
+  rnn.output(a, b)
 o1, o2 = rnn()
-print o1, o2
 ```
-
 has its equivalent C++ program as follows
 
 ```c++
 int* x = {10, 20, 30};
-int m = 0;
-int W = some_value();
-int U = some_other_value();
+int* m = {0};
+int* W = {0.314};
+int* U = {0.375};
 
 int mem[sizeof(x) / sizeof(x[0]) + 1];
 int o1[sizeof(x) / sizeof(x[0]) + 1];
@@ -131,20 +135,16 @@ int o2[sizeof(x) / sizeof(x[0]) + 1];
 for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) {
   int x = x[i-1];
   if (i == 1) mem[0] = m;
-  int fc_out = W * x;
-  int hidden_out = Y * mem[i-1];
-  int sum = fc_out + hidden_out;
+  int a = W * x;
+  int b = U * mem[i-1];
+  int s = a + b;
   int act = sigmoid(sum);
   mem[i] = act;
   o1[i] = act;
   o2[i] = hidden_out;
 }
-
-print_array(o1);
-print_array(o2);
 ```
 
-
 ## Compilation and Execution
 
 Like TensorFlow programs, a PaddlePaddle program is written in Python.  The first part describes a neural network as a protobuf message, and the rest part executes the message for training or inference.
@@ -210,11 +210,11 @@ a = pd.Varaible(shape=[20, 20])
 b = pd.fc(a, params=["fc.w", "fc.b"])
 
 rnn = pd.create_rnn()
-with rnn.stepnet() as net:
-    x = net.set_inputs(a)
+with rnn.stepnet():
+    x = a.as_step_input()
     # reuse fc's parameter
     fc_without_b = pd.get_variable("fc.w")
-    net.set_outputs(fc_without_b)
+    rnn.output(fc_without_b)
 
 out = rnn()
 ```
diff --git a/doc/design/if_else_op.md b/doc/design/if_else_op.md
index 954a19c0733358c235eae3cffe134c23dac94c95..26d140f06db4ecefa86be015eaa731ffddc6910c 100644
--- a/doc/design/if_else_op.md
+++ b/doc/design/if_else_op.md
@@ -1,41 +1,51 @@
-IfOp should have only one branch. An IfOp operator takes a `cond` variable whose value must be a vector of N boolean elements. Its return value has N instances. If cond[i] == True, input instance input[i] will go through true_block() and generate output[i]; otherwise it will produce output from false_bloack().
+# The `IfElse` Operator
 
-```python
-import paddle as pd
+PaddlePaddle's `IfElse` operator differs from TensorFlow's:
 
-x = var()
-y = var()
-cond = var()
-default_value = var()
-b = pd.create_ifelseop(inputs=[x], output_num=1)
-with b.true_block():
-    x = b.inputs(0)
-    z = operator.add(x, y)
-    b.set_output(0, operator.softmax(z))
-
-with b.false_block():
-    x = b.inputs(0)
-    z = layer.fc(x)
-    b.set_output(0, operator.softmax(z))
-
-out = b(cond)
-```
+- the TensorFlow version takes a scalar boolean value as the condition so that the whole mini-batch goes to either the true or the false branch, whereas
+- the PaddlePaddle version takes a vector of boolean values as the condition: instances corresponding to true values go to the true branch, and those corresponding to false values go to the false branch.
+
+## Example
+
+The following PaddlePaddle program shows the usage of the IfElse operator:
 
-If only true_block is set in an IfElseOp, a special case is that we can have a default value for false as:
 ```python
 import paddle as pd
 
-x = var()
-y = var()
-cond = var()
-default_value = var()
-b = pd.create_ifelseop(inputs=[x], output_num=1, default_value)
-
-with b.true_block():
-    x = b.inputs(0)
-    z = operator.add(x, y)
-    b.set_output(0, operator.softmax(z))
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+
+ie = pd.ifelse()
+with ie.true_block():
+    d = pd.layer.add(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
 
-out = b(cond)
+A challenge to implement the `IfElse` operator is to infer those variables to be split, or, say, to identify the variable of the mini-batch or those derived from the mini-batch.
+
+An equivalent C++ program is as follows:
+
+```c++
+namespace pd = paddle;
+
+int x = 10;
+int y = 1;
+int z = 10;
+bool cond = false;
+int o1, o2;
+if (cond) {
+  int d = x + y;
+  o1 = d;
+  o2 = pd::layer::softmax(d);
+} else {
+  int d = pd::layer::fc(z);
+  o1 = d;
+  o2 = d+1;
+}
 ```
-where default_value is a list of vars for `cond` == False.
diff --git a/doc/design/program.md b/doc/design/program.md
index fb8f86ac07af403c9fee015f2a3adbfaa3c6d631..bd2456787c4e336d357a65255a8274a7c9e465cc 100644
--- a/doc/design/program.md
+++ b/doc/design/program.md
@@ -1,8 +1,10 @@
-# Design Doc: ProgramDesc
+# Design Doc: PaddlePaddle Programs
 
-The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program.
+## Compile and Execution
+
+A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`.
 
-As described in [graph.md](./graph.md), the first five lines of the following PaddlePaddle program
+A simple example PaddlePaddle program can be found in [graph.md](./graph.md):
 
 ```python
 x = layer.data("images")
@@ -13,36 +15,112 @@ optimize(cost)
 train(cost, reader=mnist.train())
 ```
 
-generates, or compiles, a PaddelPaddle program, which is represented by the following protobuf message:
+The first five lines of the above PaddlePaddle program generate, or compile, the `ProgramDesc` message.  The last line runs it.
 
-```protobuf
-message ProgramDesc {
-  repeated BlockDesc blocks = 1;
+## Programs and Blocks
+
+The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program.
+
+- program: some nested blocks
+- [block](./block.md):
+  - some local variable definitions, and
+  - a sequence of operators
+
+The concept of block comes from usual programs.  For example, the following C++ program has three blocks:
+
+```c++
+int main() { // block 0
+  int i = 0;
+  if (i < 10) { // block 1
+    for (int j = 0; j < 10; j++) { // block 2
+    }
+  }
+  return 0;
 }
+```
+
+The following PaddlePaddle program has three blocks:
+
+```python
+import paddle as pd  # block 0
+
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
 
+ie = pd.ifelse()
+with ie.true_block():  # block 1
+    d = pd.layer.add_scalar(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():  # block 2
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+
+## `BlockDesc` and `ProgramDesc`
+
+All protobuf messages are defined in `framework.proto`.
+
+`BlockDesc` is straight-forward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`.
+
+```protobuf
 message BlockDesc {
   required int32 parent = 1;
   repeated VarDesc vars = 2;
   repeated OpDesc ops = 3;
 }
+```
+
+The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks.
+
+All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array.
+
+```protobuf
+message ProgramDesc {
+  repeated BlockDesc blocks = 1;
+}
+```
+
+
+### Global Block
 
+The global block is the first one in the above array.
+
+## Operators that Use Blocks
+
+In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch.
+
+The definition of `OpDesc` shows that an operator could have some attributes:
+
+```protobuf
 message OpDesc {
   AttrDesc attrs = 1;
   ...
 }
+```
+
+and an attribute could be of type block, which is, in fact, a block ID as described above:
 
+```
 message AttrDesc {
-  required AttrType type = 1;
+  required string name = 1;
 
-  // index into ProgramDesc::blocks when type==BLOCK
-  optional int32 block = 2;
+  enum AttrType {
+    INT = 1,
+    STRING = 2,
+    ...
+    BLOCK = ...
+  }
+  required AttrType type = 2;
+
+  optional int32 block = 10; // when type == BLOCK
   ...
 }
 ```
 
-When each of the first five lines runs, related Python function, e.g., `layer.fc`, calls C++ InferShape functions.  This InferShape function needs to access the properties of VarDesc's accessed by the current OpDesc. These VarDesc's might not be defined in the current block, but in some ancestor blocks.  This requires that we can trace the parent of a block.
-
-A nested block is often an attribute of an operator, most likely, an IfElseOp or a WhileOp.  In above solution, all blocks are in `ProgramDesc::blocks`, this implicitly assigns a zero-based ID to each block -- the index of the block in `ProgramDesc::blocks`.  So that `AttrDesc::block` could be an integer block ID.
+## InferShape
 
 With this design, the InferShape function should take the following parameters:
 
diff --git a/doc/design/python_api.md b/doc/design/python_api.md
new file mode 100644
index 0000000000000000000000000000000000000000..6213da65c8c5931bc16e42574b8628b676424873
--- /dev/null
+++ b/doc/design/python_api.md
@@ -0,0 +1,216 @@
+# Design Doc: Python API
+
+Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
+
+| Python classes | Protobuf messages |
+| --- | --- |
+| Program | ProgramDesc |
+| Block | BlockDesc |
+| Operator | OpDesc |
+| Variable | VarDesc |
+
+Please be aware that these Python classes need to maintain some construction-time information, which is not part of the protobuf messages.
+
+## Core Concepts
+
+### Program
+
+A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s.  The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` only stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array.  For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks.
+
+Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`.
+
+```python
+class Program(object):
+    def __init__(self):
+        self.proto = core.NewProgram() # a C++ ProgramDesc pointer.
+        self.blocks = vector<Block>()
+        self.blocks.append(Block(self, -1)) # the global block
+        self.current_block = 0          # initialized to the global block
+
+    def global_block():
+        return self.blocks[0]
+
+    def current_block():
+        return self.get_block(self.current_block)
+
+    def rollback():
+        self.current_block = self.current_block().parent_idx
+
+    def create_block():
+        new_block_idx = len(self.blocks)
+        self.blocks.append(Block(self, self.current_block))
+        self.current_block = new_block_idx
+        return current_block()
+```
+
+`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`.
+
+`Program` creates the first block as the global block in its constructor.  All parameters and their initializer operators are in the global block.
+
+### Block
+
+A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) includes
+
+1. a map from variable names to an instance of the Python `Variable` class, and
+1. a list of `Operator` instances.
+
+```python
+class Block(object):
+    def __init__(self, program, parent_idx):
+        self.proto = core.NewBlock(program.proto)
+        self.program = program
+        self.vars = map<string, Variable>()
+        self.ops = vector<Operator>()
+        self.parent_idx = parent_idx
+
+    def create_var(self, ...):
+        return Variable(self, ...)
+
+    def _create_global_var(self, ...):
+        program.global_block().create_var(...)
+
+    def create_parameter(self, name, ...):
+        # Parameter is a subclass of variable. See Parameter section for details.
+        self.vars[name] = Parameter(self._create_global_var(...), ...)
+        return self.vars[name]
+
+    def append_operator(self, ...):
+        self.ops.append(Operator(self, ...))
+
+    def prepend_operator(self, ...): # Parameter's ctor prepends initialize operators.
+       self.ops.prepend(Operator(self, ...))
+```
+
+`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator.
+
+`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
+
+### Operator
+
+The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes.
+
+```python
+class Operator(object):
+    def __init__(self,
+                 block,  # Block
+                 type,   # string
+                 inputs, # dict<string, Variable>
+                 outputs,# dict<string, Variable>
+                 attrs   # dict<string, Any>
+                 ):
+        self.proto = core.NewOpDesc(block.proto, type, inputs, outputs, attrs)
+        core.infer_shape(self.proto, inputs, outputs)
+
+    def type(self):
+        return self.proto.type()
+```
+
+`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++.
+
+### Variable
+
+Operators take Variables as their inputs and outputs.
+
+```python
+class Variable(object):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape,           # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 ):
+        if name is None:
+            name = unique_name_generator()
+        self.name = name
+        self.block = block
+        self.proto = core.NewVarDesc(block.proto, name, shape, lod_level)
+        self.writer = None
+```
+
+Please be aware of `self.writer`, which tracks the operator that creates the variable.  It is possible that more than one operator writes a variable, but in Python space, each write to a variable is represented by a Variable class.  This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**.
+
+### Parameter
+
+A parameter is a global variable with an initializer (or load) operator.
+
+```python
+class Parameter(Variable):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape,           # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 trainable,       # bool
+                 initialize_op_attrs,
+                 optimize_op_attrs):
+        super(Parameter, self).__init__(block, name, shape, dtype, lod_level)
+        self.trainable = trainable
+        self.optimize_op_attrs = optimize_op_attrs
+        block.prepend(Operator(block,  # Block
+                               initialize_op_attrs['type'],   # string
+                               None,   # no inputs
+                               self,   # output is the parameter
+                               initialize_op_attrs)
+```
+
+When users create a parameter, they can call
+
+```python
+program.create_parameter(
+  ...,
+  init_attr={
+    type: "uniform_random",
+    min: -1.0,
+    max: 1.0,
+  })
+)
+```
+
+In the above example, `init_attr.type` names an initialize operator.  It can also name the load operator:
+
+```python
+init_attr={
+ type: "load",
+ filename: "something.numpy",
+}
+```
+
+`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message.
+
+## Layer Functions
+
+A layer is a Python function that creates some operators and variables.  Layers simplify the work of application programmers.
+
+### Data Layer
+
+```python
+def data_layer(name, type, column_name):
+    block = the_current_program.global_block()
+    var = block.create_global_var(
+            name=name,
+            shape=[None] + type.dims(),
+            dtype=type.dtype)
+    block.prepend_operator(block,
+                           type="Feed",
+                           inputs = None,
+                           outputs = [var],
+                           {column_name: column_name})
+    return var
+```
+
+The input to the feed operator is a special variable in the global scope, which is the output of [Python readers](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md).
+
+### FC Layer
+
+```python
+def fc_layer(input, size, ...):
+    block = program.current_block()
+    w = block.create_parameter(...)
+    b = block.create_parameter(...)
+    out = block.create_var()
+    op = block.append_operator("FC", X=input, W=w, b=b, out=out)
+    out.writer = op
+    return out
+```
diff --git a/doc/design/refactor/session.md b/doc/design/refactor/session.md
new file mode 100644
index 0000000000000000000000000000000000000000..1d9a26683c14f54e3b5fe41675cd03b5620646b8
--- /dev/null
+++ b/doc/design/refactor/session.md
@@ -0,0 +1,180 @@
+# Design Doc: Session
+
+## Abstract
+
+The *session* object encapsulates the environment in which the
+computation graph is executed.
+
+We will have the *local* session and the *remote* session; they offer the
+same [interface](#interface). The local session encapsulates the local
+runtime environment and the remote session encapsulates the cluster
+runtime environment.
+
+The local runtime environment contains:
+
+1. computation devices (i.e., CPU, GPU) handles, and
+1. the [scope](../scope.md) which holds all variables.
+
+The remote runtime environment contains:
+
+1. computation devices (i.e., CPU and GPU on node 0, 1) in a cluster,
+   and
+1. the distributed [scope](../scope.md) in a cluster which holds all
+   variables.
+
+The user can create a remote session on Paddle Cloud and evaluate the
+computation graph with it. In this way, the user can control the
+remote computation resource in a cluster from his local computer.
+
+
+## Background
+
+The current design has an implicit global session in which
+`paddle.eval()` is executed. The pain point is:
+
+Since the user is not able to explicitly switch between runtime
+environments, the user cannot run a topology in two independent
+environments.
+
+For example, in reinforcement learning, the user may want to have a
+stale model for inference and a fresh model for training, and only
+replace the stale model with the fresh model periodically.
+
+Furthermore, we have no concept that encapsulates a remote environment
+that executes a computation graph.
+
+We need the session object to address the above issues.
+
+
+## Session
+
+A session is an object that owns the runtime environment. All
+computations are executed through `session.eval()`.
+
+
+### Interface
+
+```python
+eval(
+    targets,
+    feed_dict=None,
+)
+```
+
+Evaluates the target Operations or Variables in `targets`.
+
+- *targets*: the evaluation targets. Can be a single Operation or
+  Variable, or a list with the Operations or Variables as
+  elements. The value returned by `eval()` has the same shape as the
+  `targets` argument.
+
+  The PaddlePaddle program is represented by
+  the [ProgramDesc](../program.md); `eval()` will infer the
+  ProgramDesc from the given targets and run the PaddlePaddle
+  program. Please
+  see
+  [this graph](./distributed_architecture.md#local-training-architecture) for
+  the detailed illustration for the local session
+  and
+  [this graph](./distributed_architecture.md#distributed-training-architecture) for
+  the detailed illustration for the remote session.
+
+- *feed_dict*: a dictionary that contains the tensors which override
+  the edges of the computation graph.
+
+  feed_dict not only can provide the input data, it can override any
+  OP's input as well:
+
+  ```python
+  a = pd.constant(2.0, name="a")
+  b = pd.variable(name="b")
+  c = pd.mul(a,b)
+  sess.eval(targets=c, feed_dict={"b":3.0}) # returns 6.0
+  ```
+
+```python
+close()
+```
+
+Closes the session and releases the scope that the session owns.
+
+
+### Create a Local Session
+
+```python
+session(
+    devices=None
+)
+```
+
+Creates a new session. One session owns one global scope, so creating
+multiple sessions will create different scopes.
+
+- *devices*: a single `string` or a list of `string` of device names,
+  the corresponding devices will be the computation devices for
+  `eval()`. If not specified, all available devices (e.g., all GPUs)
+  will be used. The user doesn't need to specify the CPU device since
+  it will be always used. Multiple sessions can use the same device.
+
+
+#### Example
+
+```Python
+a = paddle.constant(1.0)
+b = paddle.constant(2.0)
+c = a + b
+sess = paddle.session(devices=["gpu:0", "gpu:1", "fpga:0"])
+sess.eval(c)
+sess.close()
+```
+
+### Create a Remote Session
+
+```python
+create_cloud_job(
+    name,
+    num_trainer,
+    mem_per_trainer,
+    gpu_per_trainer,
+    cpu_per_trainer,
+    num_ps,
+    mem_per_ps,
+    cpu_per_ps,
+)
+```
+
+Creates a Paddle Cloud job. Fails if the job name exists.
+
+```python
+get_cloud_job(
+    name
+)
+```
+
+Gets a Paddle Cloud job.
+
+```python
+remote_session(
+    job
+)
+```
+
+- *job*: the Paddle Cloud job.
+
+#### Example
+
+```Python
+reader = paddle.reader.recordio("/pfs/home/peter/mnist-train-*") # data stored on Paddle Cloud
+image = reader.column(0)
+label = reader.column(1)
+fc1 = paddle.op.fc(image, size=256, act="sigmoid")
+fc2 = paddle.op.fc(fc1, size=10, act="softmax")
+cost = paddle.op.cross_entropy(fc2, label)
+opt = paddle.optimizer.sgd(cost)
+
+job = paddle.create_cloud_job("test", 3, "1G", 1, 1, 2, "1G", 1)
+sess = paddle.remote_session(job)
+for i in range(1000):
+    sess.eval(opt)
+sess.close()
+```
diff --git a/doc/design/refactorization.md b/doc/design/refactorization.md
index a07675b3e0494e189321cb638599bdd6ce31c0b4..629422e7743af666b42fd69fbff442ce15bef596 100644
--- a/doc/design/refactorization.md
+++ b/doc/design/refactorization.md
@@ -17,7 +17,7 @@ The goals of refactoring include:
 
 1. A graph is composed of *variables* and *operators*.
 
-1. The description of graphs must be capable of being serialized/deserialized, so that
+1. The description of graphs must be capable of being serialized/deserialized, so that:
 
    1. It can to be sent to the cloud for distributed execution, and
    1. It can be sent to clients for mobile or enterprise deployment.
@@ -137,19 +137,18 @@ Compile Time -> IR -> Runtime
 * `Eigen::Tensor` contains basic math and element-wise functions.
     * Note that `Eigen::Tensor` has broadcast implementation.
     * Limit the number of `tensor.device(dev) = ` in your code.
-* `thrust::tranform` and `std::transform`.
-    * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized elementwise kernels.
+* `thrust::transform` and `std::transform`.
+    * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels.
     * `thrust` also has more complex APIs, like `scan`, `reduce`, `reduce_by_key`.
 * Hand-writing `GPUKernel` and `CPU` code
     * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.)
 ---
 # Operator Registration
 
-## Why registration is necessary?
+## Why is registration necessary?
 We need a method to build mappings between Op type names and Op classes.
 
 ## How is registration implemented?
-
 Maintaining a map, whose key is the type name and the value is the corresponding Op constructor.
 
 ---
@@ -170,7 +169,7 @@ Maintaining a map, whose key is the type name and the value is the corresponding
 # Related Concepts
 
 ### Op_Maker
-It's constructor takes `proto` and `checker`. They are compeleted during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
+Its constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
 
 ### Register Macros
 ```cpp
@@ -200,7 +199,7 @@ Make sure the registration process is executed and linked.
 ---
 # Backward Module (2/2)
 ### Build Backward Network
-- **Input**: graph of forwarding operators
+- **Input**: graph of forward operators
 - **Output**: graph of backward operators
 - **Corner cases in construction**
 	- Shared Variables => insert an `Add` operator to combine gradients
@@ -224,7 +223,7 @@ Make sure the registration process is executed and linked.
 
 ---
 # Block (in design)
-## the difference with original RNNOp
+## the difference between original RNNOp and Block
 - As an operator is more intuitive than `RNNOp`,
 - Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
 - Fits the compile-time/ runtime separation design paradigm.
diff --git a/doc/design/register_grad_op.md b/doc/design/register_grad_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..3cf8a59446d244bb3a388b87b14273d9096c839a
--- /dev/null
+++ b/doc/design/register_grad_op.md
@@ -0,0 +1,90 @@
+# Design Doc: Gradient Operators Registration
+
+
+## The Problem Posed
+
+In our current operator registration mechanism, for each operator, the programmer should register a *gradient operator creator* function, which takes a C++ operator instance, and returns the corresponding gradient instance.
+
+However, as we decided to separate the *compilation* and *execution* of DL models, we need to reshape the creator to take a protobuf `OpDesc` message and return a corresponding message.
+
+More than that, the new registration mechanism needs to support the fact that an operator's gradient computation might be a composition of operators.
+
+## Current Implementation
+
+`OpInfo`s are stored in an associative map whose key is the operator type. The `grad_op_type_` field indicates the associated gradient operator type. An operator can create its gradient operator via the `OpInfo::creator_` of the gradient operator. The pseudo code is
+
+```cpp
+struct OpInfo {
+  std::function<OperatorBase*(...)> creator_;
+  std::string grad_op_type_;
+  ...
+};
+
+map<string, OpInfo> OpInfoMap;
+
+OperatorBase* CreateGradientOperator(const OperatorBase& op) {
+  return OpInfoMap.at(op.Type()).creator_(...);
+}
+```
+
+## Proposed Solution
+
+The mapping relationship between an operator and its gradient operators is a function. The interface of that function is:
+
+```cpp
+// (OpDesc) --> vector<OpDesc>
+std::function<std::vector<OpDescBind>(const OpDescBind&)>;
+```
+
+The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper for protobuf message `OpDesc` to manipulate `OpDesc` fast.
+
+The `GradOpDescMaker` will be registered in `OpInfo`, to replace `grad_op_type_` field. The `OpInfo` should be
+
+```cpp
+struct OpInfo {
+  std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>  grad_op_maker_;
+  ...
+};
+```
+
+The `grad_op_maker_` is `nullptr` if the operator does not have associated gradient operators.
+
+We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is
+
+```cpp
+class GradOpDescMakerBase {
+public:
+  GradOpDescMakerBase(const OpDescBind& );
+  virtual std::vector<std::unique_ptr<OpDescBind>> operator()()const = 0;
+};
+```
+
+We can convert `GradOpDescMakerBase` to `std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>` by
+
+```cpp
+using GradOpMaker = ...;
+std::function<std::vector<OpDescBind>(const OpDescBind&)> func;
+func = [] (const OpDescBind& fwd_op) {
+  GradOpMaker maker(fwd_op);
+  return maker();
+};
+```
+
+We can write many helper functions since the `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forward operator.
+
+We should change the register macros at the same time. In the current solution, there is no difference between forward operators and backward operators. So `REGISTER_OP` just registers one operator. If the `REGISTER_OPERATOR` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. It can be done by a macro that contains `__VA_ARGS__`.
+
+The user interface should be
+
+```cpp
+vector<OpDesc> MinusOpGradMaker(OpDesc) {...}
+REGISTER_OPERATOR(minus, MinusOp, MinusOpProtoAndCheckerMaker, MinusOpGradMaker);
+// Developers can still manually implement gradient operator.
+REGISTER_OPERATOR(minus_grad, MinusGradOp);
+```
+
+The interface of the current `REGISTER_OP` macro should remain unchanged. Internally, `REGISTER_OP` will invoke `REGISTER_OPERATOR` twice and generate the `GradOpDescMaker`.
+
+```cpp
+REGISTER_OP(minus, MinusOp, MinusOpProtoAndCheckerMaker, minus_grad, MinusGradOp);
+```
diff --git a/doc/design/tensor_array.md b/doc/design/tensor_array.md
index a0419ec002159893b035fae1300fce489e68936a..8378e97bf7cfaae54c36b1b92e202b16e4fe1e28 100644
--- a/doc/design/tensor_array.md
+++ b/doc/design/tensor_array.md
@@ -1,39 +1,250 @@
 # Design for TensorArray
+This design doc presents the necessity of a new C++ class `TensorArray`.
+In addition to the very simple C++ implementation
+
+```c++
+class TensorArray {
+ public:
+  explicit TensorArray(const LoDTensor&);
+  explicit TensorArray(size_t size);
+
+ private:
+  vector<LoDTensor> values_;
+};
+```
+
+We also need to expose it to PaddlePaddle's Python API,
+because users would want to use it with our very flexible operators `WhileLoop`.
+An example of an RNN based on dynamic operators is
+
+```python
+input = pd.data(...)
+num_steps = Var(12)
+
+TensorArray states(size=num_steps)
+TensorArray step_inputs(unstack_from=input)
+TensorArray step_outputs(size=num_steps)
+
+W = Tensor(...)
+U = Tensor(...)
+default_state = some_op()
+
+step = Var(1)
+
+wloop = paddle.create_whileloop(loop_vars=[step])
+with wloop.frame():
+    wloop.break_if(pd.equal(step, num_steps))
+    pre_state = states.read(step-1, default_state)
+    step_input = step_inputs.read(step)
+    state = pd.sigmoid(pd.matmul(U, pre_state) + pd.matmul(W, step_input))
+    states.write(step, state)
+    step_outputs.write(step, state) # output state
+    step.update(step+1)
+
+output = step_outputs.stack()
+```
+
+## Background
+Steps are one of the core concepts of RNN. In each time step of RNN, there should be several input segments, states, and output segments; all these components act like arrays, for example, calling `states[step_id]` will get the state in the `step_id`-th time step.
+
+An RNN can be implemented with the following pseudocode
+
+```c++
+Array states;
+Array input_segments;
+Array output_segments;
+Parameter W, U;
+
+step = 1
+seq_len = 12
+while_loop {
+   if (step == seq_len) break;
+    states[step] = sigmoid(W * states[step-1] + U * input_segments[step]);
+    output_segments[step] = states[step] // take state as output
+   step++;
+}
+```
+According to the [RNN roadmap](https://github.com/PaddlePaddle/Paddle/issues/4561), there are several different RNNs that PaddlePaddle will eventually support.
+
+Currently, the basic RNN implementation supported by PaddlePaddle is the `recurrent_op` which takes tensors as input and splits them into `input_segments`.
+
+
+Since a tensor cannot store variable-length sequences directly, PaddlePaddle implements the tensor with level of details (`LoDTensor` for short).
+Segmenting the `LoDTensor` is much more complicated than splitting a tensor, which makes it necessary to refactor the `recurrent_op` with `LoDTensor` segmenting support.
+
+As the next step in RNN support, `dynamic_recurrent_op` should be introduced to handle inputs with variable-length sequences.
+
+The implementation is similar to `recurrent_op`. 
+The key difference is the way **the original input `LoDTensors` and outputs are split to get the `input_segments` and the `output_segments`.**
+
+
+Though it can't be built over `recurrent_op` or `dynamic_recurrent_op` directly,
+the logic behind splitting a tensor or a LoD tensor into `input_segments` remains the same.
+
+## Why `TensorArray`
+The logic behind splitting the inputs to segments, states and outputs is similar and can be shared in a separate module.
+
+The array of `states`, `input_segments` and `output_segments` would be exposed to users when writing a dynamic RNN model similar to the above pseudocode.
+
+So there should be an array-like container, which can store the segments of a tensor or LoD tensor.
+
+**This container can store an array of tensors and provides several methods to split a tensor or a LoD tensor.**
+This is where the notion of `TensorArray` comes from.
+
+## Introduce TensorArray to unify all three RNNs
 TensorArray as a new concept is borrowed from TensorFlow, 
 it is meant to be used with dynamic iteration primitives such as `while_loop` and `map_fn`.
 
 This concept can be used to support our new design of dynamic operations, and help to refactor some existing variant-sentence-related layers, 
-such as `RecurrentGradientMachine`.
+such as `recurrent_op`, `RecurrentGradientMachine`.
 
 In [our design for dynamic RNN](https://github.com/PaddlePaddle/Paddle/pull/4401), 
 `TensorArray` is used to segment inputs and store states in all time steps.
 By providing some methods similar to a C++ array,
-the definition of some state-based dynamic models such as RNN could be more natural and highly flexible.
-
-## Dynamic-Related Methods
-Some basic methods should be proposed as follows:
-
-### stack()
-Pack the values in a `TensorArray` into a tensor with rank one higher than each tensor in `values`.
-### unstack(axis=0)
-Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors.
-### concat()
-Return the values in the `TensorArray` as a concatenated Tensor.
-### write(index, value, data_shared=true)
-Write value into index of the TensorArray.
-### read(index)
-Read the value at location `index` in the `TensorArray`.
-### size()
-Return the number of values.
+the definition of some state-based dynamic models such as RNN can be more natural and highly flexible.
+
+## Dynamic-operations on TensorArray
+
+`TensorArray` will be used directly when defining dynamic models, so some operators listed below should be implemented
+
+```python
+# several helper operators for TensorArray
+def tensor_array_stack(ta, tensor):
+    '''
+    get a tensor array `ta`, return a packed `tensor`.
+    '''
+    pass
+
+def tensor_array_unstack(tensor, ta):
+    '''
+    get a `tensor`, unstack it and get a tensor array `ta`.
+    '''
+    pass
+
+def tensor_array_write(ta, index, tensor, data_shared):
+    '''
+    get a `tensor` and a scalar tensor `index`, write `tensor` into index-th
+    value of the tensor array `ta`.
+    `data_shared` is an attribute that specifies whether to copy or reference the tensors.
+    '''
+    pass
+
+def tensor_array_read(ta, index, tensor):
+    '''
+    get a tensor array `ta`, a scalar tensor `index`, read the index-th value of
+    `ta` and return as the `tensor`.
+    '''
+    pass
+
+def tensor_array_size(ta, tensor):
+    '''
+    get a tensor array `ta`, return the size of `ta` and return as the scalar `tensor`.
+    '''
+    pass
+```
+
+It is tedious for users to use so many low-level operators, so some helper methods should be provided in the Python wrapper to make `TensorArray` easier to use,
+for example
+
+```python
+class TensorArray:
+    def __init__(self, name):
+        self.name = name
+        self.desc = TensorArrayDesc()
+
+    def stack(self, name=None):
+        '''
+        Pack the values in a `TensorArray` into a tensor with rank one higher
+        than each tensor in `values`.
+        `stack` can be used to concatenate all the time steps for RNN or whileloop.
+
+        @name: str
+            the name of the variable to output.
+        '''
+        tensor = NewVar(name)
+        tensor_array_stack(self.name, tensor)
+        return tensor
+
+    def unstack(self, input):
+        '''
+        Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors.
+        `unstack` can be used to split tensor into time steps for RNN or whileloop.
+
+        @input: str
+            the name of input tensor
+        '''
+        tensor_array_unstack(input, self.name)
+
+    def write(self, index, value, data_shared=True):
+        '''
+        Write value into index of the TensorArray.
+        If `data_shared` is set to True, then the index-th value in TensorArray will
+        be shared with the tensor passed in.
+
+        @index: str
+            name of a scalar tensor
+        @value: str
+            name of a tensor
+        @data_shared: bool
+        '''
+        tensor_array_write(self.name, index, value, data_shared)
+
+    def read(self, index, output):
+        '''
+        Read the value at location `index` in the `TensorArray`.
+
+        @index: str
+            name of a scalar tensor
+        @output:
+            name of a output variable
+        '''
+        tensor_array_read(self.name, index, output)
+
+
+    def size(self, output):
+        '''
+        Return the number of values.
+
+        @output: str
+            name of a scalar tensor
+        '''
+        tensor_array_size(self.name, output)
+```
 
 ## LoDTensor-related Supports
-The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes variant length sequences as input, 
-because each step of RNN could only take a tensor-represented batch of data as input, 
+The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes variable-length sequences as input, and outputs sequences too.
+
+Since each step of RNN can only take a tensor-represented batch of data as input, 
 some preprocess should be taken on the inputs such as sorting the sentences by their length in descending order and cut each word and pack to new batches.
 
-Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`.
+Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`,
+these two operations are similar to `stack` and `unstack` except that they operate on variable-length sequences formatted as a LoD tensor rather than a tensor.
+
+Some definitions are like
+
+```python
+def unpack(level):
+    '''
+    Split LodTensor in some `level` and generate batches, if set `sort_by_length`,
+    will sort by length.
 
-With these two methods, a variant-sentence-RNN can be implemented like
+    Returns:
+        - a new `TensorArray`, whose values are LodTensors and represents batches
+          of data.
+        - an int32 Tensor, which stores the map from the new batch's indices to
+          original LoDTensor
+    '''
+    pass
+
+def pack(level, indices_map):
+    '''
+    Recover the original LoD-arranged LoDTensor with the values in a `TensorArray`
+    and `level` and `indices_map`.
+    '''
+    pass
+```
+
+With these two methods, an RNN that supports variable-length sentences can be implemented like
 
 ```c++
 // input is the varient-length data
@@ -58,16 +269,3 @@ LoDTensor rnn_output = ta.pack(ta, indice_map);
 ```
 the code above shows that by embedding the LoDTensor-related preprocess operations into `TensorArray`,
 the implementation of a RNN that supports varient-length sentences is far more concise than `RecurrentGradientMachine` because the latter mixes all the codes together, hard to read and extend.
-
-
-some details are as follows.
-
-### unpack(level, sort_by_length)
-Split LodTensor in some `level` and generate batches, if set `sort_by_length`, will sort by length.
-
-Returns:
-
-- a new `TensorArray`, whose values are LodTensors and represents batches of data.
-- an int32 Tensor, which stores the map from the new batch's indices to original LoDTensor
-### pack(level, indices_map)
-Recover the original LoD-arranged LoDTensor with the values in a `TensorArray` and `level` and `indices_map`.
diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md
index 264b998f50df016da0741d97d4b26f759ee90900..c823d7e9fcd63dd7719ac1403952b03c2d2f03c0 100644
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -206,7 +206,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
     - `REGISTER_OP` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulOpGrad`，类型名为`mul_grad`。
     - `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op。
-    - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulKernel`类。
+    - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulGradKernel`类。
 
 
 - 在 `.cu`文件中注册GPU Kernel。
@@ -285,41 +285,27 @@ class TestMulGradOp(GradientChecker):
             'Y': np.random.random((84, 100)).astype("float32")
         }
 
-    def test_cpu_gpu_compare(self):
-        self.compare_grad(self.op, self.inputs)
-
-    def test_normal(self):
+    def test_check_grad_normal(self):
         # mul op will enlarge the relative error
-        self.check_grad(
-            self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5)
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
 
-    def test_ignore_x(self):
+    def test_check_grad_ingore_x(self):
         self.check_grad(
-            self.op,
-            self.inputs, ["Y"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"X"})
+            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
 
-    def test_ignore_y(self):
+    def test_check_grad_ingore_y(self):
         self.check_grad(
-            self.op,
-            self.inputs, ["X"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"Y"})
+            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
 ```
 
 下面解释代码中一些关键的地方:
 
 - 调用`create_op("mul")`创建反向Op对应的前向Op。
-- 调用`compare_grad`函数对比CPU、GPU计算结果。
-- `test_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
-  - 第一个参数`self.op` : 前向Op。
-  - 第二个参数`self.inputs` : 输入词典，词典的Key和`ProtoMaker`定义保持一致。
-  - 第三个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
-  - 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`
-- `test_ignore_x`和`test_ignore_y`分支用来测试只需要计算一个输入梯度的情况。
+- `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
+  - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
+  - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。
+  - 第三个参数`max_relative_error`：指定检测梯度时能容忍的最大错误值。
+- `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。
 
 
 ### 编译和执行单元测试
diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md
index bad1dbc1de9cc5bd11914fddf397857f0bda7976..1e88e1f5b4df710f1b69f0305d8d8a2921c4249a 100644
--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
@@ -205,7 +205,7 @@ The definition of its corresponding backward operator, if applicable, is similar
 
     - `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
     - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
-    - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulKernel`.
+    - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
 
 
 - Registering GPU Kernel in `.cu` files
@@ -293,41 +293,27 @@ class TestMulGradOp(GradientChecker):
             'Y': np.random.random((84, 100)).astype("float32")
         }
 
-    def test_cpu_gpu_compare(self):
-        self.compare_grad(self.op, self.inputs)
-
-    def test_normal(self):
+    def test_check_grad_normal(self):
         # mul op will enlarge the relative error
-        self.check_grad(
-            self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5)
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
 
-    def test_ignore_x(self):
+    def test_check_grad_ingore_x(self):
         self.check_grad(
-            self.op,
-            self.inputs, ["Y"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"X"})
+            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
 
-    def test_ignore_y(self):
+    def test_check_grad_ingore_y(self):
         self.check_grad(
-            self.op,
-            self.inputs, ["X"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"Y"})
+            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
 ```
 
 Some key points in the code above include:
 
 - `create_op("mul")` creates the backward operator's corresponding forward operator.
-- `compare_grad` compares results between utilizing the CPU and the GPU.
 - `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods.
-  - The first variable `self.op` denotes the forward operator.
-  - The second variable `self.inputs` denotes the input dictionary, which has its key value identical to its `ProtoMaker` definitions.
-  - The third variable `["X", "Y"]` appoints `X` and `Y` to be scale tested.
-  - The fourth variable `"Out"` points to the network's final output target `Out`.
-- `test_ignore_x` and `test_ignore_y`branches test the cases where there is only one scaling input.
+  - The first variable `["X", "Y"]` appoints `X` and `Y` to be scale tested.
+  - The second variable `"Out"` points to the network's final output target `Out`.
+  - The third variable `max_relative_error` points to the maximum relative tolerance error during scaling tests.
+- `test_check_grad_ingore_x` and `test_check_grad_ingore_y` branches test the cases where there is only one scaling input.
 
 ### Compiling and Running
 
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index d369df5d4e04b4a8d822db0e72a8051150868ce6..11bd05c09d1ecbbcec6b6596c16416c26635a072 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -47,7 +47,7 @@ bool isUsingGpu() { return FLAGS_use_gpu; }
 void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }
 
 bool isGpuVersion() {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   return false;
 #else
   return true;
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
index d898ebe2612d749ca261d35139d1cd45bd355eef..4547afaf1dc9af8bc7909a684db766fdd7b159c0 100644
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -46,7 +46,7 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
   if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
   paddle::real* buf = ptr->mat->getRowBuf(rowID);
   size_t width = ptr->mat->getWidth();
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   hl_memcpy(buf, rowArray, sizeof(paddle::real) * width);
 #else
   std::copy(rowArray, rowArray + width, buf);
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 8a5d8532bb32db917b893f7f59039e08d85c8c34..3e0e0f59038daa33cae1952ffbe5fc0bb1870485 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -22,14 +22,12 @@ cc_library(attribute SRCS attribute.cc DEPS framework_proto)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute)
 cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
-cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
-cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope)
+cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto proto_desc)
+cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope proto_desc)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
 
-cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator proto_desc)
-cc_library(op_registry SRCS op_registry.cc DEPS grad_op_builder op_proto_maker op_info)
+cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
-cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)
 
 py_proto_compile(framework_py_proto SRCS framework.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
@@ -43,3 +41,6 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
+
+cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor)
+cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place)
diff --git a/paddle/framework/attribute.h b/paddle/framework/attribute.h
index c7559cefb6415ee141f32e4357459653564cd2ac..d13530e3408a54c7ecab87c3bd9e6288e342f9af 100644
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@@ -21,20 +21,12 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/framework/framework.pb.h"
+#include "paddle/framework/type_defs.h"
 #include "paddle/platform/enforce.h"
-#include "paddle/platform/variant.h"
 
 namespace paddle {
 namespace framework {
 
-// The order should be as same as framework.proto
-typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
-                       std::vector<float>, std::vector<std::string>, bool,
-                       std::vector<bool>, BlockDesc*>
-    Attribute;
-
-typedef std::unordered_map<std::string, Attribute> AttributeMap;
-
 ProgramDesc& GetProgramDesc();
 
 template <typename T>
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 0ec18de5b8a0e7cebdb91c30d2b45596b02dfa51..c970e01dd19d80e9a47f315a05a920ba15585c90 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -13,10 +13,13 @@
    limitations under the License. */
 
 #include "paddle/framework/backward.h"
+#include "paddle/operators/net_op.h"
 
+#include <deque>
 #include <list>
 #include <memory>
 
+#include "paddle/framework/block_desc.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/operators/recurrent_op.h"
@@ -24,6 +27,35 @@
 namespace paddle {
 namespace framework {
 
+static inline std::unique_ptr<OperatorBase> CreateGradOp(
+    const OperatorBase& op) {
+  OpDescBind op_desc;  // description of the forward op, rebuilt from `op`
+  op_desc.SetInputMap(op.Inputs());
+  op_desc.SetOutputMap(op.Outputs());
+  op_desc.SetType(op.Type());
+  op_desc.SetAttrMap(op.Attrs());
+  auto& info = OpInfoMap::Instance().Get(op.Type());
+  auto grad_descs = info.GradOpMaker()(op_desc);  // gradient op descriptions
+  std::vector<std::unique_ptr<OperatorBase>> grad_ops;
+  grad_ops.reserve(grad_descs.size());
+  std::transform(grad_descs.begin(), grad_descs.end(),
+                 std::back_inserter(grad_ops),
+                 [](const std::unique_ptr<OpDescBind>& grad_desc) {
+                   return OpRegistry::CreateOp(*grad_desc);
+                 });
+  PADDLE_ENFORCE(!grad_ops.empty());  // the maker must yield at least one op
+  if (grad_ops.size() == 1) {
+    return std::move(grad_ops[0]);  // single gradient op: return it directly
+  } else {
+    auto net_op = new operators::NetOp();  // several ops: group into a NetOp
+    for (auto& grad_op : grad_ops) {
+      net_op->AppendOp(std::move(grad_op));
+    }
+    net_op->CompleteAddOp();
+    return std::unique_ptr<OperatorBase>(net_op);  // caller takes ownership
+  }
+}
+
 template <typename Map, typename T>
 static void ForEachVarName(const Map& names, T callback) {
   for (auto& name : names) {
@@ -141,9 +173,26 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
         net->ops_[op_offset]->Rename(name, dup_outputs.back());
       }
       // collect all the offset to append `add` op for each alias
-      insert_position.push_back(
-          {dup_op.back(), OpRegistry::CreateOp("add", {{"X", {dup_outputs}}},
-                                               {{"Out", {name}}}, {})});
+      //
+      // one variable is shared between multiple operators.
+      // insert add operator one by one, then add it to output
+      for (size_t output_idx = 0; output_idx < dup_outputs.size() - 1;
+           ++output_idx) {
+        auto insert_add_x = dup_outputs[output_idx];
+        auto insert_add_y = dup_outputs[output_idx + 1];
+        auto insert_add_out = name + "@SHARED@" + std::to_string(output_idx);
+        // first add op inserted
+        if (output_idx == dup_outputs.size() - 2) {
+          insert_add_out = name;
+        }
+        if (output_idx != 0) {
+          insert_add_y = name + "@SHARED@" + std::to_string(output_idx - 1);
+        }
+        insert_position.push_back(
+            {dup_op.back(),
+             OpRegistry::CreateOp("sum", {{"X", {insert_add_x, insert_add_y}}},
+                                  {{"Out", {insert_add_out}}}, {})});
+      }
     }
 
     // make sure the inserted `add` ops follow the BFS order.
@@ -154,7 +203,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
       net->InsertOp(pos.first + 1, std::move(pos.second));
     }
   } else {
-    std::unique_ptr<OperatorBase> grad_op(OpRegistry::CreateGradOp(forwardOp));
+    std::unique_ptr<OperatorBase> grad_op(CreateGradOp(forwardOp));
 
     ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op](
                                           const std::string& grad_input) {
@@ -182,7 +231,8 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
 
     // process recurrent gradient op as a special operator.
     if (forwardOp.Type() == "recurrent") {
-      // NOTE clean up cycle call somewhere (RNN's stepnet constains itself), or
+      // NOTE clean up cycle call somewhere (RNN's stepnet constains itself),
+      // or
       // this will result in infinite loop.
       const auto& rnnop =
           *static_cast<const operators::RecurrentOp*>(&forwardOp);
@@ -222,5 +272,145 @@ std::unique_ptr<OperatorBase> Backward(
   return BackwardRecursive(forwardOp, no_grad_names, uid);
 }
 
+// ====================================  //
+
+static bool AllGradInSet(const std::vector<std::string>& names,
+                         const std::unordered_set<std::string>& set) {
+  for (const std::string& name : names) {
+    if (!set.count(GradVarName(name))) {  // gradient name of `name` not in set
+      return false;
+    }
+  }
+  return true;  // every gradient name was found (also true for empty `names`)
+}
+
+std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
+    const std::unique_ptr<OpDescBind>& op_desc,
+    std::unordered_set<std::string>& no_grad_vars) {
+  std::vector<std::unique_ptr<OpDescBind>> grad_op_descs;
+  // All input gradients of the forward operator do not need to be calculated.
+  const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
+  if (AllGradInSet(inputs, no_grad_vars)) {
+    return grad_op_descs;  // empty vector
+  }
+  // All output gradients of the forward operator do not need to be calculated.
+  const std::vector<std::string>& outputs = op_desc->OutputArgumentNames();
+  if (AllGradInSet(outputs, no_grad_vars)) {
+    for (const std::string& name : inputs) {
+      no_grad_vars.insert(GradVarName(name));  // mutates the caller's set
+    }
+    return grad_op_descs;  // empty vector
+  }
+
+  grad_op_descs = OpRegistry::CreateGradOpDescs(*op_desc);
+
+  std::list<std::unique_ptr<OpDescBind>> pending_fill_zeros_ops;
+  for (auto& desc : grad_op_descs) {
+    for (const std::string& in_name : desc->InputArgumentNames()) {
+      if (no_grad_vars.count(in_name)) {
+        std::string prefix = in_name.substr(  // presumably drops the grad suffix
+            0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
+        std::string new_name = prefix + kZeroVarSuffix;
+        desc->Rename(in_name, new_name);  // read the zero-filled variable instead
+        std::unique_ptr<OpDescBind> fill_zeros_op(new OpDescBind(
+            "fill_zeros_like", {{"X", {prefix}}}, {{"Y", {new_name}}}, {}));
+        pending_fill_zeros_ops.push_back(std::move(fill_zeros_op));
+      }
+    }
+    for (const std::string& out_name : desc->OutputArgumentNames()) {
+      if (no_grad_vars.count(out_name)) {
+        desc->Rename(out_name, kEmptyVarName);  // this output is not wanted
+      }
+    }
+  }
+
+  for (auto& p : pending_fill_zeros_ops) {
+    grad_op_descs.insert(grad_op_descs.begin(), std::move(p));  // before grads
+  }
+  return grad_op_descs;
+}
+
+std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
+    ProgramDescBind& program_desc, int block_idx,
+    std::unordered_set<std::string>& no_grad_vars) {
+  BlockDescBind* cur_block = program_desc.Block(block_idx);
+  std::deque<std::unique_ptr<OpDescBind>>& op_descs = cur_block->ops_;
+  std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
+  size_t grad_desc_idx = 0;  // index of the next desc in `backward_descs`
+  std::vector<std::unique_ptr<OpDescBind>> backward_descs;
+  for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {  // reversed
+    std::vector<std::unique_ptr<OpDescBind>> op_grads =
+        MakeOpGrad(*it, no_grad_vars);
+
+    if ((*it)->Type() == "recurrent") {
+      PADDLE_ENFORCE_EQ(
+          op_grads.size(), size_t(1),
+          "rnn_op's gradient process should contain only one op.");
+      int step_block_idx = (*it)->GetBlockAttr("step_block");  // same key as below
+      auto backward_block_op_descs =
+          MakeBlockBackward(program_desc, step_block_idx, no_grad_vars);
+      BlockDescBind* backward_block = program_desc.AppendBlock(*cur_block);
+      for (auto& ptr : backward_block_op_descs) {
+        backward_block->ops_.push_back(std::move(ptr));
+      }
+      op_grads[0]->SetBlockAttr("step_block", *backward_block);
+    }
+
+    for (const auto& desc : op_grads) {
+      for (const std::string& out_name : desc->OutputArgumentNames()) {
+        dup_out_ops[out_name].emplace_back(grad_desc_idx);  // writers of out_name
+      }
+      ++grad_desc_idx;
+    }
+    std::transform(
+        op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs),
+        [](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); });
+  }
+  // Check whether some variables are written more than once
+  std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops;
+  for (const auto& dup : dup_out_ops) {
+    const std::string& out_name = dup.first;
+    const std::vector<size_t>& dup_op = dup.second;  // bind, do not copy
+    if (out_name != kEmptyVarName && dup_op.size() > 1) {
+      std::vector<std::string> sum_op_inputs;
+      for (size_t i = 0; i < dup_op.size(); ++i) {
+        std::string new_name = out_name + "@RENAME@" + std::to_string(i);
+        backward_descs[dup_op[i]]->Rename(out_name, new_name);
+        sum_op_inputs.emplace_back(new_name);
+      }
+      std::unique_ptr<OpDescBind> sum_op(new OpDescBind(
+          "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {}));
+      pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
+    }
+  }
+  pending_sum_ops.sort(  // descending, so inserts do not shift pending positions
+      [](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a,
+         const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) {
+        return a.first > b.first;
+      });
+  for (auto& p : pending_sum_ops) {
+    backward_descs.insert(backward_descs.begin() + p.first + 1,
+                          std::move(p.second));
+  }
+  return backward_descs;
+}
+
+void AppendBackward(ProgramDescBind& program_desc,
+                    const std::unordered_set<std::string>& no_grad_vars) {
+  std::unordered_set<std::string> no_grad_var_names;
+  no_grad_var_names.reserve(no_grad_vars.size() + 1);  // +1 for the entry below
+  no_grad_var_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
+  for (auto& name : no_grad_vars) {
+    no_grad_var_names.insert(GradVarName(name));  // store gradient names
+  }
+  const int root_block_idx = 0;
+  auto backward_op_descs =
+      MakeBlockBackward(program_desc, root_block_idx, no_grad_var_names);
+  auto& forw_op_descs = program_desc.Block(root_block_idx)->ops_;
+  for (auto& ptr : backward_op_descs) {
+    forw_op_descs.push_back(std::move(ptr));  // append after the forward ops
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/backward.h b/paddle/framework/backward.h
index 1ecf69881b3126c2904920b9f4b77bfcccc9cf86..7ffe4c28103f9d6a9f179422d1beb86106ef786e 100644
--- a/paddle/framework/backward.h
+++ b/paddle/framework/backward.h
@@ -13,8 +13,11 @@
    limitations under the License. */
 
 #pragma once
+
 #include <unordered_set>
-#include "operator.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/program_desc.h"
+
 namespace paddle {
 namespace framework {
 
@@ -23,5 +26,9 @@ namespace framework {
 extern std::unique_ptr<OperatorBase> Backward(
     const OperatorBase& forwardOp,
     const std::unordered_set<std::string>& no_grad_vars);
+
+void AppendBackward(ProgramDescBind& program_desc,  // modified in place
+                    const std::unordered_set<std::string>& no_grad_vars);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
index 6932f5b989a3e21ebc44ec4fec9f5223f2547d7a..30225a4a99d993c4f12a5e0d276bda18acbc360e 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -15,30 +15,42 @@
 #include "paddle/framework/backward.h"
 
 #include <gtest/gtest.h>
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/op_desc.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
 
 namespace paddle {
 namespace framework {
 
-using OperatorBase = framework::OperatorBase;
-using OpProtoAndCheckerMaker = framework::OpProtoAndCheckerMaker;
-using OpProto = framework::OpProto;
-using OpAttrChecker = framework::OpAttrChecker;
-using Scope = framework::Scope;
 using DeviceContext = platform::DeviceContext;
 
 class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
  public:
   RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input X of Add").NotInGradient();
-    AddInput("b", "Bias of Add").NotInGradient();
-    AddOutput("Out", "Out of Add").NotInGradient();
+    AddInput("X", "Input X of Add");
+    AddInput("b", "Bias of Add");
+    AddOutput("Out", "Out of Add");
     AddComment("Add Op");
   }
 };
 
+class RowWiseAddGradMaker : public SingleGradOpDescMaker {  // for rowwise_add
+ public:
+  using SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<OpDescBind> Apply() const override {  // one grad op desc
+    std::unique_ptr<OpDescBind> grad_op(new OpDescBind());  // no leak on throw
+    grad_op->SetInput(GradVarName("Out"), OutputGrad("Out"));  // grad of Out
+    grad_op->SetOutput(GradVarName("X"), InputGrad("X"));      // grad of X
+    grad_op->SetOutput(GradVarName("b"), InputGrad("b"));      // grad of b
+    grad_op->SetType("rowwise_add_grad");
+    return grad_op;
+  }
+};
+
 class MulOpMaker : public OpProtoAndCheckerMaker {
  public:
   MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
@@ -133,42 +145,46 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker {
   }
 };
 
-class AddOpMaker : public OpProtoAndCheckerMaker {
+class SumOpMaker : public OpProtoAndCheckerMaker {  // proto of the "sum" op
  public:
-  AddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  SumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "x").AsDuplicable();
-    AddOutput("Out", "out");
+    AddInput("X", "the input tensors of sum operator.").AsDuplicable();
+    AddOutput("Out", "the output tensor of sum operator.");
     AddComment("");
   }
 };
+
+class MultInOutOpMaker : public OpProtoAndCheckerMaker {  // 2 inputs, 2 outputs
+ public:
+  MultInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "x");
+    AddInput("H", "h");
+    AddOutput("Y", "y");
+    AddOutput("Z", "z");
+    AddComment("");
+  }
+};
+
 }  // namespace framework
 }  // namespace paddle
 
 namespace f = paddle::framework;
 namespace ops = paddle::operators;
 using EnforceNotMet = paddle::platform::EnforceNotMet;
-REGISTER_OP(rowwise_add, f::NOP, f::RowWiseAddOpMaker, rowwise_add_grad,
-            f::NOP);
+REGISTER_OPERATOR(rowwise_add, f::NOP, f::RowWiseAddOpMaker,
+                  f::RowWiseAddGradMaker);  // grad desc comes from the maker
+REGISTER_OPERATOR(rowwise_add_grad, f::NOP);  // grad op registered separately
 REGISTER_OP(mul, f::NOP, f::MulOpMaker, mul_grad, f::NOP);
 REGISTER_OP(sigmoid, f::NOP, f::SigmoidOpMaker, sigmoid_grad, f::NOP);
 REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NOP, f::NoGradOpMaker);
 REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NOP, f::FillZeroOpMaker);
-REGISTER_OP(add, f::NOP, f::AddOpMaker, add_grad, f::NOP);
+REGISTER_OP(sum, f::NOP, f::SumOpMaker, sum_grad, f::NOP);
 REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker);
 REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, many_output_op_grad,
             f::NOP);
-
-TEST(Backward, simple_op_grad) {
-  auto fwd = f::OpRegistry::CreateOp(
-      "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {});
-  ASSERT_NE(fwd, nullptr);
-  auto gop = f::OpRegistry::CreateGradOp(*fwd);
-  ASSERT_EQ(1UL, gop->Inputs().size());
-  ASSERT_EQ("rowwise_add_grad", gop->Type());
-  ASSERT_EQ(f::GradVarName("x"), gop->Output(f::GradVarName("X")));
-  ASSERT_EQ(f::GradVarName("b"), gop->Output(f::GradVarName("b")));
-}
+REGISTER_OP(mult_in_out, f::NOP, f::MultInOutOpMaker, mult_in_out_grad, f::NOP);
 
 TEST(Backward, simple_op_not_need_grad) {
   auto fwd = f::OpRegistry::CreateOp(
@@ -283,18 +299,7 @@ TEST(Backward, net_shared_weight) {
   ASSERT_TRUE(bwd->IsNetOp());
   auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
   ASSERT_EQ(3UL, bwd_net->ops_.size());
-  ASSERT_EQ("add", bwd_net->ops_[2]->Type());
-}
-
-TEST(Backward, op_register_grad_not_for_network) {
-  auto fwd =
-      f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {"b"}}},
-                              {{"mul_result", {"mul_out"}},
-                               {"add_result", {"add_out"}},
-                               {"Out", {"out1"}}},
-                              {{"temporary_index", std::vector<int>{0, 1}}});
-
-  ASSERT_THROW(f::OpRegistry::CreateGradOp(*fwd), EnforceNotMet);
+  ASSERT_EQ("sum", bwd_net->ops_[2]->Type());
 }
 
 TEST(Backward, op_all_input_are_not_need) {
@@ -399,3 +404,293 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
   EXPECT_EQ(bwd_net->ops_[2]->Inputs(all).size(), 0UL);
   EXPECT_EQ(bwd_net->ops_[2]->Outputs(all).size(), 0UL);
 }
+
+// =================================== //
+
+f::ProgramDesc *GetNewProgramDesc() {  // returns a raw, heap-allocated desc
+  auto *program_desc = new f::ProgramDesc();  // never freed by the tests below
+  auto *root_block = program_desc->add_blocks();
+  root_block->set_idx(0);
+  root_block->set_parent_idx(-1);  // presumably "no parent" - TODO confirm
+  return program_desc;
+}
+
+TEST(Backward, simple_single_op) {  // one forward op yields one grad op
+  f::ProgramDesc *program_desc = GetNewProgramDesc();
+  f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);
+  f::BlockDescBind *block = program.Block(0);
+  f::OpDescBind *op = block->AppendOp();
+  op->SetType("rowwise_add");
+  op->SetInput("X", {"x"});
+  op->SetInput("b", {"b"});
+  op->SetOutput("Out", {"out"});
+
+  AppendBackward(program, {});  // empty no-grad set
+
+  ASSERT_EQ(block->AllOps().size(), 2UL);  // forward op + its grad op
+  f::OpDescBind *grad_op = block->AllOps()[1];
+  EXPECT_EQ(grad_op->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out")}));
+  EXPECT_EQ(grad_op->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x")}));
+  EXPECT_EQ(grad_op->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b")}));
+}
+
+TEST(Backward, simple_mult_op) {  // chain: rowwise_add -> mul -> rowwise_add
+  f::ProgramDesc *program_desc = GetNewProgramDesc();
+  f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);
+  f::BlockDescBind *block = program.Block(0);
+  f::OpDescBind *op1 = block->AppendOp();
+  op1->SetType("rowwise_add");
+  op1->SetInput("X", {"x1"});
+  op1->SetInput("b", {"b1"});
+  op1->SetOutput("Out", {"out1"});
+
+  f::OpDescBind *op2 = block->AppendOp();
+  op2->SetType("mul");
+  op2->SetInput("X", {"out1"});
+  op2->SetInput("Y", {"y2"});
+  op2->SetOutput("Out", {"out2"});
+
+  f::OpDescBind *op3 = block->AppendOp();
+  op3->SetType("rowwise_add");
+  op3->SetInput("X", {"out2"});
+  op3->SetInput("b", {"b3"});
+  op3->SetOutput("Out", {"out3"});
+
+  AppendBackward(program, {});
+
+  ASSERT_EQ(block->AllOps().size(), 6UL);  // 3 forward ops + 3 grad ops
+  f::OpDescBind *grad_op1 = block->AllOps()[5];  // grad ops: reverse order
+  EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b1")}));
+
+  f::OpDescBind *grad_op2 = block->AllOps()[4];
+  EXPECT_EQ(grad_op2->Type(), "mul_grad");
+  ASSERT_EQ(grad_op2->InputNames().size(), 4UL);
+  ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op2->Input("X"), std::vector<std::string>({"out1"}));
+  EXPECT_EQ(grad_op2->Input("Y"), std::vector<std::string>({"y2"}));
+  EXPECT_EQ(grad_op2->Input("Out"), std::vector<std::string>({"out2"}));
+  EXPECT_EQ(grad_op2->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out2")}));
+  EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")),
+            std::vector<std::string>({f::GradVarName("y2")}));
+
+  f::OpDescBind *grad_op3 = block->AllOps()[3];  // first grad op: last fwd op
+  EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out3")}));
+  EXPECT_EQ(grad_op3->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out2")}));
+  EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b3")}));
+}
+
+TEST(Backward, intermedia_var_no_grad) {  // intermediate "out3" gets no grad
+  f::ProgramDesc *program_desc = GetNewProgramDesc();
+  f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);
+  f::BlockDescBind *block = program.Block(0);
+  f::OpDescBind *op1 = block->AppendOp();
+  op1->SetType("rowwise_add");
+  op1->SetInput("X", {"x1"});
+  op1->SetInput("b", {"b1"});
+  op1->SetOutput("Out", {"out1"});
+
+  f::OpDescBind *op2 = block->AppendOp();
+  op2->SetType("mul");
+  op2->SetInput("X", {"x2"});
+  op2->SetInput("Y", {"y2"});
+  op2->SetOutput("Out", {"out2"});
+
+  f::OpDescBind *op3 = block->AppendOp();
+  op3->SetType("rowwise_add");
+  op3->SetInput("X", {"out2"});
+  op3->SetInput("b", {"b3"});
+  op3->SetOutput("Out", {"out3"});
+
+  f::OpDescBind *op4 = block->AppendOp();
+  op4->SetType("mul");
+  op4->SetInput("X", {"out1"});
+  op4->SetInput("Y", {"out3"});
+  op4->SetOutput("Out", {"out4"});
+
+  AppendBackward(program, {"out3"});
+
+  ASSERT_EQ(block->AllOps().size(), 6UL);  // 4 forward ops, only 2 grad ops
+  f::OpDescBind *grad_op1 = block->AllOps()[5];
+  EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b1")}));
+
+  f::OpDescBind *grad_op4 = block->AllOps()[4];
+  EXPECT_EQ(grad_op4->Type(), "mul_grad");
+  ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
+  ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op4->Input("X"), std::vector<std::string>({"out1"}));
+  EXPECT_EQ(grad_op4->Input("Y"), std::vector<std::string>({"out3"}));
+  EXPECT_EQ(grad_op4->Input("Out"), std::vector<std::string>({"out4"}));
+  EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out4")}));
+  EXPECT_EQ(grad_op4->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")),
+            std::vector<std::string>({f::kEmptyVarName}));  // out3: no grad
+}
+
+TEST(Backward, var_no_grad) {  // "z1" is excluded from gradient computation
+  f::ProgramDesc *program_desc = GetNewProgramDesc();
+  f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);
+  f::BlockDescBind *block = program.Block(0);
+  f::OpDescBind *op1 = block->AppendOp();
+  op1->SetType("mult_in_out");
+  op1->SetInput("X", {"x1"});
+  op1->SetInput("H", {"h1"});
+  op1->SetOutput("Y", {"y1"});
+  op1->SetOutput("Z", {"z1"});
+
+  f::OpDescBind *op2 = block->AppendOp();
+  op2->SetType("mult_in_out");
+  op2->SetInput("X", {"y1"});
+  op2->SetInput("H", {"z1"});
+  op2->SetOutput("Y", {"y2"});
+  op2->SetOutput("Z", {"z2"});
+
+  AppendBackward(program, {"z1"});
+
+  ASSERT_EQ(block->AllOps().size(), 5UL);  // 2 forward, 2 grad, 1 fill-zeros
+  f::OpDescBind *grad_op2 = block->AllOps()[2];
+  ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad");
+  ASSERT_EQ(grad_op2->InputNames().size(), 6UL);
+  ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op2->Input("X"), std::vector<std::string>({"y1"}));
+  EXPECT_EQ(grad_op2->Input("H"), std::vector<std::string>({"z1"}));
+  EXPECT_EQ(grad_op2->Input("Y"), std::vector<std::string>({"y2"}));
+  EXPECT_EQ(grad_op2->Input("Z"), std::vector<std::string>({"z2"}));
+  EXPECT_EQ(grad_op2->Input(f::GradVarName("Y")),
+            std::vector<std::string>({f::GradVarName("y2")}));
+  EXPECT_EQ(grad_op2->Input(f::GradVarName("Z")),
+            std::vector<std::string>({f::GradVarName("z2")}));
+  EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("y1")}));
+  EXPECT_EQ(grad_op2->Output(f::GradVarName("H")),
+            std::vector<std::string>({f::kEmptyVarName}));  // z1: no grad
+
+  f::OpDescBind *fill_zero_op = block->AllOps()[3];  // feeds grad_op1 below
+  ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like");
+  ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL);
+  ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL);
+  EXPECT_EQ(fill_zero_op->Input("X"), std::vector<std::string>({"z1"}));
+  EXPECT_EQ(fill_zero_op->Output("Y"),
+            std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
+
+  f::OpDescBind *grad_op1 = block->AllOps()[4];
+  ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad");
+  ASSERT_EQ(grad_op1->InputNames().size(), 6UL);
+  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op1->Input("X"), std::vector<std::string>({"x1"}));
+  EXPECT_EQ(grad_op1->Input("H"), std::vector<std::string>({"h1"}));
+  EXPECT_EQ(grad_op1->Input("Y"), std::vector<std::string>({"y1"}));
+  EXPECT_EQ(grad_op1->Input("Z"), std::vector<std::string>({"z1"}));
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Y")),
+            std::vector<std::string>({f::GradVarName("y1")}));
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Z")),  // zero-filled stand-in
+            std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("H")),
+            std::vector<std::string>({f::GradVarName("h1")}));
+}
+
+TEST(Backward, shared_var) {  // "out1" is consumed by both op2 and op3
+  f::ProgramDesc *program_desc = GetNewProgramDesc();
+  f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);
+  f::BlockDescBind *block = program.Block(0);
+  f::OpDescBind *op1 = block->AppendOp();
+  op1->SetType("rowwise_add");
+  op1->SetInput("X", {"x1"});
+  op1->SetInput("b", {"b1"});
+  op1->SetOutput("Out", {"out1"});
+
+  f::OpDescBind *op2 = block->AppendOp();
+  op2->SetType("mul");
+  op2->SetInput("X", {"out1"});
+  op2->SetInput("Y", {"y2"});
+  op2->SetOutput("Out", {"out2"});
+
+  f::OpDescBind *op3 = block->AppendOp();
+  op3->SetType("rowwise_add");
+  op3->SetInput("X", {"out1"});
+  op3->SetInput("b", {"b3"});
+  op3->SetOutput("Out", {"out3"});
+
+  AppendBackward(program, {});
+
+  ASSERT_EQ(block->AllOps().size(), 7UL);  // 3 forward, 3 grad, 1 sum
+  f::OpDescBind *grad_op3 = block->AllOps()[3];
+  ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out3")}));
+  EXPECT_EQ(grad_op3->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@0"}));
+  EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b3")}));
+
+  f::OpDescBind *grad_op2 = block->AllOps()[4];  // gradient of op2 (mul)
+  ASSERT_EQ(grad_op2->Type(), "mul_grad");
+  ASSERT_EQ(grad_op2->InputNames().size(), 4UL);
+  ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op2->Input("X"), std::vector<std::string>({"out1"}));
+  EXPECT_EQ(grad_op2->Input("Y"), std::vector<std::string>({"y2"}));
+  EXPECT_EQ(grad_op2->Input("Out"), std::vector<std::string>({"out2"}));
+  EXPECT_EQ(grad_op2->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out2")}));
+  EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@1"}));
+  EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")),
+            std::vector<std::string>({f::GradVarName("y2")}));
+
+  f::OpDescBind *sum_op = block->AllOps()[5];  // merges both renamed grads
+  ASSERT_EQ(sum_op->Type(), "sum");
+  ASSERT_EQ(sum_op->InputNames().size(), 1UL);
+  ASSERT_EQ(sum_op->OutputNames().size(), 1UL);
+  EXPECT_EQ(sum_op->Input("X"),
+            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@0",
+                                      f::GradVarName("out1") + "@RENAME@1"}));
+  EXPECT_EQ(sum_op->Output("Out"),
+            std::vector<std::string>({f::GradVarName("out1")}));
+
+  f::OpDescBind *grad_op1 = block->AllOps()[6];  // runs after the sum op
+  ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b1")}));
+}
diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc
index 9570aedfdda332b797a8f348e0f6cf81bb2aee2f..01f50e1393606044fb20d5f782fadede46b744e3 100644
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -34,6 +34,10 @@ VarDescBind *BlockDescBind::Var(const std::string &name) const {
   return it->second.get();
 }
 
+bool BlockDescBind::HasVar(const std::string &name) const {
+  return vars_.count(name) != 0;  // true iff vars_ has an entry keyed by name
+}
+
 std::vector<VarDescBind *> BlockDescBind::AllVars() const {
   std::vector<VarDescBind *> res;
   for (const auto &p : vars_) {
diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h
index 1a1135bab44cd27bb7d784c3b486188aa40635e4..2de270f60ec2ae981335f1adb204cfc3bf78c622 100644
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/framework/op_desc.h"
 #include "paddle/framework/var_desc.h"
+#include "paddle/platform/macros.h"
 
 namespace paddle {
 namespace framework {
@@ -31,12 +32,17 @@ class ProgramDescBind;
 
 class BlockDescBind {
  public:
+  friend std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
+      ProgramDescBind &program_desc, int block_idx,
+      std::unordered_set<std::string> &no_grad_vars);
+
+  friend void AppendBackward(
+      ProgramDescBind &program_desc,
+      const std::unordered_set<std::string> &no_grad_vars);
+
   BlockDescBind(ProgramDescBind *prog, BlockDesc *desc)
       : prog_(prog), desc_(desc), need_update_(false) {}
 
-  BlockDescBind(const BlockDescBind &o) = delete;
-  BlockDescBind &operator=(const BlockDescBind &o) = delete;
-
   int32_t ID() const { return desc_->idx(); }
 
   int32_t Parent() const { return desc_->parent_idx(); }
@@ -45,6 +51,8 @@ class BlockDescBind {
 
   VarDescBind *Var(const std::string &name_bytes) const;
 
+  bool HasVar(const std::string &var_name) const;
+
   std::vector<VarDescBind *> AllVars() const;
 
   BlockDescBind *ParentBlock() const;
@@ -66,6 +74,8 @@ class BlockDescBind {
 
   std::deque<std::unique_ptr<OpDescBind>> ops_;
   std::unordered_map<std::string, std::unique_ptr<VarDescBind>> vars_;
+
+  DISABLE_COPY_AND_ASSIGN(BlockDescBind);
 };
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/details/op_registry.h b/paddle/framework/details/op_registry.h
new file mode 100644
index 0000000000000000000000000000000000000000..daa474e8c5a223589018720da29a5c3363b5934d
--- /dev/null
+++ b/paddle/framework/details/op_registry.h
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/grad_op_desc_maker.h"
+#include "paddle/framework/op_info.h"
+#include "paddle/framework/op_proto_maker.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+enum OpInfoFillType {  // selects the OpInfoFiller specialization for a type
+  kOperator = 0,                // type derives from OperatorBase
+  kOpProtoAndCheckerMaker = 1,  // type derives from OpProtoAndCheckerMaker
+  kGradOpDescMaker = 2          // type derives from GradOpDescMakerBase
+};
+
+template <typename T>
+struct OpInfoFillTypeID {  // maps T to an OpInfoFillType at compile time
+  static constexpr OpInfoFillType ID() {  // base classes are tested in order
+    return std::is_base_of<OperatorBase, T>::value
+               ? kOperator
+               : (std::is_base_of<OpProtoAndCheckerMaker, T>::value
+                      ? kOpProtoAndCheckerMaker
+                      : (std::is_base_of<GradOpDescMakerBase, T>::value
+                             ? kGradOpDescMaker
+                             : static_cast<OpInfoFillType>(-1)));  // no match
+  }
+};
+
+template <typename T, OpInfoFillType = OpInfoFillTypeID<T>::ID()>
+struct OpInfoFiller;
+
+template <size_t I, bool at_end, typename... ARGS>
+class OperatorRegistrarRecursive;
+
+template <size_t I, typename... ARGS>
+class OperatorRegistrarRecursive<I, false, ARGS...> {  // step I, not at end
+ public:
+  using T = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
+  OperatorRegistrarRecursive(const char* op_type, OpInfo* info) {
+    OpInfoFiller<T> filler;  // specialization is picked from T's base class
+    filler(op_type, info);
+    using Next =  // becomes the terminating case once I + 1 == sizeof...(ARGS)
+        OperatorRegistrarRecursive<I + 1, I + 1 == sizeof...(ARGS), ARGS...>;
+    Next next(op_type, info);
+    (void)next;
+  }
+};
+
+template <size_t I, typename... ARGS>
+class OperatorRegistrarRecursive<I, true, ARGS...> {  // end of the recursion
+ public:
+  OperatorRegistrarRecursive(const char*, OpInfo*) {}  // nothing left to fill
+};
+
+template <typename T>
+struct OpInfoFiller<T, kOperator> {  // T is an operator class
+  void operator()(const char* op_type, OpInfo* info) const {  // op_type unused
+    info->creator_ = [](const std::string& type, const VariableNameMap& inputs,
+                        const VariableNameMap& outputs,
+                        const AttributeMap& attrs) {
+      return new T(type, inputs, outputs, attrs);  // raw pointer to a new T
+    };
+  }
+};
+
+template <typename T>
+struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {  // T is a proto/checker maker
+  void operator()(const char* op_type, OpInfo* info) const {
+    info->proto_ = new OpProto;  // stored in info as raw pointers
+    info->checker_ = new OpAttrChecker();
+    T maker(info->proto_, info->checker_);  // direct-init: no copy/move needed
+    maker.Validate();
+    info->proto_->set_type(op_type);
+    PADDLE_ENFORCE(
+        info->proto_->IsInitialized(),
+        "Fail to initialize %s's OpProto, because %s is not initialized",
+        op_type, info->proto_->InitializationErrorString());
+  }
+};
+
+template <typename T>
+struct OpInfoFiller<T, kGradOpDescMaker> {  // T is a grad-op-desc maker
+  void operator()(const char* op_type, OpInfo* info) const {  // op_type unused
+    info->grad_op_maker_ = [](const OpDescBind& fwd_op) {
+      T maker(fwd_op);  // a fresh maker is built on every invocation
+      return maker();   // result type is whatever T::operator() returns
+    };
+  }
+};
+}  // namespace details
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index 951c7afbc14e2d9119169c1351d38ff0b67bdc5b..ac2827e54773f811eb855c092e3c0ed2fab06dd3 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -66,7 +66,6 @@ message OpProto {
 
     optional bool duplicable = 3 [ default = false ];
     optional bool intermediate = 4 [ default = false ];
-    optional bool not_in_gradient = 5 [ default = false ];
   }
 
   // AttrProto describes the C++ type Attribute.
@@ -106,6 +105,7 @@ message LoDTensorDesc {
 message VarDesc {
   required string name = 1;
   optional LoDTensorDesc lod_tensor = 2;
+  optional bool persistable = 3 [ default = false ];
 }
 
 message BlockDesc {
@@ -115,4 +115,7 @@ message BlockDesc {
   repeated OpDesc ops = 4;
 }
 
+// Please refer to
+// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md
+// for more details.
 message ProgramDesc { repeated BlockDesc blocks = 1; }
diff --git a/paddle/framework/grad_op_builder.cc b/paddle/framework/grad_op_builder.cc
deleted file mode 100644
index 3661ce41beba1328d1b1cdd9f0f913e693af9cff..0000000000000000000000000000000000000000
--- a/paddle/framework/grad_op_builder.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOpArgType::OUT WARRANTIES OR CONDITIONS OF ANY KOpArgType::IND, either
-express or implied. See the License for the specific language governing
-permissions and limitations under the License. */
-
-#include "paddle/framework/grad_op_builder.h"
-#include "paddle/framework/op_registry.h"
-
-namespace paddle {
-namespace framework {
-enum class OpArgType { IN, OUT };
-
-static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type,
-                       bool is_grad, VariableNameMap* vars) {
-  const auto& src_inout =
-      src_type == OpArgType::IN ? src_op->Inputs() : src_op->Outputs();
-  auto& dst_inout = *vars;
-  auto& proto = OpInfoMap::Instance().Get(src_op->Type()).Proto();
-  const auto& src_arg_list =
-      src_type == OpArgType::IN ? proto.inputs() : proto.outputs();
-  for (const auto& arg : src_arg_list) {
-    if (arg.not_in_gradient() && !is_grad) continue;
-    const std::string src_name = arg.name();
-    std::string dst_name = is_grad ? GradVarName(src_name) : src_name;
-    dst_inout[dst_name].reserve(src_inout.at(src_name).size());
-    for (auto& var_name : src_inout.at(src_name)) {
-      std::string s = is_grad ? GradVarName(var_name) : var_name;
-      dst_inout[dst_name].emplace_back(s);
-    }
-  }
-}
-
-OperatorBase* BuildGradOp(const OperatorBase* op) {
-  auto& info = OpInfoMap::Instance().Get(op->Type());
-  PADDLE_ENFORCE(info.HasGradientOp());
-
-  VariableNameMap inputs;
-  VariableNameMap outputs;
-  TransOpArg(op, OpArgType::IN, false, &inputs);   // I
-  TransOpArg(op, OpArgType::OUT, false, &inputs);  // O
-  TransOpArg(op, OpArgType::OUT, true, &inputs);   // OG
-  TransOpArg(op, OpArgType::IN, true, &outputs);   // IG
-
-  auto& grad_info = OpInfoMap::Instance().Get(info.grad_op_type_);
-  return grad_info.Creator()(info.grad_op_type_, inputs, outputs, op->Attrs());
-}
-
-static void TransOpDescArg(const OpDescBind* src_op, const OpArgType& src_type,
-                           bool is_grad, OpDescBind* dst_op,
-                           const OpArgType& dst_type) {
-  PADDLE_ENFORCE(dst_op != nullptr,
-                 "Protobuf desc of gradient op must be initialized first.");
-  const auto& proto = OpInfoMap::Instance().Get(src_op->Type()).Proto();
-  const auto& src_arg_list =
-      src_type == OpArgType::IN ? proto.inputs() : proto.outputs();
-  for (const auto& arg : src_arg_list) {
-    if (arg.not_in_gradient() && !is_grad) continue;
-    const std::string src_name = arg.name();
-    std::vector<std::string> vars = src_type == OpArgType::IN
-                                        ? src_op->Input(src_name)
-                                        : src_op->Output(src_name);
-    if (is_grad) {
-      for (std::string& var : vars) {
-        var = GradVarName(var);
-      }
-    }
-    std::string dst_name = is_grad ? GradVarName(src_name) : src_name;
-    dst_type == OpArgType::IN ? dst_op->SetInput(dst_name, vars)
-                              : dst_op->SetOutput(dst_name, vars);
-  }
-}
-
-void CompleteGradOpDesc(const OpDescBind* forw_op, OpDescBind* grad_op) {
-  auto& info = OpInfoMap::Instance().Get(forw_op->Type());
-  PADDLE_ENFORCE(info.HasGradientOp());
-
-  grad_op->SetType(info.grad_op_type_);
-
-  TransOpDescArg(forw_op, OpArgType::IN, false, grad_op, OpArgType::IN);
-  TransOpDescArg(forw_op, OpArgType::OUT, false, grad_op, OpArgType::IN);
-  TransOpDescArg(forw_op, OpArgType::OUT, true, grad_op, OpArgType::IN);
-  TransOpDescArg(forw_op, OpArgType::IN, true, grad_op, OpArgType::OUT);
-
-  grad_op->SetAttrMap(forw_op->GetAttrMap());
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/framework/grad_op_builder_test.cc b/paddle/framework/grad_op_builder_test.cc
deleted file mode 100644
index d09892f81bea34415d454b017258fd2a0d4575db..0000000000000000000000000000000000000000
--- a/paddle/framework/grad_op_builder_test.cc
+++ /dev/null
@@ -1,201 +0,0 @@
-#include "paddle/framework/grad_op_builder.h"
-#include <gtest/gtest.h>
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
-
-USE_OP(add);
-
-namespace paddle {
-namespace framework {
-
-class MutiInOutOpMaker : public OpProtoAndCheckerMaker {
- public:
-  MutiInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("In1", "a single input");
-    AddInput("In2_mult", "a multiple input").AsDuplicable();
-    AddInput("In3", "another single input");
-    AddOutput("Out1", "a single output");
-    AddOutput("Out2_mult", "a multiple output").AsDuplicable();
-    AddComment("test op with multiple inputs and outputs");
-  }
-};
-
-class IOIgnoredOpMaker : public OpProtoAndCheckerMaker {
- public:
-  IOIgnoredOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("In1", "a single input");
-    AddInput("In2_mult", "a multiple input").AsDuplicable().NotInGradient();
-    AddInput("In3_mult", "another multiple input").AsDuplicable();
-    AddOutput("Out1_mult", "a multiple output").AsDuplicable();
-    AddOutput("Out2", "a single output").NotInGradient();
-    AddComment("op with inputs and outputs ignored in gradient calculating");
-  }
-};
-
-}  // namespace framework
-}  // namespace paddle
-
-namespace f = paddle::framework;
-
-TEST(GradOpBuilder, AddTwo) {
-  std::shared_ptr<f::OperatorBase> add_op(f::OpRegistry::CreateOp(
-      "add", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {}));
-  std::shared_ptr<f::OperatorBase> grad_add_op =
-      f::OpRegistry::CreateGradOp(*add_op);
-  EXPECT_EQ(grad_add_op->Inputs().size(), 4UL);
-  EXPECT_EQ(grad_add_op->Outputs().size(), 2UL);
-  EXPECT_EQ(grad_add_op->Input("X"), "x");
-  EXPECT_EQ(grad_add_op->Input("Y"), "y");
-  EXPECT_EQ(grad_add_op->Input("Out"), "out");
-  EXPECT_EQ(grad_add_op->Input(f::GradVarName("Out")), f::GradVarName("out"));
-  EXPECT_EQ(grad_add_op->Output(f::GradVarName("X")), f::GradVarName("x"));
-  EXPECT_EQ(grad_add_op->Output(f::GradVarName("Y")), f::GradVarName("y"));
-}
-
-REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker, mult_io_grad, f::NOP);
-REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker, io_ignored_grad, f::NOP);
-
-TEST(GradOpBuilder, MutiInOut) {
-  std::shared_ptr<f::OperatorBase> test_op(f::OpRegistry::CreateOp(
-      "mult_io", {{"In1", {"in1"}},
-                  {"In2_mult", {"in2_1", "in2_2", "in2_3"}},
-                  {"In3", {"in3"}}},
-      {{"Out1", {"out1"}}, {"Out2_mult", {"out2_1", "out2_2"}}}, {}));
-  std::shared_ptr<f::OperatorBase> grad_test_op =
-      f::OpRegistry::CreateGradOp(*test_op);
-
-  ASSERT_EQ(grad_test_op->Inputs().size(), 3UL + 2UL + 2UL);
-  EXPECT_EQ(grad_test_op->Input("In1"), "in1");
-  EXPECT_EQ(grad_test_op->Inputs("In2_mult"),
-            std::vector<std::string>({"in2_1", "in2_2", "in2_3"}));
-  EXPECT_EQ(grad_test_op->Input("In3"), "in3");
-  EXPECT_EQ(grad_test_op->Input("Out1"), "out1");
-  EXPECT_EQ(grad_test_op->Inputs("Out2_mult"),
-            std::vector<std::string>({"out2_1", "out2_2"}));
-  EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out1")),
-            f::GradVarName("out1"));
-  EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out2_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("out2_1"), f::GradVarName("out2_2")}));
-
-  ASSERT_EQ(grad_test_op->Outputs().size(), 3UL);
-  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1"));
-  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")),
-            std::vector<std::string>({f::GradVarName("in2_1"),
-                                      f::GradVarName("in2_2"),
-                                      f::GradVarName("in2_3")}));
-  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In3")), f::GradVarName("in3"));
-}
-
-TEST(GradOpBuilder, IOIgnoredInGradient) {
-  std::shared_ptr<f::OperatorBase> test_op(f::OpRegistry::CreateOp(
-      "io_ignored", {{"In1", {"in1"}},
-                     {"In2_mult", {"in2_1", "in2_2"}},
-                     {"In3_mult", {"in3_1", "in3_2"}}},
-      {{"Out1_mult", {"out1_1", "out1_2"}}, {"Out2", {"out2"}}}, {}));
-  std::shared_ptr<f::OperatorBase> grad_test_op =
-      f::OpRegistry::CreateGradOp(*test_op);
-
-  // 'In2' and 'Out2' are ignored in gradient calculating
-  ASSERT_EQ(grad_test_op->Inputs().size(), 2UL + 1UL + 2UL);
-  EXPECT_EQ(grad_test_op->Input("In1"), "in1");
-  EXPECT_EQ(grad_test_op->Inputs("In3_mult"),
-            std::vector<std::string>({"in3_1", "in3_2"}));
-  EXPECT_EQ(grad_test_op->Inputs("Out1_mult"),
-            std::vector<std::string>({"out1_1", "out1_2"}));
-  EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out1_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("out1_1"), f::GradVarName("out1_2")}));
-  EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out2")),
-            f::GradVarName("out2"));
-
-  ASSERT_EQ(grad_test_op->Outputs().size(), 3UL);
-  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1"));
-  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("in2_1"), f::GradVarName("in2_2")}));
-  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In3_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("in3_1"), f::GradVarName("in3_2")}));
-}
-
-TEST(GradOpDescBuilder, MutiInOut) {
-  f::OpDescBind *forw_op = new f::OpDescBind();
-  forw_op->SetType("mult_io");
-  forw_op->SetInput("In1", {"in1"});
-  forw_op->SetInput("In2_mult", {"in2_1", "in2_2", "in2_3"});
-  forw_op->SetInput("In3", {"in3"});
-  forw_op->SetOutput("Out1", {"out1"});
-  forw_op->SetOutput("Out2_mult", {"out2_1", "out2_2"});
-
-  f::OpDescBind *grad_op = new f::OpDescBind();
-  f::CompleteGradOpDesc(forw_op, grad_op);
-
-  EXPECT_EQ(grad_op->Type(), "mult_io_grad");
-  ASSERT_EQ(grad_op->InputNames().size(), 3UL + 2UL + 2UL);
-  EXPECT_EQ(grad_op->Input("In1"), std::vector<std::string>({"in1"}));
-  EXPECT_EQ(grad_op->Input("In2_mult"),
-            std::vector<std::string>({"in2_1", "in2_2", "in2_3"}));
-  EXPECT_EQ(grad_op->Input("In3"), std::vector<std::string>({"in3"}));
-  EXPECT_EQ(grad_op->Input("Out1"), std::vector<std::string>({"out1"}));
-  EXPECT_EQ(grad_op->Input("Out2_mult"),
-            std::vector<std::string>({"out2_1", "out2_2"}));
-  EXPECT_EQ(grad_op->Input(f::GradVarName("Out1")),
-            std::vector<std::string>({f::GradVarName("out1")}));
-  EXPECT_EQ(grad_op->Input(f::GradVarName("Out2_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("out2_1"), f::GradVarName("out2_2")}));
-
-  ASSERT_EQ(grad_op->OutputNames().size(), 3UL);
-  EXPECT_EQ(grad_op->Output(f::GradVarName("In1")),
-            std::vector<std::string>({f::GradVarName("in1")}));
-  EXPECT_EQ(grad_op->Output(f::GradVarName("In2_mult")),
-            std::vector<std::string>({f::GradVarName("in2_1"),
-                                      f::GradVarName("in2_2"),
-                                      f::GradVarName("in2_3")}));
-  EXPECT_EQ(grad_op->Output(f::GradVarName("In3")),
-            std::vector<std::string>({f::GradVarName("in3")}));
-  delete forw_op;
-  delete grad_op;
-}
-
-TEST(GradOpDescBuilder, IOIgnoredInGradient) {
-  f::OpDescBind *forw_op = new f::OpDescBind();
-  forw_op->SetType("io_ignored");
-  forw_op->SetInput("In1", {"in1"});
-  forw_op->SetInput("In2_mult", {"in2_1", "in2_2"});
-  forw_op->SetInput("In3_mult", {"in3_1", "in3_2"});
-  forw_op->SetOutput("Out1_mult", {"out1_1", "out1_2"});
-  forw_op->SetOutput("Out2", {"out2"});
-
-  f::OpDescBind *grad_op = new f::OpDescBind();
-  f::CompleteGradOpDesc(forw_op, grad_op);
-
-  EXPECT_EQ(grad_op->Type(), "io_ignored_grad");
-  // 'In2' and 'Out2' are ignored in gradient calculating
-  ASSERT_EQ(grad_op->InputNames().size(), 2UL + 1UL + 2UL);
-  EXPECT_EQ(grad_op->Input("In1"), std::vector<std::string>({"in1"}));
-  EXPECT_EQ(grad_op->Input("In3_mult"),
-            std::vector<std::string>({"in3_1", "in3_2"}));
-  EXPECT_EQ(grad_op->Input("Out1_mult"),
-            std::vector<std::string>({"out1_1", "out1_2"}));
-  EXPECT_EQ(grad_op->Input(f::GradVarName("Out1_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("out1_1"), f::GradVarName("out1_2")}));
-  EXPECT_EQ(grad_op->Input(f::GradVarName("Out2")),
-            std::vector<std::string>({f::GradVarName("out2")}));
-
-  ASSERT_EQ(grad_op->OutputNames().size(), 3UL);
-  EXPECT_EQ(grad_op->Output(f::GradVarName("In1")),
-            std::vector<std::string>({f::GradVarName("in1")}));
-  EXPECT_EQ(grad_op->Output(f::GradVarName("In2_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("in2_1"), f::GradVarName("in2_2")}));
-  EXPECT_EQ(grad_op->Output(f::GradVarName("In3_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("in3_1"), f::GradVarName("in3_2")}));
-  delete forw_op;
-  delete grad_op;
-}
\ No newline at end of file
diff --git a/paddle/framework/grad_op_desc_maker.h b/paddle/framework/grad_op_desc_maker.h
new file mode 100644
index 0000000000000000000000000000000000000000..e9ae6e22060850fe229998d3b651d08a5ca2033a
--- /dev/null
+++ b/paddle/framework/grad_op_desc_maker.h
@@ -0,0 +1,124 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace framework {
+
+class GradOpDescMakerBase {
+ public:
+  explicit GradOpDescMakerBase(const OpDescBind& fwd_op) : fwd_op_(fwd_op) {}
+
+  virtual ~GradOpDescMakerBase() = default;
+  virtual std::vector<std::unique_ptr<OpDescBind>> operator()() const = 0;
+
+ protected:
+  static std::vector<std::string> ToGradNames(
+      const std::vector<std::string>& var_names) {
+    std::vector<std::string> ret_val;
+    ret_val.reserve(var_names.size());
+    std::transform(var_names.begin(), var_names.end(),
+                   std::back_inserter(ret_val), GradVarName);
+    return ret_val;
+  }
+
+  std::vector<std::string> InputGrad(const std::string& name) const {
+    return ToGradNames(fwd_op_.Input(name));
+  }
+
+  std::vector<std::string> OutputGrad(const std::string& name) const {
+    return ToGradNames(fwd_op_.Output(name));
+  }
+
+  std::vector<std::string> InputNames() const {
+    return this->fwd_op_.InputNames();
+  }
+
+  std::vector<std::string> OutputNames() const {
+    return this->fwd_op_.OutputNames();
+  }
+
+  std::vector<std::string> Input(const std::string& name) const {
+    return fwd_op_.Input(name);
+  }
+
+  std::vector<std::string> Output(const std::string& name) const {
+    return fwd_op_.Output(name);
+  }
+
+  const std::unordered_map<std::string, Attribute>& Attrs() const {
+    return fwd_op_.GetAttrMap();
+  }
+
+  const Attribute& GetAttr(const std::string& name) const {
+    auto& map = fwd_op_.GetAttrMap();
+    auto it = map.find(name);
+    PADDLE_ENFORCE(it != map.end(), "Cannot find attribute %s", name);
+    return it->second;
+  }
+
+  std::string ForwardOpType() const { return this->fwd_op_.Type(); }
+
+ private:
+  const OpDescBind& fwd_op_;
+};
+
+class SingleGradOpDescMaker : public GradOpDescMakerBase {
+ public:
+  using GradOpDescMakerBase::GradOpDescMakerBase;
+
+  std::vector<std::unique_ptr<OpDescBind>> operator()() const {
+    std::vector<std::unique_ptr<OpDescBind>> retv;
+    retv.emplace_back(this->Apply());
+    return retv;
+  }
+
+ protected:
+  virtual std::unique_ptr<OpDescBind> Apply() const = 0;
+};
+
+class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
+ public:
+  using SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  virtual std::unique_ptr<OpDescBind> Apply() const {
+    auto* grad = new OpDescBind();
+    grad->SetType(this->GradOpType());
+
+    for (auto& input_param : this->InputNames()) {
+      grad->SetInput(input_param, this->Input(input_param));
+      grad->SetOutput(GradVarName(input_param), this->InputGrad(input_param));
+    }
+
+    for (auto& output_param : this->OutputNames()) {
+      grad->SetInput(output_param, this->Output(output_param));
+      grad->SetInput(GradVarName(output_param), this->OutputGrad(output_param));
+    }
+
+    grad->SetAttrMap(this->Attrs());
+
+    return std::unique_ptr<OpDescBind>(grad);
+  }
+
+  virtual std::string GradOpType() const {
+    return this->ForwardOpType() + "_grad";
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 49786a4a6635f1b39356dbf9633c4e7da443f04e..4db36ee76609ac6360fe2fc7b4a366e0284d1016 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -15,7 +15,7 @@
 #pragma once
 
 #include <memory>
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #include <thrust/system/cuda/experimental/pinned_allocator.h>
@@ -29,7 +29,7 @@
 namespace paddle {
 namespace framework {
 
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
 template <typename T>
 using Vector = std::vector<T>;
 #else
diff --git a/paddle/framework/lod_tensor.md b/paddle/framework/lod_tensor.md
index 07bbdf9416c432052b3222757a61ac4bfd70fe14..d147f1c4257eec14664301edab8d1fe2f128d2b0 100644
--- a/paddle/framework/lod_tensor.md
+++ b/paddle/framework/lod_tensor.md
@@ -1,147 +1,175 @@
 # Design Doc: LoD (Level-of-Detail) Tensor
 
-PaddlePaddle's RNN doesn't require that all instances have the same length.  To do so, we introduce an extension to Tensor, namely, LoD Tensor.
+Like other deep learning systems, PaddlePaddle supports training models from sequence data.  Also, like other systems, PaddlePaddle represents a mini-batch of sequences as a Tensor.  What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length, so there is no need to pad sequences with zeros.
 
-## Challenge of Variable-length Inputs
+|                       | TensorFlow | PaddlePaddle |
+|-----------------------|------------|--------------|
+| RNN                   | Support    | Support      |
+| recursive RNN         | Support    | Support      |
+| padding zeros         | Must       | No need      |
+| blob data type        | Tensor     | LoDTensor    |
 
-People usually represent a mini-batch by a Tensor. For example, a mini-batch of 10 images, each of size 32x32, is a 10x32x32 Tensor.  So a transformation, T, of all images can be a matrix multiplication of the 10xOx32-dimensional tensor T and the 10x32x32 Tensor.
+PaddlePaddle achieves this flexibility by passing a new data type between operators: the *LoD Tensor*, which is a Tensor with an attached segmentation index known as *LoD*.  The LoD index not only segments a tensor into sequences, but also recursively segments sequences into sub-sequences.  This document presents the design of LoD and LoDTensor.
 
-Another example is that each mini-batch contains 32 sentences, where each word is a D-dimensional one-hot vector.  If all sentences have the same length L, we can represent this mini-batch by a 32xLxD tensor.  However, in most cases, sentences have variable lengths, and we will need an index data structure to record these variable lengths.
 
-## LoD as a Solution
+## The Challenge: Variable-length Sequences
 
-### Mini-Batch of variable-length sentences
+Most deep learning systems represent a mini-batch as a Tensor.  For example, a mini-batch of 10 images, each of size 32x32, is a 10x32x32 Tensor.  Another example is that each mini-batch contains N sentences, where each word is a D-dimensional one-hot vector.  Suppose that all sentences have the same length L, we can represent this mini-batch by a NxLxD tensor.
 
-Let's imagine a mini-batch of 3 variable lengths sentences, containing 3, 1, and 2 words respectively.  We can represent it by a (3+1+2)xD tensor plus some index information:
+Both examples show that the elements of sequences are usually of the same size.  In the first example, all images are 32x32, and in the second one, all words are D-dimensional vectors.  It doesn't make sense to allow variable-sized images, as that would require transformations like convolution to handle variable-sized Tensors.
+
+The real challenge is that in most cases, sentences have variable lengths, and we will need an index data structure to segment the tensor into sequences.  Also, sequences might consist of sub-sequences.
+
+
+## A Solution: The LoD Index
+
+To understand our solution, it is best to look at some examples.
+
+### A Mini-Batch of Sentences
+
+Let's imagine a mini-batch of 3 variable-length sentences composed of 3, 1, and 2 words, respectively.  We can represent the mini-batch by a (3+1+2)xD tensor plus some index information:
 
 ```
-   3
 3   1 2
 ||| | ||
 ```
 
-Each `|` represents a D-dimensional word vectors.  The number 3 on top indicate 3 sentences, and numbers 3, 1, and 2 on the second level represent the number of words in each sentence.
+where each `|` represents a D-dimensional word vector.  The numbers, 3, 1, and 2, form a 1-level LoD.
+
+### Recursive Sequences
+
+Let's check another example of a 2-level LoD Tensor.  Consider a mini-batch of three articles with 3, 1, and 2 sentences, respectively, where each sentence consists of a variable number of words:
+
+```
+3           1  2
+3   2  4    1  2  3
+||| || |||| |  || |||
+```
 
-### Mini-Batch of variable-length videos
+### A Mini-Batch of Videos
 
-This approach generalizes to the case where elements are not words, but higher dimensional objects, like images.  Suppose that a mini-batch contains videos of the same frame size 640x480.  If a mini-batch contains 3 videos of 3, 1, and 2 frames respectively.  The underlying tensor is of size (3+1+2)x640x480.  The index information illustrates as:
+LoD tensors generalize to the case where elements are higher dimensional objects, like images.  Suppose that a mini-batch contains videos of the same frame size 640x480.  Here is a mini-batch of 3 videos with 3, 1, and 2 frames, respectively.
 
 ```
-     3
 3     1  2
 口口口 口 口口
 ```
 
-where each `口` represents an image.
+The underlying tensor is of size (3+1+2)x640x480, and each `口` represents a 640x480 image.
 
-### Mini-Batch of fixed-size images
+### A Mini-Batch of Images
 
-Let's get back to a typical example, image classification, where each mini-batch has M fixed-sized images.  The LoD Tensor representation is
+In traditional cases like a mini-batch with N fixed-sized images, the LoD Tensor representation is as follows:
 
 ```
-     M
 1 1 1 1     1
 口口口口 ... 口
 ```
 
-The many 1's on the second level seem duplicated.  For this particular case of 2 levels and the second level always have length 1, we can ignore the LoD index.
-
-### Design and summarization
+In this case, we don't lose any information by ignoring the many 1's in the index and simply considering this LoD Tensor as a usual Tensor:
 
-In summary, as long as that the essential elements (words  or images) have the same size, we can represent mini-batches by a LoD Tensor:
+```
+口口口口 ... 口
+```
 
-- The underlying tensor has size LxD1xD2x..., where D1xD2... is the size of the essential elements, and
-- The first dimension size L has an additonal property -- a LoD index as a nested vector:
+### Model Parameters
 
-  ```c++
-  typedef std::vector<std::<vector>> LoD;
-  ```
+A model parameter is just a usual Tensor, which, just like the above example, is a **0-level LoD Tensor**.
 
-- The LoD index is not necessary when there are only two levels and all elements of the second level have length 1.
 
-## Slicing of LoD Tensor
+## The LoD Tensor
 
-Consider that we have a network with three levels of RNN: the top level one handles articles, the second level one handles sentences, and the basic level one handles words.  This network requires that mini-batches represented by 3 level LoD Tensor, for example,
+Let us revisit the above example of the 2-level LoD Tensor:
 
 ```
-         3
 3           1  2
 3   2  4    1  2  3
 ||| || |||| |  || |||
 ```
 
-To allow each level of RNN to handle its input, we define **the slicing of a LoD Tensor is defined as getting the j-th sequence on level i, or the <i,j>-slice**
+It is indeed a tree, where leaves are elementary sequences identified by **branches**.
+
+For example, the third sentence in the above example is identified by branch <0,2>, where 0 indicates the first article with length 3, and 2 indicates the third sentence in this article with length 4.
+
+### The LoD Index
 
-For example, the <2,1>-slice of above slice is
+We can save the LoD index in the above example
 
 ```
-2
-||
+3           1  2
+3   2  4    1  2  3
 ```
 
-and the <1,2>-slice of above example is
+in a not-full 2D matrix:
 
+```c++
+typedef std::vector<std::vector<int> > LoD;
 ```
-2
-2  3
-|| |||
-```
 
-Let's go on slicing this slice.  Its <1,1>-slice is
+where
+
+- `LoD.size()` is the number of levels, or the maximum length of branches,
+- `LoD[i][j]` is the length of the j-th segment at the i-th level.
+
+## The Offset Representation
+
+To quickly access elementary sequences, we adopt an offset representation -- instead of saving the lengths, we save the beginning and ending elements of sequences.
+
+In the above example, we accumulate the length of elementary sequences:
 
 ```
-1
-1
-|
+3 2 4 1 2 3
 ```
 
-### The Slicing Algorithm
+into offsets
 
-The algorithm, with over-simplified data structure, is defined as
+```
+0  3  5   9   10  12   15
+   =  =   =   =   =    =
+   3  2+3 4+5 1+9 2+10 3+12
+```
 
-```c++
-typedef std::vector<std::vector<int>> LoD;
+so we know that the first sentence is from word 0 to word 3, and the second sentence from word 3 to word 5.
 
-struct LoDTensor {
-  LoD lod_;
-  float* tensor_;
-};
+Similarly, the lengths in the top level LoD
 
-LoDTensor Slice(const LoDTensor& lodt, int level, int sequence);
+```
+3 1 2
 ```
 
-Let us revisit the example above
+are transformed into offsets of elements/words as follows:
 
 ```
-         3
-3           1  2
-3   2  4    1  2  3
-||| || |||| |  || |||
+0 9     10  15
+  =     =   =
+  3+2+4 1+9 2+3+10
 ```
 
-Suppose that we want to retrieve the <1,2>-slice
+so we can tell that the first article is from word 0 to word 9, and the second article is from word 9 to word 10.
+
+The complete offset representation is as follows:
 
 ```
-2
-2  3
-|| |||
+0           9 10       15
+0   3  5    9 10  12   15
+ ||| || |||| |  ||  |||
 ```
 
-we will need to find out the starting position of this slice by summing over all leaf nodes in `LoD` to the left of the slice, i.e., 3 + 2 + 4 + 1 = 10.
+## Slicing of LoD Tensors
+
+When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences.  Here we define the sequence identified by branch <i,j,...> as the **<i,j,...>-slice**.
 
-To avoid the traversal of the LoD tree at slicing time,  we can do it at the construction time -- instead of saving the lengths of the next level in the LoD tree, we can save the starting offset of the next level.  For example, above LoD Tensor can be transformed into
+For example, the <2>-slice of the above example is
 
 ```
-        0
-0           9  10
-0   3  5    9  10 12
-||| || |||| |  || |||
+10      15
+10  12  15
+  || |||
 ```
 
-We don't really need the 0 on top, so the LoD Tensor could be
+and the <2,0>-slice of the above slice is
 
 ```
-0           9  10
-0   3  5    9  10 12
-||| || |||| |  || |||
+10  12
+  ||
 ```
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index 0c12c55dc09f6aa064066b5c73bc5e985a57343f..02aa74a8420a5c685c88d7cb0b487284814b3690 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -18,6 +18,15 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs,
+                       const VariableNameMap &outputs,
+                       const AttributeMap &attrs)
+    : inputs_(inputs), outputs_(outputs), attrs_(attrs) {
+  op_desc_.set_type(type);
+  // Flag the maps as unsynced so the next Sync() writes them into op_desc_.
+  need_update_ = true;
+}
+
 OpDesc *OpDescBind::Proto() {
   Sync();
   return &op_desc_;
@@ -31,11 +40,10 @@ const std::vector<std::string> &OpDescBind::Input(
   return it->second;
 }
 
-std::vector<std::string> OpDescBind::InputNames() const {
+std::vector<std::string> OpDescBind::InputArgumentNames() const {
   std::vector<std::string> retv;
-  retv.reserve(this->inputs_.size());
   for (auto &ipt : this->inputs_) {
-    retv.push_back(ipt.first);
+    retv.insert(retv.end(), ipt.second.begin(), ipt.second.end());
   }
   return retv;
 }
@@ -54,11 +62,10 @@ const std::vector<std::string> &OpDescBind::Output(
   return it->second;
 }
 
-std::vector<std::string> OpDescBind::OutputNames() const {
+std::vector<std::string> OpDescBind::OutputArgumentNames() const {
   std::vector<std::string> retv;
-  retv.reserve(this->outputs_.size());
   for (auto &ipt : this->outputs_) {
-    retv.push_back(ipt.first);
+    retv.insert(retv.end(), ipt.second.begin(), ipt.second.end());
   }
   return retv;
 }
@@ -112,6 +119,42 @@ const std::unordered_map<std::string, Attribute> &OpDescBind::GetAttrMap()
   return attrs_;
 }
 
+void OpDescBind::Rename(const std::string &old_name,
+                        const std::string &new_name) {
+  for (auto &input : inputs_) {
+    std::replace(input.second.begin(), input.second.end(), old_name, new_name);
+  }
+  for (auto &output : outputs_) {
+    std::replace(output.second.begin(), output.second.end(), old_name,
+                 new_name);
+  }
+  need_update_ = true;
+}
+
+struct SetAttrDescVisitor : public boost::static_visitor<void> {
+  explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
+  mutable OpDesc::Attr *attr_;
+  void operator()(int v) const { attr_->set_i(v); }
+  void operator()(float v) const { attr_->set_f(v); }
+  void operator()(const std::string &v) const { attr_->set_s(v); }
+  void operator()(bool b) const { attr_->set_b(b); }
+
+  void operator()(const std::vector<int> &v) const {
+    VectorToRepeated(v, attr_->mutable_ints());
+  }
+  void operator()(const std::vector<float> &v) const {
+    VectorToRepeated(v, attr_->mutable_floats());
+  }
+  void operator()(const std::vector<std::string> &v) const {
+    VectorToRepeated(v, attr_->mutable_strings());
+  }
+  void operator()(const std::vector<bool> &v) const {
+    VectorToRepeated(v, attr_->mutable_bools());
+  }
+  void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->idx()); }
+  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
+};
+
 void OpDescBind::Sync() {
   if (need_update_) {
     this->op_desc_.mutable_inputs()->Clear();
@@ -134,7 +177,8 @@ void OpDescBind::Sync() {
       attr_desc->set_name(attr.first);
       attr_desc->set_type(
           static_cast<framework::AttrType>(attr.second.which() - 1));
-      boost::apply_visitor(SetAttrDescVisitor(attr_desc), attr.second);
+      SetAttrDescVisitor visitor(attr_desc);
+      boost::apply_visitor(visitor, attr.second);
     }
 
     need_update_ = false;
diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h
index 0cf7d13971675eb825bcd0c7636896f0862d6ebb..b39808dad1de061e896936ec84169cd62e29856d 100644
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 #include "paddle/framework/attribute.h"
+#include "paddle/framework/type_defs.h"
 #include "paddle/framework/var_desc.h"
 
 namespace paddle {
@@ -26,6 +27,11 @@ class BlockDescBind;
 
 class OpDescBind {
  public:
+  OpDescBind() {}
+
+  OpDescBind(const std::string &type, const VariableNameMap &inputs,
+             const VariableNameMap &outputs, const AttributeMap &attrs);
+
   OpDesc *Proto();
 
   std::string Type() const { return op_desc_.type(); }
@@ -34,14 +40,14 @@ class OpDescBind {
 
   const std::vector<std::string> &Input(const std::string &name) const;
 
-  std::vector<std::string> InputNames() const;
+  std::vector<std::string> InputArgumentNames() const;
 
   void SetInput(const std::string &param_name,
                 const std::vector<std::string> &args);
 
   const std::vector<std::string> &Output(const std::string &name) const;
 
-  std::vector<std::string> OutputNames() const;
+  std::vector<std::string> OutputArgumentNames() const;
 
   void SetOutput(const std::string &param_name,
                  const std::vector<std::string> &args);
@@ -60,49 +66,52 @@ class OpDescBind {
 
   void SetBlockAttr(const std::string &name, BlockDescBind &block);
 
-  // Only be used in C++
-  void SetAttrMap(const std::unordered_map<std::string, Attribute> &attr_map);
-
   Attribute GetAttr(const std::string &name) const;
 
   int GetBlockAttr(const std::string &name) const;
 
+  void Rename(const std::string &old_name, const std::string &new_name);
+
   // Only be used in C++
-  const std::unordered_map<std::string, Attribute> &GetAttrMap() const;
+  const AttributeMap &GetAttrMap() const;
 
- private:
-  struct SetAttrDescVisitor : public boost::static_visitor<void> {
-    explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
-    mutable OpDesc::Attr *attr_;
-    void operator()(int v) const { attr_->set_i(v); }
-    void operator()(float v) const { attr_->set_f(v); }
-    void operator()(const std::string &v) const { attr_->set_s(v); }
-    void operator()(bool b) const { attr_->set_b(b); }
-
-    void operator()(const std::vector<int> &v) const {
-      VectorToRepeated(v, attr_->mutable_ints());
-    }
-    void operator()(const std::vector<float> &v) const {
-      VectorToRepeated(v, attr_->mutable_floats());
-    }
-    void operator()(const std::vector<std::string> &v) const {
-      VectorToRepeated(v, attr_->mutable_strings());
-    }
-    void operator()(const std::vector<bool> &v) const {
-      VectorToRepeated(v, attr_->mutable_bools());
-    }
-    void operator()(BlockDesc *desc) const {
-      attr_->set_block_idx(desc->idx());
-    }
-    void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
-  };
+  // Only be used in C++
+  void SetAttrMap(const AttributeMap &attr_map);
+
+  std::vector<std::string> InputNames() const { return MapKeys(inputs_); }
+  std::vector<std::string> OutputNames() const { return MapKeys(outputs_); }
+
+  void SetInputMap(const VariableNameMap &input) {
+    this->inputs_ = input;
+    this->need_update_ = true;
+  }
+
+  void SetOutputMap(const VariableNameMap &output) {
+    this->outputs_ = output;
+    this->need_update_ = true;
+  }
 
   void Sync();
 
+  const VariableNameMap &Inputs() const { return inputs_; }
+
+  const VariableNameMap &Outputs() const { return outputs_; }
+
+ private:
+  template <typename MapType>
+  static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {
+    std::vector<typename MapType::key_type> ret_val;
+    ret_val.reserve(map.size());
+    std::transform(
+        map.begin(), map.end(), std::back_inserter(ret_val),
+        [](const typename MapType::value_type &pair) { return pair.first; });
+    return ret_val;
+  }
+
   OpDesc op_desc_;
-  std::unordered_map<std::string, std::vector<std::string>> inputs_;
-  std::unordered_map<std::string, std::vector<std::string>> outputs_;
-  std::unordered_map<std::string, Attribute> attrs_;
+  VariableNameMap inputs_;
+  VariableNameMap outputs_;
+  AttributeMap attrs_;
 
   // need_update_ indicate there some local changes not be synchronized. If
   // local changes should be synchronized, need_update_ should be set to true.
diff --git a/paddle/framework/op_info.h b/paddle/framework/op_info.h
index b98d8f23a14cf6fbe787953ad16b5c9ab99222ad..c504f69e30bb899c183bd4281d2eadb50fd3b376 100644
--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
@@ -19,21 +19,18 @@
 #include <unordered_map>
 
 #include "paddle/framework/attribute.h"
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/type_defs.h"
+#include "paddle/platform/macros.h"
 
 namespace paddle {
 namespace framework {
-class OperatorBase;
-using VariableNameMap = std::map<std::string, std::vector<std::string>>;
-
-using OpCreator = std::function<OperatorBase*(
-    const std::string& /*type*/, const VariableNameMap& /*inputs*/,
-    const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
 
 struct OpInfo {
   OpCreator creator_;
-  std::string grad_op_type_;
-  OpProto* proto_;
-  OpAttrChecker* checker_;
+  GradOpMakerFN grad_op_maker_;
+  OpProto* proto_{nullptr};
+  OpAttrChecker* checker_{nullptr};
 
   bool HasOpProtoAndChecker() const {
     return proto_ != nullptr && checker_ != nullptr;
@@ -46,30 +43,25 @@ struct OpInfo {
     return *proto_;
   }
 
-  const OpAttrChecker& Checker() const {
-    PADDLE_ENFORCE_NOT_NULL(checker_,
-                            "Operator Checker has not been registered");
-    return *checker_;
-  }
-
   const OpCreator& Creator() const {
     PADDLE_ENFORCE_NOT_NULL(creator_,
                             "Operator Creator has not been registered");
     return creator_;
   }
 
-  bool HasGradientOp() const { return !grad_op_type_.empty(); }
+  const GradOpMakerFN& GradOpMaker() const {
+    PADDLE_ENFORCE_NOT_NULL(grad_op_maker_,
+                            "Operator GradOpMaker has not been registered.");
+    return grad_op_maker_;
+  }
+
+  const OpAttrChecker* Checker() const { return checker_; }
 };
 
 class OpInfoMap {
  public:
   static OpInfoMap& Instance();
 
-  OpInfoMap(const OpInfoMap& o) = delete;
-  OpInfoMap(OpInfoMap&& o) = delete;
-  OpInfoMap& operator=(const OpInfoMap& o) = delete;
-  OpInfoMap& operator=(OpInfoMap&& o) = delete;
-
   bool Has(const std::string& op_type) const {
     return map_.find(op_type) != map_.end();
   }
@@ -105,6 +97,8 @@ class OpInfoMap {
  private:
   OpInfoMap() = default;
   std::unordered_map<std::string, const OpInfo> map_;
+
+  DISABLE_COPY_AND_ASSIGN(OpInfoMap);
 };
 
 }  // namespace framework
diff --git a/paddle/framework/op_proto_maker.h b/paddle/framework/op_proto_maker.h
index 4d55a37db9f0a3deac7b3489c8bc288ea41f4799..a134befd90a1eaeff6f6ea62f11412df63cdc394 100644
--- a/paddle/framework/op_proto_maker.h
+++ b/paddle/framework/op_proto_maker.h
@@ -44,11 +44,6 @@ class OpProtoAndCheckerMaker {
       var_->set_intermediate(true);
       return *this;
     }
-
-    VariableBuilder& NotInGradient() {
-      var_->set_not_in_gradient(true);
-      return *this;
-    }
   };
 
   VariableBuilder AddInput(const std::string& name, const std::string& comment);
diff --git a/paddle/framework/op_proto_maker_test.cc b/paddle/framework/op_proto_maker_test.cc
index b01e30f75371ca4aa63dae86ddfb966b1d4c7830..988a14cf4de8fdf052ca7e8c41bff0c05ba2daaa 100644
--- a/paddle/framework/op_proto_maker_test.cc
+++ b/paddle/framework/op_proto_maker_test.cc
@@ -48,4 +48,4 @@ TEST(ProtoMaker, DuplicatedInOut) {
   paddle::framework::OpAttrChecker op_checker;
   auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker);
   ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
-}
\ No newline at end of file
+}
diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc
index b0e85dd49f97da4a7f889fde0b5f060954947be8..66043f6e04fdb63b5d11a15c66abc84339e13c9a 100644
--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
@@ -23,7 +23,9 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
     const std::string& type, const VariableNameMap& inputs,
     const VariableNameMap& outputs, AttributeMap attrs) {
   auto& info = OpInfoMap::Instance().Get(type);
-  info.Checker().Check(attrs);
+  if (info.Checker() != nullptr) {
+    info.Checker()->Check(attrs);
+  }
   auto op = info.Creator()(type, inputs, outputs, attrs);
   return std::unique_ptr<OperatorBase>(op);
 }
@@ -52,9 +54,15 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
   return CreateOp(op_desc.type(), inputs, outputs, attrs);
 }
 
-std::unique_ptr<OperatorBase> OpRegistry::CreateGradOp(const OperatorBase& op) {
-  PADDLE_ENFORCE(!op.IsNetOp(), "Use framework::Backward to get backward ops");
-  return std::unique_ptr<OperatorBase>(BuildGradOp(&op));
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDescBind& op_desc) {
+  return CreateOp(op_desc.Type(), op_desc.Inputs(), op_desc.Outputs(),
+                  op_desc.GetAttrMap());
+}
+
+std::vector<std::unique_ptr<OpDescBind>> OpRegistry::CreateGradOpDescs(
+    const OpDescBind& op_desc) {
+  auto& info = OpInfoMap::Instance().Get(op_desc.Type());
+  return info.GradOpMaker()(op_desc);  // enforces a maker is registered
 }
 
 }  // namespace framework
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 4db38badaea8ae22d9ad47951f4941f3bdeb401a..cce3605fd480c1d79a5969f6a4cb170ea4d879f2 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -21,49 +21,54 @@ limitations under the License. */
 #include <unordered_map>
 #include <unordered_set>
 #include "paddle/framework/attribute.h"
+#include "paddle/framework/details/op_registry.h"
 #include "paddle/framework/framework.pb.h"
-#include "paddle/framework/grad_op_builder.h"
-#include "paddle/framework/op_info.h"
-#include "paddle/framework/op_proto_maker.h"
+#include "paddle/framework/grad_op_desc_maker.h"
+#include "paddle/framework/op_desc.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"
 
 namespace paddle {
 namespace framework {
+class Registrar {
+ public:
+  // In our design, various kinds of classes, e.g., operators and kernels,
+  // have their corresponding registry and registrar. The action of
+  // registration is in the constructor of a global registrar variable, which,
+  // however, are not used in the code that calls package framework, and would
+  // be removed from the generated binary file by the linker. To avoid such
+  // removal, we add Touch to all registrar classes and make USE_OP macros to
+  // call this method. So, as long as the callee code calls USE_OP, the global
+  // registrar variable won't be removed by the linker.
+  void Touch() {}
+};
+
+template <typename... ARGS>
+struct OperatorRegistrar : public Registrar {
+  explicit OperatorRegistrar(const char* op_type) : op_type(op_type) {
+    PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type),
+                   "'%s' is registered more than once.", op_type);
+    static_assert(sizeof...(ARGS) != 0,
+                  "OperatorRegistrar should be invoked at least by OpClass");
+    details::OperatorRegistrarRecursive<0, false, ARGS...>(op_type, &info);
+    OpInfoMap::Instance().Insert(op_type, info);
+  }
+
+  const char* op_type;
+
+  OpInfo info;
+};
 
 class OpRegistry {
  public:
   template <typename OpType, typename ProtoMakerType, typename GradOpType>
   static void RegisterOp(const std::string& op_type,
                          const std::string& grad_op_type) {
-    PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type),
-                   "'%s' is registered more than once.", op_type);
-    OpInfo op_info;
-    op_info.creator_ = [](
-        const std::string& type, const VariableNameMap& inputs,
-        const VariableNameMap& outputs, const AttributeMap& attrs) {
-      return new OpType(type, inputs, outputs, attrs);
-    };
-    op_info.grad_op_type_ = grad_op_type;
-    if (std::type_index(typeid(ProtoMakerType)) !=
-        std::type_index(typeid(NOPMaker))) {
-      op_info.proto_ = new OpProto;
-      op_info.checker_ = new OpAttrChecker;
-      auto maker = ProtoMakerType(op_info.proto_, op_info.checker_);
-      maker.Validate();
-      op_info.proto_->set_type(op_type);
-      PADDLE_ENFORCE(
-          op_info.proto_->IsInitialized(),
-          "Fail to initialize %s's OpProto, because %s is not initialized",
-          op_type, op_info.proto_->InitializationErrorString());
-    } else {
-      op_info.proto_ = nullptr;
-      op_info.checker_ = nullptr;
-    }
-    OpInfoMap::Instance().Insert(op_type, op_info);
+    OperatorRegistrar<OpType, ProtoMakerType> reg(op_type.c_str());
+    reg.info.grad_op_type_ = grad_op_type;
     // register gradient op
     if (!grad_op_type.empty()) {
-      RegisterOp<GradOpType, NOPMaker, NOP>(grad_op_type, "");
+      OperatorRegistrar<GradOpType> grad_reg(grad_op_type.c_str());
     }
   }
 
@@ -74,20 +79,10 @@ class OpRegistry {
 
   static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);
 
-  static std::unique_ptr<OperatorBase> CreateGradOp(const OperatorBase& op);
-};
+  static std::vector<std::unique_ptr<OpDescBind>> CreateGradOpDescs(
+      const OpDescBind& op_desc);
 
-class Registrar {
- public:
-  // In our design, various kinds of classes, e.g., operators and kernels,
-  // have their corresponding registry and registrar. The action of
-  // registration is in the constructor of a global registrar variable, which,
-  // however, are not used in the code that calls package framework, and would
-  // be removed from the generated binary file by the linker. To avoid such
-  // removal, we add Touch to all registrar classes and make USE_OP macros to
-  // call this method. So, as long as the callee code calls USE_OP, the global
-  // registrar variable won't be removed by the linker.
-  void Touch() {}
+  static std::unique_ptr<OperatorBase> CreateOp(const OpDescBind& op_desc);
 };
 
 template <typename OpType, typename ProtoMakerType, typename GradOpType>
@@ -145,33 +140,41 @@ class OpKernelRegistrar : public Registrar {
                              __test_global_namespace_##uniq_name##__>::value, \
                 msg)
 
+#define REGISTER_OPERATOR(op_type, op_class, ...)                      \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                      \
+      __reg_op__##op_type,                                             \
+      "REGISTER_OPERATOR must be called in global namespace");         \
+  class _OpClass_##op_type##_ : public op_class {                      \
+   public:                                                             \
+    DEFINE_OP_CLONE_METHOD(_OpClass_##op_type##_);                     \
+    DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_, op_class);            \
+  };                                                                   \
+  static ::paddle::framework::OperatorRegistrar<_OpClass_##op_type##_, \
+                                                ##__VA_ARGS__>         \
+      __op_registrar_##op_type##__(#op_type);                          \
+  int TouchOpRegistrar_##op_type() {                                   \
+    __op_registrar_##op_type##__.Touch();                              \
+    return 0;                                                          \
+  }
+
 /**
  * Macro to register Operator.
  */
-#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type,          \
-                    grad_op_class)                                            \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                                             \
-      __reg_op__##op_type, "REGISTER_OP must be called in global namespace"); \
-  class _OpClass_##op_type##_ : public op_class {                             \
-   public:                                                                    \
-    DEFINE_OP_CLONE_METHOD(_OpClass_##op_type##_);                            \
-    DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_, op_class);                   \
-  };                                                                          \
-  class _OpGradClass_##op_type##_ : public grad_op_class {                    \
-   public:                                                                    \
-    DEFINE_OP_CLONE_METHOD(_OpGradClass_##op_type##_);                        \
-    DEFINE_OP_CONSTRUCTOR(_OpGradClass_##op_type##_, grad_op_class);          \
-  };                                                                          \
-  static ::paddle::framework::OpRegistrar<                                    \
-      _OpClass_##op_type##_, op_maker_class, _OpGradClass_##op_type##_>       \
-      __op_registrar_##op_type##__(#op_type, #grad_op_type);                  \
-  int TouchOpRegistrar_##op_type() {                                          \
-    __op_registrar_##op_type##__.Touch();                                     \
-    return 0;                                                                 \
-  }
+#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type,           \
+                    grad_op_class)                                             \
+  REGISTER_OPERATOR(grad_op_type, grad_op_class);                              \
+  class _GradOpDescMaker_##grad_op_type##_                                     \
+      : public ::paddle::framework::DefaultGradOpDescMaker {                   \
+    using ::paddle::framework::DefaultGradOpDescMaker::DefaultGradOpDescMaker; \
+                                                                               \
+   protected:                                                                  \
+    virtual std::string GradOpType() const { return #grad_op_type; }           \
+  };                                                                           \
+  REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_,     \
+                    op_maker_class);
 
 #define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
-  REGISTER_OP(op_type, op_class, op_maker_class, , ::paddle::framework::NOP)
+  REGISTER_OPERATOR(op_type, op_class, op_maker_class)
 
 /**
  * Macro to register OperatorKernel.
@@ -218,7 +221,7 @@ class OpKernelRegistrar : public Registrar {
 // TODO(fengjiayi): The following macros
 // seems ugly, do we have better method?
 
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
 #define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU)
 #else
 #define USE_OP_KERNEL(op_type)        \
diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc
index b6fc0409d5cb22b13352df41b8e911c79bc4825a..b860fe6cac773d1e85adecc43f5dfec42b6c7661 100644
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -173,3 +173,14 @@ TEST(OpRegistry, CustomChecker) {
   int test_attr = op->Attr<int>("test_attr");
   ASSERT_EQ(test_attr, 4);
 }
+
+class CosineOpComplete : public paddle::framework::CosineOp {
+ public:
+  DEFINE_OP_CONSTRUCTOR(CosineOpComplete, paddle::framework::CosineOp);
+  DEFINE_OP_CLONE_METHOD(CosineOpComplete);
+};
+
+TEST(OperatorRegistrar, Test) {
+  using namespace paddle::framework;
+  OperatorRegistrar<CosineOpComplete, CosineOpProtoAndCheckerMaker> reg("cos");
+}
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 8b5560ffa1234145fb4291f5730f89fd7375ee15..2ca838f838ad0b9211a59bf9247c48d283484d50 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -25,7 +25,7 @@ Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
   return *device_context_.GetEigenDevice<platform::CPUPlace>();
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 template <>
 Eigen::GpuDevice&
 ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
@@ -245,5 +245,12 @@ std::vector<Tensor*> InferShapeContext::MultiOutput<Tensor>(
   return res;
 }
 
+std::ostream& operator<<(std::ostream& os,
+                         const OperatorWithKernel::OpKernelKey& kernel_key) {
+  os << "place[" << kernel_key.place_ << "]:data_type[" << kernel_key.data_type_
+     << "]";
+  return os;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 310d68d7c1baac231a2f1709af28bfb58ae1a436..d7bc9c9ffb9d5e0a7d8ea309a50623da440820da 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -22,6 +22,7 @@ limitations under the License. */
 
 #include "op_info.h"
 #include "paddle/framework/attribute.h"
+#include "paddle/framework/block_desc.h"
 #include "paddle/framework/data_type.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/lod_tensor.h"
@@ -317,46 +318,170 @@ class ExecutionContext : public InferShapeContext {
   const platform::DeviceContext& device_context_;
 };
 
+class CompileTimeInferShapeContext : public InferShapeContextBase {
+ public:
+  CompileTimeInferShapeContext(const OpDescBind& op, const BlockDescBind& block)
+      : op_(op), block_(block) {}
+
+  bool HasInput(const std::string& name) const override {
+    const std::vector<std::string>& input_names = op_.Input(name);
+    auto length = input_names.size();
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Input(%s) should have only one value, "
+                      "but it have %d now",
+                      name, length);
+    return block_.HasVar(input_names[0]);
+  }
+
+  bool HasOutput(const std::string& name) const override {
+    const std::vector<std::string>& output_names = op_.Output(name);
+    auto length = output_names.size();
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Output(%s) should have only one value, "
+                      "but it have %d now",
+                      name, length);
+    return block_.HasVar(output_names[0]);
+  }
+
+  bool HasInputs(const std::string& name) const override {
+    const std::vector<std::string>& input_names = op_.Input(name);
+    PADDLE_ENFORCE(!input_names.empty(), "Inputs(%s) length is 0", name);
+    for (auto& input : input_names) {
+      if (!block_.HasVar(input)) return false;
+    }
+    return true;
+  }
+
+  bool HasOutputs(const std::string& name) const override {
+    const std::vector<std::string>& output_names = op_.Output(name);
+    PADDLE_ENFORCE(!output_names.empty(), "Outputs(%s) length is 0", name);
+    for (auto& output : output_names) {
+      if (!block_.HasVar(output)) return false;  // any missing var => false
+    }
+    return true;
+  }
+
+  DDim GetInputDim(const std::string& name) const override {
+    std::vector<DDim> ddims = GetInputsDim(name);
+    auto length = ddims.size();
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Input(%s) should have 1 value, "
+                      "but it has %d now",
+                      name, length);
+    return ddims[0];
+  }
+
+  void SetInputDim(const std::string& name, const DDim& dim) override {
+    SetInputsDim(name, {dim});
+  }
+
+  DDim GetOutputDim(const std::string& name) const override {
+    std::vector<DDim> ddims = GetOutputsDim(name);
+    auto length = ddims.size();
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Output(%s) should have 1 value, "
+                      "but it has %d now",
+                      name, length);
+    return ddims[0];
+  }
+
+  void SetOutputDim(const std::string& name, const DDim& dim) override {
+    SetOutputsDim(name, {dim});
+  }
+
+  AttrReader Attrs() const override { return AttrReader(op_.GetAttrMap()); }
+
+  const std::vector<std::string>& Inputs(
+      const std::string& name) const override {
+    return op_.Input(name);
+  }
+
+  const std::vector<std::string>& Outputs(
+      const std::string& name) const override {
+    return op_.Output(name);
+  }
+
+ private:
+  DDim GetDim(const std::string& name) const override {
+    return framework::make_ddim(block_.Var(name)->Shape());
+  }
+
+  void SetDim(const std::string& name, const DDim& dim) override {
+    block_.Var(name)->SetShape(framework::vectorize(dim));
+  }
+
+  const OpDescBind& op_;
+  const BlockDescBind& block_;
+};
+
 class RuntimeInferShapeContext : public InferShapeContextBase {
  public:
   RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
       : op_(op), scope_(scope) {}
 
-  bool HasInput(const std::string& name) const {
+  bool HasInput(const std::string& name) const override {
     auto ipt = op_.Input(name);
     auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
     return var != nullptr;
   }
 
-  bool HasOutput(const std::string& name) const {
+  bool HasOutput(const std::string& name) const override {
     auto ipt = op_.Output(name);
     auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
     return var != nullptr;
   }
 
-  DDim GetInputDim(const std::string& name) const {
+  bool HasInputs(const std::string& name) const override {
+    const auto& inputs = op_.Inputs(name);  // bind, don't copy the name list
+    if (inputs.empty()) {
+      return false;
+    }
+    for (auto& input : inputs) {
+      if (scope_.FindVar(input) == nullptr) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  bool HasOutputs(const std::string& name) const override {
+    const auto& outputs = op_.Outputs(name);  // bind, don't copy the name list
+    if (outputs.empty()) {
+      return false;
+    }
+    for (auto& output : outputs) {
+      if (scope_.FindVar(output) == nullptr) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  DDim GetInputDim(const std::string& name) const override {
     return GetDim(op_.Input(name));
   }
 
-  void SetInputDim(const std::string& name, const DDim& dim) {
+  void SetInputDim(const std::string& name, const DDim& dim) override {
     SetDim(op_.Input(name), dim);
   }
 
-  DDim GetOutputDim(const std::string& name) const {
+  DDim GetOutputDim(const std::string& name) const override {
     return GetDim(op_.Output(name));
   }
 
-  void SetOutputDim(const std::string& name, const DDim& dim) {
+  void SetOutputDim(const std::string& name, const DDim& dim) override {
     SetDim(op_.Output(name), dim);
   }
 
-  AttrReader Attrs() const { return AttrReader(op_.Attrs()); }
+  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
 
-  const std::vector<std::string>& Inputs(const std::string& name) const {
+  const std::vector<std::string>& Inputs(
+      const std::string& name) const override {
     return op_.Inputs(name);
   }
 
-  const std::vector<std::string>& Outputs(const std::string& name) const {
+  const std::vector<std::string>& Outputs(
+      const std::string& name) const override {
     return op_.Outputs(name);
   }
 
@@ -377,11 +502,11 @@ class RuntimeInferShapeContext : public InferShapeContextBase {
     return t;
   }
 
-  DDim GetDim(const std::string& name) const {
+  DDim GetDim(const std::string& name) const override {
     return GetTensor<false>(name)->dims();
   }
 
-  void SetDim(const std::string& name, const DDim& dim) {
+  void SetDim(const std::string& name, const DDim& dim) override {
     GetTensor<true>(name)->Resize(dim);
   }
 
@@ -452,9 +577,25 @@ class OperatorWithKernel : public OperatorBase {
     this->InferShape(&infer_shape_ctx);
 
     ExecutionContext ctx(*this, scope, dev_ctx);
-    auto& opKernel = AllOpKernels().at(type_).at(
-        OpKernelKey(IndicateDataType(ctx), dev_ctx));
-    opKernel->Compute(ctx);
+
+    // check if op[type] has kernel registered.
+    auto& all_op_kernels = AllOpKernels();
+    auto kernels_iter = all_op_kernels.find(type_);
+    if (kernels_iter == all_op_kernels.end()) {
+      PADDLE_THROW("op[%s] has no kernel", type_);
+    }
+
+    // check if op[type] has a kernel for kernel_key
+    OpKernelMap& kernels = kernels_iter->second;
+    auto kernel_key = OpKernelKey(IndicateDataType(ctx), dev_ctx);
+    auto kernel_iter = kernels.find(kernel_key);
+
+    if (kernel_iter == kernels.end()) {
+      PADDLE_THROW("op[%s] has no kernel with kernel_key[%s]", type_,
+                   kernel_key);
+    }
+
+    kernel_iter->second->Compute(ctx);
   }
 
   static std::unordered_map<std::string /* op_type */, OpKernelMap>&
@@ -471,9 +612,9 @@ class OperatorWithKernel : public OperatorBase {
                        });
   }
 
- protected:
   virtual void InferShape(InferShapeContextBase* ctx) const = 0;
 
+ protected:
   // indicate kernel DataType by input data. Defaultly all input data must be
   // same.
   virtual DataType IndicateDataType(const ExecutionContext& ctx) const {
@@ -503,5 +644,8 @@ class OperatorWithKernel : public OperatorBase {
   }
 };
 
+std::ostream& operator<<(std::ostream& os,
+                         const OperatorWithKernel::OpKernelKey& kernel_key);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/program_desc.h b/paddle/framework/program_desc.h
index 06ffcd4b15078f62ea8b7a3714e73de799530785..9b34a06aeff94e6fa855f6f287a73889e2a4faee 100644
--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <vector>
 #include "paddle/framework/framework.pb.h"
+#include "paddle/platform/macros.h"
 
 namespace paddle {
 namespace framework {
@@ -26,9 +27,6 @@ class ProgramDescBind {
  public:
   static ProgramDescBind &Instance(ProgramDesc *prog);
 
-  ProgramDescBind(const ProgramDescBind &o) = delete;
-  ProgramDescBind &operator=(const ProgramDescBind &o) = delete;
-
   BlockDescBind *AppendBlock(const BlockDescBind &parent);
 
   BlockDescBind *Block(size_t idx) { return blocks_[idx].get(); }
@@ -46,6 +44,8 @@ class ProgramDescBind {
   ProgramDesc *prog_;
 
   std::vector<std::unique_ptr<BlockDescBind>> blocks_;
+
+  DISABLE_COPY_AND_ASSIGN(ProgramDescBind);
 };
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index c93b03e48130afe9568089b6a7586c4185d1d5b4..7047f0d55e9844aec19892631fe4b5b387bf89ca 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <unordered_map>
 
 #include "paddle/framework/variable.h"
+#include "paddle/platform/macros.h"
 
 namespace paddle {
 namespace framework {
@@ -38,11 +39,6 @@ class Scope {
   Scope() {}
   ~Scope();
 
-  // Disable Copy, Assign, Move.
-  Scope(const Scope& other) = delete;
-  Scope& operator=(const Scope& other) = delete;
-  Scope(Scope&& other) = delete;
-
   /// Create a sub-scope. Returns a reference other than a pointer so
   /// to prevent from manual deletion.
   /// Mark it to const because that new kid scope cannot change parent scope.
@@ -73,6 +69,8 @@ class Scope {
   std::unordered_map<std::string, Variable*> vars_;
   mutable std::list<Scope*> kids_;
   Scope const* parent_{nullptr};
+
+  DISABLE_COPY_AND_ASSIGN(Scope);
 };
 
 }  // namespace framework
diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h
index b07fc788124413f728c713027609d9d2d1c39538..74e0371e328114294d7f85932b1e551c21ff5b97 100644
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -19,11 +19,18 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+// TODO(longfei): Once both CompileTimeInferShapeContext and
+// RuntimeInferShapeContext get merged, we can rename InferShapeContextBase to
+// InferShapeContext, replacing the current InferShapeContext.
 class InferShapeContextBase {
  public:
   virtual ~InferShapeContextBase() {}
   virtual bool HasInput(const std::string &name) const = 0;
   virtual bool HasOutput(const std::string &name) const = 0;
+
+  virtual bool HasInputs(const std::string &name) const = 0;
+  virtual bool HasOutputs(const std::string &name) const = 0;
+
   virtual framework::DDim GetInputDim(const std::string &name) const = 0;
   std::vector<framework::DDim> GetInputsDim(const std::string &name) const {
     const std::vector<std::string> &names = Inputs(name);
diff --git a/paddle/framework/tensor_array.cc b/paddle/framework/tensor_array.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2728bce1c1af848285e80d8ee8b3b61ec046342e
--- /dev/null
+++ b/paddle/framework/tensor_array.cc
@@ -0,0 +1,283 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+
+
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/tensor_array.h"
+
+#include <glog/logging.h>
+#include <algorithm>
+#include <limits>
+
+namespace paddle {
+namespace framework {
+
+namespace detail {
+
+/*
+ * Offer an iterator over the length-sorted lod-tensor's top level. The top
+ * level of a lod-tensor stores batch-size of sequences, each top-level sequence
+ * may contain several lower-level sequences, sort top-level lod by the numbers
+ * of lower-level sequences in descending order, so that during RNN's running,
+ * the batch-size will keep decreasing, the short sentences will end at the tail
+ * of each batch.
+ *
+ * Let's take a simple lod-tensor for example
+ *
+ *   |(0)       |(1)        top-level has two instances
+ *   |||        |||||    lower-level
+ *
+ * sort by lower-level's length
+ *
+ *   |(1)       |(0)
+ *   |||||      |||
+ *
+ * when RNN runs, it gets 5 batches (equal to the number of elements the
+ * longest sequence has)
+ *
+ * |||||
+ * |||
+ *
+ * the first three batches have two elements each, the last two batches have
+ * just one element each.
+ */
+struct DynamicBatchUnpacker {
+  using value_type = float;
+
+  DynamicBatchUnpacker(const LoDTensor& source, size_t level,
+                       bool descend = true)
+      : source(&source), level(level) {
+    BuildLengthSortedMeta(descend);
+  }
+
+  LoDTensor GetBatch(size_t index);
+
+  std::vector<DySeqMeta> meta;
+
+  LoDTensor const* source;
+  size_t level;
+
+ protected:
+  void BuildLengthSortedMeta(bool descend);
+};
+
+LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
+                           const std::vector<DySeqMeta>& meta, const LoD& lod,
+                           size_t level);
+
+}  // namespace detail
+
+const LoDTensor& TensorArray::Read(size_t index) const {
+  PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index);
+  if (index >= size()) {
+    values_.resize(index + 1);
+  }
+  return values_[index];
+}
+
+void TensorArray::Write(size_t index, const LoDTensor& value) {
+  PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index);
+
+  if (index >= size()) {
+    values_.resize(index + 1);
+  }
+
+  values_[index].Resize(value.dims());
+  values_[index].mutable_data<value_type>(platform::CPUPlace());
+  values_[index].CopyFrom<value_type>(value, platform::CPUPlace());
+}
+
+void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
+  PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index);
+  if (index >= size()) {
+    values_.resize(index + 1);
+  }
+
+  values_[index].ShareDataWith<value_type>(value);
+}
+
+LoDTensor TensorArray::Pack(size_t level, const std::vector<DySeqMeta>& meta,
+                            const LoD& lod) const {
+  return detail::PackDynamicBatch(values_, meta, lod, level);
+}
+
+std::vector<DySeqMeta> TensorArray::Unpack(const LoDTensor& source, int level,
+                                           bool length_desend) {
+  detail::DynamicBatchUnpacker unpacker(source, level,
+                                        length_desend /*descend*/);
+
+  // find max length of all the sequences
+  size_t max_length = 0;
+  for (const auto& seq : unpacker.meta) {
+    max_length = std::max(max_length, seq.end - seq.begin);
+  }
+
+  // write batches to values
+  for (size_t batch_id = 0; batch_id < max_length; batch_id++) {
+    Write(batch_id, unpacker.GetBatch(batch_id));
+  }
+
+  return unpacker.meta;
+}
+
+LoDTensor TensorArray::Stack() const {
+  LoDTensor result;
+  if (size() == 0) return result;
+
+  const auto& first_dims = values_.front().dims();
+  // check all the values have the same shape
+  // TODO(superjom) check the same dtypes
+  for (size_t idx = 1; idx < size(); idx++) {
+    const auto& value_dims = values_[idx].dims();
+    PADDLE_ENFORCE_EQ(first_dims, value_dims);
+  }
+
+  // copy
+  auto result_dims = vectorize(first_dims);
+  result_dims.insert(result_dims.begin(), size());
+  result.Resize(make_ddim(result_dims));
+  result.mutable_data<value_type>(platform::CPUPlace());
+
+  for (size_t idx = 0; idx < size(); idx++) {
+    result.Slice<value_type>(idx, idx + 1)
+        .CopyFrom<value_type>(Read(idx), platform::CPUPlace());
+  }
+  return result;
+}
+
+void TensorArray::Unstack(const LoDTensor& source) const {
+  Unstack(source, false /*data_shared*/);
+}
+
+void TensorArray::UnstackShared(const LoDTensor& source) const {
+  Unstack(source, true /*data_shared*/);
+}
+
+void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const {
+  size_t first_dim = source.dims()[0];
+  DDim value_dims = slice_ddim(source.dims(), 1, source.dims().size());
+  PADDLE_ENFORCE_GT(first_dim, 0,
+                    "source should have some data to be unstacked");
+
+  values_.resize(first_dim);
+
+  for (size_t elem = 0; elem < first_dim; elem++) {
+    // create a new value
+    auto& value = values_[elem];
+    if (data_shared) {
+      // share memory
+      value.ShareDataWith<value_type>(source.Slice<value_type>(elem, elem + 1));
+    } else {
+      // copy
+      value.Resize(value_dims);
+      value.CopyFrom<value_type>(source.Slice<value_type>(elem, elem + 1),
+                                 platform::CPUPlace());
+    }
+  }
+}
+
+size_t TensorArray::size() const { return values_.size(); }
+
+namespace detail {
+
+void DynamicBatchUnpacker::BuildLengthSortedMeta(bool descend) {
+  PADDLE_ENFORCE(meta.empty(), "duplicate build meta");
+  // Collect {begin, end, original index} for every sequence of `level`.
+  auto lod = SliceLevels(source->lod(), level, level + 1)[0];
+  // `seq_id + 1 < size` avoids size_t underflow when the level is empty.
+  for (size_t seq_id = 0; seq_id + 1 < lod.size(); seq_id++) {
+    DySeqMeta seq_meta({lod[seq_id], lod[seq_id + 1], seq_id});
+    meta.push_back(seq_meta);
+  }
+
+  PADDLE_ENFORCE_GT(meta.size(), 0, "meta is empty");
+
+  // Sort by length; `<`/`>` keep the comparator a strict weak ordering.
+  std::sort(meta.begin(), meta.end(),
+            [descend](const DySeqMeta& a, const DySeqMeta& b) {
+              const size_t len_a = a.end - a.begin, len_b = b.end - b.begin;
+              return descend ? len_a > len_b : len_a < len_b;
+            });
+}
+
+LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) {
+  PADDLE_ENFORCE(!meta.empty(), "should build meta first");
+  LoDTensor result;
+  // Build batch `index`: row `index` of every sequence that is long enough.
+  // NOTE(review): the early `break` assumes meta is sorted longest-first.
+  std::vector<size_t> indice;
+  for (const auto& seq : meta) {
+    size_t id = seq.begin + index;
+    if (id >= seq.end) break;
+    indice.push_back(id);
+  }
+  PADDLE_ENFORCE(!indice.empty(), "invalid batch at %d", index);
+
+  // size result as {indice.size(), record dims...}, then copy row by row
+  auto record_dims = slice_ddim(source->dims(), 1, source->dims().size());
+  auto record_dims_vec = vectorize(record_dims);
+  record_dims_vec.insert(record_dims_vec.begin(), indice.size());
+  result.Resize(make_ddim(record_dims_vec));
+  result.mutable_data<value_type>(platform::CPUPlace());
+
+  for (size_t i = 0; i < indice.size(); i++) {
+    auto index = indice[i];  // NOTE: shadows the `index` parameter
+    auto target = result.Slice<value_type>(i, i + 1);
+    auto source_ = source->Slice<value_type>(index, index + 1);
+
+    target.CopyFrom<value_type>(source_, platform::CPUPlace());
+  }
+
+  return result;
+}
+
+// TODO(superjom) to cache lod if reasonable
+LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
+                           const std::vector<DySeqMeta>& meta, const LoD& lod,
+                           size_t level) {
+  // Scatter each batch row back to its original position given by `meta`.
+  PADDLE_ENFORCE(!source.empty());
+  PADDLE_ENFORCE(!meta.empty());
+  PADDLE_ENFORCE(level < lod.size(), "level exceeds the levels of lod");
+
+  LoDTensor result;
+  // init result space: {lod[level].back(), record dims...}
+  auto record_dims = slice_ddim(source[0].dims(), 1, source[0].dims().size());
+  auto record_dims_vec = vectorize(record_dims);
+  auto height = lod[level].back();
+  record_dims_vec.insert(record_dims_vec.begin(), height);
+  result.Resize(make_ddim(record_dims_vec));
+  result.mutable_data<float>(platform::CPUPlace());
+
+  for (size_t batch_id = 0; batch_id < source.size(); batch_id++) {
+    for (size_t seq_id = 0; seq_id < meta.size(); seq_id++) {
+      const auto& seq_meta = meta[seq_id];
+      // source is source[batch_id][seq_id]
+      // target is result[index]
+      auto index = seq_meta.begin + batch_id;
+      if (index >= seq_meta.end) break;
+      auto source_ = source[batch_id].Slice<float>(seq_id, seq_id + 1);
+      auto target = result.Slice<float>(index, index + 1);
+      target.CopyFrom<float>(source_, platform::CPUPlace());
+    }
+  }
+
+  result.set_lod(lod);
+  return result;
+}
+
+}  // namespace detail
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/tensor_array.h b/paddle/framework/tensor_array.h
new file mode 100644
index 0000000000000000000000000000000000000000..94a14c2df492b175cf6a643800937878e95c5f37
--- /dev/null
+++ b/paddle/framework/tensor_array.h
@@ -0,0 +1,113 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <vector>
+
+#include "paddle/framework/lod_tensor.h"
+
+namespace paddle {
+namespace framework {
+
+/*
+ * DySeqMeta stores the [begin, end) element range of one sequence and the
+ * sequence's original index. It is used after the lod-tensor's re-assembling,
+ * so that the order in the original lod-tensor can be recovered.
+ */
+struct DySeqMeta {
+  DySeqMeta(size_t begin, size_t end, size_t ori_idx)
+      : begin(begin), end(end), ori_idx(ori_idx) {}
+
+  size_t begin;    // index of the sequence's first element
+  size_t end;      // index one past the last element (not included)
+  size_t ori_idx;  // index of the sequence before length-sorting
+};
+
+/*
+ * TensorArray is a C-array-like array of tensors, it is meant to be used with
+ * dynamic iteration primitives such as while_loop. It is used to segment inputs
+ * and store states in all time steps.
+ *
+ * By providing some methods similar to a C++ array, the definition of some
+ * state-based dynamic models such as RNN could be more natural and highly
+ * flexible.
+ */
+class TensorArray {
+ public:
+  using value_type = float;
+
+  // upper bound enforced on the index passed to Read/Write/WriteShared.
+  const size_t MAX_SIZE{100000};
+
+  /*
+   * Return the value at `index`; the array grows if `index` >= size().
+   */
+  const LoDTensor &Read(size_t index) const;
+
+  /*
+   * Copy `value` into slot `index`, growing the array when needed.
+   */
+  void Write(size_t index, const LoDTensor &value);
+
+  /*
+   * Make slot `index` share memory with `value`, growing the array if needed.
+   */
+  void WriteShared(size_t index, const LoDTensor &value);
+
+  /*
+   * Recover the original LoD-arranged LoDTensor from the stored values, using
+   * `level`, `meta` and `lod`.
+   */
+  LoDTensor Pack(size_t level, const std::vector<DySeqMeta> &meta,
+                 const LoD &lod) const;
+
+  /*
+   * Split `source` at `level` and write the generated batches into the array.
+   * Sequences are sorted by length, descending if `length_desend` is set,
+   * ascending otherwise. Returns the sorted sequence meta.
+   */
+  std::vector<DySeqMeta> Unpack(const LoDTensor &source, int level,
+                                bool length_desend);
+
+  /*
+   * Pack the values into a tensor with rank one higher than each tensor in
+   * values.
+   */
+  LoDTensor Stack() const;
+
+  /*
+   * Split `source` along its first dimension and copy each slice into a value.
+   */
+  void Unstack(const LoDTensor &source) const;
+
+  /*
+   * Split `source` along its first dimension into values that share memory
+   * with `source`.
+   */
+  void UnstackShared(const LoDTensor &source) const;
+
+  /*
+   * Return the number of values.
+   */
+  size_t size() const;
+
+ protected:
+  void Unstack(const LoDTensor &source, bool data_shared) const;
+
+ private:
+  mutable std::vector<LoDTensor> values_;  // mutable: const methods resize it
+};  // class TensorArray
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/tensor_array_test.cc b/paddle/framework/tensor_array_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d9f52509cdd1b79f6d53b5d4922f9e44279de08b
--- /dev/null
+++ b/paddle/framework/tensor_array_test.cc
@@ -0,0 +1,130 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/tensor_array.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+
+class TensorArrayTester : public ::testing::Test {  // Unstack-based fixture
+ protected:
+  void SetUp() override {
+    LoDTensor source;
+    source.Resize(make_ddim({batch_size, dim}));
+    int* data = source.mutable_data<int>(platform::CPUPlace());
+    for (int i = 0; i < batch_size * dim; i++) {  // element count of source
+      data[i] = i;
+    }
+    ta.Unstack(source);  // one value per row of source
+  }
+
+  TensorArray ta;
+  const int batch_size = 16;
+  const int dim = 32;
+};
+
+TEST_F(TensorArrayTester, Read) {
+  for (int i = 0; i < batch_size; i++) {
+    const auto& tensor = ta.Read(i);
+    ASSERT_EQ(tensor.dims()[0], 1);
+    ASSERT_EQ(tensor.dims()[1], dim);
+  }
+}
+
+TEST_F(TensorArrayTester, Write) {
+  LoDTensor source;
+  source.Resize(make_ddim({1, dim}));
+  for (int i = 0; i < dim; i++) {
+    *(source.mutable_data<int>(platform::CPUPlace()) + i) = i;
+  }
+
+  ta.Write(2, source);
+
+  const auto& tensor = ta.Read(2);
+  for (int i = 0; i < dim; i++) {
+    EXPECT_EQ(*(tensor.data<int>() + i), *(source.data<int>() + i));
+  }
+}
+
+TEST_F(TensorArrayTester, WriteShared) {
+  LoDTensor source;
+  source.Resize(make_ddim({1, dim}));
+  for (int i = 0; i < dim; i++) {
+    *(source.mutable_data<int>(platform::CPUPlace()) + i) = i;
+  }
+
+  ta.WriteShared(2, source);
+
+  const auto& tensor = ta.Read(2);
+  for (int i = 0; i < dim; i++) {
+    EXPECT_EQ(*(tensor.data<int>() + i), *(source.data<int>() + i));
+  }
+
+  EXPECT_EQ(source.data<int>(), tensor.data<int>());
+}
+
+class TensorArrayPackTester : public ::testing::Test {
+ protected:
+  virtual void SetUp() override {
+    lod.push_back(std::vector<size_t>{0, 2, 9, 13});
+    // one lod level holding three sequences of lengths 2, 7 and 4
+    source.set_lod(lod);
+    source.Resize(make_ddim({13, 128}));
+    source.mutable_data<int>(platform::CPUPlace());
+
+    // every row of a sentence is filled with its offset in it: 0 1 2 3 ...
+    const auto& level = lod.front();
+    for (size_t i = 0; i < level.size() - 1; i++) {
+      size_t begin = level[i];
+      size_t end = level[i + 1];
+      for (size_t j = begin; j < end; j++) {
+        auto record = source.Slice<int>(j, j + 1);
+        for (int dim = 0; dim < 128; dim++) {
+          record.mutable_data<int>(platform::CPUPlace())[dim] = j - begin;
+        }
+      }
+    }
+
+    // unpack level 0 with length_desend = true
+    meta = ta.Unpack(source, 0, true);
+  }
+
+  LoD lod;
+  TensorArray ta;
+  LoDTensor source;
+  std::vector<DySeqMeta> meta;
+};
+
+TEST_F(TensorArrayPackTester, Unpack) {
+  ASSERT_EQ(ta.size(), 7UL);
+
+  const auto& t0 = ta.Read(0);
+  const auto& t1 = ta.Read(1);
+
+  ASSERT_EQ(t0.data<int>()[0], int(0));
+  ASSERT_EQ(t1.data<int>()[0], int(1));
+}
+
+TEST_F(TensorArrayPackTester, Pack) {
+  LoDTensor packed = ta.Pack(0, meta, lod);
+}
+
+TEST_F(TensorArrayTester, size) {
+  ASSERT_EQ(ta.size(), static_cast<size_t>(batch_size));
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index a5405f9c31543b5733f9db923c2a6f8b968cfc2d..379eac94f985c9fa10b6c773065551575f57f033 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -65,7 +65,7 @@ inline T* Tensor::mutable_data(platform::Place place) {
       holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
           boost::get<platform::CPUPlace>(place), size));
     } else if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
       PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
     }
 #else
@@ -103,7 +103,7 @@ inline void Tensor::CopyFrom(const Tensor& src,
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
                  boost::get<platform::CPUPlace>(src_place), src_ptr, size);
   }
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   else if (platform::is_gpu_place(src_place) &&
            platform::is_cpu_place(dst_place)) {
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index e2ec738de35c90c6a06c9a46b062d4cce55f5eda..58cf0fc3cb6cf0bad693118ca57d71fb21c55a40 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -74,7 +74,7 @@ TEST(Tensor, MutableData) {
     EXPECT_EQ(p1, p2);
   }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   {
     Tensor src_tensor;
     float* p1 = nullptr;
@@ -126,7 +126,7 @@ TEST(Tensor, ShareDataWith) {
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   {
     Tensor src_tensor;
     Tensor dst_tensor;
@@ -163,7 +163,7 @@ TEST(Tensor, Slice) {
     EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
   }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   {
     Tensor src_tensor;
     src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
@@ -218,7 +218,7 @@ TEST(Tensor, CopyFrom) {
       EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
     }
   }
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   {
     Tensor src_tensor;
     Tensor gpu_tensor;
diff --git a/paddle/framework/type_defs.h b/paddle/framework/type_defs.h
new file mode 100644
index 0000000000000000000000000000000000000000..a5b94722136eca7ac0a22eb7ef113532330428ab
--- /dev/null
+++ b/paddle/framework/type_defs.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <functional>
+#include <map>
+#include "paddle/platform/variant.h"
+
+namespace paddle {
+namespace framework {
+class OperatorBase;
+class OpDescBind;
+using VariableNameMap = std::map<std::string, std::vector<std::string>>;
+
+// The order should be as same as framework.proto
+using Attribute =
+    boost::variant<boost::blank, int, float, std::string, std::vector<int>,
+                   std::vector<float>, std::vector<std::string>, bool,
+                   std::vector<bool>, BlockDesc*>;
+
+using AttributeMap = std::unordered_map<std::string, Attribute>;
+
+using OpCreator = std::function<OperatorBase*(
+    const std::string& /*type*/, const VariableNameMap& /*inputs*/,
+    const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
+
+using GradOpMakerFN =
+    std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>;
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/function/BlockExpandOp.cpp b/paddle/function/BlockExpandOp.cpp
index a89b6bba45843d81264819cad6ba053f28314f6b..bd0fe119ce46df9c333258c9c1ad7b5b2bdc544f 100644
--- a/paddle/function/BlockExpandOp.cpp
+++ b/paddle/function/BlockExpandOp.cpp
@@ -194,7 +194,7 @@ public:
 
 REGISTER_TYPED_FUNC(BlockExpand, CPU, BlockExpandForward);
 REGISTER_TYPED_FUNC(BlockExpandGrad, CPU, BlockExpandBackward);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(BlockExpand, GPU, BlockExpandForward);
 REGISTER_TYPED_FUNC(BlockExpandGrad, GPU, BlockExpandBackward);
 #endif
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index b87750b74247bd0eb822340bc5a85d41b86ecec2..23916c0f4b6319004ca0793bc9305b8a1dd0ae89 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -395,7 +395,7 @@ REGISTER_TYPED_FUNC(ContextProjectionForward,
 REGISTER_TYPED_FUNC(ContextProjectionBackward,
                     CPU,
                     ContextProjectionBackwardFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(ContextProjectionForward,
                     GPU,
                     ContextProjectionForwardFunc);
diff --git a/paddle/function/CosSimOp.cpp b/paddle/function/CosSimOp.cpp
index 7ece7b2dfedaf460741c97b5a700eb632d85cabc..2e5c281f37d8ffb1062121b5dc5b4f790ab52089 100644
--- a/paddle/function/CosSimOp.cpp
+++ b/paddle/function/CosSimOp.cpp
@@ -233,7 +233,7 @@ private:
 
 REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc);
 REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc);
 REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc);
 #endif
diff --git a/paddle/function/CropOp.cpp b/paddle/function/CropOp.cpp
index f12ee43e3d72f9ac776eaff93914228850694dd2..46f98f12c1f150fdf3ed53a7a96e5cf0020e14a4 100644
--- a/paddle/function/CropOp.cpp
+++ b/paddle/function/CropOp.cpp
@@ -169,7 +169,7 @@ private:
 
 REGISTER_TYPED_FUNC(Crop, CPU, CropFunc);
 REGISTER_TYPED_FUNC(CropGrad, CPU, CropGradFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(Crop, GPU, CropFunc);
 REGISTER_TYPED_FUNC(CropGrad, GPU, CropGradFunc);
 #endif
diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index ef878bfbba961bdd3d5212e19fb83bb1e285e47f..9e88669d37bd50179dcc0464e8c1cd6e2fed74db 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -336,7 +336,7 @@ private:
 
 REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc);
 REGISTER_TYPED_FUNC(CrossMapNormalGrad, CPU, CrossMapNormalGradFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc);
 REGISTER_TYPED_FUNC(CrossMapNormalGrad, GPU, CrossMapNormalGradFunc);
 #endif
diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp
index 2f3112fe657cd381891dc53c7179e7520911e8c9..9863e3ae1d5fcb1eece5267fd4f2a6b593b799df 100644
--- a/paddle/function/DepthwiseConvOp.cpp
+++ b/paddle/function/DepthwiseConvOp.cpp
@@ -292,7 +292,7 @@ REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
 REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
                     CPU,
                     DepthwiseConvGradFilterFunction);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(DepthwiseConv, GPU, DepthwiseConvFunction);
 REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
                     GPU,
diff --git a/paddle/function/DepthwiseConvOpTest.cpp b/paddle/function/DepthwiseConvOpTest.cpp
index d8e8c889d5c23bf9b2b5fd0b0393395883188fd8..b1a90da7db2b647dd384e3772820294140e5ec9d 100644
--- a/paddle/function/DepthwiseConvOpTest.cpp
+++ b/paddle/function/DepthwiseConvOpTest.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 
 namespace paddle {
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(DepthwiseConv, Forward) {
   DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
       "GemmConv-CPU", "DepthwiseConv-GPU", forward);
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index f8cf4ebea8d724f0291b981647622b63e3d84495..bdb56ddac38b91d756fc6f31282f29c0489fd660 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -340,7 +340,7 @@ public:
 REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
 REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
 REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction);
 REGISTER_TYPED_FUNC(GemmConvGradInput, GPU, GemmConvGradInputFunction);
 REGISTER_TYPED_FUNC(GemmConvGradFilter, GPU, GemmConvGradFilterFunction);
diff --git a/paddle/function/GemmConvOpTest.cpp b/paddle/function/GemmConvOpTest.cpp
index 5283d79a5a53d979ae4e134f7e46b7ee106e9c44..b5b5e1f35b79e422b14f7495bc321533cc1d618a 100644
--- a/paddle/function/GemmConvOpTest.cpp
+++ b/paddle/function/GemmConvOpTest.cpp
@@ -24,7 +24,7 @@ TEST(GemmConv, NaiveConv) {
       "NaiveConv-CPU", "GemmConv-CPU", forward);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(GemmConv, Forward) {
   Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
       "GemmConv-CPU", "GemmConv-GPU", forward);
diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
index acc88a553abe7ac58b629aba9b850df58cee7f81..a0a01a5fc7fc055dce6ddb3ee51c7ab18f8a4ca7 100644
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -116,7 +116,7 @@ void TestIm2ColFunctor() {
 
 TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor<DEVICE_TYPE_CPU, float>(); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
 
diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp
index 25e41edad54bec0f76a3de4799fab14241407272..704a8c41325ef86067a3bd8ed6d772b77df147c5 100644
--- a/paddle/function/MulOp.cpp
+++ b/paddle/function/MulOp.cpp
@@ -341,7 +341,7 @@ private:
 };
 
 REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(MulOp, GPU, MulFunc);
 #endif
 }  // namespace paddle
diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp
index adba7c92ece505eecc74edce6b393cf27fa10ccc..eed2f2e3089b6b6167ef7c5a7acb7ecaa08945e1 100644
--- a/paddle/function/PadOp.cpp
+++ b/paddle/function/PadOp.cpp
@@ -207,7 +207,7 @@ private:
 
 REGISTER_TYPED_FUNC(Pad, CPU, PadFunc);
 REGISTER_TYPED_FUNC(PadGrad, CPU, PadGradFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(Pad, GPU, PadFunc);
 REGISTER_TYPED_FUNC(PadGrad, GPU, PadGradFunc);
 #endif
diff --git a/paddle/function/RowConvOp.cpp b/paddle/function/RowConvOp.cpp
index b6501e8f4db7fd33891cd80e07a6f36dd0b34532..7c802d66273c6f7aa56b2f460e3dff4401967517 100644
--- a/paddle/function/RowConvOp.cpp
+++ b/paddle/function/RowConvOp.cpp
@@ -217,7 +217,7 @@ public:
 
 REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc);
 REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc);
 REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc);
 #endif
diff --git a/paddle/function/SwitchOp.cpp b/paddle/function/SwitchOp.cpp
index 01e252a8dc0cd5fa1e964efa01d04cf282b3dfe7..597723a2dded6a6a116e05b7d4c942cd633e2c99 100644
--- a/paddle/function/SwitchOp.cpp
+++ b/paddle/function/SwitchOp.cpp
@@ -132,7 +132,7 @@ public:
 
 REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc);
 REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc);
 REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc);
 #endif
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.cpp b/paddle/gserver/layers/BatchNormBaseLayer.cpp
index f7a80e23e1bd49549bec57b360587adc6b423794..bc7d1c83a48aefeb4bc6d3baa32b78aba712e58d 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "BatchNormalizationLayer.h"
 #include "Layer.h"
 #include "paddle/utils/Stat.h"
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 #include "CudnnBatchNormLayer.h"
 #endif
 
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.cpp b/paddle/gserver/layers/BatchNormalizationLayer.cpp
index 412762d38475422be98ffeb87ffcfb028c3e035f..dacff25e5927daf9c991577a71be86b160228317 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.cpp
+++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/utils/Stat.h"
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 #include "hl_batch_transpose.h"
 #endif
 #include "BatchNormalizationLayer.h"
@@ -90,7 +90,7 @@ void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) {
   size_t batchSize = in->getHeight();
   CHECK_EQ(out->getHeight(), batchSize * imgPixels_);
   if (useGpu_) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
     LOG(FATAL) << "paddle is compiled only for cpu";
 #else
     batchTranspose(
@@ -127,7 +127,7 @@ void BatchNormalizationLayer::shrinkMat(const MatrixPtr& in, MatrixPtr& out) {
   }
   CHECK_EQ(in->getHeight(), static_cast<size_t>(batchSize * imgPixels_));
   if (useGpu_) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
     LOG(FATAL) << "paddle is compiled only for cpu";
 #else
     batchTranspose(
diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp
index 96d5c54accc047b685502a178de2d290f3158731..7b932d5a76e9c4fe7cbe5882bbc19eb3de4b503a 100644
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "PoolLayer.h"
 #include "PoolProjectionLayer.h"
 #include "paddle/utils/Logging.h"
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 #include "CudnnPoolLayer.h"
 #endif
 namespace paddle {
@@ -53,7 +53,7 @@ Layer* PoolLayer::create(const LayerConfig& config) {
   const std::string& pool = config.inputs(0).pool_conf().pool_type();
   if (pool == "max-projection" || pool == "avg-projection") {
     return new PoolProjectionLayer(config);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   } else if (CudnnPoolLayer::typeCheck(pool)) {
     return new CudnnPoolLayer(config);
 #endif
diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp
index a38880e14cdfcef05461dae567d198e5400c6bb1..cd957c7c0bca4c6089cc07e8f4226b8260190f07 100644
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -674,7 +674,7 @@ void testLayerGradKernel(TestConfig testConf,
                          bool useGpu,
                          bool useWeight,
                          float epsilon) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   if (useGpu) return;
 #endif
   FLAGS_use_gpu = useGpu;
diff --git a/paddle/gserver/tests/test_BatchNorm.cpp b/paddle/gserver/tests/test_BatchNorm.cpp
index 38bcbb880d6fc8a0889cc90a18b148124b1f2a8f..41116f480957153eca33d211d09095903d6a00d9 100644
--- a/paddle/gserver/tests/test_BatchNorm.cpp
+++ b/paddle/gserver/tests/test_BatchNorm.cpp
@@ -118,7 +118,7 @@ TEST(Layer, batchNorm) {
   CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getWidth()), 576);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 void batchNormInference(int n, int c, int h, int w) {
   MatrixPtr input = std::make_shared<GpuMatrix>(n, c * h * w);
   MatrixPtr cudnnOut = std::make_shared<GpuMatrix>(n, c * h * w);
diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp
index bcc10a6197e5cfd599285eb8cb6fc30a981f5176..8634355b5206f5cde0aa0717df50ade39e173ae7 100644
--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
@@ -116,7 +116,7 @@ MatrixPtr doOneConvTest(size_t imgSize,
 }
 
 TEST(Layer, convParaUnified) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   MatrixPtr input, resultCpu, resultGpu;
 
   /// TEST1 for conv ///
diff --git a/paddle/gserver/tests/test_DetectionOutput.cpp b/paddle/gserver/tests/test_DetectionOutput.cpp
index af43dc51fad35c834635b543b1a016f6d717de1e..dc39c97a87f8b346dc9cc09d6158b1b4069bcf2d 100644
--- a/paddle/gserver/tests/test_DetectionOutput.cpp
+++ b/paddle/gserver/tests/test_DetectionOutput.cpp
@@ -150,7 +150,7 @@ TEST(Layer, detectionOutputLayerFwd) {
                            useGpu,
                            result2);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   // GPU case 1.
   useGpu = true;
   inputLoc = Matrix::create(1, 16, false, useGpu);
diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp
index 93996392d221d531f65caf465decbffdbc2d0384..62a131171fa5ae973cb3069151a582aaeac9ee0e 100644
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
@@ -51,7 +51,7 @@ void testEvaluator(TestConfig testConf,
                    string testEvaluatorName,
                    size_t batchSize,
                    bool useGpu) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   if (useGpu) return;
 #endif
   FLAGS_use_gpu = useGpu;
diff --git a/paddle/gserver/tests/test_KmaxSeqScore.cpp b/paddle/gserver/tests/test_KmaxSeqScore.cpp
index 483e382f6d3f42d0c7ce2cf194c4e02c6c4621b1..ffe5cfb8dbb55d0b70a5699969abaa101f05f9ce 100644
--- a/paddle/gserver/tests/test_KmaxSeqScore.cpp
+++ b/paddle/gserver/tests/test_KmaxSeqScore.cpp
@@ -96,7 +96,7 @@ TEST(Layer, kmaxSeqScoreLayer) {
       Matrix::create(subSeqStartPosition.back(), 1, false, false);
 
   std::vector<bool> mode = {false};
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   mode.push_back(true);
 #endif
 
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 876a935fb0784b1c8d840f23dc0014914bfbdc06..1a46fb49153a0aa4228f58db481b950bc2d6de83 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 #include <cudnn.h>
 #endif
 #include <gtest/gtest.h>
@@ -257,7 +257,7 @@ void testProjectionConv(size_t groups, bool isDeconv) {
                      true);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(Projection, conv) {
   /// test ConvProjection
   testProjectionConv(1, false);
@@ -421,7 +421,7 @@ TEST(Layer, depthwiseConvLayer) {
   //  'depthwise_conv' is a sepecial case of 'exconv' whose
   //  groups size equals to the input channels size.
   testDepthwiseConvLayer("exconv", /* useGpu= */ false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testDepthwiseConvLayer("exconv", /* useGpu= */ true);
 #endif
 }
@@ -479,7 +479,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, convLayer) {
   testConvLayer("exconv", /* trans= */ false, /* useGpu= */ false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testConvLayer("exconv", /* trans= */ false, /* useGpu= */ true);
   testConvLayer("cudnn_conv", /* trans= */ false, /* useGpu= */ true);
 #endif
@@ -524,7 +524,7 @@ TEST(Layer, convTransLayer) {
   for (auto useGpu : {false, true}) {
     testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu);
   }
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testConvTransLayer("cudnn_convt", /* trans= */ false, /* useGpu= */ true);
 #endif
 }
@@ -637,7 +637,7 @@ TEST(Layer, SelectiveFullyConnectedLayer) {
                 /* trans= */ false,
                 /* useGup= */ false,
                 false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testLayerGrad(config,
                 "selective_fc",
                 100,
@@ -1209,7 +1209,7 @@ void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
   testLayerGrad(config, "pool", 100, trans, useGpu);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
   TestConfig config;
   config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
@@ -1235,7 +1235,7 @@ TEST(Layer, PoolLayer) {
   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
   testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
@@ -1308,7 +1308,7 @@ void testPool3DLayer(const string& poolType, bool trans, bool useGpu) {
 TEST(Layer, Pool3DLayer) {
   testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ false);
   testPool3DLayer("max", /* trans= */ false, /* useGpu= */ false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ true);
   testPool3DLayer("max", /* trans= */ false, /* useGpu= */ true);
 #endif
@@ -1694,7 +1694,7 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, BatchNormalizationLayer) {
   testBatchNormLayer("batch_norm", false, false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testBatchNormLayer("batch_norm", false, true);
   if (hl_get_cudnn_lib_version() >= int(4000)) {
     testBatchNormLayer("cudnn_batch_norm", false, true);
@@ -1743,7 +1743,7 @@ void testBatchNorm3DLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, testBatchNorm3DLayer) {
   testBatchNorm3DLayer("batch_norm", false, false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testBatchNorm3DLayer("batch_norm", false, true);
   if (hl_get_cudnn_lib_version() >= int(4000)) {
     testBatchNorm3DLayer("cudnn_batch_norm", false, true);
@@ -2261,7 +2261,7 @@ void test3DConvLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, test3DConvLayer) {
   test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ true);
 #endif
 }
@@ -2338,7 +2338,7 @@ void test3DDeConvLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, test3DDeConvLayer) {
   test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ true);
 #endif
 }
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index 857d07df3e3088be28943d9e2fe58017e9e57f4a..a70b2f17f4f1130322f3c50d244f70fdcf34468b 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -215,13 +215,13 @@ struct testActDesc {
 static void getAddtoConfig(TestConfig& cfg, const testActDesc& pm) {
   cfg.biasSize = 0;
   cfg.layerConfig.set_type("addto");
-  size_t layerSize = pm.ih * pm.ih * pm.iw;
+  size_t layerSize = pm.ic * pm.ih * pm.iw;
   cfg.layerConfig.set_size(layerSize);
   cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0});
   cfg.layerConfig.add_inputs();
 }
 
-void testActivation(std::string& actType, const testActDesc& pm) {
+void testActivation(std::string actType, const testActDesc& pm) {
   // TODO(TJ): remove me when paddle support elu activation
   if (actType == "mkldnn_elu") {
     return;
@@ -240,6 +240,7 @@ TEST(MKLDNNActivation, Activations) {
   for (auto type : types) {
     /* bs, c, h, w*/
     testActivation(type, {16, 64, 32, 32});
+    testActivation(type, {2, 8, 1, 1});
   }
 }
 
diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp
index d36f72360f8ebd2033fb3e8c0e1b30911abba362..2b92211936aad1a034369bda0830bed3438cf401 100644
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@@ -243,7 +243,7 @@ TEST(Compare, concat_slice) {
   compareNetwork(config_file_a, config_file_b);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(Compare, img_pool) {
   std::string config_file_a = "./gserver/tests/img_pool_a.conf";
   std::string config_file_b = "./gserver/tests/img_pool_b.conf";
diff --git a/paddle/gserver/tests/test_PriorBox.cpp b/paddle/gserver/tests/test_PriorBox.cpp
index ae0e3bc3d24c54eb84c7b5f5053e629607ef4310..8dc5568784295b5a2e7d4decd178d612432a1a18 100644
--- a/paddle/gserver/tests/test_PriorBox.cpp
+++ b/paddle/gserver/tests/test_PriorBox.cpp
@@ -151,7 +151,7 @@ TEST(Layer, priorBoxLayerFwd) {
                     useGpu,
                     result);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   // reset the input parameters
   variance[1] = 0.1;
   variance[3] = 0.2;
diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp
index e11bf402c27898b8fdbd3fceeb8aeff8906352db..af6472619d1840e82787974d265d601b4a406c09 100644
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ b/paddle/gserver/tests/test_ProtoDataProvider.cpp
@@ -485,7 +485,7 @@ TEST(ProtoDataProvider, test) {
               // Currently in async mode, useGpu is not supported
               continue;
             }
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
             if (useGpu) {
               continue;
             }
@@ -525,7 +525,7 @@ TEST(ProtoDataProvider, constant_slots) {
       for (int numConstantSlots : {1, 2}) {
         for (int useGpu : numTwoArray) {
           for (int dataCompression : numTwoArray) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
             if (useGpu) {
               continue;
             }
@@ -708,7 +708,7 @@ TEST(ProtoSequenceDataProvider, test) {
               // Currently in async mode, useGpu is not supported
               continue;
             }
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
             if (useGpu) {
               continue;
             }
diff --git a/paddle/gserver/tests/test_PyDataProvider.cpp b/paddle/gserver/tests/test_PyDataProvider.cpp
index db883543c306c1938eb9da188ce20ed768018efb..fe54799259d86064c4fcaec0e53707247981a1b4 100644
--- a/paddle/gserver/tests/test_PyDataProvider.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider.cpp
@@ -37,7 +37,7 @@ TEST(PyDataProvider, py_fill_slots) {
   config.clear_files();
   std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList";
   config.set_files(dataFile);
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   bool useGpu = false;
 #else
   bool useGpu = true;
@@ -71,7 +71,7 @@ TEST(PyDataProvider, py_fill_nest_slots) {
   std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList";
   config.set_files(dataFile);
   EXPECT_EQ(config.IsInitialized(), true);
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   bool useGpu = false;
 #else
   bool useGpu = true;
diff --git a/paddle/gserver/tests/test_SelectiveFCLayer.cpp b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
index 1da935dfc9f328f4a3b59ee336c64a94a8b68633..d164e382c4a804aef2417135b64cf709474d12f1 100644
--- a/paddle/gserver/tests/test_SelectiveFCLayer.cpp
+++ b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
@@ -320,7 +320,7 @@ TEST(Layer, SelectiveFcLayer_train_dense_mul) {
       "filelist=gserver/tests/SelectiveFcTest/dense_mul_list";
 
   for (auto useGpu : {false, true}) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
     if (useGpu) {
       break;
     }
@@ -387,7 +387,7 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config,
                           outMatSelfc->getWidth(),
                           outMatSelfc->getElementCnt()));
   cpuOutMatSelfc->copyFrom(*outMatSelfc, HPPL_STREAM_DEFAULT);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   if (useGpu) {
     hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
@@ -417,7 +417,7 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config,
   MatrixPtr cpuOutMatFc(
       new CpuMatrix(outMatFc->getHeight(), outMatFc->getWidth()));
   cpuOutMatFc->copyFrom(*outMatFc, HPPL_STREAM_DEFAULT);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   if (useGpu) {
     hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
@@ -442,7 +442,7 @@ TEST(Layer, SelectiveFcLayer_train_sparse_mul) {
   selLayerConfig.set_size(fcLayerWidth);
 
   testSelectiveFcLayerTrainSparseMul(selLayerConfig, false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testSelectiveFcLayerTrainSparseMul(selLayerConfig, true);
 #endif
 }
diff --git a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
index 8e04ccd16a8860f61de4ef924657cd484d074294..3dbffc563462973bdc1da529d486b2a2d5a677d3 100644
--- a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
+++ b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
@@ -194,7 +194,7 @@ TEST(Layer, SeqSliceLayer) {
   vector<vector<real>> ends;
 
   std::vector<bool> mode = {false};
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   mode.push_back(true);
 #endif
   genSeqInfo(seqStartPos, subSeqStartPos);
diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp
index 55427e2f12fd7b77c6eea1f65b3229e6fd29d71d..da829460061d38f363317e33daeb65cfa705bb8e 100644
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
@@ -199,7 +199,7 @@ TEST(Layer, WarpCTCLayer) {
     for (auto batchSize : {1, 10, 32}) {
       for (auto normByTimes : {false, true}) {
         for (auto useGpu : {false, true}) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
           if (useGpu) continue;
 #endif
           LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 0023b4d0f5da500f380ecb836b7c54e050b13d67..c3e34d5309d9ca8a32d7b0a8043e668cdb5be54b 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -670,7 +670,7 @@ void GpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) {
 }
 
 void GpuMatrix::selectRows(Matrix& table, IVector& ids) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   CHECK(dynamic_cast<GpuMatrix*>(&table));
   CHECK(table.useGpu());
   CHECK(ids.useGpu());
@@ -694,7 +694,7 @@ void GpuMatrix::selectRows(Matrix& table, IVector& ids) {
 }
 
 void GpuMatrix::addToRows(Matrix& table, IVector& ids) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   CHECK(dynamic_cast<GpuMatrix*>(&table));
   CHECK(table.useGpu());
   CHECK(ids.useGpu());
@@ -741,7 +741,7 @@ void GpuMatrix::rowMax(Matrix& max) {
 }
 
 void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal";
   size_t numSamples = getHeight();
   size_t beam = maxVal.getWidth();
diff --git a/paddle/math/RowBuffer.h b/paddle/math/RowBuffer.h
index dbb829c4e24a659e4a97c0a3ba4c5c78b68815d3..9ef5b89680b00981188d78cb312dc75e2c0a79ee 100644
--- a/paddle/math/RowBuffer.h
+++ b/paddle/math/RowBuffer.h
@@ -99,7 +99,11 @@ public:
   /**
    * @brief clear local buffer. It only affect auto-growth buffer.
    */
-  inline void clear() { rowStore_.clear(); }
+  inline void clear() {
+    // Swap with an empty vector to release memory; clear() keeps the capacity.
+    std::vector<real, AlignedAllocator<real, 32>> empty;
+    rowStore_.swap(empty);
+  }
 
   /**
    * @brief get current number of rows.
diff --git a/paddle/math/SparseMatrix.cpp b/paddle/math/SparseMatrix.cpp
index 6370c77386688a334fa0de8b4e2b272882e9e2b0..284b68d590ba655395c0186d8ea86d6855c6fc50 100644
--- a/paddle/math/SparseMatrix.cpp
+++ b/paddle/math/SparseMatrix.cpp
@@ -836,7 +836,7 @@ void GpuSparseMatrix::zeroMem() {
 }
 
 void GpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal";
   size_t numSamples = getHeight();
   size_t beam = maxVal.getWidth();
diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp
index eb87ee9bb7936d27c0c32a1a4b35ff49871c0a10..ff72672e3ab77212b309fcfea835839a916fa632 100644
--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
@@ -172,7 +172,7 @@ void GpuVectorT<T>::isEqualTo(const VectorT<T>& b, const T& value) {
 
 template <class T>
 void GpuVectorT<T>::selectFrom(const VectorT<T>& src, const VectorT<int>& ids) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   hl_vector_select_from<T>(this->getData(),
                            this->getSize(),
                            src.getData(),
@@ -850,7 +850,7 @@ CpuGpuVectorT<T>::CpuGpuVectorT(CpuGpuVectorT<T>& src,
                                 size_t size)
     : sync_(nullptr) {
   CHECK_LE(offset + size, static_cast<size_t>(src.getSize()));
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   SyncedFlag* flag = src.getSync();
   if (*flag == DATA_AT_CPU) {
     src.copyToGpu();  // will set synchronous data between CPU and GPU
@@ -861,7 +861,7 @@ CpuGpuVectorT<T>::CpuGpuVectorT(CpuGpuVectorT<T>& src,
   auto cMemHandle = (src.getVector(false))->getMemoryHandle();
   cpuVectorT_ = std::make_shared<CpuVectorT<T>>(
       size, std::dynamic_pointer_cast<CpuMemoryHandle>(cMemHandle), offset);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   auto gMemHandle = (src.getVector(true))->getMemoryHandle();
   gpuVectorT_ = std::make_shared<GpuVectorT<T>>(
       size, std::dynamic_pointer_cast<GpuMemoryHandle>(gMemHandle), offset);
diff --git a/paddle/math/tests/test_Allocator.cpp b/paddle/math/tests/test_Allocator.cpp
index 1ca70ea84c867b83013625eaee141f5b75fad4ae..1fecf659e5080c7d25f5f76b92b15f75eaab6ce3 100644
--- a/paddle/math/tests/test_Allocator.cpp
+++ b/paddle/math/tests/test_Allocator.cpp
@@ -68,7 +68,7 @@ void testPoolAllocator() {
 
 TEST(Allocator, Pool) {
   testPoolAllocator<CpuAllocator>();
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testPoolAllocator<GpuAllocator>();
 #endif
 }
@@ -92,7 +92,7 @@ TEST(MemoryHandle, Cpu) {
   EXPECT_EQ(ptr1, ptr2);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(MemoryHandle, Gpu) {
   int numGpu = hl_get_device_count();
 
diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp
index 22ce39701fca7b650fc03794cb0701e0987d2dae..1766257860b0b13e9f0ce898438e7c2d644f545e 100644
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 /**
  * This test file use autotest::AutoCompare and cmpWithoutArg to compares the
  * implementation of CPU and GPU member function in
diff --git a/paddle/math/tests/test_CpuGpuVector.cpp b/paddle/math/tests/test_CpuGpuVector.cpp
index 58bc43a38ba9465a832fcd0652e6309c403577e3..c72f89c8244b1209e490b09387c2ee6352426ce1 100644
--- a/paddle/math/tests/test_CpuGpuVector.cpp
+++ b/paddle/math/tests/test_CpuGpuVector.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 #include <gtest/gtest.h>
 #include "paddle/math/Vector.h"
diff --git a/paddle/math/tests/test_ExecViaCpu.cpp b/paddle/math/tests/test_ExecViaCpu.cpp
index 04c856453d2ec4ad764e37ae430e3e30ac0dea0b..25e0ba11ded96dd78aedc3c297507d0555d80d74 100644
--- a/paddle/math/tests/test_ExecViaCpu.cpp
+++ b/paddle/math/tests/test_ExecViaCpu.cpp
@@ -94,7 +94,7 @@ void testWrapper(F&& f) {
   }
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(ExecViaCpu, test1) {
   testWrapper(f);
   testWrapper(&f);
diff --git a/paddle/math/tests/test_GpuProfiler.cpp b/paddle/math/tests/test_GpuProfiler.cpp
index e6b5dba446b5a0022ade76b188895c4e0e2a22b4..9402bd3ec48fbed381ef1f676e8b179cabd4cb9f 100644
--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ b/paddle/math/tests/test_GpuProfiler.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 #include <gtest/gtest.h>
 #include "paddle/math/Matrix.h"
diff --git a/paddle/math/tests/test_Matrix.cpp b/paddle/math/tests/test_Matrix.cpp
index 1c21da5b76e95603258a5006d0c57b00126e65b9..2f99fa3581e14b91acc0b294856619f4ae2b3483 100644
--- a/paddle/math/tests/test_Matrix.cpp
+++ b/paddle/math/tests/test_Matrix.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 /**
  * This test file use autotest::AutoCompare and cmpWithArg to compares the
  * implementation of CPU and GPU member function in Matrix.cpp.
diff --git a/paddle/math/tests/test_SparseMatrix.cpp b/paddle/math/tests/test_SparseMatrix.cpp
index c0572dfdbf738a4dfad04811b3a3e1b65487ff6d..8abbe8d82e02b7d1738fe7e6d0c8d494166e7892 100644
--- a/paddle/math/tests/test_SparseMatrix.cpp
+++ b/paddle/math/tests/test_SparseMatrix.cpp
@@ -47,7 +47,7 @@ struct MatrixPara {
   SparseFormat format;
 };
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 void test_sparse_matrix_mul(MatrixPara paraA,
                             MatrixPara paraB,
                             MatrixPara paraC) {
@@ -452,7 +452,7 @@ TEST(Matrix, SparseMatrixCSRFormatTrimFrom) {
   matB->trimFrom(*mat);
   checkSMatrixEqual2(matA, matB);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
       height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, true);
   matC->trimFrom(*mat);
@@ -546,7 +546,7 @@ TEST(Matrix, SparseMatrixCSCFormatTrimFrom) {
   matB->trimFrom(*mat);
   checkSMatrixEqual2(matA, matB);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
       height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, true);
   matC->trimFrom(*mat);
diff --git a/paddle/math/tests/test_Tensor.cu b/paddle/math/tests/test_Tensor.cu
index 31b693afa8bd50f77a8efb67769e6215dd755bd3..d03698dee25fdd6dd49f2a3fdb5c605333440f49 100644
--- a/paddle/math/tests/test_Tensor.cu
+++ b/paddle/math/tests/test_Tensor.cu
@@ -270,7 +270,7 @@ TEST(Unary, BaseOp) {
   TestUnaryVectorT<CpuIVector, int> testCpuIVector(
       testUnaryBaseOpInt<CpuIVector>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestUnaryMatrix<GpuMatrix> testGpuMatrix(testUnaryBaseOp<GpuMatrix>);
   TestUnaryVectorT<GpuVector, real> testGpuVector(testUnaryBaseOp<GpuVector>);
   TestUnaryVectorT<GpuIVector, int> testGpuIVector(
@@ -317,7 +317,7 @@ void testUnayrMathOp(Tensor& A1, Tensor& A2) {
 TEST(Unary, MathOp) {
   TestUnaryMatrix<CpuMatrix> testCpu(testUnayrMathOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestUnaryMatrix<GpuMatrix> testGpu(testUnayrMathOp<GpuMatrix>);
 #endif
 }
@@ -374,7 +374,7 @@ void testUnayrCompareOp(Tensor& A1, Tensor& A2) {
 TEST(Unary, CompareOp) {
   TestUnaryMatrix<CpuMatrix> testCpu(testUnayrCompareOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestUnaryMatrix<GpuMatrix> testGpu(testUnayrCompareOp<GpuMatrix>);
 #endif
 }
@@ -536,7 +536,7 @@ void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) {
 TEST(Binary, BaseOp) {
   TestBinaryMatrix<CpuMatrix> testCpu(testBinaryBaseOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestBinaryMatrix<GpuMatrix> testGpu(testBinaryBaseOp<GpuMatrix>);
 #endif
 }
@@ -710,7 +710,7 @@ void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) {
 TEST(Binary, MathOp) {
   TestBinaryMatrix<CpuMatrix> testCpu(testBinaryMathOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestBinaryMatrix<GpuMatrix> testGpu(testBinaryMathOp<GpuMatrix>);
 #endif
 }
@@ -810,7 +810,7 @@ void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) {
 TEST(Binary, CompareOp) {
   TestBinaryMatrix<CpuMatrix> testCpu(testBinaryCompareOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestBinaryMatrix<GpuMatrix> testGpu(testBinaryCompareOp<GpuMatrix>);
 #endif
 }
@@ -955,7 +955,7 @@ void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
 TEST(Ternary, BaseOp) {
   TestTernaryMatrix<CpuMatrix> testCpu(testTernaryBaseOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestTernaryMatrix<GpuMatrix> testGpu(testTernaryBaseOp<GpuMatrix>);
 #endif
 }
@@ -1058,7 +1058,7 @@ void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
 TEST(Ternary, CompareOp) {
   TestTernaryMatrix<CpuMatrix> testCpu(testTernaryCompareOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestTernaryMatrix<GpuMatrix> testGpu(testTernaryCompareOp<GpuMatrix>);
 #endif
 }
@@ -1086,7 +1086,7 @@ void testQuaternaryAdd(
 TEST(Quaternary, BaseOp) {
   TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryAdd<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryAdd<GpuMatrix>);
 #endif
 }
@@ -1156,7 +1156,7 @@ void testQuaternaryCompareOp(
 TEST(Quaternary, CompareOp) {
   TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryCompareOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>);
 #endif
 }
diff --git a/paddle/math/tests/test_TrainingAlgorithm.cpp b/paddle/math/tests/test_TrainingAlgorithm.cpp
index 4a88844b43ef40af988d2b391d2bef4568dea9b7..5ae0aa036f6bfc1e5bd4e955277c4efff8c739ce 100644
--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
@@ -91,7 +91,7 @@ int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) {
 typedef std::function<void(size_t size, bool useGpu)> testMatrixFunc;
 
 void testCase(testMatrixFunc matrixFunc) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   for (auto useGpu : {false, true}) {
 #else
   for (auto useGpu : {false}) {
diff --git a/paddle/math/tests/test_batchTranspose.cpp b/paddle/math/tests/test_batchTranspose.cpp
index 4eb9837909ffaaf0f483ab65ece7a0b29fd49319..b70a61976402fd0a7cfee8382fd926fcf28486d5 100644
--- a/paddle/math/tests/test_batchTranspose.cpp
+++ b/paddle/math/tests/test_batchTranspose.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 
 using namespace paddle;  // NOLINT
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(MatrixBatchTransTest, test_batch_matrix_transpose) {
   const int nx = 100;
   const int ny = 50;
diff --git a/paddle/math/tests/test_lazyAssign.cu b/paddle/math/tests/test_lazyAssign.cu
index 92afab4ff7f5ff4acc219c5ac783733340c5726a..04f23cff55db45c39049538545430bc5996cce5d 100644
--- a/paddle/math/tests/test_lazyAssign.cu
+++ b/paddle/math/tests/test_lazyAssign.cu
@@ -72,7 +72,7 @@ void testLazyAssign(int height, int width) {
 
 TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign<CpuMatrix>); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign<GpuMatrix>); }
 #endif
 
@@ -142,6 +142,6 @@ void testSgdUpdate(int height, int width) {
 
 TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate<CpuMatrix>); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate<GpuMatrix>); }
 #endif
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 061fb22e3fd744d9d9895fd1008089e4a6ce6a0f..7e5a1db44a5302e3b4e5d2768755824666e880ba 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 /// This unittest checks GpuMatrix/CpuMatrix get same result, so disable when
 /// only cpu version.
 
diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/math/tests/test_perturbation.cpp
index 60ebae015381a3901c14d0cd4c1225e54ac5726f..c7c07c817a08d78ddcbf8218e8c4a9d22f4990bc 100644
--- a/paddle/math/tests/test_perturbation.cpp
+++ b/paddle/math/tests/test_perturbation.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 #include <cuda_runtime.h>
 #include <gtest/gtest.h>
diff --git a/paddle/math/tests/test_sparseMatrixCompare.cpp b/paddle/math/tests/test_sparseMatrixCompare.cpp
index a9185a4b24b13ca0287b0f67375c4599e8b9ac78..2b2a391b9d04a9f7fa4986a6b6dd5cd8e5385f1f 100644
--- a/paddle/math/tests/test_sparseMatrixCompare.cpp
+++ b/paddle/math/tests/test_sparseMatrixCompare.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 /// This unittest checks GpuSparseMatrix/CpuSparseMatrix get same result,
 //  so disable when
 /// only cpu version.
diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index bb44970109c05d239e6b92d90b2079b752fa0104..fdc5ed19dc2973e744676c3b795c8ab86da58590 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -175,7 +175,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
 }
 
 BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   if (system_allocator_->UseGpu()) {
     if ((total_used_ + total_free_) == 0) {
       // Compute the maximum allocation size for the first allocation.
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
index a270bd59581520859d43cddd2fc0cfa72080f46d..6c9a46dd09c15347fca1a30971e7e732d887bc8e 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -62,7 +62,7 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
 
 bool CPUAllocator::UseGpu() const { return false; }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 void* GPUAllocator::Alloc(size_t& index, size_t size) {
   // CUDA documentation doesn't explain if cudaMalloc returns nullptr
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index 82ba322e057575c460b1d51d719c9b0fa459273e..ee9b012f91a9647839cf465c4074082f2d3509a6 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -40,7 +40,7 @@ class CPUAllocator : public SystemAllocator {
   virtual bool UseGpu() const;
 };
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 class GPUAllocator : public SystemAllocator {
  public:
   virtual void* Alloc(size_t& index, size_t size);
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
index ba44e06ddb68e92e4086a8006b868557b0c89b50..cd563844e7fa23241bb0bb56d1365ef34826c4a8 100644
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -56,7 +56,7 @@ TEST(CPUAllocator, LockMem) {
   TestAllocator(a, 0);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(GPUAllocator, Alloc) {
   paddle::memory::detail::GPUAllocator a;
   TestAllocator(a, 2048);
diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc
index c96a697a7e022684688b31c05da43e52812100d8..790420a8ab41b1a61ee35dc086c8b95fa1a02019 100644
--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
@@ -26,7 +26,7 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
   std::memcpy(dst, src, num);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 template <>
 void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
                                                   void* dst,
diff --git a/paddle/memory/memcpy.h b/paddle/memory/memcpy.h
index 2b9c0eada6e8406fc81baec7f331a8dd5b8b0ec1..0bccee58c3a22379c75523467e0c717b98b08bcf 100644
--- a/paddle/memory/memcpy.h
+++ b/paddle/memory/memcpy.h
@@ -33,7 +33,7 @@ namespace memory {
 template <typename DstPlace, typename SrcPlace>
 void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 /**
  * \brief   Copy memory from one place to another place.
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 29bc26f9d3bca0e30896657431f9a9bb1dac0d1d..30ce8a82e16ed26a41b009ce5d52dd1a2a1b7c21 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -62,7 +62,7 @@ size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
   return GetCPUBuddyAllocator()->Used();
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   using BuddyAllocVec = std::vector<BuddyAllocator*>;
@@ -77,7 +77,7 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
 
   // GPU buddy allocator initialization
   std::call_once(gpu_allocator_flag, [&]() {
-    int gpu_num = platform::GetDeviceCount();
+    int gpu_num = platform::GetCUDADeviceCount();
     allocators.reserve(gpu_num);
     for (int gpu = 0; gpu < gpu_num; gpu++) {
       platform::SetDeviceId(gpu);
diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc
index 53cc63a098d0802479e3a371717adb7596c249ed..0d402038a06f4ad93fd15946fc44aaeac58ada40 100644
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
@@ -80,7 +80,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
   }
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 size_t align(size_t size, paddle::platform::GPUPlace place) {
   size += sizeof(paddle::memory::detail::Metadata);
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 21166354937c378dc3f295f9011d034eb24cfc7c..0fa1fca2bcd3117e1e9a6a54c343b2d0d8c3822b 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -55,6 +55,12 @@ function(op_library TARGET)
         set(pybind_flag 1)
     endif()
 
+    if ("${TARGET}" STREQUAL "pool_op")
+        set(pybind_flag 1)
+        # It's enough to add just one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
+    endif()
+
     # activation_op contains several operators
     if ("${TARGET}" STREQUAL "activation_op")
         set(pybind_flag 1)
@@ -97,12 +103,16 @@ set(DEPS_OPS
     recurrent_op
     cond_op
     cross_entropy_op
-    softmax_with_cross_entropy_op)
+    softmax_with_cross_entropy_op
+    sum_op)
+
+
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
   DEPS framework_proto tensor net_op)
 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
-op_library(cross_entropy_op DEPS cross_entropy_function)
-op_library(softmax_with_cross_entropy_op DEPS cross_entropy_function softmax_function)
+op_library(cross_entropy_op DEPS cross_entropy)
+op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
+op_library(sum_op DEPS net_op)
 
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc
index 1e1d3cf7f7634e2e5a433025f175202bd6c4b40e..66e9d2c40138c26975f07cb544e54de6f00d6b09 100644
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -69,6 +69,22 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
+template <typename AttrType>
+class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LeakyReluOpMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of LeakyRelu operator");
+    AddOutput("Y", "Output of LeakyRelu operator");
+    AddComment(
+        "LeakyRelu activation operator, "
+        "leaky_relu = max(x, alpha * x)");
+    AddAttr<AttrType>("alpha", "The small negative slope")
+        .SetDefault(static_cast<AttrType>(0.02f));
+  }
+};
+
 class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   TanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
@@ -81,6 +97,17 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
+class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  TanhShrinkOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of TanhShrink operator");
+    AddOutput("Y", "Output of TanhShrink operator");
+    AddComment("TanhShrink activation operator, tanhshrink(x) = x - tanh(x)");
+  }
+};
+
 class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   SqrtOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
@@ -206,120 +233,63 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+
 REGISTER_OP(sigmoid, ops::ActivationOp, ops::SigmoidOpMaker, sigmoid_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(sigmoid,
-                       ops::ActivationKernel<paddle::platform::CPUPlace, float,
-                                             ops::SigmoidFunctor<float>>);
-REGISTER_OP_CPU_KERNEL(
-    sigmoid_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                                            ops::SigmoidGradFunctor<float>>);
 
 REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    exp,
-    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::ExpFunctor>);
-REGISTER_OP_CPU_KERNEL(exp_grad,
-                       ops::ActivationGradKernel<paddle::platform::CPUPlace,
-                                                 float, ops::ExpGradFunctor>);
 
 REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(relu,
-                       ops::ActivationKernel<paddle::platform::CPUPlace, float,
-                                             ops::ReluFunctor<float>>);
-REGISTER_OP_CPU_KERNEL(
-    relu_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                                         ops::ReluGradFunctor<float>>);
 
 REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    tanh,
-    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::TanhFunctor>);
-REGISTER_OP_CPU_KERNEL(
-    tanh_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                                         ops::TanhGradFunctor<float>>);
+
+REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
+            tanh_shrink_grad, ops::ActivationOpGrad);
 
 REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    sqrt,
-    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::SqrtFunctor>);
-REGISTER_OP_CPU_KERNEL(
-    sqrt_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                                         ops::SqrtGradFunctor<float>>);
 
 REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    abs,
-    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::AbsFunctor>);
-REGISTER_OP_CPU_KERNEL(abs_grad,
-                       ops::ActivationGradKernel<paddle::platform::CPUPlace,
-                                                 float, ops::AbsGradFunctor>);
 
 REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker,
             reciprocal_grad, ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(reciprocal,
-                       ops::ActivationKernel<paddle::platform::CPUPlace, float,
-                                             ops::ReciprocalFunctor<float>>);
-REGISTER_OP_CPU_KERNEL(
-    reciprocal_grad,
-    ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                              ops::ReciprocalGradFunctor<float>>);
 
 REGISTER_OP(log, ops::ActivationOp, ops::LogOpMaker, log_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    log,
-    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::LogFunctor>);
-REGISTER_OP_CPU_KERNEL(
-    log_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                                        ops::LogGradFunctor<float>>);
 
 REGISTER_OP(square, ops::ActivationOp, ops::SquareOpMaker, square_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(square,
-                       ops::ActivationKernel<paddle::platform::CPUPlace, float,
-                                             ops::SquareFunctor>);
-REGISTER_OP_CPU_KERNEL(
-    square_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                                           ops::SquareGradFunctor<float>>);
 
 REGISTER_OP(softsign, ops::ActivationOp, ops::SoftsignOpMaker, softsign_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(softsign,
-                       ops::ActivationKernel<paddle::platform::CPUPlace, float,
-                                             ops::SoftsignFunctor<float>>);
-REGISTER_OP_CPU_KERNEL(
-    softsign_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                                             ops::SoftsignGradFunctor<float>>);
 
 REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker<float>, brelu_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(brelu,
-                       ops::BReluKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(brelu_grad,
-                       ops::BReluGradKernel<paddle::platform::CPUPlace, float>);
+
+REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker<float>,
+            leaky_relu_grad, ops::ActivationOpGrad);
 
 REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker<float>,
             soft_relu_grad, ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(soft_relu,
-                       ops::SoftReluKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    soft_relu_grad, ops::SoftReluGradKernel<paddle::platform::CPUPlace, float>);
 
 REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker<float>, pow_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(pow, ops::PowKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(pow_grad,
-                       ops::PowGradKernel<paddle::platform::CPUPlace, float>);
 
 REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker<float>, stanh_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(stanh,
-                       ops::STanhKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(stanh_grad,
-                       ops::STanhGradKernel<paddle::platform::CPUPlace, float>);
+
+#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)        \
+  REGISTER_OP_CPU_KERNEL(                                                      \
+      act_type,                                                                \
+      paddle::operators::ActivationKernel<paddle::platform::CPUPlace,          \
+                                          paddle::operators::functor<float>>); \
+  REGISTER_OP_CPU_KERNEL(act_type##_grad,                                      \
+                         paddle::operators::ActivationGradKernel<              \
+                             paddle::platform::CPUPlace,                       \
+                             paddle::operators::grad_functor<float>>);
+
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
diff --git a/paddle/operators/activation_op.cu b/paddle/operators/activation_op.cu
index 56886d8b1b93a19e9a01798ef79e89f9b5d6fca1..93e9f1c694bacba48c4f8c46f90fb5b512bead99 100644
--- a/paddle/operators/activation_op.cu
+++ b/paddle/operators/activation_op.cu
@@ -15,93 +15,14 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/activation_op.h"
 
-namespace ops = paddle::operators;
-
-REGISTER_OP_GPU_KERNEL(sigmoid,
-                       ops::ActivationKernel<paddle::platform::GPUPlace, float,
-                                             ops::SigmoidFunctor<float>>);
-REGISTER_OP_GPU_KERNEL(
-    sigmoid_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                                            ops::SigmoidGradFunctor<float>>);
-
-REGISTER_OP_GPU_KERNEL(
-    exp,
-    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::ExpFunctor>);
-REGISTER_OP_GPU_KERNEL(exp_grad,
-                       ops::ActivationGradKernel<paddle::platform::GPUPlace,
-                                                 float, ops::ExpGradFunctor>);
-REGISTER_OP_GPU_KERNEL(relu,
-                       ops::ActivationKernel<paddle::platform::GPUPlace, float,
-                                             ops::ReluFunctor<float>>);
-REGISTER_OP_GPU_KERNEL(
-    relu_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                                         ops::ReluGradFunctor<float>>);
-
-REGISTER_OP_GPU_KERNEL(
-    tanh,
-    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::TanhFunctor>);
-REGISTER_OP_GPU_KERNEL(
-    tanh_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                                         ops::TanhGradFunctor<float>>);
-
-REGISTER_OP_GPU_KERNEL(
-    sqrt,
-    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::SqrtFunctor>);
-REGISTER_OP_GPU_KERNEL(
-    sqrt_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                                         ops::SqrtGradFunctor<float>>);
-
-REGISTER_OP_GPU_KERNEL(
-    abs,
-    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::AbsFunctor>);
-REGISTER_OP_GPU_KERNEL(abs_grad,
-                       ops::ActivationGradKernel<paddle::platform::GPUPlace,
-                                                 float, ops::AbsGradFunctor>);
-
-REGISTER_OP_GPU_KERNEL(reciprocal,
-                       ops::ActivationKernel<paddle::platform::GPUPlace, float,
-                                             ops::ReciprocalFunctor<float>>);
-REGISTER_OP_GPU_KERNEL(
-    reciprocal_grad,
-    ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                              ops::ReciprocalGradFunctor<float>>);
-
-REGISTER_OP_GPU_KERNEL(
-    log,
-    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::LogFunctor>);
-REGISTER_OP_GPU_KERNEL(
-    log_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                                        ops::LogGradFunctor<float>>);
-
-REGISTER_OP_GPU_KERNEL(square,
-                       ops::ActivationKernel<paddle::platform::GPUPlace, float,
-                                             ops::SquareFunctor>);
-REGISTER_OP_GPU_KERNEL(
-    square_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                                           ops::SquareGradFunctor<float>>);
-
-REGISTER_OP_GPU_KERNEL(softsign,
-                       ops::ActivationKernel<paddle::platform::GPUPlace, float,
-                                             ops::SoftsignFunctor<float>>);
-REGISTER_OP_GPU_KERNEL(
-    softsign_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                                             ops::SoftsignGradFunctor<float>>);
-
-REGISTER_OP_GPU_KERNEL(brelu,
-                       ops::BReluKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(brelu_grad,
-                       ops::BReluGradKernel<paddle::platform::GPUPlace, float>);
-
-REGISTER_OP_GPU_KERNEL(soft_relu,
-                       ops::SoftReluKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    soft_relu_grad, ops::SoftReluGradKernel<paddle::platform::GPUPlace, float>);
-
-REGISTER_OP_GPU_KERNEL(pow, ops::PowKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(pow_grad,
-                       ops::PowGradKernel<paddle::platform::GPUPlace, float>);
-
-REGISTER_OP_GPU_KERNEL(stanh,
-                       ops::STanhKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(stanh_grad,
-                       ops::STanhGradKernel<paddle::platform::GPUPlace, float>);
+#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor)        \
+  REGISTER_OP_GPU_KERNEL(                                                      \
+      act_type,                                                                \
+      paddle::operators::ActivationKernel<paddle::platform::GPUPlace,          \
+                                          paddle::operators::functor<float>>); \
+  REGISTER_OP_GPU_KERNEL(act_type##_grad,                                      \
+                         paddle::operators::ActivationGradKernel<              \
+                             paddle::platform::GPUPlace,                       \
+                             paddle::operators::grad_functor<float>>);
+
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_GPU_KERNEL);
diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h
index b9f52e1af3958b247e4854389cb467e2fce25e27..245060174224c5e24f75adf4ddc9a6db29101d74 100644
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -19,9 +19,12 @@
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T, typename Functor>
-class ActivationKernel : public framework::OpKernel<T> {
+template <typename Place, typename Functor>
+class ActivationKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
  public:
+  using T = typename Functor::ELEMENT_TYPE;
+
   void Compute(const framework::ExecutionContext& context) const override {
     auto* X = context.Input<framework::Tensor>("X");
     auto* Y = context.Output<framework::Tensor>("Y");
@@ -31,13 +34,20 @@ class ActivationKernel : public framework::OpKernel<T> {
     auto y = framework::EigenVector<T>::Flatten(*Y);
     auto place = context.GetEigenDevice<Place>();
     Functor functor;
+
+    auto attrs = functor.GetAttrs();
+    for (auto& attr : attrs) {
+      *attr.second = context.Attr<float>(attr.first);
+    }
     functor(place, x, y);
   }
 };
 
-template <typename Place, typename T, typename Functor>
-class ActivationGradKernel : public framework::OpKernel<T> {
+template <typename Place, typename Functor>
+class ActivationGradKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
  public:
+  using T = typename Functor::ELEMENT_TYPE;
   void Compute(const framework::ExecutionContext& context) const override {
     auto* X = context.Input<framework::Tensor>("X");
     auto* Y = context.Input<framework::Tensor>("Y");
@@ -51,159 +61,228 @@ class ActivationGradKernel : public framework::OpKernel<T> {
     auto dx = framework::EigenVector<T>::Flatten(*dX);
     auto place = context.GetEigenDevice<Place>();
     Functor functor;
+    auto attrs = functor.GetAttrs();
+    for (auto& attr : attrs) {
+      *attr.second = context.Attr<float>(attr.first);
+    }
     functor(place, x, y, dy, dx);
   }
 };
 
+template <typename T>
+struct BaseActivationFunctor {
+  using ELEMENT_TYPE = T;
+
+  using AttrPair = std::vector<std::pair<const char*, float*>>;
+
+  AttrPair GetAttrs() { return AttrPair(); }
+};
+
 // sigmoid(x) = 1 / (1 + exp(-x))
 template <typename T>
-struct SigmoidFunctor {
+struct SigmoidFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
     y.device(d) = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
   }
 };
 
 template <typename T>
-struct SigmoidGradFunctor {
+struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
     dx.device(d) = dy * y * (static_cast<T>(1) - y);
   }
 };
 
 // exp(x) = e^x
-struct ExpFunctor {
+template <typename T>
+struct ExpFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
     y.device(d) = x.exp();
   }
 };
 
-struct ExpGradFunctor {
+template <typename T>
+struct ExpGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
     dx.device(d) = dy * y;
   }
 };
 
 // relu(x) = max(x, 0)
 template <typename T>
-struct ReluFunctor {
+struct ReluFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
     y.device(d) = x.cwiseMax(static_cast<T>(0));
   }
 };
 
 template <typename T>
-struct ReluGradFunctor {
+struct ReluGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
     dx.device(d) = dy * (x > static_cast<T>(0)).template cast<T>();
   }
 };
 
 // tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
-struct TanhFunctor {
+template <typename T>
+struct TanhFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
     y.device(d) = x.tanh();
   }
 };
 
 template <typename T>
-struct TanhGradFunctor {
+struct TanhGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
     dx.device(d) = dy * (static_cast<T>(1) - y * y);
   }
 };
 
+// tanhshrink(x) = x - tanh(x)
+// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
+template <typename T>
+struct TanhShrinkFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x - x.tanh();
+  }
+};
+
+template <typename T>
+struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    dx.device(d) = dy * (x.tanh() * x.tanh());
+  }
+};
+
 // sqrt(x) = x^(1/2)
-struct SqrtFunctor {
+template <typename T>
+struct SqrtFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
     y.device(d) = x.sqrt();
   }
 };
 
 template <typename T>
-struct SqrtGradFunctor {
+struct SqrtGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
     const Y y_conj = Eigen::numext::conj(y);
     dx.device(d) = static_cast<T>(0.5) * dy / y_conj;
   }
 };
 
 // abs(x) = |x|
-struct AbsFunctor {
+template <typename T>
+struct AbsFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
     y.device(d) = x.abs();
   }
 };
 
-struct AbsGradFunctor {
+template <typename T>
+struct AbsGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
     dx.device(d) = dy * x.sign();
   }
 };
 
 // reciprocal(x) = 1 / x
 template <typename T>
-struct ReciprocalFunctor {
+struct ReciprocalFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
     y.device(d) = static_cast<T>(1) / x;
   }
 };
 
 template <typename T>
-struct ReciprocalGradFunctor {
+struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
     dx.device(d) = dy * static_cast<T>(-1) * y * y;
   }
 };
 
 // log(x) = natural logarithm of x
-struct LogFunctor {
+template <typename T>
+struct LogFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
     y.device(d) = x.log();
   }
 };
 
 template <typename T>
-struct LogGradFunctor {
+struct LogGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
     dx.device(d) = dy * (static_cast<T>(1) / x);
   }
 };
 
 // square(x) = x^2
-struct SquareFunctor {
+template <typename T>
+struct SquareFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
     y.device(d) = x.square();
   }
 };
 
 template <typename T>
-struct SquareGradFunctor {
+struct SquareGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
     dx.device(d) = dy * static_cast<T>(2) * x;
   }
 };
 
+template <typename T>
+struct BReluFunctor : public BaseActivationFunctor<T> {
+  float t_min;
+  float t_max;
+
+  // NOTE: This explicitly hides `BaseActivationFunctor<T>::GetAttrs`;
+  // it is name hiding rather than virtual dispatch, for speed.
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"t_min", &t_min}, {"t_max", &t_max}};
+  }
+
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x.cwiseMax(t_min).cwiseMin(t_max);
+  }
+};
+
+template <typename T>
+struct BReluGradFunctor : public BaseActivationFunctor<T> {
+  float t_min;
+  float t_max;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"t_min", &t_min}, {"t_max", &t_max}};
+  }
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    dx.device(d) = dy * ((x > t_min) * (x < t_max)).template cast<T>();
+  }
+};
+
 // softsign(x) = x / (1 + |x|)
 template <typename T>
-struct SoftsignFunctor {
+struct SoftsignFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y>
   void operator()(Device d, X x, Y y) {
     y.device(d) = x / (static_cast<T>(1) + x.abs());
@@ -213,7 +292,7 @@ struct SoftsignFunctor {
 // d(softsign(x))/dx = 1 / (1 + |x|)^2
 // Taken from https://en.wikipedia.org/wiki/Activation_function
 template <typename T>
-struct SoftsignGradFunctor {
+struct SoftsignGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y, typename dY, typename dX>
   void operator()(Device d, X x, Y y, dY dy, dX dx) {
     dx.device(d) =
@@ -221,153 +300,130 @@ struct SoftsignGradFunctor {
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
-class BReluKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* Y = context.Output<framework::Tensor>("Y");
-    auto t_min = static_cast<T>(context.Attr<AttrType>("t_min"));
-    auto t_max = static_cast<T>(context.Attr<AttrType>("t_max"));
-    Y->mutable_data<T>(context.GetPlace());
+template <typename T>  // soft_relu(x) = log(1 + exp(clip(x, -thr, thr)))
+struct SoftReluFunctor : public BaseActivationFunctor<T> {
+  float threshold;  // x is clipped to [-threshold, threshold] before exp
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};  // attr name -> member
+  }
 
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto y = framework::EigenVector<T>::Flatten(*Y);
-    auto place = context.GetEigenDevice<Place>();
-    y.device(place) = x.cwiseMax(t_min).cwiseMin(t_max);
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    auto temp = x.cwiseMax(-threshold).cwiseMin(threshold);  // clipped x
+    y.device(d) = (static_cast<T>(1) + temp.exp()).log();
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
-class BReluGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* dY = context.Input<framework::Tensor>(framework::GradVarName("Y"));
-    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto t_min = static_cast<T>(context.Attr<AttrType>("t_min"));
-    auto t_max = static_cast<T>(context.Attr<AttrType>("t_max"));
-    dX->mutable_data<T>(context.GetPlace());
-
-    auto dy = framework::EigenVector<T>::Flatten(*dY);
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto dx = framework::EigenVector<T>::Flatten(*dX);
-    auto place = context.GetEigenDevice<Place>();
-
-    dx.device(place) = dy * ((x > t_min) * (x < t_max)).template cast<T>();
+template <typename T>  // dx = dy * (1 - exp(-y)) for |x| < thr, otherwise 0
+struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
+  float threshold;  // gradient is masked to zero outside (-thr, thr)
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};  // attr name -> member
+  }
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {  // temp: 0/1 mask
+    auto temp = ((x > -threshold) * (x < threshold)).template cast<T>().eval();
+    dx.device(d) = dy * (static_cast<T>(1) - (-y).exp()) * temp;
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
-class SoftReluKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* Y = context.Output<framework::Tensor>("Y");
-    auto threshold = static_cast<T>(context.Attr<AttrType>("threshold"));
-    Y->mutable_data<T>(context.GetPlace());
+template <typename T>  // leaky_relu(x) = max(x, alpha * x)
+struct LeakyReluFunctor : public BaseActivationFunctor<T> {
+  float alpha;  // acts as the slope for negative x when alpha < 1
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};  // attr name -> member
+  }
 
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto y = framework::EigenVector<T>::Flatten(*Y);
-    auto place = context.GetEigenDevice<Place>();
-    auto temp = x.cwiseMax(-threshold).cwiseMin(threshold).eval();
-    y.device(place) = (static_cast<T>(1) + temp.exp()).log();
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x.cwiseMax(alpha * x);  // elementwise max of x and alpha*x
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
-class SoftReluGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* Y = context.Input<framework::Tensor>("Y");
-    auto* dY = context.Input<framework::Tensor>(framework::GradVarName("Y"));
-    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto threshold = static_cast<T>(context.Attr<AttrType>("threshold"));
-    dX->mutable_data<T>(context.GetPlace());
-
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto y = framework::EigenVector<T>::Flatten(*Y);
-    auto dy = framework::EigenVector<T>::Flatten(*dY);
-    auto dx = framework::EigenVector<T>::Flatten(*dX);
-    auto place = context.GetEigenDevice<Place>();
-    auto temp = ((x > -threshold) * (x < threshold)).template cast<T>().eval();
-    dx.device(place) = dy * (static_cast<T>(1) - (-y).exp()) * temp;
+template <typename T>  // dx = dy * (x < 0 ? alpha : 1)
+struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
+  float alpha;  // gradient multiplier applied where x < 0
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};  // attr name -> member
+  }
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {  // y is unused
+    auto temp1 = alpha * (x < static_cast<T>(0)).template cast<T>().eval();
+    auto temp2 = (x >= static_cast<T>(0)).template cast<T>().eval();
+    dx.device(d) = dy * (temp1 + temp2).template cast<T>();  // masks are disjoint
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
-class PowKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* Y = context.Output<framework::Tensor>("Y");
-    auto factor = static_cast<T>(context.Attr<AttrType>("factor"));
-    Y->mutable_data<T>(context.GetPlace());
-
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto y = framework::EigenVector<T>::Flatten(*Y);
-    auto place = context.GetEigenDevice<Place>();
-    y.device(place) = x.pow(factor);
+template <typename T>  // pow(x) = x ^ factor
+struct PowFunctor : public BaseActivationFunctor<T> {
+  float factor;  // exponent
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"factor", &factor}};  // attr name -> member
+  }
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x.pow(factor);  // elementwise power
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
-class PowGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* dY = context.Input<framework::Tensor>(framework::GradVarName("Y"));
-    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto factor = static_cast<T>(context.Attr<AttrType>("factor"));
-    dX->mutable_data<T>(context.GetPlace());
-
-    auto dy = framework::EigenVector<T>::Flatten(*dY);
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto dx = framework::EigenVector<T>::Flatten(*dX);
-    auto place = context.GetEigenDevice<Place>();
-
-    dx.device(place) = dy * factor * x.pow(factor - static_cast<T>(1));
+template <typename T>  // dx = dy * factor * x ^ (factor - 1)
+struct PowGradFunctor : public BaseActivationFunctor<T> {
+  float factor;  // exponent used by the forward functor
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"factor", &factor}};  // attr name -> member
+  }
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {  // y is unused
+    dx.device(d) = dy * factor * x.pow(factor - static_cast<T>(1));
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
-class STanhKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* Y = context.Output<framework::Tensor>("Y");
-    auto scale_a = static_cast<T>(context.Attr<AttrType>("scale_a"));
-    auto scale_b = static_cast<T>(context.Attr<AttrType>("scale_b"));
-    Y->mutable_data<T>(context.GetPlace());
+template <typename T>  // stanh(x) = scale_b * tanh(scale_a * x)
+struct STanhFunctor : public BaseActivationFunctor<T> {
+  float scale_a;  // multiplies x inside tanh
+  float scale_b;  // multiplies the tanh result
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};  // name -> member
+  }
 
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto y = framework::EigenVector<T>::Flatten(*Y);
-    auto place = context.GetEigenDevice<Place>();
-    y.device(place) = scale_b * (scale_a * x).tanh();
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = scale_b * (scale_a * x).tanh();
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
-class STanhGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* dY = context.Input<framework::Tensor>(framework::GradVarName("Y"));
-    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto scale_a = static_cast<T>(context.Attr<AttrType>("scale_a"));
-    auto scale_b = static_cast<T>(context.Attr<AttrType>("scale_b"));
-    dX->mutable_data<T>(context.GetPlace());
-
-    auto dy = framework::EigenVector<T>::Flatten(*dY);
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto dx = framework::EigenVector<T>::Flatten(*dX);
-    auto place = context.GetEigenDevice<Place>();
+template <typename T>  // dx = dy * scale_a * scale_b * (1 - tanh^2(scale_a*x))
+struct STanhGradFunctor : public BaseActivationFunctor<T> {
+  float scale_a;  // multiplies x inside tanh
+  float scale_b;  // multiplies the tanh result
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};  // name -> member
+  }
 
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {  // y is unused
     auto temp = (scale_a * x).tanh() * (scale_a * x).tanh();
-    dx.device(place) = dy * scale_a * scale_b * (static_cast<T>(1) - temp);
+    dx.device(d) = dy * scale_a * scale_b * (static_cast<T>(1) - temp);
   }
 };
 
 }  // namespace operators
 }  // namespace paddle
+// X-macro: expands __macro(name, Functor, GradFunctor) for each listed op.
+#define FOR_EACH_KERNEL_FUNCTOR(__macro)                         \
+  __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor);          \
+  __macro(exp, ExpFunctor, ExpGradFunctor);                      \
+  __macro(relu, ReluFunctor, ReluGradFunctor);                   \
+  __macro(tanh, TanhFunctor, TanhGradFunctor);                   \
+  __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                   \
+  __macro(abs, AbsFunctor, AbsGradFunctor);                      \
+  __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
+  __macro(log, LogFunctor, LogGradFunctor);                      \
+  __macro(square, SquareFunctor, SquareGradFunctor);             \
+  __macro(brelu, BReluFunctor, BReluGradFunctor);                \
+  __macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor);      \
+  __macro(pow, PowFunctor, PowGradFunctor);                      \
+  __macro(stanh, STanhFunctor, STanhGradFunctor);                \
+  __macro(softsign, SoftsignFunctor, SoftsignGradFunctor);       \
+  __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor);   \
+  __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor)
diff --git a/paddle/operators/adadelta_op.cc b/paddle/operators/adadelta_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bd8c93b4a193c169a4472ff20efca779ddc5c804
--- /dev/null
+++ b/paddle/operators/adadelta_op.cc
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/adadelta_op.h"
+
+namespace paddle {
+namespace operators {
+// Adadelta operator: checks its inputs/outputs and infers output shapes.
+class AdadeltaOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;  // inherit ctors
+
+ protected:
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("AvgSquaredGrad"),
+                   "Input(AvgSquaredGrad) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"),
+                   "Input(AvgSquaredUpdate) of AdadeltaOp should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("AvgSquaredGradOut"),
+        "Output(AvgSquaredGradOut) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("AvgSquaredUpdateOut"),
+        "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null.");
+    // Grad and both accumulators must have the same dimensions as Param.
+    auto param_dim = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(
+        param_dim, ctx->GetInputDim("Grad"),
+        "param and grad input of AdadeltaOp should have same dimension");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"),
+                      "Param and AvgSquaredGrad input of AdadeltaOp "
+                      "should have same dimension");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"),
+                      "Param and AvgSquaredUpdate input of AdadeltaOp "
+                      "should have same dimension");
+    // All three outputs take the dimensions of Param.
+    ctx->SetOutputDim("ParamOut", param_dim);
+    ctx->SetOutputDim("AvgSquaredGradOut", param_dim);
+    ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim);
+  }
+};
+// Declares the inputs, outputs, attributes and doc text of the adadelta op.
+class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AdadeltaOpMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param", "(Tensor) Input parameter");
+    AddInput("Grad", "(Tensor) Input gradient");
+    AddInput("AvgSquaredGrad",
+             "(Tensor) Input expectation of squared gradient");
+    AddInput("AvgSquaredUpdate",
+             "(Tensor) Input expectation of squared parameter updates");
+
+    AddOutput("ParamOut", "(Tensor) Output parameter");
+    AddOutput("AvgSquaredGradOut",
+              "(Tensor) Output expectation of squared gradient");
+    AddOutput("AvgSquaredUpdateOut",
+              "(Tensor) Output expectation of squared parameter updates");
+
+    AddAttr<float>("rho",
+                   "(float, default 0.95) Exponential decay rate "
+                   "for squared gradients.")
+        .SetDefault(0.95f);
+    AddAttr<float>("epsilon",
+                   "(float, default 1.0e-6) Constant for "
+                   "numerical stability")
+        .SetDefault(1.0e-6f);
+    AddComment(R"DOC(
+Adadelta Updates Operator.
+
+This implements the Adadelta optimizer[1]. Adadelta is a per-dimension
+adaptive learning rate method for gradient descent.
+
+Adadelta updates:
+
+avg_squared_grad_out = rho * avg_squared_grad + (1 - rho) * grad * grad
+param_update =  - sqrt((avg_squared_update + epsilon) /
+                       (avg_squared_grad_out + epsilon)) * grad
+avg_squared_update_out = rho * avg_squared_update + (1 - rho) * param_update**2
+param_out = param + param_update
+
+References:
+  [1] ADADELTA: An Adaptive Learning Rate Method
+      https://arxiv.org/abs/1212.5701
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+// Register adadelta (it has no gradient op) and its float CPU kernel.
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/add_op.cu b/paddle/operators/adadelta_op.cu
similarity index 80%
rename from paddle/operators/add_op.cu
rename to paddle/operators/adadelta_op.cu
index d9c6d20a6c320b59e57ed25da3dd8b093833f8c7..3af1c8c8e9861138a33b3156818f704c3b20363f 100644
--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/adadelta_op.cu
@@ -12,7 +12,9 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/operators/add_op.h"
+#define EIGEN_USE_GPU  // NOTE(review): presumably must precede Eigen includes
+#include "paddle/operators/adadelta_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(add, ops::AddKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(  // float GPU kernel for the adadelta op
+    adadelta, ops::AdadeltaOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/adadelta_op.h b/paddle/operators/adadelta_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..d29e15c43583bd447fbacb548a326f303f7d1463
--- /dev/null
+++ b/paddle/operators/adadelta_op.h
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+// Kernel for one Adadelta step; writes all three outputs elementwise.
+template <typename Place, typename T>
+class AdadeltaOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
+    auto avg_squared_grad_out_tensor =
+        ctx.Output<framework::Tensor>("AvgSquaredGradOut");
+    auto avg_squared_update_out_tensor =
+        ctx.Output<framework::Tensor>("AvgSquaredUpdateOut");
+    // Ready each output as T on ctx's place (presumably allocates storage).
+    param_out_tensor->mutable_data<T>(ctx.GetPlace());
+    avg_squared_grad_out_tensor->mutable_data<T>(ctx.GetPlace());
+    avg_squared_update_out_tensor->mutable_data<T>(ctx.GetPlace());
+
+    float rho = ctx.Attr<float>("rho");
+    float epsilon = ctx.Attr<float>("epsilon");
+    // Flatten every input and output for elementwise math.
+    auto param = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Param"));
+    auto grad = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Grad"));
+    // Running average of squared gradients (input accumulator).
+    auto avg_squared_grad = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("AvgSquaredGrad"));
+    // Running average of squared parameter updates (input accumulator).
+    auto avg_squared_update = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("AvgSquaredUpdate"));
+    auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
+    auto avg_squared_grad_out =
+        framework::EigenVector<T>::Flatten(*avg_squared_grad_out_tensor);
+    auto avg_squared_update_out =
+        framework::EigenVector<T>::Flatten(*avg_squared_update_out_tensor);
+    auto place = ctx.GetEigenDevice<Place>();
+    // Decay the squared-gradient average and blend in the new grad^2.
+    avg_squared_grad_out.device(place) =
+        rho * avg_squared_grad + (1 - rho) * grad.square();
+    auto update =  // NOTE(review): looks like a lazy Eigen expression that
+        -((avg_squared_update + epsilon) / (avg_squared_grad_out + epsilon))
+             .sqrt() *  // both assignments below re-evaluate; confirm that
+        grad;           // AvgSquaredUpdateOut never aliases AvgSquaredUpdate.
+    avg_squared_update_out.device(place) =
+        rho * avg_squared_update + (1 - rho) * update.square();
+    param_out.device(place) = param + update;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea2ff3c50306c0b0db4c769129b6e7f2bab3a7ee
--- /dev/null
+++ b/paddle/operators/adagrad_op.cc
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/adagrad_op.h"
+
+namespace paddle {
+namespace operators {
+// Adagrad operator: checks its inputs/outputs and infers output shapes.
+class AdagradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;  // inherit ctors
+
+ protected:
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of AdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of AdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Moment"),
+                   "Input(Moment) of AdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of AdagradOp should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of AdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
+                   "Output(MomentOut) of AdagradOp should not be null.");
+    // LearningRate must hold exactly one element; Grad/Moment match Param.
+    auto lr_dims = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "LearningRate should have one element");
+    auto param_dims = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Grad"),
+        "Param and Grad input of AdagradOp should have the same dimension.");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Moment"),
+        "Param and Moment input of AdagradOp should have the same dimension.");
+    // Both outputs take the dimensions of Param.
+    ctx->SetOutputDim("ParamOut", param_dims);
+    ctx->SetOutputDim("MomentOut", param_dims);
+  }
+};
+// Declares the inputs, outputs, epsilon attribute and doc text of adagrad.
+class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AdagradOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param", "(Tensor) Input parameter");
+    AddInput("Grad", "(Tensor) Input gradient");
+    AddInput("Moment", "(Tensor) Second moment");
+    AddInput("LearningRate", "(Tensor) Learning rate");
+
+    AddOutput("ParamOut", "(Tensor) Output parameter");
+    AddOutput("MomentOut", "(Tensor) Output second moment");
+
+    AddAttr<float>("epsilon",
+                   "(float, default 1.0e-6) "
+                   "Constant for numerical stability")
+        .SetDefault(1.0e-6f);
+    AddComment(R"DOC(
+
+Adaptive Gradient Algorithm (Adagrad).
+
+moment_out = moment + grad * grad
+param_out = param - learning_rate * grad / (sqrt(moment_out) + epsilon)
+
+The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+does not have the epsilon attribute. It is added here for numerical stability 
+by avoiding division by zero.
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+// Register adagrad (it has no gradient op) and its float CPU kernel.
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker);
+REGISTER_OP_CPU_KERNEL(adagrad,
+                       ops::AdagradOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/adagrad_op.cu
similarity index 72%
rename from paddle/operators/rowwise_add_op.cu
rename to paddle/operators/adagrad_op.cu
index 4a57f64c890ce99d6060faec6a4a01b107403344..a5b7951121360f78612f9008a522235104708112 100644
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/adagrad_op.cu
@@ -13,11 +13,8 @@
    limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/rowwise_add_op.h"
+#include "paddle/operators/adagrad_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-    rowwise_add, ops::RowwiseAddKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    rowwise_add_grad,
-    ops::RowwiseAddGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(adagrad,  // float GPU kernel for the adagrad op
+                       ops::AdagradOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/adagrad_op.h b/paddle/operators/adagrad_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5d8f751d3527f89b96d4274328ba0bb5f6efa44
--- /dev/null
+++ b/paddle/operators/adagrad_op.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+// Kernel for one Adagrad step: writes MomentOut, then ParamOut.
+template <typename Place, typename T>
+class AdagradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
+    auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
+    // Ready both outputs as T on ctx's place (presumably allocates storage).
+    param_out_tensor->mutable_data<T>(ctx.GetPlace());
+    moment_out_tensor->mutable_data<T>(ctx.GetPlace());
+
+    float epsilon = ctx.Attr<float>("epsilon");
+    // Flatten every input and output for elementwise math.
+    auto param = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Param"));
+    auto grad = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Grad"));
+    auto moment = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Moment"));
+    auto lr = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("LearningRate"));
+
+    auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
+    auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
+    auto place = ctx.GetEigenDevice<Place>();
+    // moment_out = moment + grad^2 (accumulated squared gradient).
+    moment_out.device(place) = moment + grad * grad;
+    Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
+    param_out.device(place) =  // lr is replicated numel() times to match
+        param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc
deleted file mode 100644
index 3914d1323083ede6a7ea07e7b4ef76b9e4afd26d..0000000000000000000000000000000000000000
--- a/paddle/operators/add_op.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/operators/add_op.h"
-
-namespace paddle {
-namespace operators {
-
-class AddOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of AddOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of AddOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of AddOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(x_dims, y_dims,
-                      "Two input of Add Op's dimension must be same.");
-    ctx->SetOutputDim("Out", x_dims);
-  }
-};
-
-class AddOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  AddOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The first input of add op");
-    AddInput("Y", "The second input of add op");
-    AddOutput("Out", "The output of add op");
-    AddComment(R"DOC(
-Two Element Add Operator.
-
-The equation is: Out = X + Y
-)DOC");
-  }
-};
-
-class AddOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {}
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP(add, ops::AddOp, ops::AddOpMaker, add_grad, ops::AddOpGrad);
-
-REGISTER_OP_CPU_KERNEL(add, ops::AddKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/add_op.h b/paddle/operators/add_op.h
deleted file mode 100644
index 75163032a1ff11a1f18cfd0a4ff7289ff0cb66bf..0000000000000000000000000000000000000000
--- a/paddle/operators/add_op.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename Place, typename T>
-class AddKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input0 = context.Input<Tensor>("X");
-    auto* input1 = context.Input<Tensor>("Y");
-    auto* output = context.Output<Tensor>("Out");
-
-    output->mutable_data<T>(context.GetPlace());
-
-    auto X = EigenVector<T>::Flatten(*input0);
-    auto Y = EigenVector<T>::Flatten(*input1);
-    auto Z = EigenVector<T>::Flatten(*output);
-
-    auto place = context.GetEigenDevice<Place>();
-
-    Z.device(place) = X + Y;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc
index aaffa6661fe4686d09f20f0f0682219772638202..2737104a205cbc1e18ce4a3a45592a416d38a874 100644
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -14,12 +14,7 @@ limitations under the License. */
 
 #include "paddle/operators/cond_op.h"
 
-#include <cstring>
-#include <sstream>
-
-#include "paddle/framework/op_registry.h"
 #include "paddle/operators/gather.h"
-#include "paddle/operators/net_op.h"
 #include "paddle/operators/scatter.h"
 
 namespace paddle {
@@ -31,175 +26,183 @@ using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 using DDim = framework::DDim;
 
-void CondOp::CreateScope(const Scope& scope) const {
+framework::Scope& CondOp::AddSubScope(const Scope& scope) const {
   auto sub_scopes_var = scope.FindVar("SubScopes");
   PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
                           "Output(SubScopes) of CondOp should not be null.");
   auto sub_scopes = sub_scopes_var->GetMutable<std::vector<Scope*>>();
   auto& sub_scope = scope.NewScope();
   sub_scopes->push_back(&sub_scope);
+  return sub_scope;
 }
 
-void CondOp::CreateIndexTensor(const Scope& scope) const {
+std::vector<framework::Scope*>& CondOp::GetSubScopes(
+    const framework::Scope& scope) const {
+  auto sub_scopes_var = scope.FindVar("SubScopes");
+  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
+                          "Output(SubScopes) of CondOp should not be null.");
+  return *sub_scopes_var->GetMutable<std::vector<framework::Scope*>>();
+}
+
+LoDTensor& CondOp::AddIndexTensor(const Scope& scope) const {
   auto index_tensors_var = scope.FindVar("IndexTensors");
   PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
                           "Output(IndexTensors) of CondOp should not be null.");
   auto& index_tensors =
       *index_tensors_var->GetMutable<std::vector<LoDTensor>>();
   index_tensors.push_back(LoDTensor());
+  return index_tensors.back();
 }
 
-void CondOp::InferShape(const Scope& scope) const {
-  auto sub_scopes_var = scope.FindVar("SubScopes");
-  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
-                          "Output(SubScopes) of CondOp should not be null.");
-  auto& sub_scopes = *sub_scopes_var->GetMutable<std::vector<Scope*>>();
-
-  for (int i = 0; i < 2; ++i) {
-    // Create two sub scopes for true and false branches
-    // sub_scopes[0] for the true branch and sub_scopes[1] for the false
-    // branch
-    CreateScope(scope);
-
-    // Create two tensors for true and false indices
-    // index_tensors[0] for the true branch and index_tensors[1] for the false
-    // branch
-    CreateIndexTensor(scope);
-
-    PADDLE_ENFORCE(!Inputs("Xs").empty(),
-                   "Inputs(Xs) of CondOp can't be empty.");
-    for (auto& input : Inputs("Xs")) {
-      // Create a new tensor in sub-scope for input-type tensor
-      Variable* v = sub_scopes[i]->NewVar(input);
-      LoDTensor* sub_input = v->GetMutable<LoDTensor>();
-      sub_input->Resize(scope.FindVar(input)->GetMutable<LoDTensor>()->dims());
-    }
-
-    for (auto& output : (*sub_net_op_[i]).Outputs()) {
-      for (auto& var_name : output.second) {
-        sub_scopes[i]->NewVar(var_name);
-      }
-    }
-
-    // each net calls InferShape
-    //    sub_net_op_[i]->InferShape(*sub_scopes[i]);
-  }
-
-  for (auto& output : Outputs("Outs")) {
-    LoDTensor* tensor_t_out =
-        sub_scopes[0]->FindVar(output)->GetMutable<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should not be NULL");
-    LoDTensor* tensor_f_out =
-        sub_scopes[1]->FindVar(output)->GetMutable<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_f_out, "False output should not be NULL");
-
-    auto* tensor_out_var = scope.FindVar(output);
-    PADDLE_ENFORCE_NOT_NULL(tensor_out_var, "Output not found");
-    LoDTensor* tensor_out = tensor_out_var->GetMutable<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_t_out,
-                            "True output tensor should not be NULL");
-
-    // check output size should be same
-    PADDLE_ENFORCE_EQ(tensor_t_out->dims(), tensor_f_out->dims(),
-                      "Outputs not of the same shape");
-    tensor_out->Resize(tensor_t_out->dims());
-    // tensor_out->mutable_data<float>(tensor_out->dims(),
-    // platform::CPUPlace());
-    tensor_out->mutable_data<float>(platform::CPUPlace());
-  }
-}
-
-void CondOp::Run(const Scope& scope,
-                 const platform::DeviceContext& dev_ctx) const {
-  auto* sub_scopes_var = scope.FindVar("SubScopes");
-  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
-                          "Output(SubScopes) of CondOp should not be null.");
-  auto sub_scopes = sub_scopes_var->Get<std::vector<Scope*>>();
+std::vector<framework::LoDTensor>& CondOp::GetIndexTensors(
+    const framework::Scope& scope) const {
   auto* index_tensors_var = scope.FindVar("IndexTensors");
   PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
                           "Output(IndexTensors) of CondOp should not be null.");
-  auto index_tensors = index_tensors_var->Get<std::vector<LoDTensor>>();
+  return *index_tensors_var->GetMutable<std::vector<framework::LoDTensor>>();
+}
 
-  std::string cond_name = Input("Cond");
-  Variable* cond_var = scope.FindVar(cond_name);
+void CondOp::PrepareDataForSubnet(
+    const framework::Scope& scope,
+    const platform::DeviceContext& dev_ctx) const {
+  PADDLE_ENFORCE(!Inputs("Xs").empty(), "Inputs(Xs) of CondOp can't be empty.");
+
+  for (int i = 0; i < BRANCH_NUM; ++i) {
+    // Create two sub scopes for true and false branches
+    //   sub_scopes[0] for the true branch
+    //   sub_scopes[1] for the false branch
+    AddSubScope(scope);
+    // Create two tensors for true and false indices:
+    //   index_tensors[0] for the true branch
+    //   index_tensors[1] for the false branch
+    AddIndexTensor(scope);
+  }
+
+  Variable* cond_var = scope.FindVar(Input("Cond"));
   PADDLE_ENFORCE_NOT_NULL(cond_var,
                           "Input(Cond) of CondOp should not be null.");
   const LoDTensor* cond = cond_var->GetMutable<LoDTensor>();
 
-  // Step 1: get the true/false index at runtime
-  // index_[0]: vector<int>, contains all index for cond[i] == true
-  // index_[1]: vector<int>, contains all index for cond[i] == false
-  for (int i = 0; i < 2; ++i) index_[i].clear();
+  // get the true/false index at runtime according to cond tensor
+  // index_vectors[0]: vector<int>, contains all index for cond[i] == true
+  // index_vectors[1]: vector<int>, contains all index for cond[i] == false
+  std::vector<std::vector<int>> index_vectors;
+  index_vectors.resize(BRANCH_NUM);
 
   const int* cond_data = cond->data<int>();
   for (int i = 0; i < cond->dims()[0]; ++i) {
     if (cond_data[i])
-      index_[0].push_back(i);
+      index_vectors[TRUE_BRANCH].push_back(i);
     else
-      index_[1].push_back(i);
+      index_vectors[FALSE_BRANCH].push_back(i);
   }
 
-  // put index_[0] and index_[1] into two tensors:
-  // index_tensor_[0] and index_tensor_[1]
-  DDim dim = paddle::framework::make_ddim({0});
-  for (int i = 0; i < 2; ++i) {
-    dim[0] = index_[i].size();
-    int* tmp_ptr =
+  // put index_vectors[0] and index_vectors[1] into two tensors:
+  // index_tensors[0] and index_tensors[1]
+  std::vector<framework::LoDTensor>& index_tensors = GetIndexTensors(scope);
+  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
+
+  for (int i = 0; i < BRANCH_NUM; ++i) {
+    DDim dim = {static_cast<int64_t>(index_vectors[i].size())};
+    int* index_tensor_data_ptr =
         index_tensors[i].mutable_data<int>(dim, platform::CPUPlace());
-    index_tensors[i].Resize(dim);
-    memcpy(tmp_ptr, index_[i].data(), dim[0] * sizeof(int));
+    memcpy(index_tensor_data_ptr, index_vectors[i].data(),
+           dim[0] * sizeof(int));
   }
 
-  // Step 2: collect data by calling gather
-  for (int i = 0; i < 2; ++i) {
-    // i= 0/i for True and False branches respectively
-    for (auto& input : Inputs("Xs")) {
-      // find Tensor
-      Variable* v = scope.FindVar(input);
-      PADDLE_ENFORCE_NOT_NULL(v);
-      LoDTensor* tensor_parent = v->GetMutable<LoDTensor>();
+  // create input in subscopes according to index_vectors
+  for (auto& input : Inputs("Xs")) {
+    Variable* var_parent = scope.FindVar(input);
+    PADDLE_ENFORCE_NOT_NULL(var_parent);
+    const auto* tensor_parent = &var_parent->Get<LoDTensor>();
 
-      v = sub_scopes[i]->FindVar(input);
-      PADDLE_ENFORCE_NOT_NULL(v);
-      LoDTensor* tensor_child = v->GetMutable<LoDTensor>();
+    for (int i = 0; i < BRANCH_NUM; ++i) {
+      Variable* var_child = sub_scopes[i]->FindVar(input);
+      PADDLE_ENFORCE_NOT_NULL(var_child);
+      auto* tensor_child = var_child->GetMutable<LoDTensor>();
 
       // Resize child
-      DDim dim = tensor_child->dims();
-      dim[0] = index_[i].size();
-      tensor_child->Resize(dim);
+      DDim dim = tensor_parent->dims();
+      dim[0] = index_tensors[i].dims()[0];
       tensor_child->mutable_data<float>(dim, platform::CPUPlace());
 
-      Gather<float>(dev_ctx.GetPlace(), tensor_parent, &index_tensors[i],
-                    tensor_child);
+      CPUGather<float>(dev_ctx, *tensor_parent, index_tensors[i], tensor_child);
     }
   }
 
-  // Step 3: run
-  for (int i = 0; i < 2; ++i) {
-    sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx);
+  // create output_tensors in subscope for sub_net
+  for (int i = 0; i < BRANCH_NUM; ++i) {
+    for (auto& output : (*sub_net_op_[i]).Outputs()) {
+      for (auto& var_name : output.second) {
+        sub_scopes[i]->NewVar(var_name);
+      }
+    }
   }
+}
 
-  // Step 4: merge output results
+void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
+                                 const platform::DeviceContext& dev_ctx) const {
+  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
+  const std::vector<framework::LoDTensor>& index_tensors =
+      GetIndexTensors(scope);
+
+  // Infer the output dim, out_dim[0] = true_dim[0] + false_dim[0]
   PADDLE_ENFORCE(!Outputs("Outs").empty(),
                  "Outputs(Outs) of CondOp can't be empty.");
-  for (int i = 0; i < 2; ++i) {
-    // i= 0/i for True and False branches respectively
-    for (auto& output : Outputs("Outs")) {
-      // find Tensor
-      Variable* v = scope.FindVar(output);
-      PADDLE_ENFORCE_NOT_NULL(v);
-      LoDTensor* tensor_parent = v->GetMutable<LoDTensor>();
-
-      v = sub_scopes[i]->FindVar(output);
-      PADDLE_ENFORCE_NOT_NULL(v);
-      LoDTensor* tensor_child = v->GetMutable<LoDTensor>();
-
-      ScatterUpdate<float>(dev_ctx.GetPlace(), tensor_child, &index_tensors[i],
+  for (auto& output : Outputs("Outs")) {
+    auto* var_t_out = sub_scopes[TRUE_BRANCH]->FindVar(output);
+    PADDLE_ENFORCE_NOT_NULL(var_t_out, "True output should not be NULL");
+    const LoDTensor* tensor_t_out = &var_t_out->Get<LoDTensor>();
+    auto* var_f_out = sub_scopes[FALSE_BRANCH]->FindVar(output);
+    PADDLE_ENFORCE_NOT_NULL(var_f_out, "False output should not be NULL");
+    const LoDTensor* tensor_f_out = &var_f_out->Get<LoDTensor>();
+
+    auto* var_out = scope.FindVar(output);
+    PADDLE_ENFORCE_NOT_NULL(var_out, "Output not found");
+    LoDTensor* tensor_out = var_out->GetMutable<LoDTensor>();
+    PADDLE_ENFORCE_NOT_NULL(tensor_out,
+                            "Output tensor should not be NULL");
+
+    DDim true_dim = tensor_t_out->dims();
+    DDim false_dim = tensor_f_out->dims();
+    true_dim[0] = 0;
+    false_dim[0] = 0;
+    PADDLE_ENFORCE_EQ(true_dim, false_dim,
+                      "Outputs not of the same shape except the first dim");
+
+    DDim out_dim = tensor_t_out->dims();
+    out_dim[0] = tensor_t_out->dims()[0] + tensor_f_out->dims()[0];
+    tensor_out->Resize(out_dim);
+    tensor_out->mutable_data<float>(platform::CPUPlace());
+  }
+
+  // merge output results: ScatterAssign each branch output into the parent
+  // tensor, using that branch's index tensor
+  for (auto& output : Outputs("Outs")) {
+    Variable* var_parent = scope.FindVar(output);
+    PADDLE_ENFORCE_NOT_NULL(var_parent);
+    auto* tensor_parent = var_parent->GetMutable<LoDTensor>();
+
+    for (int i = 0; i < BRANCH_NUM; ++i) {
+      Variable* var_child = sub_scopes[i]->FindVar(output);
+      PADDLE_ENFORCE_NOT_NULL(var_child);
+      auto* tensor_child = &var_child->Get<LoDTensor>();
+      ScatterAssign<float>(dev_ctx, *tensor_child, index_tensors[i],
                            tensor_parent);
     }
   }
 }
 
+void CondOp::Run(const Scope& scope,
+                 const platform::DeviceContext& dev_ctx) const {
+  PrepareDataForSubnet(scope, dev_ctx);
+  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
+  for (int i = 0; i < BRANCH_NUM; ++i) {
+    sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx);
+  }
+  MergeDataFromSubnet(scope, dev_ctx);
+}
+
 class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
  public:
   CondOpProtoAndCheckerMaker(framework::OpProto* proto,
diff --git a/paddle/operators/cond_op.h b/paddle/operators/cond_op.h
index 9a88ee35f108204348baddc57e0c0d8e63c3fb6d..93121fb31be287794249b5a62386d5a8dd268a0c 100644
--- a/paddle/operators/cond_op.h
+++ b/paddle/operators/cond_op.h
@@ -40,8 +40,7 @@ class CondOp : public framework::OperatorBase {
          const framework::VariableNameMap& outputs,
          const framework::AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {
-    index_.resize(2);
-    sub_net_op_.resize(2);
+    sub_net_op_.resize(BRANCH_NUM);
   }
 
   CondOp(const CondOp& o)
@@ -51,42 +50,44 @@ class CondOp : public framework::OperatorBase {
     PADDLE_THROW("Not implemented");
   }
 
-  void CreateScope(const framework::Scope& scope) const;
+  framework::Scope& AddSubScope(const framework::Scope& scope) const;
+  std::vector<framework::Scope*>& GetSubScopes(
+      const framework::Scope& scope) const;
 
-  void CreateIndexTensor(const framework::Scope& scope) const;
+  framework::LoDTensor& AddIndexTensor(const framework::Scope& scope) const;
+  std::vector<framework::LoDTensor>& GetIndexTensors(
+      const framework::Scope& scope) const;
 
-  /*
-   * InferShape must be called before Run.
-   * FIXME(yuyang18): Since InferShape has been removed, this implementation
-   * could be wrong.
-   */
-  void InferShape(const framework::Scope& scope) const;
+  void PrepareDataForSubnet(const framework::Scope& scope,
+                            const platform::DeviceContext& dev_ctx) const;
+  void MergeDataFromSubnet(const framework::Scope& scope,
+                           const platform::DeviceContext& dev_ctx) const;
 
   /*
    * Set True Block
    */
   void set_truenet(std::unique_ptr<OperatorBase>&& net) {
-    sub_net_op_[0] = std::move(net);
+    sub_net_op_[TRUE_BRANCH] = std::move(net);
   }
 
   /*
    * Set False Block
    */
   void set_falsenet(std::unique_ptr<OperatorBase>&& net) {
-    sub_net_op_[1] = std::move(net);
+    sub_net_op_[FALSE_BRANCH] = std::move(net);
   }
 
   void Run(const framework::Scope& scope,
            const platform::DeviceContext& dev_ctx) const override;
 
  private:
+  const int TRUE_BRANCH = 0;
+  const int FALSE_BRANCH = 1;
+  const int BRANCH_NUM = 2;
+
   // sub_net_op_[0]: subnet_t
   // sub_net_op_[1]: subnet_f
   std::vector<std::unique_ptr<framework::OperatorBase>> sub_net_op_;
-
-  // index_[0]: True_index;
-  // index_[1]: False_index;
-  mutable std::vector<std::vector<int>> index_;
 };
 
 }  // namespace operators
diff --git a/paddle/operators/detail/strided_memcpy.h b/paddle/operators/detail/strided_memcpy.h
index b165224b37fb091c094a823179256c3dd40a37c9..068c82f399316a1587d7322d8dab75823656800e 100644
--- a/paddle/operators/detail/strided_memcpy.h
+++ b/paddle/operators/detail/strided_memcpy.h
@@ -34,7 +34,7 @@ struct StridedMemcpyFunctor<T, 1> {
       auto& cpu_place = boost::get<platform::CPUPlace>(place);
       memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head);
     } else {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
       auto& gpu_place = boost::get<platform::GPUPlace>(place);
       auto& cuda_ctx =
           reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
diff --git a/paddle/operators/fc_op.cc b/paddle/operators/fc_op.cc
index 5ac0e8cc45f007d42f1b6d7f86333f5cbedb3ea8..7c422c81fc479fa2e317bdee1b66017096381d27 100644
--- a/paddle/operators/fc_op.cc
+++ b/paddle/operators/fc_op.cc
@@ -100,7 +100,7 @@ class FCOp : public NetOp {
 
       add_out = Output("AddOut");
       AppendOp(framework::OpRegistry::CreateOp(
-          "rowwise_add", {{"X", {sum_out}}, {"b", {Input("B")}}},
+          "elementwise_add", {{"X", {sum_out}}, {"Y", {Input("B")}}},
           {{"Out", {add_out}}}, {}));
     } else {
       if (Output("AddOut") != framework::kEmptyVarName) {
diff --git a/paddle/operators/gather.cu.h b/paddle/operators/gather.cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d04ecd284226c7b4c6cdd5531915fee2d94ce61
--- /dev/null
+++ b/paddle/operators/gather.cu.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+using platform::Place;
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+template <typename T>
+__global__ void GatherCUDAKernel(const T* params, const int* indices, T* output,
+                                 size_t index_size, size_t slice_size) {
+  CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
+    int indices_i = i / slice_size;
+    int slice_i = i - indices_i * slice_size;  // offset inside the slice
+    int gather_i = indices[indices_i];
+    int params_i = gather_i * slice_size + slice_i;
+    *(output + i) = *(params + params_i);
+  }
+}
+
+/**
+ * Gather rows of a GPU tensor: output[i, ...] = src[index[i], ...].
+ * The result is written into *output; nothing is returned.
+ * input[src]: type-T source Tensor
+ * input[index]: type-int index Tensor (1-D)
+ * output: type-T Tensor; read via data<T>(), so it must already be allocated
+ */
+template <typename T>
+void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
+               const Tensor& index, Tensor* output) {
+  PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()));
+  // check index of shape 1-D
+  PADDLE_ENFORCE(index.dims().size() == 1);
+  int index_size = index.dims()[0];
+
+  // slice size: number of elements in one row of src (dims 1..rank-1)
+  auto src_dims = src.dims();
+  int slice_size = 1;
+  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+
+  // a launch with a zero-sized grid is rejected by CUDA, so bail out early
+  int n = slice_size * index_size;
+  if (n == 0) return;
+
+  const T* p_src = src.data<T>();
+  const int* p_index = index.data<int>();
+  T* p_output = output->data<T>();
+
+  // 512 threads per block, enough blocks to cover all n output elements
+  int block = 512;
+  int grid = (n + block - 1) / block;
+  GatherCUDAKernel<T><<<
+      grid, block, 0,
+      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+      p_src, p_index, p_output, index_size, slice_size);
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/gather.h b/paddle/operators/gather.h
index 92fb51ec17709bc6f8abb2f516a9240fb5dc3a77..052db49cb3c2594eca8b9a5e3716689480089703 100644
--- a/paddle/operators/gather.h
+++ b/paddle/operators/gather.h
@@ -24,49 +24,40 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-// Implementation of CPU copy
-template <typename T>
-void CPUGather(const T* src, const int* indices, const int slice_size,
-               const int index_size, T* output) {
-  const size_t slice_bytes = slice_size * sizeof(T);
-
-  for (int i = 0; i < index_size; ++i) {
-    int index_ = indices[i];
-    memcpy(output + i * slice_size, src + index_ * slice_size, slice_bytes);
-  }
-}
-
-// Implementation of GPU copy:
-template <typename T>
-void GPUGather(const T* src, const int* index, const int slice_size,
-               const int index_size, T* output);
+using framework::Tensor;
 
 /**
+ * A thin wrapper for gathering on cpu tensor
  * Return a new tensor from source tensor, gathered according to index
  * input[src]: type-T source Tensor
  * input[index]: type-int index Tensor (1-D)
  * return: output tensor
  */
 template <typename T>
-void Gather(const platform::Place& place, const paddle::framework::Tensor* src,
-            const paddle::framework::Tensor* index,
-            paddle::framework::Tensor* output) {
+void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
+               const Tensor& index, Tensor* output) {
+  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
   // check index of shape 1-D
-  PADDLE_ENFORCE(index->dims().size() == 1);
-  int index_size = index->dims()[0];
+  PADDLE_ENFORCE(index.dims().size() == 1);
+  int index_size = index.dims()[0];
 
-  auto src_dims = src->dims();
+  auto src_dims = src.dims();
   framework::DDim output_dims(src_dims);
   output_dims[0] = index_size;
 
+  const T* p_src = src.data<T>();
+  const int* p_index = index.data<int>();
+  T* p_output = output->data<T>();
+
   // slice size
   int slice_size = 1;
   for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
 
-  // Gathering
-  if (platform::is_cpu_place(place)) {
-    CPUGather<T>(src->data<T>(), index->data<int>(), slice_size, index_size,
-                 output->data<T>());
+  const size_t slice_bytes = slice_size * sizeof(T);
+
+  for (int i = 0; i < index_size; ++i) {
+    int index_ = p_index[i];
+    memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes);
   }
 }
 
diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc
index da22bd0c52c27d7decd10e2e2b34fa38d0620da8..fe305337cbebd7c679ae1b8ee8aa2740472ee109 100644
--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -31,6 +31,8 @@ class GatherOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of GatherOp should not be null.");
 
+    auto index_dims = ctx->GetInputDim("Index");
+    PADDLE_ENFORCE(index_dims.size() == 1);
     int batch_size = ctx->GetInputDim("Index")[0];
     PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0");
     framework::DDim output_dims(ctx->GetInputDim("X"));
@@ -79,8 +81,5 @@ Out = X[Index]
 namespace ops = paddle::operators;
 REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad,
             ops::GatherGradOp);
-REGISTER_OP_CPU_KERNEL(gather,
-                       ops::GatherOpKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    gather_grad,
-    ops::GatherGradientOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>);
+REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>);
diff --git a/paddle/operators/gather_op.cu b/paddle/operators/gather_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..92219d6a433e6db0bb9886ed8670cbafaa843ff8
--- /dev/null
+++ b/paddle/operators/gather_op.cu
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "gather.cu.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/operators/gather_op.h"
+#include "scatter.cu.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class GatherOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto *x = ctx.Input<Tensor>("X");
+    auto *index = ctx.Input<Tensor>("Index");
+    auto *output = ctx.Output<Tensor>("Out");
+
+    output->mutable_data<T>(ctx.GetPlace());
+
+    GPUGather<T>(ctx.device_context(), *x, *index, output);
+  }
+};
+
+template <typename T>
+class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    // zero-fill dX before scattering dO into it by Index
+    dX->mutable_data<T>(ctx.GetPlace());
+    auto dxt = framework::EigenVector<T>::Flatten(*dX);
+    auto place = ctx.GetEigenDevice<platform::GPUPlace>();
+    dxt.device(place) = dxt.constant(static_cast<T>(0));
+
+    GPUScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(gather, ops::GatherOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>);
diff --git a/paddle/operators/gather_op.h b/paddle/operators/gather_op.h
index 073e566e8f6962d62cc1b738672843421dcb4ee5..8276ed0d3d8b676aafab45fae70942e78b72b8e6 100644
--- a/paddle/operators/gather_op.h
+++ b/paddle/operators/gather_op.h
@@ -23,29 +23,40 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename Place, typename T>
+template <typename T>
 class GatherOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *X = ctx.Input<Tensor>("X");
-    auto *Index = ctx.Input<Tensor>("Index");
-    auto *Y = ctx.Output<Tensor>("Out");
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
+
+    auto *x = ctx.Input<Tensor>("X");
+    auto *index = ctx.Input<Tensor>("Index");
+    auto *output = ctx.Output<Tensor>("Out");
+
+    output->mutable_data<T>(ctx.GetPlace());
 
-    Y->mutable_data<T>(ctx.GetPlace());
-    Gather<T>(ctx.GetPlace(), X, Index, Y);
+    CPUGather<T>(ctx.device_context(), *x, *index, output);
   }
 };
 
-template <typename Place, typename T>
+template <typename T>
 class GatherGradientOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
+
     auto *Index = ctx.Input<Tensor>("Index");
     auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
     dX->mutable_data<T>(ctx.GetPlace());
-    ScatterUpdate<T>(ctx.GetPlace(), dO, Index, dX);
+    auto dxt = framework::EigenVector<T>::Flatten(*dX);
+    auto place = ctx.GetEigenDevice<platform::CPUPlace>();
+    dxt.device(place) = dxt.constant(static_cast<T>(0));
+
+    ScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
   }
 };
 
diff --git a/paddle/operators/gather_test.cc b/paddle/operators/gather_test.cc
index 0ae1e99452973feb6d085dd6ef51e2afca988f59..cbd86b87961ee24aa889e208de5ac38e03a33135 100644
--- a/paddle/operators/gather_test.cc
+++ b/paddle/operators/gather_test.cc
@@ -41,7 +41,9 @@ TEST(Gather, GatherData) {
 
   int* p_output = output->mutable_data<int>(make_ddim({2, 4}), CPUPlace());
 
-  Gather<int>(CPUPlace(), src, index, output);
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  paddle::platform::CPUDeviceContext ctx(*cpu_place);
+  CPUGather<int>(ctx, *src, *index, output);
 
   for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
   for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
diff --git a/paddle/operators/lstm_unit_op.cc b/paddle/operators/lstm_unit_op.cc
index bd75b001cb87d914f6c56ea35dcb5013d68145b2..dad56731de80518e3bf9d2ec1ffdac9cb6bc92f0 100644
--- a/paddle/operators/lstm_unit_op.cc
+++ b/paddle/operators/lstm_unit_op.cc
@@ -47,7 +47,6 @@ class LstmUnitOp : public framework::OperatorWithKernel {
   }
 };
 
-template <typename AttrType>
 class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   LstmUnitOpMaker(framework::OpProto* proto,
@@ -68,7 +67,7 @@ Equation:
   H = C * sigm(o)
 
 )DOC");
-    AddAttr<AttrType>("forget_bias", "The forget bias of Lstm Unit.")
+    AddAttr<float>("forget_bias", "The forget bias of Lstm Unit.")
         .SetDefault(0.0);
   }
 };
@@ -93,9 +92,11 @@ class LstmUnitGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker<float>,
-            lstm_unit_grad, ops::LstmUnitGradOp);
+REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker, lstm_unit_grad,
+            ops::LstmUnitGradOp);
 REGISTER_OP_CPU_KERNEL(lstm_unit,
-                       ops::LstmUnitKernel<paddle::platform::CPUPlace, float>);
+                       ops::LstmUnitKernel<paddle::platform::CPUPlace, float>,
+                       ops::LstmUnitKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    lstm_unit_grad, ops::LstmUnitGradKernel<paddle::platform::CPUPlace, float>);
+    lstm_unit_grad, ops::LstmUnitGradKernel<paddle::platform::CPUPlace, float>,
+    ops::LstmUnitGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/lstm_unit_op.cu b/paddle/operators/lstm_unit_op.cu
index b1db0d53227148de53b04587b943945f8563346e..49ea550b6f49a13bf31d14321d7a9eb13a834d4b 100644
--- a/paddle/operators/lstm_unit_op.cu
+++ b/paddle/operators/lstm_unit_op.cu
@@ -89,7 +89,7 @@ __global__ void LSTMUnitGradientKernel(const int nthreads, const int dim,
   }
 }
 
-template <typename T, typename AttrType = T>
+template <typename T>
 class LstmUnitOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -101,7 +101,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel<T> {
     auto* c_tensor = ctx.Output<framework::Tensor>("C");
     auto* h_tensor = ctx.Output<framework::Tensor>("H");
 
-    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
 
     int b_size = c_tensor->dims()[0];
     int D = c_tensor->dims()[1];
@@ -120,7 +120,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename T, typename AttrType = T>
+template <typename T>
 class LstmUnitGradOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -153,7 +153,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel<T> {
     int N = c_tensor->dims()[0];
     int D = c_tensor->dims()[1];
 
-    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
 
     int block = 512;
     int n = N * D;
@@ -169,5 +169,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel<float>);
-REGISTER_OP_GPU_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel<float>,
+                       ops::LstmUnitOpCUDAKernel<double>);
+REGISTER_OP_GPU_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel<float>,
+                       ops::LstmUnitGradOpCUDAKernel<double>);
diff --git a/paddle/operators/lstm_unit_op.h b/paddle/operators/lstm_unit_op.h
index 0dc9a7d9a7aae2e16bc4488731f572f43778baf8..a0ff498c1d3ed2aaa10f5473ef91de168c250649 100644
--- a/paddle/operators/lstm_unit_op.h
+++ b/paddle/operators/lstm_unit_op.h
@@ -32,7 +32,7 @@ inline T tanh(T x) {
   return 2. * sigmoid(2. * x) - 1.;
 }
 
-template <typename Place, typename T, typename AttrType = T>
+template <typename Place, typename T>
 class LstmUnitKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -44,7 +44,7 @@ class LstmUnitKernel : public framework::OpKernel<T> {
     auto* c_tensor = ctx.Output<framework::Tensor>("C");
     auto* h_tensor = ctx.Output<framework::Tensor>("H");
 
-    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
 
     int b_size = c_tensor->dims()[0];
     int D = c_tensor->dims()[1];
@@ -75,7 +75,7 @@ class LstmUnitKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
+template <typename Place, typename T>
 class LstmUnitGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -108,7 +108,7 @@ class LstmUnitGradKernel : public framework::OpKernel<T> {
     int N = c_tensor->dims()[0];
     int D = c_tensor->dims()[1];
 
-    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
 
     for (int n = 0; n < N; ++n) {
       for (int d = 0; d < D; ++d) {
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index 6bea9817f1b6c76e68e2a3023bb9eac591aa894f..a0ceb029e3abee2fe591325ffa3100168c3aa8e3 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -1,17 +1,13 @@
 if(WITH_GPU)
-    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc
-      im2col.cu DEPS cblas device_context operator)
+    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu pooling.cc pooling.cu DEPS cblas device_context operator)
     nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
-    nv_library(softmax_function SRCS softmax.cc softmax.cu
-      DEPS operator)
-    nv_library(cross_entropy_function SRCS cross_entropy.cc cross_entropy.cu
-      DEPS operator)
+    nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator)
+    nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
 else()
-    cc_library(math_function SRCS math_function.cc im2col.cc
-      DEPS cblas device_context operator)
+    cc_library(math_function SRCS math_function.cc im2col.cc pooling.cc DEPS cblas device_context operator)
     cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
-    cc_library(softmax_function SRCS softmax.cc DEPS operator)
-    cc_library(cross_entropy_function SRCS cross_entropy.cc DEPS operator)
+    cc_library(softmax SRCS softmax.cc DEPS operator)
+    cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
 endif()
 
 cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc
index f0b8c885918afe7f80edc465c6d9be7c11ac066f..40bdbfe73351a609a4ab9fdc27ac5ff6710df2a2 100644
--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/operators/math/im2col_test.cc
@@ -71,7 +71,7 @@ void testIm2col() {
     context =
         new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
   } else {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
     context =
         new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
 #else
@@ -116,7 +116,7 @@ void testIm2col() {
 
 TEST(math, im2col) {
   testIm2col<paddle::platform::CPUPlace>();
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testIm2col<paddle::platform::GPUPlace>();
 #endif
 }
diff --git a/paddle/operators/math/math_function_test.cc b/paddle/operators/math/math_function_test.cc
index 22468a0c4a4b0aca343fe766c8c9d63393a338eb..9945ba101d719848aa0c06fa65629d59f167c083 100644
--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
@@ -1,7 +1,7 @@
 #include "paddle/operators/math/math_function.h"
 #include "gtest/gtest.h"
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(math_function, notrans_mul_trans) {
   paddle::framework::Tensor input1;
   paddle::framework::Tensor input1_gpu;
diff --git a/paddle/operators/math/pooling.cc b/paddle/operators/math/pooling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3b706529d8f1ed0d673904b81047a5614bd4cf23
--- /dev/null
+++ b/paddle/operators/math/pooling.cc
@@ -0,0 +1,463 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/pooling.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename PoolProcess, typename T>  // CPU forward pass, 2-D pooling.
+class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_process) {
+    const int batch_size = input.dims()[0];  // dims read as [N, C, H, W]
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];  // ksize/strides/paddings: [h, w]
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const int input_stride = input_height * input_width;  // one channel plane
+    const int output_stride = output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    T* output_data = output.mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int ph = 0; ph < output_height; ++ph) {
+          int hstart = ph * stride_height - padding_height;
+          int hend = std::min(hstart + ksize_height, input_height);
+          hstart = std::max(hstart, 0);  // clamp window to the input rows
+          for (int pw = 0; pw < output_width; ++pw) {
+            int wstart = pw * stride_width - padding_width;
+            int wend = std::min(wstart + ksize_width, input_width);
+            wstart = std::max(wstart, 0);  // clamp window to the input cols
+
+            T ele = pool_process.initial();  // accumulator seeded by policy
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                pool_process.compute(ele, input_data[h * input_width + w]);
+              }
+            }
+            int pool_size = (hend - hstart) * (wend - wstart);  // clamped area
+            pool_process.finalize(ele, (static_cast<T>(pool_size)));
+            output_data[ph * output_width + pw] = ele;
+          }
+        }
+        input_data += input_stride;  // advance both to the next (n, c) plane
+        output_data += output_stride;
+      }
+    }
+  }
+};
+
+template <typename PoolProcess, class T>  // CPU backward pass, 2-D pooling.
+class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_grad_process) {
+    const int batch_size = input.dims()[0];  // dims read as [N, C, H, W]
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];  // ksize/strides/paddings: [h, w]
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+    const int input_stride = input_height * input_width;  // one channel plane
+    const int output_stride = output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int ph = 0; ph < output_height; ++ph) {
+          int hstart = ph * stride_height - padding_height;
+          int hend = std::min(hstart + ksize_height, input_height);
+          hstart = std::max(hstart, 0);  // clamp window to the input rows
+          for (int pw = 0; pw < output_width; ++pw) {
+            int wstart = pw * stride_width - padding_width;
+            int wend = std::min(wstart + ksize_width, input_width);
+            wstart = std::max(wstart, 0);  // clamp window to the input cols
+            int pool_size = (hend - hstart) * (wend - wstart);  // clamped area
+            T scale = static_cast<T>(1.0 / pool_size);  // no float truncation
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                pool_grad_process.compute(
+                    input_data[h * input_width + w],
+                    output_data[ph * output_width + pw],
+                    output_grad_data[ph * output_width + pw],
+                    input_grad_data[h * input_width + w],
+                    scale);
+              }
+            }
+          }
+        }
+        input_data += input_stride;  // advance all four to the next plane
+        output_data += output_stride;
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+      }
+    }
+  }
+};
+
+template <class T>  // CPU backward pass for 2-D max pooling.
+class MaxPool2dGradFunctor<platform::CPUPlace, T> {
+ public:  // NOTE(review): accumulates with +=; assumes zeroed input_grad
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int batch_size = input.dims()[0];  // dims read as [N, C, H, W]
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];  // ksize/strides/paddings: [h, w]
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+    const int input_stride = input_height * input_width;  // one channel plane
+    const int output_stride = output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int ph = 0; ph < output_height; ++ph) {
+          int hstart = ph * stride_height - padding_height;
+          int hend = std::min(hstart + ksize_height, input_height);
+          hstart = std::max(hstart, 0);  // clamp window to the input rows
+          for (int pw = 0; pw < output_width; ++pw) {
+            int wstart = pw * stride_width - padding_width;
+            int wend = std::min(wstart + ksize_width, input_width);
+            wstart = std::max(wstart, 0);  // clamp window to the input cols
+
+            bool stop = false;  // only the first match gets the gradient
+            for (int h = hstart; h < hend && !stop; ++h) {
+              for (int w = wstart; w < wend && !stop; ++w) {
+                int input_idx = h * input_width + w;
+                int output_idx = ph * output_width + pw;
+                if (input_data[input_idx] == output_data[output_idx]) {
+                  input_grad_data[input_idx] += output_grad_data[output_idx];
+                  stop = true;
+                }
+              }
+            }
+          }
+        }
+        input_data += input_stride;  // advance all four to the next plane
+        output_data += output_stride;
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+      }
+    }
+  }
+};
+
+template class MaxPool2dGradFunctor<platform::CPUPlace, float>;
+// template class MaxPool2dGradFunctor<platform::CPUPlace, double>;
+
+template class Pool2dFunctor<platform::CPUPlace,  // explicit instantiations
+                             paddle::operators::math::MaxPool<float>, float>;
+template class Pool2dFunctor<platform::CPUPlace,
+                             paddle::operators::math::AvgPool<float>, float>;
+template class Pool2dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
+template class Pool2dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
+template class Pool2dFunctor<platform::CPUPlace,  // double counterparts
+                             paddle::operators::math::MaxPool<double>, double>;
+template class Pool2dFunctor<platform::CPUPlace,
+                             paddle::operators::math::AvgPool<double>, double>;
+template class Pool2dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
+template class Pool2dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
+
+template <typename PoolProcess, class T>  // CPU forward pass, 3-D pooling.
+class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_process) {
+    const int batch_size = input.dims()[0];  // dims read as [N, C, D, H, W]
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];  // ksize/strides/paddings: [d, h, w]
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+
+    const int input_stride = input_depth * input_height * input_width;
+    const int output_stride = output_depth * output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    T* output_data = output.mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int pd = 0; pd < output_depth; ++pd) {
+          int dstart = pd * stride_depth - padding_depth;
+          int dend = std::min(dstart + ksize_depth, input_depth);
+          dstart = std::max(dstart, 0);  // clamp window to the input depth
+          for (int ph = 0; ph < output_height; ++ph) {
+            int hstart = ph * stride_height - padding_height;
+            int hend = std::min(hstart + ksize_height, input_height);
+            hstart = std::max(hstart, 0);  // clamp window to the input rows
+            for (int pw = 0; pw < output_width; ++pw) {
+              int wstart = pw * stride_width - padding_width;
+              int wend = std::min(wstart + ksize_width, input_width);
+              wstart = std::max(wstart, 0);  // clamp window to the input cols
+              int output_idx = (pd * output_height + ph) * output_width + pw;
+              T ele = pool_process.initial();  // accumulator seeded by policy
+              for (int d = dstart; d < dend; ++d) {
+                for (int h = hstart; h < hend; ++h) {
+                  for (int w = wstart; w < wend; ++w) {
+                    pool_process.compute(
+                        ele,
+                        input_data[(d * input_height + h) * input_width + w]);
+                  }
+                }
+              }
+              int pool_size =  // volume of the clamped window
+                  (dend - dstart) * (hend - hstart) * (wend - wstart);
+              pool_process.finalize(ele, static_cast<T>(pool_size));
+              output_data[output_idx] = ele;
+            }
+          }
+        }
+        input_data += input_stride;  // advance both to the next (n, c) volume
+        output_data += output_stride;
+      }
+    }
+  }
+};
+
+template <typename PoolProcess, class T>  // CPU backward pass, 3-D pooling.
+class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_grad_process) {
+    const int batch_size = input.dims()[0];  // dims read as [N, C, D, H, W]
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];  // ksize/strides/paddings: [d, h, w]
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+    const int input_stride = input_depth * input_height * input_width;
+    const int output_stride = output_depth * output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int pd = 0; pd < output_depth; ++pd) {
+          int dstart = pd * stride_depth - padding_depth;
+          int dend = std::min(dstart + ksize_depth, input_depth);
+          dstart = std::max(dstart, 0);  // clamp window to the input depth
+          for (int ph = 0; ph < output_height; ++ph) {
+            int hstart = ph * stride_height - padding_height;
+            int hend = std::min(hstart + ksize_height, input_height);
+            hstart = std::max(hstart, 0);  // clamp window to the input rows
+
+            for (int pw = 0; pw < output_width; ++pw) {
+              int wstart = pw * stride_width - padding_width;
+              int wend = std::min(wstart + ksize_width, input_width);
+              wstart = std::max(wstart, 0);  // clamp window to the input cols
+
+              int pool_size =  // volume of the clamped window
+                  (dend - dstart) * (hend - hstart) * (wend - wstart);
+              T scale = static_cast<T>(1.0 / pool_size);  // no float truncation
+              for (int d = dstart; d < dend; ++d) {
+                for (int h = hstart; h < hend; ++h) {
+                  for (int w = wstart; w < wend; ++w) {
+                    int input_idx = (d * input_height + h) * input_width + w;
+                    int output_idx =
+                        (pd * output_height + ph) * output_width + pw;
+                    pool_grad_process.compute(
+                        input_data[input_idx], output_data[output_idx],
+                        output_grad_data[output_idx],
+                        input_grad_data[input_idx], scale);
+                  }
+                }
+              }
+            }
+          }
+        }
+        input_data += input_stride;  // advance all four to the next volume
+        output_data += output_stride;
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+      }
+    }
+  }
+};
+
+template <class T>  // CPU backward pass for 3-D max pooling.
+class MaxPool3dGradFunctor<platform::CPUPlace, T> {
+ public:  // NOTE(review): accumulates with +=; assumes zeroed input_grad
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int batch_size = input.dims()[0];  // dims read as [N, C, D, H, W]
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];  // ksize/strides/paddings: [d, h, w]
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+    const int input_stride = input_depth * input_height * input_width;
+    const int output_stride = output_depth * output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int pd = 0; pd < output_depth; ++pd) {
+          int dstart = pd * stride_depth - padding_depth;
+          int dend = std::min(dstart + ksize_depth, input_depth);
+          dstart = std::max(dstart, 0);  // clamp window to the input depth
+          for (int ph = 0; ph < output_height; ++ph) {
+            int hstart = ph * stride_height - padding_height;
+            int hend = std::min(hstart + ksize_height, input_height);
+            hstart = std::max(hstart, 0);  // clamp window to the input rows
+            for (int pw = 0; pw < output_width; ++pw) {
+              int wstart = pw * stride_width - padding_width;
+              int wend = std::min(wstart + ksize_width, input_width);
+              wstart = std::max(wstart, 0);  // clamp window to the input cols
+              bool stop = false;  // only the first match gets the gradient
+              for (int d = dstart; d < dend && !stop; ++d) {
+                for (int h = hstart; h < hend && !stop; ++h) {
+                  for (int w = wstart; w < wend && !stop; ++w) {
+                    int input_idx = (d * input_height + h) * input_width + w;
+                    int output_idx =
+                        (pd * output_height + ph) * output_width + pw;
+
+                    if (input_data[input_idx] == output_data[output_idx]) {
+                      input_grad_data[input_idx] +=
+                          output_grad_data[output_idx];
+                      stop = true;
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+        input_data += input_stride;  // advance all four to the next volume
+        output_data += output_stride;
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+      }
+    }
+  }
+};
+
+template class MaxPool3dGradFunctor<platform::CPUPlace, float>;
+// template class MaxPool3dGradFunctor<platform::CPUPlace, double>;
+
+template class Pool3dFunctor<platform::CPUPlace,  // explicit instantiations
+                             paddle::operators::math::MaxPool<float>, float>;
+template class Pool3dFunctor<platform::CPUPlace,
+                             paddle::operators::math::AvgPool<float>, float>;
+template class Pool3dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
+template class Pool3dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
+template class Pool3dFunctor<platform::CPUPlace,  // double counterparts
+                             paddle::operators::math::MaxPool<double>, double>;
+template class Pool3dFunctor<platform::CPUPlace,
+                             paddle::operators::math::AvgPool<double>, double>;
+template class Pool3dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
+template class Pool3dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/pooling.cu b/paddle/operators/math/pooling.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8aeccd1f8e8855c51ad85016f0cb239b4c9c8fb0
--- /dev/null
+++ b/paddle/operators/math/pooling.cu
@@ -0,0 +1,635 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/pooling.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// Forward 2-D pooling kernel: each loop iteration reduces one output window.
+template <typename PoolProcess, typename T>
+__global__ void KernelPool2D(const int nthreads, const T* input_data,
+                             T* output_data, const int channels,
+                             const int input_height, const int input_width,
+                             const int output_height, const int output_width,
+                             const int ksize_height, const int ksize_width,
+                             const int stride_height, const int stride_width,
+                             const int padding_height, const int padding_width,
+                             PoolProcess pool_process) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;  // flat index -> (batch_idx, c, ph, pw)
+    int ph = (index / output_width) % output_height;
+    int c = (index / output_width / output_height) % channels;
+    int batch_idx = index / output_width / output_height / channels;
+    int hstart = ph * stride_height - padding_height;
+    int hend = min(hstart + ksize_height, input_height);
+    hstart = max(hstart, 0);  // clip the window to the unpadded input
+    int wstart = pw * stride_width - padding_width;
+    int wend = min(wstart + ksize_width, input_width);
+    wstart = max(wstart, 0);
+    // Local pointer, so no offset carries over into later loop iterations.
+    const T* in_slice =
+        input_data + (batch_idx * channels + c) * input_height * input_width;
+    T ele = pool_process.initial();
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        pool_process.compute(ele, in_slice[h * input_width + w]);
+      }
+    }
+    int pool_size = (hend - hstart) * (wend - wstart);  // clipped window size
+    pool_process.finalize(ele, (static_cast<T>(pool_size)));
+    output_data[index] = ele;
+  }
+}
+
+// Backward 2-D pooling kernel: each loop iteration fills one input gradient.
+template <typename PoolProcess, typename T>
+__global__ void KernelPool2DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, T* input_grad, const int channels,
+    const int input_height, const int input_width, const int output_height,
+    const int output_width, const int ksize_height, const int ksize_width,
+    const int stride_height, const int stride_width, const int padding_height,
+    const int padding_width, PoolProcess pool_process) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int offsetW = index % input_width + padding_width;  // padded coordinates
+    int offsetH = (index / input_width) % input_height + padding_height;
+    int offsetC = (index / input_width / input_height) % channels;
+    int batch_idx = index / input_width / input_height / channels;
+    int phstart = (offsetH < ksize_height)
+                      ? 0
+                      : (offsetH - ksize_height) / stride_height + 1;
+    int pwstart = (offsetW < ksize_width)
+                      ? 0
+                      : (offsetW - ksize_width) / stride_width + 1;
+    int phend = min(offsetH / stride_height + 1, output_height);
+    int pwend = min(offsetW / stride_width + 1, output_width);
+    T gradient = 0;  // accumulated over [phstart, phend) x [pwstart, pwend)
+    T input = input_data[index];
+    int output_idx =
+        (batch_idx * channels + offsetC) * output_height * output_width;
+    const T* out_slice = output_data + output_idx;  // locals: the parameters
+    const T* out_grad_slice = output_grad + output_idx;  // are never advanced
+    for (int ph = phstart; ph < phend; ++ph) {
+      for (int pw = pwstart; pw < pwend; ++pw) {
+        int hstart = ph * stride_height - padding_height;
+        int wstart = pw * stride_width - padding_width;
+        int hend = min(hstart + ksize_height, input_height);
+        int wend = min(wstart + ksize_width, input_width);
+        hstart = max(hstart, 0);
+        wstart = max(wstart, 0);
+        int pool_size = (hend - hstart) * (wend - wstart);
+        int output_sub_idx = ph * output_width + pw;
+        pool_process.compute(input, out_slice[output_sub_idx],
+                             out_grad_slice[output_sub_idx], gradient,
+                             static_cast<T>(1.0 / pool_size));
+      }
+    }
+    input_grad[index] = gradient;
+  }
+}
+
+// Backward 2-D max pooling: each loop iteration routes one output gradient to
+// the first input in its window whose value equals that output.
+template <typename T>
+__global__ void KernelMaxPool2DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, T* input_grad, const int channels,
+    const int input_height, const int input_width, const int output_height,
+    const int output_width, const int ksize_height, const int ksize_width,
+    const int stride_height, const int stride_width, const int padding_height,
+    const int padding_width) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int c = (index / output_width / output_height) % channels;
+    int batch_idx = index / output_width / output_height / channels;
+    int hstart = ph * stride_height - padding_height;
+    int hend = min(hstart + ksize_height, input_height);
+    hstart = max(hstart, 0);
+    int wstart = pw * stride_width - padding_width;
+    int wend = min(wstart + ksize_width, input_width);
+    wstart = max(wstart, 0);
+    // Local pointers, so no offset carries over into later loop iterations.
+    const int slice_offset =
+        (batch_idx * channels + c) * input_height * input_width;
+    const T* in_slice = input_data + slice_offset;
+    T* in_grad_slice = input_grad + slice_offset;
+    T ele = output_data[index];
+    int maxIndex = -1;
+    bool stop = false;
+    for (int h = hstart; h < hend && !stop; ++h) {
+      for (int w = wstart; w < wend && !stop; ++w) {
+        if (ele == in_slice[h * input_width + w]) {
+          maxIndex = h * input_width + w;
+          stop = true;
+        }
+      }
+    }
+    if (maxIndex != -1) {
+      // Windows can overlap, so different iterations may target one input.
+      atomicAdd(in_grad_slice + maxIndex, output_grad[index]);
+    }
+  }
+}
+
+// GPU specialization of the forward 2-D pooling functor: launches KernelPool2D
+// on the stream of the given device context.
+template <typename PoolProcess, typename T>
+class Pool2dFunctor<platform::GPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_process) {
+    // Tensor dims are read in (batch, channel, height, width) order.
+    const int num_batches = input.dims()[0];
+    const int in_channels = input.dims()[1];
+    const int in_h = input.dims()[2];
+    const int in_w = input.dims()[3];
+    const int out_channels = output.dims()[1];
+    const int out_h = output.dims()[2];
+    const int out_w = output.dims()[3];
+    // ksize, strides and paddings are each read as (height, width).
+    const int kernel_h = ksize[0];
+    const int kernel_w = ksize[1];
+    const int stride_h = strides[0];
+    const int stride_w = strides[1];
+    const int pad_h = paddings[0];
+    const int pad_w = paddings[1];
+
+    const T* in_ptr = input.data<T>();
+    T* out_ptr = output.mutable_data<T>(context.GetPlace());
+
+    // One work item per output element, rounded up to whole blocks.
+    const int kBlockSize = 1024;
+    const int work_items = num_batches * out_channels * out_h * out_w;
+    dim3 block_dim(kBlockSize, 1);
+    dim3 grid_dim((work_items + kBlockSize - 1) / kBlockSize, 1);
+
+    const platform::CUDADeviceContext& cuda_ctx =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context);
+    KernelPool2D<PoolProcess, T><<<grid_dim, block_dim, 0, cuda_ctx.stream()>>>(
+        work_items, in_ptr, out_ptr, in_channels, in_h, in_w, out_h, out_w,
+        kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, pool_process);
+  }
+};
+
+// GPU specialization of the backward 2-D pooling functor: launches
+// KernelPool2DGrad on the stream of the given device context.
+template <typename PoolProcess, typename T>
+class Pool2dGradFunctor<platform::GPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process) {
+    // Tensor dims are read in (batch, channel, height, width) order.
+    const int num_batches = input.dims()[0];
+    const int in_channels = input.dims()[1];
+    const int in_h = input.dims()[2];
+    const int in_w = input.dims()[3];
+    const int out_h = output.dims()[2];
+    const int out_w = output.dims()[3];
+    // ksize, strides and paddings are each read as (height, width).
+    const int kernel_h = ksize[0];
+    const int kernel_w = ksize[1];
+    const int stride_h = strides[0];
+    const int stride_w = strides[1];
+    const int pad_h = paddings[0];
+    const int pad_w = paddings[1];
+
+    const T* in_ptr = input.data<T>();
+    const T* out_ptr = output.data<T>();
+    const T* out_grad_ptr = output_grad.data<T>();
+    T* in_grad_ptr = input_grad.mutable_data<T>(context.GetPlace());
+    const int kBlockSize = 1024;  // threads per block
+    const int work_items = num_batches * in_channels * in_h * in_w;
+    dim3 block_dim(kBlockSize, 1);
+    dim3 grid_dim((work_items + kBlockSize - 1) / kBlockSize, 1);
+    const platform::CUDADeviceContext& cuda_ctx =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context);
+    KernelPool2DGrad<PoolProcess, T>
+        <<<grid_dim, block_dim, 0, cuda_ctx.stream()>>>(
+            work_items, in_ptr, out_ptr, out_grad_ptr, in_grad_ptr, in_channels,
+            in_h, in_w, out_h, out_w, kernel_h, kernel_w, stride_h, stride_w,
+            pad_h, pad_w, pool_process);
+  }
+};
+
+// GPU backward pass for 2-D max pooling: launches KernelMaxPool2DGrad.
+template <typename T>
+class MaxPool2dGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    // Tensor dims are read in (batch, channel, height, width) order.
+    const int num_batches = input.dims()[0];
+    const int in_channels = input.dims()[1];
+    const int in_h = input.dims()[2];
+    const int in_w = input.dims()[3];
+    const int out_channels = output.dims()[1];
+    const int out_h = output.dims()[2];
+    const int out_w = output.dims()[3];
+    // ksize, strides and paddings are each read as (height, width).
+    const int kernel_h = ksize[0];
+    const int kernel_w = ksize[1];
+    const int stride_h = strides[0];
+    const int stride_w = strides[1];
+    const int pad_h = paddings[0];
+    const int pad_w = paddings[1];
+    const T* in_ptr = input.data<T>();
+    const T* out_ptr = output.data<T>();
+    const T* out_grad_ptr = output_grad.data<T>();
+    T* in_grad_ptr = input_grad.mutable_data<T>(context.GetPlace());
+    // NOTE(review): the kernel accumulates with atomicAdd and nothing here
+    // clears input_grad first - presumably the caller zero-fills it; confirm.
+    const int kBlockSize = 1024;  // threads per block
+    const int work_items = num_batches * out_channels * out_h * out_w;
+    dim3 block_dim(kBlockSize, 1);
+    dim3 grid_dim((work_items + kBlockSize - 1) / kBlockSize, 1);
+    const platform::CUDADeviceContext& cuda_ctx =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context);
+    KernelMaxPool2DGrad<T><<<grid_dim, block_dim, 0, cuda_ctx.stream()>>>(
+        work_items, in_ptr, out_ptr, out_grad_ptr, in_grad_ptr, in_channels,
+        in_h, in_w, out_h, out_w, kernel_h, kernel_w, stride_h, stride_w, pad_h,
+        pad_w);
+  }
+};
+
+template class MaxPool2dGradFunctor<platform::GPUPlace, float>;
+// template class MaxPool2dGradFunctor<platform::GPUPlace, double>; // The
+// 64-bit floating-point version of atomicAdd() is only supported by devices of
+// compute capability 6.x and higher.
+
+template class Pool2dFunctor<platform::GPUPlace,
+                             paddle::operators::math::MaxPool<float>, float>;
+template class Pool2dFunctor<platform::GPUPlace,
+                             paddle::operators::math::AvgPool<float>, float>;
+template class Pool2dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
+template class Pool2dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
+template class Pool2dFunctor<platform::GPUPlace,
+                             paddle::operators::math::MaxPool<double>, double>;
+template class Pool2dFunctor<platform::GPUPlace,
+                             paddle::operators::math::AvgPool<double>, double>;
+template class Pool2dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
+template class Pool2dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
+
+// Forward 3-D pooling kernel: each loop iteration reduces one output window.
+template <typename PoolProcess, typename T>
+__global__ void KernelPool3D(
+    const int nthreads, const T* input_data, T* output_data, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height, const int padding_width,
+    PoolProcess pool_process) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;  // flat index -> (batch, c, pd, ph, pw)
+    int ph = (index / output_width) % output_height;
+    int pd = (index / output_width / output_height) % output_depth;
+    int c = (index / output_width / output_height / output_depth) % channels;
+    int batch = index / output_width / output_height / output_depth / channels;
+    int dstart = pd * stride_depth - padding_depth;
+    int hstart = ph * stride_height - padding_height;
+    int wstart = pw * stride_width - padding_width;
+    int dend = min(dstart + ksize_depth, input_depth);
+    int hend = min(hstart + ksize_height, input_height);
+    int wend = min(wstart + ksize_width, input_width);
+    dstart = max(dstart, 0);  // clip the window to the unpadded input
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    T ele = pool_process.initial();
+    const T* in_slice = input_data +  // leave input_data itself untouched
+        (batch * channels + c) * input_depth * input_height * input_width;
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          pool_process.compute(
+              ele, in_slice[(d * input_height + h) * input_width + w]);
+        }
+      }
+    }
+    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+    pool_process.finalize(ele, static_cast<T>(pool_size));
+    output_data[index] = ele;
+  }
+}
+
+// Backward 3-D pooling kernel: each loop iteration fills one input gradient
+// by visiting the output windows in [pdstart, pdend) x [ph..) x [pw..).
+template <typename PoolProcess, typename T>
+__global__ void KernelPool3DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, T* input_grad, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height, const int padding_width,
+    PoolProcess pool_process) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int offsetW = index % input_width + padding_width;  // padded coordinates
+    int offsetH = (index / input_width) % input_height + padding_height;
+    int offsetD =
+        (index / input_width / input_height) % input_depth + padding_depth;
+    int offsetC = (index / input_width / input_height / input_depth) % channels;
+    int batch_idx = index / input_width / input_height / input_depth / channels;
+    int pdstart = (offsetD < ksize_depth)
+                      ? 0
+                      : (offsetD - ksize_depth) / stride_depth + 1;
+    int phstart = (offsetH < ksize_height)
+                      ? 0
+                      : (offsetH - ksize_height) / stride_height + 1;
+    int pwstart = (offsetW < ksize_width)
+                      ? 0
+                      : (offsetW - ksize_width) / stride_width + 1;
+    int pdend = min((offsetD) / stride_depth + 1, output_depth);
+    int phend = min((offsetH) / stride_height + 1, output_height);
+    int pwend = min((offsetW) / stride_width + 1, output_width);
+    T gradient = 0;
+    T input = input_data[index];
+    int output_idx = (batch_idx * channels + offsetC) * output_depth *
+                     output_height * output_width;
+    // Local pointers, so no offset carries over into later loop iterations.
+    const T* out_slice = output_data + output_idx;
+    const T* out_grad_slice = output_grad + output_idx;
+    for (int pd = pdstart; pd < pdend; ++pd) {
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          // Size of this output window after clipping to the input bounds.
+          int dstart = pd * stride_depth - padding_depth;
+          int hstart = ph * stride_height - padding_height;
+          int wstart = pw * stride_width - padding_width;
+          int dend = min(dstart + ksize_depth, input_depth);
+          int hend = min(hstart + ksize_height, input_height);
+          int wend = min(wstart + ksize_width, input_width);
+          dstart = max(dstart, 0);
+          hstart = max(hstart, 0);
+          wstart = max(wstart, 0);
+          int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+          int output_sub_idx = (pd * output_height + ph) * output_width + pw;
+          pool_process.compute(input, out_slice[output_sub_idx],
+                               out_grad_slice[output_sub_idx], gradient,
+                               static_cast<T>(1.0 / pool_size));
+        }
+      }
+    }
+    input_grad[index] = gradient;
+  }
+}
+
+// Backward 3-D max pooling: adds each output gradient to the first equal input.
+template <typename T>
+__global__ void KernelMaxPool3DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, T* input_grad, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height,
+    const int padding_width) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int pd = (index / output_width / output_height) % output_depth;
+    int c = (index / output_width / output_height / output_depth) % channels;
+    int batch_idx =
+        index / output_width / output_height / output_depth / channels;
+    int dstart = pd * stride_depth - padding_depth;
+    int hstart = ph * stride_height - padding_height;
+    int wstart = pw * stride_width - padding_width;
+    int dend = min(dstart + ksize_depth, input_depth);
+    int hend = min(hstart + ksize_height, input_height);
+    int wend = min(wstart + ksize_width, input_width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    T ele = output_data[index];
+    bool stop = false;
+    int maxIdx = -1;
+    const int slice_offset =
+        (batch_idx * channels + c) * input_depth * input_height * input_width;
+    const T* in_slice = input_data + slice_offset;  // locals: the parameters
+    T* in_grad_slice = input_grad + slice_offset;   // are never advanced
+    for (int d = dstart; d < dend && !stop; ++d) {
+      for (int h = hstart; h < hend && !stop; ++h) {
+        for (int w = wstart; w < wend && !stop; ++w) {
+          if (ele == in_slice[(d * input_height + h) * input_width + w]) {
+            stop = true;
+            maxIdx = (d * input_height + h) * input_width + w;
+          }
+        }
+      }
+    }
+    if (maxIdx != -1) {
+      // Windows can overlap, so different iterations may target one input.
+      atomicAdd(in_grad_slice + maxIdx, output_grad[index]);
+    }
+  }
+}
+
+// GPU specialization of the forward 3-D pooling functor: launches KernelPool3D
+// on the stream of the given device context.
+template <typename PoolProcess, class T>
+class Pool3dFunctor<platform::GPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_process) {
+    // Tensor dims are read in (batch, channel, depth, height, width) order.
+    const int num_batches = input.dims()[0];
+    const int in_channels = input.dims()[1];
+    const int in_d = input.dims()[2];
+    const int in_h = input.dims()[3];
+    const int in_w = input.dims()[4];
+    const int out_channels = output.dims()[1];
+    const int out_d = output.dims()[2];
+    const int out_h = output.dims()[3];
+    const int out_w = output.dims()[4];
+    // ksize, strides and paddings are each read as (depth, height, width).
+    const int kernel_d = ksize[0];
+    const int kernel_h = ksize[1];
+    const int kernel_w = ksize[2];
+    const int stride_d = strides[0];
+    const int stride_h = strides[1];
+    const int stride_w = strides[2];
+    const int pad_d = paddings[0];
+    const int pad_h = paddings[1];
+    const int pad_w = paddings[2];
+
+    const T* in_ptr = input.data<T>();
+    T* out_ptr = output.mutable_data<T>(context.GetPlace());
+
+    // One work item per output element, rounded up to whole blocks.
+    const int kBlockSize = 1024;
+    const int work_items = num_batches * out_channels * out_d * out_h * out_w;
+    dim3 block_dim(kBlockSize, 1);
+    dim3 grid_dim((work_items + kBlockSize - 1) / kBlockSize, 1);
+
+    const platform::CUDADeviceContext& cuda_ctx =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context);
+    KernelPool3D<PoolProcess, T><<<grid_dim, block_dim, 0, cuda_ctx.stream()>>>(
+        work_items, in_ptr, out_ptr, in_channels, in_d, in_h, in_w, out_d,
+        out_h, out_w, kernel_d, kernel_h, kernel_w, stride_d, stride_h,
+        stride_w, pad_d, pad_h, pad_w, pool_process);
+  }
+};
+
+// GPU specialization of the backward 3-D pooling functor: launches
+// KernelPool3DGrad on the stream of the given device context.
+template <typename PoolProcess, class T>
+class Pool3dGradFunctor<platform::GPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process) {
+    // Tensor dims are read in (batch, channel, depth, height, width) order.
+    const int num_batches = input.dims()[0];
+    const int in_channels = input.dims()[1];
+    const int in_d = input.dims()[2];
+    const int in_h = input.dims()[3];
+    const int in_w = input.dims()[4];
+    const int out_d = output.dims()[2];
+    const int out_h = output.dims()[3];
+    const int out_w = output.dims()[4];
+    // ksize, strides and paddings are each read as (depth, height, width).
+    const int kernel_d = ksize[0];
+    const int kernel_h = ksize[1];
+    const int kernel_w = ksize[2];
+    const int stride_d = strides[0];
+    const int stride_h = strides[1];
+    const int stride_w = strides[2];
+    const int pad_d = paddings[0];
+    const int pad_h = paddings[1];
+    const int pad_w = paddings[2];
+
+    const T* in_ptr = input.data<T>();
+    const T* out_ptr = output.data<T>();
+    const T* out_grad_ptr = output_grad.data<T>();
+    T* in_grad_ptr = input_grad.mutable_data<T>(context.GetPlace());
+
+    // One work item per input element, rounded up to whole blocks.
+    const int kBlockSize = 1024;
+    const int work_items = num_batches * in_channels * in_d * in_h * in_w;
+    dim3 block_dim(kBlockSize, 1);
+    dim3 grid_dim((work_items + kBlockSize - 1) / kBlockSize, 1);
+
+    const platform::CUDADeviceContext& cuda_ctx =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context);
+    KernelPool3DGrad<PoolProcess, T>
+        <<<grid_dim, block_dim, 0, cuda_ctx.stream()>>>(
+            work_items, in_ptr, out_ptr, out_grad_ptr, in_grad_ptr, in_channels,
+            in_d, in_h, in_w, out_d, out_h, out_w, kernel_d, kernel_h, kernel_w,
+            stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, pool_process);
+  }
+};
+
+// GPU backward pass for 3-D max pooling: launches KernelMaxPool3DGrad on the
+// stream of the given device context.
+template <class T>
+class MaxPool3dGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    // Tensor dims are read in (batch, channel, depth, height, width) order.
+    const int num_batches = input.dims()[0];
+    const int in_channels = input.dims()[1];
+    const int in_d = input.dims()[2];
+    const int in_h = input.dims()[3];
+    const int in_w = input.dims()[4];
+    const int out_channels = output.dims()[1];
+    const int out_d = output.dims()[2];
+    const int out_h = output.dims()[3];
+    const int out_w = output.dims()[4];
+    // ksize, strides and paddings are each read as (depth, height, width).
+    const int kernel_d = ksize[0];
+    const int kernel_h = ksize[1];
+    const int kernel_w = ksize[2];
+    const int stride_d = strides[0];
+    const int stride_h = strides[1];
+    const int stride_w = strides[2];
+    const int pad_d = paddings[0];
+    const int pad_h = paddings[1];
+    const int pad_w = paddings[2];
+
+    const T* in_ptr = input.data<T>();
+    const T* out_ptr = output.data<T>();
+    const T* out_grad_ptr = output_grad.data<T>();
+    T* in_grad_ptr = input_grad.mutable_data<T>(context.GetPlace());
+    // NOTE(review): the kernel accumulates with atomicAdd and nothing here
+    // clears input_grad first - presumably the caller zero-fills it; confirm.
+    const int kBlockSize = 1024;  // threads per block
+    const int work_items = num_batches * out_channels * out_d * out_h * out_w;
+    dim3 block_dim(kBlockSize, 1);
+    dim3 grid_dim((work_items + kBlockSize - 1) / kBlockSize, 1);
+    const platform::CUDADeviceContext& cuda_ctx =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context);
+    KernelMaxPool3DGrad<T><<<grid_dim, block_dim, 0, cuda_ctx.stream()>>>(
+        work_items, in_ptr, out_ptr, out_grad_ptr, in_grad_ptr, in_channels,
+        in_d, in_h, in_w, out_d, out_h, out_w, kernel_d, kernel_h, kernel_w,
+        stride_d, stride_h, stride_w, pad_d, pad_h, pad_w);
+  }
+};
+
+template class MaxPool3dGradFunctor<platform::GPUPlace, float>;
+// template class MaxPool3dGradFunctor<platform::GPUPlace, double>;  // The
+// 64-bit floating-point version of atomicAdd() is only supported by devices of
+// compute capability 6.x and higher.
+
+template class Pool3dFunctor<platform::GPUPlace,
+                             paddle::operators::math::MaxPool<float>, float>;
+template class Pool3dFunctor<platform::GPUPlace,
+                             paddle::operators::math::AvgPool<float>, float>;
+template class Pool3dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
+template class Pool3dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
+template class Pool3dFunctor<platform::GPUPlace,
+                             paddle::operators::math::MaxPool<double>, double>;
+template class Pool3dFunctor<platform::GPUPlace,
+                             paddle::operators::math::AvgPool<double>, double>;
+template class Pool3dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
+template class Pool3dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/pooling.h b/paddle/operators/math/pooling.h
new file mode 100644
index 0000000000000000000000000000000000000000..d214c689235ad4233d3e4e1c2aa0fdc993bf20c6
--- /dev/null
+++ b/paddle/operators/math/pooling.h
@@ -0,0 +1,122 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+// Largest finite float, taken from the compiler-predefined __FLT_MAX__.
+#define FLT_MAX __FLT_MAX__  // NOTE(review): same name as <cfloat>'s macro
+
+template <class T>
+class MaxPool {  // PoolProcess policy that keeps the running maximum
+ public:  // NOTE(review): initial() is the most negative float even for double
+  DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
+  DEVICE inline void compute(T& acc, const T& v) { acc = acc > v ? acc : v; }
+  DEVICE inline void finalize(T& acc, const T& pool_size) {}  // nothing to do
+};
+
+template <class T>
+class AvgPool {  // PoolProcess policy: sums a window, then divides by its size
+ public:
+  DEVICE inline T initial() { return static_cast<T>(0); }
+  DEVICE inline void compute(T& acc, const T& v) { acc += v; }
+  DEVICE inline void finalize(T& acc, const T& pool_size) { acc /= pool_size; }
+};
+template <class T>
+class MaxPoolGrad {  // backward policy passed as PoolProcess to grad functors
+ public:  // compute(x = input, y = output, dy = output grad, dx = input grad)
+  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
+                             T scale) {  // scale is not used by this policy
+    dx += dy * (x == y);  // adds dy when x equals y, otherwise adds dy * 0
+  }
+};
+
+template <class T>
+class AvgPoolGrad {  // backward policy: every call contributes scale * dy
+ public:  // compute(x = input, y = output, dy = output grad, dx = input grad)
+  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
+                             T scale) {
+    dx += dy * scale;  // x and y are not needed for average pooling
+  }
+};
+
+template <typename Place, typename PoolProcess, typename T>
+class Pool2dFunctor {  // forward 2-D pooling; GPUPlace version in pooling.cu
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_compute);
+};
+
+template <typename Place, typename PoolProcess, typename T>
+class Pool2dGradFunctor {  // backward 2-D pooling; GPUPlace version in .cu
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute);  // writes into input_grad
+};
+
+template <typename Place, class T>
+class MaxPool2dGradFunctor {  // 2-D max-pooling backward, no PoolProcess policy
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings);
+};
+
+template <typename Place, typename PoolProcess, typename T>
+class Pool3dFunctor {  // forward 3-D pooling; GPUPlace version in pooling.cu
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_compute);
+};
+
+template <typename Place, typename PoolProcess, typename T>
+class Pool3dGradFunctor {  // backward 3-D pooling; GPUPlace version in .cu
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute);  // writes into input_grad
+};
+
+template <typename Place, class T>
+class MaxPool3dGradFunctor {  // 3-D max-pooling backward, no PoolProcess policy
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/softmax.cc b/paddle/operators/math/softmax.cc
index ac9f3c4bf61bf8e13faa17387f1112756db9a100..0ba8197ab8b64649c8adcf67771ba01eca7f1d10 100644
--- a/paddle/operators/math/softmax.cc
+++ b/paddle/operators/math/softmax.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/math/softmax.h"
 
@@ -19,6 +19,7 @@ namespace operators {
 namespace math {
 
 template class SoftmaxFunctor<platform::CPUPlace, float>;
+template class SoftmaxGradFunctor<platform::CPUPlace, float>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/softmax.cu b/paddle/operators/math/softmax.cu
index 4c3df0550e7ca6f4310db1d35cc34d5c73a2dd16..99f988d51e4b16c3f3bfd9c76b411bb53619603e 100644
--- a/paddle/operators/math/softmax.cu
+++ b/paddle/operators/math/softmax.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 
@@ -21,6 +21,7 @@ namespace operators {
 namespace math {
 
 template class SoftmaxFunctor<platform::GPUPlace, float>;
+template class SoftmaxGradFunctor<platform::GPUPlace, float>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/softmax.h b/paddle/operators/math/softmax.h
index 225323f05ac9aacce55dfe4795315741ee2c8795..b7f627eee7f8fe68a83595a3390a55d438c97afb 100644
--- a/paddle/operators/math/softmax.h
+++ b/paddle/operators/math/softmax.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/eigen.h"
@@ -68,6 +68,37 @@ class SoftmaxFunctor {
              .broadcast(one_by_class));
   }
 };
+
+template <typename Place, typename T>
+class SoftmaxGradFunctor {
+ public:
+  // Computes x_grad = (y_grad - rowsum(y * y_grad)) * y elementwise, where the
+  // row sum runs over dimension 1 and is broadcast back across that dimension.
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor* y, const framework::Tensor* y_grad,
+                  framework::Tensor* x_grad) {
+    const int kBatchDim = 0;
+    const int kClassDim = 1;
+    auto out = EigenMatrix<T>::From(*y);
+    auto out_grad = EigenMatrix<T>::From(*y_grad);
+    auto in_grad = EigenMatrix<T>::From(*x_grad);
+
+    const int rows = out.dimension(kBatchDim);
+    const int cols = out.dimension(kClassDim);
+    Eigen::DSizes<int, 1> reduce_dims(kClassDim);
+    Eigen::DSizes<int, 2> column_shape(rows, 1);
+    Eigen::DSizes<int, 2> tile_counts(1, cols);
+    // eval() materializes the reduction before it is reshaped and broadcast.
+    auto row_dot = (out * out_grad)
+                       .sum(reduce_dims)
+                       .eval()
+                       .reshape(column_shape)
+                       .broadcast(tile_counts);
+    in_grad.device(*context.GetEigenDevice<Place>()) =
+        (out_grad - row_dot) * out;
+  }
+};
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc
index d799239d4ed6d230578c77921a1a454b476b63fa..2332c9546b037c94a5a6d30319abda8e23c2b3bb 100644
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -36,7 +36,7 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
   MeanOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of mean op");
-    AddOutput("Out", "The output of mean op").NotInGradient();
+    AddOutput("Out", "The output of mean op");
     AddComment(R"DOC( Mean Operator
 )DOC");
   }
@@ -52,11 +52,27 @@ class MeanGradOp : public framework::OperatorWithKernel {
   }
 };
 
+class MeanGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+  // Builds one mean_grad op; the desc is owned by unique_ptr from creation.
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    std::unique_ptr<framework::OpDescBind> grad_op(new framework::OpDescBind());
+    grad_op->SetType("mean_grad");
+    grad_op->SetInput("X", Input("X"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    return grad_op;  // a throwing setter above can no longer leak the desc
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(mean, ops::MeanOp, ops::MeanOpMaker, mean_grad, ops::MeanGradOp);
+REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker);
+REGISTER_OPERATOR(mean_grad, ops::MeanGradOp);
 REGISTER_OP_CPU_KERNEL(mean,
                        ops::MeanKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(mean_grad,
diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc
index ce049d4d7bd96a6758d71b381e6e6b4edbcc8b5c..7057dcbd6e375adef57d17a13afdfade67e938b6 100644
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -49,9 +49,9 @@ class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   MinusOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The left tensor of minus operator.").NotInGradient();
-    AddInput("Y", "The right tensor of minus operator.").NotInGradient();
-    AddOutput("Out", "The output tensor of minus operator.").NotInGradient();
+    AddInput("X", "The left tensor of minus operator.");
+    AddInput("Y", "The right tensor of minus operator.");
+    AddOutput("Out", "The output tensor of minus operator.");
 
     AddComment(R"DOC(Minus Operator
 
@@ -64,26 +64,35 @@ or not. But the output only shares the LoD with input `X`.
 )DOC");
   }
 };
-template <typename AttrType>
-class MinusGradOp : public NetOp {
+
+class MinusGradMaker : public framework::GradOpDescMakerBase {
  public:
-  MinusGradOp(const std::string &type, const framework::VariableNameMap &inputs,
-              const framework::VariableNameMap &outputs,
-              const framework::AttributeMap &attrs)
-      : NetOp(type, inputs, outputs, attrs) {
-    auto out_grad = Input(framework::GradVarName("Out"));
-    auto x_grad = Output(framework::GradVarName("X"));
-    auto y_grad = Output(framework::GradVarName("Y"));
-
-    // x_grad = out_grad
-    AppendOp(framework::OpRegistry::CreateOp("identity", {{"X", {out_grad}}},
-                                             {{"Y", {x_grad}}}, {}));
-
-    framework::AttributeMap scale_attr;
-    scale_attr["scale"] = static_cast<AttrType>(-1);
-    AppendOp(framework::OpRegistry::CreateOp("scale", {{"X", {out_grad}}},
-                                             {{"Out", {y_grad}}}, scale_attr));
-    CompleteAddOp(false);
+  using framework::GradOpDescMakerBase::GradOpDescMakerBase;
+  // Emits up to two "scale" ops, each reading the gradient of Out.
+  std::vector<std::unique_ptr<framework::OpDescBind>> operator()()
+      const override {
+    std::vector<std::unique_ptr<framework::OpDescBind>> ops;
+    auto x_g = InputGrad("X");
+    if (!x_g.empty()) {
+      auto *x_g_op = new framework::OpDescBind();
+      x_g_op->SetType("scale");
+      x_g_op->SetInput("X", OutputGrad("Out"));
+      x_g_op->SetOutput("Out", x_g);
+      x_g_op->SetAttr("scale", 1.0f);
+      ops.emplace_back(x_g_op);
+    }
+    // The gradient of Y is the gradient of Out scaled by -1.
+    auto y_g = InputGrad("Y");
+    if (!y_g.empty()) {
+      auto *y_g_op = new framework::OpDescBind();
+      y_g_op->SetType("scale");
+      y_g_op->SetInput("X", OutputGrad("Out"));
+      y_g_op->SetOutput("Out", y_g);
+      y_g_op->SetAttr("scale", -1.0f);
+      ops.emplace_back(y_g_op);
+    }
+    // `ops` stays empty when both InputGrad results above are empty.
+    return ops;
   }
 };
 
@@ -91,7 +100,6 @@ class MinusGradOp : public NetOp {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(minus, ops::MinusOp, ops::MinusOpMaker, minus_grad,
-            ops::MinusGradOp<float>);
+REGISTER_OPERATOR(minus, ops::MinusOp, ops::MinusOpMaker, ops::MinusGradMaker);
 REGISTER_OP_CPU_KERNEL(minus,
                        ops::MinusKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
index 9858c4d9c2195c7bd0e767aaa86a950e0a791443..3c8fe04d2edeccc0e0d55aa2a28d71085ccf5145 100644
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/mul_op.h"
 
@@ -35,12 +35,14 @@ class MulOp : public framework::OperatorWithKernel {
     int x_num_col_dims = ctx->Attrs().Get<int>("x_num_col_dims");
     int y_num_col_dims = ctx->Attrs().Get<int>("y_num_col_dims");
 
-    PADDLE_ENFORCE(x_dims.size() > x_num_col_dims,
-                   "The rank of input tensor X should be larger than "
-                   "`mul_op`'s `x_num_col_dims`.");
-    PADDLE_ENFORCE(y_dims.size() > y_num_col_dims,
-                   "The rank of input tensor Y should be larger than "
-                   "`mul_op`'s `y_num_col_dims`.");
+    PADDLE_ENFORCE_GT(
+        x_dims.size(), x_num_col_dims,
+        "The input tensor X's rank of MulOp should be larger than "
+        "x_num_col_dims.");
+    PADDLE_ENFORCE_GT(
+        y_dims.size(), y_num_col_dims,
+        "The input tensor Y's rank of MulOp should be larger than "
+        "y_num_col_dims.");
 
     auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
     auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc
index 04ebb14f6ee6c73f48aa2f75811a22f9b8a25006..15aa05f26610be14e4c37be35137a259e00eb947 100644
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
@@ -56,8 +56,7 @@ class PadOpMaker : public framework::OpProtoAndCheckerMaker {
              "The input should be a k-D tensor(k > 0 and k < 7)");
     AddOutput("Out",
               "The output of pad op."
-              "A tensor with the same shape as X.")
-        .NotInGradient();
+              "A tensor with the same shape as X.");
     AddComment(R"DOC(
 Pad input into output, as specified by paddings and pad_value. The input should be a k-D tensor(k > 0 and k < 7). As an example:
 
@@ -111,11 +110,29 @@ class PadOpGrad : public framework::OperatorWithKernel {
   }
 };
 
+class PadOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+  // Builds one pad_grad op; the desc is owned by unique_ptr from creation.
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    std::unique_ptr<framework::OpDescBind> bind(new framework::OpDescBind());
+    bind->SetInput("X", Input("X"));
+    bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    bind->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    bind->SetAttrMap(Attrs());  // forwards the forward op's attributes
+    bind->SetType("pad_grad");
+    return bind;  // a throwing setter above can no longer leak the desc
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(pad, ops::PadOp, ops::PadOpMaker, pad_grad, ops::PadOpGrad);
+
+REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ops::PadOpGradMaker);
+REGISTER_OPERATOR(pad_grad, ops::PadOpGrad);
 REGISTER_OP_CPU_KERNEL(pad, ops::PadKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(pad_grad,
                        ops::PadGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c29f51f05613832c838400eb114465c81290ea58
--- /dev/null
+++ b/paddle/operators/pool_op.cc
@@ -0,0 +1,195 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/pool_op.h"
+
+namespace paddle {
+namespace operators {
+
+int OutputSizePool(int input_size, int filter_size, int padding, int stride) {
+  // Extent of one pooled dimension, using integer division by the stride.
+  return (input_size - filter_size + 2 * padding) / stride + 1;
+}
+
+class PoolOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Shared by pool2d and pool3d: validates attributes and infers Out's shape.
+ protected:
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "X(Input) of Pooling should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Out(Output) of Pooling should not be null.");
+
+    auto in_x_dims = ctx->GetInputDim("X");
+    std::string pooling_type = ctx->Attrs().Get<std::string>("poolingType");
+    std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
+    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+
+    PADDLE_ENFORCE(pooling_type == "max" || pooling_type == "avg",
+                   "pooling_type should be 'max' or 'avg'");
+    PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
+                   "Pooling input should be 4-D or 5-D");
+
+    if (ctx->Attrs().Get<bool>("globalPooling")) {
+      ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
+      for (size_t i = 0; i < ksize.size(); ++i)
+        ksize[i] = static_cast<int>(in_x_dims[i + 2]);
+    }
+
+    PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U,
+                   "Input size and Pooling size should be consistent.");
+    PADDLE_ENFORCE(ksize.size() == 2 || ksize.size() == 3,
+                   "Pooling size should be 2 elements or 3 elements.");
+    PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
+                      "strides size and pooling size should be the same.");
+    PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(),
+                      "paddings size and pooling size should be the same.");
+    // Dims 0 and 1 pass through; a zero stride would divide by zero below.
+    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      PADDLE_ENFORCE_GT(strides[i], 0, "strides should be greater than 0.");
+      output_shape.push_back(
+          OutputSizePool(in_x_dims[i + 2], ksize[i], paddings[i], strides[i]));
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+
+class PoolOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // The gradient of X is given the same dimensions as X itself.
+ protected:
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    const auto x_grad_name = framework::GradVarName("X");
+    PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) of Pooling should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(x_grad_name),
+                   "Input@Grad of Pooling should not be null.");
+    ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
+  }
+};
+
+// Declares the inputs, outputs, attributes and doc text of the pool2d op.
+class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Pool2dOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "The input tensor of pooling operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddOutput("Out",
+              "The output tensor of pooling operator. "
+              "The format of output tensor is also NCHW.");
+
+    AddAttr<std::string>("poolingType",
+                         "PoolingType of pooling operator. "
+                         "Str constant equal to 'max' or 'avg'.")
+        .InEnum({"max", "avg"});
+    AddAttr<std::vector<int>>(
+        "ksize",
+        "Pooling size(height, width) of pooling operator. "
+        "If globalPooling = true, ksize is ignored and need not be "
+        "specified.");  // TODO(Add checker)
+    AddAttr<bool>(
+        "globalPooling",
+        "Whether to use the globalPooling. "
+        "Bool constant equal to false or true. "
+        "Default false. "
+        "If globalPooling = true, ksize is ignored and need not be specified.")
+        .SetDefault(false);
+    AddAttr<std::vector<int>>(
+        "strides", "Strides(height, width) of pooling operator. Default {1,1}.")
+        .SetDefault({1, 1});  // TODO(Add checker)
+    AddAttr<std::vector<int>>("paddings",
+                              "Paddings(height, width) of pooling operator. "
+                              "Default {0,0}.")
+        .SetDefault({0, 0});  // TODO(Add checker)
+    AddComment(R"DOC(
+The pooling2d operation calculates the output based on
+the input, poolingType and ksize, strides, paddings parameters.
+)DOC");
+  }
+};
+
+// Declares the inputs, outputs, attributes and doc text of the pool3d op.
+class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Pool3dOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The input tensor of pooling operator. "
+             "The format of input tensor is NCDHW. Where N is batch size, C is "
+             "the number of channels, D, H and W is the depth, height and "
+             "width of feature.");
+    AddOutput("Out",
+              "The output tensor of pooling operator. "
+              "The format of output tensor is also NCDHW.");
+
+    AddAttr<std::string>("poolingType",
+                         "PoolingType of pooling operator. "
+                         "str constant equal to 'max' or 'avg'.")
+        .InEnum({"max", "avg"});
+    AddAttr<std::vector<int>>(
+        "ksize",
+        "Pooling size(depth, height, width) of pooling operator. "
+        "If globalPooling = true, ksize is ignored and need not be "
+        "specified.");  // TODO(Add checker)
+    AddAttr<bool>(
+        "globalPooling",
+        "Whether to use the globalPooling. "
+        "Bool constant equal to false or true. "
+        "Default false. "
+        "If globalPooling = true, ksize is ignored and need not be specified.")
+        .SetDefault(false);
+    AddAttr<std::vector<int>>(
+        "strides",
+        "Strides(depth, height, width) of pooling operator. "
+        "Default {1,1,1}.")
+        .SetDefault({1, 1, 1});  // TODO(Add checker)
+    AddAttr<std::vector<int>>(
+        "paddings",
+        "Paddings(depth, height, width) of pooling operator. "
+        "Default {0,0,0}.")
+        .SetDefault({0, 0, 0});  // TODO(Add checker)
+    AddComment(R"DOC(
+The pooling3d operation calculates the output based on
+the input, poolingType and ksize, strides, paddings parameters.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad,
+            ops::PoolOpGrad);
+
+REGISTER_OP_CPU_KERNEL(pool2d,
+                       ops::PoolKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(pool2d_grad,
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>);
+// pool3d reuses the operator classes and kernel templates of pool2d.
+REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad,
+            ops::PoolOpGrad);
+
+REGISTER_OP_CPU_KERNEL(pool3d,
+                       ops::PoolKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(pool3d_grad,
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/pool_op.cu b/paddle/operators/pool_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0e3b80868f7b9d1697d619889160856d65ad59a3
--- /dev/null
+++ b/paddle/operators/pool_op.cu
@@ -0,0 +1,27 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/pool_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(pool2d,
+                       ops::PoolKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(pool2d_grad,
+                       ops::PoolGradKernel<paddle::platform::GPUPlace, float>);
+// pool3d registers the same kernel templates as pool2d.
+REGISTER_OP_GPU_KERNEL(pool3d,
+                       ops::PoolKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(pool3d_grad,
+                       ops::PoolGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2bc358def42959f2cc8f61cb00436fae1b7514b
--- /dev/null
+++ b/paddle/operators/pool_op.h
@@ -0,0 +1,147 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/pooling.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+// Forward kernel shared by pool2d and pool3d. Reads input "X", dispatches on
+// ksize.size() (2 or 3) and poolingType ("max" / "avg") to the matching
+// math:: functor, and passes it output "Out" to fill.
+template <typename Place, typename T>
+class PoolKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    Tensor* out = context.Output<Tensor>("Out");
+
+    std::string pooling_type = context.Attr<std::string>("poolingType");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+
+    if (context.Attr<bool>("globalPooling")) {
+      // The supplied ksize is ignored here and may have any length, so size
+      // it from the input rank exactly as PoolOp::InferShape does; otherwise
+      // a ksize of the wrong length makes the switch below a silent no-op.
+      ksize.resize(static_cast<size_t>(in_x->dims().size()) - 2);
+      for (size_t i = 0; i < ksize.size(); ++i) {
+        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
+      }
+    }
+
+    const auto& dev_ctx = context.device_context();
+    switch (ksize.size()) {
+      case 2: {
+        if (pooling_type == "max") {
+          math::Pool2dFunctor<Place, math::MaxPool<T>, T> pool2d_forward;
+          math::MaxPool<T> pool_process;
+          pool2d_forward(dev_ctx, *in_x, *out, ksize, strides, paddings,
+                         pool_process);
+        } else if (pooling_type == "avg") {
+          math::Pool2dFunctor<Place, math::AvgPool<T>, T> pool2d_forward;
+          math::AvgPool<T> pool_process;
+          pool2d_forward(dev_ctx, *in_x, *out, ksize, strides, paddings,
+                         pool_process);
+        }
+      } break;
+      case 3: {
+        if (pooling_type == "max") {
+          math::Pool3dFunctor<Place, math::MaxPool<T>, T> pool3d_forward;
+          math::MaxPool<T> pool_process;
+          pool3d_forward(dev_ctx, *in_x, *out, ksize, strides, paddings,
+                         pool_process);
+        } else if (pooling_type == "avg") {
+          math::Pool3dFunctor<Place, math::AvgPool<T>, T> pool3d_forward;
+          math::AvgPool<T> pool_process;
+          pool3d_forward(dev_ctx, *in_x, *out, ksize, strides, paddings,
+                         pool_process);
+        }
+      } break;
+    }
+  }
+};
+
+// Backward kernel shared by pool2d_grad and pool3d_grad: zero-fills the
+// gradient of X, then calls the grad functor matching ksize and poolingType.
+template <typename Place, typename T>
+class PoolGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    const Tensor* out = context.Input<Tensor>("Out");
+    const Tensor* out_grad =
+        context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+
+    std::string pooling_type = context.Attr<std::string>("poolingType");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+
+    if (context.Attr<bool>("globalPooling")) {
+      // Size ksize from the input rank, matching PoolOp::InferShape.
+      ksize.resize(static_cast<size_t>(in_x->dims().size()) - 2);
+      for (size_t i = 0; i < ksize.size(); ++i)
+        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
+    }
+
+    if (in_x_grad) {
+      in_x_grad->mutable_data<T>(context.GetPlace());
+      // Zero-fill the gradient buffer before invoking the grad functor.
+      auto temp = framework::EigenVector<T>::Flatten(*in_x_grad);
+      temp.device(context.GetEigenDevice<Place>()) =
+          temp.constant(static_cast<T>(0));
+
+      const auto& dev_ctx = context.device_context();
+      switch (ksize.size()) {
+        case 2: {
+          if (pooling_type == "max") {
+            math::MaxPool2dGradFunctor<Place, T> pool2d_backward;
+            pool2d_backward(dev_ctx, *in_x, *in_x_grad, *out, *out_grad, ksize,
+                            strides, paddings);
+          } else if (pooling_type == "avg") {
+            math::Pool2dGradFunctor<Place, math::AvgPoolGrad<T>, T> pool2d_backward;
+            math::AvgPoolGrad<T> pool_process;
+            pool2d_backward(dev_ctx, *in_x, *in_x_grad, *out, *out_grad, ksize,
+                            strides, paddings, pool_process);
+          }
+        } break;
+        case 3: {
+          if (pooling_type == "max") {
+            math::MaxPool3dGradFunctor<Place, T> pool3d_backward;
+            pool3d_backward(dev_ctx, *in_x, *in_x_grad, *out, *out_grad, ksize,
+                            strides, paddings);
+          } else if (pooling_type == "avg") {
+            math::Pool3dGradFunctor<Place, math::AvgPoolGrad<T>, T> pool3d_backward;
+            math::AvgPoolGrad<T> pool_process;
+            pool3d_backward(dev_ctx, *in_x, *in_x_grad, *out, *out_grad, ksize,
+                            strides, paddings, pool_process);
+          }
+        } break;
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index 80de229c333f645fb3098b97fa076c6b77bb7ca9..04c4c24951f5db572486ded5edfc26948a821682 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -30,36 +30,39 @@ using LoDTensor = framework::LoDTensor;
 
 void RecurrentAlgorithm::Run(const Scope& scope,
                              const platform::DeviceContext& dev_ctx) const {
-  auto step_scopes = GetStepScopes(scope);
-  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
-                     false /*infer_shape_mode*/);
-  InitMemories(step_scopes[0], false /*infer_shape_mode*/);
+  auto* input0 = scope.FindVar(arg_->inlinks[0]);
+  PADDLE_ENFORCE_NOT_NULL(input0);
+  size_t seq_len = input0->GetMutable<LoDTensor>()->dims()[0];
+  PADDLE_ENFORCE_GT(seq_len, 0);
 
-  for (size_t step_id = 0; step_id < seq_len_; step_id++) {
-    // create output alias variables
+  CreateScopes(scope, seq_len);
+  auto& step_scopes = GetStepScopes(scope);
+  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len);
+  InitMemories(step_scopes[0]);
+
+  for (size_t step_id = 0; step_id < seq_len; step_id++) {
     if (step_id > 0) {
-      rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1,
-                        false /*infer_shape_mode*/);
+      rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1);
     }
     (*stepnet_)->Run(*step_scopes[step_id], dev_ctx);
   }
-  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
-                     false /*infer_shape_mode*/);
+  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len);
 }
 
-void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
+void RecurrentAlgorithm::CreateScopes(const Scope& scope,
+                                      size_t seq_len) const {
   // TODO(superjom) Only two scopes are needed for inference, this case will be
   // supported later.
-  auto step_scopes_var = scope.FindVar(arg_->step_scopes);
+  auto* step_scopes_var = scope.FindVar(arg_->step_scopes);
   PADDLE_ENFORCE(step_scopes_var != nullptr, "");
-  auto step_scopes = step_scopes_var->GetMutable<std::vector<Scope*>>();
+  auto* step_scopes = step_scopes_var->GetMutable<std::vector<Scope*>>();
 
   // Now all variables in scope must be created outside of op.
   PADDLE_ENFORCE_NOT_NULL(stepnet_);
   PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(), "stepnet_ op has no outputs");
 
-  if (seq_len_ > step_scopes->size()) {
-    for (size_t i = step_scopes->size(); i < seq_len_; ++i) {
+  if (seq_len > step_scopes->size()) {
+    for (size_t i = step_scopes->size(); i < seq_len; ++i) {
       auto& step_scope = scope.NewScope();
 
       // create step net's temp inputs
@@ -82,8 +85,7 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
   }
 }
 
-void RecurrentAlgorithm::InitMemories(Scope* step_scope,
-                                      bool infer_shape_mode) const {
+void RecurrentAlgorithm::InitMemories(Scope* step_scope) const {
   for (auto& attr : arg_->memories) {
     auto* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable<LoDTensor>();
     PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
@@ -91,12 +93,9 @@ void RecurrentAlgorithm::InitMemories(Scope* step_scope,
                    attr.boot_var);
     auto* boot_mem =
         step_scope->FindVar(attr.boot_var)->GetMutable<LoDTensor>();
-    if (infer_shape_mode) {
-      pre_mem->Resize(boot_mem->dims());
-      PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2);
-    } else {
-      pre_mem->ShareDataWith<float>(*boot_mem);
-    }
+    pre_mem->Resize(boot_mem->dims());
+    PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2);
+    pre_mem->ShareDataWith<float>(*boot_mem);
   }
 }
 
@@ -146,23 +145,23 @@ class RecurrentAlgorithmProtoAndCheckerMaker
 
 void RecurrentGradientAlgorithm::Run(
     const Scope& scope, const platform::DeviceContext& dev_ctx) const {
-  auto step_scopes = GetStepScopes(scope);
-  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
-                     false /*infer_shape_mode*/);
-  for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
-    if (static_cast<size_t>(step_id) != seq_len_ - 1) {
-      rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1,
-                        false /*infer_shape_mode*/);
+  auto* input0 = scope.FindVar(arg_->inlinks[0]);
+  PADDLE_ENFORCE_NOT_NULL(input0);
+  size_t seq_len = input0->GetMutable<LoDTensor>()->dims()[0];
+  auto& step_scopes = GetStepScopes(scope);
+  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len);
+  for (int step_id = static_cast<int>(seq_len) - 1; step_id >= 0; --step_id) {
+    if (static_cast<size_t>(step_id) != seq_len - 1) {  // has a next step
+      rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1);
     }
     (*stepnet_)->Run(*step_scopes[step_id], dev_ctx);
   }
-  LinkBootMemoryGradients(step_scopes[0], false);
-  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
-                     false /*infer_shape_mode*/);
+  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len);
+  LinkBootMemoryGradients(step_scopes[0]);
 }
 
 void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
-    Scope* step_scope, bool infer_shape_mode) const {
+    Scope* step_scope) const {
   for (auto& attr : arg_->memories) {
     PADDLE_ENFORCE(step_scope->FindVar(attr.var) != nullptr,
                    "memory variable [%s] does not exists", attr.var);
@@ -171,11 +170,8 @@ void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
     auto* mem_grad = step_scope->NewVar(attr.var)->GetMutable<LoDTensor>();
     auto* boot_mem_grad =
         step_scope->NewVar(attr.boot_var)->GetMutable<LoDTensor>();
-    if (infer_shape_mode) {
-      boot_mem_grad->Resize(mem_grad->dims());
-    } else {
-      boot_mem_grad->ShareDataWith<float>(*mem_grad);
-    }
+    boot_mem_grad->Resize(mem_grad->dims());
+    boot_mem_grad->ShareDataWith<float>(*mem_grad);
   }
 }
 
diff --git a/paddle/operators/recurrent_op.h b/paddle/operators/recurrent_op.h
index c6b9a5533eece9057449b5c875ddcb3cefe716f0..253d7e3284360ceaddce9ef5f8f9a3ea4793d740 100644
--- a/paddle/operators/recurrent_op.h
+++ b/paddle/operators/recurrent_op.h
@@ -48,7 +48,7 @@ class RecurrentAlgorithm {
    * NOTE the scopes are reused in both the forward and backward, so just
    * create once and expand its size if more steps need.
    */
-  void CreateScopes(const framework::Scope& scope) const;
+  void CreateScopes(const framework::Scope& scope, size_t seq_len) const;
 
   const std::vector<framework::Scope*>& GetStepScopes(
       const framework::Scope& scope) const {
@@ -56,12 +56,11 @@ class RecurrentAlgorithm {
                 ->GetMutable<std::vector<framework::Scope*>>();
   }
 
-  void InitMemories(framework::Scope* step_scopes, bool infer_shape_mode) const;
+  void InitMemories(framework::Scope* step_scopes) const;
 
  private:
   std::unique_ptr<framework::OperatorBase>* stepnet_;
   rnn::Argument* arg_;
-  mutable size_t seq_len_;
 };
 
 class RecurrentGradientAlgorithm {
@@ -86,8 +85,7 @@ class RecurrentGradientAlgorithm {
   void Run(const framework::Scope& scope,
            const platform::DeviceContext& dev_ctx) const;
 
-  void LinkBootMemoryGradients(framework::Scope* step_scopes,
-                               bool infer_shape_mode) const;
+  void LinkBootMemoryGradients(framework::Scope* step_scopes) const;
 
  protected:
   inline const std::vector<framework::Scope*>& GetStepScopes(
@@ -98,7 +96,6 @@ class RecurrentGradientAlgorithm {
 
  private:
   rnn::Argument* arg_;
-  mutable size_t seq_len_;
   std::unique_ptr<framework::OperatorBase>* stepnet_;
 };
 
@@ -123,6 +120,7 @@ class RecurrentOp : public framework::OperatorBase {
   void set_stepnet(std::unique_ptr<OperatorBase> net) {
     stepnet_ = std::move(net);
   }
+
   const OperatorBase& stepnet() const { return *stepnet_; }
 
   static const rnn::ArgumentName kArgName;
diff --git a/paddle/operators/rmsprop_op.cc b/paddle/operators/rmsprop_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8f61c7fdda9f80c69745a9bc4569fcbc099630aa
--- /dev/null
+++ b/paddle/operators/rmsprop_op.cc
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/rmsprop_op.h"
+
+namespace paddle {
+namespace operators {
+
+class RmspropOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("MeanSquare"),
+                   "Input(MeanSquare) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Moment"),
+                   "Input(Moment) of RmspropOp should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
+                   "Output(MomentOut) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"),
+                   "Output(MeanSquareOut) of RmspropOp should not be null.");
+    // Grad, Moment and MeanSquare must all have the same shape as Param.
+    auto param_dim = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(
+        param_dim, ctx->GetInputDim("Grad"),
+        "Param and grad input of RmspropOp should have the same dimension.");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Moment"),
+                      "Param and Moment input of RmspropOp "
+                      "should have the same dimension.");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("MeanSquare"),
+                      "Param and MeanSquare input of RmspropOp "
+                      "should have the same dimension.");
+
+    auto lr_dim = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1,
+                      "Learning Rate should be a scalar.");
+    // Every output takes the shape of Param.
+    ctx->SetOutputDim("ParamOut", param_dim);
+    ctx->SetOutputDim("MomentOut", param_dim);
+    ctx->SetOutputDim("MeanSquareOut", param_dim);
+  }
+};
+
+class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  RmspropOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param",
+             "(Tensor, default Tensor<float>) "
+             "Input parameter value that has to be updated");
+    AddInput("MeanSquare",
+             "(Tensor, default Tensor<float>)"
+             " The mean square value that gets updated");
+    AddInput("LearningRate",
+             "(Tensor, default Tensor<float>) "
+             "The learning rate should be a tensor of size 1");
+    AddInput("Grad",
+             "(Tensor, default Tensor<float>) "
+             "Input gradient of the parameter");
+    AddInput("Moment",
+             "(Tensor, default Tensor<float>) The moment that gets updated");
+    // Outputs: the updated counterparts of Param, Moment and MeanSquare.
+    AddOutput("ParamOut", "(Tensor) Output updated parameter value");
+    AddOutput("MomentOut", "(Tensor) Output updated moment");
+    AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value");
+    // Hyper-parameters used by the update formulas in the comment below.
+    AddAttr<float>("epsilon",
+                   "(float, default 1e-10) Constant "
+                   "for numerical stability.")
+        .SetDefault(1.0e-10f);
+    AddAttr<float>("decay",
+                   "(float, default 0.9) "
+                   "Discounting factor for coming gradient.")
+        .SetDefault(0.9f);
+    AddAttr<float>("momentum", "(float, default 0.0) Constant value")
+        .SetDefault(0.0f);
+    AddComment(R"DOC(
+
+RMSprop
+
+MeanSquareOut = decay * MeanSquare + (1 - decay) * Grad * Grad
+MomentOut = momentum * Moment +
+            LearningRate * Grad / sqrt(MeanSquareOut + epsilon)
+ParamOut = Param -  MomentOut
+
+The original slides that proposed RMSprop: Slide 29 of
+http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker);
+REGISTER_OP_CPU_KERNEL(rmsprop,
+                       ops::RmspropOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/rmsprop_op.cu b/paddle/operators/rmsprop_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..52634a54816bcd5ad0ba82a56f1df95110112265
--- /dev/null
+++ b/paddle/operators/rmsprop_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/rmsprop_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(rmsprop,
+                       ops::RmspropOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/rmsprop_op.h b/paddle/operators/rmsprop_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..7bf2129010f994966d79ef11d5cec30159b47068
--- /dev/null
+++ b/paddle/operators/rmsprop_op.h
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename Place, typename T>  // Place: device tag, T: element type
+class RmspropOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* param_out = ctx.Output<Tensor>("ParamOut");
+    auto* moment_out = ctx.Output<Tensor>("MomentOut");
+    auto* mean_square_out = ctx.Output<Tensor>("MeanSquareOut");
+
+    auto grad = ctx.Input<Tensor>("Grad");
+
+    param_out->mutable_data<T>(ctx.GetPlace());
+    moment_out->mutable_data<T>(ctx.GetPlace());
+    mean_square_out->mutable_data<T>(ctx.GetPlace());
+
+    float epsilon = ctx.Attr<float>("epsilon");
+    float rho = ctx.Attr<float>("decay");
+    float momentum = ctx.Attr<float>("momentum");
+    // View every input and output tensor as a flat 1-D Eigen vector.
+    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
+    auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare"));
+    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
+    auto g = EigenVector<T>::Flatten(*grad);
+    auto mom = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment"));
+
+    auto p_out = EigenVector<T>::Flatten(*param_out);
+    auto mom_out = EigenVector<T>::Flatten(*moment_out);
+    auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
+    auto place = ctx.GetEigenDevice<Place>();
+    // Broadcast size that stretches LearningRate to Grad's element count.
+    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
+    // Update order matters: MeanSquareOut feeds MomentOut, which feeds ParamOut.
+    ms_out.device(place) = rho * ms + (1 - rho) * g * g;
+    mom_out.device(place) =
+        momentum * mom +
+        lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt();
+    p_out.device(place) = p - mom_out;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc
index a767009d2366e20d2ebd35f562b8df7d408f2d4e..ef317a71f12c6de974bd8715bb08122b761fae37 100644
--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
@@ -25,7 +25,7 @@ using LoDTensor = framework::LoDTensor;
 
 void SegmentInputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<std::string>& inlinks,
-                   const size_t seq_len, bool infer_shape_mode) {
+                   const size_t seq_len) {
   PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided.");
   for (size_t i = 0; i < inlinks.size(); ++i) {
     // global inputs
@@ -41,11 +41,9 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
     for (size_t j = 0; j < seq_len; j++) {
       Tensor* step_input =
           step_scopes[j]->NewVar(inlinks[i])->GetMutable<Tensor>();
-      if (!infer_shape_mode) {
-        // The input of operators of each step is Tensor here.
-        // Maybe need to modify Slice function.
-        *step_input = input->Slice<float>(j, j + 1);
-      }
+      // The input of operators of each step is Tensor here.
+      // Maybe need to modify Slice function.
+      *step_input = input->Slice<float>(j, j + 1);
       step_input->Resize(step_dims);
     }
   }
@@ -53,39 +51,35 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
 
 void ConcatOutputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<std::string>& outlinks,
-                   const size_t seq_len, bool infer_shape_mode) {
+                   const size_t seq_len) {
   for (size_t i = 0; i < outlinks.size(); i++) {
-    auto output_var = step_scopes[0]->parent().FindVar(outlinks[i]);
+    auto* output_var = step_scopes[0]->parent().FindVar(outlinks[i]);
     PADDLE_ENFORCE_NOT_NULL(output_var, "output link [%s] is not in scope.",
                             outlinks[i]);
     LoDTensor* output = output_var->GetMutable<LoDTensor>();
 
-    if (infer_shape_mode) {
-      auto step_scope_var = step_scopes[0]->FindVar(outlinks[i]);
-      PADDLE_ENFORCE_NOT_NULL(step_scope_var, "%s not in scope", outlinks[i]);
-      f::DDim step_dims =
-          step_scope_var->template GetMutable<LoDTensor>()->dims();
-      std::vector<int64_t> dims_vec = vectorize(step_dims);
-      dims_vec.insert(dims_vec.begin(), seq_len);
-      output->Resize(f::make_ddim(dims_vec));
-    } else {
-      output->mutable_data<float>(platform::CPUPlace());
-      for (size_t j = 0; j < seq_len; j++) {
-        LoDTensor* step_output =
-            step_scopes[j]->FindVar(outlinks[i])->GetMutable<LoDTensor>();
-        // TODO(luotao02) data type and platform::DeviceContext() should set
-        // correctly
-        (output->Slice<float>(j, j + 1))
-            .CopyFrom<float>(*step_output, platform::CPUPlace());
-      }
+    auto* step_scope_var = step_scopes[0]->FindVar(outlinks[i]);
+    PADDLE_ENFORCE_NOT_NULL(step_scope_var, "%s not in scope", outlinks[i]);
+    f::DDim step_dims =
+        step_scope_var->template GetMutable<LoDTensor>()->dims();
+    std::vector<int64_t> dims_vec = vectorize(step_dims);
+    dims_vec.insert(dims_vec.begin(), seq_len);
+    output->Resize(f::make_ddim(dims_vec));
+    output->mutable_data<float>(platform::CPUPlace());
+    for (size_t j = 0; j < seq_len; j++) {
+      LoDTensor* step_output =
+          step_scopes[j]->FindVar(outlinks[i])->GetMutable<LoDTensor>();
+      // TODO(luotao02) data type and platform::DeviceContext() should set
+      // correctly
+      (output->Slice<float>(j, j + 1))
+          .CopyFrom<float>(*step_output, platform::CPUPlace());
     }
   }
 }
 
 void LinkMemories(const std::vector<Scope*>& scopes,
                   const std::vector<rnn::MemoryAttr>& memories,
-                  const size_t step_id, const int offset,
-                  bool infer_shape_mode) {
+                  const size_t step_id, const int offset) {
   PADDLE_ENFORCE_LT(step_id, scopes.size(),
                     "step [%d] is out of range of step scopes' size [%d]",
                     step_id, scopes.size());
@@ -95,16 +89,13 @@ void LinkMemories(const std::vector<Scope*>& scopes,
       step_id + offset, scopes.size(),
       "offset [%d] is out of range, it must be less than (%d - %d)", offset,
       scopes.size(), step_id);
-  auto scope = scopes[step_id];
-  auto linked_scope = scopes[step_id + offset];
+  auto* scope = scopes[step_id];
+  auto* linked_scope = scopes[step_id + offset];
   for (auto& attr : memories) {
-    auto mem = scope->FindVar(attr.pre_var)->GetMutable<LoDTensor>();
-    auto linked_mem = linked_scope->FindVar(attr.var)->GetMutable<LoDTensor>();
-    if (infer_shape_mode) {
-      mem->Resize(linked_mem->dims());
-    } else {
-      mem->ShareDataWith<float>(*linked_mem);
-    }
+    auto* mem = scope->FindVar(attr.pre_var)->GetMutable<LoDTensor>();
+    auto* linked_mem = linked_scope->FindVar(attr.var)->GetMutable<LoDTensor>();
+    mem->Resize(linked_mem->dims());
+    mem->ShareDataWith<float>(*linked_mem);
   }
 }
 
@@ -115,11 +106,11 @@ void InitArgument(const ArgumentName& name, Argument* arg,
   arg->inlinks = op.Inputs(name.inlinks);
   arg->outlinks = op.Outputs(name.outlinks);
 
-  auto boot_memories =
+  auto& boot_memories =
       is_grad ? op.Outputs(name.boot_memories) : op.Inputs(name.boot_memories);
   // attributes
-  auto memories = op.Attr<std::vector<std::string>>(name.memories);
-  auto pre_memories = op.Attr<std::vector<std::string>>(name.pre_memories);
+  auto& memories = op.Attr<std::vector<std::string>>(name.memories);
+  auto& pre_memories = op.Attr<std::vector<std::string>>(name.pre_memories);
 
   PADDLE_ENFORCE(memories.size() == boot_memories.size(),
                  "the size of memories, boot_memories don't match:%d,%d",
diff --git a/paddle/operators/rnn/recurrent_op_utils.h b/paddle/operators/rnn/recurrent_op_utils.h
index 9c777f1e9067a3e2ceb9d23f7bf7d3c73343c91f..fd17b9b88915cf458ff2836b5c5d8f84cd9b65b5 100644
--- a/paddle/operators/rnn/recurrent_op_utils.h
+++ b/paddle/operators/rnn/recurrent_op_utils.h
@@ -64,18 +64,18 @@ struct ArgumentName {
  */
 void SegmentInputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<std::string>& inlinks,
-                   const size_t seq_len, bool infer_shape_mode);
+                   const size_t seq_len);
 
 /**
  * Process outputs of step nets and merge to variables.
  */
 void ConcatOutputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<std::string>& outlinks,
-                   const size_t seq_len, bool infer_shape_mode);
+                   const size_t seq_len);
 
 void LinkMemories(const std::vector<Scope*>& step_scopes,
                   const std::vector<MemoryAttr>& memories, const size_t step_id,
-                  const int offset, bool infer_shape_mode);
+                  const int offset);
 
 void InitArgument(const ArgumentName& name, Argument* arg,
                   const framework::OperatorBase& op, bool is_grad = false);
diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
deleted file mode 100644
index 1fcf0959dffd6a68d97dec4e2b5b509d06c0d09c..0000000000000000000000000000000000000000
--- a/paddle/operators/rowwise_add_op.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/operators/rowwise_add_op.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class RowwiseAddOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of RowwiseAddOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("b"),
-                   "Input(b) of RowwiseAddOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of RowwiseAddOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto b_dims = ctx->GetInputDim("b");
-    PADDLE_ENFORCE_GT(
-        x_dims.size(), b_dims.size(),
-        "The rank of input `X` must be larger than the one of input `b`.");
-
-    int num_col_dims = x_dims.size() - b_dims.size();
-
-    PADDLE_ENFORCE_EQ(
-        framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims,
-        "The width of two operands must be same");
-    PADDLE_ENFORCE_EQ(ctx->Outputs("Out").size(), 1,
-                      "The output size must be 1");
-    ctx->SetOutputDim("Out", x_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class RowwiseAddOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  RowwiseAddOpMaker(framework::OpProto* proto,
-                    framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The left input of row-wise add op, must be matrix");
-    AddInput("b", "The right input of row-wise add op, must be vector");
-    AddOutput("Out", "The output of row-wise add op");
-    AddComment(R"DOC(Row-wise Add operator
-
-for i in xrange(X.shape[0]):
-  Out = X[i] + b
-)DOC");
-  }
-};
-class RowwiseAddGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "X should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("b"), "b should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx->GetInputDim("X");
-    auto b_dims = ctx->GetInputDim("b");
-    PADDLE_ENFORCE_GT(
-        x_dims.size(), b_dims.size(),
-        "The rank of input `X` must be larger than the one of input `b`.");
-
-    int64_t num_col_dims = x_dims.size() - b_dims.size();
-    PADDLE_ENFORCE_EQ(
-        framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims,
-        "The width of two operands must be same");
-    auto x_grad_name = framework::GradVarName("X");
-    auto b_grad_name = framework::GradVarName("b");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-    }
-    if (ctx->HasOutput(b_grad_name)) {
-      ctx->SetOutputDim(b_grad_name, b_dims);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP(rowwise_add, ops::RowwiseAddOp, ops::RowwiseAddOpMaker,
-            rowwise_add_grad, ops::RowwiseAddGradOp);
-REGISTER_OP_CPU_KERNEL(
-    rowwise_add, ops::RowwiseAddKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    rowwise_add_grad,
-    ops::RowwiseAddGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h
deleted file mode 100644
index b43e5d868b38350a74ca1a94880990da6d7da0bc..0000000000000000000000000000000000000000
--- a/paddle/operators/rowwise_add_op.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename Place, typename T>
-class RowwiseAddKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto out = context.Output<Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-    int num_col_dims = context.Input<Tensor>("X")->dims().size() -
-                       context.Input<Tensor>("b")->dims().size();
-    auto input =
-        EigenMatrix<T>::Reshape(*context.Input<Tensor>("X"), num_col_dims);
-    auto bias = EigenVector<T>::Flatten(*context.Input<Tensor>("b"));
-    auto output = EigenMatrix<T>::Reshape(*out, num_col_dims);
-
-    const int bias_size = bias.dimension(0);
-    const int rest_size = input.size() / bias_size;
-    Eigen::DSizes<int, 1> one_d(input.size());
-    Eigen::DSizes<int, 1> bcast(rest_size);
-    output.reshape(one_d).device(context.GetEigenDevice<Place>()) =
-        input.reshape(one_d) + bias.broadcast(bcast).reshape(one_d);
-  }
-};
-
-template <typename Place, typename T>
-class RowwiseAddGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
-    auto* db = context.Output<Tensor>(framework::GradVarName("b"));
-    int num_col_dims = context.Input<Tensor>("X")->dims().size() -
-                       context.Input<Tensor>("b")->dims().size();
-
-    auto out_grad = EigenMatrix<T>::Reshape(*dout, num_col_dims);
-    auto place = context.GetEigenDevice<Place>();
-
-    if (dx) {
-      dx->mutable_data<T>(context.GetPlace());
-      EigenMatrix<T>::Reshape(*dx, num_col_dims).device(place) = out_grad;
-    }
-
-    if (db) {
-      db->mutable_data<T>(context.GetPlace());
-      // https://eigen.tuxfamily.org/dox/unsupported/TensorBase_8h_source.html
-      // colwise add
-      Eigen::array<int, 1> dims{{0}}; /* dimension to reduce */
-      EigenVector<T>::Flatten(*db).device(place) = out_grad.sum(dims);
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc
index e92501e12834b92875f494de401672344f50e3b5..e225aecc270bc17c535c10253c970b888c42e5d3 100644
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -41,8 +41,8 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input tensor of scale operator.").NotInGradient();
-    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
+    AddInput("X", "The input tensor of scale operator.");
+    AddOutput("Out", "The output tensor of scale operator.");
     AddComment(R"DOC(Scale operator
 
 The equation is: Out = scale*X
@@ -52,21 +52,18 @@ The equation is: Out = scale*X
   }
 };
 
-// The operator to calculate gradients of a scale operator is just the scale
-// operator itself.
-// Grad(Out=scale(X)) => Grad(X) = scale(Grad(Out))
-template <typename AttrType>
-class ScaleGradOp : public NetOp {
+class ScaleGradMaker : public framework::SingleGradOpDescMaker {
  public:
-  ScaleGradOp(const std::string &type, const framework::VariableNameMap &inputs,
-              const framework::VariableNameMap &outputs,
-              const framework::AttributeMap &attrs)
-      : NetOp(type, inputs, outputs, attrs) {
-    AppendOp(framework::OpRegistry::CreateOp(
-        "scale", {{"X", {Input(framework::GradVarName("Out"))}}},
-        {{"Out", {Output(framework::GradVarName("X"))}}},
-        {{"scale", Attr<AttrType>("scale")}}));
-    CompleteAddOp(false);
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    std::unique_ptr<framework::OpDescBind> grad_op(new framework::OpDescBind());
+    grad_op->SetType("scale");  // Grad(X) = scale * Grad(Out)
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttr("scale", GetAttr("scale"));  // reuse the forward factor
+    return grad_op;  // owned from the start, so a throwing setter cannot leak
   }
 };
 
@@ -75,7 +72,7 @@ class ScaleGradOp : public NetOp {
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(scale, ops::ScaleOp, ops::ScaleOpMaker<float>, scale_grad,
-            ops::ScaleGradOp<float>);
+REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>,
+                  ops::ScaleGradMaker);
 REGISTER_OP_CPU_KERNEL(scale,
                        ops::ScaleKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/scatter.cu.h b/paddle/operators/scatter.cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..d95436be4f25b9df4aaef57ddb249ecf944f0666
--- /dev/null
+++ b/paddle/operators/scatter.cu.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+// Copies row r of params into row indices[r] of output, one element at a time.
+template <typename T>
+__global__ void ScatterCUDAKernel(const T* params, const int* indices,
+                                  T* output, size_t index_size,
+                                  size_t slice_size) {
+  CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
+    int row = i / slice_size;        // which entry of indices
+    int col = i - row * slice_size;  // offset inside the slice
+    int dst = indices[row] * slice_size + col;
+    output[dst] = params[i];
+  }
+}
+
+/**
+ * Scatter-assign on GPU: output[index[i]] = src[i], row by row, where a row is
+ * everything past the first dimension. output is written in place.
+ * input[ctx]: device context, cast to CUDADeviceContext for its stream
+ * input[src]: type-T source Tensor
+ * input[index]: type-int index Tensor (1-D)
+ * output: type-T destination Tensor, accessed through data<T>()
+ */
+template <typename T>
+void GPUScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
+                      const Tensor& index, Tensor* output) {
+  // ctx is reinterpreted as a CUDADeviceContext below; require a GPU place.
+  PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()));
+  // check index of shape 1-D
+  PADDLE_ENFORCE(index.dims().size() == 1);
+  int index_size = index.dims()[0];
+
+  // slice size: number of elements in one row of src
+  auto src_dims = src.dims();
+  int slice_size = 1;
+  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+
+  const T* p_src = src.data<T>();
+  const int* p_index = index.data<int>();
+  T* p_output = output->data<T>();
+
+  int n = slice_size * index_size;
+  if (n == 0) return;  // a zero-sized grid is an invalid launch configuration
+  int block = 512;
+  int grid = (n + block - 1) / block;
+
+  // Enough 512-thread blocks to cover all n elements, on the context's stream.
+  ScatterCUDAKernel<T><<<
+      grid, block, 0,
+      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+      p_src, p_index, p_output, index_size, slice_size);
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/scatter.h b/paddle/operators/scatter.h
index 6b542675c291607b35f180123cf42fee6a783a85..c1fb844ebd2ff7ca7dbdb8e8ac3c1fff4c0c6607 100644
--- a/paddle/operators/scatter.h
+++ b/paddle/operators/scatter.h
@@ -24,67 +24,42 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-// Implementation of CPU copy
-template <typename T>
-void CPUScatterUpdate(const paddle::framework::Tensor* src, const int* index,
-                      const size_t index_size,
-                      paddle::framework::Tensor* output) {
-  paddle::framework::DDim output_dims = output->dims();
-
-  for (size_t i = 0; i < index_size; ++i) {
-    int index_ = index[i];
-
-    paddle::framework::Tensor src_ = *src;
-    paddle::framework::Tensor output_ = *output;
-    if (index_size > 1) src_ = src->Slice<T>(i, i + 1);
-    if (output_dims[0] > 1) output_ = output->Slice<T>(index_, index_ + 1);
-
-    auto X = EigenVector<T>::Flatten(src_);
-    auto Y = EigenVector<T>::Flatten(output_);
-
-    Y = X + Y;
-  }
-}
-
-// Implementation of GPU scatter:
-template <typename T>
-void GPUScatterUpdate(const T* src, const int* index, const int slice_size,
-                      const int index_size, T* output);
 
 /**
  * Return a updated tensor from source tensor, scattered according to index:
- * dst[i] += src[index[i]]
+ * dst[index[i]] = src[i]
  * input[src]: type-T source Tensor
  * input[index]: type-int index Tensor (1-D)
  * return: output tensor
  */
 template <typename T>
-void ScatterUpdate(const platform::Place& place,
-                   const paddle::framework::Tensor* src,
-                   const paddle::framework::Tensor* index,
-                   paddle::framework::Tensor* output) {
+void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
+                   const Tensor& index, Tensor* output) {
+  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
   // check index of shape 1-D
-  PADDLE_ENFORCE(index->dims().size() == 1);
-  int index_size = index->dims()[0];
+  PADDLE_ENFORCE(index.dims().size() == 1);
+  int index_size = index.dims()[0];
 
-  auto src_dims = src->dims();
+  auto src_dims = src.dims();
   auto dst_dims = output->dims();
 
+  const T* p_src = src.data<T>();
+  const int* p_index = index.data<int>();
+  T* p_output = output->data<T>();
+
   // check src shape and dst shape should match
   for (int i = 1; i < src_dims.size(); i++)
     PADDLE_ENFORCE(src_dims[i] == dst_dims[i]);
 
   // slice size
   size_t slice_size = 1;
-  for (int i = 0; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+
+  const size_t slice_bytes = slice_size * sizeof(T);
 
-  if (platform::is_cpu_place(place)) {
-    CPUScatterUpdate<T>(src, index->data<int>(), index_size, output);
-  } else {
+  for (int i = 0; i < index_size; ++i) {
+    int index_ = p_index[i];
+    memcpy(p_output + index_ * slice_size, p_src + i * slice_size, slice_bytes);
   }
 }
 
diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc
index cadd8841b6ab3a3674054240265eb6d4b474db1e..d15ba151539987c133ac57e102df53551483c6dd 100644
--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
@@ -97,8 +97,5 @@ Out[Index] = Ref[Index] + Updates
 namespace ops = paddle::operators;
 REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad,
             ops::ScatterGradOp);
-REGISTER_OP_CPU_KERNEL(scatter,
-                       ops::ScatterOpKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    scatter_grad,
-    ops::ScatterGradientOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel<float>);
+REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel<float>);
diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..06f4d759447b6dcd28b50576dfc246fc466d9336
--- /dev/null
+++ b/paddle/operators/scatter_op.cu
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "gather.cu.h"
+#include "paddle/operators/gather_op.h"
+#include "scatter.cu.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class ScatterOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto *Ref = ctx.Input<Tensor>("Ref");
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Updates = ctx.Input<Tensor>("Updates");
+    auto *Out = ctx.Output<Tensor>("Out");
+    // In place output: Out = Ref (Out is made to share Ref's data).
+    Out->ShareDataWith<T>(*Ref);
+    // Scatter-assign on the GPU: Out[Index[i]] = Updates[i]
+    GPUScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
+  }
+};
+
+template <typename T>
+class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    // NOTE(review): forward assigns rows at Index; should dRef[Index] be 0?
+    // In place gradient: dRef = dO (dRef is made to share dO's data)
+    dRef->ShareDataWith<T>(*dOut);
+    dUpdates->mutable_data<T>(ctx.GetPlace());
+    // Gradient by Gather: dUpdates = dO[Index]
+    GPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(scatter, ops::ScatterOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel<float>);
diff --git a/paddle/operators/scatter_op.h b/paddle/operators/scatter_op.h
index a8eb54399a932913de208e1ddc90a6ff0dfaa452..6101219006414e4865f676e3ca5d2a88949ad17a 100644
--- a/paddle/operators/scatter_op.h
+++ b/paddle/operators/scatter_op.h
@@ -23,10 +23,12 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename Place, typename T>
+template <typename T>
 class ScatterOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
     auto *Ref = ctx.Input<Tensor>("Ref");
     auto *Index = ctx.Input<Tensor>("Index");
     auto *Updates = ctx.Input<Tensor>("Updates");
@@ -35,14 +37,16 @@ class ScatterOpKernel : public framework::OpKernel<T> {
     // In place output: Out = Ref, Out[Index] += Updates
     Out->ShareDataWith<T>(*Ref);
     // Apply ScatterUpdate: Out[index] += Updates[:]
-    ScatterUpdate<T>(ctx.GetPlace(), Updates, Index, Out);
+    ScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
   }
 };
 
-template <typename Place, typename T>
+template <typename T>
 class ScatterGradientOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
     auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
     auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
     auto *Index = ctx.Input<Tensor>("Index");
@@ -52,7 +56,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
     dRef->ShareDataWith<T>(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
     // Gradient by Gather: dUpdates += dO[Index]
-    Gather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+    CPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
   }
 };
 
diff --git a/paddle/operators/scatter_test.cc b/paddle/operators/scatter_test.cc
index 26fdaff1460a297fa638181641991f732533fe52..00dbdacbfef7af826790472acc6caa285c259e0e 100644
--- a/paddle/operators/scatter_test.cc
+++ b/paddle/operators/scatter_test.cc
@@ -40,7 +40,9 @@ TEST(scatter, ScatterUpdate) {
 
   float* p_output = output->mutable_data<float>(make_ddim({4, 4}), CPUPlace());
 
-  ScatterUpdate<float>(CPUPlace(), src, index, output);
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  paddle::platform::CPUDeviceContext ctx(*cpu_place);
+  ScatterAssign<float>(ctx, *src, *index, output);
 
   for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], float(0));
   for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], float(0));
diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc
index 17685ea654715f6996e17f6228f266c3aa1ee424..bc4af2f70427e684dfb531b8c61d68f28ae20794 100644
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -24,9 +24,9 @@ class SequencePoolOp : public framework::OperatorWithKernel {
  protected:
   void InferShape(framework::InferShapeContextBase* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SequenceAvgPoolOp should not be null.");
+                   "Input(X) of SequencePoolOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SequenceAvgPoolOp should not be null.");
+                   "Output(Out) of SequencePoolOp should not be null.");
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
   }
 };
diff --git a/paddle/operators/sequence_softmax_op.cc b/paddle/operators/sequence_softmax_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..621779ab6133f56a43fb2d20c814ebed8762ea7d
--- /dev/null
+++ b/paddle/operators/sequence_softmax_op.cc
@@ -0,0 +1,103 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_softmax_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceSoftmaxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceSoftmaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceSoftmaxOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));  // Out has X's shape
+    ctx->ShareLoD("X", /*->*/ "Out");  // presumably Out also takes X's LoD
+  }
+};
+
+class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SequenceSoftmaxOpMaker(framework::OpProto* proto,
+                         framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor) 1-D or 2-D input LoDTensor with the 2-nd dimension "
+             "of length 1.");
+    AddOutput("Out",
+              "(LoDTensor) 1-D or 2-D output LoDTensor with the 2-nd dimension "
+              "of length 1.");
+    AddComment(R"DOC(
+SequenceSoftmaxOp computes softmax activation among all time-steps for each
+sequence. The dimension of each time-step should be 1. Thus, the shape of
+input Tensor can be either [N, 1] or [N], where N is the sum of all sequences'
+lengths.
+
+Equation:
+    for i-th sequence in a mini-batch:
+        Out[lod[i]:lod[i+1], :] =
+            exp(X[lod[i]:lod[i+1], :]) / sum(exp(X[lod[i]:lod[i+1], :]))
+
+For example, for a mini-batch of 3 sequences with variable-length,
+each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7],
+then softmax will be computed among X[0:2, :], X[2:5, :], X[5:7, :]
+and N turns out to be 7.
+)DOC");
+  }
+};
+
+class SequenceSoftmaxGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Out"),
+                   "Input(Out) of SequenceSoftmaxGradOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput(framework::GradVarName("Out")),
+        "Input(Out@GRAD) of SequenceSoftmaxGradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceSoftmaxGradOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput(framework::GradVarName("X")),
+        "Output(X@GRAD) of SequenceSoftmaxGradOp should not be null.");
+    PADDLE_ENFORCE_EQ(
+        ctx->GetInputDim("Out"),
+        ctx->GetInputDim(framework::GradVarName("Out")),
+        "Input(Out) and Input(Out@GRAD) of SequenceSoftmaxGradOp should be of "
+        "the same shape.");
+    // X@GRAD gets the same shape as X.
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp,
+            ops::SequenceSoftmaxOpMaker, sequence_softmax_grad,
+            ops::SequenceSoftmaxGradOp);
+REGISTER_OP_CPU_KERNEL(
+    sequence_softmax,
+    ops::SequenceSoftmaxKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_softmax_grad,
+    ops::SequenceSoftmaxGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/framework/grad_op_builder.h b/paddle/operators/sequence_softmax_op.cu
similarity index 63%
rename from paddle/framework/grad_op_builder.h
rename to paddle/operators/sequence_softmax_op.cu
index b601406061f9f8f24302251c2144b07b6e65717f..f2a1e3d5e31ef21b95a51b287bdd1d4aa9221e89 100644
--- a/paddle/framework/grad_op_builder.h
+++ b/paddle/operators/sequence_softmax_op.cu
@@ -12,17 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#pragma once
+#define EIGEN_USE_GPU
 
-#include "paddle/framework/op_desc.h"
-#include "paddle/framework/operator.h"
+#include "paddle/operators/sequence_softmax_op.h"
 
-namespace paddle {
-namespace framework {
-
-OperatorBase* BuildGradOp(const OperatorBase* op);
-
-void CompleteGradOpDesc(const OpDescBind* forw_op, OpDescBind* grad_op);
-
-}  // namespace framework
-}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    sequence_softmax,
+    ops::SequenceSoftmaxKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    sequence_softmax_grad,
+    ops::SequenceSoftmaxGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/sequence_softmax_op.h b/paddle/operators/sequence_softmax_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..96d87c404d217280d74bd088e7a23f539ef6e7ce
--- /dev/null
+++ b/paddle/operators/sequence_softmax_op.h
@@ -0,0 +1,94 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/softmax.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename Place, typename T>
+class SequenceSoftmaxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<LoDTensor>("X");
+    auto* out = ctx.Output<LoDTensor>("Out");
+
+    auto lod = x->lod();
+    auto dims = x->dims();
+    PADDLE_ENFORCE(lod.size() > 0, "Input(X) of SequenceSoftmaxOp needs LoD.");
+    const size_t level = lod.size() - 1;  // index of the last LoD level
+    PADDLE_ENFORCE_EQ(dims[0], static_cast<int64_t>(lod[level].back()),
+                      "The first dimension of Input(X) should be equal to the "
+                      "sum of all sequences' lengths.");
+    PADDLE_ENFORCE_EQ(dims[0], x->numel(),
+                      "The width of each timestep in Input(X) of "
+                      "SequenceSoftmaxOp should be 1.");
+
+    out->mutable_data<T>(ctx.GetPlace());
+    for (int i = 0; i < static_cast<int>(lod[level].size()) - 1; ++i) {
+      int start_pos = static_cast<int>(lod[level][i]);
+      int end_pos = static_cast<int>(lod[level][i + 1]);
+      Tensor x_i = x->Slice<T>(start_pos, end_pos);
+      Tensor out_i = out->Slice<T>(start_pos, end_pos);
+
+      // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
+      framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos});
+      x_i.Resize(dims_i);
+      out_i.Resize(dims_i);
+      math::SoftmaxFunctor<Place, T>()(ctx.device_context(), &x_i, &out_i);
+    }
+  }
+};
+
+template <typename Place, typename T>
+class SequenceSoftmaxGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out = ctx.Input<LoDTensor>("Out");
+    auto* out_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* x = ctx.Input<LoDTensor>("X");
+    auto* x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+
+    auto lod = x->lod();
+    PADDLE_ENFORCE(lod.size() > 0,
+                   "Input(X) of SequenceSoftmaxGradOp needs LoD.");
+    const size_t level = lod.size() - 1;  // index of the last LoD level
+    x_grad->mutable_data<T>(ctx.GetPlace());
+    for (int i = 0; i < static_cast<int>(lod[level].size()) - 1; ++i) {
+      int start_pos = static_cast<int>(lod[level][i]);
+      int end_pos = static_cast<int>(lod[level][i + 1]);
+      Tensor out_i = out->Slice<T>(start_pos, end_pos);
+      Tensor out_grad_i = out_grad->Slice<T>(start_pos, end_pos);
+      Tensor x_grad_i = x_grad->Slice<T>(start_pos, end_pos);
+
+      // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
+      framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos});
+      out_i.Resize(dims_i);
+      out_grad_i.Resize(dims_i);
+      x_grad_i.Resize(dims_i);
+      math::SoftmaxGradFunctor<Place, T>()(ctx.device_context(), &out_i,
+                                           &out_grad_i, &x_grad_i);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc
index 3bce95535cf10c0df95b503c6e362b3f0ba2e723..31d491f130e36289f7dd3dc18710d35d5ba6ab34 100644
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -23,17 +23,22 @@ class SGDOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContextBase *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("param"),
-                   "Input(param) of SGDOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("grad"),
-                   "Input(grad) of SGDOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("param_out"),
-                   "Output(param_out) of SGDOp should not be null.");
-
-    auto param_dim = ctx->GetInputDim("param");
-    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("grad"),
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of SGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of SGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of SGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of SGDOp should not be null.");
+
+    auto lr_dims = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "Learning rate should have 1 element");
+    auto param_dim = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Grad"),
                       "Two input of SGD Op's dimension must be same.");
-    ctx->SetOutputDim("param_out", param_dim);
+    ctx->SetOutputDim("ParamOut", param_dim);
   }
 };
 
@@ -41,10 +46,10 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   SGDOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("param", "input parameter");
-    AddInput("grad", "input gradient");
-    AddOutput("param_out", "output parameter");
-    AddAttr<float>("learning_rate", "learning rate of sgd");
+    AddInput("Param", "Input parameter");
+    AddInput("LearningRate", "Learning rate of SGD");
+    AddInput("Grad", "Input gradient");
+    AddOutput("ParamOut", "output parameter");
     AddComment(R"DOC(
 
 Simplest sgd algorithm.
diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h
index a3fe3308942f98e2c28376b589b6fc930e6878a1..26f4012f258771794c736dbfad4af174b017f410 100644
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@@ -19,28 +19,25 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
 template <typename Place, typename T>
 class SGDOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto param = ctx.Input<Tensor>("param");
-    auto grad = ctx.Input<Tensor>("grad");
-    auto param_out = ctx.Output<Tensor>("param_out");
-    float lr = ctx.Attr<float>("learning_rate");
+    auto param = ctx.Input<framework::Tensor>("Param");
+    auto grad = ctx.Input<framework::Tensor>("Grad");
+    auto param_out = ctx.Output<framework::Tensor>("ParamOut");
+    auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
 
     param_out->mutable_data<T>(ctx.GetPlace());
 
-    auto p = EigenVector<T>::Flatten(*param);
-    auto g = EigenVector<T>::Flatten(*grad);
-    auto o = EigenVector<T>::Flatten(*param_out);
+    auto p = framework::EigenVector<T>::Flatten(*param);
+    auto g = framework::EigenVector<T>::Flatten(*grad);
+    auto o = framework::EigenVector<T>::Flatten(*param_out);
+    auto lr = framework::EigenVector<T>::Flatten(*learning_rate);
     auto place = ctx.GetEigenDevice<Place>();
 
-    o.device(place) = p - lr * g;
+    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
+    o.device(place) = p - lr.broadcast(grad_dsize) * g;
   }
 };
 
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
index 8fdda8b1dfc5dd40315682388dabe0bf2f2be555..2c08853f4f615bfe95f51aa20776ddddcdaa8f61 100644
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -29,8 +29,8 @@ template <typename Place, typename T>
 class SoftmaxKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto X = context.Input<Tensor>("X");
-    auto Y = context.Output<Tensor>("Y");
+    auto* X = context.Input<Tensor>("X");
+    auto* Y = context.Output<Tensor>("Y");
 
     // allocate memory on device.
     Y->mutable_data<T>(context.GetPlace());
@@ -43,29 +43,14 @@ template <typename Place, typename T>
 class SoftmaxGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto Y = context.Input<Tensor>("Y");
-    auto dY = context.Input<Tensor>(framework::GradVarName("Y"));
-    auto dX = context.Output<Tensor>(framework::GradVarName("X"));
-    dX->mutable_data<T>(context.GetPlace());
-
-    const int batch_size = Y->dims()[0];
-    const int class_num = Y->dims()[1];
-
-    Eigen::DSizes<int, 1> along_class(1);
-    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
-    Eigen::DSizes<int, 2> one_by_class(1, class_num);
+    auto* Y = context.Input<Tensor>("Y");
+    auto* dY = context.Input<Tensor>(framework::GradVarName("Y"));
+    auto* dX = context.Output<Tensor>(framework::GradVarName("X"));
 
-    auto Y_eigen = EigenMatrix<T>::From(*Y);
-    auto dY_eigen = EigenMatrix<T>::From(*dY);
-    auto dX_eigen = EigenMatrix<T>::From(*dX);
-    auto place = context.GetEigenDevice<Place>();
+    // allocate memory on device.
+    dX->mutable_data<T>(context.GetPlace());
 
-    auto dot = (Y_eigen * dY_eigen)
-                   .sum(along_class)
-                   .eval()
-                   .reshape(batch_by_one)
-                   .broadcast(one_by_class);
-    dX_eigen.device(place) = (dY_eigen - dot) * Y_eigen;
+    math::SoftmaxGradFunctor<Place, T>()(context.device_context(), Y, dY, dX);
   }
 };
 
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc
index a76489871f30dc8d852b6a783efeff41704fd4a4..42c1ba6fdf1351c43ef78efaaf05c54acb54ce94 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/operators/softmax_with_cross_entropy_op.h"
 #include <paddle/function/TensorType.h>
+#include <iostream>
 
 namespace paddle {
 namespace operators {
@@ -27,15 +28,14 @@ class SoftmaxWithCrossEntropyOpMaker
     AddInput("Logits",
              "(Tensor, default: Tensor<float>), The unscaled log probabilities "
              "which is a 2-D tensor with shape [N x K]. N is the batch_size, "
-             "and K is the class number.")
-        .NotInGradient();
-    AddInput(
-        "Label",
-        "(Tensor, default: Tensor<int>), The ground truth which is a 2-D "
-        "tensor. "
-        "If softLable is set to 0, Label is a Tensor<int> with shape [N x 1]. "
-        "If softLable is set to 1, Label is a Tensor<float/double> "
-        "with shape [N x K].");
+             "and K is the class number.");
+    AddInput("Label",
+             "(Tensor, default: Tensor<int>), The ground truth which is a 2-D "
+             "tensor. "
+             "If softLable is set to 0, Label is a Tensor<int> with shape [N x "
+             "1]. "
+             "If softLable is set to 1, Label is a Tensor<float/double> "
+             "with shape [N x K].");
     AddOutput(
         "Softmax",
         "(Tensor, default: Tensor<float>), A 2-D tensor with shape [N x K]. "
@@ -163,15 +163,34 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
   }
 };
 
+// Grad-op desc maker for softmax_with_cross_entropy (emits the *_grad op desc).
+class SoftmaxGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    std::unique_ptr<framework::OpDescBind> grad_op(new framework::OpDescBind());
+    grad_op->SetType("softmax_with_cross_entropy_grad");
+    grad_op->SetInput("Label", Input("Label"));
+    grad_op->SetInput("Softmax", Output("Softmax"));
+    grad_op->SetInput("Loss", Output("Loss"));
+    grad_op->SetInput(framework::GradVarName("Softmax"), OutputGrad("Softmax"));
+    grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
+    grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
+    grad_op->SetAttrMap(Attrs());
+    return grad_op;  // owned since construction: no leak if a setter throws
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp,
-            ops::SoftmaxWithCrossEntropyOpMaker,
-            softmax_with_cross_entropy_grad,
-            ops::SoftmaxWithCrossEntropyOpGrad);
+REGISTER_OPERATOR(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp,
+                  ops::SoftmaxWithCrossEntropyOpMaker, ops::SoftmaxGradMaker);
+REGISTER_OPERATOR(softmax_with_cross_entropy_grad,
+                  ops::SoftmaxWithCrossEntropyOpGrad);
 REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy,
                        ops::SoftmaxWithCrossEntropyKernel<float>);
 REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad,
diff --git a/paddle/operators/strided_memcpy_test.cc b/paddle/operators/strided_memcpy_test.cc
index 05882a88738cfc9cc23480efe0afe504008377ca..68f064eaee5851333ddf9767b7138da83a28503d 100644
--- a/paddle/operators/strided_memcpy_test.cc
+++ b/paddle/operators/strided_memcpy_test.cc
@@ -72,7 +72,7 @@ TEST(StridedMemcpy, CPUConcat) {
   }
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(StridedMemcpy, GPUCrop) {
   // clang-format off
   int src[] = {
@@ -157,4 +157,4 @@ TEST(StridedMemcpy, GPUConcat) {
 
 #endif
 }  // namespace operators
-}  // namespace paddle
\ No newline at end of file
+}  // namespace paddle
diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc
index 8f62a9f4db8d39edc11949df513aebf4fa257d45..c701ee8dde26f0fd50fae227d3f345df2d7a119d 100644
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -11,6 +11,7 @@ limitations under the License. */
 
 #include "paddle/operators/sum_op.h"
 #include <vector>
+#include "paddle/operators/net_op.h"
 
 namespace paddle {
 namespace operators {
@@ -22,14 +23,15 @@ class SumOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInputs("X"), "Inputs(X) should not be null");
     auto x_dims = ctx->GetInputsDim("X");
-    PADDLE_ENFORCE(!x_dims.empty(), "Input(X) of SumOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of SumOp should not be null.");
 
-    auto in_dim = x_dims[0];
     size_t N = x_dims.size();
     PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1.");
+
+    auto in_dim = x_dims[0];
     for (size_t i = 1; i < N; i++) {
       auto dim = x_dims[i];
       PADDLE_ENFORCE(in_dim == dim, "Input tensors must have same shape");
@@ -54,21 +56,26 @@ or not. But the output only shares the LoD with the first input.
   }
 };
 
-class SumGradOp : public framework::OperatorWithKernel {
+class SumGradMaker : public framework::GradOpDescMakerBase {
  public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
+  using framework::GradOpDescMakerBase::GradOpDescMakerBase;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
-    auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    auto x_grad_names = ctx->Outputs(framework::GradVarName("X"));
-    size_t x_length = x_grad_names.size();
-    std::vector<framework::DDim> x_grad_dims;
-    x_grad_dims.reserve(x_length);
-    for (size_t i = 0; i < x_length; ++i) {
-      x_grad_dims.push_back(out_grad_dims);
-    }
-    ctx->SetOutputsDim(framework::GradVarName("X"), x_grad_dims);
+  std::vector<std::unique_ptr<framework::OpDescBind>> operator()()
+      const override {
+    auto x_grads = InputGrad("X");
+    std::vector<std::unique_ptr<framework::OpDescBind>> grad_ops;
+    grad_ops.reserve(x_grads.size());
+    auto og = OutputGrad("Out");
+    for (const auto& x_grad : x_grads) {
+      // One "scale" op (scale = 1) per X@GRAD; owned before it is configured.
+      grad_ops.emplace_back(new framework::OpDescBind());
+      auto& grad_op = grad_ops.back();
+      grad_op->SetType("scale");
+      grad_op->SetInput("X", og);
+      grad_op->SetOutput("Out", {x_grad});
+      grad_op->SetAttr("scale", 1.0f);
+    }
+    return grad_ops;
   }
 };
 
@@ -76,7 +83,6 @@ class SumGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sum, ops::SumOp, ops::SumOpMaker, sum_grad, ops::SumGradOp);
+
+REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker);
 REGISTER_OP_CPU_KERNEL(sum, ops::SumKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(sum_grad,
-                       ops::SumGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/sum_op.cu b/paddle/operators/sum_op.cu
index a465cf3659ba7c51338abadfc62962fb6755a39d..b1896d3cd87f47bd2573287ee37b1b72ae9ec6e8 100644
--- a/paddle/operators/sum_op.cu
+++ b/paddle/operators/sum_op.cu
@@ -14,5 +14,3 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(sum_grad,
-                       ops::SumGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h
index 7e8fbb9e41c694df9169ea583ce47c33d3bcf2bb..91e5da8b40d452db8715990cdbe2731b3aea44b9 100644
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
@@ -42,24 +42,5 @@ class SumKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
-class SumGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto outs = context.MultiOutput<Tensor>(framework::GradVarName("X"));
-    for (auto out : outs) {
-      out->mutable_data<T>(context.GetPlace());
-    }
-
-    auto place = context.GetEigenDevice<Place>();
-    auto in = EigenVector<T>::Flatten(*input);
-    for (auto out : outs) {
-      auto result = EigenVector<T>::Flatten(*out);
-      result.device(place) = in;
-    }
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index 36af1ac677f6bb3e5b6392ff0de678afe7e47950..a9b6b799036a4f2ba93ef52398131db4fcb599f5 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -35,7 +35,7 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
 
 Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 template <>
 Eigen::GpuDevice*
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index d805d2ab085f76e119edf1c6f2acb9715883d755..ef5f19214d9ccb23b9c946bee28cb764122bd7cd 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/platform/dynload/cublas.h"
 #include "paddle/platform/dynload/cudnn.h"
 #include "paddle/platform/gpu_info.h"
@@ -61,7 +61,7 @@ class CPUDeviceContext : public DeviceContext {
   std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
 };
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 template <>
 struct EigenDeviceConverter<platform::GPUPlace> {
   using EigenDeviceType = Eigen::GpuDevice;
diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc
index f4b00c57dee5196e535816d8985fd7e831c4c226..8bf5174c4a5579f6f5602dd38e5a87ed3ef444a7 100644
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
@@ -20,7 +20,7 @@ TEST(Device, Init) {
   using paddle::platform::CUDADeviceContext;
   using paddle::platform::GPUPlace;
 
-  int count = paddle::platform::GetDeviceCount();
+  int count = paddle::platform::GetCUDADeviceCount();
   for (int i = 0; i < count; i++) {
     DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
     Eigen::GpuDevice* gpu_device =
@@ -34,7 +34,7 @@ TEST(Device, CUDADeviceContext) {
   using paddle::platform::CUDADeviceContext;
   using paddle::platform::GPUPlace;
 
-  int count = paddle::platform::GetDeviceCount();
+  int count = paddle::platform::GetCUDADeviceCount();
   for (int i = 0; i < count; i++) {
     CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
     Eigen::GpuDevice* gpu_device = device_context->eigen_device();
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index b523ef03c0053622bfda5b4bf07515c1b480b4af..15d8446cd8dceb2fdc03536e1f7bbcde73403a23 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -29,7 +29,7 @@ limitations under the License. */
 #include <cxxabi.h>  // for __cxa_demangle
 #endif
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 #include "paddle/platform/dynload/cublas.h"
 #include "paddle/platform/dynload/cudnn.h"
@@ -113,7 +113,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
   }
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
@@ -185,7 +185,7 @@ inline void throw_on_error(T e) {
         std::make_exception_ptr(                                       \
             std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \
         __FILE__, __LINE__);                                           \
-  } while (0)
+  } while (false)
 
 #define PADDLE_ENFORCE(...)                                             \
   do {                                                                  \
@@ -195,7 +195,7 @@ inline void throw_on_error(T e) {
       throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
                                               __FILE__, __LINE__);      \
     }                                                                   \
-  } while (0)
+  } while (false)
 
 /*
  * Some enforce helpers here, usage:
diff --git a/paddle/platform/enforce_test.cc b/paddle/platform/enforce_test.cc
index 80bdee3d9dfbe38ef707a6ba60cdb7f7b99714de..8206a055eabf4abf584962e921610d5029e2f571 100644
--- a/paddle/platform/enforce_test.cc
+++ b/paddle/platform/enforce_test.cc
@@ -213,4 +213,4 @@ TEST(ENFORCE_USER_DEFINED_CLASS, EQ) {
 TEST(ENFORCE_USER_DEFINED_CLASS, NE) {
   Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}};
   ASSERT_THROW(PADDLE_ENFORCE_EQ(a, b), paddle::platform::EnforceNotMet);
-}
\ No newline at end of file
+}
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index be381a4e26cf0eb41f5b3de88bd03ad8901683cc..70ad611d5dd61937e6bf7d980e34b5c9023977b2 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -26,11 +26,11 @@ DEFINE_double(fraction_of_gpu_memory_to_use, 0.95,
 namespace paddle {
 namespace platform {
 
-int GetDeviceCount() {
+int GetCUDADeviceCount() {
   int count;
   PADDLE_ENFORCE(
       cudaGetDeviceCount(&count),
-      "cudaGetDeviceCount failed in paddle::platform::GetDeviceCount");
+      "cudaGetDeviceCount failed in paddle::platform::GetCUDADeviceCount");
   return count;
 }
 
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index f0c825bd9b0bc41396b8fdb95f0b4337cbe3db02..fb33db07bd54d37dec2e5d687ecefb01cc330e44 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 #include <cuda_runtime.h>
 #include <stddef.h>
@@ -28,7 +28,7 @@ const std::string kEnvFractionGpuMemoryToUse =
     "PADDLE_FRACTION_GPU_MEMORY_TO_USE";
 
 //! Get the total number of GPU devices in system.
-int GetDeviceCount();
+int GetCUDADeviceCount();
 
 //! Get the current GPU device id in system.
 int GetCurrentDeviceId();
diff --git a/paddle/platform/hostdevice.h b/paddle/platform/hostdevice.h
index e7de86b7b2f75d206e730ec409bbee5d0a08942e..eb2df291cceef553d6422e6166e1fef2c63e2a47 100644
--- a/paddle/platform/hostdevice.h
+++ b/paddle/platform/hostdevice.h
@@ -2,8 +2,10 @@
 
 #ifdef __CUDACC__
 #define HOSTDEVICE __host__ __device__
+#define DEVICE __device__
 #define HOST __host__
 #else
 #define HOSTDEVICE
+#define DEVICE
 #define HOST
 #endif
diff --git a/paddle/platform/macros.h b/paddle/platform/macros.h
index 4a04a38c0c6f905639004dea2f4416ecc57c8620..feae7bdd77e3a0d02f33fb33991648408f542d0e 100644
--- a/paddle/platform/macros.h
+++ b/paddle/platform/macros.h
@@ -16,8 +16,10 @@ limitations under the License. */
 
 // Disable the copy and assignment operator for a class.
 #ifndef DISABLE_COPY_AND_ASSIGN
-#define DISABLE_COPY_AND_ASSIGN(classname) \
- private:                                  \
-  classname(const classname&) = delete;    \
-  classname& operator=(const classname&) = delete
+#define DISABLE_COPY_AND_ASSIGN(classname)         \
+ private:                                          \
+  classname(const classname&) = delete;            \
+  classname(classname&&) = delete;                 \
+  classname& operator=(const classname&) = delete; \
+  classname& operator=(classname&&) = delete
 #endif
diff --git a/paddle/platform/variant.h b/paddle/platform/variant.h
index 16ee00efe7a9b0406f8459e19a55e1e1b9ca7419..619897ca19eb2e6f4dbfd9160edf8c4bc58c89a9 100644
--- a/paddle/platform/variant.h
+++ b/paddle/platform/variant.h
@@ -16,7 +16,7 @@
 
 #include <boost/config.hpp>
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 // Because boost's variadic templates has bug on nvcc, boost will disable
 // variadic template support when GPU enabled on nvcc.
diff --git a/paddle/pserver/test/SocketTest.cpp b/paddle/pserver/test/SocketTest.cpp
index 6f6c9e596cfb7a2547d5b6c5de69381eb9c29132..b43461d61bab21747e85090bbf7af21a87a670c6 100644
--- a/paddle/pserver/test/SocketTest.cpp
+++ b/paddle/pserver/test/SocketTest.cpp
@@ -215,7 +215,7 @@ int main(int argc, char** argv) {
 
   uint64_t dataSize = FLAGS_dim * sizeof(real);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   GpuVector gpuParam(FLAGS_dim);
   GpuVector gpuGrad(FLAGS_dim);
 #else
diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp
index 04236fda2fb62b928b5c06ff38acfd3eb7217b08..ad8ffed9c1c8e4bdef27689ab21950db6b5cf0a2 100644
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
@@ -99,7 +99,7 @@ TEST(ProtoServer, regular) {
 }
 
 TEST(ProtoServer, extended) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   ProtoClient* client;
   if (FLAGS_rdma_tcp == "rdma")
     client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA);
diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt
index 18ecbd1aa34c82d63ae7f8ec1bd8f81b35eee30b..97364f2db9523c0629616692631d8372657a2128 100644
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
@@ -1,6 +1,6 @@
 if(WITH_PYTHON)
   cc_library(paddle_pybind SHARED
     SRCS pybind.cc exception.cc protobuf.cc
-    DEPS pybind python backward proto_desc
+    DEPS pybind python backward proto_desc tensor_array
     ${GLOB_OP_LIB})
 endif(WITH_PYTHON)
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index f4121e9d71824296770f86c1e94c096f767dec0a..356c4986e2e182e904215f7ebb8cac5146364f8b 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/framework/backward.h"
 #include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/tensor_array.h"
 #include "paddle/operators/cond_op.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/operators/recurrent_op.h"
@@ -34,7 +35,7 @@ static size_t UniqueIntegerGenerator() {
 }
 
 bool IsCompileGPU() {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   return false;
 #else
   return true;
@@ -78,7 +79,7 @@ PYBIND11_PLUGIN(core) {
       .def("set", PyCPUTensorSetFromArray<float>)
       .def("set", PyCPUTensorSetFromArray<int>)
       .def("set", PyCPUTensorSetFromArray<double>)
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
       .def("set", PyCUDATensorSetFromArray<float>)
       .def("set", PyCUDATensorSetFromArray<int>)
       .def("set", PyCUDATensorSetFromArray<double>)
@@ -96,7 +97,7 @@ PYBIND11_PLUGIN(core) {
       .def(
           "__init__",
           [](LoDTensor &instance, const std::vector<std::vector<size_t>> &lod) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
             new (&instance) LoDTensor(lod);
 #else
              LoD new_lod;
@@ -107,7 +108,7 @@ PYBIND11_PLUGIN(core) {
           })
       .def("set_lod",
            [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
              self.set_lod(lod);
 #else
              LoD new_lod;
@@ -117,7 +118,7 @@ PYBIND11_PLUGIN(core) {
 #endif
            })
       .def("lod", [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
         return self.lod();
 #else
            auto lod = self.lod();
@@ -143,6 +144,13 @@ All parameter, weight, gradient are variables in Paddle.
       .def("set_int",
            [](Variable &var, int val) -> void { *var.GetMutable<int>() = val; })
       .def("get_int", [](const Variable &var) -> int { return var.Get<int>(); })
+      .def("is_float", [](const Variable &var) { return var.IsType<float>(); })
+      .def("set_float",
+           [](Variable &var, float val) -> void {
+             *var.GetMutable<float>() = val;
+           })
+      .def("get_float",
+           [](const Variable &var) -> float { return var.Get<float>(); })
       .def("get_tensor",
            [](Variable &self) -> LoDTensor * {
              return self.GetMutable<LoDTensor>();
@@ -196,7 +204,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def_static("create",
                   [](paddle::platform::GPUPlace& place)
                       -> paddle::platform::DeviceContext* {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
                     PADDLE_THROW("GPUPlace is not supported in CPU device.");
 #else
                     return new paddle::platform::CUDADeviceContext(place);
@@ -223,6 +231,21 @@ All parameter, weight, gradient are variables in Paddle.
                                    desc.InitializationErrorString());
                     return OpRegistry::CreateOp(desc);
                   })
+      .def_static("infer_shape",
+                  [](OpDescBind &op_desc, BlockDescBind &block) {
+                    auto op = OpRegistry::CreateOp(*op_desc.Proto());
+                    auto *op_with_kernel =
+                        dynamic_cast<OperatorWithKernel *>(op.get());
+                    if (op_with_kernel != nullptr) {
+                      auto ctx = CompileTimeInferShapeContext(op_desc, block);
+                      op_with_kernel->InferShape(&ctx);
+                    } else {
+                      PADDLE_THROW(
+                          "OP(%s) is not type of OperatorWithKernel, "
+                          "should not call this function",
+                          op_desc.Type());
+                    }
+                  })
       .def("backward",
            [](const OperatorBase &forwardOp,
               const std::unordered_set<std::string> &no_grad_vars) {
@@ -264,6 +287,56 @@ All parameter, weight, gradient are variables in Paddle.
         self->CompleteAddOp();
       });
 
+  py::class_<framework::TensorArray>(m, "TensorArray")
+      .def("__init__",
+           [](TensorArray &instance) { new (&instance) TensorArray(); })
+      .def("read",
+           [](TensorArray &self, size_t index) { return self.Read(index); })
+      .def("write", [](TensorArray &self, size_t index,
+                       LoDTensor &value) { self.Write(index, value); })
+      .def("write_shared",
+           [](TensorArray &self, size_t index, const LoDTensor &value) {
+             self.WriteShared(index, value);
+           })
+      .def("size", [](TensorArray &self) { return self.size(); })
+      .def("pack",
+           [](TensorArray &self, size_t level,
+              const std::vector<std::vector<size_t>> &meta_info,
+              const std::vector<std::vector<size_t>> &lod) {
+             std::vector<DySeqMeta> meta;
+             for (auto &info : meta_info) {
+               PADDLE_ENFORCE_EQ(info.size(), 3UL);
+               meta.emplace_back(info[0], info[1], info[2]);
+             }
+#ifndef PADDLE_WITH_CUDA
+             return self.Pack(level, meta, lod);
+#else
+             LoD new_lod;
+             new_lod.reserve(lod.size());
+             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+             return self.Pack(level, meta, new_lod);
+#endif
+           })
+      .def("unpack",
+           [](TensorArray &self, const LoDTensor &source, int level,
+              bool length_descend) {
+             auto metas = self.Unpack(source, level, length_descend);
+             std::vector<std::vector<size_t>> meta_info;
+             for (auto meta : metas) {
+               meta_info.emplace_back(
+                   std::vector<size_t>({meta.begin, meta.end, meta.ori_idx}));
+             }
+             return meta_info;
+           })
+      .def("stack", [](TensorArray &self) { return self.Stack(); })
+      .def("unstack",
+           [](TensorArray &self, const LoDTensor &source) {
+             return self.Unstack(source);
+           })
+      .def("unstack_shared", [](TensorArray &self, const LoDTensor &source) {
+        return self.UnstackShared(source);
+      });
+
   // recurrent_op
   py::class_<operators::RecurrentOp, OperatorBase>(m, "RecurrentOp")
       .def_static(
diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h
index 3e3e6bc0312974fab50e17d428c7dea9ca547d1e..9e73f79cbdd545db558bd8641bc52e4bf3b0664f 100644
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -106,7 +106,7 @@ void PyCPUTensorSetFromArray(
   std::memcpy(dst, array.data(), sizeof(T) * array.size());
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 template <typename T>
 void PyCUDATensorSetFromArray(
     framework::Tensor &self,
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 26f9c0fcd4e045f5d603fc4e4b16691a418823ca..5c4b5a2495182ea5d2b3341cff650dfb4d8b0c0f 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -18,7 +18,7 @@ function version(){
         echo "PaddlePaddle @PADDLE_VERSION@, compiled with"
         echo "    with_avx: @WITH_AVX@"
         echo "    with_gpu: @WITH_GPU@"
-        echo "    with_mkldnn: @WITH_MKLDNN"
+        echo "    with_mkldnn: @WITH_MKLDNN@"
         echo "    with_mklml: @WITH_MKLML@"
         echo "    with_double: @WITH_DOUBLE@"
         echo "    with_python: @WITH_PYTHON@"
diff --git a/paddle/string/to_string_test.cc b/paddle/string/to_string_test.cc
index 542c771a98ec8ae187cd4f821ed1ee4373427041..971484dd0c073762e99f3926576eb21b96197769 100644
--- a/paddle/string/to_string_test.cc
+++ b/paddle/string/to_string_test.cc
@@ -36,4 +36,4 @@ TEST(to_string, user_defined) {
   using namespace paddle::string;
   UserDefinedClass instance;
   ASSERT_EQ(kOutputString, to_string(instance));
-}
\ No newline at end of file
+}
diff --git a/paddle/trainer/MergeModel.cpp b/paddle/trainer/MergeModel.cpp
index 91d89b61a32259b8bbe70fda2579f87ec6b9af00..6c52eaf4494bb247324b29981d94d7e97e0f212a 100644
--- a/paddle/trainer/MergeModel.cpp
+++ b/paddle/trainer/MergeModel.cpp
@@ -29,7 +29,7 @@ int main(int argc, char** argv) {
   initMain(argc, argv);
   initPython(argc, argv);
   string confFile = TrainerConfigHelper::getConfigNameFromPath(FLAGS_model_dir);
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   FLAGS_use_gpu = false;
 #endif
   auto config = std::make_shared<TrainerConfigHelper>(confFile);
diff --git a/paddle/trainer/tests/test_Compare.cpp b/paddle/trainer/tests/test_Compare.cpp
index e855a8fe2e09aa0f16a73f3e7bcc2f32921092f8..f3a964acb69be059a43470f7b68910a3b6cecaab 100644
--- a/paddle/trainer/tests/test_Compare.cpp
+++ b/paddle/trainer/tests/test_Compare.cpp
@@ -146,7 +146,7 @@ void compareGradient(comData& comDataCpu, comData& comDataGpu) {
 }
 
 int main(int argc, char** argv) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   exit(0);
 #endif
   paddle::initMain(argc, argv);
diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/trainer/tests/test_CompareSparse.cpp
index 813275518e411d6e963e23df634541f771096e0f..5f1834bd730375fc10762fc19788d0c693f8e752 100644
--- a/paddle/trainer/tests/test_CompareSparse.cpp
+++ b/paddle/trainer/tests/test_CompareSparse.cpp
@@ -174,7 +174,7 @@ TEST(compareSparse, multiGradientMachine) {
     FLAGS_local = local;
     FLAGS_ports_num_for_sparse = 5;
     for (bool useGpu : {false, true}) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
       if (useGpu) continue;
 #endif
       FLAGS_parallel_nn = useGpu;
@@ -198,7 +198,7 @@ TEST(compareSparse, NeuralNetwork) {
     FLAGS_local = local;
     FLAGS_ports_num_for_sparse = 5;
     for (bool useGpu : {false, true}) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
       if (useGpu) continue;
 #endif
       FLAGS_parallel_nn = useGpu;
diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp
index 264bc46ebcd0aa17fd605e537fcb2c316ef31162..425b3d10a38086463784ba2a18db1293efe96e92 100644
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
@@ -51,7 +51,7 @@ void checkGradientTest(const string& configFile,
 
 TEST(checkGradient, cpu) { checkGradientTest(configFile1, false, false); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(checkGradient, gpu) { checkGradientTest(configFile1, true, false); }
 
 TEST(checkGradient, multiGpu) {
@@ -97,7 +97,7 @@ TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); }
 
 TEST(checkGradient, chunk) {
   checkGradientTest(configFile3, false, false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   checkGradientTest(configFile3, true, true);
 #endif
 }
diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp
index 00ba61377aeff17d82e03f7560c0d71b3570d14f..b2a93d4d5eea37ad716b59427f2aa4409d2f537d 100644
--- a/paddle/trainer/tests/test_TrainerOnePass.cpp
+++ b/paddle/trainer/tests/test_TrainerOnePass.cpp
@@ -79,7 +79,7 @@ void trainerOnePassTest(const string& configFile,
 // 1. test trainer (cpu, gpu).
 TEST(trainerOnePass, cpu) { trainerOnePassTest(configFile1, false, false); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(trainerOnePass, gpu) { trainerOnePassTest(configFile1, true, false); }
 
 TEST(trainerOnePass, gpu2) { trainerOnePassTest(configFile1, true, false, 2); }
@@ -94,7 +94,7 @@ TEST(trainerOnePass, parallel) {
 #endif
 
 // 2. test average_window.
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(average_window, gpu) {
   trainerOnePassTest(configFile1, true, false, 4, 0.01);
 }
@@ -266,7 +266,7 @@ TEST(checkRemoteUpdater, cpuTrainerOldUpdater) {
   checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(checkRemoteUpdater, gpuTrainer) {
   checkRemoteParameterUpdaterTest(configFile1, true, false);
 }
diff --git a/paddle/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
index 1322e77178a4f5674f41943f886a17be8337bd75..a8fbe31c2b1e228107dfc19483444409bfcbf788 100644
--- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp
+++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
@@ -113,7 +113,7 @@ void testGeneration(const string& configFile,
 #ifndef PADDLE_TYPE_DOUBLE
 
 TEST(RecurrentGradientMachine, test_generation) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   const auto useGpuConfs = {false};
 #else
   const auto useGpuConfs = {true, false};
diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp
index ab1c181c62cdbee8cc5f804ec9aaf63ac5464ad6..8f100f02e90bcbc7fdcf6f053aec6f95cfb09c1a 100644
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "Flags.h"
 
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
 DEFINE_bool(use_gpu, false, "Only support CPU training");
 #else
 DEFINE_bool(use_gpu, true, "Whether to use GPU for training");
diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h
index 22ce2534d3468ded36221810aa61c15b37f13f3d..9579881ea3b92abab0189631184bab515afb67a3 100644
--- a/paddle/utils/Util.h
+++ b/paddle/utils/Util.h
@@ -218,7 +218,7 @@ protected:
  * *d2* is peer device to enable direct access to by the d1 device.
  */
 inline void enablePeerAccess(int d1, int d2) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   if (hl_device_can_access_peer(d1, d2)) {
     SetDevice dev(d1);
     hl_device_enable_peer_access(d2);
diff --git a/paddle/utils/Version.h b/paddle/utils/Version.h
index f53d6420bbbdf66f8f355af95c6b11c30a3bfab9..004d62451cddfee8fbd687938086e04ecb2332a9 100644
--- a/paddle/utils/Version.h
+++ b/paddle/utils/Version.h
@@ -48,7 +48,7 @@ void printVersion(std::ostream& os);
  * @return return true if paddle compiled with GPU
  */
 constexpr bool isWithGpu() {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   return false;
 #else
   return true;
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 74025d2a7bb68f87afd24bb4b70ec425ba0dcb64..d37f29d2c4bf9177398ea82c99bc40affdd952c2 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -142,6 +142,7 @@ __all__ = [
     'img_pool3d_layer',
     'scale_shift_layer',
     'img_conv3d_layer',
+    'resize_layer',
 ]
 
 
@@ -250,6 +251,8 @@ class LayerType(object):
     KMAX_SEQ_SCORE = 'kmax_seq_score'
     SCALE_SHIFT_LAYER = 'scale_shift'
 
+    RESIZE = 'resize'
+
     @staticmethod
     def is_layer_type(type_name):
         """
@@ -6473,7 +6476,7 @@ def switch_order_layer(input,
                        act=None,
                        layer_attr=None):
     """
-    This layer switch dimension order of image input. 
+    This layer switches the dimension order of image input.
     From order "batchSize, channels, height, width"
     to order "batchSize, height, width, channels".
 
@@ -6932,3 +6935,23 @@ def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
         bias=ParamAttr.to_bias(bias_attr))
     return LayerOutput(
         name, LayerType.SCALE_SHIFT_LAYER, parents=[input], size=input.size)
+
+
+@wrap_name_default("resize")
+def resize_layer(input, size, name=None):
+    """
+    The resize layer resizes the input matrix with a shape of [Height, Width]
+    into the output matrix with a shape of [Height x Width / size, size],
+    where size is the parameter of this layer indicating the output dimension.
+
+    :param input: The input to this layer.
+    :type input: LayerOutput.
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param size: The resized output dimension of this layer.
+    :type size: int
+    :return: A LayerOutput object whose size is the resized dimension.
+    :rtype: LayerOutput
+    """
+    Layer(name=name, type=LayerType.RESIZE, inputs=Input(input.name), size=size)
+    return LayerOutput(name, LayerType.RESIZE, parents=[input], size=size)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 8a204a96f3ef57673cef65306d0bf8e8c3409751..6a4550c209762362d40f8a2afaf526a1fe53ca6b 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -10,6 +10,6 @@ test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_la
 test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
 test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
 test_seq_slice_layer test_cross_entropy_over_beam test_pooling3D_layer
-test_conv3d_layer test_deconv3d_layer test_BatchNorm3D)
+test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..9399252b23d0ec0cce918196bf4077a51e757eaf
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
@@ -0,0 +1,27 @@
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "__resize_0__"
+  type: "resize"
+  size: 150
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+  }
+}
+input_layer_names: "input"
+output_layer_names: "__resize_0__"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "__resize_0__"
+  input_layer_names: "input"
+  output_layer_names: "__resize_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..09a6f507338c1da8e9ce60555f8ca2576704170c
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
@@ -0,0 +1,6 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=300)
+resized = resize_layer(input=data, size=150)  # resize the 300-wide input to 150
+# Declare the resize layer as the output of this test configuration.
+outputs(resized)
diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py
index 23794151bdb303394d6342ce8089d46d75425106..81067f38bbf64ac1ab4ccf02aa43b0a38b7d48ad 100644
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
@@ -1,5 +1,6 @@
 import unittest
 import numpy as np
+import random
 import itertools
 import paddle.v2.framework.core as core
 from paddle.v2.framework.op import Operator
@@ -45,12 +46,17 @@ def create_op(scope, op_type, inputs, outputs, attrs):
 
 def set_input(scope, op, inputs, place):
     def __set_input__(var_name, var):
-        tensor = scope.find_var(var_name).get_tensor()
-        if isinstance(var, tuple):
-            tensor.set_lod(var[1])
-            var = var[0]
-        tensor.set_dims(var.shape)
-        tensor.set(var, place)
+        if isinstance(var, tuple) or isinstance(var, np.ndarray):
+            tensor = scope.find_var(var_name).get_tensor()
+            if isinstance(var, tuple):
+                tensor.set_lod(var[1])
+                var = var[0]
+            tensor.set_dims(var.shape)
+            tensor.set(var, place)
+        elif isinstance(var, float):
+            scope.find_var(var_name).set_float(var)
+        elif isinstance(var, int):
+            scope.find_var(var_name).set_int(var)
 
     for in_name, in_dup in Operator.get_op_inputs(op.type()):
         if in_name in inputs:
@@ -192,6 +198,21 @@ def get_gradient(scope, op, inputs, outputs, grad_name, place,
 
 
 class OpTest(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        '''Save the NumPy and Python RNG states, then seed both generators'''
+        cls._np_rand_state = np.random.get_state()
+        cls._py_rand_state = random.getstate()
+        # Fixed seeds make the random test data repeatable from run to run.
+        np.random.seed(123)
+        random.seed(124)
+
+    @classmethod
+    def tearDownClass(cls):
+        '''Restore the RNG states that setUpClass saved before seeding'''
+        np.random.set_state(cls._np_rand_state)
+        random.setstate(cls._py_rand_state)
+
     def check_output_with_place(self, place, atol):
         self.scope = core.Scope()
         op_inputs = self.inputs if hasattr(self, "inputs") else dict()
diff --git a/python/paddle/v2/framework/tests/test_activation_op.py b/python/paddle/v2/framework/tests/test_activation_op.py
index c44eb849063592fbda417ec1516d195dd4358612..701e1a1aeec2746643fbd5432dadfd6bc46f358f 100644
--- a/python/paddle/v2/framework/tests/test_activation_op.py
+++ b/python/paddle/v2/framework/tests/test_activation_op.py
@@ -48,6 +48,21 @@ class TestTanh(OpTest):
         self.check_grad(['X'], 'Y', max_relative_error=0.007)
 
 
+class TestTanhShrink(OpTest):
+    def setUp(self):
+        # Reference output for the op under test: Y = X - tanh(X).
+        x = np.random.uniform(0.1, 1, [10, 17]).astype("float32")
+        self.op_type = "tanh_shrink"
+        self.inputs = {'X': x}
+        self.outputs = {'Y': x - np.tanh(x)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.008)
+
+
 class TestSqrt(OpTest):
     def setUp(self):
         self.op_type = "sqrt"
@@ -122,6 +137,23 @@ class TestBRelu(OpTest):
         self.check_grad(['X'], 'Y', max_relative_error=0.02)
 
 
+class TestLeakyRelu(OpTest):
+    def setUp(self):
+        # Reference output for the op under test: Y = max(X, alpha * X).
+        slope = 0.02
+        x = np.random.uniform(-3, 3, [4, 4]).astype("float32")
+        self.op_type = "leaky_relu"
+        self.attrs = {'alpha': slope}
+        self.inputs = {'X': x}
+        self.outputs = {'Y': np.maximum(x, slope * x)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+
+
 class TestSoftRelu(OpTest):
     def setUp(self):
         self.op_type = "soft_relu"
diff --git a/python/paddle/v2/framework/tests/test_adadelta_op.py b/python/paddle/v2/framework/tests/test_adadelta_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..7105593a98aee9885ba16e3ee0649a6024033ee7
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_adadelta_op.py
@@ -0,0 +1,96 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestAdadeltaOp1(OpTest):
+    '''Test Adadelta op with rho and epsilon passed as explicit attributes
+    '''
+
+    def setUp(self):
+        shape = (102, 105)
+        decay = 0.95
+        eps = 1e-6
+        self.op_type = "adadelta"
+        self.attrs = {'rho': decay, 'epsilon': eps}
+
+        # Inputs are drawn in the order param, grad, then both accumulators.
+        param = np.random.uniform(-1, 1, shape).astype("float32")
+        grad = np.random.uniform(-1, 1, shape).astype("float32")
+        # The accumulators average squared values, so they are non-negative.
+        sq_grad = np.random.random(shape).astype("float32")
+        sq_update = np.random.random(shape).astype("float32")
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'AvgSquaredGrad': sq_grad,
+            'AvgSquaredUpdate': sq_update
+        }
+
+        # Expected outputs, computed with NumPy.
+        sq_grad_out = (decay * sq_grad +
+                       (1 - decay) * np.square(grad))
+        step_scale = np.sqrt(np.divide(sq_update + eps, sq_grad_out + eps))
+        update = -np.multiply(step_scale, grad)
+        sq_update_out = (decay * sq_update +
+                         (1 - decay) * np.square(update))
+
+        param_out = param + update
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'AvgSquaredGradOut': sq_grad_out,
+            'AvgSquaredUpdateOut': sq_update_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAdadeltaOp2(OpTest):
+    '''Test Adadelta op with default attribute values
+    '''
+
+    def setUp(self):
+        shape = (102, 105)
+        # No attrs are set on the op; rho/epsilon below only feed the reference
+        # computation (presumably they mirror the op defaults -- TODO confirm).
+        decay = 0.95
+        eps = 1e-6
+        self.op_type = "adadelta"
+
+        # Inputs are drawn in the order param, grad, then both accumulators.
+        param = np.random.uniform(-1, 1, shape).astype("float32")
+        grad = np.random.uniform(-1, 1, shape).astype("float32")
+        # The accumulators average squared values, so they are non-negative.
+        sq_grad = np.random.random(shape).astype("float32")
+        sq_update = np.random.random(shape).astype("float32")
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'AvgSquaredGrad': sq_grad,
+            'AvgSquaredUpdate': sq_update
+        }
+
+        # Expected outputs, computed with NumPy.
+        sq_grad_out = (decay * sq_grad +
+                       (1 - decay) * np.square(grad))
+        step_scale = np.sqrt(np.divide(sq_update + eps, sq_grad_out + eps))
+        update = -np.multiply(step_scale, grad)
+        sq_update_out = (decay * sq_update +
+                         (1 - decay) * np.square(update))
+
+        param_out = param + update
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'AvgSquaredGradOut': sq_grad_out,
+            'AvgSquaredUpdateOut': sq_update_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_adagrad_op.py b/python/paddle/v2/framework/tests/test_adagrad_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..66bad349e59b608cb3cc965401c81ef4c716b318
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_adagrad_op.py
@@ -0,0 +1,69 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestAdagradOp1(OpTest):
+    ''' Test Adagrad operator with an explicit epsilon attribute of 1e-8
+    '''
+
+    def setUp(self):
+        shape = (123, 321)
+        step_size = 0.01
+        eps = 1e-8
+        self.op_type = "adagrad"
+        self.attrs = {'epsilon': eps}
+
+        param = np.random.random(shape).astype("float32")
+        grad = np.random.random(shape).astype("float32")
+        moment = np.zeros(shape).astype("float32")
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'LearningRate': np.array([step_size]).astype("float32")
+        }
+
+        # Expected outputs, computed with NumPy.
+        moment_out = moment + grad * grad
+        denom = np.sqrt(moment_out) + eps
+        param_out = param - step_size * grad / denom
+        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAdagradOp2(OpTest):
+    ''' Test Adagrad operator with epsilon=1e-6 (presumably the op default)
+    '''
+
+    def setUp(self):
+        shape = (123, 321)
+        step_size = 0.01
+        eps = 1e-6
+        self.op_type = "adagrad"
+        self.attrs = {'epsilon': eps}
+
+        param = np.random.random(shape).astype("float32")
+        grad = np.random.random(shape).astype("float32")
+        moment = np.zeros(shape).astype("float32")
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'LearningRate': np.array([step_size]).astype("float32")
+        }
+
+        # Expected outputs, computed with NumPy.
+        moment_out = moment + grad * grad
+        denom = np.sqrt(moment_out) + eps
+        param_out = param - step_size * grad / denom
+        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_add_op.py b/python/paddle/v2/framework/tests/test_add_op.py
deleted file mode 100644
index 3ca34d9b9fc2b7b54cc25ca0e0d1a08a71e37c52..0000000000000000000000000000000000000000
--- a/python/paddle/v2/framework/tests/test_add_op.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestAddOp(OpTest):
-    def setUp(self):
-        self.op_type = "add"
-        self.inputs = {
-            'X': np.random.random((102, 105)).astype("float32"),
-            'Y': np.random.random((102, 105)).astype("float32")
-        }
-        self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_cond_op.py b/python/paddle/v2/framework/tests/test_cond_op.py
index e7a506f2775a3f1edbacceb91e84ad49a9db67c0..76323b5e10c59822b4de82a70ebd57b3e57c8392 100644
--- a/python/paddle/v2/framework/tests/test_cond_op.py
+++ b/python/paddle/v2/framework/tests/test_cond_op.py
@@ -15,7 +15,7 @@ class PySimpleCond(object):
         for i in range(1, 10, 2):
             array[i] = 0
         self.cond = np.array(array)
-        self.x = np.ones(shape=(10, 1))
+        self.x = np.ones(shape=(10, 1)).astype("float32")
 
     def forward(self):
         self.index_t = np.where(self.cond == 1)
@@ -112,7 +112,4 @@ class TestCondOp(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    exit(
-        0
-    )  # FIXME(yuyang18): Since infer_shape has been removed, cond op may error
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_gradient_checker.py b/python/paddle/v2/framework/tests/test_gradient_checker.py
deleted file mode 100644
index 85117bf9600975ea5d61dfb5b34335792bf6d8b2..0000000000000000000000000000000000000000
--- a/python/paddle/v2/framework/tests/test_gradient_checker.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import unittest
-import numpy as np
-import paddle.v2.framework.core as core
-from op_test import get_numeric_gradient
-from op_test import create_op
-
-
-class GetNumericGradientTest(unittest.TestCase):
-    def test_add_op(self):
-        x = np.random.random((10, 1)).astype("float32")
-        y = np.random.random((10, 1)).astype("float32")
-        z = x + y
-        scope = core.Scope()
-        add_op = create_op(scope, "add", {'X': x, 'Y': y}, {'Out': z}, dict())
-        arr = get_numeric_gradient(scope, add_op, {'X': x,
-                                                   'Y': y}, 'X', ['Out'])
-        self.assertAlmostEqual(arr.mean(), 1.0, delta=1e-4)
-
-    def test_softmax_op(self):
-        def stable_softmax(x):
-            """Compute the softmax of vector x in a numerically stable way."""
-            shiftx = x - np.max(x)
-            exps = np.exp(shiftx)
-            return exps / np.sum(exps)
-
-        def label_softmax_grad(Y, dY):
-            dX = Y * 0.0
-            for i in range(Y.shape[0]):
-                d = np.dot(Y[i, :], dY[i, :])
-                dX[i, :] = Y[i, :] * (dY[i, :] - d)
-            return dX
-
-        X = np.random.random((2, 2)).astype("float32")
-        Y = np.apply_along_axis(stable_softmax, 1, X)
-        dY = np.ones(Y.shape)
-        dX = label_softmax_grad(Y, dY)
-
-        scope = core.Scope()
-        softmax_op = create_op(scope, "softmax", {"X": X}, {"Y": Y}, dict())
-
-        arr = get_numeric_gradient(scope, softmax_op, {"X": X}, "X", "Y")
-        np.testing.assert_almost_equal(arr, dX, decimal=1e-2)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_infer_shape.py b/python/paddle/v2/framework/tests/test_infer_shape.py
new file mode 100644
index 0000000000000000000000000000000000000000..b38ec9c03740a2e69f1247c094ce56ab43fa8e32
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_infer_shape.py
@@ -0,0 +1,63 @@
+import unittest
+import paddle.v2.framework.core as core
+from paddle.v2.framework.op import Operator
+
+
+class TestInferShape(unittest.TestCase):
+    def test_sum_op(self):
+        prog = core.ProgramDesc.__create_program_desc__()
+        self.assertIsNotNone(prog)
+        block = prog.block(0)
+        self.assertIsNotNone(block)
+
+        shape = [10, 20]
+
+        # declare two inputs that share one shape; "out" is given no shape here
+        x1 = block.new_var("x1")
+        x1.set_shape(shape)
+        x2 = block.new_var("x2")
+        x2.set_shape(shape)
+
+        out = block.new_var("out")
+
+        # append a "sum" op that reads both inputs through its "X" slot
+        sum_op_desc = block.append_op()
+        sum_op_desc.set_type("sum")
+        sum_op_desc.set_input("X", ["x1", "x2"])
+        sum_op_desc.set_output("Out", ["out"])
+        # after infer_shape, "out" is expected to carry the inputs' shape
+        core.Operator.infer_shape(sum_op_desc, block)
+        self.assertEqual(out.shape(), shape)
+
+    def test_mul_op(self):
+        prog = core.ProgramDesc.__create_program_desc__()
+        self.assertIsNotNone(prog)
+        block = prog.block(0)
+        self.assertIsNotNone(block)
+
+        x_shape = [10, 20]
+        y_shape = [20, 30]
+
+        # declare inputs "x" and "y"; "out" is given no shape here
+        x1 = block.new_var("x")
+        x1.set_shape(x_shape)
+        x2 = block.new_var("y")
+        x2.set_shape(y_shape)
+
+        out = block.new_var("out")
+
+        # append a "mul" op with both *_num_col_dims attributes set to 1
+        mul_op_desc = block.append_op()
+        mul_op_desc.set_type("mul")
+        mul_op_desc.set_input("X", ["x"])
+        mul_op_desc.set_input("Y", ["y"])
+        mul_op_desc.set_output("Out", ["out"])
+        mul_op_desc.set_attr("x_num_col_dims", 1)
+        mul_op_desc.set_attr("y_num_col_dims", 1)
+        # expected shape: first dim of x_shape, second dim of y_shape
+        core.Operator.infer_shape(mul_op_desc, block)
+        self.assertEqual(out.shape(), [x_shape[0], y_shape[1]])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_lstm_unit_op.py b/python/paddle/v2/framework/tests/test_lstm_unit_op.py
index 8ce65bfc31d9fa2d3988759a197e2f497b8161b1..365ee560e14e322cd8cfcdc068a8b004f6e365ad 100644
--- a/python/paddle/v2/framework/tests/test_lstm_unit_op.py
+++ b/python/paddle/v2/framework/tests/test_lstm_unit_op.py
@@ -14,8 +14,8 @@ def tanh_np(x):
 class LstmUnitTest(OpTest):
     def setUp(self):
         self.op_type = "lstm_unit"
-        x_np = np.random.normal(size=(5, 16)).astype("float32")
-        c_np = np.random.normal(size=(5, 4)).astype("float32")
+        x_np = np.random.normal(size=(5, 16)).astype("float64")
+        c_np = np.random.normal(size=(5, 4)).astype("float64")
         i_np, f_np, o_np, j_np = np.split(x_np, 4, axis=1)
         forget_bias_np = 0.
         self.attrs = {'forget_bias': 0.}
@@ -31,7 +31,7 @@ class LstmUnitTest(OpTest):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X', 'C_prev'], ['C', 'H'], max_relative_error=0.01)
+        self.check_grad(['X', 'C_prev'], ['C', 'H'])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/v2/framework/tests/test_net.py b/python/paddle/v2/framework/tests/test_net.py
index 50cfb855f2b01d8fd32342855d46716da7e07856..8503257feb8e1a5802f3f889f72c559a2aaa583a 100644
--- a/python/paddle/v2/framework/tests/test_net.py
+++ b/python/paddle/v2/framework/tests/test_net.py
@@ -15,7 +15,7 @@ def fc(X, W, Y):
 class TestNet(unittest.TestCase):
     def test_net_all(self):
         net = core.Net.create()
-        op1 = Operator("add", X="X", Y="Y", Out="Out")
+        op1 = Operator("sum", X=["X", "Y"], Out="Out")
         net.append_op(op1)
 
         net2 = core.Net.create()
@@ -26,7 +26,7 @@ class TestNet(unittest.TestCase):
 
         expected = '''
 Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}.
-    Op(add), inputs:{X[X], Y[Y]}, outputs:{Out[Out]}.
+    Op(sum), inputs:{X[X, Y]}, outputs:{Out[Out]}.
     Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
         Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
             Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}.
diff --git a/python/paddle/v2/framework/tests/test_operator.py b/python/paddle/v2/framework/tests/test_operator.py
index 040556322d79cbb594eb9af585a5b9920d7ab625..98f6b2f5ee639120557cb85b3ada6d2931f7d0d2 100644
--- a/python/paddle/v2/framework/tests/test_operator.py
+++ b/python/paddle/v2/framework/tests/test_operator.py
@@ -193,10 +193,10 @@ class TestOpDescCreationMethod(unittest.TestCase):
 
 class TestOpCreations(unittest.TestCase):
     def test_all(self):
-        add_op = op.Operator("add", X="a", Y="b", Out="z")
+        add_op = op.Operator("sum", X=["a", "b"], Out="z")
         self.assertIsNotNone(add_op)
         # Invoke C++ DebugString()
-        self.assertEqual('Op(add), inputs:{X[a], Y[b]}, outputs:{Out[z]}.',
+        self.assertEqual('Op(sum), inputs:{X[a, b]}, outputs:{Out[z]}.',
                          str(add_op))
 
 
diff --git a/python/paddle/v2/framework/tests/test_pool2d_op.py b/python/paddle/v2/framework/tests/test_pool2d_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..2941fda81b23998072810d8c6f6597a6f3db7e30
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_pool2d_op.py
@@ -0,0 +1,144 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def max_pool2D_forward_naive(x, ksize, strides, paddings=(0, 0), global_pool=0):
+    """Reference max pooling over axes 2-3 of a 4-D array (windows clipped to x)."""
+    N, C, H, W = x.shape
+    if global_pool == 1:
+        ksize = [H, W]
+    H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+    W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
+    out = np.zeros((N, C, H_out, W_out))
+    for i in range(H_out):
+        for j in range(W_out):
+            r_start = np.max((i * strides[0] - paddings[0], 0))
+            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            c_start = np.max((j * strides[1] - paddings[1], 0))
+            c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+            x_masked = x[:, :, r_start:r_end, c_start:c_end]
+            # The window was clipped above, so padded positions never take part.
+            out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
+    return out
+
+
+def avg_pool2D_forward_naive(x, ksize, strides, paddings=(0, 0), global_pool=0):
+    """Reference avg pooling over axes 2-3 of a 4-D array (windows clipped to x)."""
+    N, C, H, W = x.shape
+    if global_pool == 1:
+        ksize = [H, W]
+    H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+    W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
+    out = np.zeros((N, C, H_out, W_out))
+    for i in range(H_out):
+        for j in range(W_out):
+            r_start = np.max((i * strides[0] - paddings[0], 0))
+            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            c_start = np.max((j * strides[1] - paddings[1], 0))
+            c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+            x_masked = x[:, :, r_start:r_end, c_start:c_end]
+            # Divide by the clipped window area, so padded cells are not counted.
+            out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / (
+                (r_end - r_start) * (c_end - c_start))
+    return out
+
+
+class TestPool2d_Op(OpTest):
+    def setUp(self):
+        # initTestCase() picks the shape, window, stride, padding and pool kind.
+        self.initTestCase()
+        x = np.random.random(self.shape).astype("float32")
+        expected = self.pool2D_forward_naive(x, self.ksize, self.strides,
+                                             self.paddings, self.global_pool)
+        self.inputs = {'X': x}
+        self.outputs = {'Out': expected}
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'poolingType': self.pool_type,
+            'globalPooling': self.global_pool,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        # The gradient check is skipped for max pooling.
+        if self.pool_type != "max":
+            self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
+
+    def initTestCase(self):
+        self.op_type = "pool2d"
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.global_pool = True
+        self.shape = [2, 3, 5, 5]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+
+
+class TestCase1(TestPool2d_Op):  # avg pooling, 3x3 window, stride 1, no padding
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d"
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+
+
+class TestCase2(TestPool2d_Op):  # avg pooling, 3x3 window, stride 1, padding 1
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d"
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 1]
+
+
+class TestCase3(TestPool2d_Op):  # max pooling with global_pool enabled
+    def initTestCase(self):
+        self.global_pool = True
+        self.op_type = "pool2d"
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 5, 5]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+
+
+class TestCase4(TestPool2d_Op):  # max pooling, 3x3 window, stride 1, no padding
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d"
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+
+
+class TestCase5(TestPool2d_Op):  # max pooling, 3x3 window, stride 1, padding 1
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d"
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 1]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_pool3d_op.py b/python/paddle/v2/framework/tests/test_pool3d_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..8792b492e3da6541f71185be82b8bfc4f52d821d
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_pool3d_op.py
@@ -0,0 +1,152 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def max_pool3D_forward_naive(x, ksize, strides, paddings=(0, 0, 0), global_pool=0):
+    """Reference max pooling over axes 2-4 of a 5-D array (windows clipped to x)."""
+    N, C, D, H, W = x.shape
+    if global_pool == 1:
+        ksize = [D, H, W]
+    D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+    H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1
+    W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
+    out = np.zeros((N, C, D_out, H_out, W_out))
+    for k in range(D_out):
+        d_start = np.max((k * strides[0] - paddings[0], 0))
+        d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
+        for i in range(H_out):
+            h_start = np.max((i * strides[1] - paddings[1], 0))
+            h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
+            for j in range(W_out):
+                w_start = np.max((j * strides[2] - paddings[2], 0))
+                w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
+                x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
+                # ksize/strides/paddings are indexed per axis: 0=D, 1=H, 2=W.
+                out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
+    return out
+
+
+def avg_pool3D_forward_naive(x, ksize, strides, paddings=(0, 0, 0), global_pool=0):
+    """Reference avg pooling over axes 2-4 of a 5-D array (windows clipped to x)."""
+    N, C, D, H, W = x.shape
+    if global_pool == 1:
+        ksize = [D, H, W]
+    D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+    H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1
+    W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
+    out = np.zeros((N, C, D_out, H_out, W_out))
+    for k in range(D_out):
+        d_start = np.max((k * strides[0] - paddings[0], 0))
+        d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
+        for i in range(H_out):
+            h_start = np.max((i * strides[1] - paddings[1], 0))
+            h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
+            for j in range(W_out):
+                w_start = np.max((j * strides[2] - paddings[2], 0))
+                w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
+                x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
+                # Divide by the clipped window volume; axes are 0=D, 1=H, 2=W.
+                out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3, 4)) / (
+                    (d_end - d_start) * (h_end - h_start) * (w_end - w_start))
+    return out
+
+
+class TestPool3d_Op(OpTest):
+    def setUp(self):
+        # initTestCase() picks the shape, window, stride, padding and pool kind.
+        self.initTestCase()
+        x = np.random.random(self.shape).astype("float32")
+        expected = self.pool3D_forward_naive(x, self.ksize, self.strides,
+                                             self.paddings, self.global_pool)
+        self.inputs = {'X': x}
+        self.outputs = {'Out': expected}
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'poolingType': self.pool_type,
+            'globalPooling': self.global_pool,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        # The gradient check is skipped for max pooling.
+        if self.pool_type != "max":
+            self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
+
+    def initTestCase(self):
+        self.op_type = "pool3d"
+        self.pool_type = "avg"
+        self.pool3D_forward_naive = avg_pool3D_forward_naive
+        self.global_pool = True
+        self.shape = [2, 3, 5, 5, 5]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [0, 0, 0]
+
+
+class TestCase1(TestPool3d_Op):  # avg pooling, 3x3x3 window, stride 1, no padding
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool3d"
+        self.pool_type = "avg"
+        self.pool3D_forward_naive = avg_pool3D_forward_naive
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [0, 0, 0]
+
+
+class TestCase2(TestPool3d_Op):  # avg pooling, 3x3x3 window, stride 1, padding 1
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool3d"
+        self.pool_type = "avg"
+        self.pool3D_forward_naive = avg_pool3D_forward_naive
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [1, 1, 1]
+
+
+class TestCase3(TestPool3d_Op):  # max pooling with global_pool enabled
+    def initTestCase(self):
+        self.global_pool = True
+        self.op_type = "pool3d"
+        self.pool_type = "max"
+        self.pool3D_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 5, 5, 5]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [0, 0, 0]
+
+
+class TestCase4(TestPool3d_Op):  # max pooling, 3x3x3 window, stride 1, no padding
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool3d"
+        self.pool_type = "max"
+        self.pool3D_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [0, 0, 0]
+
+
+class TestCase5(TestPool3d_Op):  # max pooling, 3x3x3 window, stride 1, padding 1
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool3d"
+        self.pool_type = "max"
+        self.pool3D_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [1, 1, 1]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py
index 92161ae5dd93d34d898a2027435cc5e55611bcd0..1f114432c09f29fab6cd56de00dff341785ae0e4 100644
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
@@ -16,14 +16,17 @@ class PySimpleRNN(object):
     '''
 
     def __init__(self, input_dim=30, batch_size=50, weight_dim=15, sent_len=11):
-        self.x = np.random.normal(size=(sent_len, batch_size, input_dim))
-        self.W = np.random.normal(size=(input_dim, input_dim))
-        self.U = np.random.normal(size=(input_dim, input_dim))
-        self.h_boot = np.random.normal(size=(batch_size, input_dim))
+        self.x = np.random.normal(size=(sent_len, batch_size,
+                                        input_dim)).astype("float32")
+        self.W = np.random.normal(size=(input_dim, input_dim)).astype("float32")
+        self.U = np.random.normal(size=(input_dim, input_dim)).astype("float32")
+        self.h_boot = np.random.normal(size=(batch_size,
+                                             input_dim)).astype("float32")
 
         # memories
         self.mems = [
-            np.zeros(shape=(batch_size, input_dim)) for i in range(sent_len)
+            np.zeros(shape=(batch_size, input_dim)).astype("float32")
+            for i in range(sent_len)
         ]
 
     def forward(self):
@@ -36,7 +39,7 @@ class PySimpleRNN(object):
         return [self.x[i] for i in range(self.x.shape[0])]
 
     def concat_outputs(self):
-        return np.array(self.mems)
+        return np.array(self.mems).astype("float32")
 
     def step(self, step_id, x):
         '''
@@ -47,8 +50,8 @@ class PySimpleRNN(object):
             pre_mem = self.mems[step_id - 1]
         else:
             pre_mem = self.h_boot
-        xW = np.matmul(x, self.W)
-        hU = np.matmul(pre_mem, self.U)
+        xW = np.matmul(x, self.W).astype("float32")
+        hU = np.matmul(pre_mem, self.U).astype("float32")
 
         sum = xW + hU
         self.mems[step_id] = py_sigmoid(sum)
@@ -102,7 +105,8 @@ class RecurrentOpTest(unittest.TestCase):
         self.create_step_net()
         ctx = core.DeviceContext.create(core.CPUPlace())
         self.rnnop.run(self.scope, ctx)
-        return np.array(self.scope.find_var("h@mem").get_tensor())
+        return np.array(self.scope.find_var("h@mem").get_tensor()).astype(
+            "float32")
 
     def create_global_variables(self):
         # create inlink
@@ -142,7 +146,7 @@ class RecurrentOpTest(unittest.TestCase):
         stepnet = core.Net.create()
         x_fc_op = Operator("mul", X="x", Y="W", Out="Wx")
         h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
-        sum_op = Operator("add", X="Wx", Y="Uh", Out="sum")
+        sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum")
         sig_op = Operator("sigmoid", X="sum", Y="h@mem")
 
         for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
@@ -179,7 +183,7 @@ class RecurrentGradientOpTest(unittest.TestCase):
         stepnet = core.Net.create()
         x_fc_op = Operator("mul", X="x@alias", Y="W", Out="Wx")
         h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
-        sum_op = Operator("add", X="Wx", Y="Uh", Out="sum")
+        sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum")
         sig_op = Operator("sigmoid", X="sum", Y="h@alias")
 
         for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
@@ -197,7 +201,4 @@ class RecurrentGradientOpTest(unittest.TestCase):
 
 
 if __name__ == '__main__':
-    exit(
-        0
-    )  # FIXME(yuyang18): InferShape has been removed, this unittest may error
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_rmsprop_op.py b/python/paddle/v2/framework/tests/test_rmsprop_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e5ff733e9b55fe8c9727e9721e25083a494be15
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_rmsprop_op.py
@@ -0,0 +1,89 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestRmspropOp1(OpTest):
+    '''Test RMSProp with explicitly passed attributes (epsilon, decay and
+    momentum); the expected outputs are computed with numpy in setUp.'''
+
+    def setUp(self):
+        shape = (123, 321)
+        eps, rho, mu = 1e-6, 0.9, 0.0  # epsilon, decay, momentum
+
+        # Operator inputs.
+        p = np.random.random(shape).astype("float32")
+        ms = np.random.random(shape).astype("float32")
+        lr = np.array([0.01]).astype("float32")
+        g = np.random.random(shape).astype("float32")
+        mom = np.zeros(shape).astype("float32")
+
+        # Reference update computed with numpy.
+        new_ms = rho * ms + (1 - rho) * g * g
+        new_mom = mu * mom + lr * g / np.sqrt(new_ms + eps)
+        new_p = p - new_mom
+
+        self.op_type = "rmsprop"
+
+        self.attrs = {'epsilon': eps, 'decay': rho, 'momentum': mu}
+
+        self.inputs = {
+            'Param': p,
+            'MeanSquare': ms,
+            'LearningRate': lr,
+            'Grad': g,
+            'Moment': mom,
+        }
+
+        self.outputs = {
+            'ParamOut': new_p,
+            'MomentOut': new_mom,
+            'MeanSquareOut': new_ms,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestRmspropOp2(OpTest):
+    '''Test RMSProp with default values for attributes: no attrs are set, and
+    the numpy reference uses epsilon=1e-10, decay=0.9 and momentum=0.0.'''
+
+    def setUp(self):
+        shape = (123, 321)
+        # Values the reference computation uses in place of explicit attrs.
+        eps, rho, mu = 1.0e-10, 0.9, 0.0  # epsilon, decay, momentum
+
+        # Operator inputs.
+        p = np.random.random(shape).astype("float32")
+        ms = np.random.random(shape).astype("float32")
+        lr = np.array([0.01]).astype("float32")
+        g = np.random.random(shape).astype("float32")
+        mom = np.zeros(shape).astype("float32")
+
+        # Reference update computed with numpy.
+        new_ms = rho * ms + (1 - rho) * g * g
+        new_mom = mu * mom + lr * g / np.sqrt(new_ms + eps)
+        new_p = p - new_mom
+
+        self.op_type = "rmsprop"
+        self.inputs = {
+            'Param': p,
+            'MeanSquare': ms,
+            'LearningRate': lr,
+            'Grad': g,
+            'Moment': mom,
+        }
+
+        self.outputs = {
+            'ParamOut': new_p,
+            'MomentOut': new_mom,
+            'MeanSquareOut': new_ms,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":  # only when this file is run as a script
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_rowwise_add_op.py b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
deleted file mode 100644
index 336645bd993ff743cbe20bb5cae5cd278db57ce7..0000000000000000000000000000000000000000
--- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestRowwiseAddOp(OpTest):
-    def setUp(self):
-        self.op_type = "rowwise_add"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [5, 10]).astype("float32"),
-            'b': np.random.uniform(0.1, 1, [10]).astype("float32")
-        }
-        self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'b'], 'Out')
-
-    def test_check_grad_ingore_b(self):
-        self.check_grad(['X'], 'Out', no_grad_set=set('b'))
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(['b'], 'Out', no_grad_set=set('X'))
-
-
-class TestRowwiseAddOp2(OpTest):
-    def setUp(self):
-        self.op_type = "rowwise_add"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 3, 2, 5]).astype("float32"),
-            'b': np.random.uniform(0.1, 1, [2, 5]).astype("float32")
-        }
-        self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'b'], 'Out')
-
-    def test_check_grad_ignore_b(self):
-        self.check_grad(['X'], 'Out', no_grad_set=set('b'))
-
-    def test_check_grad_ignore_x(self):
-        self.check_grad(['b'], 'Out', no_grad_set=set('X'))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_scatter_op.py b/python/paddle/v2/framework/tests/test_scatter_op.py
index 33c73c52631a09ea0fefdeb9467991ae9c04321c..1032269d5dfb02e3518b9ef2820d5d0dcc8a51a0 100644
--- a/python/paddle/v2/framework/tests/test_scatter_op.py
+++ b/python/paddle/v2/framework/tests/test_scatter_op.py
@@ -10,7 +10,7 @@ class TestScatterOp(OpTest):
         index_np = np.array([1, 2]).astype("int32")
         updates_np = np.random.random((2, 3)).astype("float32")
         output_np = np.copy(ref_np)
-        output_np[index_np] += updates_np
+        output_np[index_np] = updates_np
         self.inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np}
         self.outputs = {'Out': output_np}
 
@@ -18,7 +18,7 @@ class TestScatterOp(OpTest):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['Updates', 'Ref'], 'Out', in_place=True)
+        self.check_grad(['Updates'], 'Out', in_place=True)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/v2/framework/tests/test_sequence_softmax_op.py b/python/paddle/v2/framework/tests/test_sequence_softmax_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..b54a56aa6d3f76baa4d1fc6ba8f963332deba002
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_sequence_softmax_op.py
@@ -0,0 +1,38 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def stable_softmax(x):
+    """Compute the softmax of vector x in a numerically stable way."""
+    # Subtract the max first, THEN clip: shifted values are floored at -64.
+    exps = np.exp((x - np.max(x)).clip(-64.))
+    return exps / np.sum(exps)
+
+
+class TestSequenceSoftmaxOp(OpTest):
+    def setUp(self):
+        self.op_type = "sequence_softmax"
+        # Row offsets of 4 sequences with 4, 1, 3 and 3 rows.
+        lod = [[0, 4, 5, 8, 11]]
+        x = np.random.uniform(0.1, 1, (11, 1)).astype("float32")
+        out = np.zeros((11, 1)).astype("float32")
+        offsets = lod[0]
+        # Reference: softmax taken independently over each sequence.
+        for begin, end in zip(offsets[:-1], offsets[1:]):
+            seq_len = end - begin
+            row = x[begin:end, :].reshape(1, seq_len)
+            out[begin:end, :] = stable_softmax(row).reshape(seq_len, 1)
+
+        self.outputs = {"Out": out}
+        self.inputs = {"X": (x, lod)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out", max_relative_error=0.01)
+
+
+if __name__ == "__main__":  # only when this file is run as a script
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_sgd_op.py b/python/paddle/v2/framework/tests/test_sgd_op.py
index 64e54d1500c1bc134cc1efe33d41a16dbc08f2d4..2dd881e5e107249277a91bd8e3a72567269e1cd4 100644
--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/framework/tests/test_sgd_op.py
@@ -8,11 +8,10 @@ class TestSGDOp(OpTest):
         self.op_type = "sgd"
         w = np.random.random((102, 105)).astype("float32")
         g = np.random.random((102, 105)).astype("float32")
-        lr = 0.1
+        lr = np.array([0.1]).astype("float32")
 
-        self.inputs = {'param': w, 'grad': g}
-        self.attrs = {'learning_rate': lr}
-        self.outputs = {'param_out': w - lr * g}
+        self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr}
+        self.outputs = {'ParamOut': w - lr * g}
 
     def test_check_output(self):
         self.check_output()
diff --git a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
index 428395b76c8fbcbc07b19ee1979419f0e64aca85..377d07fb5927a108e9bd39ab227da4f40a9cd447 100644
--- a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
@@ -43,7 +43,7 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
     def setUp(self):
         self.op_type = "softmax_with_cross_entropy"
         batch_size = 2
-        class_num = 17
+        class_num = 37
 
         logits = np.random.uniform(0.1, 1.0,
                                    [batch_size, class_num]).astype("float32")
diff --git a/python/paddle/v2/framework/tests/test_tensor_array.py b/python/paddle/v2/framework/tests/test_tensor_array.py
new file mode 100644
index 0000000000000000000000000000000000000000..11f8a01f9224fcbd6dd6cbc8c37cc81036ad3e07
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_tensor_array.py
@@ -0,0 +1,106 @@
+import logging
+import paddle.v2.framework.core as core
+import unittest
+import numpy as np
+
+
+class TestTensorArray(unittest.TestCase):
+    '''Exercise the core.TensorArray Python binding on a small LoDTensor.
+
+    setUp builds a [batch_size, dim] float LoDTensor with LoD
+    [[0, 2, 5, 10]]; each test then unstacks or unpacks it into a fresh
+    TensorArray and checks the result.
+    '''
+
+    def setUp(self):
+        self.ta = core.TensorArray()
+
+        self.batch_size = 10
+        self.dim = 2
+
+        # create a LoDTensor
+        self.scope = core.Scope()
+        var = self.scope.new_var("test_tensor")
+        self.place = core.CPUPlace()
+        tensor = var.get_tensor()
+        tensor.set_dims([self.batch_size, self.dim])
+        tensor.alloc_float(self.place)
+        tensor_array = np.array(tensor)
+        # Set every element explicitly so the values compared in test_pack
+        # do not depend on the initial contents of the freshly allocated
+        # buffer: column 0 holds the row index, everything else is zero.
+        tensor_array[:] = 0
+        for row in range(self.batch_size):
+            tensor_array[row, 0] = row
+
+        lod_py = [[0, 2, 5, 10]]
+        lod_tensor = core.LoDTensor(lod_py)
+        lod_tensor.set(tensor_array, self.place)
+
+        # Expected result of unpack(tensor, 0, True) for the LoD above.
+        self.py_seq_meta = [[5, 10, 2], [2, 5, 1], [0, 2, 0]]
+
+        self.tensor = lod_tensor
+
+    def _make_row_tensor(self):
+        '''Create a [1, self.dim] float tensor whose i-th element is i.
+
+        Returns the tensor and the numpy array its values were set from.
+        '''
+        var = self.scope.new_var("hell")
+        tensor = var.get_tensor()
+        tensor.set_dims([1, self.dim])
+        tensor.alloc_float(self.place)
+        tensor_array = np.array(tensor)
+        for i in range(self.dim):
+            tensor_array[0, i] = i
+        tensor.set(tensor_array, self.place)
+        return tensor, tensor_array
+
+    def _check_slot(self, index, expected_array):
+        '''Assert that slot `index` has dims [1, self.dim] and equals
+        `expected_array` element-wise.'''
+        ta_tensor = self.ta.read(index)
+        ta_tensor_array = np.array(ta_tensor)
+        self.assertEqual(ta_tensor.get_dims(), [1, self.dim])
+        self.assertTrue((expected_array == ta_tensor_array).all())
+
+    def test_unstack(self):
+        self.ta.unstack(self.tensor)
+        self.assertEqual(self.tensor.get_dims()[0], self.ta.size())
+
+    def test_read(self):
+        # Smoke test: reads slots 0..batch_size-1; only checks nothing raises.
+        self.ta.unstack(self.tensor)
+        for i in range(self.batch_size):
+            self.ta.read(i)
+
+    def test_write(self):
+        self.ta.unstack(self.tensor)
+        tensor, tensor_array = self._make_row_tensor()
+        self.ta.write(2, tensor)
+        self._check_slot(2, tensor_array)
+
+    def test_write_shared(self):
+        self.ta.unstack(self.tensor)
+        tensor, tensor_array = self._make_row_tensor()
+        self.ta.write_shared(2, tensor)
+        self._check_slot(2, tensor_array)
+
+    def test_unpack(self):
+        meta = self.ta.unpack(self.tensor, 0, True)
+        self.assertEqual(self.ta.size(), 5)
+        self.assertEqual(meta, self.py_seq_meta)
+
+    def test_pack(self):
+        # unpack followed by pack must reproduce both the data and the LoD.
+        meta = self.ta.unpack(self.tensor, 0, True)
+        tensor = self.ta.pack(0, meta, self.tensor.lod())
+        self.assertTrue((np.array(self.tensor) == np.array(tensor)).all())
+        # assertEqual, not assertTrue: assertTrue(a, b) treats b as the
+        # failure message and never compares the two LoDs.
+        self.assertEqual(tensor.lod(), self.tensor.lod())
+
+
+if __name__ == '__main__':  # only when this file is run as a script
+    unittest.main()
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
index e80456d9bbeb3c34ac9eab873a84dbf8f06e34df..9148cb56cf78e1ebb994f4a4a34d4a1b6e2e6ef4 100644
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
@@ -96,6 +96,9 @@ class Inference(object):
             for i, item in enumerate(result):
                 retv[i].append(item)
 
+        if retv == None:
+            return []
+
         if flatten_result:
             retv = [numpy.concatenate(out) for out in retv]