Merge branch 'develop' of github.com:PaddlePaddle/Paddle into seqconcat_op

a35e82a6 · Yancey1989 · be3fa792 · f600ad1e · a35e82a6 · a35e82a6
161 changed file
--- a/benchmark/paddle/image/run_mkldnn.sh
+++ b/benchmark/paddle/image/run_mkldnn.sh
 set -e
-unset OMP_NUM_THREADS MKL_NUM_THREADS
-export OMP_DYNAMIC="FALSE"
-export KMP_AFFINITY="granularity=fine,compact,0,0"
 function train() {
+  unset OMP_NUM_THREADS MKL_NUM_THREADS
+  export OMP_DYNAMIC="FALSE"
+  export KMP_AFFINITY="granularity=fine,compact,0,0"
  topology=$1
  bs=$2
  use_mkldnn=$3

--- a/doc/design/python_api.md
+++ b/doc/design/python_api.md
+# Design Doc: Python API
+Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
+| Python classes | Protobuf messages |
+| --- | --- |
+| Program | ProgramDesc |
+| Block | BlockDesc |
+| Operator | OpDesc |
+| Variable | VarDesc |
+Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages.
+## Core Concepts
+### Program
+A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s.  A `BlockDesc` refers to its parent block by its index in the array.  For example, operators in the step block of an RNN operator needs to be able to access variables in its ancessor blocks.
+Whenever we create a block, we need set its parent block to the current block, so the Python class `Program` needs to maintain a data member `current_block`.
+```python
+class Program(objects):
+    def __init__(self):
+        self.proto = core.NewProgram() # a C++ ProgramDesc pointer.
+        self.blocks = vector<Block>()
+        self.blocks.append(Block(self, -1)) # the global block
+        self.current_block = 0          # initialized to the global block
+    def global_block():
+        return self.blocks[0]
+    def current_block():
+        return self.get_block(self.current_block)
+    def rollback():
+        self.current_block = self.current_block().parent_idx
+    def create_block():
+        new_block_idx = len(self.block)
+        self.blocks.append(Block(self, self.current_block))
+        self.current_block = new_block_idx
+        return current_block()
+```
+`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`.
+`Program` creates the first block as the global block in its constructor.  All parameters and their initializer operators are in the global block.
+### Block
+A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) includes
+1. a map from variable names to an instance of the Python `Variable` class, and
+1. a list of `Operator` instances.
+```python
+class Block(objects):
+    def __init__(self, program, parent_idx):
+        self.proto = core.NewBlock(program.proto)
+        self.program = program
+        self.vars = map<string, Variable>()
+        self.ops = vector<Operator>()
+        self.parent_idx = parent_idx
+    def create_var(self, ...):
+        return Variable(self, ...)
+    def _create_global_var(self, ...):
+        program.global_block().create_var(...)
+    def create_parameter(self, name, ...):
+        # Parameter is a subclass of variable. See Parameter section for details.
+        self.vars[name] = Parameter(self._create_global_var(...), ...)
+        return self.vars[name]
+    def append_operator(self, ...):
+        self.ops.append(Operator(self, ...))
+    def prepend_operator(self, ...): # Parameter's ctor prepands initialize operators.
+       self.ops.prepend(Operator(self, ...))
+```
+`create_parameter` is necessary because parameters are global variables, those defined in the global block, but can be created in some sub-blocks, e.g., an FC layer in the step block of an RNN operator.
+`prepand_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
+### Operator
+The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer output shape from input shape.
+```python
+class Operator(object):
+    def __init__(self,
+                 block,  # Block
+                 type,   # string
+                 inputs, # dict<string, Variable>
+                 outputs,# dict<stirng, Variable>
+                 attrs   # dict<string, Any>
+                 ):
+        self.proto = core.NewOpDesc(block.proto, type, inputs, outputs, attrs)
+        core.infer_shape(self.proto, inputs, outputs)
+    def type(self):
+        return self.proto.type()
+```
+`Operator` creates the `OpDesc` message in C++ space, so could it call the `InferShape` function, which is in C++.
+### Variable
+Operators take Variables as its inputs and outputs.
+```python
+class Variable(object):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape,           # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 ):
+        if name is None:
+            name = unique_name_generator()
+        self.name = name
+        self.block = block
+        self.proto = core.NewVarDesc(block.proto, name, shape, lod_level)
+        self.writer = None
+```
+Please be aware of `self.writer`, that tracks operator who creates the variable.  It possible that there are more than one operators who write a variable, but in Python space, each writes to a variable is represented by a Variable class.  This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**.
+### Parameter
+A parameter is a global variable with an initializer (or load) operator.
+```python
+class Parameter(Variable):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape,           # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 trainable,       # bool
+                 initialize_op_attrs,
+                 optimize_op_attrs):
+        super(Parameter, self).__init__(block, name, shape, dtype, lod_level)
+        self.trainable = trainable
+        self.optimize_op_attrs = optimize_op_attrs
+        block.prepend(Operator(block,  # Block
+                               initialize_op_attrs['type'],   # string
+                               None,   # no inputs
+                               self,   # output is the parameter
+                               initialize_op_attrs)
+```
+When users create a parameter, s/he can call
+```python
+program.create_parameter(
+  ...,
+  init_attr={
+    type: "uniform_random",
+    min: -1.0,
+    max: 1.0,
+  })
+)
+```
+In above example, `init_attr.type` names an initialize operator.  It can also name the load operator
+```python
+init_attr={
+ type: "load",
+ filename: "something.numpy",
+}
+```
+`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message.
+## Layer Functions
+A layer is a Python function that creates some operators and variables.  Layers simplify the work of application programmers.
+### Data Layer
+```python
+def data_layer(name, type, column_name):
+    block = the_current_program.glolal_block()
+    var = block.create_global_var(
+            name=name,
+            shape=[None] + type.dims(),
+            dtype=type.dtype)
+    block.prepend_operator(block,
+                           type="Feed",
+                           inputs = None,
+                           outputs = [var],
+                           {column_name: column_name})
+    return var
+```
+The input to the feed operator is a special variable in the global scope, which is the output of [Python readers](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md).
+### FC Layer
+```python
+def fc_layer(input, size, ...):
+    block = program.current_block()
+    w = block.create_parameter(...)
+    b = block.create_parameter(...)
+    out = block.create_var()
+    op = block.append_operator("FC", X=input, W=w, b=b, out=out)
+    out.writer = op
+    return out
+```
--- a/doc/design/refactorization.md
+++ b/doc/design/refactorization.md
 # Design Doc: Refactorization Overview
-The goal of refactorizaiton include:
+The goals of refactoring include:
-1. Make it easy for external contributors to write new elementory computaiton operations.
+1. Making it easy for external contributors to write new elementary computation operations.
-1. Make the codebase clean and readable.
+1. Making the codebase clean and readable.
-1. Introduce a new design of computation representation -- a computation graph of operators and variables.
+1. Designing a new computation representation -- a computation graph of operators and variables.
-1. The graph representation helps implementing auto-scalable and auto fault recoverable distributed computing.
+1. Implementing auto-scalability and auto fault recoverable distributed computing with the help of computation graphs.
 ## Computation Graphs
-1. PaddlePaddle represent the computation, training and inference of DL models, by computation graphs.
+1. PaddlePaddle represents the computation, training and inference of Deep Learning models, by computation graphs.
-  1. Please dig into [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a solid example.
+  1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a concrete example.
-1. Users write Python programs to describe the graphs and run it (locally or remotely).
+1. Users write Python programs to describe the graphs and run them (locally or remotely).
 1. A graph is composed of *variables* and *operators*.
-1. The description of graphs must be able to be serialized/deserialized, so it
+1. The description of graphs must be capable of being serialized/deserialized, so that:
-   1. could to be sent to the cloud for distributed execution, and
+   1. It can to be sent to the cloud for distributed execution, and
-   1. be sent to clients for mobile or enterprise deployment.
+   1. It can be sent to clients for mobile or enterprise deployment.
-1. The Python program do
+1. The Python program does the following steps
-   1. *compilation*: runs a Python program to generate a protobuf message representation of the graph and send it to
+   1. *compilation*: run a Python program to generate a protobuf message representation of the graph and send it to
      1. the C++ library `libpaddle.so` for local execution,
      1. the master process of a distributed training job for training, or
      1. the server process of a Kubernetes serving job for distributed serving.
-   1. *execution*: according to the protobuf message, constructs instances of class `Variable` and `OperatorBase`, and run them.
+   1. *execution*: execute the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message.
-## Description and Realization
+## Description and Realization of Computation Graph
-At compile time, the Python program generates protobuf message representation of the graph, or the description of the graph.
+At compile time, the Python program generates a protobuf message representation of the graph, or the description of the graph.
-At runtime, the C++ program realizes the graph and run it.
+At runtime, the C++ program realizes the graph and runs it.
 | | Representation (protobuf messages) | Realization (C++ class objects) |
 |---|---|---|
@@ -42,30 +42,31 @@ At runtime, the C++ program realizes the graph and run it.
 |Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
 |Block|BlockDesc|Block|
-The word *graph* is exchangable with *block* in this document.  A graph represent computation steps and local variables as a C++/Java program block, or a pair of { and }.
+The word *graph* is interchangeable with *block* in this document.  A graph represents computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`).
 ## Compilation and Execution
-1. Run an applicaton Python program to describe the graph.  In particular,
+1. Run an application Python program to describe the graph.  In particular, the Python application program does the following:
-   1. create VarDesc to represent local/intermediate variables,
+   1. Create `VarDesc` to represent local/intermediate variables,
-   1. create operators and set attributes,
+   1. Create operators and set attributes,
-   1. validate attribute values,
+   1. Validate attribute values,
-   1. inference the type and the shape of variables,
+   1. Infer the type and the shape of variables,
-   1. plan for memory-reuse for variables,
+   1. Plan memory-reuse for variables,
-   1. generate backward and optimization part of the Graph.
+   1. Generate the backward graph
-   1. possiblly split the graph for distributed training.
+   1. Optimize the computation graph.
+   1. Potentially, split the graph for distributed training.
-1. The invocation of `train` or `infer` in the application Python program:
+1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the application Python program does the following:
-   1. create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block,
+   1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block,
      1. realize local variables defined in the BlockDesc message in the new scope,
      1. a scope is similar to the stack frame in programming languages,
-   1. create an instance of class `Block`, in which,
+   1. Create an instance of class `Block`, in which,
      1. realize operators in the BlockDesc message,
-   1. run the Block by calling
+   1. Run the Block by calling
      1. `Block::Eval(vector<Variable>* targets)` for forward and backward computations, or
      1. `Block::Eval(vector<Operator>* targets)` for optimization.
@@ -76,14 +77,14 @@ The word *graph* is exchangable with *block* in this document.  A graph represen
 Compile Time -> IR -> Runtime
 ```
-### Benefit
+### Benefits of IR
 - Optimization
  ```text
  Compile Time -> IR -> Optimized IR -> Runtime
  ```
- Send automatically partitioned IR to different nodes.
+- Automatically send partitioned IR to different nodes.
-  - Automatic data parallel
+  - Automatic Data Parallelism
    ```text
    Compile Time
    |-> Single GPU IR
@@ -92,7 +93,7 @@ Compile Time -> IR -> Runtime
            |-> Node-1 (runs trainer-IR-1)
            |-> Node-2 (runs pserver-IR)
    ```
-  - Automatic model parallel (planned for future)
+  - Automatic Model Parallelism (planned for future)
 ---
@@ -105,10 +106,10 @@ Compile Time -> IR -> Runtime
 # Operator
 ![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot)
-* `Operator` is the fundamental building block as the user interface.
+* `Operator` is the fundamental building block of the user interface.
-    * Operator stores input/output variable name, and attributes.
+    * Operator stores input/output variable names, and attributes.
-    * The `InferShape` interface is used to infer output variable shapes by its input shapes.
+    * The `InferShape` interface is used to infer the shape of the output variable shapes based on the shapes of the input variables.
-    * Use `Run` to compute `input variables` to `output variables`.
+    * Use `Run` to compute the `output` variables from the `input` variables.
 ---
@@ -126,30 +127,29 @@ Compile Time -> IR -> Runtime
 # Why separate Kernel and Operator
 * Separate GPU and CPU code.
-    * Make Paddle can run without GPU.
+    * Make Paddle capable of running without GPU.
-* Make one operator (which is user interface) can contain many implementations.
+* Make one operator (which is a user interface) and create many implementations.
-    * Same mul op, different FP16, FP32 Kernel. different MKL, eigen kernel.
+    * For example, same multiplication op can have different implementations kernels such as FP16 kernel, FP32 kernel, MKL, eigen kernel.
 ---
 # Libraries for Kernel development
 * `Eigen::Tensor` contains basic math and element-wise functions.
    * Note that `Eigen::Tensor` has broadcast implementation.
-    * Limit number of `tensor.device(dev) = ` in your code.
+    * Limit the number of `tensor.device(dev) = ` in your code.
-* `thrust::tranform` and `std::transform`.
+* `thrust::transform` and `std::transform`.
-    * `thrust` has the same API as C++ standard library. Using `transform` can quickly implement a customized elementwise kernel.
+    * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels.
-    * `thrust` has more complex API, like `scan`, `reduce`, `reduce_by_key`.
+    * `thrust` also has more complex APIs, like `scan`, `reduce`, `reduce_by_key`.
 * Hand-writing `GPUKernel` and `CPU` code
-    * Do not write `.h`. CPU Kernel should be in `.cc`. GPU kernel should be in `.cu`. (`GCC` cannot compile GPU code.)
+    * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.)
 ---
-# Operator Register
+# Operator Registration
-## Why register is necessary?
+## Why is registration necessary?
 We need a method to build mappings between Op type names and Op classes.
-## How to do the register?
+## How is registration implemented?
+Maintaining a map, whose key is the type name and the value is the corresponding Op constructor.
-Maintain a map, whose key is the type name and value is corresponding Op constructor.
 ---
 # The Registry Map
@@ -169,7 +169,7 @@ Maintain a map, whose key is the type name and value is corresponding Op constru
 # Related Concepts
 ### Op_Maker
-It's constructor takes `proto` and `checker`. They are compeleted during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
+It's constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
 ### Register Macros
 ```cpp
@@ -177,34 +177,34 @@ REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class)
 REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
 ```
-### `USE` Macros
+### USE Macros
-make sure the registration process is executed and linked.
+Make sure the registration process is executed and linked.
 ---
-# Register Process
+# Registration Process
-1. Write Op class, as well as its gradient Op class if there is.
+1. Write an Op class and its gradient Op class, if required.
-2. Write Op maker class. In the constructor, describe its inputs, outputs, and attributes.
+2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator.
-3. Invoke macro `REGISTER_OP`. The macro will
+3. Invoke the macro `REGISTER_OP`. This macro will
-	1. call maker class to complete `proto` and `checker`
+	1. Call maker class to complete the `proto` and the `checker`
-	2. with the completed `proto` and `checker`, build a new key-value pair in the `OpInfoMap`
+	2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap`
-4. Invoke `USE` macro in where the Op is used to make sure it is linked.
+4. Invoke the `USE` macro in which the Op is used, to make sure that it is linked.
 ---
 # Backward Module (1/2)
 ### Create Backward Operator
- Mapping from forwarding Op to backward Op
+- Mapping from forward Op to backward Op
 ![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png)
 ---
 # Backward Module (2/2)
 ### Build Backward Network
- **Input** graph of forwarding operators
+- **Input**: graph of forward operators
- **Output** graph of backward operators
+- **Output**: graph of backward operators
- **corner case in construction**
+- **Corner cases in construction**
-	- shared variable => insert `Add` operator
+	- Shared Variables => insert an `Add` operator to combine gradients
-	- no gradient => insert `fill_zero_grad` operator
+	- No Gradient => insert a `fill_zero_grad` operator
-	- recursive netOp => call `Backward` recursively
+	- Recursive NetOp => call `Backward` recursively
 	- RNN Op => recursively call `Backward` on stepnet
@@ -213,41 +213,41 @@ make sure the registration process is executed and linked.
 * `Tensor` is an n-dimension array with type.
 	* Only dims and data pointers are stored in `Tensor`.
-	* All operators on `Tensor` is written in `Operator` or global functions.
+	* All operations on `Tensor` are written in `Operator` or global functions.
-	* variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
+	* Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
-* `Variable` is the inputs and outputs of an operator. Not just `Tensor`.
+* `Variable` instances are the inputs and the outputs of an operator. Not just `Tensor`.
-	* step_scopes in RNN is a variable and not a tensor.
+	* `step_scopes` in RNN is a variable and not a tensor.
-* `Scope` is where variables store at.
+* `Scope` is where variables are stores.
-	* map<string/*var name */, Variable>
+	* map<string `variable_name`, Variable>
-	* `Scope` has a hierarchical structure. The local scope can get variable from its parent scope.
+	* `Scope` has a hierarchical structure. The local scope can get variables from its parent scope.
 ---
 # Block (in design)
-## the difference with original RNNOp
+## the difference between original RNNOp and Block
- as an operator is more intuitive than `RNNOp`,
+- As an operator is more intuitive than `RNNOp`,
- offers new interface `Eval(targets)` to deduce the minimal block to `Run`,
+- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
- fits the compile-time/ runtime separation design.
+- Fits the compile-time/ runtime separation design paradigm.
-  - during the compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serialize to a `BlockDesc`
+  - During the compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serialize to a `BlockDesc`
-  - when graph executes, a Block with `BlockDesc` passed in creates `Op` and `Var` then `Run`
+  - When graph executes, a Block with `BlockDesc` is passed. It then creates `Op` and `Var` instances and then invokes `Run`.
 ---
 # Milestone
- take Paddle/books as the main line, the requirement of the models motivates framework refactoring,
+- Take Paddle/books as the main line, the requirement of the models motivates framework refactoring,
- model migration
+- Model migration
-  - framework development gives **priority support** to model migration, for example,
+  - Framework development gives **priority support** to model migration, for example,
    - the MNIST demo needs a Python interface,
    - the RNN models require the framework to support `LoDTensor`.
-  - determine some timelines,
+  - Determine some timelines,
-  - heavily-relied Ops need to be migrated first,
+  - Frequently used Ops need to be migrated first,
-  - different models can be migrated parallelly.
+  - Different models can be migrated in parallel.
- improve the framework at the same time
+- Improve the framework at the same time
- accept imperfection, concentrated on solving the specific problem at the right price.
+- Accept imperfection, concentrate on solving the specific problem at the right price.
 ---
 # Control the migration quality
- compare the performance of migrated models with old ones.
+- Compare the performance of migrated models with old ones.
- follow google C style
+- Follow the google C++ style
- build the automatic workflow of generating Python/C++ documentations
+- Build the automatic workflow of generating Python/C++ documentations.
-  - the documentation of layers and ops should be written inside the code
+  - The documentation of layers and ops should be written inside the code.
-  - take the documentation quality into account when doing PR
+  - Take the documentation quality into account when submitting pull requests.
-  - preview the documentations, read and improve them from users' perspective
+  - Preview the documentations, read and improve them from a user's perspective.
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -285,41 +285,27 @@ class TestMulGradOp(GradientChecker):
            'Y': np.random.random((84, 100)).astype("float32")
        }
-    def test_cpu_gpu_compare(self):
+    def test_check_grad_normal(self):
-        self.compare_grad(self.op, self.inputs)
-    def test_normal(self):
        # mul op will enlarge the relative error
-        self.check_grad(
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
-            self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5)
-    def test_ignore_x(self):
+    def test_check_grad_ingore_x(self):
        self.check_grad(
-            self.op,
+            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-            self.inputs, ["Y"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"X"})
-    def test_ignore_y(self):
+    def test_check_grad_ingore_y(self):
        self.check_grad(
-            self.op,
+            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-            self.inputs, ["X"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"Y"})
 ```
 下面解释代码中一些关键的地方:
 - 调用`create_op("mul")`创建反向Op对应的前向Op。
- 调用`compare_grad`函数对比CPU、GPU计算结果。
+- `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
- `test_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
+  - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
-  - 第一个参数`self.op` : 前向Op。
+  - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。
-  - 第二个参数`self.inputs` : 输入词典，词典的Key和`ProtoMaker`定义保持一致。
+  - 第三个参数`max_relative_error`：指定检测梯度时能容忍的最大错误值。
-  - 第三个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
+- `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。
-  - 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`
- `test_ignore_x`和`test_ignore_y`分支用来测试只需要计算一个输入梯度的情况。
 ### 编译和执行单元测试

--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
@@ -293,41 +293,27 @@ class TestMulGradOp(GradientChecker):
            'Y': np.random.random((84, 100)).astype("float32")
        }
-    def test_cpu_gpu_compare(self):
+    def test_check_grad_normal(self):
-        self.compare_grad(self.op, self.inputs)
-    def test_normal(self):
        # mul op will enlarge the relative error
-        self.check_grad(
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
-            self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5)
-    def test_ignore_x(self):
+    def test_check_grad_ingore_x(self):
        self.check_grad(
-            self.op,
+            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-            self.inputs, ["Y"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"X"})
-    def test_ignore_y(self):
+    def test_check_grad_ingore_y(self):
        self.check_grad(
-            self.op,
+            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-            self.inputs, ["X"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"Y"})
 ```
 Some key points in the code above include:
 - `create_op("mul")` creates the backward operator's corresponding forward operator.
- `compare_grad` compares results between utilizing the CPU and the GPU.
 - `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods.
-  - The first variable `self.op` denotes the forward operator.
+  - The first variable `["X", "Y"]` appoints `X` and `Y` to be scale tested.
-  - The second variable `self.inputs` denotes the input dictionary, which has its key value identical to its `ProtoMaker` definitions.
+  - The second variable `"Out"` points to the network's final output target `Out`.
-  - The third variable `["X", "Y"]` appoints `X` and `Y` to be scale tested.
+  - The third variable `max_relative_error` points to the maximum relative tolerance error during scaling tests.
-  - The fourth variable `"Out"` points to the network's final output target `Out`.
+- `test_check_grad_ingore_x` and `test_check_grad_ingore_y`branches test the cases where there is only one scaling input.
- `test_ignore_x` and `test_ignore_y`branches test the cases where there is only one scaling input.
 ### Compiling and Running

--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -19,13 +19,14 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
 proto_library(framework_proto SRCS framework.proto)
 cc_library(attribute SRCS attribute.cc DEPS framework_proto)
+cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute)
 cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
-cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator)
+cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator proto_desc)
 cc_library(op_registry SRCS op_registry.cc DEPS grad_op_builder op_proto_maker op_info)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)

--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/program_desc.h"
+namespace paddle {
+namespace framework {
+VarDescBind *BlockDescBind::NewVar(const std::string &name) {
+  need_update_ = true;
+  auto it = vars_.find(name);
+  PADDLE_ENFORCE(it == vars_.end(), "Duplicated variable %s", name);
+  auto var = new VarDescBind(name);
+  vars_[name].reset(var);
+  return var;
+}
+VarDescBind *BlockDescBind::Var(const std::string &name) const {
+  auto it = vars_.find(name);
+  PADDLE_ENFORCE(it != vars_.end(),
+                 "Can not find variable %s in current block.", name);
+  return it->second.get();
+}
+std::vector<VarDescBind *> BlockDescBind::AllVars() const {
+  std::vector<VarDescBind *> res;
+  for (const auto &p : vars_) {
+    res.push_back(p.second.get());
+  }
+  return res;
+}
+OpDescBind *BlockDescBind::AppendOp() {
+  need_update_ = true;
+  ops_.emplace_back(new OpDescBind());
+  return ops_.back().get();
+}
+OpDescBind *BlockDescBind::PrependOp() {
+  need_update_ = true;
+  ops_.emplace_front(new OpDescBind());
+  return ops_.front().get();
+}
+std::vector<OpDescBind *> BlockDescBind::AllOps() const {
+  std::vector<OpDescBind *> res;
+  for (const auto &op : ops_) {
+    res.push_back(op.get());
+  }
+  return res;
+}
+void BlockDescBind::Sync() {
+  if (need_update_) {
+    auto &op_field = *this->desc_->mutable_ops();
+    op_field.Clear();
+    op_field.Reserve(static_cast<int>(ops_.size()));
+    for (auto &op_desc : ops_) {
+      op_field.AddAllocated(op_desc->Proto());
+    }
+    need_update_ = false;
+  }
+}
+BlockDescBind *BlockDescBind::ParentBlock() const {
+  if (this->desc_->parent_idx() == -1) {
+    return nullptr;
+  }
+  return prog_->Block(static_cast<size_t>(this->desc_->parent_idx()));
+}
+void OpDescBind::SetBlockAttr(const std::string &name, BlockDescBind &block) {
+  BlockDesc *desc = block.RawPtr();
+  this->attrs_[name] = desc;
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <deque>
+#include <unordered_map>
+#include <vector>
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/var_desc.h"
+namespace paddle {
+namespace framework {
+class ProgramDescBind;
+// Each Protobuf Message, we provide a XXXBind class. In that class, we optimize
+// read/write speed. Only when we want the protobuf message, the local changes
+// will be synchronized (by `Sync` method).
+class BlockDescBind {
+ public:
+  BlockDescBind(ProgramDescBind *prog, BlockDesc *desc)
+      : prog_(prog), desc_(desc), need_update_(false) {}
+  BlockDescBind(const BlockDescBind &o) = delete;
+  BlockDescBind &operator=(const BlockDescBind &o) = delete;
+  int32_t ID() const { return desc_->idx(); }
+  int32_t Parent() const { return desc_->parent_idx(); }
+  VarDescBind *NewVar(const std::string &name_bytes);
+  VarDescBind *Var(const std::string &name_bytes) const;
+  std::vector<VarDescBind *> AllVars() const;
+  BlockDescBind *ParentBlock() const;
+  OpDescBind *AppendOp();
+  OpDescBind *PrependOp();
+  std::vector<OpDescBind *> AllOps() const;
+  void Sync();
+  BlockDesc *RawPtr() { return desc_; }
+ private:
+  ProgramDescBind *prog_;  // not_own
+  BlockDesc *desc_;        // not_own
+  bool need_update_;
+  std::deque<std::unique_ptr<OpDescBind>> ops_;
+  std::unordered_map<std::string, std::unique_ptr<VarDescBind>> vars_;
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <typeindex>
+#include "paddle/framework/framework.pb.h"
+namespace paddle {
+namespace framework {
+inline DataType ToDataType(std::type_index type) {
+  if (typeid(float).hash_code() == type.hash_code()) {
+    return DataType::FP32;
+  } else if (typeid(double).hash_code() == type.hash_code()) {
+    return DataType::FP64;
+  } else if (typeid(int).hash_code() == type.hash_code()) {
+    return DataType::INT32;
+  } else {
+    PADDLE_THROW("Not supported");
+    return static_cast<DataType>(-1);
+  }
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/grad_op_builder.cc
+++ b/paddle/framework/grad_op_builder.cc
@@ -54,5 +54,44 @@ OperatorBase* BuildGradOp(const OperatorBase* op) {
  return grad_info.Creator()(info.grad_op_type_, inputs, outputs, op->Attrs());
 }
+static void TransOpDescArg(const OpDescBind* src_op, const OpArgType& src_type,
+                           bool is_grad, OpDescBind* dst_op,
+                           const OpArgType& dst_type) {
+  PADDLE_ENFORCE(dst_op != nullptr,
+                 "Protobuf desc of gradient op must be initialized first.");
+  const auto& proto = OpInfoMap::Instance().Get(src_op->Type()).Proto();
+  const auto& src_arg_list =
+      src_type == OpArgType::IN ? proto.inputs() : proto.outputs();
+  for (const auto& arg : src_arg_list) {
+    if (arg.not_in_gradient() && !is_grad) continue;
+    const std::string src_name = arg.name();
+    std::vector<std::string> vars = src_type == OpArgType::IN
+                                        ? src_op->Input(src_name)
+                                        : src_op->Output(src_name);
+    if (is_grad) {
+      for (std::string& var : vars) {
+        var = GradVarName(var);
+      }
+    }
+    std::string dst_name = is_grad ? GradVarName(src_name) : src_name;
+    dst_type == OpArgType::IN ? dst_op->SetInput(dst_name, vars)
+                              : dst_op->SetOutput(dst_name, vars);
+  }
+}
+void CompleteGradOpDesc(const OpDescBind* forw_op, OpDescBind* grad_op) {
+  auto& info = OpInfoMap::Instance().Get(forw_op->Type());
+  PADDLE_ENFORCE(info.HasGradientOp());
+  grad_op->SetType(info.grad_op_type_);
+  TransOpDescArg(forw_op, OpArgType::IN, false, grad_op, OpArgType::IN);
+  TransOpDescArg(forw_op, OpArgType::OUT, false, grad_op, OpArgType::IN);
+  TransOpDescArg(forw_op, OpArgType::OUT, true, grad_op, OpArgType::IN);
+  TransOpDescArg(forw_op, OpArgType::IN, true, grad_op, OpArgType::OUT);
+  grad_op->SetAttrMap(forw_op->GetAttrMap());
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/grad_op_builder.h
+++ b/paddle/framework/grad_op_builder.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
+#include "paddle/framework/op_desc.h"
 #include "paddle/framework/operator.h"
 namespace paddle {
@@ -21,5 +22,7 @@ namespace framework {
 OperatorBase* BuildGradOp(const OperatorBase* op);
+void CompleteGradOpDesc(const OpDescBind* forw_op, OpDescBind* grad_op);
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
@@ -120,3 +120,82 @@ TEST(GradOpBuilder, IOIgnoredInGradient) {
            std::vector<std::string>(
                {f::GradVarName("in3_1"), f::GradVarName("in3_2")}));
 }
+TEST(GradOpDescBuilder, MutiInOut) {
+  f::OpDescBind *forw_op = new f::OpDescBind();
+  forw_op->SetType("mult_io");
+  forw_op->SetInput("In1", {"in1"});
+  forw_op->SetInput("In2_mult", {"in2_1", "in2_2", "in2_3"});
+  forw_op->SetInput("In3", {"in3"});
+  forw_op->SetOutput("Out1", {"out1"});
+  forw_op->SetOutput("Out2_mult", {"out2_1", "out2_2"});
+  f::OpDescBind *grad_op = new f::OpDescBind();
+  f::CompleteGradOpDesc(forw_op, grad_op);
+  EXPECT_EQ(grad_op->Type(), "mult_io_grad");
+  ASSERT_EQ(grad_op->InputNames().size(), 3UL + 2UL + 2UL);
+  EXPECT_EQ(grad_op->Input("In1"), std::vector<std::string>({"in1"}));
+  EXPECT_EQ(grad_op->Input("In2_mult"),
+            std::vector<std::string>({"in2_1", "in2_2", "in2_3"}));
+  EXPECT_EQ(grad_op->Input("In3"), std::vector<std::string>({"in3"}));
+  EXPECT_EQ(grad_op->Input("Out1"), std::vector<std::string>({"out1"}));
+  EXPECT_EQ(grad_op->Input("Out2_mult"),
+            std::vector<std::string>({"out2_1", "out2_2"}));
+  EXPECT_EQ(grad_op->Input(f::GradVarName("Out1")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op->Input(f::GradVarName("Out2_mult")),
+            std::vector<std::string>(
+                {f::GradVarName("out2_1"), f::GradVarName("out2_2")}));
+  ASSERT_EQ(grad_op->OutputNames().size(), 3UL);
+  EXPECT_EQ(grad_op->Output(f::GradVarName("In1")),
+            std::vector<std::string>({f::GradVarName("in1")}));
+  EXPECT_EQ(grad_op->Output(f::GradVarName("In2_mult")),
+            std::vector<std::string>({f::GradVarName("in2_1"),
+                                      f::GradVarName("in2_2"),
+                                      f::GradVarName("in2_3")}));
+  EXPECT_EQ(grad_op->Output(f::GradVarName("In3")),
+            std::vector<std::string>({f::GradVarName("in3")}));
+  delete forw_op;
+  delete grad_op;
+}
+TEST(GradOpDescBuilder, IOIgnoredInGradient) {
+  f::OpDescBind *forw_op = new f::OpDescBind();
+  forw_op->SetType("io_ignored");
+  forw_op->SetInput("In1", {"in1"});
+  forw_op->SetInput("In2_mult", {"in2_1", "in2_2"});
+  forw_op->SetInput("In3_mult", {"in3_1", "in3_2"});
+  forw_op->SetOutput("Out1_mult", {"out1_1", "out1_2"});
+  forw_op->SetOutput("Out2", {"out2"});
+  f::OpDescBind *grad_op = new f::OpDescBind();
+  f::CompleteGradOpDesc(forw_op, grad_op);
+  EXPECT_EQ(grad_op->Type(), "io_ignored_grad");
+  // 'In2' and 'Out2' are ignored in gradient calculating
+  ASSERT_EQ(grad_op->InputNames().size(), 2UL + 1UL + 2UL);
+  EXPECT_EQ(grad_op->Input("In1"), std::vector<std::string>({"in1"}));
+  EXPECT_EQ(grad_op->Input("In3_mult"),
+            std::vector<std::string>({"in3_1", "in3_2"}));
+  EXPECT_EQ(grad_op->Input("Out1_mult"),
+            std::vector<std::string>({"out1_1", "out1_2"}));
+  EXPECT_EQ(grad_op->Input(f::GradVarName("Out1_mult")),
+            std::vector<std::string>(
+                {f::GradVarName("out1_1"), f::GradVarName("out1_2")}));
+  EXPECT_EQ(grad_op->Input(f::GradVarName("Out2")),
+            std::vector<std::string>({f::GradVarName("out2")}));
+  ASSERT_EQ(grad_op->OutputNames().size(), 3UL);
+  EXPECT_EQ(grad_op->Output(f::GradVarName("In1")),
+            std::vector<std::string>({f::GradVarName("in1")}));
+  EXPECT_EQ(grad_op->Output(f::GradVarName("In2_mult")),
+            std::vector<std::string>(
+                {f::GradVarName("in2_1"), f::GradVarName("in2_2")}));
+  EXPECT_EQ(grad_op->Output(f::GradVarName("In3_mult")),
+            std::vector<std::string>(
+                {f::GradVarName("in3_1"), f::GradVarName("in3_2")}));
+  delete forw_op;
+  delete grad_op;
+}
\ No newline at end of file
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/block_desc.h"
+namespace paddle {
+namespace framework {
+OpDesc *OpDescBind::Proto() {
+  Sync();
+  return &op_desc_;
+}
+const std::vector<std::string> &OpDescBind::Input(
+    const std::string &name) const {
+  auto it = inputs_.find(name);
+  PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name,
+                 Type());
+  return it->second;
+}
+std::vector<std::string> OpDescBind::InputNames() const {
+  std::vector<std::string> retv;
+  retv.reserve(this->inputs_.size());
+  for (auto &ipt : this->inputs_) {
+    retv.push_back(ipt.first);
+  }
+  return retv;
+}
+void OpDescBind::SetInput(const std::string &param_name,
+                          const std::vector<std::string> &args) {
+  need_update_ = true;
+  inputs_[param_name] = args;
+}
+const std::vector<std::string> &OpDescBind::Output(
+    const std::string &name) const {
+  auto it = outputs_.find(name);
+  PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s",
+                 name, Type());
+  return it->second;
+}
+std::vector<std::string> OpDescBind::OutputNames() const {
+  std::vector<std::string> retv;
+  retv.reserve(this->outputs_.size());
+  for (auto &ipt : this->outputs_) {
+    retv.push_back(ipt.first);
+  }
+  return retv;
+}
+void OpDescBind::SetOutput(const std::string &param_name,
+                           const std::vector<std::string> &args) {
+  need_update_ = true;
+  this->outputs_[param_name] = args;
+}
+AttrType OpDescBind::GetAttrType(const std::string &name) const {
+  auto it = attrs_.find(name);
+  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  return static_cast<AttrType>(it->second.which() - 1);
+}
+std::vector<std::string> OpDescBind::AttrNames() const {
+  std::vector<std::string> retv;
+  retv.reserve(attrs_.size());
+  for (auto &attr : attrs_) {
+    retv.push_back(attr.first);
+  }
+  return retv;
+}
+void OpDescBind::SetAttr(const std::string &name, const Attribute &v) {
+  this->attrs_[name] = v;
+  need_update_ = true;
+}
+void OpDescBind::SetAttrMap(
+    const std::unordered_map<std::string, Attribute> &attr_map) {
+  attrs_ = attr_map;
+  need_update_ = true;
+}
+Attribute OpDescBind::GetAttr(const std::string &name) const {
+  auto it = attrs_.find(name);
+  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  return it->second;
+}
+int OpDescBind::GetBlockAttr(const std::string &name) const {
+  auto it = attrs_.find(name);
+  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  return boost::get<BlockDesc *>(it->second)->idx();
+}
+const std::unordered_map<std::string, Attribute> &OpDescBind::GetAttrMap()
+    const {
+  return attrs_;
+}
+void OpDescBind::Sync() {
+  if (need_update_) {
+    this->op_desc_.mutable_inputs()->Clear();
+    for (auto &ipt : inputs_) {
+      auto *input = op_desc_.add_inputs();
+      input->set_parameter(ipt.first);
+      VectorToRepeated(ipt.second, input->mutable_arguments());
+    }
+    this->op_desc_.mutable_outputs()->Clear();
+    for (auto &opt : outputs_) {
+      auto *output = op_desc_.add_outputs();
+      output->set_parameter(opt.first);
+      VectorToRepeated(opt.second, output->mutable_arguments());
+    }
+    this->op_desc_.mutable_attrs()->Clear();
+    for (auto &attr : attrs_) {
+      auto *attr_desc = op_desc_.add_attrs();
+      attr_desc->set_name(attr.first);
+      attr_desc->set_type(
+          static_cast<framework::AttrType>(attr.second.which() - 1));
+      boost::apply_visitor(SetAttrDescVisitor(attr_desc), attr.second);
+    }
+    need_update_ = false;
+  }
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <unordered_map>
+#include <vector>
+#include "paddle/framework/attribute.h"
+#include "paddle/framework/var_desc.h"
+namespace paddle {
+namespace framework {
+class BlockDescBind;
+class OpDescBind {
+ public:
+  OpDesc *Proto();
+  std::string Type() const { return op_desc_.type(); }
+  void SetType(const std::string &type) { op_desc_.set_type(type); }
+  const std::vector<std::string> &Input(const std::string &name) const;
+  std::vector<std::string> InputNames() const;
+  void SetInput(const std::string &param_name,
+                const std::vector<std::string> &args);
+  const std::vector<std::string> &Output(const std::string &name) const;
+  std::vector<std::string> OutputNames() const;
+  void SetOutput(const std::string &param_name,
+                 const std::vector<std::string> &args);
+  std::string DebugString() { return this->Proto()->DebugString(); }
+  bool HasAttr(const std::string &name) const {
+    return attrs_.find(name) != attrs_.end();
+  }
+  AttrType GetAttrType(const std::string &name) const;
+  std::vector<std::string> AttrNames() const;
+  void SetAttr(const std::string &name, const Attribute &v);
+  void SetBlockAttr(const std::string &name, BlockDescBind &block);
+  // Only be used in C++
+  void SetAttrMap(const std::unordered_map<std::string, Attribute> &attr_map);
+  Attribute GetAttr(const std::string &name) const;
+  int GetBlockAttr(const std::string &name) const;
+  // Only be used in C++
+  const std::unordered_map<std::string, Attribute> &GetAttrMap() const;
+ private:
+  struct SetAttrDescVisitor : public boost::static_visitor<void> {
+    explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
+    mutable OpDesc::Attr *attr_;
+    void operator()(int v) const { attr_->set_i(v); }
+    void operator()(float v) const { attr_->set_f(v); }
+    void operator()(const std::string &v) const { attr_->set_s(v); }
+    void operator()(bool b) const { attr_->set_b(b); }
+    void operator()(const std::vector<int> &v) const {
+      VectorToRepeated(v, attr_->mutable_ints());
+    }
+    void operator()(const std::vector<float> &v) const {
+      VectorToRepeated(v, attr_->mutable_floats());
+    }
+    void operator()(const std::vector<std::string> &v) const {
+      VectorToRepeated(v, attr_->mutable_strings());
+    }
+    void operator()(const std::vector<bool> &v) const {
+      VectorToRepeated(v, attr_->mutable_bools());
+    }
+    void operator()(BlockDesc *desc) const {
+      attr_->set_block_idx(desc->idx());
+    }
+    void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
+  };
+  void Sync();
+  OpDesc op_desc_;
+  std::unordered_map<std::string, std::vector<std::string>> inputs_;
+  std::unordered_map<std::string, std::vector<std::string>> outputs_;
+  std::unordered_map<std::string, Attribute> attrs_;
+  // need_update_ indicate there some local changes not be synchronized. If
+  // local changes should be synchronized, need_update_ should be set to true.
+  bool need_update_{false};
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -100,13 +100,39 @@ class OpRegistrar : public Registrar {
  }
 };
-template <typename PlaceType, typename KernelType>
+template <typename PlaceType, bool at_end, size_t I, typename... KernelType>
+struct OpKernelRegistrarFunctor;
+template <typename PlaceType, size_t I, typename... KernelTypes>
+struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
+  using KERNEL_TYPE =
+      typename std::tuple_element<I, std::tuple<KernelTypes...>>::type;
+  void operator()(const char* op_type) const {
+    using T = typename KERNEL_TYPE::ELEMENT_TYPE;
+    OperatorWithKernel::OpKernelKey key(ToDataType(std::type_index(typeid(T))),
+                                        PlaceType());
+    OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
+    constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
+    OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
+        func;
+    func(op_type);
+  }
+};
+template <typename PlaceType, size_t I, typename... KernelType>
+struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
+  void operator()(const char* op_type) const {}
+};
+// User can register many kernel in one place. The data type could be different.
+template <typename PlaceType, typename... KernelType>
 class OpKernelRegistrar : public Registrar {
 public:
  explicit OpKernelRegistrar(const char* op_type) {
-    OperatorWithKernel::OpKernelKey key;
+    OpKernelRegistrarFunctor<PlaceType, false, 0, KernelType...> func;
-    key.place_ = PlaceType();
+    func(op_type);
-    OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KernelType);
  }
 };

--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -10,7 +10,6 @@ class CosineOp : public OperatorBase {
  using OperatorBase::OperatorBase;
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {}
-  void InferShape(const Scope& scope) const override {}
 };
 class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -29,7 +28,6 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 class MyTestOp : public OperatorBase {
 public:
  using OperatorBase::OperatorBase;
-  void InferShape(const Scope& scope) const override {}
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {}
 };

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -22,14 +22,14 @@ namespace framework {
 template <>
 Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
    platform::CPUPlace, Eigen::DefaultDevice>() const {
-  return *device_context_.get_eigen_device<Eigen::DefaultDevice>();
+  return *device_context_.GetEigenDevice<platform::CPUPlace>();
 }
 #ifndef PADDLE_ONLY_CPU
 template <>
 Eigen::GpuDevice&
 ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-  return *device_context_.get_eigen_device<Eigen::GpuDevice>();
+  return *device_context_.GetEigenDevice<platform::GPUPlace>();
 }
 #endif

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "op_info.h"
 #include "paddle/framework/attribute.h"
+#include "paddle/framework/data_type.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/scope.h"
@@ -83,10 +84,6 @@ class OperatorBase {
  virtual std::string DebugString() const;
-  /// InferShape infer the size of Variables used by this Operator with
-  /// information inside scope
-  virtual void InferShape(const Scope& scope) const = 0;
  /// Net will call this function to Run an op.
  virtual void Run(const Scope& scope,
                   const platform::DeviceContext& dev_ctx) const = 0;
@@ -164,7 +161,6 @@ class OperatorBase {
 class NOP : public OperatorBase {
 public:
  using OperatorBase::OperatorBase;
-  void InferShape(const Scope& scope) const override {}
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {}
  std::unique_ptr<OperatorBase> Clone() const override {
@@ -300,21 +296,6 @@ template <>
 std::vector<Tensor*> InferShapeContext::MultiOutput<Tensor>(
    const std::string& name) const;
-template <typename T>
-struct EigenDeviceConverter;
-template <>
-struct EigenDeviceConverter<platform::CPUPlace> {
-  using EigenDeviceType = Eigen::DefaultDevice;
-};
-#ifndef PADDLE_ONLY_CPU
-template <>
-struct EigenDeviceConverter<platform::GPUPlace> {
-  using EigenDeviceType = Eigen::GpuDevice;
-};
-#endif
 class ExecutionContext : public InferShapeContext {
 public:
  ExecutionContext(const OperatorBase& op, const Scope& scope,
@@ -322,8 +303,8 @@ class ExecutionContext : public InferShapeContext {
      : InferShapeContext(op, scope), device_context_(device_context) {}
  template <typename PlaceType,
-            typename DeviceType =
+            typename DeviceType = typename platform::EigenDeviceConverter<
-                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
+                PlaceType>::EigenDeviceType>
  DeviceType& GetEigenDevice() const;
  platform::Place GetPlace() const { return device_context_.GetPlace(); }
@@ -353,6 +334,32 @@ class RuntimeInferShapeContext : public InferShapeContextBase {
    return var != nullptr;
  }
+  bool HasInputs(const std::string& name) const {
+    auto inputs = op_.Inputs(name);
+    if (inputs.size() == 0UL) {
+      return false;
+    }
+    for (auto& input : inputs) {
+      if (scope_.FindVar(input) == nullptr) {
+        return false;
+      }
+    }
+    return true;
+  }
+  bool HasOutputs(const std::string& name) const {
+    auto outputs = op_.Outputs(name);
+    if (outputs.size() == 0UL) {
+      return false;
+    }
+    for (auto& output : outputs) {
+      if (scope_.FindVar(output) == nullptr) {
+        return false;
+      }
+    }
+    return true;
+  }
  DDim GetInputDim(const std::string& name) const {
    return GetDim(op_.Input(name));
  }
@@ -408,7 +415,7 @@ class RuntimeInferShapeContext : public InferShapeContextBase {
  const Scope& scope_;
 };
-class OpKernel {
+class OpKernelBase {
 public:
  /**
   * ExecutionContext is the only parameter of Kernel Run function.
@@ -419,48 +426,61 @@ class OpKernel {
  virtual void Compute(const ExecutionContext& context) const = 0;
-  virtual ~OpKernel() {}
+  virtual ~OpKernelBase() = default;
+};
+template <typename T>
+class OpKernel : public OpKernelBase {
+ public:
+  using ELEMENT_TYPE = T;
 };
 class OperatorWithKernel : public OperatorBase {
 public:
  struct OpKernelKey {
    platform::Place place_;
+    DataType data_type_;
-    OpKernelKey() = default;
+    OpKernelKey(DataType data_type, platform::Place place)
-    explicit OpKernelKey(const platform::DeviceContext& dev_ctx) {
+        : place_(place), data_type_(data_type) {}
-      place_ = dev_ctx.GetPlace();
-    }
+    OpKernelKey(DataType data_type, const platform::DeviceContext& dev_ctx)
+        : place_(dev_ctx.GetPlace()), data_type_(data_type) {}
    bool operator==(const OpKernelKey& o) const {
-      return platform::places_are_same_class(place_, o.place_);
+      return platform::places_are_same_class(place_, o.place_) &&
+             data_type_ == o.data_type_;
    }
  };
  struct OpKernelHash {
-    std::hash<bool> hash_;
+    std::hash<int> hash_;
    size_t operator()(const OpKernelKey& key) const {
-      return hash_(platform::is_gpu_place(key.place_));
+      int place = key.place_.which();
+      int data_type = static_cast<int>(key.data_type_);
+      int pre_hash = data_type << NUM_PLACE_TYPE_LIMIT_IN_BIT |
+                     (place & ((1 << NUM_PLACE_TYPE_LIMIT_IN_BIT) - 1));
+      return hash_(pre_hash);
    }
  };
  using OpKernelMap =
-      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
+      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernelBase>,
+                         OpKernelHash>;
  OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
                     const VariableNameMap& outputs, const AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
-  // runtime infershape
-  void InferShape(const Scope& scope) const override {
-    auto c = RuntimeInferShapeContext(*this, scope);
-    InferShape(&c);
-  }
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const final {
-    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
+    RuntimeInferShapeContext infer_shape_ctx(*this, scope);
-    opKernel->Compute(ExecutionContext(*this, scope, dev_ctx));
+    this->InferShape(&infer_shape_ctx);
+    ExecutionContext ctx(*this, scope, dev_ctx);
+    auto& opKernel = AllOpKernels().at(type_).at(
+        OpKernelKey(IndicateDataType(ctx), dev_ctx));
+    opKernel->Compute(ctx);
  }
  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
@@ -470,13 +490,43 @@ class OperatorWithKernel : public OperatorBase {
  }
  bool SupportGPU() const override {
-    OperatorWithKernel::OpKernelKey key;
+    auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_);
-    key.place_ = platform::GPUPlace();
+    return std::any_of(op_kernels.begin(), op_kernels.end(),
-    return OperatorWithKernel::AllOpKernels().at(type_).count(key) != 0;
+                       [](OpKernelMap::const_reference kern_pair) {
+                         return platform::is_gpu_place(kern_pair.first.place_);
+                       });
  }
 protected:
  virtual void InferShape(InferShapeContextBase* ctx) const = 0;
+  // indicate kernel DataType by input data. Defaultly all input data must be
+  // same.
+  virtual DataType IndicateDataType(const ExecutionContext& ctx) const {
+    auto& scope = ctx.scope();
+    int data_type = -1;
+    for (auto& input : this->inputs_) {
+      for (auto& ipt_name : input.second) {
+        auto* var = scope.FindVar(ipt_name);
+        if (var != nullptr) {
+          const Tensor* t = nullptr;
+          if (var->IsType<Tensor>()) {
+            t = &var->Get<Tensor>();
+          } else if (var->IsType<LoDTensor>()) {
+            t = &var->Get<LoDTensor>();
+          }
+          if (t != nullptr) {
+            int tmp = static_cast<int>(ToDataType(t->type()));
+            PADDLE_ENFORCE(tmp == data_type || data_type == -1,
+                           "DataType of Paddle Op must be same.");
+            data_type = tmp;
+          }
+        }
+      }
+    }
+    PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
+    return static_cast<DataType>(data_type);
+  }
 };
 }  // namespace framework

--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -27,7 +27,6 @@ class OpWithoutKernelTest : public OperatorBase {
  OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs,
                      const VariableNameMap& outputs, const AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs), x(1) {}
-  void InferShape(const Scope& scope) const override {}
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {
    ++op_run_num;
@@ -87,7 +86,6 @@ TEST(OperatorBase, all) {
  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  scope.NewVar("OUT1");
  ASSERT_EQ(paddle::framework::op_run_num, 0);
-  op->InferShape(scope);
  op->Run(scope, device_context);
  ASSERT_EQ(paddle::framework::op_run_num, 1);
 }
@@ -116,10 +114,13 @@ class OpWithKernelTest : public OperatorWithKernel {
 protected:
  void InferShape(framework::InferShapeContextBase* ctx) const override {}
+  DataType IndicateDataType(const ExecutionContext& ctx) const override {
+    return DataType::FP32;
+  }
 };
 template <typename T1, typename T2>
-class CPUKernelTest : public OpKernel {
+class CPUKernelTest : public OpKernel<float> {
 public:
  void Compute(const ExecutionContext& ctx) const {
    std::cout << "this is cpu kernel" << std::endl;
@@ -146,7 +147,7 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker
  }
 };
-class CPUKernalMultiInputsTest : public OpKernel {
+class CPUKernalMultiInputsTest : public OpKernel<float> {
 public:
  void Compute(const ExecutionContext& ctx) const {
    auto xs = ctx.op().Inputs("xs");
@@ -255,7 +256,6 @@ class OperatorClone : public paddle::framework::OperatorBase {
                const paddle::framework::VariableNameMap& outputs,
                const paddle::framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
-  void InferShape(const paddle::framework::Scope& scope) const override {}
  void Run(const paddle::framework::Scope& scope,
           const paddle::platform::DeviceContext& dev_ctx) const override {}
 };

--- a/paddle/framework/program_desc.cc
+++ b/paddle/framework/program_desc.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/program_desc.h"
+#include "paddle/framework/block_desc.h"
+namespace paddle {
+namespace framework {
+using ProgDescMap =
+    std::unordered_map<ProgramDesc *, std::unique_ptr<ProgramDescBind>>;
+static ProgDescMap *g_bind_map = nullptr;
+ProgramDescBind &ProgramDescBind::Instance(ProgramDesc *prog) {
+  if (g_bind_map == nullptr) {
+    g_bind_map = new ProgDescMap();
+  }
+  auto &map = *g_bind_map;
+  auto &ptr = map[prog];
+  if (ptr == nullptr) {
+    ptr.reset(new ProgramDescBind(prog));
+  }
+  return *ptr;
+}
+BlockDescBind *ProgramDescBind::AppendBlock(const BlockDescBind &parent) {
+  auto *b = prog_->add_blocks();
+  b->set_parent_idx(parent.ID());
+  b->set_idx(prog_->blocks_size() - 1);
+  blocks_.emplace_back(new BlockDescBind(this, b));
+  return blocks_.back().get();
+}
+ProgramDesc *ProgramDescBind::Proto() {
+  for (auto &block : blocks_) {
+    block->Sync();
+  }
+  return prog_;
+}
+ProgramDescBind::ProgramDescBind(ProgramDesc *prog) {
+  prog_ = prog;
+  for (auto &block : *prog->mutable_blocks()) {
+    blocks_.emplace_back(new BlockDescBind(this, &block));
+  }
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <vector>
+#include "paddle/framework/framework.pb.h"
+namespace paddle {
+namespace framework {
+class BlockDescBind;
+class ProgramDescBind {
+ public:
+  static ProgramDescBind &Instance(ProgramDesc *prog);
+  ProgramDescBind(const ProgramDescBind &o) = delete;
+  ProgramDescBind &operator=(const ProgramDescBind &o) = delete;
+  BlockDescBind *AppendBlock(const BlockDescBind &parent);
+  BlockDescBind *Block(size_t idx) { return blocks_[idx].get(); }
+  std::string DebugString() { return Proto()->DebugString(); }
+  size_t Size() const { return blocks_.size(); }
+  ProgramDesc *Proto();
+ private:
+  explicit ProgramDescBind(ProgramDesc *prog);
+  // Not owned
+  ProgramDesc *prog_;
+  std::vector<std::unique_ptr<BlockDescBind>> blocks_;
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -24,6 +24,10 @@ class InferShapeContextBase {
  virtual ~InferShapeContextBase() {}
  virtual bool HasInput(const std::string &name) const = 0;
  virtual bool HasOutput(const std::string &name) const = 0;
+  virtual bool HasInputs(const std::string &name) const = 0;
+  virtual bool HasOutputs(const std::string &name) const = 0;
  virtual framework::DDim GetInputDim(const std::string &name) const = 0;
  std::vector<framework::DDim> GetInputsDim(const std::string &name) const {
    const std::vector<std::string> &names = Inputs(name);

--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -29,20 +29,10 @@ limitations under the License. */
 namespace paddle {
-namespace pybind {
-namespace details {
-template <bool less, size_t i, typename... args>
-struct CastToPyBufferImpl;
-}
-}  // namespace pybind
 namespace framework {
 class Tensor {
 public:
-  template <bool less, size_t i, typename... args>
-  friend struct pybind::details::CastToPyBufferImpl;
  template <typename T, size_t D, int MajorType, typename IndexType>
  friend struct EigenTensor;
@@ -119,6 +109,8 @@ class Tensor {
    return holder_->place();
  }
+  std::type_index type() const { return holder_->type(); }
 private:
  template <typename T>
  inline void check_memory_size() const;

--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/var_desc.h"
+namespace paddle {
+namespace framework {
+void VarDescBind::SetShape(const std::vector<int64_t> &dims) {
+  VectorToRepeated(dims, desc_.mutable_lod_tensor()->mutable_dims());
+}
+void VarDescBind::SetDataType(DataType data_type) {
+  desc_.mutable_lod_tensor()->set_data_type(data_type);
+}
+std::vector<int64_t> VarDescBind::Shape() const {
+  return RepeatedToVector(desc_.lod_tensor().dims());
+}
+DataType VarDescBind::GetDataType() const {
+  return desc_.lod_tensor().data_type();
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <vector>
+#include "paddle/framework/framework.pb.h"
+namespace paddle {
+namespace framework {
+// convert between std::vector and protobuf repeated.
+template <typename T>
+inline std::vector<T> RepeatedToVector(
+    const google::protobuf::RepeatedField<T> &repeated_field) {
+  std::vector<T> ret;
+  ret.reserve(repeated_field.size());
+  std::copy(repeated_field.begin(), repeated_field.end(),
+            std::back_inserter(ret));
+  return ret;
+}
+template <typename T, typename RepeatedField>
+inline void VectorToRepeated(const std::vector<T> &vec,
+                             RepeatedField *repeated_field) {
+  repeated_field->Reserve(vec.size());
+  for (const auto &elem : vec) {
+    *repeated_field->Add() = elem;
+  }
+}
+// Specialize vector<bool>.
+template <typename RepeatedField>
+inline void VectorToRepeated(const std::vector<bool> &vec,
+                             RepeatedField *repeated_field) {
+  repeated_field->Reserve(vec.size());
+  for (auto elem : vec) {
+    *repeated_field->Add() = elem;
+  }
+}
+class VarDescBind {
+ public:
+  explicit VarDescBind(const std::string &name) { desc_.set_name(name); }
+  VarDesc *Proto() { return &desc_; }
+  std::string Name() const { return desc_.name(); }
+  void SetShape(const std::vector<int64_t> &dims);
+  void SetDataType(DataType data_type);
+  std::vector<int64_t> Shape() const;
+  DataType GetDataType() const;
+ private:
+  VarDesc desc_;
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/function/neon/NeonDepthwiseConv.h
+++ b/paddle/function/neon/NeonDepthwiseConv.h
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "neon_util.h"
 namespace paddle {
 namespace neon {
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
@@ -26,17 +25,20 @@ namespace neon {
 template <int filterSize, int stride>
 struct DepthwiseConvKernel {};
-inline float32_t conv3x3(float32x4_t r0,
+inline float32_t conv3x3(const float* r0,
-                         float32x4_t r1,
+                         const float* r1,
-                         float32x4_t r2,
+                         const float* r2,
                         float32x4_t k0,
                         float32x4_t k1,
                         float32x4_t k2) {
-  float32x4_t tmp;
+  float32_t tmp[12];
-  tmp = vmulq_f32(r0, k0);
+  vst1q_f32(&(tmp[0]), k0);
-  tmp = vmlaq_f32(tmp, r1, k1);
+  vst1q_f32(&(tmp[4]), k1);
-  tmp = vmlaq_f32(tmp, r2, k2);
+  vst1q_f32(&(tmp[8]), k2);
-  return vaddvq_f32(tmp);
+  float32_t sum0 = r0[0] * tmp[0] + r0[1] * tmp[1] + r0[2] * tmp[2];
+  float32_t sum1 = r1[0] * tmp[4] + r1[1] * tmp[5] + r1[2] * tmp[6];
+  float32_t sum2 = r2[0] * tmp[8] + r2[1] * tmp[9] + r2[2] * tmp[10];
+  return sum0 + sum1 + sum2;
 }
 inline float32_t conv4x4(float32x4_t r0,
@@ -136,10 +138,7 @@ struct DepthwiseConvKernel<3, 1> {
        }
        for (int r = 0; r < remain; r++) {
-          float32x4_t i0 = vld1q_f32(r0);
+          *outputData = conv3x3(r0, r1, r2, k[0], k[1], k[2]);
-          float32x4_t i1 = vld1q_f32(r1);
-          float32x4_t i2 = vld1q_f32(r2);
-          *outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]);
          r0++;
          r1++;
          r2++;
@@ -243,10 +242,7 @@ struct DepthwiseConvKernel<3, 2> {
        }
        for (int r = 0; r < remain; r++) {
-          float32x4_t i0 = vld1q_f32(r0);
+          *outputData = conv3x3(r0, r1, r2, k[0], k[1], k[2]);
-          float32x4_t i1 = vld1q_f32(r1);
-          float32x4_t i2 = vld1q_f32(r2);
-          *outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]);
          r0 += 2;
          r1 += 2;
          r2 += 2;

--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -215,13 +215,13 @@ struct testActDesc {
 static void getAddtoConfig(TestConfig& cfg, const testActDesc& pm) {
  cfg.biasSize = 0;
  cfg.layerConfig.set_type("addto");
-  size_t layerSize = pm.ih * pm.ih * pm.iw;
+  size_t layerSize = pm.ic * pm.ih * pm.iw;
  cfg.layerConfig.set_size(layerSize);
  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0});
  cfg.layerConfig.add_inputs();
 }
-void testActivation(std::string& actType, const testActDesc& pm) {
+void testActivation(std::string actType, const testActDesc& pm) {
  // TODO(TJ): remove me when paddle support elu activation
  if (actType == "mkldnn_elu") {
    return;
@@ -240,6 +240,7 @@ TEST(MKLDNNActivation, Activations) {
  for (auto type : types) {
    /* bs, c, h, w*/
    testActivation(type, {16, 64, 32, 32});
+    testActivation(type, {2, 8, 1, 1});
  }
 }

--- a/paddle/math/RowBuffer.h
+++ b/paddle/math/RowBuffer.h
@@ -99,7 +99,11 @@ public:
  /**
   * @brief clear local buffer. It only affect auto-growth buffer.
   */
-  inline void clear() { rowStore_.clear(); }
+  inline void clear() {
+    // swap an empty vector to it to free the memory.
+    std::vector<real, AlignedAllocator<real, 32>> empty;
+    rowStore_.swap(empty);
+  }
  /**
   * @brief get current number of rows.

--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -55,12 +55,25 @@ function(op_library TARGET)
        set(pybind_flag 1)
    endif()
+    if ("${TARGET}" STREQUAL "pool_op")
+        set(pybind_flag 1)
+        # It's enough to just adding one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
+    endif()
    # activation_op contains several operators
    if ("${TARGET}" STREQUAL "activation_op")
        set(pybind_flag 1)
        # It's enough to just adding one operator to pybind
        file(APPEND ${pybind_file} "USE_OP(sigmoid);\n")
    endif()
+    # reduce_op contains several operators
+    if ("${TARGET}" STREQUAL "reduce_op")
+        set(pybind_flag 1)
+        # It's enough to just adding one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
+    endif()
    # pybind USE_NO_KERNEL_OP
    file(READ ${TARGET}.cc TARGET_CONTENT)
@@ -94,8 +107,8 @@ set(DEPS_OPS
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
  DEPS framework_proto tensor net_op)
 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
-op_library(cross_entropy_op DEPS cross_entropy_function)
+op_library(cross_entropy_op DEPS cross_entropy)
-op_library(softmax_with_cross_entropy_op DEPS cross_entropy_function softmax_function)
+op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})

--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -47,7 +47,7 @@ __global__ void AccuracyCudaKernel(const int N, const int D, const int* Xdata,
 }
 template <typename T>
-class AccuracyOpCUDAKernel : public framework::OpKernel {
+class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),

--- a/paddle/operators/accuracy_op.h
+++ b/paddle/operators/accuracy_op.h
@@ -35,7 +35,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
 template <typename Place, typename T>
-class AccuracyKernel : public framework::OpKernel {
+class AccuracyKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* inference = ctx.Input<Tensor>("Inference");

--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -132,6 +132,17 @@ class SquareOpMaker : public framework::OpProtoAndCheckerMaker {
  }
 };
+class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SoftsignOpMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Softsign operator");
+    AddOutput("Y", "Output of Softsign operator");
+    AddComment("Softsign activation operator, softsign(x) = x / (1 + |x|)");
+  }
+};
 template <typename AttrType>
 class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
@@ -195,111 +206,57 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
 }  // namespace paddle
 namespace ops = paddle::operators;
 REGISTER_OP(sigmoid, ops::ActivationOp, ops::SigmoidOpMaker, sigmoid_grad,
            ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(sigmoid,
-                       ops::ActivationKernel<paddle::platform::CPUPlace, float,
-                                             ops::SigmoidFunctor<float>>);
-REGISTER_OP_CPU_KERNEL(
-    sigmoid_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                                            ops::SigmoidGradFunctor<float>>);
 REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad,
            ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    exp,
-    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::ExpFunctor>);
-REGISTER_OP_CPU_KERNEL(exp_grad,
-                       ops::ActivationGradKernel<paddle::platform::CPUPlace,
-                                                 float, ops::ExpGradFunctor>);
 REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad,
            ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(relu,
-                       ops::ActivationKernel<paddle::platform::CPUPlace, float,
-                                             ops::ReluFunctor<float>>);
-REGISTER_OP_CPU_KERNEL(
-    relu_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                                         ops::ReluGradFunctor<float>>);
 REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad,
            ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    tanh,
-    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::TanhFunctor>);
-REGISTER_OP_CPU_KERNEL(
-    tanh_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                                         ops::TanhGradFunctor<float>>);
 REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad,
            ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    sqrt,
-    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::SqrtFunctor>);
-REGISTER_OP_CPU_KERNEL(
-    sqrt_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                                         ops::SqrtGradFunctor<float>>);
 REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad,
            ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    abs,
-    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::AbsFunctor>);
-REGISTER_OP_CPU_KERNEL(abs_grad,
-                       ops::ActivationGradKernel<paddle::platform::CPUPlace,
-                                                 float, ops::AbsGradFunctor>);
 REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker,
            reciprocal_grad, ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(reciprocal,
-                       ops::ActivationKernel<paddle::platform::CPUPlace, float,
-                                             ops::ReciprocalFunctor<float>>);
-REGISTER_OP_CPU_KERNEL(
-    reciprocal_grad,
-    ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                              ops::ReciprocalGradFunctor<float>>);
 REGISTER_OP(log, ops::ActivationOp, ops::LogOpMaker, log_grad,
            ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    log,
-    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::LogFunctor>);
-REGISTER_OP_CPU_KERNEL(
-    log_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                                        ops::LogGradFunctor<float>>);
 REGISTER_OP(square, ops::ActivationOp, ops::SquareOpMaker, square_grad,
            ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(square,
-                       ops::ActivationKernel<paddle::platform::CPUPlace, float,
+REGISTER_OP(softsign, ops::ActivationOp, ops::SoftsignOpMaker, softsign_grad,
-                                             ops::SquareFunctor>);
+            ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    square_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                                           ops::SquareGradFunctor<float>>);
 REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker<float>, brelu_grad,
            ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(brelu,
-                       ops::BReluKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(brelu_grad,
-                       ops::BReluGradKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker<float>,
            soft_relu_grad, ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(soft_relu,
-                       ops::SoftReluKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    soft_relu_grad, ops::SoftReluGradKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker<float>, pow_grad,
            ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(pow, ops::PowKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(pow_grad,
-                       ops::PowGradKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker<float>, stanh_grad,
            ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(stanh,
-                       ops::STanhKernel<paddle::platform::CPUPlace, float>);
+#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)        \
-REGISTER_OP_CPU_KERNEL(stanh_grad,
+  REGISTER_OP_CPU_KERNEL(                                                      \
-                       ops::STanhGradKernel<paddle::platform::CPUPlace, float>);
+      act_type,                                                                \
+      paddle::operators::ActivationKernel<paddle::platform::CPUPlace,          \
+                                          paddle::operators::functor<float>>); \
+  REGISTER_OP_CPU_KERNEL(act_type##_grad,                                      \
+                         paddle::operators::ActivationGradKernel<              \
+                             paddle::platform::CPUPlace,                       \
+                             paddle::operators::grad_functor<float>>);
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
--- a/paddle/operators/activation_op.cu
+++ b/paddle/operators/activation_op.cu
@@ -15,86 +15,14 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/activation_op.h"
-namespace ops = paddle::operators;
+#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor)        \
+  REGISTER_OP_GPU_KERNEL(                                                      \
-REGISTER_OP_GPU_KERNEL(sigmoid,
+      act_type,                                                                \
-                       ops::ActivationKernel<paddle::platform::GPUPlace, float,
+      paddle::operators::ActivationKernel<paddle::platform::GPUPlace,          \
-                                             ops::SigmoidFunctor<float>>);
+                                          paddle::operators::functor<float>>); \
-REGISTER_OP_GPU_KERNEL(
+  REGISTER_OP_GPU_KERNEL(act_type##_grad,                                      \
-    sigmoid_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
+                         paddle::operators::ActivationGradKernel<              \
-                                            ops::SigmoidGradFunctor<float>>);
+                             paddle::platform::GPUPlace,                       \
+                             paddle::operators::grad_functor<float>>);
-REGISTER_OP_GPU_KERNEL(
-    exp,
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_GPU_KERNEL);
-    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::ExpFunctor>);
-REGISTER_OP_GPU_KERNEL(exp_grad,
-                       ops::ActivationGradKernel<paddle::platform::GPUPlace,
-                                                 float, ops::ExpGradFunctor>);
-REGISTER_OP_GPU_KERNEL(relu,
-                       ops::ActivationKernel<paddle::platform::GPUPlace, float,
-                                             ops::ReluFunctor<float>>);
-REGISTER_OP_GPU_KERNEL(
-    relu_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                                         ops::ReluGradFunctor<float>>);
-REGISTER_OP_GPU_KERNEL(
-    tanh,
-    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::TanhFunctor>);
-REGISTER_OP_GPU_KERNEL(
-    tanh_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                                         ops::TanhGradFunctor<float>>);
-REGISTER_OP_GPU_KERNEL(
-    sqrt,
-    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::SqrtFunctor>);
-REGISTER_OP_GPU_KERNEL(
-    sqrt_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                                         ops::SqrtGradFunctor<float>>);
-REGISTER_OP_GPU_KERNEL(
-    abs,
-    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::AbsFunctor>);
-REGISTER_OP_GPU_KERNEL(abs_grad,
-                       ops::ActivationGradKernel<paddle::platform::GPUPlace,
-                                                 float, ops::AbsGradFunctor>);
-REGISTER_OP_GPU_KERNEL(reciprocal,
-                       ops::ActivationKernel<paddle::platform::GPUPlace, float,
-                                             ops::ReciprocalFunctor<float>>);
-REGISTER_OP_GPU_KERNEL(
-    reciprocal_grad,
-    ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                              ops::ReciprocalGradFunctor<float>>);
-REGISTER_OP_GPU_KERNEL(
-    log,
-    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::LogFunctor>);
-REGISTER_OP_GPU_KERNEL(
-    log_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                                        ops::LogGradFunctor<float>>);
-REGISTER_OP_GPU_KERNEL(square,
-                       ops::ActivationKernel<paddle::platform::GPUPlace, float,
-                                             ops::SquareFunctor>);
-REGISTER_OP_GPU_KERNEL(
-    square_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                                           ops::SquareGradFunctor<float>>);
-REGISTER_OP_GPU_KERNEL(brelu,
-                       ops::BReluKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(brelu_grad,
-                       ops::BReluGradKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(soft_relu,
-                       ops::SoftReluKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    soft_relu_grad, ops::SoftReluGradKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(pow, ops::PowKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(pow_grad,
-                       ops::PowGradKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(stanh,
-                       ops::STanhKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(stanh_grad,
-                       ops::STanhGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -19,9 +19,12 @@
 namespace paddle {
 namespace operators {
-template <typename Place, typename T, typename Functor>
+template <typename Place, typename Functor>
-class ActivationKernel : public framework::OpKernel {
+class ActivationKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
 public:
+  using T = typename Functor::ELEMENT_TYPE;
  void Compute(const framework::ExecutionContext& context) const override {
    auto* X = context.Input<framework::Tensor>("X");
    auto* Y = context.Output<framework::Tensor>("Y");
@@ -31,13 +34,20 @@ class ActivationKernel : public framework::OpKernel {
    auto y = framework::EigenVector<T>::Flatten(*Y);
    auto place = context.GetEigenDevice<Place>();
    Functor functor;
+    auto attrs = functor.GetAttrs();
+    for (auto& attr : attrs) {
+      *attr.second = context.Attr<float>(attr.first);
+    }
    functor(place, x, y);
  }
 };
-template <typename Place, typename T, typename Functor>
+template <typename Place, typename Functor>
-class ActivationGradKernel : public framework::OpKernel {
+class ActivationGradKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
 public:
+  using T = typename Functor::ELEMENT_TYPE;
  void Compute(const framework::ExecutionContext& context) const override {
    auto* X = context.Input<framework::Tensor>("X");
    auto* Y = context.Input<framework::Tensor>("Y");
@@ -51,303 +61,322 @@ class ActivationGradKernel : public framework::OpKernel {
    auto dx = framework::EigenVector<T>::Flatten(*dX);
    auto place = context.GetEigenDevice<Place>();
    Functor functor;
+    auto attrs = functor.GetAttrs();
+    for (auto& attr : attrs) {
+      *attr.second = context.Attr<float>(attr.first);
+    }
    functor(place, x, y, dy, dx);
  }
 };
+template <typename T>
+struct BaseActivationFunctor {
+  using ELEMENT_TYPE = T;
+  using AttrPair = std::vector<std::pair<const char*, float*>>;
+  AttrPair GetAttrs() { return AttrPair(); }
+};
 // sigmoid(x) = 1 / (1 + exp(-x))
 template <typename T>
-struct SigmoidFunctor {
+struct SigmoidFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
    y.device(d) = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
  }
 };
 template <typename T>
-struct SigmoidGradFunctor {
+struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * y * (static_cast<T>(1) - y);
  }
 };
 // exp(x) = e^x
-struct ExpFunctor {
+template <typename T>
+struct ExpFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.exp();
  }
 };
-struct ExpGradFunctor {
+template <typename T>
+struct ExpGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * y;
  }
 };
 // relu(x) = max(x, 0)
 template <typename T>
-struct ReluFunctor {
+struct ReluFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.cwiseMax(static_cast<T>(0));
  }
 };
 template <typename T>
-struct ReluGradFunctor {
+struct ReluGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * (x > static_cast<T>(0)).template cast<T>();
  }
 };
 // tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
-struct TanhFunctor {
+template <typename T>
+struct TanhFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.tanh();
  }
 };
 template <typename T>
-struct TanhGradFunctor {
+struct TanhGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * (static_cast<T>(1) - y * y);
  }
 };
 // sqrt(x) = x^(1/2)
-struct SqrtFunctor {
+template <typename T>
+struct SqrtFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.sqrt();
  }
 };
 template <typename T>
-struct SqrtGradFunctor {
+struct SqrtGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    const Y y_conj = Eigen::numext::conj(y);
    dx.device(d) = static_cast<T>(0.5) * dy / y_conj;
  }
 };
 // abs(x) = |x|
-struct AbsFunctor {
+template <typename T>
+struct AbsFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.abs();
  }
 };
-struct AbsGradFunctor {
+template <typename T>
+struct AbsGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * x.sign();
  }
 };
 // reciprocal(x) = 1 / x
 template <typename T>
-struct ReciprocalFunctor {
+struct ReciprocalFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
    y.device(d) = static_cast<T>(1) / x;
  }
 };
 template <typename T>
-struct ReciprocalGradFunctor {
+struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * static_cast<T>(-1) * y * y;
  }
 };
 // log(x) = natural logarithm of x
-struct LogFunctor {
+template <typename T>
+struct LogFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.log();
  }
 };
 template <typename T>
-struct LogGradFunctor {
+struct LogGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * (static_cast<T>(1) / x);
  }
 };
 // square(x) = x^2
-struct SquareFunctor {
+template <typename T>
+struct SquareFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.square();
  }
 };
 template <typename T>
-struct SquareGradFunctor {
+struct SquareGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * static_cast<T>(2) * x;
  }
 };
-template <typename Place, typename T, typename AttrType = T>
+template <typename T>
-class BReluKernel : public framework::OpKernel {
+struct BReluFunctor : public BaseActivationFunctor<T> {
- public:
+  float t_min;
-  void Compute(const framework::ExecutionContext& context) const override {
+  float t_max;
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* Y = context.Output<framework::Tensor>("Y");
+  // NOTE: Explicit hides the `BaseActivationFunctor<T>::GetAttrs`
-    auto t_min = static_cast<T>(context.Attr<AttrType>("t_min"));
+  // not polymorphism for speed.
-    auto t_max = static_cast<T>(context.Attr<AttrType>("t_max"));
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    Y->mutable_data<T>(context.GetPlace());
+    return {{"t_min", &t_min}, {"t_max", &t_max}};
+  }
-    auto x = framework::EigenVector<T>::Flatten(*X);
+  template <typename Device, typename X, typename Y>
-    auto y = framework::EigenVector<T>::Flatten(*Y);
+  void operator()(Device d, X x, Y y) const {
-    auto place = context.GetEigenDevice<Place>();
+    y.device(d) = x.cwiseMax(t_min).cwiseMin(t_max);
-    y.device(place) = x.cwiseMax(t_min).cwiseMin(t_max);
  }
 };
-template <typename Place, typename T, typename AttrType = T>
+template <typename T>
-class BReluGradKernel : public framework::OpKernel {
+struct BReluGradFunctor : public BaseActivationFunctor<T> {
- public:
+  float t_min;
-  void Compute(const framework::ExecutionContext& context) const override {
+  float t_max;
-    auto* X = context.Input<framework::Tensor>("X");
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    auto* dY = context.Input<framework::Tensor>(framework::GradVarName("Y"));
+    return {{"t_min", &t_min}, {"t_max", &t_max}};
-    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
+  }
-    auto t_min = static_cast<T>(context.Attr<AttrType>("t_min"));
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
-    auto t_max = static_cast<T>(context.Attr<AttrType>("t_max"));
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dX->mutable_data<T>(context.GetPlace());
+    dx.device(d) = dy * ((x > t_min) * (x < t_max)).template cast<T>();
+  }
+};
-    auto dy = framework::EigenVector<T>::Flatten(*dY);
+// softsign(x) = x / (1 + |x|)
-    auto x = framework::EigenVector<T>::Flatten(*X);
+template <typename T>
-    auto dx = framework::EigenVector<T>::Flatten(*dX);
+struct SoftsignFunctor : public BaseActivationFunctor<T> {
-    auto place = context.GetEigenDevice<Place>();
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) {
+    y.device(d) = x / (static_cast<T>(1) + x.abs());
+  }
+};
-    dx.device(place) = dy * ((x > t_min) * (x < t_max)).template cast<T>();
+// d(softsign(x))/dx = 1 / (1 + |x|)^2
+// Taken from https://en.wikipedia.org/wiki/Activation_function
+template <typename T>
+struct SoftsignGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+    dx.device(d) =
+        dy * (static_cast<T>(1) / (static_cast<T>(1) + x.abs()).square());
  }
 };
-template <typename Place, typename T, typename AttrType = T>
+template <typename T>
-class SoftReluKernel : public framework::OpKernel {
+struct SoftReluFunctor : public BaseActivationFunctor<T> {
- public:
+  float threshold;
-  void Compute(const framework::ExecutionContext& context) const override {
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    auto* X = context.Input<framework::Tensor>("X");
+    return {{"threshold", &threshold}};
-    auto* Y = context.Output<framework::Tensor>("Y");
+  }
-    auto threshold = static_cast<T>(context.Attr<AttrType>("threshold"));
-    Y->mutable_data<T>(context.GetPlace());
-    auto x = framework::EigenVector<T>::Flatten(*X);
+  template <typename Device, typename X, typename Y>
-    auto y = framework::EigenVector<T>::Flatten(*Y);
+  void operator()(Device d, X x, Y y) const {
-    auto place = context.GetEigenDevice<Place>();
+    auto temp = x.cwiseMax(-threshold).cwiseMin(threshold);
-    auto temp = x.cwiseMax(-threshold).cwiseMin(threshold).eval();
+    y.device(d) = (static_cast<T>(1) + temp.exp()).log();
-    y.device(place) = (static_cast<T>(1) + temp.exp()).log();
  }
 };
-template <typename Place, typename T, typename AttrType = T>
+template <typename T>
-class SoftReluGradKernel : public framework::OpKernel {
+struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
- public:
+  float threshold;
-  void Compute(const framework::ExecutionContext& context) const override {
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    auto* X = context.Input<framework::Tensor>("X");
+    return {{"threshold", &threshold}};
-    auto* Y = context.Input<framework::Tensor>("Y");
+  }
-    auto* dY = context.Input<framework::Tensor>(framework::GradVarName("Y"));
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
-    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    auto threshold = static_cast<T>(context.Attr<AttrType>("threshold"));
-    dX->mutable_data<T>(context.GetPlace());
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto y = framework::EigenVector<T>::Flatten(*Y);
-    auto dy = framework::EigenVector<T>::Flatten(*dY);
-    auto dx = framework::EigenVector<T>::Flatten(*dX);
-    auto place = context.GetEigenDevice<Place>();
    auto temp = ((x > -threshold) * (x < threshold)).template cast<T>().eval();
-    dx.device(place) = dy * (static_cast<T>(1) - (-y).exp()) * temp;
+    dx.device(d) = dy * (static_cast<T>(1) - (-y).exp()) * temp;
  }
 };
-template <typename Place, typename T, typename AttrType = T>
+template <typename T>
-class PowKernel : public framework::OpKernel {
+struct PowFunctor : public BaseActivationFunctor<T> {
- public:
+  float factor;
-  void Compute(const framework::ExecutionContext& context) const override {
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    auto* X = context.Input<framework::Tensor>("X");
+    return {{"factor", &factor}};
-    auto* Y = context.Output<framework::Tensor>("Y");
+  }
-    auto factor = static_cast<T>(context.Attr<AttrType>("factor"));
+  template <typename Device, typename X, typename Y>
-    Y->mutable_data<T>(context.GetPlace());
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x.pow(factor);
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto y = framework::EigenVector<T>::Flatten(*Y);
-    auto place = context.GetEigenDevice<Place>();
-    y.device(place) = x.pow(factor);
  }
 };
-template <typename Place, typename T, typename AttrType = T>
+template <typename T>
-class PowGradKernel : public framework::OpKernel {
+struct PowGradFunctor : public BaseActivationFunctor<T> {
- public:
+  float factor;
-  void Compute(const framework::ExecutionContext& context) const override {
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    auto* X = context.Input<framework::Tensor>("X");
+    return {{"factor", &factor}};
-    auto* dY = context.Input<framework::Tensor>(framework::GradVarName("Y"));
+  }
-    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
-    auto factor = static_cast<T>(context.Attr<AttrType>("factor"));
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dX->mutable_data<T>(context.GetPlace());
+    dx.device(d) = dy * factor * x.pow(factor - static_cast<T>(1));
-    auto dy = framework::EigenVector<T>::Flatten(*dY);
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto dx = framework::EigenVector<T>::Flatten(*dX);
-    auto place = context.GetEigenDevice<Place>();
-    dx.device(place) = dy * factor * x.pow(factor - static_cast<T>(1));
  }
 };
-template <typename Place, typename T, typename AttrType = T>
+template <typename T>
-class STanhKernel : public framework::OpKernel {
+struct STanhFunctor : public BaseActivationFunctor<T> {
- public:
+  float scale_a;
-  void Compute(const framework::ExecutionContext& context) const override {
+  float scale_b;
-    auto* X = context.Input<framework::Tensor>("X");
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    auto* Y = context.Output<framework::Tensor>("Y");
+    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
-    auto scale_a = static_cast<T>(context.Attr<AttrType>("scale_a"));
+  }
-    auto scale_b = static_cast<T>(context.Attr<AttrType>("scale_b"));
-    Y->mutable_data<T>(context.GetPlace());
-    auto x = framework::EigenVector<T>::Flatten(*X);
+  template <typename Device, typename X, typename Y>
-    auto y = framework::EigenVector<T>::Flatten(*Y);
+  void operator()(Device d, X x, Y y) const {
-    auto place = context.GetEigenDevice<Place>();
+    y.device(d) = scale_b * (scale_a * x).tanh();
-    y.device(place) = scale_b * (scale_a * x).tanh();
  }
 };
-template <typename Place, typename T, typename AttrType = T>
+template <typename T>
-class STanhGradKernel : public framework::OpKernel {
+struct STanhGradFunctor : public BaseActivationFunctor<T> {
- public:
+  float scale_a;
-  void Compute(const framework::ExecutionContext& context) const override {
+  float scale_b;
-    auto* X = context.Input<framework::Tensor>("X");
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    auto* dY = context.Input<framework::Tensor>(framework::GradVarName("Y"));
+    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
-    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
+  }
-    auto scale_a = static_cast<T>(context.Attr<AttrType>("scale_a"));
-    auto scale_b = static_cast<T>(context.Attr<AttrType>("scale_b"));
-    dX->mutable_data<T>(context.GetPlace());
-    auto dy = framework::EigenVector<T>::Flatten(*dY);
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto dx = framework::EigenVector<T>::Flatten(*dX);
-    auto place = context.GetEigenDevice<Place>();
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    auto temp = (scale_a * x).tanh() * (scale_a * x).tanh();
-    dx.device(place) = dy * scale_a * scale_b * (static_cast<T>(1) - temp);
+    dx.device(d) = dy * scale_a * scale_b * (static_cast<T>(1) - temp);
  }
 };
 }  // namespace operators
 }  // namespace paddle
+#define FOR_EACH_KERNEL_FUNCTOR(__macro)                         \
+  __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor);          \
+  __macro(exp, ExpFunctor, ExpGradFunctor);                      \
+  __macro(relu, ReluFunctor, ReluGradFunctor);                   \
+  __macro(tanh, TanhFunctor, TanhGradFunctor);                   \
+  __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                   \
+  __macro(abs, AbsFunctor, AbsGradFunctor);                      \
+  __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
+  __macro(log, LogFunctor, LogGradFunctor);                      \
+  __macro(square, SquareFunctor, SquareGradFunctor);             \
+  __macro(brelu, BReluFunctor, BReluGradFunctor);                \
+  __macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor);      \
+  __macro(pow, PowFunctor, PowGradFunctor);                      \
+  __macro(stanh, STanhFunctor, STanhGradFunctor);                \
+  __macro(softsign, SoftsignFunctor, SoftsignGradFunctor)
--- a/paddle/operators/add_op.h
+++ b/paddle/operators/add_op.h
@@ -25,7 +25,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 template <typename Place, typename T>
-class AddKernel : public framework::OpKernel {
+class AddKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* input0 = context.Input<Tensor>("X");

--- a/paddle/operators/clip_op.h
+++ b/paddle/operators/clip_op.h
@@ -56,7 +56,7 @@ class ClipGradFunctor {
 };
 template <typename Place, typename T>
-class ClipKernel : public framework::OpKernel {
+class ClipKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto max = context.Attr<T>("max");
@@ -73,7 +73,7 @@ class ClipKernel : public framework::OpKernel {
 };
 template <typename Place, typename T>
-class ClipGradKernel : public framework::OpKernel {
+class ClipGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto max = context.Attr<T>("max");

--- a/paddle/operators/concat_op.h
+++ b/paddle/operators/concat_op.h
@@ -22,7 +22,7 @@ namespace paddle {
 namespace operators {
 template <typename Place, typename T>
-class ConcatKernel : public framework::OpKernel {
+class ConcatKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto ins = ctx.MultiInput<framework::Tensor>("X");
@@ -44,7 +44,7 @@ class ConcatKernel : public framework::OpKernel {
 };
 template <typename Place, typename T>
-class ConcatGradKernel : public framework::OpKernel {
+class ConcatGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const {
    auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));

--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -14,12 +14,7 @@ limitations under the License. */
 #include "paddle/operators/cond_op.h"
-#include <cstring>
-#include <sstream>
-#include "paddle/framework/op_registry.h"
 #include "paddle/operators/gather.h"
-#include "paddle/operators/net_op.h"
 #include "paddle/operators/scatter.h"
 namespace paddle {
@@ -31,142 +26,104 @@ using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 using DDim = framework::DDim;
-void CondOp::CreateScope(const Scope& scope) const {
+framework::Scope& CondOp::AddSubScope(const Scope& scope) const {
  auto sub_scopes_var = scope.FindVar("SubScopes");
  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
                          "Output(SubScopes) of CondOp should not be null.");
  auto sub_scopes = sub_scopes_var->GetMutable<std::vector<Scope*>>();
  auto& sub_scope = scope.NewScope();
  sub_scopes->push_back(&sub_scope);
+  return sub_scope;
 }
-void CondOp::CreateIndexTensor(const Scope& scope) const {
+std::vector<framework::Scope*>& CondOp::GetSubScopes(
+    const framework::Scope& scope) const {
+  auto sub_scopes_var = scope.FindVar("SubScopes");
+  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
+                          "Output(SubScopes) of CondOp should not be null.");
+  return *sub_scopes_var->GetMutable<std::vector<framework::Scope*>>();
+}
+LoDTensor& CondOp::AddIndexTensor(const Scope& scope) const {
  auto index_tensors_var = scope.FindVar("IndexTensors");
  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
                          "Output(IndexTensors) of CondOp should not be null.");
  auto& index_tensors =
      *index_tensors_var->GetMutable<std::vector<LoDTensor>>();
  index_tensors.push_back(LoDTensor());
+  return index_tensors.back();
 }
-void CondOp::InferShape(const Scope& scope) const {
+std::vector<framework::LoDTensor>& CondOp::GetIndexTensors(
-  auto sub_scopes_var = scope.FindVar("SubScopes");
+    const framework::Scope& scope) const {
-  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
-                          "Output(SubScopes) of CondOp should not be null.");
-  auto& sub_scopes = *sub_scopes_var->GetMutable<std::vector<Scope*>>();
-  for (int i = 0; i < 2; ++i) {
-    // Create two sub scopes for true and false branches
-    // sub_scopes[0] for the true branch and sub_scopes[1] for the false
-    // branch
-    CreateScope(scope);
-    // Create two tensors for true and false indices
-    // index_tensors[0] for the true branch and index_tensors[1] for the false
-    // branch
-    CreateIndexTensor(scope);
-    PADDLE_ENFORCE(!Inputs("Xs").empty(),
-                   "Inputs(Xs) of CondOp can't be empty.");
-    for (auto& input : Inputs("Xs")) {
-      // Create a new tensor in sub-scope for input-type tensor
-      Variable* v = sub_scopes[i]->NewVar(input);
-      LoDTensor* sub_input = v->GetMutable<LoDTensor>();
-      sub_input->Resize(scope.FindVar(input)->GetMutable<LoDTensor>()->dims());
-    }
-    for (auto& output : (*sub_net_op_[i]).Outputs()) {
-      for (auto& var_name : output.second) {
-        sub_scopes[i]->NewVar(var_name);
-      }
-    }
-    // each net calls InferShape
-    sub_net_op_[i]->InferShape(*sub_scopes[i]);
-  }
-  for (auto& output : Outputs("Outs")) {
-    LoDTensor* tensor_t_out =
-        sub_scopes[0]->FindVar(output)->GetMutable<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should not be NULL");
-    LoDTensor* tensor_f_out =
-        sub_scopes[1]->FindVar(output)->GetMutable<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_f_out, "False output should not be NULL");
-    auto* tensor_out_var = scope.FindVar(output);
-    PADDLE_ENFORCE_NOT_NULL(tensor_out_var, "Output not found");
-    LoDTensor* tensor_out = tensor_out_var->GetMutable<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_t_out,
-                            "True output tensor should not be NULL");
-    // check output size should be same
-    PADDLE_ENFORCE_EQ(tensor_t_out->dims(), tensor_f_out->dims(),
-                      "Outputs not of the same shape");
-    tensor_out->Resize(tensor_t_out->dims());
-    // tensor_out->mutable_data<float>(tensor_out->dims(),
-    // platform::CPUPlace());
-    tensor_out->mutable_data<float>(platform::CPUPlace());
-  }
-}
-void CondOp::Run(const Scope& scope,
-                 const platform::DeviceContext& dev_ctx) const {
-  auto* sub_scopes_var = scope.FindVar("SubScopes");
-  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
-                          "Output(SubScopes) of CondOp should not be null.");
-  auto sub_scopes = sub_scopes_var->Get<std::vector<Scope*>>();
  auto* index_tensors_var = scope.FindVar("IndexTensors");
  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
                          "Output(IndexTensors) of CondOp should not be null.");
-  auto index_tensors = index_tensors_var->Get<std::vector<LoDTensor>>();
+  return *index_tensors_var->GetMutable<std::vector<framework::LoDTensor>>();
+}
-  std::string cond_name = Input("Cond");
+void CondOp::PrepareDataForSubnet(
-  Variable* cond_var = scope.FindVar(cond_name);
+    const framework::Scope& scope,
+    const platform::DeviceContext& dev_ctx) const {
+  PADDLE_ENFORCE(!Inputs("Xs").empty(), "Inputs(Xs) of CondOp can't be empty.");
+  for (int i = 0; i < BRANCH_NUM; ++i) {
+    // Create two sub scopes for true and false branches
+    //   sub_scopes[0] for the true branch
+    //   sub_scopes[1] for the false branch
+    AddSubScope(scope);
+    // Create two tensors for true and false indices:
+    //   index_tensors[0] for the true branch
+    //   index_tensors[1] for the false branch
+    AddIndexTensor(scope);
+  }
+  Variable* cond_var = scope.FindVar(Input("Cond"));
  PADDLE_ENFORCE_NOT_NULL(cond_var,
                          "Input(Cond) of CondOp should not be null.");
  const LoDTensor* cond = cond_var->GetMutable<LoDTensor>();
-  // Step 1: get the true/false index at runtime
+  // get the true/false index at runtime according to cond tensor
-  // index_[0]: vector<int>, contains all index for cond[i] == true
+  // index_vectors[0]: vector<int>, contains all index for cond[i] == true
-  // index_[1]: vector<int>, contains all index for cond[i] == false
+  // index_vectors[1]: vector<int>, contains all index for cond[i] == false
-  for (int i = 0; i < 2; ++i) index_[i].clear();
+  std::vector<std::vector<int>> index_vectors;
+  index_vectors.resize(BRANCH_NUM);
  const int* cond_data = cond->data<int>();
  for (int i = 0; i < cond->dims()[0]; ++i) {
    if (cond_data[i])
-      index_[0].push_back(i);
+      index_vectors[TRUE_BRANCH].push_back(i);
    else
-      index_[1].push_back(i);
+      index_vectors[FALSE_BRANCH].push_back(i);
  }
-  // put index_[0] and index_[1] into two tensors:
+  // put index_vectors[0] and index_vectors[1] into two tensors:
-  // index_tensor_[0] and index_tensor_[1]
+  // index_tensors[0] and index_tensors[1]
-  DDim dim = paddle::framework::make_ddim({0});
+  std::vector<framework::LoDTensor>& index_tensors = GetIndexTensors(scope);
-  for (int i = 0; i < 2; ++i) {
+  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
-    dim[0] = index_[i].size();
-    int* tmp_ptr =
+  for (int i = 0; i < BRANCH_NUM; ++i) {
+    DDim dim = {static_cast<int64_t>(index_vectors[i].size())};
+    int* index_tensor_data_ptr =
        index_tensors[i].mutable_data<int>(dim, platform::CPUPlace());
-    index_tensors[i].Resize(dim);
+    memcpy(index_tensor_data_ptr, index_vectors[i].data(),
-    memcpy(tmp_ptr, index_[i].data(), dim[0] * sizeof(int));
+           dim[0] * sizeof(int));
  }
-  // Step 2: collect data by calling gather
+  // create input in subscopes according to index_vectors
-  for (int i = 0; i < 2; ++i) {
+  for (auto& input : Inputs("Xs")) {
-    // i= 0/i for True and False branches respectively
+    Variable* var_parent = scope.FindVar(input);
-    for (auto& input : Inputs("Xs")) {
+    PADDLE_ENFORCE_NOT_NULL(var_parent);
-      // find Tensor
+    const auto* tensor_parent = &var_parent->Get<LoDTensor>();
-      Variable* v = scope.FindVar(input);
-      PADDLE_ENFORCE_NOT_NULL(v);
-      LoDTensor* tensor_parent = v->GetMutable<LoDTensor>();
-      v = sub_scopes[i]->FindVar(input);
+    for (int i = 0; i < BRANCH_NUM; ++i) {
-      PADDLE_ENFORCE_NOT_NULL(v);
+      Variable* var_child = sub_scopes[i]->FindVar(input);
-      LoDTensor* tensor_child = v->GetMutable<LoDTensor>();
+      PADDLE_ENFORCE_NOT_NULL(var_child);
+      auto* tensor_child = var_child->GetMutable<LoDTensor>();
      // Resize child
-      DDim dim = tensor_child->dims();
+      DDim dim = tensor_parent->dims();
-      dim[0] = index_[i].size();
+      dim[0] = index_tensors[i].dims()[0];
-      tensor_child->Resize(dim);
      tensor_child->mutable_data<float>(dim, platform::CPUPlace());
      Gather<float>(dev_ctx.GetPlace(), tensor_parent, &index_tensors[i],
@@ -174,32 +131,79 @@ void CondOp::Run(const Scope& scope,
    }
  }
-  // Step 3: run
+  // create output_tensors in subscope for sub_net
-  for (int i = 0; i < 2; ++i) {
+  for (int i = 0; i < BRANCH_NUM; ++i) {
-    sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx);
+    for (auto& output : (*sub_net_op_[i]).Outputs()) {
+      for (auto& var_name : output.second) {
+        sub_scopes[i]->NewVar(var_name);
+      }
+    }
  }
+}
-  // Step 4: merge output results
+void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
+                                 const platform::DeviceContext& dev_ctx) const {
+  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
+  const std::vector<framework::LoDTensor>& index_tensors =
+      GetIndexTensors(scope);
+  // Infer the output dim, out_dim[0] = true_dim[0] + false_dim[0]
  PADDLE_ENFORCE(!Outputs("Outs").empty(),
                 "Outputs(Outs) of CondOp can't be empty.");
-  for (int i = 0; i < 2; ++i) {
+  for (auto& output : Outputs("Outs")) {
-    // i= 0/i for True and False branches respectively
+    const LoDTensor* tensor_t_out =
-    for (auto& output : Outputs("Outs")) {
+        &sub_scopes[TRUE_BRANCH]->FindVar(output)->Get<LoDTensor>();
-      // find Tensor
+    PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should not be NULL");
-      Variable* v = scope.FindVar(output);
+    const LoDTensor* tensor_f_out =
-      PADDLE_ENFORCE_NOT_NULL(v);
+        &sub_scopes[FALSE_BRANCH]->FindVar(output)->Get<LoDTensor>();
-      LoDTensor* tensor_parent = v->GetMutable<LoDTensor>();
+    PADDLE_ENFORCE_NOT_NULL(tensor_f_out, "False output should not be NULL");
-      v = sub_scopes[i]->FindVar(output);
+    auto* var_out = scope.FindVar(output);
-      PADDLE_ENFORCE_NOT_NULL(v);
+    PADDLE_ENFORCE_NOT_NULL(var_out, "Output not found");
-      LoDTensor* tensor_child = v->GetMutable<LoDTensor>();
+    LoDTensor* tensor_out = var_out->GetMutable<LoDTensor>();
+    PADDLE_ENFORCE_NOT_NULL(tensor_t_out,
+                            "True output tensor should not be NULL");
+    DDim true_dim = tensor_t_out->dims();
+    DDim false_dim = tensor_f_out->dims();
+    true_dim[0] = 0;
+    false_dim[0] = 0;
+    PADDLE_ENFORCE_EQ(true_dim, false_dim,
+                      "Outputs not of the same shape except the first dim");
+    DDim out_dim = tensor_t_out->dims();
+    out_dim[0] = tensor_t_out->dims()[0] + tensor_f_out->dims()[0];
+    tensor_out->Resize(out_dim);
+    tensor_out->mutable_data<float>(platform::CPUPlace());
+  }
+  // merge output results:
+  // output_tensor = true_output_tensor + false_output_tensor
+  for (auto& output : Outputs("Outs")) {
+    Variable* var_parent = scope.FindVar(output);
+    PADDLE_ENFORCE_NOT_NULL(var_parent);
+    auto* tensor_parent = var_parent->GetMutable<LoDTensor>();
+    for (int i = 0; i < BRANCH_NUM; ++i) {
+      Variable* var_child = sub_scopes[i]->FindVar(output);
+      PADDLE_ENFORCE_NOT_NULL(var_child);
+      auto* tensor_child = &var_child->Get<LoDTensor>();
      ScatterUpdate<float>(dev_ctx.GetPlace(), tensor_child, &index_tensors[i],
                           tensor_parent);
    }
  }
 }
+void CondOp::Run(const Scope& scope,
+                 const platform::DeviceContext& dev_ctx) const {
+  PrepareDataForSubnet(scope, dev_ctx);
+  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
+  for (int i = 0; i < BRANCH_NUM; ++i) {
+    sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx);
+  }
+  MergeDataFromSubnet(scope, dev_ctx);
+}
 class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
 public:
  CondOpProtoAndCheckerMaker(framework::OpProto* proto,

--- a/paddle/operators/cond_op.h
+++ b/paddle/operators/cond_op.h
@@ -40,8 +40,7 @@ class CondOp : public framework::OperatorBase {
         const framework::VariableNameMap& outputs,
         const framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {
-    index_.resize(2);
+    sub_net_op_.resize(BRANCH_NUM);
-    sub_net_op_.resize(2);
  }
  CondOp(const CondOp& o)
@@ -51,40 +50,44 @@ class CondOp : public framework::OperatorBase {
    PADDLE_THROW("Not implemented");
  }
-  void CreateScope(const framework::Scope& scope) const;
+  framework::Scope& AddSubScope(const framework::Scope& scope) const;
+  std::vector<framework::Scope*>& GetSubScopes(
+      const framework::Scope& scope) const;
-  void CreateIndexTensor(const framework::Scope& scope) const;
+  framework::LoDTensor& AddIndexTensor(const framework::Scope& scope) const;
+  std::vector<framework::LoDTensor>& GetIndexTensors(
+      const framework::Scope& scope) const;
-  /*
+  void PrepareDataForSubnet(const framework::Scope& scope,
-   * InferShape must be called before Run.
+                            const platform::DeviceContext& dev_ctx) const;
-   */
+  void MergeDataFromSubnet(const framework::Scope& scope,
-  void InferShape(const framework::Scope& scope) const override;
+                           const platform::DeviceContext& dev_ctx) const;
  /*
   * Set True Block
   */
  void set_truenet(std::unique_ptr<OperatorBase>&& net) {
-    sub_net_op_[0] = std::move(net);
+    sub_net_op_[TRUE_BRANCH] = std::move(net);
  }
  /*
   * Set False Block
   */
  void set_falsenet(std::unique_ptr<OperatorBase>&& net) {
-    sub_net_op_[1] = std::move(net);
+    sub_net_op_[FALSE_BRANCH] = std::move(net);
  }
  void Run(const framework::Scope& scope,
           const platform::DeviceContext& dev_ctx) const override;
 private:
+  const int TRUE_BRANCH = 0;
+  const int FALSE_BRANCH = 1;
+  const int BRANCH_NUM = 2;
  // sub_net_op_[0]: subnet_t
  // sub_net_op_[1]: subnet_f
  std::vector<std::unique_ptr<framework::OperatorBase>> sub_net_op_;
-  // index_[0]: True_index;
-  // index_[1]: False_index;
-  mutable std::vector<std::vector<int>> index_;
 };
 }  // namespace operators

--- a/paddle/operators/cos_sim_op.h
+++ b/paddle/operators/cos_sim_op.h
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 template <typename Place, typename T>
-class CosSimKernel : public framework::OpKernel {
+class CosSimKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    // get Tensor
@@ -67,7 +67,7 @@ class CosSimKernel : public framework::OpKernel {
 };
 template <typename Place, typename T>
-class CosSimGradKernel : public framework::OpKernel {
+class CosSimGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    // get Tensor

--- a/paddle/operators/crop_op.h
+++ b/paddle/operators/crop_op.h
@@ -27,7 +27,7 @@ using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
 using framework::Tensor;
 template <typename T>
-class CropKernel : public framework::OpKernel {
+class CropKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* x = context.Input<Tensor>("X");
@@ -69,7 +69,7 @@ void CropGradFunction(const framework::ExecutionContext& context) {
 }
 template <typename Place, typename T>
-class CropGradKernel : public framework::OpKernel {
+class CropGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    size_t rank =

--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -47,6 +47,12 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("Y", {x_dims[0], 1});
    ctx->ShareLoD("X", /*->*/ "Y");
  }
+  // CrossEntropy's data type just determined by "X"
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+  }
 };
 class CrossEntropyGradientOp : public framework::OperatorWithKernel {
@@ -87,6 +93,12 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
    }
    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
  }
+  // CrossEntropy's data type just determined by "X"
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+  }
 };
 class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {

--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@@ -18,14 +18,6 @@ namespace paddle {
 namespace operators {
 namespace {
-// TODO(qingqing): make zero setting a common function.
-template <typename T>
-__global__ void Zero(T* X, const int N) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    X[i] = 0.0;
-  }
-}
 template <typename T>
 __global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
@@ -53,7 +45,7 @@ __global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
 }  // namespace
 template <typename T>
-class CrossEntropyOpCUDAKernel : public framework::OpKernel {
+class CrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
@@ -64,12 +56,12 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel {
    y->mutable_data<T>(ctx.GetPlace());
    math::CrossEntropyFunctor<platform::GPUPlace, T>()(
-        ctx, y, x, label, ctx.Attr<bool>("softLabel"));
+        ctx.device_context(), y, x, label, ctx.Attr<bool>("softLabel"));
  }
 };
 template <typename T>
-class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
+class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
@@ -99,11 +91,7 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
                              .stream()>>>(dx_data, dy_data, x_data, label_data,
                                           batch_size, class_num);
    } else {
-      Zero<T><<<grid, block, 0,
+      math::SetConstant<platform::GPUPlace, T>(ctx.device_context(), dx, 0);
-                reinterpret_cast<const platform::CUDADeviceContext&>(
-                    ctx.device_context())
-                    .stream()>>>(dx_data, batch_size * class_num);
      auto* label_data = label->data<int>();
      grid = (batch_size + block - 1) / block;
      CrossEntropyGradientKernel<T><<<

--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/cross_entropy.h"
+#include "paddle/operators/math/math_function.h"
 namespace paddle {
 namespace operators {
@@ -26,7 +27,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename T>
-class CrossEntropyOpKernel : public framework::OpKernel {
+class CrossEntropyOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
@@ -37,12 +38,12 @@ class CrossEntropyOpKernel : public framework::OpKernel {
    y->mutable_data<T>(ctx.GetPlace());
    math::CrossEntropyFunctor<platform::CPUPlace, T>()(
-        ctx, y, x, labels, ctx.Attr<bool>("softLabel"));
+        ctx.device_context(), y, x, labels, ctx.Attr<bool>("softLabel"));
  }
 };
 template <typename T>
-class CrossEntropyGradientOpKernel : public framework::OpKernel {
+class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
@@ -69,8 +70,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel {
      const T* x_data = x->data<T>();
      const int* label_data = label->data<int>();
-      // TODO(qingqing): make zero setting a common function.
+      math::SetConstant<platform::CPUPlace, T>(ctx.device_context(), dx, 0);
-      memset(dx_data, 0, sizeof(T) * batch_size * class_num);
      for (int i = 0; i < batch_size; ++i) {
        PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num);

--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
@@ -47,7 +47,7 @@ struct MaskGenerator {
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
 template <typename Place, typename T, typename AttrType>
-class GPUDropoutKernel : public framework::OpKernel {
+class GPUDropoutKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* x = context.Input<Tensor>("X");

--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
@@ -26,7 +26,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename Place, typename T, typename AttrType>
-class CPUDropoutKernel : public framework::OpKernel {
+class CPUDropoutKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* x = context.Input<Tensor>("X");
@@ -62,7 +62,7 @@ class CPUDropoutKernel : public framework::OpKernel {
 };
 template <typename Place, typename T>
-class DropoutGradKernel : public framework::OpKernel {
+class DropoutGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    PADDLE_ENFORCE(context.Attr<bool>("is_training"),

--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 template <typename Place, typename T>
-class ElementwiseAddKernel : public framework::OpKernel {
+class ElementwiseAddKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    ElementwiseCompute<EigenAddFunctor, Place, T>(ctx);
@@ -101,7 +101,7 @@ struct ElementwiseAddBroadCast2GradFunctor {
 };
 template <typename Place, typename T>
-class ElementwiseAddGradKernel : public framework::OpKernel {
+class ElementwiseAddGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    ElementwiseGradCompute<Place, T, ElementwiseAddGradFunctor<T>,

--- a/paddle/operators/elementwise_div_op.h
+++ b/paddle/operators/elementwise_div_op.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 template <typename Place, typename T>
-class ElementwiseDivKernel : public framework::OpKernel {
+class ElementwiseDivKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    ElementwiseCompute<EigenDivFunctor, Place, T>(ctx);
@@ -103,7 +103,7 @@ struct ElementwiseDivBroadCast2GradFunctor {
 };
 template <typename Place, typename T>
-class ElementwiseDivGradKernel : public framework::OpKernel {
+class ElementwiseDivGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    ElementwiseGradCompute<Place, T, ElementwiseDivGradFunctor<T>,

--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
@@ -36,7 +36,9 @@ REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker,
            elementwise_mul_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
    elementwise_mul,
-    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, float>,
+    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
    elementwise_mul_grad,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, float>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/elementwise_mul_op.cu
+++ b/paddle/operators/elementwise_mul_op.cu
@@ -19,7 +19,9 @@ namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
    elementwise_mul,
-    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, float>,
+    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
    elementwise_mul_grad,
-    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, float>,
+    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, double>);
--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
@@ -19,7 +19,7 @@ namespace paddle {
 namespace operators {
 template <typename Place, typename T>
-class ElementwiseMulKernel : public framework::OpKernel {
+class ElementwiseMulKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    ElementwiseCompute<EigenMulFunctor, Place, T>(ctx);
@@ -102,7 +102,7 @@ struct ElementwiseMulBroadCast2GradFunctor {
 };
 template <typename Place, typename T>
-class ElementwiseMulGradKernel : public framework::OpKernel {
+class ElementwiseMulGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    ElementwiseGradCompute<Place, T, ElementwiseMulGradFunctor<T>,

--- a/paddle/operators/elementwise_sub_op.h
+++ b/paddle/operators/elementwise_sub_op.h
@@ -19,7 +19,7 @@ namespace paddle {
 namespace operators {
 template <typename Place, typename T>
-class ElementwiseSubKernel : public framework::OpKernel {
+class ElementwiseSubKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    ElementwiseCompute<EigenSubFunctor, Place, T>(ctx);
@@ -102,7 +102,7 @@ struct ElementwiseSubBroadCast2GradFunctor {
 };
 template <typename Place, typename T>
-class ElementwiseSubGradKernel : public framework::OpKernel {
+class ElementwiseSubGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    ElementwiseGradCompute<Place, T, ElementwiseSubGradFunctor<T>,

--- a/paddle/operators/fc_op.cc
+++ b/paddle/operators/fc_op.cc
@@ -100,7 +100,7 @@ class FCOp : public NetOp {
      add_out = Output("AddOut");
      AppendOp(framework::OpRegistry::CreateOp(
-          "rowwise_add", {{"X", {sum_out}}, {"b", {Input("B")}}},
+          "elementwise_add", {{"X", {sum_out}}, {"Y", {Input("B")}}},
          {{"Out", {add_out}}}, {}));
    } else {
      if (Output("AddOut") != framework::kEmptyVarName) {

--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 template <typename Place, typename T>
-class FillZerosLikeKernel : public framework::OpKernel {
+class FillZerosLikeKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* output = context.Output<framework::Tensor>("Y");

--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -37,6 +37,11 @@ class GatherOp : public framework::OperatorWithKernel {
    output_dims[0] = batch_size;
    ctx->SetOutputDim("Out", output_dims);
  }
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+  }
 };
 class GatherGradOp : public framework::OperatorWithKernel {
@@ -47,6 +52,11 @@ class GatherGradOp : public framework::OperatorWithKernel {
  void InferShape(framework::InferShapeContextBase* ctx) const override {
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
  }
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+  }
 };
 class GatherOpMaker : public framework::OpProtoAndCheckerMaker {

--- a/paddle/operators/gather_op.h
+++ b/paddle/operators/gather_op.h
@@ -24,7 +24,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 template <typename Place, typename T>
-class GatherOpKernel : public framework::OpKernel {
+class GatherOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    auto *X = ctx.Input<Tensor>("X");
@@ -37,7 +37,7 @@ class GatherOpKernel : public framework::OpKernel {
 };
 template <typename Place, typename T>
-class GatherGradientOpKernel : public framework::OpKernel {
+class GatherGradientOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    auto *Index = ctx.Input<Tensor>("Index");

--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -16,7 +16,7 @@ namespace paddle {
 namespace operators {
 template <typename T>
-class CPUGaussianRandomKernel : public framework::OpKernel {
+class CPUGaussianRandomKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    float mean = context.Attr<float>("mean");
@@ -56,6 +56,11 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
                   "dims can be one int or array. dims must be set.");
    ctx->SetOutputDim("Out", framework::make_ddim(temp));
  }
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return static_cast<framework::DataType>(Attr<int>("data_type"));
+  }
 };
 class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -76,6 +81,8 @@ Use to initialize tensor with gaussian random generator.
                 "Random seed of generator."
                 "0 means use system wide seed")
        .SetDefault(0);
+    AddAttr<int>("data_type", "output data type")
+        .SetDefault(framework::DataType::FP32);
  }
 };

--- a/paddle/operators/gaussian_random_op.cu
+++ b/paddle/operators/gaussian_random_op.cu
@@ -37,7 +37,7 @@ struct GaussianGenerator {
 };
 template <typename T>
-class GPUGaussianRandomKernel : public framework::OpKernel {
+class GPUGaussianRandomKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* tensor = context.Output<framework::Tensor>("Out");

--- a/paddle/operators/gemm_conv2d_op.h
+++ b/paddle/operators/gemm_conv2d_op.h
@@ -25,7 +25,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 template <typename Place, typename T>
-class GemmConv2DKernel : public framework::OpKernel {
+class GemmConv2DKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* input = context.Input<Tensor>("Input");
@@ -98,7 +98,7 @@ class GemmConv2DKernel : public framework::OpKernel {
 };
 template <typename Place, typename T>
-class GemmConvGrad2DKernel : public framework::OpKernel {
+class GemmConvGrad2DKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* input = context.Input<Tensor>("Input");

--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -36,6 +36,11 @@ class LookupTableOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]});
    ctx->ShareLoD("Ids", /*->*/ "Out");
  }
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("W")->type());
+  }
 };
 class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -69,6 +74,11 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
    auto table_dims = ctx->GetInputDim("W");
    ctx->SetOutputDim(framework::GradVarName("W"), table_dims);
  }
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("W")->type());
+  }
 };
 }  // namespace operators

--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
@@ -61,7 +61,7 @@ __global__ void LookupTableGrad(T* table, const T* output, const int32_t* ids,
 }
 template <typename T>
-class LookupTableCUDAKernel : public framework::OpKernel {
+class LookupTableCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto table_t = context.Input<Tensor>("W");
@@ -85,7 +85,7 @@ class LookupTableCUDAKernel : public framework::OpKernel {
 };
 template <typename T>
-class LookupTableGradCUDAKernel : public framework::OpKernel {
+class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto ids_t = context.Input<Tensor>("Ids");

--- a/paddle/operators/lookup_table_op.h
+++ b/paddle/operators/lookup_table_op.h
@@ -23,7 +23,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 template <typename T>
-class LookupTableKernel : public framework::OpKernel {
+class LookupTableKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto table_t = context.Input<Tensor>("W");      // float tensor
@@ -44,7 +44,7 @@ class LookupTableKernel : public framework::OpKernel {
 };
 template <typename T>
-class LookupTableGradKernel : public framework::OpKernel {
+class LookupTableGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto ids_t = context.Input<Tensor>("Ids");

--- a/paddle/operators/lstm_unit_op.cc
+++ b/paddle/operators/lstm_unit_op.cc
@@ -47,7 +47,6 @@ class LstmUnitOp : public framework::OperatorWithKernel {
  }
 };
-template <typename AttrType>
 class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  LstmUnitOpMaker(framework::OpProto* proto,
@@ -68,7 +67,7 @@ Equation:
  H = C * sigm(o)
 )DOC");
-    AddAttr<AttrType>("forget_bias", "The forget bias of Lstm Unit.")
+    AddAttr<float>("forget_bias", "The forget bias of Lstm Unit.")
        .SetDefault(0.0);
  }
 };
@@ -93,9 +92,11 @@ class LstmUnitGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker<float>,
+REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker, lstm_unit_grad,
-            lstm_unit_grad, ops::LstmUnitGradOp);
+            ops::LstmUnitGradOp);
 REGISTER_OP_CPU_KERNEL(lstm_unit,
-                       ops::LstmUnitKernel<paddle::platform::CPUPlace, float>);
+                       ops::LstmUnitKernel<paddle::platform::CPUPlace, float>,
+                       ops::LstmUnitKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    lstm_unit_grad, ops::LstmUnitGradKernel<paddle::platform::CPUPlace, float>);
+    lstm_unit_grad, ops::LstmUnitGradKernel<paddle::platform::CPUPlace, float>,
+    ops::LstmUnitGradKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/lstm_unit_op.cu
+++ b/paddle/operators/lstm_unit_op.cu
@@ -89,8 +89,8 @@ __global__ void LSTMUnitGradientKernel(const int nthreads, const int dim,
  }
 }
-template <typename T, typename AttrType = T>
+template <typename T>
-class LstmUnitOpCUDAKernel : public framework::OpKernel {
+class LstmUnitOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
@@ -101,7 +101,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel {
    auto* c_tensor = ctx.Output<framework::Tensor>("C");
    auto* h_tensor = ctx.Output<framework::Tensor>("H");
-    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
    int b_size = c_tensor->dims()[0];
    int D = c_tensor->dims()[1];
@@ -120,8 +120,8 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel {
  }
 };
-template <typename T, typename AttrType = T>
+template <typename T>
-class LstmUnitGradOpCUDAKernel : public framework::OpKernel {
+class LstmUnitGradOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
@@ -153,7 +153,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel {
    int N = c_tensor->dims()[0];
    int D = c_tensor->dims()[1];
-    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
    int block = 512;
    int n = N * D;
@@ -169,5 +169,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel {
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel<float>,
-REGISTER_OP_GPU_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel<float>);
+                       ops::LstmUnitOpCUDAKernel<double>);
+REGISTER_OP_GPU_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel<float>,
+                       ops::LstmUnitGradOpCUDAKernel<double>);
--- a/paddle/operators/lstm_unit_op.h
+++ b/paddle/operators/lstm_unit_op.h
@@ -32,8 +32,8 @@ inline T tanh(T x) {
  return 2. * sigmoid(2. * x) - 1.;
 }
-template <typename Place, typename T, typename AttrType = T>
+template <typename Place, typename T>
-class LstmUnitKernel : public framework::OpKernel {
+class LstmUnitKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
@@ -44,7 +44,7 @@ class LstmUnitKernel : public framework::OpKernel {
    auto* c_tensor = ctx.Output<framework::Tensor>("C");
    auto* h_tensor = ctx.Output<framework::Tensor>("H");
-    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
    int b_size = c_tensor->dims()[0];
    int D = c_tensor->dims()[1];
@@ -75,8 +75,8 @@ class LstmUnitKernel : public framework::OpKernel {
  }
 };
-template <typename Place, typename T, typename AttrType = T>
+template <typename Place, typename T>
-class LstmUnitGradKernel : public framework::OpKernel {
+class LstmUnitGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
@@ -108,7 +108,7 @@ class LstmUnitGradKernel : public framework::OpKernel {
    int N = c_tensor->dims()[0];
    int D = c_tensor->dims()[1];
-    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
    for (int n = 0; n < N; ++n) {
      for (int d = 0; d < D; ++d) {

--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
 if(WITH_GPU)
-    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc
+    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu pooling.cc pooling.cu DEPS cblas device_context operator)
-      im2col.cu DEPS cblas device_context operator)
+    nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
-    nv_library(softmax_function SRCS softmax.cc softmax.cu
+    nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator)
-      DEPS operator)
+    nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
-    nv_library(cross_entropy_function SRCS cross_entropy.cc cross_entropy.cu
-      DEPS operator)
 else()
-    cc_library(math_function SRCS math_function.cc im2col.cc
+    cc_library(math_function SRCS math_function.cc im2col.cc pooling.cc DEPS cblas device_context operator)
-      DEPS cblas device_context operator)
+    cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
-    cc_library(softmax_function SRCS softmax.cc DEPS operator)
+    cc_library(softmax SRCS softmax.cc DEPS operator)
-    cc_library(cross_entropy_function SRCS cross_entropy.cc DEPS operator)
+    cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
 endif()
-nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
 cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
--- a/paddle/operators/math/cross_entropy.cc
+++ b/paddle/operators/math/cross_entropy.cc
@@ -26,8 +26,8 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename T>
 class CrossEntropyFunctor<platform::CPUPlace, T> {
 public:
-  void operator()(const framework::ExecutionContext& ctx,
+  void operator()(const platform::DeviceContext& ctx, framework::Tensor* out,
-                  framework::Tensor* out, const framework::Tensor* prob,
+                  const framework::Tensor* prob,
                  const framework::Tensor* labels, const bool softLabel) {
    const int batch_size = prob->dims()[0];
    if (softLabel) {
@@ -35,7 +35,7 @@ class CrossEntropyFunctor<platform::CPUPlace, T> {
      auto lbl = EigenMatrix<T>::From(*labels);
      auto loss = EigenMatrix<T>::From(*out);
-      loss.device(ctx.GetEigenDevice<platform::CPUPlace>()) =
+      loss.device(*ctx.GetEigenDevice<platform::CPUPlace>()) =
          -((lbl * in.log().unaryExpr(math::TolerableValue<T>()))
                .sum(Eigen::DSizes<int, 1>(1))
                .reshape(Eigen::DSizes<int, 2>(batch_size, 1)));

--- a/paddle/operators/math/cross_entropy.cu
+++ b/paddle/operators/math/cross_entropy.cu
@@ -74,8 +74,8 @@ using Tensor = framework::Tensor;
 template <typename T>
 class CrossEntropyFunctor<platform::GPUPlace, T> {
 public:
-  void operator()(const framework::ExecutionContext& ctx,
+  void operator()(const platform::DeviceContext& ctx, framework::Tensor* out,
-                  framework::Tensor* out, const framework::Tensor* prob,
+                  const framework::Tensor* prob,
                  const framework::Tensor* labels, bool softLabel) {
    const T* prob_data = prob->data<T>();
    T* loss_data = out->mutable_data<T>(ctx.GetPlace());
@@ -87,20 +87,18 @@ class CrossEntropyFunctor<platform::GPUPlace, T> {
      const T* label_data = labels->data<T>();
      int block = class_num > 512 ? 512 : pow(2, int(std::log2(class_num)));
-      SoftCrossEntropyKernel<
+      SoftCrossEntropyKernel<T><<<
-          T><<<batch_size, block, block * sizeof(T),
+          batch_size, block, block * sizeof(T),
-               reinterpret_cast<const platform::CUDADeviceContext&>(
+          reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
-                   ctx.device_context())
+          loss_data, prob_data, label_data, class_num);
-                   .stream()>>>(loss_data, prob_data, label_data, class_num);
    } else {
      const int* label_data = labels->data<int>();
      int block = 512;
      int grid = (batch_size + block - 1) / block;
      CrossEntropyKernel<T><<<
-          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
+          grid, block, 0,
-                              ctx.device_context())
+          reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
-                              .stream()>>>(loss_data, prob_data, label_data,
+          loss_data, prob_data, label_data, batch_size, class_num);
-                                           batch_size, class_num);
    }
  }
 };

--- a/paddle/operators/math/cross_entropy.h
+++ b/paddle/operators/math/cross_entropy.h
@@ -37,9 +37,7 @@ struct TolerableValue {
 template <typename Place, typename T>
 class CrossEntropyFunctor {
 public:
-  // (TODO caoying) it is much better to use DeviceContext as the first
+  void operator()(const platform::DeviceContext& context,
-  // parameter.
-  void operator()(const framework::ExecutionContext& context,
                  framework::Tensor* out, const framework::Tensor* prob,
                  const framework::Tensor* labels, const bool softLabel);
 };

--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
@@ -52,6 +52,7 @@ int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
 #include <cmath>
+#include "paddle/framework/eigen.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
@@ -84,6 +85,13 @@ void matmul(const platform::DeviceContext& context,
            const framework::Tensor& matrix_b, bool trans_b, T alpha,
            framework::Tensor* matrix_out, T beta);
+template <typename Place, typename T>
+void SetConstant(const platform::DeviceContext& context,
+                 framework::Tensor* tensor, T num) {
+  auto t = framework::EigenVector<T>::Flatten(*tensor);
+  t.device(*context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(num));
+}
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
@@ -243,3 +243,24 @@ TEST(math_function, gemm_trans_clbas) {
  EXPECT_EQ(input3_ptr[6], 86);
  EXPECT_EQ(input3_ptr[7], 99);
 }
+TEST(math_function, zero) {
+  paddle::framework::Tensor tensor;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* t = tensor.mutable_data<float>({2, 2}, *cpu_place);
+  paddle::platform::CPUDeviceContext context(*cpu_place);
+  paddle::operators::math::SetConstant<paddle::platform::CPUPlace, float>(
+      context, &tensor, 0);
+  EXPECT_EQ(t[0], 0);
+  EXPECT_EQ(t[1], 0);
+  EXPECT_EQ(t[2], 0);
+  EXPECT_EQ(t[3], 0);
+  paddle::operators::math::SetConstant<paddle::platform::CPUPlace, float>(
+      context, &tensor, 1);
+  EXPECT_EQ(t[0], 1);
+  EXPECT_EQ(t[1], 1);
+  EXPECT_EQ(t[2], 1);
+  EXPECT_EQ(t[3], 1);
+}
--- a/paddle/operators/math/pooling.cc
+++ b/paddle/operators/math/pooling.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/math/pooling.h"
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename PoolProcess, typename T>
+class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_process) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+    const int input_stride = input_height * input_width;
+    const int output_stride = output_height * output_width;
+    const T* input_data = input.data<T>();
+    T* output_data = output.mutable_data<T>(context.GetPlace());
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int ph = 0; ph < output_height; ++ph) {
+          int hstart = ph * stride_height - padding_height;
+          int hend = std::min(hstart + ksize_height, input_height);
+          hstart = std::max(hstart, 0);
+          for (int pw = 0; pw < output_width; ++pw) {
+            int wstart = pw * stride_width - padding_width;
+            int wend = std::min(wstart + ksize_width, input_width);
+            wstart = std::max(wstart, 0);
+            T ele = pool_process.initial();
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                pool_process.compute(ele, input_data[h * input_width + w]);
+              }
+            }
+            int pool_size = (hend - hstart) * (wend - wstart);
+            pool_process.finalize(ele, (static_cast<T>(pool_size)));
+            output_data[ph * output_width + pw] = ele;
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+      }
+    }
+  }
+};
+template <typename PoolProcess, class T>
+class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_grad_process) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+    const int input_stride = input_height * input_width;
+    const int output_stride = output_height * output_width;
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int ph = 0; ph < output_height; ++ph) {
+          int hstart = ph * stride_height - padding_height;
+          int hend = std::min(hstart + ksize_height, input_height);
+          hstart = std::max(hstart, 0);
+          for (int pw = 0; pw < output_width; ++pw) {
+            int wstart = pw * stride_width - padding_width;
+            int wend = std::min(wstart + ksize_width, input_width);
+            wstart = std::max(wstart, 0);
+            int pool_size = (hend - hstart) * (wend - wstart);
+            float scale = 1.0 / pool_size;
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                pool_grad_process.compute(
+                    input_data[h * input_width + w],
+                    output_data[ph * output_width + pw],
+                    output_grad_data[ph * output_width + pw],
+                    input_grad_data[h * input_width + w],
+                    static_cast<T>(scale));
+              }
+            }
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+      }
+    }
+  }
+};
+template <class T>
+class MaxPool2dGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+    const int input_stride = input_height * input_width;
+    const int output_stride = output_height * output_width;
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int ph = 0; ph < output_height; ++ph) {
+          int hstart = ph * stride_height - padding_height;
+          int hend = std::min(hstart + ksize_height, input_height);
+          hstart = std::max(hstart, 0);
+          for (int pw = 0; pw < output_width; ++pw) {
+            int wstart = pw * stride_width - padding_width;
+            int wend = std::min(wstart + ksize_width, input_width);
+            wstart = std::max(wstart, 0);
+            bool stop = false;
+            for (int h = hstart; h < hend && !stop; ++h) {
+              for (int w = wstart; w < wend && !stop; ++w) {
+                int input_idx = h * input_width + w;
+                int output_idx = ph * output_width + pw;
+                if (input_data[input_idx] == output_data[output_idx]) {
+                  input_grad_data[input_idx] += output_grad_data[output_idx];
+                  stop = true;
+                }
+              }
+            }
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+      }
+    }
+  }
+};
+template class MaxPool2dGradFunctor<platform::CPUPlace, float>;
+// template class MaxPool2dGradFunctor<platform::CPUPlace, double>;
+template class Pool2dFunctor<platform::CPUPlace,
+                             paddle::operators::math::MaxPool<float>, float>;
+template class Pool2dFunctor<platform::CPUPlace,
+                             paddle::operators::math::AvgPool<float>, float>;
+template class Pool2dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
+template class Pool2dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
+template class Pool2dFunctor<platform::CPUPlace,
+                             paddle::operators::math::MaxPool<double>, double>;
+template class Pool2dFunctor<platform::CPUPlace,
+                             paddle::operators::math::AvgPool<double>, double>;
+template class Pool2dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
+template class Pool2dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
+template <typename PoolProcess, class T>
+class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_process) {
+    const int batch_size = input.dims()[0];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+    const int input_stride = input_depth * input_height * input_width;
+    const int output_stride = output_depth * output_height * output_width;
+    const T* input_data = input.data<T>();
+    T* output_data = output.mutable_data<T>(context.GetPlace());
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int pd = 0; pd < output_depth; ++pd) {
+          int dstart = pd * stride_depth - padding_depth;
+          int dend = std::min(dstart + ksize_depth, input_depth);
+          dstart = std::max(dstart, 0);
+          for (int ph = 0; ph < output_height; ++ph) {
+            int hstart = ph * stride_height - padding_height;
+            int hend = std::min(hstart + ksize_height, input_height);
+            hstart = std::max(hstart, 0);
+            for (int pw = 0; pw < output_width; ++pw) {
+              int wstart = pw * stride_width - padding_width;
+              int wend = std::min(wstart + ksize_width, input_width);
+              wstart = std::max(wstart, 0);
+              int output_idx = (pd * output_height + ph) * output_width + pw;
+              T ele = pool_process.initial();
+              for (int d = dstart; d < dend; ++d) {
+                for (int h = hstart; h < hend; ++h) {
+                  for (int w = wstart; w < wend; ++w) {
+                    pool_process.compute(
+                        ele,
+                        input_data[(d * input_height + h) * input_width + w]);
+                  }
+                }
+              }
+              int pool_size =
+                  (dend - dstart) * (hend - hstart) * (wend - wstart);
+              pool_process.finalize(ele, static_cast<T>(pool_size));
+              output_data[output_idx] = ele;
+            }
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+      }
+    }
+  }
+};
+template <typename PoolProcess, class T>
+class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_grad_process) {
+    const int batch_size = input.dims()[0];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+    const int input_stride = input_depth * input_height * input_width;
+    const int output_stride = output_depth * output_height * output_width;
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int pd = 0; pd < output_depth; ++pd) {
+          int dstart = pd * stride_depth - padding_depth;
+          int dend = std::min(dstart + ksize_depth, input_depth);
+          dstart = std::max(dstart, 0);
+          for (int ph = 0; ph < output_height; ++ph) {
+            int hstart = ph * stride_height - padding_height;
+            int hend = std::min(hstart + ksize_height, input_height);
+            hstart = std::max(hstart, 0);
+            for (int pw = 0; pw < output_width; ++pw) {
+              int wstart = pw * stride_width - padding_width;
+              int wend = std::min(wstart + ksize_width, input_width);
+              wstart = std::max(wstart, 0);
+              int pool_size =
+                  (dend - dstart) * (hend - hstart) * (wend - wstart);
+              float scale = 1.0 / pool_size;
+              for (int d = dstart; d < dend; ++d) {
+                for (int h = hstart; h < hend; ++h) {
+                  for (int w = wstart; w < wend; ++w) {
+                    int input_idx = (d * input_height + h) * input_width + w;
+                    int output_idx =
+                        (pd * output_height + ph) * output_width + pw;
+                    pool_grad_process.compute(
+                        input_data[input_idx], output_data[output_idx],
+                        output_grad_data[output_idx],
+                        input_grad_data[input_idx], static_cast<T>(scale));
+                  }
+                }
+              }
+            }
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+      }
+    }
+  }
+};
+template <class T>
+class MaxPool3dGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int batch_size = input.dims()[0];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+    const int input_stride = input_depth * input_height * input_width;
+    const int output_stride = output_depth * output_height * output_width;
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int pd = 0; pd < output_depth; ++pd) {
+          int dstart = pd * stride_depth - padding_depth;
+          int dend = std::min(dstart + ksize_depth, input_depth);
+          dstart = std::max(dstart, 0);
+          for (int ph = 0; ph < output_height; ++ph) {
+            int hstart = ph * stride_height - padding_height;
+            int hend = std::min(hstart + ksize_height, input_height);
+            hstart = std::max(hstart, 0);
+            for (int pw = 0; pw < output_width; ++pw) {
+              int wstart = pw * stride_width - padding_width;
+              int wend = std::min(wstart + ksize_width, input_width);
+              wstart = std::max(wstart, 0);
+              bool stop = false;
+              for (int d = dstart; d < dend && !stop; ++d) {
+                for (int h = hstart; h < hend && !stop; ++h) {
+                  for (int w = wstart; w < wend && !stop; ++w) {
+                    int input_idx = (d * input_height + h) * input_width + w;
+                    int output_idx =
+                        (pd * output_height + ph) * output_width + pw;
+                    if (input_data[input_idx] == output_data[output_idx]) {
+                      input_grad_data[input_idx] +=
+                          output_grad_data[output_idx];
+                      stop = true;
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+      }
+    }
+  }
+};
+template class MaxPool3dGradFunctor<platform::CPUPlace, float>;
+// template class MaxPool3dGradFunctor<platform::CPUPlace, double>;
+template class Pool3dFunctor<platform::CPUPlace,
+                             paddle::operators::math::MaxPool<float>, float>;
+template class Pool3dFunctor<platform::CPUPlace,
+                             paddle::operators::math::AvgPool<float>, float>;
+template class Pool3dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
+template class Pool3dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
+template class Pool3dFunctor<platform::CPUPlace,
+                             paddle::operators::math::MaxPool<double>, double>;
+template class Pool3dFunctor<platform::CPUPlace,
+                             paddle::operators::math::AvgPool<double>, double>;
+template class Pool3dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
+template class Pool3dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/pooling.cu
+++ b/paddle/operators/math/pooling.cu
+/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/math/pooling.h"
+#include "paddle/platform/cuda_helper.h"
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename PoolProcess, typename T>
+__global__ void KernelPool2D(const int nthreads, const T* input_data,
+                             T* output_data, const int channels,
+                             const int input_height, const int input_width,
+                             const int output_height, const int output_width,
+                             const int ksize_height, const int ksize_width,
+                             const int stride_height, const int stride_width,
+                             const int padding_height, const int padding_width,
+                             PoolProcess pool_process) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int c = (index / output_width / output_height) % channels;
+    int batch_idx = index / output_width / output_height / channels;
+    int hstart = ph * stride_height - padding_height;
+    int hend = min(hstart + ksize_height, input_height);
+    hstart = max(hstart, 0);
+    int wstart = pw * stride_width - padding_width;
+    int wend = min(wstart + ksize_width, input_width);
+    wstart = max(wstart, 0);
+    input_data += (batch_idx * channels + c) * input_height * input_width;
+    T ele = pool_process.initial();
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        pool_process.compute(ele, input_data[h * input_width + w]);
+      }
+    }
+    int pool_size = (hend - hstart) * (wend - wstart);
+    pool_process.finalize(ele, (static_cast<T>(pool_size)));
+    output_data[index] = ele;
+  }
+}
+template <typename PoolProcess, typename T>
+__global__ void KernelPool2DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, T* input_grad, const int channels,
+    const int input_height, const int input_width, const int output_height,
+    const int output_width, const int ksize_height, const int ksize_width,
+    const int stride_height, const int stride_width, const int padding_height,
+    const int padding_width, PoolProcess pool_process) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int offsetW = index % input_width + padding_width;
+    int offsetH = (index / input_width) % input_height + padding_height;
+    int offsetC = (index / input_width / input_height) % channels;
+    int batch_idx = index / input_width / input_height / channels;
+    int phstart = (offsetH < ksize_height)
+                      ? 0
+                      : (offsetH - ksize_height) / stride_height + 1;
+    int pwstart = (offsetW < ksize_width)
+                      ? 0
+                      : (offsetW - ksize_width) / stride_width + 1;
+    int phend = min(offsetH / stride_height + 1, output_height);
+    int pwend = min(offsetW / stride_width + 1, output_width);
+    T gradient = 0;
+    T input = input_data[index];
+    int output_idx =
+        (batch_idx * channels + offsetC) * output_height * output_width;
+    output_data += output_idx;
+    output_grad += output_idx;
+    for (int ph = phstart; ph < phend; ++ph) {
+      for (int pw = pwstart; pw < pwend; ++pw) {
+        int hstart = ph * stride_height - padding_height;
+        int wstart = pw * stride_width - padding_width;
+        int hend = min(hstart + ksize_height, input_height);
+        int wend = min(wstart + ksize_width, input_width);
+        hstart = max(hstart, 0);
+        wstart = max(wstart, 0);
+        int pool_size = (hend - hstart) * (wend - wstart);
+        int output_sub_idx = ph * output_width + pw;
+        pool_process.compute(input, output_data[output_sub_idx],
+                             output_grad[output_sub_idx], gradient,
+                             static_cast<T>(1.0 / pool_size));
+      }
+    }
+    input_grad[index] = gradient;
+  }
+}
+template <typename T>
+__global__ void KernelMaxPool2DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, T* input_grad, const int channels,
+    const int input_height, const int input_width, const int output_height,
+    const int output_width, const int ksize_height, const int ksize_width,
+    const int stride_height, const int stride_width, const int padding_height,
+    const int padding_width) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int c = (index / output_width / output_height) % channels;
+    int batch_idx = index / output_width / output_height / channels;
+    int hstart = ph * stride_height - padding_height;
+    int hend = min(hstart + ksize_height, input_height);
+    hstart = max(hstart, 0);
+    int wstart = pw * stride_width - padding_width;
+    int wend = min(wstart + ksize_width, input_width);
+    wstart = max(wstart, 0);
+    input_data += (batch_idx * channels + c) * input_height * input_width;
+    input_grad += (batch_idx * channels + c) * input_height * input_width;
+    T ele = output_data[index];
+    int maxIndex = -1;
+    bool stop = false;
+    for (int h = hstart; h < hend && !stop; ++h) {
+      for (int w = wstart; w < wend && !stop; ++w) {
+        if (ele == input_data[h * input_width + w]) {
+          maxIndex = h * input_width + w;
+          stop = true;
+        }
+      }
+    }
+    if (maxIndex != -1) {
+      // atomic add
+      atomicAdd(input_grad + maxIndex, output_grad[index]);
+    }
+  }
+}
+template <typename PoolProcess, typename T>
+class Pool2dFunctor<platform::GPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_process) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+    const T* input_data = input.data<T>();
+    T* output_data = output.mutable_data<T>(context.GetPlace());
+    int nthreads = batch_size * output_channels * output_height * output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+    KernelPool2D<
+        PoolProcess,
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(nthreads, input_data, output_data, input_channels,
+                              input_height, input_width, output_height,
+                              output_width, ksize_height, ksize_width,
+                              stride_height, stride_width, padding_height,
+                              padding_width, pool_process);
+  }
+};
+template <typename PoolProcess, typename T>
+class Pool2dGradFunctor<platform::GPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    int nthreads = batch_size * input_channels * input_height * input_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+    KernelPool2DGrad<
+        PoolProcess,
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_grad_data,
+        input_channels, input_height, input_width, output_height, output_width,
+        ksize_height, ksize_width, stride_height, stride_width, padding_height,
+        padding_width, pool_process);
+  }
+};
+template <typename T>
+class MaxPool2dGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    int nthreads = batch_size * output_channels * output_height * output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+    KernelMaxPool2DGrad<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_grad_data,
+        input_channels, input_height, input_width, output_height, output_width,
+        ksize_height, ksize_width, stride_height, stride_width, padding_height,
+        padding_width);
+  }
+};
+template class MaxPool2dGradFunctor<platform::GPUPlace, float>;
+// template class MaxPool2dGradFunctor<platform::GPUPlace, double>; // The
+// 64-bit floating-point version of atomicAdd() is only supported by devices of
+// compute capability 6.x and higher.
+template class Pool2dFunctor<platform::GPUPlace,
+                             paddle::operators::math::MaxPool<float>, float>;
+template class Pool2dFunctor<platform::GPUPlace,
+                             paddle::operators::math::AvgPool<float>, float>;
+template class Pool2dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
+template class Pool2dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
+template class Pool2dFunctor<platform::GPUPlace,
+                             paddle::operators::math::MaxPool<double>, double>;
+template class Pool2dFunctor<platform::GPUPlace,
+                             paddle::operators::math::AvgPool<double>, double>;
+template class Pool2dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
+template class Pool2dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
+template <typename PoolProcess, typename T>
+__global__ void KernelPool3D(
+    const int nthreads, const T* input_data, T* output_data, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height, const int padding_width,
+    PoolProcess pool_process) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int pd = (index / output_width / output_height) % output_depth;
+    int c = (index / output_width / output_height / output_depth) % channels;
+    int batch_idx =
+        index / output_width / output_height / output_depth / channels;
+    int dstart = pd * stride_depth - padding_depth;
+    int hstart = ph * stride_height - padding_height;
+    int wstart = pw * stride_width - padding_width;
+    int dend = min(dstart + ksize_depth, input_depth);
+    int hend = min(hstart + ksize_height, input_height);
+    int wend = min(wstart + ksize_width, input_width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    T ele = pool_process.initial();
+    input_data +=
+        (batch_idx * channels + c) * input_depth * input_height * input_width;
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          pool_process.compute(
+              ele, input_data[(d * input_height + h) * input_width + w]);
+        }
+      }
+    }
+    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+    pool_process.finalize(ele, static_cast<T>(pool_size));
+    output_data[index] = ele;
+  }
+}
+template <typename PoolProcess, typename T>
+__global__ void KernelPool3DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, T* input_grad, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height, const int padding_width,
+    PoolProcess pool_process) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int offsetW = index % input_width + padding_width;
+    int offsetH = (index / input_width) % input_height + padding_height;
+    int offsetD =
+        (index / input_width / input_height) % input_depth + padding_depth;
+    int offsetC = (index / input_width / input_height / input_depth) % channels;
+    int batch_idx = index / input_width / input_height / input_depth / channels;
+    int pdstart = (offsetD < ksize_depth)
+                      ? 0
+                      : (offsetD - ksize_depth) / stride_depth + 1;
+    int phstart = (offsetH < ksize_height)
+                      ? 0
+                      : (offsetH - ksize_height) / stride_height + 1;
+    int pwstart = (offsetW < ksize_width)
+                      ? 0
+                      : (offsetW - ksize_width) / stride_width + 1;
+    int pdend = min((offsetD) / stride_depth + 1, output_depth);
+    int phend = min((offsetH) / stride_height + 1, output_height);
+    int pwend = min((offsetW) / stride_width + 1, output_width);
+    T gradient = 0;
+    T input = input_data[index];
+    int output_idx = (batch_idx * channels + offsetC) * output_depth *
+                     output_height * output_width;
+    output_data += output_idx;
+    output_grad += output_idx;
+    for (int pd = pdstart; pd < pdend; ++pd) {
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          // figure out the pooling size
+          int dstart = pd * stride_depth - padding_depth;
+          int hstart = ph * stride_height - padding_height;
+          int wstart = pw * stride_width - padding_width;
+          int dend = min(dstart + ksize_depth, input_depth);
+          int hend = min(hstart + ksize_height, input_height);
+          int wend = min(wstart + ksize_width, input_width);
+          dstart = max(dstart, 0);
+          hstart = max(hstart, 0);
+          wstart = max(wstart, 0);
+          int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+          int output_sub_idx = (pd * output_height + ph) * output_width + pw;
+          pool_process.compute(input, output_data[output_sub_idx],
+                               output_grad[output_sub_idx], gradient,
+                               static_cast<T>(1.0 / pool_size));
+        }
+      }
+    }
+    input_grad[index] = gradient;
+  }
+}
+template <typename T>
+__global__ void KernelMaxPool3DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, T* input_grad, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height,
+    const int padding_width) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int pd = (index / output_width / output_height) % output_depth;
+    int c = (index / output_width / output_height / output_depth) % channels;
+    int batch_idx =
+        index / output_width / output_height / output_depth / channels;
+    int dstart = pd * stride_depth - padding_depth;
+    int hstart = ph * stride_height - padding_height;
+    int wstart = pw * stride_width - padding_width;
+    int dend = min(dstart + ksize_depth, input_depth);
+    int hend = min(hstart + ksize_height, input_height);
+    int wend = min(wstart + ksize_width, input_width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    T ele = output_data[index];
+    bool stop = false;
+    int maxIdx = -1;
+    input_data +=
+        (batch_idx * channels + c) * input_depth * input_height * input_width;
+    input_grad +=
+        (batch_idx * channels + c) * input_depth * input_height * input_width;
+    for (int d = dstart; d < dend && !stop; ++d) {
+      for (int h = hstart; h < hend && !stop; ++h) {
+        for (int w = wstart; w < wend && !stop; ++w) {
+          if (ele == input_data[(d * input_height + h) * input_width + w]) {
+            stop = true;
+            maxIdx = (d * input_height + h) * input_width + w;
+          }
+        }
+      }
+    }
+    if (maxIdx != -1) {
+      // atomic add
+      atomicAdd(input_grad + maxIdx, output_grad[index]);
+    }
+  }
+}
+template <typename PoolProcess, class T>
+class Pool3dFunctor<platform::GPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_process) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+    const T* input_data = input.data<T>();
+    T* output_data = output.mutable_data<T>(context.GetPlace());
+    int nthreads = batch_size * output_channels * output_depth * output_height *
+                   output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+    KernelPool3D<
+        PoolProcess,
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(
+        nthreads, input_data, output_data, input_channels, input_depth,
+        input_height, input_width, output_depth, output_height, output_width,
+        ksize_depth, ksize_height, ksize_width, stride_depth, stride_height,
+        stride_width, padding_depth, padding_height, padding_width,
+        pool_process);
+  }
+};
+template <typename PoolProcess, class T>
+class Pool3dGradFunctor<platform::GPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    int nthreads =
+        batch_size * input_channels * input_depth * input_height * input_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+    KernelPool3DGrad<
+        PoolProcess,
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_grad_data,
+        input_channels, input_depth, input_height, input_width, output_depth,
+        output_height, output_width, ksize_depth, ksize_height, ksize_width,
+        stride_depth, stride_height, stride_width, padding_depth,
+        padding_height, padding_width, pool_process);
+  }
+};
+template <class T>
+class MaxPool3dGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    int nthreads = batch_size * output_channels * output_depth * output_height *
+                   output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+    KernelMaxPool3DGrad<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_grad_data,
+        input_channels, input_depth, input_height, input_width, output_depth,
+        output_height, output_width, ksize_depth, ksize_height, ksize_width,
+        stride_depth, stride_height, stride_width, padding_depth,
+        padding_height, padding_width);
+  }
+};
+template class MaxPool3dGradFunctor<platform::GPUPlace, float>;
+// template class MaxPool3dGradFunctor<platform::GPUPlace, double>;  // The
+// 64-bit floating-point version of atomicAdd() is only supported by devices of
+// compute capability 6.x and higher.
+template class Pool3dFunctor<platform::GPUPlace,
+                             paddle::operators::math::MaxPool<float>, float>;
+template class Pool3dFunctor<platform::GPUPlace,
+                             paddle::operators::math::AvgPool<float>, float>;
+template class Pool3dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
+template class Pool3dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
+template class Pool3dFunctor<platform::GPUPlace,
+                             paddle::operators::math::MaxPool<double>, double>;
+template class Pool3dFunctor<platform::GPUPlace,
+                             paddle::operators::math::AvgPool<double>, double>;
+template class Pool3dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
+template class Pool3dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/pooling.h
+++ b/paddle/operators/math/pooling.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/hostdevice.h"
+namespace paddle {
+namespace operators {
+namespace math {
+//////////////////////
+#define FLT_MAX __FLT_MAX__  //
+template <class T>
+class MaxPool {
+ public:
+  DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
+  DEVICE inline void compute(T& y, const T& x) { y = y > x ? y : x; }
+  DEVICE inline void finalize(T& y, const T& poo_size) {}
+};
+template <class T>
+class AvgPool {
+ public:
+  DEVICE inline T initial() { return static_cast<T>(0); }
+  DEVICE inline void compute(T& y, const T& x) { y += x; }
+  DEVICE inline void finalize(T& y, const T& poo_size) { y /= poo_size; }
+};
+template <class T>
+class MaxPoolGrad {
+ public:
+  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
+                             T scale) {
+    dx += dy * (x == y);
+  }
+};
+template <class T>
+class AvgPoolGrad {
+ public:
+  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
+                             T scale) {
+    dx += (scale * dy);
+  }
+};
+template <typename Place, typename PoolProcess, typename T>
+class Pool2dFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_compute);
+};
+template <typename Place, typename PoolProcess, typename T>
+class Pool2dGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute);
+};
+template <typename Place, class T>
+class MaxPool2dGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings);
+};
+template <typename Place, typename PoolProcess, typename T>
+class Pool3dFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_compute);
+};
+template <typename Place, typename PoolProcess, typename T>
+class Pool3dGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute);
+};
+template <typename Place, class T>
+class MaxPool3dGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings);
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/softmax.cc
+++ b/paddle/operators/math/softmax.cc
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
+Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
+you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
+Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
+distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
+See the License for the specific language governing permissions and
-   limitations under the License. */
+limitations under the License. */
 #include "paddle/operators/math/softmax.h"
@@ -19,6 +19,7 @@ namespace operators {
 namespace math {
 template class SoftmaxFunctor<platform::CPUPlace, float>;
+template class SoftmaxGradFunctor<platform::CPUPlace, float>;
 }  // namespace math
 }  // namespace operators

--- a/paddle/operators/math/softmax.cu
+++ b/paddle/operators/math/softmax.cu
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
+Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
+you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
+Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
+distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
+See the License for the specific language governing permissions and
-   limitations under the License. */
+limitations under the License. */
 #define EIGEN_USE_GPU
@@ -21,6 +21,7 @@ namespace operators {
 namespace math {
 template class SoftmaxFunctor<platform::GPUPlace, float>;
+template class SoftmaxGradFunctor<platform::GPUPlace, float>;
 }  // namespace math
 }  // namespace operators

--- a/paddle/operators/math/softmax.h
+++ b/paddle/operators/math/softmax.h
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
+Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
+you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
+Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
+distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
+See the License for the specific language governing permissions and
-   limitations under the License. */
+limitations under the License. */
 #pragma once
 #include "paddle/framework/eigen.h"
@@ -36,7 +36,7 @@ struct ValueClip {
 template <typename Place, typename T>
 class SoftmaxFunctor {
 public:
-  void operator()(const framework::ExecutionContext& context,
+  void operator()(const platform::DeviceContext& context,
                  const framework::Tensor* X, framework::Tensor* Y) {
    auto logits = EigenMatrix<T>::From(*X);
    auto softmax = EigenMatrix<T>::From(*Y);
@@ -58,8 +58,8 @@ class SoftmaxFunctor {
                               .broadcast(one_by_class))
                              .unaryExpr(ValueClip<T>());
-    softmax.device(context.GetEigenDevice<Place>()) = shifted_logits.exp();
+    softmax.device(*context.GetEigenDevice<Place>()) = shifted_logits.exp();
-    softmax.device(context.GetEigenDevice<Place>()) =
+    softmax.device(*context.GetEigenDevice<Place>()) =
        (softmax *
         softmax.sum(along_class)
             .inverse()
@@ -68,6 +68,37 @@ class SoftmaxFunctor {
             .broadcast(one_by_class));
  }
 };
+template <typename Place, typename T>
+class SoftmaxGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor* y, const framework::Tensor* y_grad,
+                  framework::Tensor* x_grad) {
+    auto softmax = EigenMatrix<T>::From(*y);
+    auto softmax_grad = EigenMatrix<T>::From(*y_grad);
+    auto logits_grad = EigenMatrix<T>::From(*x_grad);
+    const int kBatchDim = 0;
+    const int kClassDim = 1;
+    const int batch_size = softmax.dimension(kBatchDim);
+    const int num_classes = softmax.dimension(kClassDim);
+    Eigen::DSizes<int, 1> along_class(kClassDim);
+    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+    auto dot = (softmax * softmax_grad)
+                   .sum(along_class)
+                   .eval()
+                   .reshape(batch_by_one)
+                   .broadcast(one_by_class);
+    logits_grad.device(*context.GetEigenDevice<Place>()) =
+        (softmax_grad - dot) * softmax;
+  }
+};
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 template <typename Place, typename T>
-class MeanKernel : public framework::OpKernel {
+class MeanKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* input = context.Input<Tensor>("X");
@@ -45,7 +45,7 @@ class MeanKernel : public framework::OpKernel {
 };
 template <typename Place, typename T>
-class MeanGradKernel : public framework::OpKernel {
+class MeanGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto OG = context.Input<Tensor>(framework::GradVarName("Out"));

--- a/paddle/operators/minus_op.h
+++ b/paddle/operators/minus_op.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 template <typename Place, typename T>
-class MinusKernel : public framework::OpKernel {
+class MinusKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* left_tensor = context.Input<framework::Tensor>("X");

--- a/paddle/operators/modified_huber_loss_op.cu
+++ b/paddle/operators/modified_huber_loss_op.cu
@@ -39,7 +39,7 @@ struct ModifiedHuberLossBackward {
 };
 template <typename T>
-class ModifiedHuberLossGradGPUKernel : public framework::OpKernel {
+class ModifiedHuberLossGradGPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in0 = context.Input<Tensor>("Y");

--- a/paddle/operators/modified_huber_loss_op.h
+++ b/paddle/operators/modified_huber_loss_op.h
@@ -47,7 +47,7 @@ struct ModifiedHuberLossForward {
 };
 template <typename Place, typename T>
-class ModifiedHuberLossKernel : public framework::OpKernel {
+class ModifiedHuberLossKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in0 = context.Input<Tensor>("X");
@@ -73,7 +73,7 @@ class ModifiedHuberLossKernel : public framework::OpKernel {
 // CPU backward kernel
 template <typename T>
-class ModifiedHuberLossGradCPUKernel : public framework::OpKernel {
+class ModifiedHuberLossGradCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in0 = context.Input<Tensor>("Y");

--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
+Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
+you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
+Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
+distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
+See the License for the specific language governing permissions and
-   limitations under the License. */
+limitations under the License. */
 #include "paddle/operators/mul_op.h"
@@ -35,12 +35,14 @@ class MulOp : public framework::OperatorWithKernel {
    int x_num_col_dims = ctx->Attrs().Get<int>("x_num_col_dims");
    int y_num_col_dims = ctx->Attrs().Get<int>("y_num_col_dims");
-    PADDLE_ENFORCE(x_dims.size() > x_num_col_dims,
+    PADDLE_ENFORCE_GT(
-                   "The rank of input tensor X should be larger than "
+        x_dims.size(), x_num_col_dims,
-                   "`mul_op`'s `x_num_col_dims`.");
+        "The input tensor X's rank of MulOp should be larger than "
-    PADDLE_ENFORCE(y_dims.size() > y_num_col_dims,
+        "x_num_col_dims.");
-                   "The rank of input tensor Y should be larger than "
+    PADDLE_ENFORCE_GT(
-                   "`mul_op`'s `y_num_col_dims`.");
+        y_dims.size(), y_num_col_dims,
+        "The input tensor Y's rank of MulOp should be larger than "
+        "y_num_col_dims.");
    auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
    auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);

--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename Place, typename T>
-class MulKernel : public framework::OpKernel {
+class MulKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* x = context.Input<Tensor>("X");
@@ -52,7 +52,7 @@ class MulKernel : public framework::OpKernel {
 };
 template <typename Place, typename T>
-class MulGradKernel : public framework::OpKernel {
+class MulGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");

--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -50,6 +50,11 @@ class MultiplexOp : public framework::OperatorWithKernel {
    }
    ctx->SetOutputDim("Out", in_dim);
  }
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type());
+  }
 };
 class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -99,6 +104,11 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
    }
    ctx->SetOutputsDim(framework::GradVarName("X"), d_ins);
  }
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type());
+  }
 };
 }  // namespace operators

--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
@@ -21,7 +21,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 template <typename Place, typename T>
-class MultiplexGPUKernel : public framework::OpKernel {
+class MultiplexGPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const {
    auto ins = ctx.MultiInput<Tensor>("X");
@@ -51,7 +51,7 @@ class MultiplexGPUKernel : public framework::OpKernel {
 };
 template <typename Place, typename T>
-class MultiplexGradGPUKernel : public framework::OpKernel {
+class MultiplexGradGPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const {
    auto* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));

--- a/paddle/operators/multiplex_op.h
+++ b/paddle/operators/multiplex_op.h
@@ -23,7 +23,7 @@ namespace paddle {
 namespace operators {
 template <typename Place, typename T>
-class MultiplexCPUKernel : public framework::OpKernel {
+class MultiplexCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const {
    auto ins = ctx.MultiInput<framework::Tensor>("X");
@@ -48,7 +48,7 @@ class MultiplexCPUKernel : public framework::OpKernel {
 };
 template <typename Place, typename T>
-class MultiplexGradCPUKernel : public framework::OpKernel {
+class MultiplexGradCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const {
    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));

--- a/paddle/operators/net_op.h
+++ b/paddle/operators/net_op.h
@@ -53,16 +53,6 @@ class NetOp : public framework::OperatorBase {
    this->CompleteAddOp();
  }
-  /**
-   * Infer all the operators' input and output variables' shapes, will be called
-   * before every mini-batch
-   */
-  void InferShape(const framework::Scope& scope) const override {
-    for (auto& op : ops_) {
-      op->InferShape(scope);
-    }
-  }
  /**
   * @brief Run the network.
   *

--- a/paddle/operators/net_op_test.cc
+++ b/paddle/operators/net_op_test.cc
@@ -7,14 +7,12 @@ namespace operators {
 using Scope = framework::Scope;
 using DeviceContext = platform::DeviceContext;
-static int infer_shape_cnt = 0;
 static int run_cnt = 0;
 class TestOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;
  DEFINE_OP_CLONE_METHOD(TestOp);
-  void InferShape(const Scope& scope) const override { ++infer_shape_cnt; }
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {
    ++run_cnt;

--- a/paddle/operators/pad_op.h
+++ b/paddle/operators/pad_op.h
@@ -47,7 +47,7 @@ void PadFunction(const framework::ExecutionContext& context) {
 }
 template <typename Place, typename T>
-class PadKernel : public framework::OpKernel {
+class PadKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    int rank = context.Input<Tensor>("X")->dims().size();
@@ -97,7 +97,7 @@ void PadGradFunction(const framework::ExecutionContext& context) {
 }
 template <typename Place, typename T>
-class PadGradKernel : public framework::OpKernel {
+class PadGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    size_t rank =

--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/pool_op.h"
+namespace paddle {
+namespace operators {
+int OutputSizePool(int input_size, int filter_size, int padding, int stride) {
+  int output_size = (input_size - filter_size + 2 * padding) / stride + 1;
+  return output_size;
+}
+class PoolOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "X(Input) of Pooling should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Out(Output) of Pooling should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    std::string pooling_type = ctx->Attrs().Get<std::string>("poolingType");
+    std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
+    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    PADDLE_ENFORCE(pooling_type == "max" || pooling_type == "avg",
+                   "pooling_type should be 'max' or 'avg'");
+    PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
+                   "Pooling intput should be 4-D or 5-D");
+    if (ctx->Attrs().Get<bool>("globalPooling")) {
+      ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
+      for (size_t i = 0; i < ksize.size(); ++i)
+        ksize[i] = static_cast<int>(in_x_dims[i + 2]);
+    }
+    PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U,
+                   "Input size and Pooling size should be consistent.");
+    PADDLE_ENFORCE(ksize.size() == 2 || ksize.size() == 3,
+                   "Pooling size should be 2 elements. or 3 elements.");
+    PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
+                      "strides size and pooling size should be the same.");
+    PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(),
+                      "paddings size and pooling size should be the same.");
+    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      output_shape.push_back(
+          OutputSizePool(in_x_dims[i + 2], ksize[i], paddings[i], strides[i]));
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+class PoolOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "X(Input) of Pooling should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Input@Grad of Pooling should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Pool2dOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "The input tensor of pooling operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddOutput("Out",
+              "The output tensor of pooling operator."
+              "The format of output tensor is also NCHW.");
+    AddAttr<std::string>("poolingType",
+                         "PoolingType of pooling operator."
+                         "Str constant equal to 'max' or 'avg'.")
+        .InEnum({"max", "avg"});
+    AddAttr<std::vector<int>>(
+        "ksize",
+        "Pooling size(depth, height, width) of pooling operator."
+        "If globalPooling = true, ksize is ignored and need not be "
+        "specified.");  // TODO(Add checker)
+    AddAttr<bool>(
+        "globalPooling",
+        "Whether to use the globalPooling."
+        "Bool constant equal to false or true."
+        "Default false."
+        "If globalPooling = true, ksize is ignored and need not be specified.")
+        .SetDefault(false);
+    AddAttr<std::vector<int>>("strides",
+                              "Strides(height, width) of pooling operator."
+                              "Default {1,1}")
+        .SetDefault({1, 1});  // TODO(Add checker)
+    AddAttr<std::vector<int>>("paddings",
+                              "Paddings(height, width) of pooling operator."
+                              "Default {0,0}.")
+        .SetDefault({0, 0});  // TODO(Add checker)
+    AddComment(R"DOC(
+The pooling2d operation calculates the output based on
+the input, poolingType and ksize, strides, paddings parameters.
+)DOC");
+  }
+};
+class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Pool3dOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The input tensor of pooling operator. "
+             "The format of input tensor is NCDHW. Where N is batch size, C is "
+             "the "
+             "number of channels, D, H and W is the depth, height and width of "
+             "feature.");
+    AddOutput("Out",
+              "The output tensor of pooling operator."
+              "The format of output tensor is also NCDHW.");
+    AddAttr<std::string>("poolingType",
+                         "PoolingType of pooling operator."
+                         "str constant equal to 'max' or 'avg'.")
+        .InEnum({"max", "avg"});
+    AddAttr<std::vector<int>>(
+        "ksize",
+        "Pooling size(depth, height, width) of pooling operator."
+        "If globalPooling = true, ksize is ignored and need not be "
+        "specified.");  // TODO(Add checker)
+    AddAttr<bool>(
+        "globalPooling",
+        "Whether to use the globalPooling."
+        "Bool constant equal to false or true."
+        "Default false."
+        "If globalPooling = true, ksize is ignored and need not be specified.")
+        .SetDefault(false);
+    AddAttr<std::vector<int>>(
+        "strides",
+        "Strides(depth, height, width) of pooling operator."
+        "Default {1,1,1}.")
+        .SetDefault({1, 1, 1});  // TODO(Add checker)
+    AddAttr<std::vector<int>>(
+        "paddings",
+        "Paddings(depth, height, width) of pooling operator."
+        "Default {0,0,0}.")
+        .SetDefault({0, 0, 0});  // TODO(Add checker)
+    AddComment(R"DOC(
+The pooling3d operation calculates the output based on
+the input, poolingType and ksize, strides, paddings parameters.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad,
+            ops::PoolOpGrad);
+REGISTER_OP_CPU_KERNEL(pool2d,
+                       ops::PoolKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(pool2d_grad,
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>)
+REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad,
+            ops::PoolOpGrad);
+REGISTER_OP_CPU_KERNEL(pool3d,
+                       ops::PoolKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(pool3d_grad,
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/pool_op.cu
+++ b/paddle/operators/pool_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/pool_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(pool2d,
+                       ops::PoolKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(pool2d_grad,
+                       ops::PoolGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(pool3d,
+                       ops::PoolKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(pool3d_grad,
+                       ops::PoolGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/pool_op.h
+++ b/paddle/operators/pool_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/pooling.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+template <typename Place, typename T>
+class PoolKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    Tensor* out = context.Output<Tensor>("Out");
+    std::string pooling_type = context.Attr<std::string>("poolingType");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    if (context.Attr<bool>("globalPooling")) {
+      for (size_t i = 0; i < ksize.size(); ++i) {
+        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
+      }
+    }
+    switch (ksize.size()) {
+      case 2: {
+        if (pooling_type == "max") {
+          paddle::operators::math::Pool2dFunctor<
+              Place, paddle::operators::math::MaxPool<T>, T>
+              pool2d_forward;
+          paddle::operators::math::MaxPool<T> pool_process;
+          pool2d_forward(context.device_context(), *in_x, *out, ksize, strides,
+                         paddings, pool_process);
+        } else if (pooling_type == "avg") {
+          paddle::operators::math::Pool2dFunctor<
+              Place, paddle::operators::math::AvgPool<T>, T>
+              pool2d_forward;
+          paddle::operators::math::AvgPool<T> pool_process;
+          pool2d_forward(context.device_context(), *in_x, *out, ksize, strides,
+                         paddings, pool_process);
+        }
+      } break;
+      case 3: {
+        if (pooling_type == "max") {
+          paddle::operators::math::Pool3dFunctor<
+              Place, paddle::operators::math::MaxPool<T>, T>
+              pool3d_forward;
+          paddle::operators::math::MaxPool<T> pool_process;
+          pool3d_forward(context.device_context(), *in_x, *out, ksize, strides,
+                         paddings, pool_process);
+        } else if (pooling_type == "avg") {
+          paddle::operators::math::Pool3dFunctor<
+              Place, paddle::operators::math::AvgPool<T>, T>
+              pool3d_forward;
+          paddle::operators::math::AvgPool<T> pool_process;
+          pool3d_forward(context.device_context(), *in_x, *out, ksize, strides,
+                         paddings, pool_process);
+        }
+      } break;
+    }
+  }
+};
+template <typename Place, typename T>
+class PoolGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    const Tensor* out = context.Input<Tensor>("Out");
+    const Tensor* out_grad =
+        context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+    std::string pooling_type = context.Attr<std::string>("poolingType");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    if (context.Attr<bool>("globalPooling")) {
+      for (size_t i = 0; i < ksize.size(); ++i)
+        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
+    }
+    if (in_x_grad) {
+      in_x_grad->mutable_data<T>(context.GetPlace());
+      auto temp = framework::EigenVector<T>::Flatten(*in_x_grad);
+      temp.device(context.GetEigenDevice<Place>()) =
+          temp.constant(static_cast<T>(0));
+      switch (ksize.size()) {
+        case 2: {
+          if (pooling_type == "max") {
+            paddle::operators::math::MaxPool2dGradFunctor<Place, T>
+                pool2d_backward;
+            pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out,
+                            *out_grad, ksize, strides, paddings);
+          } else if (pooling_type == "avg") {
+            paddle::operators::math::Pool2dGradFunctor<
+                Place, paddle::operators::math::AvgPoolGrad<T>, T>
+                pool2d_backward;
+            paddle::operators::math::AvgPoolGrad<T> pool_process;
+            pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out,
+                            *out_grad, ksize, strides, paddings, pool_process);
+          }
+        } break;
+        case 3: {
+          if (pooling_type == "max") {
+            paddle::operators::math::MaxPool3dGradFunctor<Place, T>
+                pool3d_backward;
+            pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out,
+                            *out_grad, ksize, strides, paddings);
+          } else if (pooling_type == "avg") {
+            paddle::operators::math::Pool3dGradFunctor<
+                Place, paddle::operators::math::AvgPoolGrad<T>, T>
+                pool3d_backward;
+            paddle::operators::math::AvgPoolGrad<T> pool_process;
+            pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out,
+                            *out_grad, ksize, strides, paddings, pool_process);
+          }
+        } break;
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/prelu_op.h
+++ b/paddle/operators/prelu_op.h
@@ -40,7 +40,7 @@ class PReluFunctor {
 };
 template <typename Place, typename T>
-class PReluKernel : public framework::OpKernel {
+class PReluKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* x = context.Input<Tensor>("X");
@@ -77,7 +77,7 @@ class PReluGradFunctor {
 };
 template <typename Place, typename T>
-class PReluGradKernel : public framework::OpKernel {
+class PReluGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));

--- a/paddle/operators/rank_loss_op.h
+++ b/paddle/operators/rank_loss_op.h
@@ -21,7 +21,7 @@ namespace paddle {
 namespace operators {
 template <typename Place, typename T>
-class RankLossKernel : public framework::OpKernel {
+class RankLossKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const {
    auto* out_t = ctx.Output<framework::Tensor>("Out");
@@ -42,7 +42,7 @@ class RankLossKernel : public framework::OpKernel {
 };
 template <typename Place, typename T>
-class RankLossGradKernel : public framework::OpKernel {
+class RankLossGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const {
    auto* d_left_t =

--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -28,29 +28,6 @@ using Variable = framework::Variable;
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
-void RecurrentAlgorithm::InferShape(const Scope& scope) const {
-  auto* input0 = scope.FindVar(arg_->inlinks[0]);
-  PADDLE_ENFORCE_NOT_NULL(input0);
-  seq_len_ = input0->GetMutable<LoDTensor>()->dims()[0];
-  PADDLE_ENFORCE_GT(seq_len_, 0);
-  CreateScopes(scope);
-  auto step_scopes = GetStepScopes(scope);
-  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
-                     true /*infer_shape_mode*/);
-  InitMemories(step_scopes[0], true /*infer_shape_mode*/);
-  for (size_t i = 0; i < seq_len_; i++) {
-    if (i > 0) {
-      rnn::LinkMemories(step_scopes, arg_->memories, i, -1,
-                        true /*infer_shape_mode*/);
-    }
-    (*stepnet_)->InferShape(*step_scopes[i]);
-  }
-  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
-                     true /*infer_shape_mode*/);
-}
 void RecurrentAlgorithm::Run(const Scope& scope,
                             const platform::DeviceContext& dev_ctx) const {
  auto step_scopes = GetStepScopes(scope);
@@ -202,24 +179,6 @@ void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
  }
 }
-void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const {
-  seq_len_ =
-      scope.FindVar(arg_->inlinks[0])->GetMutable<LoDTensor>()->dims()[0];
-  auto step_scopes = GetStepScopes(scope);
-  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
-                     true /*infer_shape_mode*/);
-  for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
-    if (static_cast<size_t>(step_id) != seq_len_ - 1) {
-      rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1,
-                        true /*infer_shape_mode*/);
-    }
-    (*stepnet_)->InferShape(*step_scopes[step_id]);
-  }
-  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
-                     true /*infer_shape_mode*/);
-  LinkBootMemoryGradients(step_scopes[0], true /*infer_shape_mode*/);
-}
 RecurrentGradientOp::RecurrentGradientOp(
    const std::string& type, const framework::VariableNameMap& inputs,
    const framework::VariableNameMap& outputs,

--- a/paddle/operators/recurrent_op.h
+++ b/paddle/operators/recurrent_op.h
@@ -41,11 +41,6 @@ class RecurrentAlgorithm {
    stepnet_ = stepnet;
  }
-  /**
-   * InferShape must be called before Run.
-   */
-  void InferShape(const framework::Scope& scope) const;
 protected:
  /*
   * The step scopes will be stored in the father scope as a variable.
@@ -94,11 +89,6 @@ class RecurrentGradientAlgorithm {
  void LinkBootMemoryGradients(framework::Scope* step_scopes,
                               bool infer_shape_mode) const;
-  /**
-   * InferShape must be called before Run.
-   */
-  void InferShape(const framework::Scope& scope) const;
 protected:
  inline const std::vector<framework::Scope*>& GetStepScopes(
      const framework::Scope& scope) const {
@@ -124,12 +114,6 @@ class RecurrentOp : public framework::OperatorBase {
    // TODO(yuyang18): Implement copy ctor well.
    PADDLE_THROW("Not implemented");
  }
-  /**
-   * InferShape must be called before Run.
-   */
-  void InferShape(const framework::Scope& scope) const override {
-    alg_.InferShape(scope);
-  }
  void Run(const framework::Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {
@@ -163,13 +147,6 @@ class RecurrentGradientOp : public framework::OperatorBase {
    PADDLE_THROW("Not Implemented");
  }
-  /**
-   * InferShape must be called before Run.
-   */
-  void InferShape(const framework::Scope& scope) const override {
-    alg_.InferShape(scope);
-  }
  void Run(const framework::Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {
    alg_.Run(scope, dev_ctx);

--- a/paddle/operators/reduce_op.cc
+++ b/paddle/operators/reduce_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/operators/reduce_op.h"
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+class ReduceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ReduceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ReduceOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_rank = x_dims.size();
+    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
+    int dim = ctx->Attrs().Get<int>("dim");
+    if (dim < 0) dim = x_rank + dim;
+    PADDLE_ENFORCE_LT(
+        dim, x_rank,
+        "The dim should be in the range [-rank(input), rank(input)).");
+    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
+    auto dims_vector = vectorize(x_dims);
+    if (keep_dim || x_rank == 1) {
+      dims_vector[dim] = 1;
+    } else {
+      dims_vector.erase(dims_vector.begin() + dim);
+    }
+    auto out_dims = framework::make_ddim(dims_vector);
+    ctx->SetOutputDim("Out", out_dims);
+    if (dim != 0) {
+      // Only pass LoD when not reducing on the first dim.
+      ctx->ShareLoD("X", /*->*/ "Out");
+    }
+  }
+};
+class ReduceGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_rank = x_dims.size();
+    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
+    int dim = ctx->Attrs().Get<int>("dim");
+    if (dim < 0) dim = x_rank + dim;
+    PADDLE_ENFORCE_LT(
+        dim, x_rank,
+        "The dim should be in the range [-rank(input), rank(input)).");
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+};
+class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ReduceOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor. Tensors with rank at most 6 are supported");
+    AddOutput("Out", "(Tensor) The result tensor.");
+    AddAttr<int>(
+        "dim",
+        "(int, default 1) The dimension to reduce. "
+        "Must be in the range [-rank(input), rank(input)). "
+        "If `dim < 0`, the dim to reduce is `rank + dim`. "
+        "Noting that reducing on the first dim will make the LoD info lost.")
+        .SetDefault(0);
+    AddAttr<bool>("keep_dim",
+                  "(bool, default false) "
+                  "If true, retain the reduced dimension with length 1.")
+        .SetDefault(false);
+    comment_ = R"DOC(
+{ReduceOP} operator computes the {reduce} of input tensor along the given dimension. 
+The result tensor has 1 fewer dimension than the input unless `keep_dim` is true.
+)DOC";
+    AddComment(comment_);
+  }
+ protected:
+  std::string comment_;
+  void Replace(std::string &src, std::string from, std::string to) {
+    std::size_t len_from = std::strlen(from.c_str());
+    std::size_t len_to = std::strlen(to.c_str());
+    for (std::size_t pos = src.find(from); pos != std::string::npos;
+         pos = src.find(from, pos + len_to)) {
+      src.replace(pos, len_from, to);
+    }
+  }
+  void SetComment(std::string name, std::string op) {
+    Replace(comment_, "{ReduceOP}", name);
+    Replace(comment_, "{reduce}", op);
+  }
+};
+class ReduceSumOpMaker : public ReduceOpMaker {
+ public:
+  ReduceSumOpMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
+      : ReduceOpMaker(proto, op_checker) {
+    SetComment("ReduceSum", "sum");
+    AddComment(comment_);
+  }
+};
+class ReduceMeanOpMaker : public ReduceOpMaker {
+ public:
+  ReduceMeanOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
+      : ReduceOpMaker(proto, op_checker) {
+    SetComment("ReduceMean", "mean");
+    AddComment(comment_);
+  }
+};
+class ReduceMaxOpMaker : public ReduceOpMaker {
+ public:
+  ReduceMaxOpMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
+      : ReduceOpMaker(proto, op_checker) {
+    SetComment("ReduceMax", "max");
+    AddComment(comment_);
+  }
+};
+class ReduceMinOpMaker : public ReduceOpMaker {
+ public:
+  ReduceMinOpMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
+      : ReduceOpMaker(proto, op_checker) {
+    SetComment("ReduceMin", "min");
+    AddComment(comment_);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP(reduce_sum, ops::ReduceOp, ops::ReduceSumOpMaker, reduce_sum_grad,
+            ops::ReduceGradOp);
+REGISTER_OP_CPU_KERNEL(
+    reduce_sum,
+    ops::ReduceKernel<paddle::platform::CPUPlace, float, ops::SumFunctor>);
+REGISTER_OP_CPU_KERNEL(reduce_sum_grad,
+                       ops::ReduceGradKernel<paddle::platform::CPUPlace, float,
+                                             ops::SumGradFunctor>);
+REGISTER_OP(reduce_mean, ops::ReduceOp, ops::ReduceMeanOpMaker,
+            reduce_mean_grad, ops::ReduceGradOp);
+REGISTER_OP_CPU_KERNEL(
+    reduce_mean,
+    ops::ReduceKernel<paddle::platform::CPUPlace, float, ops::MeanFunctor>);
+REGISTER_OP_CPU_KERNEL(reduce_mean_grad,
+                       ops::ReduceGradKernel<paddle::platform::CPUPlace, float,
+                                             ops::MeanGradFunctor>);
+REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad,
+            ops::ReduceGradOp);
+REGISTER_OP_CPU_KERNEL(
+    reduce_max,
+    ops::ReduceKernel<paddle::platform::CPUPlace, float, ops::MaxFunctor>);
+REGISTER_OP_CPU_KERNEL(reduce_max_grad,
+                       ops::ReduceGradKernel<paddle::platform::CPUPlace, float,
+                                             ops::MaxOrMinGradFunctor>);
+REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_min_grad,
+            ops::ReduceGradOp);
+REGISTER_OP_CPU_KERNEL(
+    reduce_min,
+    ops::ReduceKernel<paddle::platform::CPUPlace, float, ops::MinFunctor>);
+REGISTER_OP_CPU_KERNEL(reduce_min_grad,
+                       ops::ReduceGradKernel<paddle::platform::CPUPlace, float,
+                                             ops::MaxOrMinGradFunctor>);
--- a/paddle/operators/reduce_op.cu
+++ b/paddle/operators/reduce_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#define EIGEN_USE_GPU
+#include "paddle/operators/reduce_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    reduce_sum,
+    ops::ReduceKernel<paddle::platform::GPUPlace, float, ops::SumFunctor>);
+REGISTER_OP_GPU_KERNEL(reduce_sum_grad,
+                       ops::ReduceGradKernel<paddle::platform::GPUPlace, float,
+                                             ops::SumGradFunctor>);
+REGISTER_OP_GPU_KERNEL(
+    reduce_mean,
+    ops::ReduceKernel<paddle::platform::GPUPlace, float, ops::MeanFunctor>);
+REGISTER_OP_GPU_KERNEL(reduce_mean_grad,
+                       ops::ReduceGradKernel<paddle::platform::GPUPlace, float,
+                                             ops::MeanGradFunctor>);
+REGISTER_OP_GPU_KERNEL(
+    reduce_max,
+    ops::ReduceKernel<paddle::platform::GPUPlace, float, ops::MaxFunctor>);
+REGISTER_OP_GPU_KERNEL(reduce_max_grad,
+                       ops::ReduceGradKernel<paddle::platform::GPUPlace, float,
+                                             ops::MaxOrMinGradFunctor>);
+REGISTER_OP_GPU_KERNEL(
+    reduce_min,
+    ops::ReduceKernel<paddle::platform::GPUPlace, float, ops::MinFunctor>);
+REGISTER_OP_GPU_KERNEL(reduce_min_grad,
+                       ops::ReduceGradKernel<paddle::platform::GPUPlace, float,
+                                             ops::MaxOrMinGradFunctor>);
--- a/paddle/operators/reduce_op.h
+++ b/paddle/operators/reduce_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+struct SumFunctor {
+  template <typename Place, typename X, typename Y, typename Dim>
+  void operator()(const Place& place, X& x, Y& y, const Dim& dim) {
+    y.device(place) = x.sum(dim);
+  }
+};
+struct SumGradFunctor {
+  template <typename Place, typename X, typename Y, typename DX, typename DY,
+            typename Dim>
+  void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy,
+                  const Dim& dim, int size) {
+    dx.device(place) = dy.broadcast(dim);
+  }
+};
+struct MeanFunctor {
+  template <typename Place, typename X, typename Y, typename Dim>
+  void operator()(const Place& place, X& x, Y& y, const Dim& dim) {
+    y.device(place) = x.mean(dim);
+  }
+};
+struct MeanGradFunctor {
+  template <typename Place, typename X, typename Y, typename DX, typename DY,
+            typename Dim>
+  void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy,
+                  const Dim& dim, int size) {
+    dx.device(place) = dy.broadcast(dim) / dx.constant(size);
+  }
+};
+struct MaxFunctor {
+  template <typename Place, typename X, typename Y, typename Dim>
+  void operator()(const Place& place, X& x, Y& y, const Dim& dim) {
+    y.device(place) = x.maximum(dim);
+  }
+};
+struct MinFunctor {
+  template <typename Place, typename X, typename Y, typename Dim>
+  void operator()(const Place& place, X& x, Y& y, const Dim& dim) {
+    y.device(place) = x.minimum(dim);
+  }
+};
+struct MaxOrMinGradFunctor {
+  template <typename Place, typename X, typename Y, typename DX, typename DY,
+            typename Dim>
+  void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy,
+                  const Dim& dim, int size) {
+    auto equals = x == y.broadcast(dim);
+    auto ones = dx.constant(1);
+    auto zeros = dx.constant(0);
+    // If there are multiple minimum or maximum elements, the subgradient of
+    // each is the set [0, 1], and we pass gradient to all of them here.
+    dx.device(place) = dy.broadcast(dim) * equals.select(ones, zeros);
+  }
+};
+template <typename Place, typename T, typename Functor>
+class ReduceKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    int rank = context.Input<Tensor>("X")->dims().size();
+    switch (rank) {
+      case 1:
+        ReduceCompute<1>(context);
+        break;
+      case 2:
+        ReduceCompute<2>(context);
+        break;
+      case 3:
+        ReduceCompute<3>(context);
+        break;
+      case 4:
+        ReduceCompute<4>(context);
+        break;
+      case 5:
+        ReduceCompute<5>(context);
+        break;
+      case 6:
+        ReduceCompute<6>(context);
+        break;
+    }
+  }
+ private:
+  template <size_t D>
+  void ReduceCompute(const framework::ExecutionContext& context) const {
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+    output->mutable_data<T>(context.GetPlace());
+    auto x = EigenTensor<T, D>::From(*input);
+    auto x_rank = static_cast<int>(x.dimensions().size());
+    int dim = static_cast<int>(context.Attr<int>("dim"));
+    if (dim < 0) dim = x_rank + dim;
+    auto reduce_dim = Eigen::array<int, 1>({{dim}});
+    // construct the squeezed output tensor
+    bool keep_dim = context.Attr<bool>("keep_dim");
+    DDim dims = output->dims();
+    auto dims_vector = vectorize(dims);
+    if (keep_dim && x_rank > 1) {
+      dims_vector.erase(dims_vector.begin() + dim);
+      dims = framework::make_ddim(dims_vector);
+    }
+    auto out = EigenTensor < T, D == 1 ? 1 : (D - 1) > ::From(*output, dims);
+    auto& place = context.GetEigenDevice<Place>();
+    Functor functor;
+    functor(place, x, out, reduce_dim);
+  }
+};
+template <typename Place, typename T, typename Functor>
+class ReduceGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    int rank = context.Input<Tensor>("X")->dims().size();
+    switch (rank) {
+      case 1:
+        ReduceGradCompute<1>(context);
+        break;
+      case 2:
+        ReduceGradCompute<2>(context);
+        break;
+      case 3:
+        ReduceGradCompute<3>(context);
+        break;
+      case 4:
+        ReduceGradCompute<4>(context);
+        break;
+      case 5:
+        ReduceGradCompute<5>(context);
+        break;
+      case 6:
+        ReduceGradCompute<6>(context);
+        break;
+    }
+  }
+ private:
+  template <size_t D>
+  void ReduceGradCompute(const framework::ExecutionContext& context) const {
+    auto* input0 = context.Input<Tensor>("X");
+    auto* input1 = context.Input<Tensor>("Out");
+    auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* output = context.Output<Tensor>(framework::GradVarName("X"));
+    output->mutable_data<T>(context.GetPlace());
+    auto x = EigenTensor<T, D>::From(*input0);
+    auto x_grad = EigenTensor<T, D>::From(*output);
+    auto x_rank = static_cast<int>(x.dimensions().size());
+    int dim = static_cast<int>(context.Attr<int>("dim"));
+    if (dim < 0) dim = x_rank + dim;
+    DDim dims = input0->dims();
+    dims[dim] = 1;
+    auto x_reduce = EigenTensor<T, D>::From(*input1, dims);
+    auto x_reduce_grad = EigenTensor<T, D>::From(*input2, dims);
+    Eigen::array<int, D> braodcast_dim;
+    for (size_t i = 0; i < D; ++i) braodcast_dim[i] = 1;
+    braodcast_dim[dim] = input0->dims()[dim];
+    auto& place = context.GetEigenDevice<Place>();
+    Functor functor;
+    functor(place, x, x_reduce, x_grad, x_reduce_grad, braodcast_dim,
+            braodcast_dim[dim]);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/reshape_op.h
+++ b/paddle/operators/reshape_op.h
@@ -21,7 +21,7 @@ namespace paddle {
 namespace operators {
 template <typename Place, typename T>
-class ReshapeKernel : public framework::OpKernel {
+class ReshapeKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const {
    auto* out = ctx.Output<framework::Tensor>("Out");
@@ -39,7 +39,7 @@ class ReshapeKernel : public framework::OpKernel {
 };
 template <typename Place, typename T>
-class ReshapeGradKernel : public framework::OpKernel {
+class ReshapeGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const {
    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));

--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-#include "paddle/operators/rowwise_add_op.h"
-namespace paddle {
-namespace operators {
-using framework::Tensor;
-class RowwiseAddOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of RowwiseAddOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("b"),
-                   "Input(b) of RowwiseAddOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of RowwiseAddOp should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-    auto b_dims = ctx->GetInputDim("b");
-    PADDLE_ENFORCE_GT(
-        x_dims.size(), b_dims.size(),
-        "The rank of input `X` must be larger than the one of input `b`.");
-    int num_col_dims = x_dims.size() - b_dims.size();
-    PADDLE_ENFORCE_EQ(
-        framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims,
-        "The width of two operands must be same");
-    PADDLE_ENFORCE_EQ(ctx->Outputs("Out").size(), 1,
-                      "The output size must be 1");
-    ctx->SetOutputDim("Out", x_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-class RowwiseAddOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  RowwiseAddOpMaker(framework::OpProto* proto,
-                    framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The left input of row-wise add op, must be matrix");
-    AddInput("b", "The right input of row-wise add op, must be vector");
-    AddOutput("Out", "The output of row-wise add op");
-    AddComment(R"DOC(Row-wise Add operator
-for i in xrange(X.shape[0]):
-  Out = X[i] + b
-)DOC");
-  }
-};
-class RowwiseAddGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "X should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("b"), "b should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx->GetInputDim("X");
-    auto b_dims = ctx->GetInputDim("b");
-    PADDLE_ENFORCE_GT(
-        x_dims.size(), b_dims.size(),
-        "The rank of input `X` must be larger than the one of input `b`.");
-    int64_t num_col_dims = x_dims.size() - b_dims.size();
-    PADDLE_ENFORCE_EQ(
-        framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims,
-        "The width of two operands must be same");
-    auto x_grad_name = framework::GradVarName("X");
-    auto b_grad_name = framework::GradVarName("b");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-    }
-    if (ctx->HasOutput(b_grad_name)) {
-      ctx->SetOutputDim(b_grad_name, b_dims);
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-REGISTER_OP(rowwise_add, ops::RowwiseAddOp, ops::RowwiseAddOpMaker,
-            rowwise_add_grad, ops::RowwiseAddGradOp);
-REGISTER_OP_CPU_KERNEL(
-    rowwise_add, ops::RowwiseAddKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    rowwise_add_grad,
-    ops::RowwiseAddGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-template <typename Place, typename T>
-class RowwiseAddKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto out = context.Output<Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-    int num_col_dims = context.Input<Tensor>("X")->dims().size() -
-                       context.Input<Tensor>("b")->dims().size();
-    auto input =
-        EigenMatrix<T>::Reshape(*context.Input<Tensor>("X"), num_col_dims);
-    auto bias = EigenVector<T>::Flatten(*context.Input<Tensor>("b"));
-    auto output = EigenMatrix<T>::Reshape(*out, num_col_dims);
-    const int bias_size = bias.dimension(0);
-    const int rest_size = input.size() / bias_size;
-    Eigen::DSizes<int, 1> one_d(input.size());
-    Eigen::DSizes<int, 1> bcast(rest_size);
-    output.reshape(one_d).device(context.GetEigenDevice<Place>()) =
-        input.reshape(one_d) + bias.broadcast(bcast).reshape(one_d);
-  }
-};
-template <typename Place, typename T>
-class RowwiseAddGradKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
-    auto* db = context.Output<Tensor>(framework::GradVarName("b"));
-    int num_col_dims = context.Input<Tensor>("X")->dims().size() -
-                       context.Input<Tensor>("b")->dims().size();
-    auto out_grad = EigenMatrix<T>::Reshape(*dout, num_col_dims);
-    auto place = context.GetEigenDevice<Place>();
-    if (dx) {
-      dx->mutable_data<T>(context.GetPlace());
-      EigenMatrix<T>::Reshape(*dx, num_col_dims).device(place) = out_grad;
-    }
-    if (db) {
-      db->mutable_data<T>(context.GetPlace());
-      // https://eigen.tuxfamily.org/dox/unsupported/TensorBase_8h_source.html
-      // colwise add
-      Eigen::array<int, 1> dims{{0}}; /* dimension to reduce */
-      EigenVector<T>::Flatten(*db).device(place) = out_grad.sum(dims);
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/operators/scale_op.h
+++ b/paddle/operators/scale_op.h
@@ -20,7 +20,7 @@
 namespace paddle {
 namespace operators {
 template <typename Place, typename T, typename AttrType = T>
-class ScaleKernel : public framework::OpKernel {
+class ScaleKernel : public framework::OpKernel<T> {
 public:
  virtual void Compute(const framework::ExecutionContext& context) const {
    auto* tensor = context.Output<framework::Tensor>("Out");

--- a/paddle/operators/scatter.h
+++ b/paddle/operators/scatter.h
@@ -78,10 +78,6 @@ void ScatterUpdate(const platform::Place& place,
  for (int i = 1; i < src_dims.size(); i++)
    PADDLE_ENFORCE(src_dims[i] == dst_dims[i]);
-  // slice size
-  size_t slice_size = 1;
-  for (int i = 0; i < src_dims.size(); ++i) slice_size *= src_dims[i];
  if (platform::is_cpu_place(place)) {
    CPUScatterUpdate<T>(src, index->data<int>(), index_size, output);
  } else {

--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
@@ -48,6 +48,11 @@ class ScatterOp : public framework::OperatorWithKernel {
    }
    ctx->SetOutputDim("Out", ref_dims);
  }
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("Ref")->type());
+  }
 };
 class ScatterGradOp : public framework::OperatorWithKernel {
@@ -60,6 +65,11 @@ class ScatterGradOp : public framework::OperatorWithKernel {
                      ctx->GetInputDim("Updates"));
    ctx->SetOutputDim(framework::GradVarName("Ref"), ctx->GetInputDim("Ref"));
  }
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("Ref")->type());
+  }
 };
 class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {

--- a/paddle/operators/scatter_op.h
+++ b/paddle/operators/scatter_op.h
@@ -24,7 +24,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 template <typename Place, typename T>
-class ScatterOpKernel : public framework::OpKernel {
+class ScatterOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    auto *Ref = ctx.Input<Tensor>("Ref");
@@ -40,7 +40,7 @@ class ScatterOpKernel : public framework::OpKernel {
 };
 template <typename Place, typename T>
-class ScatterGradientOpKernel : public framework::OpKernel {
+class ScatterGradientOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));

--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -24,9 +24,9 @@ class SequencePoolOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(framework::InferShapeContextBase* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SequenceAvgPoolOp should not be null.");
+                   "Input(X) of SequencePoolOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SequenceAvgPoolOp should not be null.");
+                   "Output(Out) of SequencePoolOp should not be null.");
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
  }
 };

--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
@@ -38,7 +38,7 @@ enum SeqPoolType {
 };
 template <typename Place, typename T>
-class SequencePoolKernel : public framework::OpKernel {
+class SequencePoolKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in = context.Input<LoDTensor>("X");
@@ -85,7 +85,7 @@ class SequencePoolKernel : public framework::OpKernel {
 };
 template <typename Place, typename T>
-class SequencePoolGradKernel : public framework::OpKernel {
+class SequencePoolGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in = context.Input<LoDTensor>("X");

--- a/paddle/operators/sequence_softmax_op.cc
+++ b/paddle/operators/sequence_softmax_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/sequence_softmax_op.h"
+namespace paddle {
+namespace operators {
+class SequenceSoftmaxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceSoftmaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceSoftmaxOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SequenceSoftmaxOpMaker(framework::OpProto* proto,
+                         framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor) 1-D or 2-D input LoDTensor with the 2-nd dimension "
+             "of length 1.");
+    AddOutput("Out",
+              "(LoDTensor) 1-D or 2-D output LoDTensor with the 2-nd dimension "
+              "of length 1.");
+    AddComment(R"DOC(
+SequenceSoftmaxOp computes softmax activation among all time-steps for each
+sequence. The dimension of each time-step should be 1. Thus, the shape of
+input Tensor can be either [N, 1] or [N], where N is the sum of all sequences'
+lengths.
+Equation:
+    for i-th sequence in a mini-batch:
+        Out(X[lod[i]:lod[i+1]], :) =
+            exp(X[lod[i]:lod[i+1], :]) / sum(exp(X[lod[i]:lod[i+1], :]))
+For example, for a mini-batch of 3 sequences with variable-length,
+each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7],
+then softmax will be computed among X[0:2, :], X[2:5, :], X[5:7, :]
+and N turns out to be 7.
+)DOC");
+  }
+};
+class SequenceSoftmaxGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Out"),
+                   "Input(Out) of SequenceSoftmaxGradOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput(framework::GradVarName("Out")),
+        "Input(Out@GRAD) of SequenceSoftmaxGradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceSoftmaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) of SequenceSoftmaxOp should not be null.");
+    PADDLE_ENFORCE_EQ(
+        ctx->GetInputDim("Out"),
+        ctx->GetInputDim(framework::GradVarName("Out")),
+        "Input(Out) and Input(Out@GRAD) of SequenceSoftmaxGradOp should be of "
+        "the same shape.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp,
+            ops::SequenceSoftmaxOpMaker, sequence_softmax_grad,
+            ops::SequenceSoftmaxGradOp);
+REGISTER_OP_CPU_KERNEL(
+    sequence_softmax,
+    ops::SequenceSoftmaxKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_softmax_grad,
+    ops::SequenceSoftmaxGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/sequence_softmax_op.cu
+++ b/paddle/operators/sequence_softmax_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#define EIGEN_USE_GPU
+#include "paddle/operators/sequence_softmax_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    sequence_softmax,
+    ops::SequenceSoftmaxKernel<paddle::platform::GPUPlace, float>)
+REGISTER_OP_GPU_KERNEL(
+    sequence_softmax_grad,
+    ops::SequenceSoftmaxGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/sequence_softmax_op.h
+++ b/paddle/operators/sequence_softmax_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/softmax.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+template <typename Place, typename T>
+class SequenceSoftmaxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<LoDTensor>("X");
+    auto* out = ctx.Output<LoDTensor>("Out");
+    auto lod = x->lod();
+    auto dims = x->dims();
+    const size_t level = lod.size() - 1;
+    PADDLE_ENFORCE_EQ(dims[0], static_cast<int64_t>(lod[level].back()),
+                      "The first dimension of Input(X) should be equal to the "
+                      "sum of all sequences' lengths.");
+    PADDLE_ENFORCE_EQ(dims[0], x->numel(),
+                      "The width of each timestep in Input(X) of "
+                      "SequenceSoftmaxOp should be 1.");
+    out->mutable_data<T>(ctx.GetPlace());
+    for (int i = 0; i < static_cast<int>(lod[level].size()) - 1; ++i) {
+      int start_pos = static_cast<int>(lod[level][i]);
+      int end_pos = static_cast<int>(lod[level][i + 1]);
+      Tensor x_i = x->Slice<T>(start_pos, end_pos);
+      Tensor out_i = out->Slice<T>(start_pos, end_pos);
+      // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
+      framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos});
+      x_i.Resize(dims_i);
+      out_i.Resize(dims_i);
+      math::SoftmaxFunctor<Place, T>()(ctx.device_context(), &x_i, &out_i);
+    }
+  }
+};
+template <typename Place, typename T>
+class SequenceSoftmaxGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out = ctx.Input<LoDTensor>("Out");
+    auto* out_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* x = ctx.Input<LoDTensor>("X");
+    auto* x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+    auto lod = x->lod();
+    const size_t level = lod.size() - 1;
+    x_grad->mutable_data<T>(ctx.GetPlace());
+    for (int i = 0; i < static_cast<int>(lod[level].size()) - 1; ++i) {
+      int start_pos = static_cast<int>(lod[level][i]);
+      int end_pos = static_cast<int>(lod[level][i + 1]);
+      Tensor out_i = out->Slice<T>(start_pos, end_pos);
+      Tensor out_grad_i = out_grad->Slice<T>(start_pos, end_pos);
+      Tensor x_grad_i = x_grad->Slice<T>(start_pos, end_pos);
+      // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
+      framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos});
+      out_i.Resize(dims_i);
+      out_grad_i.Resize(dims_i);
+      x_grad_i.Resize(dims_i);
+      math::SoftmaxGradFunctor<Place, T>()(ctx.device_context(), &out_i,
+                                           &out_grad_i, &x_grad_i);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@@ -25,7 +25,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 template <typename Place, typename T>
-class SGDOpKernel : public framework::OpKernel {
+class SGDOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto param = ctx.Input<Tensor>("param");

--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/operators/sigmoid_cross_entropy_with_logits_op.h"
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto labels_dims = ctx->GetInputDim("Labels");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(labels_dims.size(), 2,
+                      "Input(Labels)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0],
+                      "The 1st dimension of Input(X) and Input(Labels) should "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1],
+                      "The 2nd dimension of Input(X) and Input(Labels) should "
+                      "be equal.");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+class SigmoidCrossEntropyWithLogitsGradOp
+    : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shoudl be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should be not null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto labels_dims = ctx->GetInputDim("Labels");
+    auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(labels_dims.size(), 2,
+                      "Input(Labels)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(dout_dims.size(), 2,
+                      "Input(Out@Grad)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0],
+                      "The 1st dimension of Input(X) and Input(Labels) should "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1],
+                      "The 2nd dimension of Input(X) and Input(Labels) should "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(x_dims[0], dout_dims[0],
+                      "The 1st dimension of Input(X) and Input(Out@Grad) "
+                      "should be equal.");
+    PADDLE_ENFORCE_EQ(x_dims[1], dout_dims[1],
+                      "The 2nd dimension of Input(X) and Input(Out@Grad) "
+                      "should be equal.");
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+  }
+};
+class SigmoidCrossEntropyWithLogitsOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  SigmoidCrossEntropyWithLogitsOpMaker(framework::OpProto* proto,
+                                       framework::OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, "
+             "where N is the batch size and D is the number of classes. "
+             "This input is a tensor of logits computed by the previous "
+             " operator. Logits are unscaled log probabilities given as "
+             "log(p/(1-p)).");
+    AddInput("Labels",
+             "(Tensor, default Tensor<float>), a 2-D tensor of the same type "
+             "and shape as X. This input is a tensor of probabalistic labels "
+             "for each logit");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D "
+              " of elementwise logistic losses.");
+    AddComment(R"DOC(
+SigmoidCrossEntropyWithLogits Operator.
+This measures the elementwise probability error in discrete classification tasks
+in which each class is independent. This can be thought of as predicting labels
+for a data-point that are not mutually exclusive. For example, a news article
+can be about politics, technology or sports at the same time or none of these.
+The logistic loss is given as follows:
+       loss = -Labels * log(sigmoid(X)) - (1 - Labels) * log(1 - sigmoid(X))
+We know that sigmoid(X) = (1 / (1 + exp(-X))). By substituting this we get
+       loss = X - X * Labels + log(1 + exp(-X))
+For stability and to prevent overflow of exp(-X) when X < 0,
+we can reformulate the loss as follows:
+       loss = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
+Both the input `X` and `Labels` can carry the LoD (Level of Details) information.
+However the output only shares the LoD with input `X`.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP(sigmoid_cross_entropy_with_logits,
+            ops::SigmoidCrossEntropyWithLogitsOp,
+            ops::SigmoidCrossEntropyWithLogitsOpMaker,
+            sigmoid_cross_entropy_with_logits_grad,
+            ops::SigmoidCrossEntropyWithLogitsGradOp);
+REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits,
+                       ops::SigmoidCrossEntropyWithLogitsKernel<
+                           paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad,
+                       ops::SigmoidCrossEntropyWithLogitsGradKernel<
+                           paddle::platform::CPUPlace, float>);
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
@@ -13,11 +13,12 @@
   limitations under the License. */
 #define EIGEN_USE_GPU
-#include "paddle/operators/rowwise_add_op.h"
+#include "paddle/operators/sigmoid_cross_entropy_with_logits_op.h"
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_GPU_KERNEL(sigmoid_cross_entropy_with_logits,
-    rowwise_add, ops::RowwiseAddKernel<paddle::platform::GPUPlace, float>);
+                       ops::SigmoidCrossEntropyWithLogitsKernel<
-REGISTER_OP_GPU_KERNEL(
+                           paddle::platform::GPUPlace, float>);
-    rowwise_add_grad,
+REGISTER_OP_GPU_KERNEL(sigmoid_cross_entropy_with_logits_grad,
-    ops::RowwiseAddGradKernel<paddle::platform::GPUPlace, float>);
+                       ops::SigmoidCrossEntropyWithLogitsGradKernel<
+                           paddle::platform::GPUPlace, float>);
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
+template <typename Place, typename T>
+class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const framework::Tensor *X = context.Input<framework::Tensor>("X");
+    const framework::Tensor *Labels =
+        context.Input<framework::Tensor>("Labels");
+    framework::Tensor *Out = context.Output<framework::Tensor>("Out");
+    Out->mutable_data<T>(context.GetPlace());
+    auto x = framework::EigenVector<T>::Flatten(*X);
+    auto labels = framework::EigenVector<T>::Flatten(*Labels);
+    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto place = context.GetEigenDevice<Place>();
+    // term1 = max(x, 0)
+    auto term1 = x.cwiseMax(static_cast<T>(0));
+    // term2 = x * labels
+    auto term2 = x * labels;
+    // term3 = log(1 + exp(-abs(x)))
+    auto term3 = (static_cast<T>(1) + (-(x.abs())).exp()).log();
+    out.device(place) = term1 - term2 + term3;
+  }
+};
+// dX = sigmoid(X) - labels
+template <typename Place, typename T>
+class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const framework::Tensor *X = context.Input<framework::Tensor>("X");
+    const framework::Tensor *Labels =
+        context.Input<framework::Tensor>("Labels");
+    const framework::Tensor *dOut =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    framework::Tensor *dX =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    dX->mutable_data<T>(context.GetPlace());
+    auto x = framework::EigenVector<T>::Flatten(*X);
+    auto labels = framework::EigenVector<T>::Flatten(*Labels);
+    auto dout = framework::EigenVector<T>::Flatten(*dOut);
+    auto dx = framework::EigenVector<T>::Flatten(*dX);
+    auto place = context.GetEigenDevice<Place>();
+    auto sigmoid_x = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
+    dx.device(place) = dout * (sigmoid_x - labels);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/smooth_l1_loss_op.h
+++ b/paddle/operators/smooth_l1_loss_op.h
@@ -45,7 +45,7 @@ struct SmoothL1LossForward {
 };
 template <typename Place, typename T, typename AttrType = T>
-class SmoothL1LossKernel : public framework::OpKernel {
+class SmoothL1LossKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in0 = context.Input<Tensor>("X");
@@ -115,7 +115,7 @@ struct SmoothL1LossBackward {
 };
 template <typename Place, typename T, typename AttrType = T>
-class SmoothL1LossGradKernel : public framework::OpKernel {
+class SmoothL1LossGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in0 = context.Input<Tensor>("InsideWeight");

--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -26,46 +26,31 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename Place, typename T>
-class SoftmaxKernel : public framework::OpKernel {
+class SoftmaxKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    auto X = context.Input<Tensor>("X");
+    auto* X = context.Input<Tensor>("X");
-    auto Y = context.Output<Tensor>("Y");
+    auto* Y = context.Output<Tensor>("Y");
    // allocate memory on device.
    Y->mutable_data<T>(context.GetPlace());
-    math::SoftmaxFunctor<Place, T>()(context, X, Y);
+    math::SoftmaxFunctor<Place, T>()(context.device_context(), X, Y);
  }
 };
 template <typename Place, typename T>
-class SoftmaxGradKernel : public framework::OpKernel {
+class SoftmaxGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    auto Y = context.Input<Tensor>("Y");
+    auto* Y = context.Input<Tensor>("Y");
-    auto dY = context.Input<Tensor>(framework::GradVarName("Y"));
+    auto* dY = context.Input<Tensor>(framework::GradVarName("Y"));
-    auto dX = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* dX = context.Output<Tensor>(framework::GradVarName("X"));
-    dX->mutable_data<T>(context.GetPlace());
-    const int batch_size = Y->dims()[0];
-    const int class_num = Y->dims()[1];
-    Eigen::DSizes<int, 1> along_class(1);
-    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
-    Eigen::DSizes<int, 2> one_by_class(1, class_num);
-    auto Y_eigen = EigenMatrix<T>::From(*Y);
+    // allocate memory on device.
-    auto dY_eigen = EigenMatrix<T>::From(*dY);
+    dX->mutable_data<T>(context.GetPlace());
-    auto dX_eigen = EigenMatrix<T>::From(*dX);
-    auto place = context.GetEigenDevice<Place>();
-    auto dot = (Y_eigen * dY_eigen)
+    math::SoftmaxGradFunctor<Place, T>()(context.device_context(), Y, dY, dX);
-                   .sum(along_class)
-                   .eval()
-                   .reshape(batch_by_one)
-                   .broadcast(one_by_class);
-    dX_eigen.device(place) = (dY_eigen - dot) * Y_eigen;
  }
 };

--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
@@ -13,6 +13,7 @@
   limitations under the License. */
 #include "paddle/operators/softmax_with_cross_entropy_op.h"
+#include <paddle/function/TensorType.h>
 namespace paddle {
 namespace operators {
@@ -115,6 +116,11 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
    ctx->ShareLoD("Logits", /*->*/ "Softmax");
    ctx->ShareLoD("Logits", /*->*/ "Loss");
  }
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("Logits")->type());
+  }
 };
 class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
@@ -149,6 +155,12 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
    ctx->SetOutputDim(framework::GradVarName("Logits"),
                      ctx->GetInputDim("Softmax"));
  }
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(
+        ctx.Input<Tensor>(framework::GradVarName("Loss"))->type());
+  }
 };
 }  // namespace operators

--- a/paddle/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/operators/softmax_with_cross_entropy_op.cu
@@ -53,7 +53,7 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
 }  // namespace
 template <typename T>
-class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel {
+class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
@@ -66,14 +66,16 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel {
    softmax->mutable_data<T>(context.GetPlace());
    loss->mutable_data<T>(context.GetPlace());
-    math::SoftmaxFunctor<platform::GPUPlace, T>()(context, logits, softmax);
+    math::SoftmaxFunctor<platform::GPUPlace, T>()(context.device_context(),
+                                                  logits, softmax);
    math::CrossEntropyFunctor<platform::GPUPlace, T>()(
-        context, loss, softmax, labels, context.Attr<bool>("softLabel"));
+        context.device_context(), loss, softmax, labels,
+        context.Attr<bool>("softLabel"));
  }
 };
 template <typename T>
-class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel {
+class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),

--- a/paddle/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/operators/softmax_with_cross_entropy_op.h
@@ -27,7 +27,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename T>
-class SoftmaxWithCrossEntropyKernel : public framework::OpKernel {
+class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()),
@@ -40,14 +40,16 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel {
    softmax->mutable_data<T>(context.GetPlace());
    loss->mutable_data<T>(context.GetPlace());
-    math::SoftmaxFunctor<platform::CPUPlace, T>()(context, logits, softmax);
+    math::SoftmaxFunctor<platform::CPUPlace, T>()(context.device_context(),
+                                                  logits, softmax);
    math::CrossEntropyFunctor<platform::CPUPlace, T>()(
-        context, loss, softmax, labels, context.Attr<bool>("softLabel"));
+        context.device_context(), loss, softmax, labels,
+        context.Attr<bool>("softLabel"));
  }
 };
 template <typename T>
-class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel {
+class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* out_grad =

--- a/paddle/operators/split_op.h
+++ b/paddle/operators/split_op.h
@@ -22,7 +22,7 @@ namespace paddle {
 namespace operators {
 template <typename Place, typename T>
-class SplitOpKernel : public framework::OpKernel {
+class SplitOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<framework::Tensor>("X");

--- a/paddle/operators/squared_l2_distance_op.h
+++ b/paddle/operators/squared_l2_distance_op.h
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename Place, typename T>
-class SquaredL2DistanceKernel : public framework::OpKernel {
+class SquaredL2DistanceKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in0 = context.Input<Tensor>("X");
@@ -68,7 +68,7 @@ class SquaredL2DistanceKernel : public framework::OpKernel {
 };
 template <typename Place, typename T>
-class SquaredL2DistanceGradKernel : public framework::OpKernel {
+class SquaredL2DistanceGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in0 = context.Input<Tensor>("sub_result");

--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
@@ -22,7 +22,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 template <typename Place, typename T>
-class SumKernel : public framework::OpKernel {
+class SumKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto ins = context.MultiInput<Tensor>("X");
@@ -43,7 +43,7 @@ class SumKernel : public framework::OpKernel {
 };
 template <typename Place, typename T>
-class SumGradKernel : public framework::OpKernel {
+class SumGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* input = context.Input<Tensor>(framework::GradVarName("Out"));

--- a/paddle/operators/top_k_op.cu
+++ b/paddle/operators/top_k_op.cu
@@ -279,7 +279,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int* indices,
 }
 template <typename T>
-class TopkOpCUDAKernel : public framework::OpKernel {
+class TopkOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),

--- a/paddle/operators/top_k_op.h
+++ b/paddle/operators/top_k_op.h
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename Place, typename T>
-class TopkKernel : public framework::OpKernel {
+class TopkKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Get the top k elements of each row of input tensor

--- a/paddle/operators/transpose_op.h
+++ b/paddle/operators/transpose_op.h
@@ -38,7 +38,7 @@ void EigenTranspose(const framework::ExecutionContext& context,
 }
 template <typename Place, typename T>
-class TransposeKernel : public framework::OpKernel {
+class TransposeKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* x = context.Input<framework::Tensor>("X");
@@ -73,7 +73,7 @@ class TransposeKernel : public framework::OpKernel {
 };
 template <typename Place, typename T>
-class TransposeGradKernel : public framework::OpKernel {
+class TransposeGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* out_grad =

--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -21,7 +21,7 @@ namespace operators {
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
 template <typename T>
-class CPUUniformRandomKernel : public framework::OpKernel {
+class CPUUniformRandomKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* tensor = ctx.Output<framework::Tensor>("Out");
@@ -62,6 +62,11 @@ class UniformRandomOp : public framework::OperatorWithKernel {
    }
    ctx->SetOutputDim("Out", framework::make_ddim(temp));
  }
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return static_cast<framework::DataType>(Attr<int>("data_type"));
+  }
 };
 class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -80,6 +85,8 @@ Used to initialize tensor with uniform random generator.
                 "Random seed of uniform random. "
                 "0 means generate a seed by system")
        .SetDefault(0);
+    AddAttr<int>("data_type", "output tensor data type")
+        .SetDefault(framework::DataType::FP32);
  }
 };
 }  // namespace operators

--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
@@ -40,7 +40,7 @@ struct UniformGenerator {
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
 template <typename T>
-class GPUUniformRandomKernel : public framework::OpKernel {
+class GPUUniformRandomKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* tensor = context.Output<framework::Tensor>("Out");

--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -16,8 +16,8 @@ namespace paddle {
 namespace platform {
 template <>
-Eigen::DefaultDevice* DeviceContext::get_eigen_device<Eigen::DefaultDevice>()
+Eigen::DefaultDevice* DeviceContext::GetEigenDevice<
-    const {
+    platform::CPUPlace, Eigen::DefaultDevice>() const {
  return reinterpret_cast<const CPUDeviceContext*>(this)->eigen_device();
 }
@@ -37,6 +37,12 @@ Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
 #ifndef PADDLE_ONLY_CPU
+template <>
+Eigen::GpuDevice*
+DeviceContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
+  return reinterpret_cast<const CUDADeviceContext*>(this)->eigen_device();
+}
 class EigenCudaStreamDevice : public Eigen::StreamInterface {
 public:
  EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
@@ -90,11 +96,6 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
  mutable unsigned int* semaphore_;
 };
-template <>
-Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {
-  return reinterpret_cast<const CUDADeviceContext*>(this)->eigen_device();
-}
 CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
  SetDeviceId(place_.device);
  PADDLE_ENFORCE(cudaStreamCreate(&stream_));

--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -27,13 +27,23 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
+template <typename T>
+struct EigenDeviceConverter;
+template <>
+struct EigenDeviceConverter<platform::CPUPlace> {
+  using EigenDeviceType = Eigen::DefaultDevice;
+};
 class DeviceContext {
 public:
  virtual ~DeviceContext() {}
  virtual Place GetPlace() const = 0;
-  template <typename DeviceType>
+  template <typename PlaceType,
-  DeviceType* get_eigen_device() const;
+            typename DeviceType =
+                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
+  DeviceType* GetEigenDevice() const;
  virtual void Wait() const {}
 };
@@ -52,6 +62,11 @@ class CPUDeviceContext : public DeviceContext {
 };
 #ifndef PADDLE_ONLY_CPU
+template <>
+struct EigenDeviceConverter<platform::GPUPlace> {
+  using EigenDeviceType = Eigen::GpuDevice;
+};
 class EigenCudaStreamDevice;
 class CUDADeviceContext : public DeviceContext {

--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
@@ -24,7 +24,7 @@ TEST(Device, Init) {
  for (int i = 0; i < count; i++) {
    DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
    Eigen::GpuDevice* gpu_device =
-        device_context->template get_eigen_device<Eigen::GpuDevice>();
+        device_context->template GetEigenDevice<GPUPlace>();
    ASSERT_NE(nullptr, gpu_device);
    delete device_context;
  }

--- a/paddle/platform/hostdevice.h
+++ b/paddle/platform/hostdevice.h
@@ -2,8 +2,10 @@
 #ifdef __CUDACC__
 #define HOSTDEVICE __host__ __device__
+#define DEVICE __device__
 #define HOST __host__
 #else
 #define HOSTDEVICE
+#define DEVICE
 #define HOST
 #endif
--- a/paddle/platform/place.cc
+++ b/paddle/platform/place.cc
@@ -47,7 +47,7 @@ bool is_cpu_place(const Place &p) {
 }
 bool places_are_same_class(const Place &p1, const Place &p2) {
-  return is_gpu_place(p1) == is_gpu_place(p2);
+  return p1.which() == p2.which();
 }
 std::ostream &operator<<(std::ostream &os, const Place &p) {

--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <iostream>
 #include "paddle/platform/variant.h"
 namespace paddle {
@@ -46,8 +47,18 @@ struct IsGPUPlace : public boost::static_visitor<bool> {
  bool operator()(const GPUPlace &gpu) const { return true; }
 };
+// Define the max number of Place in bit length. i.e., the max number of places
+// should be less equal than 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT)
+#define NUM_PLACE_TYPE_LIMIT_IN_BIT 4
 typedef boost::variant<GPUPlace, CPUPlace> Place;
+// static check number of place types is less equal than
+// 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT)
+BOOST_MPL_ASSERT((boost::mpl::less_equal<
+                  Place::types::size,
+                  boost::mpl::long_<1 << NUM_PLACE_TYPE_LIMIT_IN_BIT>>));
 void set_place(const Place &);
 const Place &get_place();

--- a/paddle/platform/variant.h
+++ b/paddle/platform/variant.h
@@ -29,4 +29,6 @@
 #endif
 #endif
+#include <boost/mpl/comparison.hpp>
+#include <boost/mpl/less_equal.hpp>
 #include <boost/variant.hpp>
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
 if(WITH_PYTHON)
  cc_library(paddle_pybind SHARED
    SRCS pybind.cc exception.cc protobuf.cc
-    DEPS pybind python backward
+    DEPS pybind python backward proto_desc
    ${GLOB_OP_LIB})
 endif(WITH_PYTHON)
--- a/paddle/pybind/exception.h
+++ b/paddle/pybind/exception.h
@@ -13,6 +13,7 @@
   limitations under the License. */
 #pragma once
+#include <Python.h>
 #include "paddle/platform/enforce.h"
 #include "pybind11/pybind11.h"
 namespace paddle {

--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -15,7 +15,10 @@ limitations under the License. */
 #include "paddle/pybind/protobuf.h"
 #include <deque>
 #include <iostream>
-#include "paddle/framework/attribute.h"
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/program_desc.h"
+#include "paddle/framework/var_desc.h"
 // Cast boost::variant for PyBind.
 // Copy from
@@ -93,383 +96,6 @@ namespace pybind {
 using namespace paddle::framework;  // NOLINT
-// convert between std::vector and protobuf repeated.
-template <typename T>
-inline std::vector<T> RepeatedToVector(
-    const google::protobuf::RepeatedField<T> &repeated_field) {
-  std::vector<T> ret;
-  ret.reserve(repeated_field.size());
-  std::copy(repeated_field.begin(), repeated_field.end(),
-            std::back_inserter(ret));
-  return ret;
-}
-template <typename T, typename RepeatedField>
-inline void VectorToRepeated(const std::vector<T> &vec,
-                             RepeatedField *repeated_field) {
-  repeated_field->Reserve(vec.size());
-  for (const auto &elem : vec) {
-    *repeated_field->Add() = elem;
-  }
-}
-// Specialize vector<bool>.
-template <typename RepeatedField>
-inline void VectorToRepeated(const std::vector<bool> &vec,
-                             RepeatedField *repeated_field) {
-  repeated_field->Reserve(vec.size());
-  for (auto elem : vec) {
-    *repeated_field->Add() = elem;
-  }
-}
-class ProgramDescBind;
-class OpDescBind;
-class BlockDescBind;
-class VarDescBind;
-// Each Protobuf Message, we provide a XXXBind class. In that class, we optimize
-// read/write speed. Only when we want the protobuf message, the local changes
-// will be synchronized (by `Sync` method).
-class VarDescBind {
- public:
-  explicit VarDescBind(const std::string &name) { desc_.set_name(name); }
-  VarDesc *Proto() { return &desc_; }
-  py::bytes Name() const { return desc_.name(); }
-  void SetShape(const std::vector<int64_t> &dims) {
-    VectorToRepeated(dims, desc_.mutable_lod_tensor()->mutable_dims());
-  }
-  void SetDataType(framework::DataType data_type) {
-    desc_.mutable_lod_tensor()->set_data_type(data_type);
-  }
-  std::vector<int64_t> Shape() const {
-    return RepeatedToVector(desc_.lod_tensor().dims());
-  }
-  framework::DataType DataType() const {
-    return desc_.lod_tensor().data_type();
-  }
- private:
-  VarDesc desc_;
-};
-class OpDescBind {
- public:
-  OpDesc *Proto() {
-    Sync();
-    return &op_desc_;
-  }
-  std::string Type() const { return op_desc_.type(); }
-  void SetType(const std::string &type) { op_desc_.set_type(type); }
-  const std::vector<std::string> &Input(const std::string &name) const {
-    auto it = inputs_.find(name);
-    PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s",
-                   name, Type());
-    return it->second;
-  }
-  std::vector<std::string> InputNames() const {
-    std::vector<std::string> retv;
-    retv.reserve(this->inputs_.size());
-    for (auto &ipt : this->inputs_) {
-      retv.push_back(ipt.first);
-    }
-    return retv;
-  }
-  void SetInput(const std::string &param_name,
-                const std::vector<std::string> &args) {
-    need_update_ = true;
-    inputs_[param_name] = args;
-  }
-  const std::vector<std::string> &Output(const std::string &name) const {
-    auto it = outputs_.find(name);
-    PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s",
-                   name, Type());
-    return it->second;
-  }
-  std::vector<std::string> OutputNames() const {
-    std::vector<std::string> retv;
-    retv.reserve(this->outputs_.size());
-    for (auto &ipt : this->outputs_) {
-      retv.push_back(ipt.first);
-    }
-    return retv;
-  }
-  void SetOutput(const std::string &param_name,
-                 const std::vector<std::string> &args) {
-    need_update_ = true;
-    this->outputs_[param_name] = args;
-  }
-  std::string DebugString() { return this->Proto()->DebugString(); }
-  bool HasAttr(const std::string &name) const {
-    return attrs_.find(name) != attrs_.end();
-  }
-  framework::AttrType GetAttrType(const std::string &name) const {
-    auto it = attrs_.find(name);
-    PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
-    return static_cast<framework::AttrType>(it->second.which() - 1);
-  }
-  std::vector<std::string> AttrNames() const {
-    std::vector<std::string> retv;
-    retv.reserve(attrs_.size());
-    for (auto &attr : attrs_) {
-      retv.push_back(attr.first);
-    }
-    return retv;
-  }
-  void SetAttr(const std::string &name, const Attribute &v) {
-    this->attrs_[name] = v;
-    need_update_ = true;
-  }
-  void SetBlockAttr(const std::string &name, BlockDescBind &block);
-  Attribute GetAttr(const std::string &name) const {
-    auto it = attrs_.find(name);
-    PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
-    return it->second;
-  }
-  int GetBlockAttr(const std::string &name) const {
-    auto it = attrs_.find(name);
-    PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
-    return boost::get<BlockDesc *>(it->second)->idx();
-  }
- private:
-  struct SetAttrDescVisitor : public boost::static_visitor<void> {
-    explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
-    mutable OpDesc::Attr *attr_;
-    void operator()(int v) const { attr_->set_i(v); }
-    void operator()(float v) const { attr_->set_f(v); }
-    void operator()(const std::string &v) const { attr_->set_s(v); }
-    void operator()(bool b) const { attr_->set_b(b); }
-    void operator()(const std::vector<int> &v) const {
-      VectorToRepeated(v, attr_->mutable_ints());
-    }
-    void operator()(const std::vector<float> &v) const {
-      VectorToRepeated(v, attr_->mutable_floats());
-    }
-    void operator()(const std::vector<std::string> &v) const {
-      VectorToRepeated(v, attr_->mutable_strings());
-    }
-    void operator()(const std::vector<bool> &v) const {
-      VectorToRepeated(v, attr_->mutable_bools());
-    }
-    void operator()(BlockDesc *desc) const {
-      attr_->set_block_idx(desc->idx());
-    }
-    void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
-  };
-  void Sync() {
-    if (need_update_) {
-      this->op_desc_.mutable_inputs()->Clear();
-      for (auto &ipt : inputs_) {
-        auto *input = op_desc_.add_inputs();
-        input->set_parameter(ipt.first);
-        VectorToRepeated(ipt.second, input->mutable_arguments());
-      }
-      this->op_desc_.mutable_outputs()->Clear();
-      for (auto &opt : outputs_) {
-        auto *output = op_desc_.add_outputs();
-        output->set_parameter(opt.first);
-        VectorToRepeated(opt.second, output->mutable_arguments());
-      }
-      this->op_desc_.mutable_attrs()->Clear();
-      for (auto &attr : attrs_) {
-        auto *attr_desc = op_desc_.add_attrs();
-        attr_desc->set_name(attr.first);
-        attr_desc->set_type(
-            static_cast<framework::AttrType>(attr.second.which() - 1));
-        boost::apply_visitor(SetAttrDescVisitor(attr_desc), attr.second);
-      }
-      need_update_ = false;
-    }
-  }
-  OpDesc op_desc_;
-  std::unordered_map<std::string, std::vector<std::string>> inputs_;
-  std::unordered_map<std::string, std::vector<std::string>> outputs_;
-  std::unordered_map<std::string, Attribute> attrs_;
-  // need_update_ indicate there some local changes not be synchronized. If
-  // local changes should be synchronized, need_update_ should be set to true.
-  bool need_update_{false};
-};
-class BlockDescBind {
- public:
-  BlockDescBind(ProgramDescBind *prog, BlockDesc *desc)
-      : prog_(prog), desc_(desc), need_update_(false) {}
-  BlockDescBind(const BlockDescBind &o) = delete;
-  BlockDescBind &operator=(const BlockDescBind &o) = delete;
-  int32_t ID() const { return desc_->idx(); }
-  int32_t Parent() const { return desc_->parent_idx(); }
-  VarDescBind *NewVar(py::bytes name_bytes) {
-    std::string name = name_bytes;
-    need_update_ = true;
-    auto it = vars_.find(name);
-    PADDLE_ENFORCE(it == vars_.end(), "Duplicated variable %s", name);
-    auto var = new VarDescBind(name);
-    vars_[name].reset(var);
-    return var;
-  }
-  VarDescBind *Var(py::bytes name_bytes) const {
-    std::string name = name_bytes;
-    auto it = vars_.find(name);
-    PADDLE_ENFORCE(it != vars_.end(),
-                   "Can not find variable %s in current block.", name);
-    return it->second.get();
-  }
-  std::vector<VarDescBind *> AllVars() const {
-    std::vector<VarDescBind *> res;
-    for (const auto &p : vars_) {
-      res.push_back(p.second.get());
-    }
-    return res;
-  }
-  BlockDescBind *ParentBlock() const;
-  OpDescBind *AppendOp() {
-    need_update_ = true;
-    ops_.emplace_back(new OpDescBind());
-    return ops_.back().get();
-  }
-  OpDescBind *PrependOp() {
-    need_update_ = true;
-    ops_.emplace_front(new OpDescBind());
-    return ops_.front().get();
-  }
-  std::vector<OpDescBind *> AllOps() const {
-    std::vector<OpDescBind *> res;
-    for (const auto &op : ops_) {
-      res.push_back(op.get());
-    }
-    return res;
-  }
-  void Sync() {
-    if (need_update_) {
-      auto &op_field = *this->desc_->mutable_ops();
-      op_field.Clear();
-      op_field.Reserve(static_cast<int>(ops_.size()));
-      for (auto &op_desc : ops_) {
-        op_field.AddAllocated(op_desc->Proto());
-      }
-      need_update_ = false;
-    }
-  }
-  BlockDesc *RawPtr() { return desc_; }
- private:
-  ProgramDescBind *prog_;  // not_own
-  BlockDesc *desc_;        // not_own
-  bool need_update_;
-  std::deque<std::unique_ptr<OpDescBind>> ops_;
-  std::unordered_map<std::string, std::unique_ptr<VarDescBind>> vars_;
-};
-using ProgDescMap =
-    std::unordered_map<ProgramDesc *, std::unique_ptr<ProgramDescBind>>;
-static ProgDescMap *g_bind_map = nullptr;
-class ProgramDescBind {
- public:
-  static ProgramDescBind &Instance(ProgramDesc *prog) {
-    if (g_bind_map == nullptr) {
-      g_bind_map = new ProgDescMap();
-    }
-    auto &map = *g_bind_map;
-    auto &ptr = map[prog];
-    if (ptr == nullptr) {
-      ptr.reset(new ProgramDescBind(prog));
-    }
-    return *ptr;
-  }
-  ProgramDescBind(const ProgramDescBind &o) = delete;
-  ProgramDescBind &operator=(const ProgramDescBind &o) = delete;
-  BlockDescBind *AppendBlock(const BlockDescBind &parent) {
-    auto *b = prog_->add_blocks();
-    b->set_parent_idx(parent.ID());
-    b->set_idx(prog_->blocks_size() - 1);
-    blocks_.emplace_back(new BlockDescBind(this, b));
-    return blocks_.back().get();
-  }
-  BlockDescBind *Block(size_t idx) { return blocks_[idx].get(); }
-  std::string DebugString() { return Proto()->DebugString(); }
-  size_t Size() const { return blocks_.size(); }
-  ProgramDesc *Proto() {
-    for (auto &block : blocks_) {
-      block->Sync();
-    }
-    return prog_;
-  }
- private:
-  explicit ProgramDescBind(ProgramDesc *prog) : prog_(prog) {
-    for (auto &block : *prog->mutable_blocks()) {
-      blocks_.emplace_back(new BlockDescBind(this, &block));
-    }
-  }
-  // Not owned
-  ProgramDesc *prog_;
-  std::vector<std::unique_ptr<BlockDescBind>> blocks_;
-};
-BlockDescBind *BlockDescBind::ParentBlock() const {
-  if (this->desc_->parent_idx() == -1) {
-    return nullptr;
-  }
-  return prog_->Block(static_cast<size_t>(this->desc_->parent_idx()));
-}
-void OpDescBind::SetBlockAttr(const std::string &name, BlockDescBind &block) {
-  BlockDesc *desc = block.RawPtr();
-  this->attrs_[name] = desc;
-}
 // Bind Methods
 void BindProgramDesc(py::module &m) {
  py::class_<ProgramDescBind>(m, "ProgramDesc", "")
@@ -503,9 +129,18 @@ void BindBlockDesc(py::module &m) {
           py::return_value_policy::reference)
      .def("prepend_op", &BlockDescBind::PrependOp,
           py::return_value_policy::reference)
-      .def("new_var", &BlockDescBind::NewVar,
+      .def("new_var",
+           [](BlockDescBind &self, py::bytes byte_name) {
+             std::string name = byte_name;
+             return self.NewVar(name);
+           },
+           py::return_value_policy::reference)
+      .def("var",
+           [](BlockDescBind &self, py::bytes byte_name) {
+             std::string name = byte_name;
+             return self.Var(name);
+           },
           py::return_value_policy::reference)
-      .def("var", &BlockDescBind::Var, py::return_value_policy::reference)
      .def("all_vars", &BlockDescBind::AllVars,
           py::return_value_policy::reference)
      .def("all_ops", &BlockDescBind::AllOps,
@@ -513,7 +148,7 @@ void BindBlockDesc(py::module &m) {
 }
 void BindVarDsec(py::module &m) {
-  py::enum_<framework::DataType>(m, "DataType", "")
+  py::enum_<DataType>(m, "DataType", "")
      .value("BOOL", DataType::BOOL)
      .value("INT16", DataType::INT16)
      .value("INT32", DataType::INT32)
@@ -523,15 +158,20 @@ void BindVarDsec(py::module &m) {
      .value("FP64", DataType::FP64);
  py::class_<VarDescBind>(m, "VarDesc", "")
-      .def("name", &VarDescBind::Name, py::return_value_policy::reference)
+      .def("name",
+           [](const VarDescBind &self) {
+             py::bytes name = self.Name();
+             return name;
+           },
+           py::return_value_policy::reference)
      .def("set_shape", &VarDescBind::SetShape)
      .def("set_data_type", &VarDescBind::SetDataType)
      .def("shape", &VarDescBind::Shape, py::return_value_policy::reference)
-      .def("data_type", &VarDescBind::DataType);
+      .def("data_type", &VarDescBind::GetDataType);
 }
 void BindOpDesc(py::module &m) {
-  py::enum_<framework::AttrType>(m, "AttrType", "")
+  py::enum_<AttrType>(m, "AttrType", "")
      .value("INT", AttrType::INT)
      .value("INTS", AttrType::INTS)
      .value("FLOAT", AttrType::FLOAT)

--- a/paddle/pybind/protobuf.h
+++ b/paddle/pybind/protobuf.h
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <Python.h>
 #include <fstream>
 #include <vector>
-#include "paddle/framework/op_registry.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"

--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -77,20 +77,18 @@ PYBIND11_PLUGIN(core) {
           })
      .def("set", PyCPUTensorSetFromArray<float>)
      .def("set", PyCPUTensorSetFromArray<int>)
+      .def("set", PyCPUTensorSetFromArray<double>)
 #ifndef PADDLE_ONLY_CPU
      .def("set", PyCUDATensorSetFromArray<float>)
      .def("set", PyCUDATensorSetFromArray<int>)
+      .def("set", PyCUDATensorSetFromArray<double>)
 #endif
      .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
-      .def("set_float_element",
+      .def("set_float_element", TensorSetElement<float>)
-           [](Tensor &self, size_t offset, float f) {
+      .def("get_float_element", TensorGetElement<float>)
-             // TODO(yuyang18): Only support GPU now.
+      .def("set_double_element", TensorSetElement<double>)
-             self.data<float>()[offset] = f;
+      .def("get_double_element", TensorGetElement<double>)
-           })
+      .def("dtype", [](Tensor &self) { return ToDataType(self.type()); });
-      .def("get_float_element", [](Tensor &self, size_t offset) -> float {
-        // TODO(yuyang18): Only support GPU now.
-        return self.data<float>()[offset];
-      });
  py::class_<LoDTensor, Tensor>(m, "LoDTensor")
      .def_buffer(
@@ -230,7 +228,6 @@ All parameter, weight, gradient are variables in Paddle.
              const std::unordered_set<std::string> &no_grad_vars) {
             return Backward(forwardOp, no_grad_vars).release();
           })
-      .def("infer_shape", &OperatorBase::InferShape)
      .def("run",
           [](OperatorBase &self, const Scope &scope,
              const platform::DeviceContext &dev_ctx) {

--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -42,7 +42,7 @@ template <size_t I, typename... ARGS>
 struct CastToPyBufferImpl<true, I, ARGS...> {
  using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
  py::buffer_info operator()(framework::Tensor &tensor) {
-    if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) {
+    if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) {
      auto dim_vec = framework::vectorize(tensor.dims());
      std::vector<size_t> dims_outside;
      std::vector<size_t> strides;
@@ -56,13 +56,13 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
        prod *= dims_outside[i - 1];
      }
      framework::Tensor dst_tensor;
-      if (paddle::platform::is_gpu_place(tensor.holder_->place())) {
+      if (paddle::platform::is_gpu_place(tensor.place())) {
        dst_tensor.CopyFrom<CUR_TYPE>(tensor, platform::CPUPlace());
-      } else if (paddle::platform::is_cpu_place(tensor.holder_->place())) {
+      } else if (paddle::platform::is_cpu_place(tensor.place())) {
        dst_tensor = tensor;
      }
      return py::buffer_info(
-          dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.holder_->place()),
+          dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.place()),
          sizeof(CUR_TYPE), py::format_descriptor<CUR_TYPE>::format(),
          (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
    } else {
@@ -73,10 +73,23 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
 };
 }  // namespace details
 inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
-  auto buffer_info = details::CastToPyBufferImpl<true, 0, float, int>()(tensor);
+  auto buffer_info =
+      details::CastToPyBufferImpl<true, 0, float, int, double>()(tensor);
  return buffer_info;
 }
+template <typename T>
+T TensorGetElement(framework::Tensor &self, size_t offset) {
+  PADDLE_ENFORCE(platform::is_cpu_place(self.place()));
+  return self.data<T>()[offset];
+}
+template <typename T>
+void TensorSetElement(framework::Tensor &self, size_t offset, T elem) {
+  PADDLE_ENFORCE(platform::is_cpu_place(self.place()));
+  self.data<T>()[offset] = elem;
+}
 template <typename T>
 void PyCPUTensorSetFromArray(
    framework::Tensor &self,

--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -18,7 +18,7 @@ function version(){
        echo "PaddlePaddle @PADDLE_VERSION@, compiled with"
        echo "    with_avx: @WITH_AVX@"
        echo "    with_gpu: @WITH_GPU@"
-        echo "    with_mkldnn: @WITH_MKLDNN"
+        echo "    with_mkldnn: @WITH_MKLDNN@"
        echo "    with_mklml: @WITH_MKLML@"
        echo "    with_double: @WITH_DOUBLE@"
        echo "    with_python: @WITH_PYTHON@"

--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
 import unittest
 import numpy as np
+import random
 import itertools
 import paddle.v2.framework.core as core
 from paddle.v2.framework.op import Operator
@@ -12,17 +13,19 @@ def grad_var_name(var_name):
 def create_op(scope, op_type, inputs, outputs, attrs):
    kwargs = dict()
+    def __create_var__(name, var_name):
+        scope.new_var(var_name)
+        kwargs[name].append(var_name)
    for in_name, in_dup in Operator.get_op_inputs(op_type):
        if in_name in inputs:
            kwargs[in_name] = []
            if in_dup:
                sub_in = inputs[in_name]
                for sub_in_name, _ in sub_in:
-                    var = scope.new_var(sub_in_name)
+                    __create_var__(in_name, sub_in_name)
-                    kwargs[in_name].append(sub_in_name)
            else:
-                var = scope.new_var(in_name)
+                __create_var__(in_name, in_name)
-                kwargs[in_name].append(in_name)
    for out_name, out_dup in Operator.get_op_outputs(op_type):
        if out_name in outputs:
@@ -30,11 +33,9 @@ def create_op(scope, op_type, inputs, outputs, attrs):
            if out_dup:
                sub_out = outputs[out_name]
                for sub_out_name, _ in sub_out:
-                    var = scope.new_var(sub_out_name)
+                    __create_var__(out_name, sub_out_name)
-                    kwargs[out_name].append(sub_out_name)
            else:
-                var = scope.new_var(out_name)
+                __create_var__(out_name, out_name)
-                kwargs[out_name].append(out_name)
    for attr_name in Operator.get_op_attr_names(op_type):
        if attr_name in attrs:
@@ -44,49 +45,46 @@ def create_op(scope, op_type, inputs, outputs, attrs):
 def set_input(scope, op, inputs, place):
+    def __set_input__(var_name, var):
+        tensor = scope.find_var(var_name).get_tensor()
+        if isinstance(var, tuple):
+            tensor.set_lod(var[1])
+            var = var[0]
+        tensor.set_dims(var.shape)
+        tensor.set(var, place)
    for in_name, in_dup in Operator.get_op_inputs(op.type()):
        if in_name in inputs:
            if in_dup:
                sub_in = inputs[in_name]
                for sub_in_name, sub_in_val in sub_in:
-                    var = scope.find_var(sub_in_name)
+                    __set_input__(sub_in_name, sub_in_val)
-                    tensor = var.get_tensor()
-                    sub_in_array = sub_in_val[0] \
-                        if isinstance(sub_in_val, tuple) else sub_in_val
-                    tensor.set_dims(sub_in_array.shape)
-                    tensor.set(sub_in_array, place)
-                    if isinstance(sub_in_val, tuple):
-                        tensor.set_lod(sub_in_val[1])
            else:
-                var = scope.find_var(in_name)
+                __set_input__(in_name, inputs[in_name])
-                tensor = var.get_tensor()
-                in_val = inputs[in_name]
-                in_array = in_val[0] if isinstance(in_val, tuple) else in_val
-                tensor.set_dims(in_array.shape)
-                tensor.set(in_array, place)
-                if isinstance(in_val, tuple):
-                    tensor.set_lod(in_val[1])
 def set_output_grad(scope, op, outputs, place):
+    def __set_tensor__(name):
+        out_tensor = scope.find_var(name).get_tensor()
+        grad_tensor = scope.new_var(grad_var_name(name)).get_tensor()
+        out_dtype = out_tensor.dtype()
+        if out_dtype == core.DataType.FP64:
+            data = np.ones(out_tensor.shape(), dtype=np.float64)
+        elif out_dtype == core.DataType.FP32:
+            data = np.ones(out_tensor.shape(), dtype=np.float32)
+        else:
+            raise ValueError("Not supported data type " + str(out_dtype))
+        grad_tensor.set(data, place)
    for out_name, out_dup in Operator.get_op_outputs(op.type()):
        if out_name in outputs:
            if out_dup:
                sub_out = outputs[out_name]
                for sub_out_name, _ in sub_out:
-                    out_tensor = scope.find_var(sub_out_name).get_tensor()
+                    __set_tensor__(sub_out_name)
-                    grad_tensor = scope.new_var(grad_var_name(
-                        sub_out_name)).get_tensor()
-                    grad_tensor.set_dims(out_tensor.shape())
-                    data = np.ones(out_tensor.shape(), dtype=np.float32)
-                    grad_tensor.set(data, place)
            else:
-                out_tensor = scope.find_var(out_name).get_tensor()
+                __set_tensor__(out_name)
-                grad_tensor = scope.new_var(grad_var_name(out_name)).get_tensor(
-                )
-                grad_tensor.set_dims(out_tensor.shape())
-                data = np.ones(out_tensor.shape(), dtype=np.float32)
-                grad_tensor.set(data, place)
 def get_numeric_gradient(scope,
@@ -96,9 +94,7 @@ def get_numeric_gradient(scope,
                         output_names,
                         delta=0.005,
                         in_place=False):
    set_input(scope, op, inputs, core.CPUPlace())
-    op.infer_shape(scope)
    tensor_to_check = scope.find_var(input_to_check).get_tensor()
@@ -116,7 +112,29 @@ def get_numeric_gradient(scope,
    tensor_to_check = scope.find_var(input_to_check).get_tensor()
    tensor_size = product(tensor_to_check.get_dims())
-    gradient_flat = np.zeros(shape=(tensor_size, ), dtype='float32')
+    tensor_to_check_dtype = tensor_to_check.dtype()
+    if tensor_to_check_dtype == core.DataType.FP32:
+        tensor_to_check_dtype = np.float32
+    elif tensor_to_check_dtype == core.DataType.FP64:
+        tensor_to_check_dtype = np.float64
+    else:
+        raise ValueError("Not supported data type " + str(
+            tensor_to_check_dtype))
+    gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype)
+    def __get_elem__(tensor, i):
+        if tensor_to_check_dtype == np.float32:
+            return tensor.get_float_element(i)
+        else:
+            return tensor.get_double_element(i)
+    def __set_elem__(tensor, i, e):
+        if tensor_to_check_dtype == np.float32:
+            tensor.set_float_element(i, e)
+        else:
+            tensor.set_double_element(i, e)
    # we only compute gradient of one element each time.
    # we use a for loop to compute the gradient of every element.
    for i in xrange(tensor_size):
@@ -124,20 +142,20 @@ def get_numeric_gradient(scope,
            set_input(scope, op, inputs, core.CPUPlace())
        # get one input element throw it's index i.
-        origin = tensor_to_check.get_float_element(i)
+        origin = __get_elem__(tensor_to_check, i)
        # add delta to it, run op and then get the sum of the result tensor.
        x_pos = origin + delta
-        tensor_to_check.set_float_element(i, x_pos)
+        __set_elem__(tensor_to_check, i, x_pos)
        y_pos = get_output()
        if in_place:
            set_input(scope, op, inputs, core.CPUPlace())
        x_neg = origin - delta
-        tensor_to_check.set_float_element(i, x_neg)
+        __set_elem__(tensor_to_check, i, x_neg)
        y_neg = get_output()
-        tensor_to_check.set_float_element(i, origin)
+        __set_elem__(tensor_to_check, i, origin)
        gradient_flat[i] = (y_pos - y_neg) / delta / 2
    return gradient_flat.reshape(tensor_to_check.get_dims())
@@ -160,7 +178,6 @@ def get_gradient(scope, op, inputs, outputs, grad_name, place,
    set_input(scope, op, inputs, place)
-    op.infer_shape(scope)
    op.run(scope, ctx)
    if no_grad_set is None:
@@ -169,7 +186,6 @@ def get_gradient(scope, op, inputs, outputs, grad_name, place,
    backward_op = get_backward_op(scope, op, no_grad_set)
    set_output_grad(scope, op, outputs, place)
-    backward_op.infer_shape(scope)
    backward_op.run(scope, ctx)
    out = np.array(scope.find_var(grad_name).get_tensor())
@@ -177,6 +193,21 @@ def get_gradient(scope, op, inputs, outputs, grad_name, place,
 class OpTest(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        '''Fix random seeds to remove randomness from tests'''
+        cls._np_rand_state = np.random.get_state()
+        cls._py_rand_state = random.getstate()
+        np.random.seed(123)
+        random.seed(124)
+    @classmethod
+    def tearDownClass(cls):
+        '''Restore random seeds'''
+        np.random.set_state(cls._np_rand_state)
+        random.setstate(cls._py_rand_state)
    def check_output_with_place(self, place, atol):
        self.scope = core.Scope()
        op_inputs = self.inputs if hasattr(self, "inputs") else dict()
@@ -187,7 +218,6 @@ class OpTest(unittest.TestCase):
        if isinstance(place, core.GPUPlace) and not self.op.support_gpu():
            return
        set_input(self.scope, self.op, self.inputs, place)
-        self.op.infer_shape(self.scope)
        ctx = core.DeviceContext.create(place)
        self.op.run(self.scope, ctx)

--- a/python/paddle/v2/framework/tests/test_activation_op.py
+++ b/python/paddle/v2/framework/tests/test_activation_op.py
@@ -219,5 +219,22 @@ class TestSTanh(OpTest):
        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+class TestSoftsign(OpTest):
+    def setUp(self):
+        self.op_type = "softsign"
+        self.inputs = {
+            'X': np.random.uniform(-1, 1, [11, 17]).astype("float32")
+        }
+        self.outputs = {
+            'Y': np.divide(self.inputs['X'], 1 + np.abs(self.inputs['X']))
+        }
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.007)
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/v2/framework/tests/test_cond_op.py
+++ b/python/paddle/v2/framework/tests/test_cond_op.py
@@ -66,7 +66,6 @@ class TestCondOp(unittest.TestCase):
        self.create_cond_op()
        self.create_sub_net()
        ctx = core.DeviceContext.create(core.CPUPlace())
-        self.condop.infer_shape(self.scope)
        self.condop.run(self.scope, ctx)
        return np.array(self.scope.find_var("Out").get_tensor())

--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
@@ -80,7 +80,7 @@ class TestCrossEntropyOp3(OpTest):
        cross_entropy2 = (-label * np.log(X)).sum(
            axis=1, keepdims=True).astype("float32")
-        self.inputs = {"X": X, "Label": label}
+        self.inputs = {"X": X, "Label": label.astype(np.float32)}
        self.outputs = {"Y": cross_entropy}
        self.attrs = {"softLabel": True}

--- a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
@@ -7,8 +7,8 @@ class ElementwiseMulOp(OpTest):
    def setUp(self):
        self.op_type = "elementwise_mul"
        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
+            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float64"),
-            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float64")
        }
        self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])}
@@ -16,23 +16,21 @@ class ElementwiseMulOp(OpTest):
        self.check_output()
    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.1)
+        self.check_grad(['X', 'Y'], 'Out')
    def test_check_grad_ingore_x(self):
-        self.check_grad(
+        self.check_grad(['Y'], 'Out', no_grad_set=set("X"))
-            ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X"))
    def test_check_grad_ingore_y(self):
-        self.check_grad(
+        self.check_grad(['X'], 'Out', no_grad_set=set('Y'))
-            ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y'))
 class TestElementwiseMulOp_Vector(ElementwiseMulOp):
    def setUp(self):
        self.op_type = "elementwise_mul"
        self.inputs = {
-            'X': np.random.random((32, )).astype("float32"),
+            'X': np.random.random((32, )).astype("float64"),
-            'Y': np.random.random((32, )).astype("float32")
+            'Y': np.random.random((32, )).astype("float64")
        }
        self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])}
@@ -41,8 +39,8 @@ class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp):
    def setUp(self):
        self.op_type = "elementwise_mul"
        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'X': np.random.rand(2, 3, 4).astype(np.float64),
-            'Y': np.random.rand(2).astype(np.float32)
+            'Y': np.random.rand(2).astype(np.float64)
        }
        self.attrs = {'axis': 0}
@@ -55,8 +53,8 @@ class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp):
    def setUp(self):
        self.op_type = "elementwise_mul"
        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'X': np.random.rand(2, 3, 4).astype(np.float64),
-            'Y': np.random.rand(3).astype(np.float32)
+            'Y': np.random.rand(3).astype(np.float64)
        }
        self.attrs = {'axis': 1}
@@ -69,8 +67,8 @@ class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp):
    def setUp(self):
        self.op_type = "elementwise_mul"
        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'X': np.random.rand(2, 3, 4).astype(np.float64),
-            'Y': np.random.rand(4).astype(np.float32)
+            'Y': np.random.rand(4).astype(np.float64)
        }
        self.outputs = {
@@ -82,8 +80,8 @@ class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
    def setUp(self):
        self.op_type = "elementwise_mul"
        self.inputs = {
-            'X': np.random.rand(2, 3, 4, 5).astype(np.float32),
+            'X': np.random.rand(2, 3, 4, 5).astype(np.float64),
-            'Y': np.random.rand(3, 4).astype(np.float32)
+            'Y': np.random.rand(3, 4).astype(np.float64)
        }
        self.attrs = {'axis': 1}

--- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/framework/tests/test_gaussian_random_op.py
@@ -24,7 +24,6 @@ class TestGaussianRandomOp(unittest.TestCase):
            std=1.,
            seed=10)
-        op.infer_shape(scope)
        context = core.DeviceContext.create(place)
        op.run(scope, context)
        tensor = numpy.array(scope.find_var('Out').get_tensor())

--- a/python/paddle/v2/framework/tests/test_lstm_unit_op.py
+++ b/python/paddle/v2/framework/tests/test_lstm_unit_op.py
@@ -14,8 +14,8 @@ def tanh_np(x):
 class LstmUnitTest(OpTest):
    def setUp(self):
        self.op_type = "lstm_unit"
-        x_np = np.random.normal(size=(5, 16)).astype("float32")
+        x_np = np.random.normal(size=(5, 16)).astype("float64")
-        c_np = np.random.normal(size=(5, 4)).astype("float32")
+        c_np = np.random.normal(size=(5, 4)).astype("float64")
        i_np, f_np, o_np, j_np = np.split(x_np, 4, axis=1)
        forget_bias_np = 0.
        self.attrs = {'forget_bias': 0.}
@@ -31,7 +31,7 @@ class LstmUnitTest(OpTest):
        self.check_output()
    def test_check_grad(self):
-        self.check_grad(['X', 'C_prev'], ['C', 'H'], max_relative_error=0.01)
+        self.check_grad(['X', 'C_prev'], ['C', 'H'])
 if __name__ == "__main__":

--- a/python/paddle/v2/framework/tests/test_mnist.py
+++ b/python/paddle/v2/framework/tests/test_mnist.py
@@ -2,6 +2,9 @@ import paddle.v2.framework.core as core
 from paddle.v2.framework.op import Operator
 import numpy
 import paddle.v2 as paddle
+exit(
+    0
+)  # FIXME(yuyang18): InferShape has been removed, this unittest should be changed until compile time is ready
 BATCH_SIZE = 100

--- a/python/paddle/v2/framework/tests/test_pool2d_op.py
+++ b/python/paddle/v2/framework/tests/test_pool2d_op.py
+import unittest
+import numpy as np
+from op_test import OpTest
+def max_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
+    N, C, H, W = x.shape
+    if global_pool == 1:
+        ksize = [H, W]
+    H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    out = np.zeros((N, C, H_out, W_out))
+    for i in xrange(H_out):
+        for j in xrange(W_out):
+            r_start = np.max((i * strides[0] - paddings[0], 0))
+            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            c_start = np.max((j * strides[1] - paddings[1], 0))
+            c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+            x_masked = x[:, :, r_start:r_end, c_start:c_end]
+            out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
+    return out
+def avg_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
+    N, C, H, W = x.shape
+    if global_pool == 1:
+        ksize = [H, W]
+    H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    out = np.zeros((N, C, H_out, W_out))
+    for i in xrange(H_out):
+        for j in xrange(W_out):
+            r_start = np.max((i * strides[0] - paddings[0], 0))
+            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            c_start = np.max((j * strides[1] - paddings[1], 0))
+            c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+            x_masked = x[:, :, r_start:r_end, c_start:c_end]
+            out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / (
+                (r_end - r_start) * (c_end - c_start))
+    return out
+class TestPool2d_Op(OpTest):
+    def setUp(self):
+        self.initTestCase()
+        input = np.random.random(self.shape).astype("float32")
+        output = self.pool2D_forward_naive(input, self.ksize, self.strides,
+                                           self.paddings, self.global_pool)
+        self.inputs = {'X': input}
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'poolingType': self.pool_type,
+            'globalPooling': self.global_pool,
+        }
+        self.outputs = {'Out': output}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        if self.pool_type != "max":
+            self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
+    def initTestCase(self):
+        self.global_pool = True
+        self.op_type = "pool2d"
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.shape = [2, 3, 5, 5]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+class TestCase1(TestPool2d_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d"
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+class TestCase2(TestPool2d_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d"
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 1]
+class TestCase3(TestPool2d_Op):
+    def initTestCase(self):
+        self.global_pool = True
+        self.op_type = "pool2d"
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 5, 5]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+class TestCase4(TestPool2d_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d"
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+class TestCase5(TestPool2d_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d"
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 1]
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_pool3d_op.py
+++ b/python/paddle/v2/framework/tests/test_pool3d_op.py
+import unittest
+import numpy as np
+from op_test import OpTest
+def max_pool3D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
+    N, C, D, H, W = x.shape
+    if global_pool == 1:
+        ksize = [D, H, W]
+    D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+    out = np.zeros((N, C, D_out, H_out, W_out))
+    for k in xrange(D_out):
+        d_start = np.max((k * strides[0] - paddings[0], 0))
+        d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
+        for i in xrange(H_out):
+            h_start = np.max((i * strides[0] - paddings[0], 0))
+            h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            for j in xrange(W_out):
+                w_start = np.max((j * strides[1] - paddings[1], 0))
+                w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+                x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
+                out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
+    return out
+def avg_pool3D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
+    N, C, D, H, W = x.shape
+    if global_pool == 1:
+        ksize = [D, H, W]
+    D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+    out = np.zeros((N, C, D_out, H_out, W_out))
+    for k in xrange(D_out):
+        d_start = np.max((k * strides[0] - paddings[0], 0))
+        d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
+        for i in xrange(H_out):
+            h_start = np.max((i * strides[0] - paddings[0], 0))
+            h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            for j in xrange(W_out):
+                w_start = np.max((j * strides[1] - paddings[1], 0))
+                w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+                x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
+                out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3, 4)) / (
+                    (d_end - d_start) * (h_end - h_start) * (w_end - w_start))
+    return out
+class TestPool3d_Op(OpTest):
+    def setUp(self):
+        self.initTestCase()
+        input = np.random.random(self.shape).astype("float32")
+        output = self.pool3D_forward_naive(input, self.ksize, self.strides,
+                                           self.paddings, self.global_pool)
+        self.inputs = {'X': input}
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'poolingType': self.pool_type,
+            'globalPooling': self.global_pool,
+        }
+        self.outputs = {'Out': output}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        if self.pool_type != "max":
+            self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
+    def initTestCase(self):
+        self.global_pool = True
+        self.op_type = "pool3d"
+        self.pool_type = "avg"
+        self.pool3D_forward_naive = avg_pool3D_forward_naive
+        self.shape = [2, 3, 5, 5, 5]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [0, 0, 0]
+class TestCase1(TestPool3d_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool3d"
+        self.pool_type = "avg"
+        self.pool3D_forward_naive = avg_pool3D_forward_naive
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [0, 0, 0]
+class TestCase2(TestPool3d_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool3d"
+        self.pool_type = "avg"
+        self.pool3D_forward_naive = avg_pool3D_forward_naive
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [1, 1, 1]
+class TestCase3(TestPool3d_Op):
+    def initTestCase(self):
+        self.global_pool = True
+        self.op_type = "pool3d"
+        self.pool_type = "max"
+        self.pool3D_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 5, 5, 5]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [0, 0, 0]
+class TestCase4(TestPool3d_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool3d"
+        self.pool_type = "max"
+        self.pool3D_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [0, 0, 0]
+class TestCase5(TestPool3d_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool3d"
+        self.pool_type = "max"
+        self.pool3D_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [1, 1, 1]
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_prelu_op.py
+++ b/python/paddle/v2/framework/tests/test_prelu_op.py
@@ -17,7 +17,7 @@ class PReluTest(OpTest):
        x_np_sign = np.sign(x_np)
        x_np = x_np_sign * np.maximum(x_np, .005)
-        alpha_np = np.array([.1])
+        alpha_np = np.array([.1], dtype="float32")
        self.inputs = {'X': x_np, 'Alpha': alpha_np}
        out_np = np.maximum(self.inputs['X'], 0.)
        out_np = out_np + np.minimum(self.inputs['X'],

--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
@@ -101,7 +101,6 @@ class RecurrentOpTest(unittest.TestCase):
        self.create_rnn_op()
        self.create_step_net()
        ctx = core.DeviceContext.create(core.CPUPlace())
-        self.rnnop.infer_shape(self.scope)
        self.rnnop.run(self.scope, ctx)
        return np.array(self.scope.find_var("h@mem").get_tensor())
@@ -198,4 +197,7 @@ class RecurrentGradientOpTest(unittest.TestCase):
 if __name__ == '__main__':
+    exit(
+        0
+    )  # FIXME(yuyang18): InferShape has been removed, this unittest may error
    unittest.main()
--- a/python/paddle/v2/framework/tests/test_reduce_op.py
+++ b/python/paddle/v2/framework/tests/test_reduce_op.py
+import unittest
+import numpy as np
+from op_test import OpTest
+class TestSumOp(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")}
+        self.outputs = {'Out': self.inputs['X'].sum(axis=0)}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+class TestMeanOp(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_mean"
+        self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")}
+        self.attrs = {'dim': 1}
+        self.outputs = {'Out': self.inputs['X'].mean(axis=self.attrs['dim'])}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+class TestMaxOp(OpTest):
+    """Remove Max with subgradient from gradient check to confirm the success of CI."""
+    def setUp(self):
+        self.op_type = "reduce_max"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")}
+        self.attrs = {'dim': -1}
+        self.outputs = {'Out': self.inputs['X'].max(axis=self.attrs['dim'])}
+    def test_check_output(self):
+        self.check_output()
+class TestMinOp(OpTest):
+    """Remove Min with subgradient from gradient check to confirm the success of CI."""
+    def setUp(self):
+        self.op_type = "reduce_min"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")}
+        self.attrs = {'dim': 2}
+        self.outputs = {'Out': self.inputs['X'].min(axis=self.attrs['dim'])}
+    def test_check_output(self):
+        self.check_output()
+class TestKeepDimReduce(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")}
+        self.attrs = {'dim': -2, 'keep_dim': True}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=self.attrs['dim'], keepdims=True)
+        }
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+class Test1DReduce(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random(20).astype("float32")}
+        self.outputs = {'Out': self.inputs['X'].sum(axis=0)}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
-import unittest
-import numpy as np
-from op_test import OpTest
-class TestRowwiseAddOp(OpTest):
-    def setUp(self):
-        self.op_type = "rowwise_add"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [5, 10]).astype("float32"),
-            'b': np.random.uniform(0.1, 1, [10]).astype("float32")
-        }
-        self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])}
-    def test_check_output(self):
-        self.check_output()
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'b'], 'Out')
-    def test_check_grad_ingore_b(self):
-        self.check_grad(['X'], 'Out', no_grad_set=set('b'))
-    def test_check_grad_ingore_x(self):
-        self.check_grad(['b'], 'Out', no_grad_set=set('X'))
-class TestRowwiseAddOp2(OpTest):
-    def setUp(self):
-        self.op_type = "rowwise_add"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 3, 2, 5]).astype("float32"),
-            'b': np.random.uniform(0.1, 1, [2, 5]).astype("float32")
-        }
-        self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])}
-    def test_check_output(self):
-        self.check_output()
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'b'], 'Out')
-    def test_check_grad_ignore_b(self):
-        self.check_grad(['X'], 'Out', no_grad_set=set('b'))
-    def test_check_grad_ignore_x(self):
-        self.check_grad(['b'], 'Out', no_grad_set=set('X'))
-if __name__ == "__main__":
-    unittest.main()
--- a/python/paddle/v2/framework/tests/test_sequence_softmax_op.py
+++ b/python/paddle/v2/framework/tests/test_sequence_softmax_op.py
+import unittest
+import numpy as np
+from op_test import OpTest
+def stable_softmax(x):
+    """Compute the softmax of vector x in a numerically stable way."""
+    shiftx = x - np.max(x).clip(-64.)
+    exps = np.exp(shiftx)
+    return exps / np.sum(exps)
+class TestSequenceSoftmaxOp(OpTest):
+    def setUp(self):
+        self.op_type = "sequence_softmax"
+        x = np.random.uniform(0.1, 1, (11, 1)).astype("float32")
+        lod = [[0, 4, 5, 8, 11]]
+        out = np.zeros((11, 1)).astype("float32")
+        for i in range(4):
+            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+            sub_x = sub_x.reshape(1, lod[0][i + 1] - lod[0][i])
+            sub_out = stable_softmax(sub_x)
+            out[lod[0][i]:lod[0][i + 1], :] = sub_out.reshape(
+                lod[0][i + 1] - lod[0][i], 1)
+        self.inputs = {"X": (x, lod)}
+        self.outputs = {"Out": out}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out", max_relative_error=0.01)
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py
+import numpy as np
+from op_test import OpTest
+from scipy.special import logit
+from scipy.special import expit
+class TestSigmoidCrossEntropyWithLogitsOp1(OpTest):
+    '''Test sigmoid_cross_entropy_with_logit_op with binary labels
+    '''
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        batch_size = 64
+        num_classes = 20
+        self.inputs = {
+            'X': logit(
+                np.random.uniform(0, 1, (batch_size, num_classes))
+                .astype("float32")),
+            'Labels': np.random.randint(0, 2, (batch_size, num_classes))
+            .astype("float32")
+        }
+        # Fw Pass is implemented as elementwise sigmoid followed by
+        # elementwise logistic loss
+        # Labels * -log(sigmoid(X)) + (1 - labels) * -log(1 - sigmoid(X))
+        sigmoid_X = expit(self.inputs['X'])
+        term1 = self.inputs['Labels'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Labels']) * np.log(1 - sigmoid_X)
+        self.outputs = {'Out': -term1 - term2}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
+    '''Test sigmoid_cross_entropy_with_logit_op with probabalistic labels
+    '''
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        batch_size = 64
+        num_classes = 20
+        self.inputs = {
+            'X': logit(
+                np.random.uniform(0, 1, (batch_size, num_classes))
+                .astype("float32")),
+            'Labels': np.random.uniform(0, 1, (batch_size, num_classes))
+            .astype("float32")
+        }
+        # Fw Pass is implemented as elementwise sigmoid followed by
+        # elementwise logistic loss
+        # Labels * -log(sigmoid(X)) + (1 - labels) * -log(1 - sigmoid(X))
+        sigmoid_X = expit(self.inputs['X'])
+        term1 = self.inputs['Labels'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Labels']) * np.log(1 - sigmoid_X)
+        self.outputs = {'Out': -term1 - term2}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
--- a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
@@ -43,7 +43,7 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
    def setUp(self):
        self.op_type = "softmax_with_cross_entropy"
        batch_size = 2
-        class_num = 17
+        class_num = 37
        logits = np.random.uniform(0.1, 1.0,
                                   [batch_size, class_num]).astype("float32")

--- a/python/paddle/v2/framework/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/framework/tests/test_uniform_random_op.py
@@ -24,7 +24,6 @@ class TestUniformRandomOp(unittest.TestCase):
            max=10.0,
            seed=10)
-        op.infer_shape(scope)
        ctx = core.DeviceContext.create(place)
        op.run(scope, ctx)
        tensor = numpy.array(scope.find_var('X').get_tensor())

--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
@@ -96,6 +96,9 @@ class Inference(object):
            for i, item in enumerate(result):
                retv[i].append(item)
+        if retv == None:
+            return []
        if flatten_result:
            retv = [numpy.concatenate(out) for out in retv]