Merge the update in profiling tool

770aff2c · Yibing Liu · 24458ae3 · a893f156 · 770aff2c · 770aff2c
167 changed file
--- a/benchmark/IntelOptimizedPaddle.md
+++ b/benchmark/IntelOptimizedPaddle.md
@@ -93,6 +93,15 @@ Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
 | MKLML     | 22.74  | 41.56  | 81.22  | 133.47 | 210.53 |
 | MKL-DNN   | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |

+- Alexnet
+
+| BatchSize | 1      | 2      | 4      | 8      | 16     |
+|-----------|--------|--------|--------|--------|--------|
+| OpenBLAS  |    |   |   |   |    |
+| MKLML     | 21.32  | 36.55  | 73.06  | 131.15 | 192.77 |
+| MKL-DNN   | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 |
+
+chart TBD

 ### Laptop
 TBD
--- a/benchmark/paddle/image/alexnet.py
+++ b/benchmark/paddle/image/alexnet.py
@@ -19,7 +19,11 @@ args = {
    'num_samples': num_samples
 }
 define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
+    "train.list" if not is_infer else None,
+    "test.list" if is_infer else None,
+    module="provider",
+    obj="process",
+    args=args)

 settings(
    batch_size=batch_size,

--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ b/benchmark/paddle/image/run_openblas_infer.sh
@@ -8,15 +8,19 @@ function clock_to_seconds() {
 }

 function infer() {
-  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
  topology=$1
  layer_num=$2
  bs=$3
-  thread=`nproc`
-  if [ $thread -gt $bs ]; then
-    thread=$bs
+  trainers=`nproc`
+  if [ $trainers -gt $bs ]; then
+    trainers=$bs
  fi
-  log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log"
+  log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log"
+  threads=$((`nproc` / trainers))
+  if [ $threads -eq 0 ]; then
+    threads=1
+  fi
+  export OPENBLAS_NUM_THREADS=$threads

  models_in="models/${topology}-${layer_num}/pass-00000/"
  if [ ! -d $models_in ]; then
@@ -28,7 +32,7 @@ function infer() {
    --config="${topology}.py" \
    --use_mkldnn=False \
    --use_gpu=False \
-    --trainer_count=$thread \
+    --trainer_count=$trainers \
    --log_period=$log_period \
    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
    --init_model_path=$models_in \

--- a/benchmark/paddle/image/run_openblas_train.sh
+++ b/benchmark/paddle/image/run_openblas_train.sh
 set -e

 function train() {
-  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  export OPENBLAS_NUM_THREADS=1
  topology=$1
  layer_num=$2
  bs=$3

--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -252,6 +252,11 @@ first_seq
 ..  autoclass:: paddle.v2.layer.first_seq
    :noindex:

+sub_seq
+---------
+..  autoclass:: paddle.v2.layer.sub_seq
+    :noindex:
+
 concat
 ------
 ..  autoclass:: paddle.v2.layer.concat

--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -68,12 +68,6 @@ scale
    :noindex:


-reshape
---------
-..  autofunction:: paddle.v2.fluid.layers.reshape
-    :noindex:
-
-
 transpose
 ---------
 ..  autofunction:: paddle.v2.fluid.layers.transpose

--- a/doc/design/backward.md
+++ b/doc/design/backward.md
+# Backward Building
+
+## Motivation
+
+In Neural Network, most models are solved by the backpropagation algorithm(known as **BP**) at present. Technically, BP calculates the gradient of the loss function, then propagates it back through the networks following the chain rule. However, when configuring the model structure, users do not need to define the backward part. So a mechanism is required by the framework which can complete the model's backward part automatically according to the given forward part.
+
+When implementing a specific `op`, the developer is also asked to implement its backward version, called `grad_op`. A `grad_op` takes the gradients of its corresponding `op`'s outputs and calculates the gradients of the `op`'s inputs. During the building of a model's backward part, the framework creates each forward `op`'s `grad_op`, and then strings them together in the reverse order of the forward part. In this way, gradients spread from the end to the beginning of the model, in other words, from the loss to the parameters.
+
+## Challenges
+
+The motivation of backward building is apparent. However, implementing it correctly is not so easy. In the **Fluid** design, a deep learning model is described by `Program`, `Block`, `Op` and `Variable`. The `Block` itself can be nested. It means that the `op`s and `variable`s are scattered across different blocks rather than all being gathered in a single graph. Our backward building algorithm shall visit blocks in recursive order and be able to insert `grad_op`s and newly created `variable`s into the right place.
+
+## Usage
+
+Although the whole algorithm is comprised of many functions, only one is exposed as API:
+
+```python
+def append_backward(loss, parameter_list=None, no_grad_set=None):
+    """
+    Append backward part to main_program
+
+    Args:
+        loss(Variable): The variable generated by the cost function.
+        parameter_list(list): Parameters that need to be updated by optimizers.
+            If None, it means all parameters need to be updated.
+
+        no_grad_set(set): Variables that have no gradients in Block 0. 
+            If None, the set will be generated inside the function and 
+            contains all variables with `step_gradient=True` from all blocks.
+        
+    Return:
+        (list[Variable]): list of (parameters, gradients) pair.
+    """
+```
+
+By invoking this API, the framework appends the backward part of the program where the `loss` is. It takes three arguments. `loss` means the final loss value. It must be a scalar and is usually the output of the loss layer. It is also where the gradient is generated and backpropagation starts. `parameter_list` marks all parameters that need updating. If it's `None`, all parameters will be updated by optimizers. `no_grad_set` marks variables without gradient. If all outputs of some `grad_op` are in `no_grad_set`, the `grad_op` will not be run.
+
+This API will be invoked automatically before optimizer building. 
+As a result, in most cases, users do not need to invoke the API by themselves to append backward part.
+
+## Implementation
+
+The implementation of backward building algorithm is in `backward.py` file. The whole algorithm can be divided into two independent parts: creating `grad_op`s and creating new variables. 
+
+### Creating `grad_op`s
+
+The creating of `grad_op`s is implemented by:
+
+```python
+def _append_backward_ops_(target,
+                          block,
+                          target_block,
+                          no_grad_dict,
+                          grad_to_var):
+    """
+    Create all grad ops, and insert them into given block
+
+    Args:
+        target(Variable): the target variable of forward pass
+        block(Block): the block where forward ops are
+        target_block(Block): the block which is going to hold new generated grad ops
+        no_grad_dict(dict): 
+            key(int)  block index
+            val(set) a set of variable names. These variables have no gradient
+        grad_to_var(dict)(output argument):
+            key(str): grad variable name
+            val(str): corresponding forward variable name
+    """
+```
+
+Given a `block`, the function traverses all `op`s in this block in reverse order, gets the corresponding `grad_op` from the C++ core via `core.get_grad_op_desc()`, and then appends it to `target_block`.
+
+However, some specific `op`s (e.g. `while_op`, `if_else_op`) can hold their own sub-block. Since these sub-blocks contain `op`s as well, the creation of `grad_op`s should be recursive.
+
+During the reverse traversal, we check each `op` whether it has an attribute named `sub_block`. If so, it means there is a sub-block and we need to deal with it first. After creating a new block whose father is the one in `op`'s attribute, we invoke `_append_backward_ops_()` recursively, assigning the new block to parameter `target_block` and the one in `op`'s attribute to `block`. The *pseudo-code* shows this process:
+
+```
+******* pseudo-code ********
+for op in reversed(block.ops):
+    if op has an attribute named 'sub_block':
+        Get the sub-block(`s_block`) from op's attribute.
+        Create a new block(`grad_s_block`), whose father is `s_block`.
+        Invoke _append_backward_ops_(), with `block=s_block` and `target_block=grad_s_block`
+    
+    Invoke `core.get_grad_op_desc()` to get op's grad_op.
+    Insert name correspondings between variables and their gradients of the grad_op to grad_to_var
+    Assign grad_s_block to grad_op as its 'sub_block' attribute.
+    Append grad_op to current target_block.
+```
+
+The first invocation of `_append_backward_ops_()` is initiated by `append_backward()`, in which the parameters `block` and `target_block` are both assigned the root block (the block with index 0).
+
+### Corner Cases of `grad_op` Creating
+
+In the previous section, we showed the regular process of `grad_op` creation. However, in some corner cases, the conventional algorithm is not enough to get the correct result and additional handling is required. These additional processes run after the algorithm mentioned above and make some special adjustments to its output `grad_op`s.
+
+#### Shared Variables
+
+If a variable is read by more than one `op` in the forward pass, its gradient is likely to be written by more than one `grad_op` in the following backward pass. To make the gradient result be the sum of all `grad_op`s' outputs instead of the output of the last one run, we assign each output a temporary variable and then add a `sum_op` to add them up.
+
+For debugging convenience, if the final gradient name is `w@GRAD`, its corresponding temporary variables will be named `w@GRAD@RENAME@0`, `w@GRAD@RENAME@1`...
+
+See function `_addup_repetitive_outputs_` in `backward.py` for implementation details.
+
+#### No Gradient Variables
+
+In our framework, variables can be marked as *no_gradient*, which means that the gradient of this variable is unnecessary and can be considered as zero in model training. Apparently, when all the outputs of some `grad_op` are marked as *no_gradient*, the `grad_op` itself can be skipped in the backward pass.
+
+Another situation is when all the gradient inputs of some `grad_op` are marked as *no_gradient*, which means all of them can be considered as zeros. Since `grad_op`s are in essence the propagation of gradients, all the outputs are definitely zeros when all gradient inputs are zeros. Therefore the `grad_op` can also be skipped.
+
+It should be noted that all these zero gradients still need to be created and initialized by something; otherwise the subsequent `grad_op`s that take these gradients as inputs run the risk of using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them as all zeros.
+
+These features are implemented in the function `_remove_no_grad_branch_`. It checks the newly created `grad_op`s one by one, removes those that can be skipped and inserts `fill_zeros_like_op` when necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict` or generate it on the fly by scanning all variables' `no_gradient` attribute (True or False).
+
+### Creating Backward Variables
+
+Up to now, we have completed all creating and adjusting jobs of `grad_op`s. However, backward variables have not been created. Now they are only represented by `grad_op`'s input and output arguments. The backward variable creating job will be done by:
+
+```python
+def _append_backward_vars_(block, 
+                           start_op_idx, 
+                           grad_to_var, 
+                           grad_info_map):
+    """
+    Create new variables required by backward pass.
+
+    Args:
+        block(Block): the block where new variables will be created
+        start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created
+        grad_to_var(dict):
+            key(str): grad variable name
+            val(str): corresponding forward variable name
+            In most cases, this dict is generated by _append_backward_ops_()
+        grad_info_map(dict)(output argument):
+            key(str): forward variable name
+            val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index
+    """
+```
+
+Given a `block`, this function traverses all the `grad_op`s in it(The argument `start_op_idx` indicates where the grad_op sequence starts.) and creates all the uncreated outputs. The *pseudo-code* shows this process:
+
+```
+for op in block.ops[start_op_idx : ]:
+
+    if op has an attribute named 'sub_block':
+        Get the sub-block(`s_block`) from op's attribute.
+        Invoke _append_backward_vars_(), with `block=s_block`
+        
+    for var_name in op.all_output_names():
+        if block.has_var_recursive(var_name) or var_name is the name of empty variable:
+            continue
+        create a new variable named 'var_name' in block
+        if grad_to_var.has_key(var_name):
+            set grad_info_map[grad_to_var[var_name]] as a tuple of (var_name, block)
+            
+    do op's var type inference
+    do op's shape inference
+```
--- a/paddle/framework/images/duplicate_op.graffle
+++ b/paddle/framework/images/duplicate_op.graffle
--- a/paddle/framework/images/duplicate_op.png
+++ b/paddle/framework/images/duplicate_op.png
--- a/paddle/framework/images/duplicate_op2.graffle
+++ b/paddle/framework/images/duplicate_op2.graffle
--- a/paddle/framework/images/duplicate_op2.png
+++ b/paddle/framework/images/duplicate_op2.png
--- a/doc/design/images/profiler.png
+++ b/doc/design/images/profiler.png
--- a/doc/design/optimizer.md
+++ b/doc/design/optimizer.md
@@ -79,7 +79,7 @@ class Optimizer(object):
    def minimize(self, loss, parameter_list):
        """Add operations to minimize `loss` by updating `parameter_list`.

-        This method combines interface `append_backward_ops()` and
+        This method combines interface `append_backward()` and
        `create_optimization_pass()` into one.
        """
        params_grads = self.create_backward_pass(loss, parameter_list)

--- a/doc/design/profiler.md
+++ b/doc/design/profiler.md
+## Introduction
+
+There are many performance analysis tools for [different programming languages and different software frameworks](https://en.wikipedia.org/wiki/List_of_performance_analysis_tools). Most popular deep learning frameworks use several programming languages and adapt to heterogeneous platforms. Like most deep learning frameworks, PaddlePaddle uses C++, CUDA and Python as its basic programming languages so that it can run on both CPU and GPU devices. The [`nvprof` tool](http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview) is usually used to analyse CUDA programs. We have [a document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/optimization/cpu_profiling.md) describing how to profile CPU and Python programs with [yep](https://pypi.python.org/pypi/yep) and [Google's perftools](https://github.com/google/pprof). But for [PaddlePaddle fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), the operator is the basic computing unit, and developers usually want to collect the time spent in each operator and locate bottlenecks. `nvprof` usually collects the timeline of CUDA-related activities on both CPU and GPU, including kernel execution, memory transfers, memory sets, CUDA API calls, and events or metrics for CUDA kernels, while `yep` and `Google's perftools` can't collect the timeline of CUDA programs. None of these tools can collect time at the operator level, so we designed this profiling tool.
+
+## Architecture
+
+The work flow for most tasks is shown in the snippet below. Each operator runs many times across all iterations, so the profiler must collect the total time of each operator over the iterations. Moreover, developers may sometimes want to collect a more detailed time span inside an operator or record a time span elsewhere, which requires the profiler to support recording nested time spans. In order to speed up training, all deep learning frameworks support parallel computing, including multiple threads on CPU and multiple GPUs, so the profiler must be able to collect the timeline for each thread. In addition, the profiler itself consumes certain resources, so it must be easy for developers to enable or disable. Finally, the profiler should present a human-readable report.
+
+```python
+for i in xrange(M):  # M is  the iteration number
+  for op in operator_lists: # The `operator_lists` contains all the operators in the network.
+    op.run();
+```
+
+In summary, the profiler should have the following features:
+
+- records time span in loop.
+- supports nested time span.
+- supports multiple threads/multiple GPUs.
+- supports to be enabled and disabled by users.
+
+But how do we record the time for a mixed C++ and CUDA program? There are many C++ APIs to get the current calendar time in the host program. But for the GPU, CUDA kernels may be executed concurrently if they are in different [streams](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams), and CUDA kernels are asynchronous with respect to the host program if there is no synchronization after them. CUDA provides [events](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#events) to monitor the device and perform accurate timing. Inspired by PyTorch and CUDA events, we also design and apply events to record the timeline, and then summarize and present statistics based on these events.
+
+The overall flow is shown as the following figure.
+
+<img src="./images/profiler.png" align="center"/><br/>
+
+### Event
+
+In the above work flow, a pair of events is needed before and after the piece of code whose time is collected. So the event has a flag to mark whether it is a starting event or an ending event. Besides these two kinds of events, sometimes only a marker with a text message is needed, for example, a marker to specify the start or end of profiling. There are three kinds of events:
+
+```c++
+enum EventKind {
+  kMark,
+  kPushRange,
+  kPopRange};
+```
+- kMark: only a marker without time range.
+- kPushRange: mark the starting event for time range. 
+- kPopRange: mark the ending event for time range.
+
+For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used. For many pieces of code, an event list is used to record each piece.
+
+```c++
+class Event {
+ public:
+  // The DeviceContext is used to get current  CUDA stream.
+  Event(EventKind kind, std::string name, uint32_t thread_id,
+        const platform::DeviceContext* dev_ctx = nullptr);
+  double CpuElapsedUs(const Event& e) const;
+  double CudaElapsedUs(const Event& e) const;
+
+ private:
+  EventKind kind_;
+  std::string name_;
+  uint32_t thread_id_;
+  int64_t cpu_ns_;
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t event_ = nullptr;
+  int device_ = -1;
+#endif
+};
+
+struct EventList {
+  std::forward_list<std::vector<Event>> event_blocks;
+};
+```
+
+As mentioned above, there is no need to record the timeline when disabling the profiler. So there is a global state to enable or disable the profiler. 
+
+```c++
+enum ProfilerState {
+  kDisabled, 
+  kCPU,
+  kCUDA
+};
+ProfilerState g_state;
+```
+- kDisabled: the disabled state.
+- kCPU: CPU profiling state.
+- kCUDA: GPU profiling state.
+
+A pair of starting and ending events are pushed to event lists in constructor and destructor of `RecordEvent`. So the timeline is recorded for the code in the lifecycle of an object of `RecordEvent`.
+
+```c++
+struct RecordEvent {
+  explicit RecordEvent(const std::string name,
+                       platform::DeviceContext* dev_ctx = nullptr) {
+    if (kState == ProfilerState::kDisabled) return;
+    // push the starting event to the event lists.
+  }
+  ~RecordEvent() {
+    if (kState == ProfilerState::kDisabled) return;
+    // push the ending event to the event lists.
+  }
+};
+```
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -15,7 +15,7 @@
 获取PaddlePaddle的Docker镜像
 ------------------------------

-执行下面的命令获取最新的PaddlePaddle Docker镜像
+执行下面的命令获取最新的PaddlePaddle Docker镜像，版本为cpu_avx_mkl：

  .. code-block:: bash

@@ -27,7 +27,7 @@

     docker pull docker.paddlepaddle.org/paddle

-下载GPU版本的Docker镜像：
+下载GPU版本（cuda8.0_cudnn5_avx_mkl）的Docker镜像：

  .. code-block:: bash

@@ -54,7 +54,7 @@
 .. _docker_run:

 在Docker中执行PaddlePaddle训练程序
------------------------------
+----------------------------------

 假设您已经在当前目录（比如在/home/work）编写了一个PaddlePaddle的程序 :code:`train.py` （可以参考
 `PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_ 
@@ -82,7 +82,7 @@
 .. _docker_run_book:

 使用Docker启动PaddlePaddle Book教程
------------------------------
+-----------------------------------

 使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook，可以通过网页浏览。
 PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。

--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -16,7 +16,7 @@ After you've read above tutorials you may proceed the following steps.
 Pull PaddlePaddle Docker Image
 ------------------------------

-Run the following command to download the latest Docker images:
+Run the following command to download the latest Docker images, the version is cpu_avx_mkl:

  .. code-block:: bash

@@ -28,7 +28,7 @@ For users in China, we provide a faster mirror:

     docker pull docker.paddlepaddle.org/paddle

-Download GPU version images:
+Download GPU version (cuda8.0_cudnn5_avx_mkl) images:

  .. code-block:: bash

@@ -58,7 +58,7 @@ and run:
 .. _docker_run:

 Launch your training program in Docker
------------------------------
+--------------------------------------

 Assume that you have already written a PaddlePaddle program
 named :code:`train.py` under directory :code:`/home/work` (refer to 

--- a/doc/getstarted/build_and_install/pip_install_cn.rst
+++ b/doc/getstarted/build_and_install/pip_install_cn.rst
@@ -11,14 +11,14 @@ PaddlePaddle可以使用常用的Python包管理工具
 ------------------------------


-执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件。
+执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件，版本为cpu_avx_openblas。

  .. code-block:: bash

     pip install paddlepaddle


-如果需要安装支持GPU的版本，需要执行：
+如果需要安装支持GPU的版本（cuda7.5_cudnn5_avx_openblas），需要执行：

  .. code-block:: bash


--- a/doc/getstarted/build_and_install/pip_install_en.rst
+++ b/doc/getstarted/build_and_install/pip_install_en.rst
@@ -12,14 +12,14 @@ Install Using pip
 ------------------------------

 Run the following command to install PaddlePaddle on the current
-machine, it will also download requirements.
+machine, it will also download requirements, the version is cpu_avx_openblas.

  .. code-block:: bash

     pip install paddlepaddle


-If you wish to install GPU version, just run:
+If you wish to install GPU version (cuda7.5_cudnn5_avx_openblas), just run:

  .. code-block:: bash


--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
@@ -7,13 +7,13 @@
 ++++++++

 PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12，并安装有Python2.7。
-执行下面的命令完成快速安装：
+执行下面的命令完成快速安装，版本为cpu_avx_openblas：

  .. code-block:: bash

     pip install paddlepaddle

-如果需要安装支持GPU的版本，需要执行：
+如果需要安装支持GPU的版本（cuda7.5_cudnn5_avx_openblas），需要执行：

  .. code-block:: bash


--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
@@ -8,13 +8,13 @@ Quick Install

 You can use pip to install PaddlePaddle with a single command, supports
 CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
-Simply run the following command to install:
+Simply run the following command to install, the version is cpu_avx_openblas:

  .. code-block:: bash

     pip install paddlepaddle

-If you need to install GPU version, run:
+If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:

  .. code-block:: bash


--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -5,10 +5,18 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)

-cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context)
+if (WITH_GPU)
+  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
+else()
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
+endif ()

 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
-cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
+if (WITH_GPU)
+  nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor)
+else()
+  cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
+endif()

 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)

@@ -21,7 +29,7 @@ cc_test(variable_test SRCS variable_test.cc)
 cc_library(scope SRCS scope.cc DEPS glog)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)

-cc_library(data_transform SRCS data_transform.cc DEPS tensor framework_proto)
+cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor framework_proto)
 cc_test(data_transform_test SRCS data_transform_test.cc DEPS data_transform device_context)

 cc_library(attribute SRCS attribute.cc DEPS framework_proto)
@@ -37,7 +45,7 @@ cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)

 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
-cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
+nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)

 py_proto_compile(framework_py_proto SRCS framework.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.

--- a/paddle/framework/backward.md
+++ b/paddle/framework/backward.md
-# Operator/expression 's Backward
-
-## Motivation
-
-In Neural Network, most models are solved by the backpropagation algorithm(known as **BP**) at present. Technically, BP calculates the gradient of the loss function, then propagates it back through the networks following the chain rule. Hence we need a module that chains the gradient operators/expressions together to construct the backward pass. Every forward network needs a backward network to construct the full computation graph. The operator/expression's backward pass will be generated with respect to the forward pass. 
-
-## Implementation
-
-In this design doc, we exported only one API for generating the backward pass.
-
-```c++
-std::unique_ptr<OperatorBase> Backward(const OperatorBase& forwardOp,
-    const std::unordered_set<std::string>& no_grad_vars);
-```
-
-The implementation behind it can be divided into two parts, **Backward Operator Creating** and **Backward Operator Building**.
-
-### Backward Operator Registry
-
-A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs, and output gradients and then calculate its input gradients.
-
-|                        | forward operator | backward operator 
-| ---------------------- | ---------------- |------------------------- |		
-| **Operator::inputs_**  | Inputs       | Inputs, Outputs, OutputGradients |	
-| **Operator::outputs_** | Outputs          | InputGradients            |
-
- In most cases, there is a one-to-one relation between the forward and backward operators. These relations are recorded by a global hash map(`OpInfoMap`). To follow the philosophy of minimum core and to make operators pluggable, the registry mechanism is introduced.
-
-For example, we have `mul_op`, and we can register its information and corresponding backward operator by the following macro:
-
-```cpp
-REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
-```
-
-`mul` is the operator's type. `MulOp` and `MulOpMaker` are the operator class and the operator maker class respectively.
-
-`mul_grad` is the type of backward operator, and `MulOpGrad` is its class name.
-
-### Backward Opeartor Creating
-
-Given a certain forward operator, we can get its corresponding backward operator by calling:
-
-```cpp
-OperatorBase* bwd_op = BuildGradOp(const OperatorBase* fwd_op);
-```
-
-The function `BuildGradOp` will sequentially execute following processes:
-
-1. Get the `type_` of given forward operator, and then get the corresponding backward operator's type by looking up the `OpInfoMap`.
-
-2. Build two maps named `inputs` and `outputs` to temporarily store backward operator's inputs and outputs. Copy forward operator's `inputs_` and `outputs_` to map `inputs`, except these, are not necessary for gradient computing.
-
-3. Add forward inputs' gradient variables into map `output`, adding forward outputs' gradient variables into map `input`.
-
-4. Building backward operator with `inputs`, `outputs` and forward operator's attributes.
-
-### Backward Network Building
-
-A backward network is a series of backward operators. The main idea of building a backward network is creating backward operators in the inverted sequence and appending them together one by one. There are some corner cases that need special processing.
-
-1. Op 
-
-   When the input forward network is an Op, return its gradient Operator immediately. If all of its outputs are in no gradient set, then return a special `NOP`.
-
-2. NetOp 
-
-   In our design, the network itself is also a kind of operator(**NetOp**). So the operators contained by a big network may be some small network. When the input forward network is a NetOp, it needs to call the sub NetOp/Operators backward function recursively. During the process, we need to collect the `OutputGradients` name according to the forward NetOp.
-
-3. RnnOp
-
-   RnnOp is a nested stepnet operator.  Backward module needs to recusively call `Backward` for every stepnet.
-
-4. Sharing Variables
-
-   As illustrated in the figure 1 and figure 2, two operators share the same variable name **W@GRAD**, which will overwrite their shared input variable. 
-
-<p align="center">
-<img src="./images/duplicate_op.png" width="50%" ><br/>
-
-	Figure 1. Sharing variables in operators. 
-
-</p>
-
-	Sharing variable between operators or same input variable used in multiple operators can lead to duplicate gradient variables. As illustrated in figure 2, we need to rename the gradient names recursively and add a generic add operator to prevent overwriting. 
-
-<p align="center">
-<img src="images/duplicate_op2.png" width="40%" ><br/>
-
-	Figure 2. Replace sharing variable's gradient with `Add` operator.
-
-</p>
-
-	Because the framework finds variables according to their names, we need to rename the output links. We add an integer suffix to represent its position in the clockwise direction. 
-
-5. Part of the Gradient is Zero.
-
-   In the whole graph, there is some case of that one operator's gradient is not needed, but its input's gradient is a dependency link of other operator,  we need to fill a same shape gradient matrix in the position. In our implementation, we insert a special `fillZeroLike` operator.
-
-
-Follow these rules above, then collect the sub graph `OutputGradients`/`InputGradients` as the NetOp's and return it.
--- a/paddle/framework/data_transform.cc
+++ b/paddle/framework/data_transform.cc
@@ -11,8 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <functional>

 #include "paddle/framework/data_transform.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/platform/device_context.h"

 namespace paddle {
 namespace framework {
@@ -22,5 +25,111 @@ DataTransformFnMap& DataTransformFnMap::Instance() {
  return data_transform_map;
 }

+auto KernelFP32 = OpKernelType(proto::DataType::FP32, platform::CPUPlace(),
+                               DataLayout::kNHWC, LibraryType::kPlain);
+
+auto KernelFP64 = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
+                               DataLayout::kNHWC, LibraryType::kPlain);
+
+auto KernelNHWC = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
+                               DataLayout::kNHWC, LibraryType::kPlain);
+
+auto KernelNCHW = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
+                               DataLayout::kNCHW, LibraryType::kPlain);
+
+void TransDataType(const platform::DeviceContext* ctx,
+                   const KernelTypePair& kernel_pair, const Variable& in,
+                   Variable* out) {
+  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
+  PADDLE_ENFORCE(
+      platform::places_are_same_class(kernel_pair.first.place_,
+                                      kernel_pair.second.place_),
+      "TransDataType Only Support DataType transform on same place!");
+
+  auto src = in.Get<Tensor>();
+  auto* dst = out->GetMutable<Tensor>();
+
+  auto dims = src.dims();
+  dst->Resize(dims);
+  auto dst_type = kernel_pair.second.data_type_;
+  auto src_type = kernel_pair.first.data_type_;
+
+  switch (src_type) {
+    case proto::DataType::FP32:
+      framework::VisitDataType(dst_type, CastDataType<float>(src, dst, ctx));
+      break;
+    case proto::DataType::FP64:
+      framework::VisitDataType(dst_type, CastDataType<double>(src, dst, ctx));
+      break;
+    case proto::DataType::INT32:
+      framework::VisitDataType(dst_type, CastDataType<int>(src, dst, ctx));
+      break;
+    case proto::DataType::INT64:
+      framework::VisitDataType(dst_type, CastDataType<int64_t>(src, dst, ctx));
+      break;
+    case proto::DataType::BOOL:
+      framework::VisitDataType(dst_type, CastDataType<bool>(src, dst, ctx));
+      break;
+    default:
+      PADDLE_THROW("Not support type %d", src_type);
+  }
+}
+
+void TransDataLayout(const std::vector<int>& axis,
+                     const platform::DeviceContext* ctx,
+                     const KernelTypePair& kernel_pair, const Variable& in,
+                     Variable* out) {
+  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only support Tensor transform!.");
+  PADDLE_ENFORCE(
+      platform::places_are_same_class(kernel_pair.first.place_,
+                                      kernel_pair.second.place_),
+      "TransDataLayout only support DataLayout transform on same place!");
+  PADDLE_ENFORCE(kernel_pair.first.data_type_ == kernel_pair.second.data_type_,
+                 "TransDataLayout only support Datatype are same!");
+
+  auto src = in.Get<Tensor>();
+  auto* dst = out->GetMutable<Tensor>();
+  PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!");
+
+  auto place = kernel_pair.second.place_;
+  CopyFrom(src, place, *ctx, dst);
+
+  auto src_dim = src.dims();
+  std::vector<int64_t> dst_dim;
+
+  dst_dim.resize(axis.size());
+  for (size_t i = 0; i < axis.size(); i++) {
+    dst_dim[i] = src_dim[axis[i]];
+  }
+
+  dst->Resize(make_ddim(dst_dim));
+
+  auto src_type = kernel_pair.first.data_type_;
+  framework::VisitDataType(src_type, CastDataLayout(ctx, axis, src, dst));
+
+  dst->set_layout(kernel_pair.second.data_layout_);
+}
+
 }  // namespace framework
 }  // namespace paddle
+
+namespace f = paddle::framework;
+
+namespace {
+std::vector<int> NHWC2NCHW = {0, 3, 1, 2};
+std::vector<int> NCHW2NHWC = {0, 2, 3, 1};
+}
+
+REGISTER_DATA_TRANSFORM_FN(f::KernelFP32, f::KernelFP64, f::TransDataType);
+REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW,
+                           std::bind(f::TransDataLayout, NHWC2NCHW,
+                                     std::placeholders::_1,
+                                     std::placeholders::_2,
+                                     std::placeholders::_3,
+                                     std::placeholders::_4));
+REGISTER_DATA_TRANSFORM_FN(f::KernelNCHW, f::KernelNHWC,
+                           std::bind(f::TransDataLayout, NCHW2NHWC,
+                                     std::placeholders::_1,
+                                     std::placeholders::_2,
+                                     std::placeholders::_3,
+                                     std::placeholders::_4));
--- a/paddle/framework/data_transform.h
+++ b/paddle/framework/data_transform.h
@@ -21,17 +21,20 @@ limitations under the License. */
 #include "paddle/framework/op_kernel_type.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/framework/variable.h"
+#include "paddle/operators/math/math_function.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/macros.h"
+#include "paddle/platform/transform.h"

 namespace paddle {
 namespace framework {

-using DataTransformFN =
-    std::function<void(const std::vector<platform::DeviceContext*> ctx,
-                       const Variable& in, Variable* out)>;
 using KernelTypePair = std::pair<OpKernelType, OpKernelType>;

+using DataTransformFn =
+    std::function<void(const platform::DeviceContext*, const KernelTypePair&,
+                       const Variable&, Variable*)>;
+
 struct KernelTypePairHash {
  static void HashCombine(const OpKernelType& t, std::size_t* seed) {
    OpKernelType::Hash kernel_type_hasher;
@@ -46,8 +49,69 @@ struct KernelTypePairHash {
  }
 };

+template <typename InType, typename OutType>
+struct CastDataTypeFunctor {
+  HOSTDEVICE inline OutType operator()(InType in) const {
+    return static_cast<OutType>(in);
+  }
+};
+
+template <typename InType>
+struct CastDataType {
+  CastDataType(const framework::Tensor& in, framework::Tensor* out,
+               const platform::DeviceContext* ctx)
+      : in_(in), out_(out), ctx_(ctx) {}
+  const framework::Tensor in_;
+  framework::Tensor* out_;
+  const platform::DeviceContext* ctx_;
+
+  template <typename OutType>
+  void operator()() {
+    auto place = ctx_->GetPlace();
+
+    auto* in_begin = in_.data<InType>();
+    auto numel = in_.numel();
+    auto* in_end = in_begin + numel;
+    auto* out_begin = out_->mutable_data<OutType>(place);
+
+    if (platform::is_cpu_place(place)) {
+      platform::Transform<platform::CPUDeviceContext> trans;
+      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
+      trans(*context, in_begin, in_end, out_begin,
+            CastDataTypeFunctor<InType, OutType>());
+    } else {
+      // TODO(dzhwinter): enhance CopyFrom CPU<->GPU with different data type?
+      PADDLE_THROW("Unsupport CPU <-> GPU!");
+    }
+  }
+};
+
+struct CastDataLayout {
+  CastDataLayout(const platform::DeviceContext* ctx,
+                 const std::vector<int>& axis, const framework::Tensor& in,
+                 framework::Tensor* out)
+      : in_(in), out_(out), ctx_(ctx), axis_(axis) {}
+  const framework::Tensor in_;
+  framework::Tensor* out_;
+  const platform::DeviceContext* ctx_;
+  const std::vector<int> axis_;
+
+  template <typename T>
+  void operator()() {
+    auto place = ctx_->GetPlace();
+
+    if (platform::is_cpu_place(place)) {
+      operators::math::Transpose<platform::CPUDeviceContext, T, 4> trans4;
+      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
+      trans4(*context, in_, out_, axis_);
+    } else {
+      PADDLE_THROW("Unsupport CPU <-> GPU!");
+    }
+  }
+};
+
 using DataTransformMap =
-    std::unordered_map<KernelTypePair, DataTransformFN, KernelTypePairHash>;
+    std::unordered_map<KernelTypePair, DataTransformFn, KernelTypePairHash>;

 class DataTransformFnMap {
 public:
@@ -58,25 +122,25 @@ class DataTransformFnMap {
  }

  void Insert(const OpKernelType& left, const OpKernelType& right,
-              const DataTransformFN& data_tranform_fn) {
+              const DataTransformFn& data_tranform_fn) {
    Insert(std::make_pair(left, right), data_tranform_fn);
  }

  void Insert(const KernelTypePair& kernel_type_pair,
-              const DataTransformFN& data_tranform_fn) {
+              const DataTransformFn& data_tranform_fn) {
    PADDLE_ENFORCE(!Has(kernel_type_pair),
                   "KernelTypePair %s has been registered", "");
    map_.insert({kernel_type_pair, data_tranform_fn});
  }

-  const DataTransformFN& Get(const KernelTypePair& key_pair) const {
+  const DataTransformFn& Get(const KernelTypePair& key_pair) const {
    auto data_transformer = GetNullable(key_pair);
    PADDLE_ENFORCE_NOT_NULL(data_transformer,
-                            "DataTransformFN should not be NULL");
+                            "DataTransformFn should not be NULL");
    return *data_transformer;
  }

-  const DataTransformFN* GetNullable(const KernelTypePair& key_pair) const {
+  const DataTransformFn* GetNullable(const KernelTypePair& key_pair) const {
    auto it = map_.find(key_pair);
    if (it == map_.end()) {
      return nullptr;

--- a/paddle/framework/data_transform_test.cc
+++ b/paddle/framework/data_transform_test.cc
@@ -11,36 +11,67 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <array>
+#include <vector>

-#include "paddle/framework/data_transform.h"
 #include <gtest/gtest.h>

+#include "paddle/framework/data_transform.h"
+#include "paddle/platform/device_context.h"
+
 namespace paddle {
 namespace framework {
-
 using namespace platform;

+/**
+ * @brief cross validation of different kernel type transform
+ *  We use a four-bit map to represent the different combinations.
+ *  If a field has multiple possible values, only two of them are chosen.
+ *  For DataType, only test the FP32(float), FP64(double).
+ *  e.g. 0000 -> FP32, CPUPlace, kNHWC, kPlain
+ *       1111 -> FP64, GPUPlace, kNCHW, kMKLDNN
+ */
+
+std::array<proto::DataType, 2> kDataType = {
+    {proto::DataType::FP32, proto::DataType::FP64}};
+
+std::array<Place, 2> kPlace = {{CPUPlace(), CUDAPlace(0)}};
+
+std::array<DataLayout, 2> kDataLayout = {{
+    DataLayout::kNHWC, DataLayout::kNCHW,
+}};
+
+std::array<LibraryType, 2> kLibraryType = {{
+    LibraryType::kPlain, LibraryType::kMKLDNN,
+}};
+
+OpKernelType GenFromBit(const std::vector<bool> bits) {
+  return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]],
+                      kLibraryType[bits[3]]);
+}
+
 int test_value = 0;

-OpKernelType kernel_type_1(proto::DataType::FP32, CPUPlace(), DataLayout::kNCHW,
-                           LibraryType::kCUDNN);
-OpKernelType kernel_type_2(proto::DataType::FP32, CUDAPlace(0),
-                           DataLayout::kNCHW, LibraryType::kCUDNN);
-OpKernelType kernel_type_3(proto::DataType::FP16, CUDAPlace(0),
-                           DataLayout::kNCHW, LibraryType::kCUDNN);
+auto kernel0 = GenFromBit({0, 0, 0, 0});
+auto kernel1 = GenFromBit({0, 0, 0, 1});
+auto kernel2 = GenFromBit({0, 0, 1, 0});
+auto kernel3 = GenFromBit({0, 0, 1, 1});

-void type1_to_type2(std::vector<platform::DeviceContext*> ctx,
-                    const Variable& in, Variable* out) {
+void TransDataType_t(const platform::DeviceContext* ctx,
+                     const KernelTypePair& p, const Variable& in,
+                     Variable* out) {
  test_value++;
 }

-void type2_to_type3(std::vector<platform::DeviceContext*> ctx,
-                    const Variable& in, Variable* out) {
+void TransDataLayout_t(const platform::DeviceContext* ctx,
+                       const KernelTypePair& p, const Variable& in,
+                       Variable* out) {
  test_value--;
 }

-void type1_to_type3(std::vector<platform::DeviceContext*> ctx,
-                    const Variable& in, Variable* out) {
+void TransLibraryType_t(const platform::DeviceContext* ctx,
+                        const KernelTypePair& p, const Variable& in,
+                        Variable* out) {
  test_value += 2;
 }

@@ -49,30 +80,89 @@ void type1_to_type3(std::vector<platform::DeviceContext*> ctx,

 namespace frw = paddle::framework;

-REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_1, frw::kernel_type_2,
-                           frw::type1_to_type2);
-REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_2, frw::kernel_type_3,
-                           frw::type2_to_type3);
-REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_1, frw::kernel_type_3,
-                           frw::type1_to_type3);
+REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel1, frw::TransDataType_t);
+REGISTER_DATA_TRANSFORM_FN(frw::kernel1, frw::kernel2, frw::TransDataLayout_t);
+REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel2, frw::TransLibraryType_t);

 TEST(DataTransform, Register) {
  using namespace paddle::framework;
  using namespace paddle::platform;

  auto& instance = DataTransformFnMap::Instance();
-  ASSERT_EQ(instance.Map().size(), 3UL);
-  std::vector<DeviceContext*> ctx;
  paddle::framework::Variable in;
  paddle::framework::Variable out;

-  instance.Get(std::make_pair(frw::kernel_type_1, frw::kernel_type_2))(ctx, in,
-                                                                       &out);
+  DeviceContext* ctx = new CPUDeviceContext();
+  auto pair0 = std::make_pair(frw::kernel0, frw::kernel1);
+  instance.Get(pair0)(ctx, pair0, in, &out);
  ASSERT_EQ(test_value, 1);
-  instance.Get(std::make_pair(frw::kernel_type_2, frw::kernel_type_3))(ctx, in,
-                                                                       &out);
+
+  auto pair1 = std::make_pair(frw::kernel1, frw::kernel2);
+  instance.Get(pair1)(ctx, pair1, in, &out);
  ASSERT_EQ(test_value, 0);
-  instance.Get(std::make_pair(frw::kernel_type_1, frw::kernel_type_3))(ctx, in,
-                                                                       &out);
+
+  auto pair3 = std::make_pair(frw::kernel0, frw::kernel2);
+  instance.Get(pair3)(ctx, pair3, in, &out);
  ASSERT_EQ(test_value, 2);
 }
+
+TEST(DataTransform, DataLayout) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  auto& instance = DataTransformFnMap::Instance();
+  Variable in;
+  Variable out;
+  Tensor* src = in.GetMutable<Tensor>();
+  src->mutable_data<double>(make_ddim({2, 3, 1, 2}), CPUPlace());
+  src->set_layout(DataLayout::kNHWC);
+
+  DeviceContext* ctx = new CPUDeviceContext();
+
+  {
+    auto kernel1 = GenFromBit({1, 0, 0, 0});
+    auto kernel2 = GenFromBit({1, 0, 1, 0});
+    auto pair0 = std::make_pair(kernel1, kernel2);
+    instance.Get(pair0)(ctx, pair0, in, &out);
+  }
+
+  Tensor dst = out.Get<Tensor>();
+
+  EXPECT_TRUE(dst.layout() == DataLayout::kNCHW);
+  EXPECT_TRUE(dst.dims() == make_ddim({2, 2, 3, 1}));
+
+  {
+    auto kernel1 = GenFromBit({1, 0, 1, 0});
+    auto kernel2 = GenFromBit({1, 0, 0, 0});
+    auto pair0 = std::make_pair(kernel1, kernel2);
+    instance.Get(pair0)(ctx, pair0, out, &in);
+  }
+
+  EXPECT_TRUE(src->layout() == DataLayout::kNHWC);
+  EXPECT_TRUE(src->dims() == make_ddim({2, 3, 1, 2}));
+}
+
+TEST(DataTransform, DataType) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  auto& instance = DataTransformFnMap::Instance();
+  DeviceContext* ctx = new CPUDeviceContext();
+
+  Variable in;
+  Variable out;
+  Tensor* src = in.GetMutable<Tensor>();
+  float* ptr = src->mutable_data<float>(make_ddim({2, 3}), CPUPlace());
+  for (int i = 0; i < 6; ++i) {
+    ptr[i] = i / 3;
+  }
+
+  {
+    auto kernel1 = GenFromBit({0, 0, 0, 0});
+    auto kernel2 = GenFromBit({1, 0, 0, 0});
+    auto pair0 = std::make_pair(kernel1, kernel2);
+    instance.Get(pair0)(ctx, pair0, in, &out);
+  }
+  Tensor dst = out.Get<Tensor>();
+  EXPECT_TRUE(dst.data<double>() != nullptr);
+}
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -14,18 +14,17 @@ limitations under the License. */

 #include "paddle/framework/executor.h"

-#include <algorithm>
-#include <iostream>
-#include <memory>
 #include <set>
-#include <vector>

+#include "gflags/gflags.h"
 #include "paddle/framework/feed_fetch_type.h"
 #include "paddle/framework/lod_rank_table.h"
-#include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/framework/scope.h"
+
+DEFINE_bool(check_nan_inf, false,
+            "Checking whether operator produce NAN/INF or not. It will be "
+            "extremely slow so please use this flag wisely.");

 namespace paddle {
 namespace framework {
@@ -58,6 +57,19 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
  }
 }

+static void CheckTensorNANOrInf(const std::string& name,
+                                const framework::Tensor& tensor) {
+  if (tensor.memory_size() == 0) {
+    return;
+  }
+  if (tensor.type().hash_code() != typeid(float).hash_code() &&
+      tensor.type().hash_code() != typeid(double).hash_code()) {
+    return;
+  }
+  PADDLE_ENFORCE(!framework::HasInf(tensor), "Tensor %s has Inf", name);
+  PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN", name);
+}
+
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                   bool create_local_scope, bool create_vars) {
  // TODO(tonyyang-svail):
@@ -101,8 +113,17 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
    auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
    VLOG(3) << op->DebugString();
    op->Run(*local_scope, place_);
+    if (FLAGS_check_nan_inf) {
+      for (auto& vname : op->OutputVars(true)) {
+        auto* var = local_scope->FindVar(vname);
+        if (var == nullptr) continue;
+        if (var->IsType<framework::LoDTensor>()) {
+          CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
        }
-  if (create_local_scope) {
+      }
+    }
+  }
+  if (create_vars && create_local_scope) {
    scope->DeleteScope(local_scope);
  }
 }

--- a/paddle/framework/init.cc
+++ b/paddle/framework/init.cc
@@ -71,7 +71,7 @@ bool InitDevices(const std::vector<std::string> &devices) {
    places.emplace_back(platform::CPUPlace());
    LOG(WARNING) << "Not specified CPU device, create CPU by Default.";
  }
-  platform::DeviceContextPool::Create(places);
+  platform::DeviceContextPool::Init(places);
  return true;
 }


--- a/paddle/framework/library_type.h
+++ b/paddle/framework/library_type.h
@@ -20,7 +20,11 @@ namespace framework {
 // For more details about the design of LibraryType, Please refer to
 // https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md#library

-enum class LibraryType { kPlain = 0, kMKLDNN = 1, kCUDNN = 2 };
+enum class LibraryType {
+  kPlain = 0,
+  kMKLDNN = 1,
+  kCUDNN = 2,
+};

 inline std::string LibraryTypeToString(const LibraryType& library_type) {
  switch (library_type) {
@@ -31,7 +35,26 @@ inline std::string LibraryTypeToString(const LibraryType& library_type) {
    case LibraryType::kCUDNN:
      return "CUDNN";
    default:
-      PADDLE_THROW("unknown LibraryType %d", library_type);
+      PADDLE_THROW("unknown LibraryType %d", static_cast<int>(library_type));
+  }
+}
+
+inline LibraryType StringToLibraryType(const char* ctype) {
+  std::string s(ctype);
+  if (s == std::string("PLAIN")) {
+    return LibraryType::kPlain;
+  } else if (s == std::string("MKLDNN")) {
+    return LibraryType::kMKLDNN;
+  } else if (s == std::string("CUDNN")) {
+    return LibraryType::kCUDNN;
+    // To be compatible with the kernel register macro, which passes the
+    // device type: CPU and CUDA map to the same library type as PLAIN.
+  } else if (s == std::string("CPU")) {
+    return LibraryType::kPlain;
+  } else if (s == std::string("CUDA")) {
+    return LibraryType::kPlain;
+  } else {
+    PADDLE_THROW("Unknown LibraryType %s", s.c_str());
  }
 }


--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -189,58 +189,12 @@ void AppendLoD(LoD *lod, const LoD &lod_length) {

 void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
                       const platform::DeviceContext &dev_ctx) {
-  // TODO(typhoonzero): serialize to ostream
-  {  // the 1st field, uint32_t version
+  {  // the 1st field, uint32_t version for LoDTensor
    constexpr uint32_t version = 0;
    os.write(reinterpret_cast<const char *>(&version), sizeof(version));
  }
-  {  // the 2nd field, tensor description
-     // int32_t  size
-     // void*    protobuf message
-    proto::TensorDesc desc;
-    desc.set_data_type(framework::ToDataType(tensor.type()));
-    auto dims = framework::vectorize(tensor.dims());
-    auto *pb_dims = desc.mutable_dims();
-    pb_dims->Resize(static_cast<int>(dims.size()), 0);
-    std::copy(dims.begin(), dims.end(), pb_dims->begin());
-    int32_t size = desc.ByteSize();
-    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
-    auto out = desc.SerializeAsString();
-    os.write(out.data(), size);
-  }
-  {  // the 3rd field, tensor data
-    uint64_t size = tensor.memory_size();
-    auto *data_ptr = tensor.data<void>();
-    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
-                   "Index overflow when writing tensor");
-    if (platform::is_gpu_place(tensor.place())) {
-#ifdef PADDLE_WITH_CUDA
-      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
-      std::unique_ptr<char[]> buf(new char[kBufSize]);
-      auto &gpu_dev_ctx =
-          static_cast<const platform::CUDADeviceContext &>(dev_ctx);
-      platform::CPUPlace cpu;
-      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
-      while (size != 0) {
-        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
-        memory::Copy(cpu, buf.get(),
-                     boost::get<platform::CUDAPlace>(tensor.place()),
-                     reinterpret_cast<const void *>(data), size_to_write,
-                     gpu_dev_ctx.stream());
-        gpu_dev_ctx.Wait();
-        os.write(buf.get(), size_to_write);
-        data += size_to_write;
-        size -= size_to_write;
-      }
-#else
-      PADDLE_THROW("Unexpected branch");
-#endif
-    } else {
-      os.write(static_cast<const char *>(data_ptr),
-               static_cast<std::streamsize>(size));
-    }
-  }
-  {  // the 4th field, lod information
+  {
+    // the 2nd field, LoD information
    // uint64_t lod_level
    // uint64_t lod_level_1 size in byte.
    // int*     lod_level_1 data
@@ -256,49 +210,19 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
               static_cast<std::streamsize>(size));
    }
  }
+  // the 3rd field, Tensor
+  SerializeToStream(os, static_cast<Tensor>(tensor), dev_ctx);
 }

 void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
+  {
+    // the 1st field, uint32_t version for LoDTensor
    uint32_t version;
    is.read(reinterpret_cast<char *>(&version), sizeof(version));
    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-  proto::TensorDesc desc;
-  {  // int32_t size
-     // proto buffer
-    int32_t size;
-    is.read(reinterpret_cast<char *>(&size), sizeof(size));
-    std::unique_ptr<char[]> buf(new char[size]);
-    is.read(reinterpret_cast<char *>(buf.get()), size);
-    PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
-                   "Cannot parse tensor desc");
-  }
-  {  // read tensor
-    std::vector<int64_t> dims;
-    dims.reserve(static_cast<size_t>(desc.dims().size()));
-    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
-    tensor->Resize(framework::make_ddim(dims));
-
-    void *buf;
-    platform::Place cpu = platform::CPUPlace();
-    switch (desc.data_type()) {
-      case proto::FP32:
-        buf = tensor->mutable_data<float>(cpu);
-        break;
-      case proto::FP64:
-        buf = tensor->mutable_data<double>(cpu);
-        break;
-      case proto::INT32:
-        buf = tensor->mutable_data<int>(cpu);
-        break;
-      case proto::INT64:
-        buf = tensor->mutable_data<int64_t>(cpu);
-        break;
-      default:
-        PADDLE_THROW("DataType %d not supported", desc.data_type());
-    }
-    is.read(static_cast<char *>(buf), tensor->memory_size());
  }
-  {  // read lod
+  {
+    // the 2nd field, LoD information
    uint64_t lod_level;
    is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
    auto &lod = *tensor->mutable_lod();
@@ -312,6 +236,8 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
      lod[i] = tmp;
    }
  }
+  // the 3rd field, Tensor
+  DeserializeFromStream(is, static_cast<Tensor *>(tensor));
 }

 }  // namespace framework

--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -126,6 +126,20 @@ TEST_F(LoDTensorTester, ShrinkInLevel) {
  EXPECT_NE(t1.data<float>(), lod_tensor_.data<float>());
 }

+TEST_F(LoDTensorTester, SerializeAndDeserialize) {
+  LoDTensor dst_tensor;
+  platform::CPUDeviceContext cpu_ctx((platform::CPUPlace()));
+  std::ostringstream oss;
+  SerializeToStream(oss, lod_tensor_, cpu_ctx);
+  std::istringstream iss(oss.str());
+  DeserializeFromStream(iss, &dst_tensor);
+  float* dst_ptr = dst_tensor.mutable_data<float>(platform::CPUPlace());
+  for (int i = 0; i < kLodTensorSize; ++i) {
+    EXPECT_EQ(dst_ptr[i], i);
+  }
+  EXPECT_EQ(dst_tensor.lod(), lod_tensor_.lod());
+}
+
 TEST(LodExpand, test) {
  LoD lod{{0, 2}};
  LoDTensor tensor;

--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -88,6 +88,14 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
  need_update_ = true;
 }

+void OpDesc::CopyFrom(const OpDesc &op_desc) {
+  desc_.set_type(op_desc.Type());
+  inputs_ = op_desc.inputs_;
+  outputs_ = op_desc.outputs_;
+  attrs_ = op_desc.attrs_;
+  need_update_ = true;
+}
+
 OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog)
    : desc_(desc), need_update_(false) {
  // restore inputs_
@@ -252,7 +260,13 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
  void operator()(int v) const { attr_->set_i(v); }
  void operator()(float v) const { attr_->set_f(v); }
  void operator()(const std::string &v) const { attr_->set_s(v); }
-  void operator()(bool b) const { attr_->set_b(b); }
+
+  // Please refer to https://github.com/PaddlePaddle/Paddle/issues/7162
+  template <class T,
+            class = typename std::enable_if<std::is_same<bool, T>::value>::type>
+  void operator()(T b) const {
+    attr_->set_b(b);
+  }

  void operator()(const std::vector<int> &v) const {
    VectorToRepeated(v, attr_->mutable_ints());
@@ -266,9 +280,7 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
  void operator()(const std::vector<bool> &v) const {
    VectorToRepeated(v, attr_->mutable_bools());
  }
-  void operator()(proto::BlockDesc *desc) const {
-    attr_->set_block_idx(desc->idx());
-  }
+  void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); }
  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
 };


--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -35,6 +35,8 @@ class OpDesc {

  OpDesc(const proto::OpDesc &desc, ProgramDesc *prog);

+  void CopyFrom(const OpDesc &op_desc);
+
  proto::OpDesc *Proto();

  std::string Type() const { return desc_.type(); }

--- a/paddle/framework/op_kernel_type.h
+++ b/paddle/framework/op_kernel_type.h
@@ -68,6 +68,8 @@ struct OpKernelType {
           data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
           library_type_ == o.library_type_;
  }
+
+  bool operator!=(const OpKernelType& o) const { return !(*this == o); }
 };

 inline std::ostream& operator<<(std::ostream& os,
@@ -78,5 +80,11 @@ inline std::ostream& operator<<(std::ostream& os,
  return os;
 }

+inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
+  std::ostringstream stream;
+  stream << kernel_key;
+  return stream.str();
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/op_kernel_type_test.cc
+++ b/paddle/framework/op_kernel_type_test.cc
@@ -26,10 +26,8 @@ TEST(OpKernelType, ToString) {
  OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
                              LibraryType::kCUDNN);

-  std::ostringstream stream;
-  stream << op_kernel_type;
  ASSERT_EQ(
-      stream.str(),
+      paddle::framework::KernelTypeToString(op_kernel_type),
      "data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]");
 }


--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -37,8 +37,8 @@ class Registrar {
 public:
  // In our design, various kinds of classes, e.g., operators and kernels,
  // have their corresponding registry and registrar. The action of
-  // registration is in the constructor of a global registrar variable, which,
-  // however, are not used in the code that calls package framework, and would
+  // registration is in the constructor of a global registrar variable, which
+  // is not used in the code that calls package framework, and would
  // be removed from the generated binary file by the linker. To avoid such
  // removal, we add Touch to all registrar classes and make USE_OP macros to
  // call this method. So, as long as the callee code calls USE_OP, the global
@@ -79,30 +79,31 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
  using KERNEL_TYPE =
      typename std::tuple_element<I, std::tuple<KernelTypes...>>::type;

-  void operator()(const char* op_type) const {
+  void operator()(const char* op_type, const char* library_type) const {
    using T = typename KERNEL_TYPE::ELEMENT_TYPE;
-    OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType());
+    OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
+                     DataLayout::kAnyLayout, StringToLibraryType(library_type));
    OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);

    constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
    OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
        func;
-    func(op_type);
+    func(op_type, library_type);
  }
 };

 template <typename PlaceType, size_t I, typename... KernelType>
 struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
-  void operator()(const char* op_type) const {}
+  void operator()(const char* op_type, const char* library_type) const {}
 };

 // User can register many kernel in one place. The data type could be different.
 template <typename PlaceType, typename... KernelType>
 class OpKernelRegistrar : public Registrar {
 public:
-  explicit OpKernelRegistrar(const char* op_type) {
+  explicit OpKernelRegistrar(const char* op_type, const char* library_type) {
    OpKernelRegistrarFunctor<PlaceType, false, 0, KernelType...> func;
-    func(op_type);
+    func(op_type, library_type);
  }
 };

@@ -181,7 +182,8 @@ class OpKernelRegistrar : public Registrar {
      __reg_op_kernel_##op_type##_##DEVICE_TYPE##__,                      \
      "REGISTER_OP_KERNEL must be called in global namespace");           \
  static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__> \
-      __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__(#op_type);      \
+      __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__(#op_type,       \
+                                                          #DEVICE_TYPE);  \
  int TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE() {                \
    __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__.Touch();          \
    return 0;                                                             \

--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include "paddle/framework/op_registry.h"
 #include <gtest/gtest.h>

@@ -182,3 +196,71 @@ TEST(OperatorRegistrar, Test) {
  using namespace paddle::framework;
  OperatorRegistrar<CosineOpComplete, CosineOpProtoAndCheckerMaker> reg("cos");
 }
+
+namespace paddle {
+namespace framework {
+
+// Proto/checker maker for the test operator. The constructor declares no
+// inputs, outputs or attributes; it only attaches a comment string.
+class OpKernelTestMaker : public OpProtoAndCheckerMaker {
+ public:
+  OpKernelTestMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddComment("NoGradOp, same input output. no Grad");
+  }
+};
+
+// Minimal kernel-backed operator used by the registrar tests below.
+class OpWithKernelTest : public OperatorWithKernel {
+ public:
+  // Reuse the base-class constructors unchanged.
+  using OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  // Shape inference is intentionally a no-op for this test operator.
+  void InferShape(InferShapeContext* ctx) const override {}
+
+  // Always asks for an FP32 kernel, built from the execution context's
+  // device context.
+  framework::OpKernelType GetActualKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(proto::DataType::FP32, ctx.device_context());
+  }
+};
+
+// Kernel with an empty Compute(); it exists only so that a kernel can be
+// registered for each DeviceContext/T combination used in the tests.
+template <typename DeviceContext, typename T>
+class OpKernelTest : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const {}
+};
+
+}  // namespace framework
+}  // namespace paddle
+
+// Register the test operator "op_with_kernel" (no gradient op), then a float
+// kernel for it through both the CPU and the CUDA registration macros.
+REGISTER_OP_WITHOUT_GRADIENT(op_with_kernel,
+                             paddle::framework::OpWithKernelTest,
+                             paddle::framework::OpKernelTestMaker);
+REGISTER_OP_CPU_KERNEL(
+    op_with_kernel,
+    paddle::framework::OpKernelTest<paddle::platform::CPUDeviceContext, float>);
+
+REGISTER_OP_CUDA_KERNEL(op_with_kernel,
+                        paddle::framework::OpKernelTest<
+                            paddle::platform::CUDADeviceContext, float>);
+
+// Creates "op_with_kernel" through the op registry and runs it on a CPU place.
+TEST(OperatorRegistrar, CPU) {
+  namespace fw = paddle::framework;
+
+  fw::proto::OpDesc desc;
+  paddle::platform::CPUPlace place;
+  fw::Scope local_scope;
+  desc.set_type("op_with_kernel");
+
+  auto kernel_op = fw::OpRegistry::CreateOp(desc);
+  kernel_op->Run(local_scope, place);
+}
+
+#ifdef PADDLE_WITH_CUDA
+// Same as the CPU test, but runs the operator on CUDA device 0.
+TEST(OperatorRegistrar, CUDA) {
+  namespace fw = paddle::framework;
+
+  fw::proto::OpDesc desc;
+  paddle::platform::CUDAPlace place(0);
+  fw::Scope local_scope;
+  desc.set_type("op_with_kernel");
+
+  auto kernel_op = fw::OpRegistry::CreateOp(desc);
+  kernel_op->Run(local_scope, place);
+}
+#endif
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -384,12 +384,30 @@ class RuntimeInferShapeContext : public InferShapeContext {
  const Scope& scope_;
 };

+// Selects the device context for a data transform between the two kernel keys
+// of `kernel_pair` (first = actual key, second = expected key). For a GPU/CPU
+// pair, in either direction, the context of the GPU-side place is returned;
+// every other combination throws.
+const platform::DeviceContext* GetDeviceContext(
+    framework::KernelTypePair& kernel_pair) {
+  auto& actual = kernel_pair.first;
+  auto& expected = kernel_pair.second;
+  platform::DeviceContextPool& ctx_pool =
+      platform::DeviceContextPool::Instance();
+
+  // Actual kernel on GPU, expected kernel on CPU.
+  if (platform::is_gpu_place(actual.place_) &&
+      platform::is_cpu_place(expected.place_)) {
+    return ctx_pool.Get(actual.place_);
+  }
+
+  // Actual kernel on CPU, expected kernel on GPU.
+  if (platform::is_cpu_place(actual.place_) &&
+      platform::is_gpu_place(expected.place_)) {
+    return ctx_pool.Get(expected.place_);
+  }
+
+  PADDLE_THROW(
+      "Currently, model parallelism is only supported between CPU and CUDA");
+}
+
 void OperatorWithKernel::Run(const Scope& scope,
                             const platform::Place& place) const {
  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
  this->InferShape(&infer_shape_ctx);
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
-  auto dev_ctx = pool.Borrow(place);
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto dev_ctx = pool.Get(place);

  // check if op[type] has kernel registered.
  auto& all_op_kernels = AllOpKernels();
@@ -413,37 +431,47 @@ void OperatorWithKernel::Run(const Scope& scope,
  }

  if (actual_kernel_key == expected_kernel_key) {
-    kernel_iter->second->Compute(ctx);
+    PADDLE_ENFORCE_EQ(actual_kernel_key.place_, expected_kernel_key.place_,
+                      "Currently, model parallelism is only supported between "
+                      "CPU and other devices. For example, multi-GPU model "
+                      "parallelism will failed.");
  } else {
-    Scope& op_scope = scope.NewScope();
+    auto kernel_pair = std::make_pair(actual_kernel_key, expected_kernel_key);
+    const DataTransformFn* trans_fun =
+        DataTransformFnMap::Instance().GetNullable(kernel_pair);
+    if (trans_fun) {
      auto input_vars = this->InputVars();
+      // TODO(qijun) filter the input vars that do not need to be transformed
+
+      // filter vars that has been transformed
+      std::vector<std::string> need_trans;
      for (auto var_name : input_vars) {
-      op_scope.Var(var_name);
+        auto var_name_trans =
+            var_name + framework::KernelTypeToString(expected_kernel_key);
+        if (!scope.FindVar(var_name_trans)) {
+          const_cast<Scope&>(scope).Var(var_name_trans);
+          need_trans.push_back(var_name);
+        }
      }

-    // TODO(qijun) get appropriate DeviceContext from DeviceContext pool
-    platform::DeviceContext* trans_dev_ctx = nullptr;
-    std::vector<platform::DeviceContext*> trans_dev_ctx_vec{trans_dev_ctx};
-
-    // TODO(qijun) get appropriate DataTransformFN from global map
-    framework::DataTransformFN trans_fun = nullptr;
+      if (!need_trans.empty()) {
+        auto trans_dev_ctx = GetDeviceContext(kernel_pair);

        // Wait for transform starting
        dev_ctx->Wait();

-    for (auto var_name : input_vars) {
-      trans_fun(trans_dev_ctx_vec, *(scope.FindVar(var_name)),
-                op_scope.FindVar(var_name));
+        for (auto var_name : need_trans) {
+          (*trans_fun)(trans_dev_ctx, kernel_pair, *(scope.FindVar(var_name)),
+                       scope.FindVar(var_name + framework::KernelTypeToString(
+                                                    expected_kernel_key)));
        }
        // Wait for data transform finishing
-    for (auto ctx : trans_dev_ctx_vec) {
-      ctx->Wait();
+        trans_dev_ctx->Wait();
      }
-
-    // Create a new ExecutionContext
-    ExecutionContext op_ctx(*this, op_scope, *dev_ctx);
-    kernel_iter->second->Compute(op_ctx);
    }
+  }
+
+  kernel_iter->second->Compute(ctx);
 }

 OpKernelType OperatorWithKernel::GetActualKernelType(

--- a/paddle/framework/selected_rows.cc
+++ b/paddle/framework/selected_rows.cc
@@ -12,5 +12,58 @@ limitations under the License. */
 #include "paddle/framework/selected_rows.h"

 namespace paddle {
-namespace framework {}  // namespace framework
+namespace framework {
+// Writes `selected_rows` to `os` as four consecutive binary fields:
+// a uint32_t version, the row indices (uint64_t count followed by each
+// element), the int64_t height, and finally the value tensor, which is
+// written by the Tensor overload of SerializeToStream using `dev_ctx`.
+// All integers are written as raw in-memory bytes.
+void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
+                       const platform::DeviceContext& dev_ctx) {
+  {  // the 1st field, uint32_t version
+    constexpr uint32_t version = 0;
+    os.write(reinterpret_cast<const char*>(&version), sizeof(version));
+  }
+  {
+    // the 2nd field, rows information
+    auto& rows = selected_rows.rows();
+    uint64_t size = rows.size();
+    os.write(reinterpret_cast<const char*>(&size), sizeof(size));
+    for (uint64_t i = 0; i < size; ++i) {
+      os.write(reinterpret_cast<const char*>(&rows[i]), sizeof(rows[i]));
+    }
+  }
+  {
+    // the 3rd field, the height of SelectedRows
+    int64_t height = selected_rows.height();
+    os.write(reinterpret_cast<const char*>(&height), sizeof(height));
+  }
+  // the 4th field, Tensor data
+  SerializeToStream(os, selected_rows.value(), dev_ctx);
+}
+
+// Reads a SelectedRows from `is`, expecting the four fields written by
+// SerializeToStream in the same order: uint32_t version (must be 0), the row
+// indices (uint64_t count followed by each element), the int64_t height, and
+// the value tensor. The result is stored into `*selected_rows`.
+void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows) {
+  {
+    // the 1st field, uint32_t version for SelectedRows
+    uint32_t version;
+    is.read(reinterpret_cast<char*>(&version), sizeof(version));
+    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  }
+  {
+    // the 2nd field, rows information
+    uint64_t size;
+    is.read(reinterpret_cast<char*>(&size), sizeof(size));
+    auto& rows = *selected_rows->mutable_rows();
+    rows.resize(size);
+    for (uint64_t i = 0; i < size; ++i) {
+      is.read(reinterpret_cast<char*>(&rows[i]), sizeof(int64_t));
+    }
+  }
+  {
+    // the 3rd field, the height of the SelectedRows
+    int64_t height;
+    is.read(reinterpret_cast<char*>(&height), sizeof(int64_t));
+    selected_rows->set_height(height);
+  }
+  // the 4th field, tensor which contains the data.
+  // Deserialize straight into the SelectedRows' own value tensor. Going
+  // through a local copy (`auto tensor = *mutable_value()`) would resize and
+  // fill the copy only, leaving the stored value untouched.
+  DeserializeFromStream(is, selected_rows->mutable_value());
+}
+
+}  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/selected_rows.h
+++ b/paddle/framework/selected_rows.h
@@ -59,5 +59,14 @@ class SelectedRows {
  int64_t height_;
 };

+/*
+ * Serialize/deserialize SelectedRows to/from a std::ostream/std::istream.
+ * You can pass an ofstream or ostringstream to serialize to a file
+ * or to an in-memory string. GPU tensor will be copied to CPU.
+ */
+void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
+                       const platform::DeviceContext& dev_ctx);
+void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows);
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/selected_rows_test.cc
+++ b/paddle/framework/selected_rows_test.cc
@@ -43,5 +43,19 @@ TEST_F(SelectedRowsTester, complete_dims) {
  ASSERT_EQ(selected_rows_->GetCompleteDims(), make_ddim({10, 100}));
 }

+// Round-trips the fixture's SelectedRows through an in-memory stream and
+// checks that the row indices and the height survive.
+// NOTE(review): the value tensor of the deserialized object is not compared
+// here, so a bug that affects only the tensor payload would go unnoticed.
+TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
+  SelectedRows dst_tensor;
+  platform::CPUDeviceContext cpu_ctx(place_);
+  std::ostringstream oss;
+
+  SerializeToStream(oss, *selected_rows_, cpu_ctx);
+
+  std::istringstream iss(oss.str());
+  DeserializeFromStream(iss, &dst_tensor);
+
+  ASSERT_EQ(selected_rows_->rows(), dst_tensor.rows());
+  ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -178,7 +178,7 @@ class Tensor {
  DDim dims_;

  /**
-   * @brief the layout of memory block, default is NCHW.
+   * @brief the layout of memory block, default is NHWC.
   *
   * @note the memory allocation order, describe how weight/data is stored
   *       For example, in 4-D Tensor(rank=4), there are three commonly

--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -15,12 +15,13 @@
 #include <gtest/gtest.h>
 #include <string>

+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+
 TEST(Tensor, Dims) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor tt;
+  framework::Tensor tt;
  tt.Resize({2, 3, 4});
-  DDim dims = tt.dims();
+  framework::DDim dims = tt.dims();
  ASSERT_EQ(arity(dims), 3);
  for (int i = 0; i < 3; ++i) {
    EXPECT_EQ(i + 2, dims[i]);
@@ -28,12 +29,12 @@ TEST(Tensor, Dims) {
 }

 TEST(Tensor, DataAssert) {
-  paddle::framework::Tensor src_tensor;
+  framework::Tensor src_tensor;

  bool caught = false;
  try {
    src_tensor.data<double>();
-  } catch (paddle::platform::EnforceNotMet err) {
+  } catch (platform::EnforceNotMet err) {
    caught = true;
    std::string msg =
        "holder_ should not be null\nTensor holds no memory. Call "
@@ -50,61 +51,65 @@ TEST(Tensor, DataAssert) {
   because Memory::Alloc() and Memory::Free() have not been ready.
 */
 TEST(Tensor, MutableData) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
  {
-    Tensor src_tensor;
+    framework::Tensor src_tensor;
    float* p1 = nullptr;
    float* p2 = nullptr;
    // initialization
-    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
+                                        platform::CPUPlace());
    EXPECT_NE(p1, nullptr);
    // set src_tensor a new dim with large size
    // momery is supposed to be re-allocated
-    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CPUPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 4}),
+                                        platform::CPUPlace());
    EXPECT_NE(p2, nullptr);
    EXPECT_NE(p1, p2);
    // set src_tensor a new dim with same size
    // momery block is supposed to be unchanged
-    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CPUPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
+                                        platform::CPUPlace());
    EXPECT_EQ(p1, p2);
    // set src_tensor a new dim with smaller size
    // momery block is supposed to be unchanged
-    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CPUPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
+                                        platform::CPUPlace());
    EXPECT_EQ(p1, p2);
  }

 #ifdef PADDLE_WITH_CUDA
  {
-    Tensor src_tensor;
+    framework::Tensor src_tensor;
    float* p1 = nullptr;
    float* p2 = nullptr;
    // initialization
-    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CUDAPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
+                                        platform::CUDAPlace());
    EXPECT_NE(p1, nullptr);
    // set src_tensor a new dim with large size
    // momery is supposed to be re-allocated
-    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CUDAPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 4}),
+                                        platform::CUDAPlace());
    EXPECT_NE(p2, nullptr);
    EXPECT_NE(p1, p2);
    // set src_tensor a new dim with same size
    // momery block is supposed to be unchanged
-    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CUDAPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
+                                        platform::CUDAPlace());
    EXPECT_EQ(p1, p2);
    // set src_tensor a new dim with smaller size
    // momery block is supposed to be unchanged
-    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CUDAPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
+                                        platform::CUDAPlace());
    EXPECT_EQ(p1, p2);
  }
 #endif
 }

 TEST(Tensor, ShareDataWith) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
  {
-    Tensor src_tensor;
-    Tensor dst_tensor;
+    framework::Tensor src_tensor;
+    framework::Tensor dst_tensor;
    // Try to share data form uninitialized tensor
    bool caught = false;
    try {
@@ -121,16 +126,18 @@ TEST(Tensor, ShareDataWith) {
    }
    ASSERT_TRUE(caught);

-    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
+    src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
+                                 platform::CPUPlace());
    dst_tensor.ShareDataWith(src_tensor);
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }

 #ifdef PADDLE_WITH_CUDA
  {
-    Tensor src_tensor;
-    Tensor dst_tensor;
-    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CUDAPlace());
+    framework::Tensor src_tensor;
+    framework::Tensor dst_tensor;
+    src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
+                                 platform::CUDAPlace());
    dst_tensor.ShareDataWith(src_tensor);
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }
@@ -138,13 +145,12 @@ TEST(Tensor, ShareDataWith) {
 }

 TEST(Tensor, Slice) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
  {
-    Tensor src_tensor;
-    src_tensor.mutable_data<int>(make_ddim({5, 3, 4}), CPUPlace());
-    Tensor slice_tensor = src_tensor.Slice(1, 3);
-    DDim slice_dims = slice_tensor.dims();
+    framework::Tensor src_tensor;
+    src_tensor.mutable_data<int>(framework::make_ddim({5, 3, 4}),
+                                 platform::CPUPlace());
+    framework::Tensor slice_tensor = src_tensor.Slice(1, 3);
+    framework::DDim slice_dims = slice_tensor.dims();
    ASSERT_EQ(arity(slice_dims), 3);
    EXPECT_EQ(slice_dims[0], 2);
    EXPECT_EQ(slice_dims[1], 3);
@@ -153,11 +159,12 @@ TEST(Tensor, Slice) {
    uintptr_t src_data_address =
        reinterpret_cast<uintptr_t>(src_tensor.data<int>());
    uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
-        src_tensor.mutable_data<int>(src_tensor.dims(), CPUPlace()));
+        src_tensor.mutable_data<int>(src_tensor.dims(), platform::CPUPlace()));
    uintptr_t slice_data_address =
        reinterpret_cast<uintptr_t>(slice_tensor.data<int>());
-    uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
-        slice_tensor.mutable_data<int>(slice_tensor.dims(), CPUPlace()));
+    uintptr_t slice_mutable_data_address =
+        reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<int>(
+            slice_tensor.dims(), platform::CPUPlace()));
    EXPECT_EQ(src_data_address, src_mutable_data_address);
    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
    EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
@@ -165,22 +172,25 @@ TEST(Tensor, Slice) {

 #ifdef PADDLE_WITH_CUDA
  {
-    Tensor src_tensor;
-    src_tensor.mutable_data<double>(make_ddim({6, 9}), CUDAPlace());
-    Tensor slice_tensor = src_tensor.Slice(2, 6);
-    DDim slice_dims = slice_tensor.dims();
+    framework::Tensor src_tensor;
+    src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
+                                    platform::CUDAPlace());
+    framework::Tensor slice_tensor = src_tensor.Slice(2, 6);
+    framework::DDim slice_dims = slice_tensor.dims();
    ASSERT_EQ(arity(slice_dims), 2);
    EXPECT_EQ(slice_dims[0], 4);
    EXPECT_EQ(slice_dims[1], 9);

    uintptr_t src_data_address =
        reinterpret_cast<uintptr_t>(src_tensor.data<double>());
-    uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
-        src_tensor.mutable_data<double>(src_tensor.dims(), CUDAPlace()));
+    uintptr_t src_mutable_data_address =
+        reinterpret_cast<uintptr_t>(src_tensor.mutable_data<double>(
+            src_tensor.dims(), platform::CUDAPlace()));
    uintptr_t slice_data_address =
        reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
-    uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
-        slice_tensor.mutable_data<double>(slice_tensor.dims(), CUDAPlace()));
+    uintptr_t slice_mutable_data_address =
+        reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<double>(
+            slice_tensor.dims(), platform::CUDAPlace()));
    EXPECT_EQ(src_data_address, src_mutable_data_address);
    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
    EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
@@ -189,23 +199,19 @@ TEST(Tensor, Slice) {
 }

 TEST(Tensor, ReshapeToMatrix) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor src;
-  int* src_ptr = src.mutable_data<int>({2, 3, 4, 9}, CPUPlace());
+  framework::Tensor src;
+  int* src_ptr = src.mutable_data<int>({2, 3, 4, 9}, platform::CPUPlace());
  for (int i = 0; i < 2 * 3 * 4 * 9; ++i) {
    src_ptr[i] = i;
  }
-  Tensor res = ReshapeToMatrix(src, 2);
+  framework::Tensor res = framework::ReshapeToMatrix(src, 2);
  ASSERT_EQ(res.dims()[0], 2 * 3);
  ASSERT_EQ(res.dims()[1], 4 * 9);
 }

 TEST(Tensor, Layout) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor src;
-  ASSERT_EQ(src.layout(), DataLayout::kNHWC);
-  src.set_layout(DataLayout::kAnyLayout);
-  ASSERT_EQ(src.layout(), DataLayout::kAnyLayout);
+  framework::Tensor src;
+  ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC);
+  src.set_layout(framework::DataLayout::kAnyLayout);
+  ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
 }
--- a/paddle/framework/tensor_util.cc
+++ b/paddle/framework/tensor_util.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/tensor_util.h"
+
+namespace paddle {
+namespace framework {
+// Data-type visitor: when invoked for element type T it flattens `tensor_`
+// into an Eigen vector, applies `predicate_` to it, and stores whether any
+// element satisfied the predicate into the scalar bool tensor `*out_`. The
+// expression is evaluated on the Eigen device of `ctx_`.
+template <typename Predicate, typename DevCtx>
+struct AnyDTypeVisitor {
+  Predicate predicate_;
+  const Tensor& tensor_;
+  const DevCtx& ctx_;
+  Tensor* out_;
+
+  AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx,
+                  Tensor* out)
+      : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
+
+  template <typename T>
+  void operator()() const {
+    auto t = EigenVector<T>::Flatten(tensor_);
+    auto o = EigenScalar<bool>::From(*out_);
+    // return any of predicate_(t) is true.
+    o.device(*ctx_.eigen_device()) = predicate_(t).any();
+  }
+};
+
+// Dispatches on the runtime data type of `tensor` and runs AnyDTypeVisitor
+// for it, writing the boolean result into `out`.
+template <typename Predicate, typename DevCtx>
+inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
+                    const DevCtx& ctx, framework::Tensor* out) {
+  VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor<Predicate, DevCtx>(
+                                               predicate, tensor, ctx, out));
+}
+
+// Place visitor that evaluates "does any element of tensor_ satisfy
+// predicate_" on whichever place it is invoked with and returns the answer as
+// a host-side bool.
+template <typename Predicate>
+struct AnyVisitor : public boost::static_visitor<bool> {
+  const framework::Tensor& tensor_;
+  Predicate predicate_;
+
+  AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
+      : tensor_(tensor), predicate_(std::move(predicate)) {}
+
+  // Allocates a one-element bool tensor on `place`, runs the reduction with
+  // the device context registered for that place, then reads the result back.
+  template <typename Place>
+  bool operator()(const Place& place) const {
+    framework::Tensor out;
+    out.Resize({1});
+    out.mutable_data<bool>(place);
+    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
+    AnyImpl(predicate_, tensor_, *ctx, &out);
+    return this->GetResult(out, place);
+  }
+
+  // GPU result: copy the single bool into a CPU tensor, waiting on the GPU
+  // context both before and after the copy, then read it on the host.
+  bool GetResult(const framework::Tensor& out,
+                 const platform::CUDAPlace& gpu) const {
+    platform::CPUPlace cpu;
+    framework::Tensor tmp;
+    tmp.Resize({1});
+    tmp.mutable_data<bool>(cpu);
+    auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
+    gpuctx->Wait();
+    CopyFrom(out, cpu, *gpuctx, &tmp);
+    gpuctx->Wait();
+    return GetResult(tmp, cpu);
+  }
+
+  // CPU result: the bool can be read directly.
+  bool GetResult(const framework::Tensor& out,
+                 const platform::CPUPlace& cpu) const {
+    return *out.data<bool>();
+  }
+};
+
+// Returns true if `predicate` holds for at least one element of `tensor`.
+// The check is dispatched through AnyVisitor on the place the tensor
+// currently lives on.
+template <typename Predicate>
+inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
+  AnyVisitor<Predicate> visitor(tensor, predicate);
+  auto place = tensor.place();
+  return platform::VisitPlace(place, visitor);
+}
+
+// Element-wise predicate used by HasNAN: forwards to the Eigen expression's
+// isnan().
+struct HasNANPredicate {
+  template <typename T>
+  auto operator()(const T& eigen_vec) const
+      -> decltype(std::declval<T>().isnan()) {
+    // Cast eigen_vector to vector of bool. true if is NaN.
+    return eigen_vec.isnan();
+  }
+};
+
+// True when at least one element of `tensor` is NaN.
+bool HasNAN(const framework::Tensor& tensor) {
+  return Any(tensor, HasNANPredicate());
+}
+
+// Element-wise predicate used by HasInf: forwards to the Eigen expression's
+// isinf().
+struct HasInfPredicate {
+  template <typename T>
+  auto operator()(const T& eigen_vec) const
+      -> decltype(std::declval<T>().isinf()) {
+    // Cast eigen_vector to vector of bool. true if is inf.
+    return eigen_vec.isinf();
+  }
+};
+
+// True when at least one element of `tensor` is infinite.
+bool HasInf(const framework::Tensor& tensor) {
+  return Any(tensor, HasInfPredicate());
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/tensor_util.cu
+++ b/paddle/framework/tensor_util.cu
+./tensor_util.cc
\ No newline at end of file
--- a/paddle/framework/tensor_util.h
+++ b/paddle/framework/tensor_util.h
@@ -13,7 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/framework.pb.h"
 #include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"

 namespace paddle {
 namespace framework {
@@ -205,5 +209,109 @@ inline void CopyToVector(const Tensor& src, std::vector<T>* dst) {
               src_ptr, size);
 }

+// Returns true if a tensor contains NAN, i.e., Not A Number.
+bool HasNAN(const framework::Tensor& tensor);
+
+// Returns true if a tensor contains Inf, i.e., Infinity.
+bool HasInf(const framework::Tensor& tensor);
+
+// Writes `tensor` to `os` as three consecutive binary fields:
+//   1. uint32_t version (currently 0),
+//   2. int32_t byte size of a serialized proto::TensorDesc (data type and
+//      dims), followed by the serialized message itself,
+//   3. the raw tensor bytes (tensor.memory_size() of them).
+// When the tensor lives on a GPU place, `dev_ctx` is static_cast (unchecked)
+// to a CUDADeviceContext, so the caller must pass a CUDA context in that
+// case; for CPU tensors `dev_ctx` is not used.
+inline void SerializeToStream(std::ostream& os, const Tensor& tensor,
+                              const platform::DeviceContext& dev_ctx) {
+  // TODO(typhoonzero): serialize to ostream
+  {  // the 1st field, uint32_t version
+    constexpr uint32_t version = 0;
+    os.write(reinterpret_cast<const char*>(&version), sizeof(version));
+  }
+  {  // the 2nd field, tensor description
+     // int32_t  size
+     // void*    protobuf message
+    proto::TensorDesc desc;
+    desc.set_data_type(framework::ToDataType(tensor.type()));
+    auto dims = framework::vectorize(tensor.dims());
+    auto* pb_dims = desc.mutable_dims();
+    pb_dims->Resize(static_cast<int>(dims.size()), 0);
+    std::copy(dims.begin(), dims.end(), pb_dims->begin());
+    int32_t size = desc.ByteSize();
+    os.write(reinterpret_cast<const char*>(&size), sizeof(size));
+    auto out = desc.SerializeAsString();
+    os.write(out.data(), size);
+  }
+  {  // the 3rd field, tensor data
+    uint64_t size = tensor.memory_size();
+    auto* data_ptr = tensor.data<void>();
+    // The byte count is later handed to ostream::write as a streamsize.
+    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
+                   "Index overflow when writing tensor");
+    if (platform::is_gpu_place(tensor.place())) {
+#ifdef PADDLE_WITH_CUDA
+      // Stage GPU memory through a fixed-size host buffer, one chunk at a
+      // time, waiting for each copy to finish before writing it out.
+      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
+      std::unique_ptr<char[]> buf(new char[kBufSize]);
+      auto& gpu_dev_ctx =
+          static_cast<const platform::CUDADeviceContext&>(dev_ctx);
+      platform::CPUPlace cpu;
+      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
+      while (size != 0) {
+        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
+        memory::Copy(cpu, buf.get(),
+                     boost::get<platform::CUDAPlace>(tensor.place()),
+                     reinterpret_cast<const void*>(data), size_to_write,
+                     gpu_dev_ctx.stream());
+        gpu_dev_ctx.Wait();
+        os.write(buf.get(), size_to_write);
+        data += size_to_write;
+        size -= size_to_write;
+      }
+#else
+      PADDLE_THROW("Unexpected branch");
+#endif
+    } else {
+      os.write(static_cast<const char*>(data_ptr),
+               static_cast<std::streamsize>(size));
+    }
+  }
+}
+
+// Reads a tensor from `is` in the layout produced by SerializeToStream:
+// uint32_t version (must be 0), int32_t size + serialized proto::TensorDesc,
+// then the raw data. `*tensor` is resized to the stored dims and its memory
+// is always allocated on the CPU place. Only FP32, FP64, INT32 and INT64 are
+// handled; any other data type throws.
+// NOTE(review): the stream state and the `size` read from the stream are not
+// validated before use, so truncated or corrupt input is not detected here.
+inline void DeserializeFromStream(std::istream& is, Tensor* tensor) {
+  uint32_t version;
+  is.read(reinterpret_cast<char*>(&version), sizeof(version));
+  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  proto::TensorDesc desc;
+  {  // int32_t size
+     // proto buffer
+    int32_t size;
+    is.read(reinterpret_cast<char*>(&size), sizeof(size));
+    std::unique_ptr<char[]> buf(new char[size]);
+    is.read(reinterpret_cast<char*>(buf.get()), size);
+    PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
+                   "Cannot parse tensor desc");
+  }
+  {  // read tensor
+    std::vector<int64_t> dims;
+    dims.reserve(static_cast<size_t>(desc.dims().size()));
+    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
+    tensor->Resize(framework::make_ddim(dims));
+
+    void* buf;
+    platform::Place cpu = platform::CPUPlace();
+    // TODO(Yancey1989): use VisitDataType instead of DataType switch
+    switch (desc.data_type()) {
+      case proto::FP32:
+        buf = tensor->mutable_data<float>(cpu);
+        break;
+      case proto::FP64:
+        buf = tensor->mutable_data<double>(cpu);
+        break;
+      case proto::INT32:
+        buf = tensor->mutable_data<int>(cpu);
+        break;
+      case proto::INT64:
+        buf = tensor->mutable_data<int64_t>(cpu);
+        break;
+      default:
+        PADDLE_THROW("DataType %d not supported", desc.data_type());
+    }
+    // memory_size() is valid only after mutable_data() above has allocated.
+    is.read(static_cast<char*>(buf), tensor->memory_size());
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/tensor_util_test.cc
+++ b/paddle/framework/tensor_util_test.cc
@@ -13,6 +13,7 @@

 #include "paddle/framework/tensor_util.h"
 #include <gtest/gtest.h>
+#include <cmath>
 #include <string>

 namespace paddle {
@@ -230,5 +231,78 @@ TEST(CopyToVector, Tensor) {
 #endif
 }

+TEST(HasNAN, CPU) {
+  // One NaN among otherwise finite floats must be reported by HasNAN.
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  Tensor src;
+  float* data = src.mutable_data<float>({3}, CPUPlace());
+  const float values[3] = {0.0f, NAN, 0.0f};
+  for (int i = 0; i < 3; ++i) {
+    data[i] = values[i];
+  }
+
+  ASSERT_TRUE(HasNAN(src));
+}
+
+TEST(HasInf, CPU) {
+  // One infinity among otherwise finite doubles must be reported by HasInf.
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  Tensor src;
+  double* data = src.mutable_data<double>({3}, CPUPlace());
+  const double values[3] = {1.0, INFINITY, 0.0};
+  for (int i = 0; i < 3; ++i) {
+    data[i] = values[i];
+  }
+  ASSERT_TRUE(HasInf(src));
+}
+
+// Round-trips a 2x3 int tensor through SerializeToStream and
+// DeserializeFromStream and checks every element: once from a CPU tensor
+// and, when built with CUDA, once from a copy that lives on the GPU.
+TEST(Tensor, SerializeAndDeserialize) {
+  framework::Tensor src_tensor;
+  int array[6] = {1, 2, 3, 4, 5, 6};
+  src_tensor.Resize({2, 3});
+  int* src_ptr = src_tensor.mutable_data<int>(platform::CPUPlace());
+  for (int i = 0; i < 6; ++i) {
+    src_ptr[i] = array[i];
+  }
+  {
+    framework::Tensor dst_tensor;
+    // Kept on the stack: a failing ASSERT_* returns from the test body
+    // immediately, which would leak a heap-allocated place.
+    platform::CPUPlace place;
+    platform::CPUDeviceContext cpu_ctx(place);
+    std::ostringstream oss;
+    SerializeToStream(oss, src_tensor, cpu_ctx);
+
+    std::istringstream iss(oss.str());
+    DeserializeFromStream(iss, &dst_tensor);
+    int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
+    // Compare all six elements so that the final value is verified too.
+    for (int i = 0; i < 6; ++i) {
+      ASSERT_EQ(dst_ptr[i], array[i]);
+    }
+  }
+#ifdef PADDLE_WITH_CUDA
+  {
+    Tensor gpu_tensor;
+    gpu_tensor.Resize({2, 3});
+    Tensor dst_tensor;
+
+    // Stack-allocated for the same early-return reason as above.
+    platform::CUDAPlace gpu_place;
+    platform::CUDADeviceContext gpu_ctx(gpu_place);
+
+    CopyFrom(src_tensor, gpu_place, gpu_ctx, &gpu_tensor);
+
+    std::ostringstream oss;
+    SerializeToStream(oss, gpu_tensor, gpu_ctx);
+
+    std::istringstream iss(oss.str());
+    DeserializeFromStream(iss, &dst_tensor);
+
+    int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
+    for (int i = 0; i < 6; ++i) {
+      ASSERT_EQ(dst_ptr[i], array[i]);
+    }
+  }
+#endif
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/tensor_util_test.cu
+++ b/paddle/framework/tensor_util_test.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "gtest/gtest.h"
+#include "paddle/framework/tensor_util.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace framework {
+
+// Test kernel: writes {0.0, 0.1, NaN} into buf[0..2]. Every thread that runs
+// it writes the same three values; the test below launches a single thread.
+static __global__ void FillNAN(float* buf) {
+  buf[0] = 0.0;
+  buf[1] = 0.1;
+  buf[2] = NAN;
+}
+// Test kernel: writes {0.0, +infinity, 0.5} into buf[0..2]. Every thread that
+// runs it writes the same three values; the test launches a single thread.
+static __global__ void FillInf(float* buf) {
+  buf[0] = 0.0;
+  buf[1] = INFINITY;
+  buf[2] = 0.5;
+}
+
+// HasNAN must report true for a 3-element float tensor on GPU 0 whose last
+// element was set to NaN by the FillNAN kernel.
+TEST(HasNAN, GPU) {
+  Tensor tensor;
+  platform::CUDAPlace gpu(0);
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto* cuda_ctx = pool.GetByPlace(gpu);
+  float* buf = tensor.mutable_data<float>({3}, gpu);
+  // Launch one block of one thread on the context's stream.
+  FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+  // Wait for the kernel to finish before inspecting the tensor.
+  cuda_ctx->Wait();
+  ASSERT_TRUE(HasNAN(tensor));
+}
+
+// HasInf must report true for a 3-element float tensor on GPU 0 whose middle
+// element was set to infinity by the FillInf kernel.
+TEST(HasInf, GPU) {
+  Tensor tensor;
+  platform::CUDAPlace gpu(0);
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto* cuda_ctx = pool.GetByPlace(gpu);
+  float* buf = tensor.mutable_data<float>({3}, gpu);
+  // Launch one block of one thread on the context's stream.
+  FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+  // Wait for the kernel to finish before inspecting the tensor.
+  cuda_ctx->Wait();
+  ASSERT_TRUE(HasInf(tensor));
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/threadpool.h
+++ b/paddle/framework/threadpool.h
@@ -16,6 +16,7 @@ limitations under the License. */

 #include <condition_variable>
 #include <functional>
+#include <future>
 #include <mutex>
 #include <queue>
 #include <thread>
@@ -25,10 +26,11 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-typedef std::function<void()> Task;
-
 class ThreadPool {
 public:
+  typedef std::packaged_task<void()> Task;
+  typedef std::function<void()> Fun;
+
  /**
   * @brief   Get a instance of threadpool, the thread number will
   *          be specified as the number of hardware thread contexts
@@ -61,13 +63,18 @@ class ThreadPool {
  /**
   * @brief   Push a function to the queue, and will be scheduled and
   *          executed if a thread is available.
-   * @param[in] Task  will be pushed to the task queue.
+   * @param[in] fn  Function to wrap in a Task and push to the task queue.
+   * @return    std::future<void> that becomes ready once the task has
+   *            finished; call f.wait() to block until then.
   */
-  void Run(const Task& fn) {
+  std::future<void> Run(const Fun& fn) {
    std::unique_lock<std::mutex> lock(mutex_);
-    tasks_.push(fn);
+    Task task(std::bind(fn));
+    std::future<void> f = task.get_future();
+    tasks_.push(std::move(task));
    lock.unlock();
    scheduled_.notify_one();
+    return f;
  }

  /**
@@ -110,7 +117,7 @@ class ThreadPool {
        break;
      }
      // pop a task from the task queue
-      auto task = tasks_.front();
+      auto task = std::move(tasks_.front());
      tasks_.pop();

      --available_;

--- a/paddle/framework/threadpool_test.cc
+++ b/paddle/framework/threadpool_test.cc
@@ -20,16 +20,21 @@ limitations under the License. */
 namespace framework = paddle::framework;

 void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
+  std::vector<std::future<void>> fs;
  for (int i = 0; i < cnt; ++i) {
-    pool->Run([&sum]() { sum.fetch_add(1); });
+    auto f = pool->Run([&sum]() { sum.fetch_add(1); });
+    fs.push_back(std::move(f));
+  }
+  for (auto& f : fs) {
+    f.wait();
  }
 }

 TEST(ThreadPool, ConcurrentInit) {
  framework::ThreadPool* pool;
-  int concurrent_cnt = 50;
+  int n = 50;
  std::vector<std::thread> threads;
-  for (int i = 0; i < concurrent_cnt; ++i) {
+  for (int i = 0; i < n; ++i) {
    std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); });
    threads.push_back(std::move(t));
  }
@@ -38,13 +43,13 @@ TEST(ThreadPool, ConcurrentInit) {
  }
 }

-TEST(ThreadPool, ConcurrentStart) {
+TEST(ThreadPool, ConcurrentRun) {
  framework::ThreadPool* pool = framework::ThreadPool::GetInstance();
  std::atomic<int> sum(0);
  std::vector<std::thread> threads;
-  int concurrent_cnt = 50;
+  int n = 50;
  // sum = (n * (n + 1)) / 2
-  for (int i = 1; i <= concurrent_cnt; ++i) {
+  for (int i = 1; i <= n; ++i) {
    std::thread t(do_sum, pool, std::ref(sum), i);
    threads.push_back(std::move(t));
  }
@@ -52,5 +57,5 @@ TEST(ThreadPool, ConcurrentStart) {
    t.join();
  }
  pool->Wait();
-  EXPECT_EQ(sum, ((concurrent_cnt + 1) * concurrent_cnt) / 2);
+  EXPECT_EQ(sum, ((n + 1) * n) / 2);
 }
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@@ -74,7 +74,7 @@ const proto::TensorDesc &VarDesc::tensor_desc() const {
    case proto::VarDesc::LOD_TENSOR_ARRAY:
      return desc_.tensor_array().tensor();
    default:
-      PADDLE_THROW("Unexpected branch.");
+      PADDLE_THROW("The type of var '%s' is unsupported.", this->Name());
  }
 }


--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -126,14 +126,165 @@ public:
      inputData += inputChannels * inputHeight * inputWidth;
      outputData += outputChannels * outputHeight * outputWidth;
    }
+  }
+};
+
 #ifdef PADDLE_MOBILE_INFERENCE
-    if (Device == DEVICE_TYPE_CPU) {
-      memory_.reset();
+
+/*
+ * \brief Forward calculation of convolution, optimized for mobile.
+ */
+template <DeviceType Device>
+class GemmConvMobileFunction : public ConvFunctionBase {
+public:
+  // Initialisation is delegated entirely to ConvFunctionBase::init.
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
   }
-#endif
+
+  // Passes the shapes of inputs[0] (input), inputs[1] (filter) and
+  // outputs[0] (output) to checkShape() for validation.
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  // Forward convolution as im2col + GEMM, computed per sample and per group.
+  //
+  // inputs[0] is the input image, inputs[1] the filter, outputs[0] the
+  // output. When im2col is needed, the column matrix is never built in one
+  // piece: it is produced in tiles of at most 256 rows x 2048 columns, and
+  // each tile is multiplied with the matching slice of the filter, so the
+  // scratch buffer stays bounded by one tile.
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    // TODO(hedaoyuan): Need to define some index macros,
+    // to avoid using 0 and 1.
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    // beta = 1 accumulates into the existing output (ADD_TO); beta = 0
+    // overwrites it.
+    real beta;
+    if (outputs[0].getArgType() == ADD_TO) {
+      beta = 1.0;
+    } else {
+      beta = 0.0;
+    }
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+
+    real* inputData = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* outputData = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
+    // Shape of one group's slice of a single input image.
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+
+    TensorShape colShape;
+    real* colData = NULL;
+
+    // Dimensions of the full (virtual) column matrix for one group.
+    size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
+    size_t colWidth = outputHeight * outputWidth;
+    // A tile of the col matrix has at most 256 rows and 2048 columns.
+    size_t stepColHeight = std::min(colHeight, static_cast<size_t>(256));
+    size_t stepColWidth = std::min(colWidth, static_cast<size_t>(2048));
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+
+      // The scratch buffer only has to hold a single tile.
+      resizeBuffer<Device>(stepColHeight * stepColWidth * sizeof(real));
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
+
+    Im2ColMobileFunctor<real> im2col;
+    // Element offsets between consecutive groups in each buffer.
+    size_t inputOffset = imShape.getElements();
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+
+    // Leading dimensions of the full output (nStride) and filter (kStride)
+    // matrices; the tiled GEMM calls address sub-blocks of them.
+    int nStride = colWidth;
+    int kStride = colHeight;
+    for (size_t i = 0; i < batchSize; i++) {
+      for (size_t g = 0; g < groups_; g++) {
+        if (needIm2col) {
+          // The first row-tile uses the caller's beta; later row-tiles add
+          // their partial products on top of it (beta_ = 1 below).
+          real beta_ = beta;
+          for (size_t colHeightStart = 0; colHeightStart < colHeight;
+               colHeightStart += stepColHeight) {
+            for (size_t colWidthStart = 0; colWidthStart < colWidth;
+                 colWidthStart += stepColWidth) {
+              // Actual tile size; the last tile in each direction may be
+              // smaller than the step.
+              int N = std::min(colWidth - colWidthStart, stepColWidth);
+              int K = std::min(colHeight - colHeightStart, stepColHeight);
+              // im2col
+              im2col(inputData + g * inputOffset,
+                     imShape,
+                     colData,
+                     colShape,
+                     strideH(),
+                     strideW(),
+                     paddingH(),
+                     paddingW(),
+                     dilationH(),
+                     dilationW(),
+                     colHeightStart,
+                     K,
+                     colWidthStart,
+                     N);
+
+              // gemm
+              // The filter slice starts at column colHeightStart and the
+              // output slice at column colWidthStart of their full matrices.
+              int M = outputChannels / groups_;
+              BlasGemm<Device, real>::compute(
+                  false,
+                  false,
+                  M,
+                  N,
+                  K,
+                  1.0f,
+                  filterData + g * filterOffset + colHeightStart,
+                  kStride,
+                  colData,
+                  N,
+                  beta_,
+                  outputData + g * outputOffset + colWidthStart,
+                  nStride);
+            }
+            beta_ = 1.0;
+          }
+        } else {
+          // No im2col: the input slice is passed to GEMM directly as the
+          // K x N right-hand matrix.
+          int M = outputChannels / groups_;
+          int N = outputHeight * outputWidth;
+          int K = inputChannels / groups_ * filterHeight * filterWidth;
+          BlasGemm<Device, real>::compute(false,
+                                          false,
+                                          M,
+                                          N,
+                                          K,
+                                          1.0f,
+                                          filterData + g * filterOffset,
+                                          K,
+                                          inputData + g * inputOffset,
+                                          N,
+                                          beta,
+                                          outputData + g * outputOffset,
+                                          N);
+        }
+      }
+      // Advance to the next sample of the batch.
+      inputData += inputChannels * inputHeight * inputWidth;
+      outputData += outputChannels * outputHeight * outputWidth;
+    }
+
+    // NOTE(review): drops memory_ after every call, presumably to release the
+    // scratch buffer between inferences on mobile -- confirm.
+    memory_.reset();
   }
 };

+#endif
+
 /*
 * \brief Backward input calculation of convolution.
 */
@@ -348,7 +499,11 @@ public:
  }
 };

+#ifdef PADDLE_MOBILE_INFERENCE
+REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvMobileFunction);
+#else
 REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
+#endif
 REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
 REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
 #ifdef PADDLE_WITH_CUDA

--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -98,4 +98,54 @@ public:
                  int dilationWidth = 1);
 };

+/*
+ * \brief Fills one rectangular tile of the im2col column matrix.
+ *
+ * The tile covers rows [colHeightStart, colHeightStart + colHeightSize) and
+ * columns [colWidthStart, colWidthStart + colWidthSize) of the full column
+ * matrix and is written densely to colData (row stride colWidthSize).
+ * Positions that fall into the padding area are written as zero.
+ */
+template <class T>
+class Im2ColMobileFunctor {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth,
+                  int colHeightStart,
+                  int colHeightSize,
+                  int colWidthStart,
+                  int colWidthSize) {
+    const int inputHeight = imShape[1];
+    const int inputWidth = imShape[2];
+    const int filterHeight = colShape[1];
+    const int filterWidth = colShape[2];
+    const int outputWidth = colShape[4];
+
+    for (int tileRow = 0; tileRow < colHeightSize; tileRow++) {
+      // Decompose the column-matrix row into (channel, filter row, filter
+      // column).
+      const int colRow = colHeightStart + tileRow;
+      const int kw = colRow % filterWidth;
+      const int kh = (colRow / filterWidth) % filterHeight;
+      const int channel = colRow / filterWidth / filterHeight;
+      T* dstRow = colData + tileRow * colWidthSize;
+
+      for (int tileCol = 0; tileCol < colWidthSize; tileCol++) {
+        // Decompose the column-matrix column into an output pixel.
+        const int outPos = colWidthStart + tileCol;
+        const int outH = outPos / outputWidth;
+        const int outW = outPos % outputWidth;
+
+        // Source pixel in the unpadded image for this output pixel and
+        // filter tap.
+        const int imH =
+            outH * strideHeight + kh * dilationHeight - paddingHeight;
+        const int imW =
+            outW * strideWidth + kw * dilationWidth - paddingWidth;
+        const bool insideImage =
+            imH >= 0 && imH < inputHeight && imW >= 0 && imW < inputWidth;
+
+        if (insideImage) {
+          dstRow[tileCol] =
+              imData[(channel * inputHeight + imH) * inputWidth + imW];
+        } else {
+          dstRow[tileCol] = static_cast<T>(0);
+        }
+      }
+    }
+  }
+};
+
 }  // namespace paddle
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -138,4 +138,86 @@ TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }

 #endif

+// Checks Im2ColMobileFunctor against Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T>:
+// for every parameter combination below, both functors expand the same
+// random image and the resulting column matrices must be equal. The mobile
+// functor is asked for the whole matrix as one tile (start 0, full
+// height/width).
+template <class T>
+void TestIm2ColMobileFunctor() {
+  for (size_t channels : {32}) {
+    for (size_t inputHeight : {33, 100}) {
+      for (size_t inputWidth : {32, 96}) {
+        for (size_t filterHeight : {5}) {
+          for (size_t filterWidth : {7}) {
+            for (size_t stride : {2}) {
+              for (size_t padding : {1}) {
+                for (size_t dilation : {1, 3}) {
+                  // Effective filter extent once dilation is applied.
+                  size_t filterSizeH = (filterHeight - 1) * dilation + 1;
+                  size_t filterSizeW = (filterWidth - 1) * dilation + 1;
+                  // Stop at configurations where the dilated filter does not
+                  // fit into the padded image (break leaves the dilation
+                  // loop).
+                  if (inputHeight + 2 * padding < filterSizeH ||
+                      inputWidth + 2 * padding < filterSizeW)
+                    break;
+                  if (padding >= filterSizeH || padding >= filterSizeW) break;
+                  size_t outputHeight =
+                      (inputHeight - filterSizeH + 2 * padding) / stride + 1;
+                  size_t outputWidth =
+                      (inputWidth - filterSizeW + 2 * padding) / stride + 1;
+
+                  TensorShape imShape =
+                      TensorShape({channels, inputHeight, inputWidth});
+                  TensorShape colShape1 = TensorShape({channels,
+                                                       filterHeight,
+                                                       filterWidth,
+                                                       outputHeight,
+                                                       outputWidth});
+
+                  // Dimensions of the full column matrix.
+                  size_t height = channels * filterHeight * filterWidth;
+                  size_t width = outputHeight * outputWidth;
+                  VectorPtr input1 =
+                      Vector::create(imShape.getElements(), false);
+                  VectorPtr input2 =
+                      Vector::create(imShape.getElements(), false);
+                  MatrixPtr output1 =
+                      Matrix::create(height, width, false, false);
+                  MatrixPtr output2 =
+                      Matrix::create(height, width, false, false);
+                  // Both functors get identical random input.
+                  input1->uniform(0.001, 1);
+                  input2->copyFrom(*input1);
+
+                  Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> im2Col1;
+                  Im2ColMobileFunctor<T> im2Col2;
+                  im2Col1(input1->getData(),
+                          imShape,
+                          output1->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  // The trailing four arguments select the tile: rows
+                  // [0, height) and columns [0, width), i.e. everything.
+                  im2Col2(input2->getData(),
+                          imShape,
+                          output2->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation,
+                          0,
+                          height,
+                          0,
+                          width);
+
+                  autotest::TensorCheckEqual(*output1, *output2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Runs the comparison for float data.
+TEST(Im2ColFunctor, Mobile) { TestIm2ColMobileFunctor<float>(); }
+
 }  // namespace paddle
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -34,6 +34,16 @@ else()
    message(STATUS "Compile with MKLDNNLayers and MKLDNNActivations")
 endif()

+if(NOT WITH_MKLML)
+    file(GLOB_RECURSE MKL_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.h")
+    file(GLOB_RECURSE MKL_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.cpp")
+    list(REMOVE_ITEM GSERVER_HEADER ${MKL_HEADER})
+    list(REMOVE_ITEM GSERVER_SOURCES ${MKL_SOURCES})
+    message(STATUS "Skip compiling with MKLPackedLayers")
+else()
+    message(STATUS "Compile with MKLPackedLayers")
+endif()
+
 if(NOT WITH_GPU)
    list(REMOVE_ITEM GSERVER_HEADER
        layers/CudnnConvBaseLayer.h

--- a/paddle/gserver/layers/MKLDNNLRNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
@@ -29,7 +29,7 @@ bool MKLDNNLRNLayer::init(const LayerMap& layerMap,
  }

  /* the size of inputs for norm-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1UL);
+  CHECK_EQ(config_.inputs_size(), 1);
  const NormConfig& conf = config_.inputs(0).norm_conf();
  localSize_ = conf.size();
  alpha_ = conf.scale();

--- a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp
+++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLPackedRecurrentLayer.h"
+
+namespace paddle {
+
+REGISTER_LAYER(mkl_packed_recurrent, MKLPackedRecurrentLayer);
+
+// Initialises the base RecurrentLayer, then builds the packed copy of the
+// recurrent weight and, when gradients are needed, the packed transposed
+// copy as well. Returns false if the base initialisation fails.
+bool MKLPackedRecurrentLayer::init(const LayerMap& layerMap,
+                                   const ParameterMap& parameterMap) {
+  const bool baseReady = RecurrentLayer::init(layerMap, parameterMap);
+  if (!baseReady) {
+    return false;
+  }
+
+  packed_weight_.reset(new MKLPackedWeight(weight_->getW()));
+  packed_weight_->pack();
+
+  // The transposed pack is only created when gradients are needed.
+  if (!needGradient_) {
+    return true;
+  }
+  packed_weightT_.reset(new MKLPackedWeight(weight_->getW(), true));
+  packed_weightT_->pack();
+  return true;
+}
+
+// Runs the base-class backward pass and then re-packs the weight (and its
+// transposed pack when gradients are needed).
+// NOTE(review): the re-pack is presumably required because the update
+// callback may have changed the weight values -- confirm.
+void MKLPackedRecurrentLayer::backward(const UpdateCallback& callback) {
+  RecurrentLayer::backward(callback);
+
+  packed_weight_->pack();
+  if (!needGradient_) {
+    return;
+  }
+  packed_weightT_->pack();
+}
+
+// Batch-wise forward pass using the packed weight.
+//
+// The sequence-ordered output is reorganised into batches, each batch n > 0
+// gets (batch n-1) * W added through the packed gemm, the activation is
+// applied, and the result is copied back into sequence order.
+void MKLPackedRecurrentLayer::forwardBatch(int batchSize,
+                                           size_t numSequences,
+                                           const int* starts) {
+  // Created lazily on first use.
+  if (!batchValue_) {
+    batchValue_.reset(new SequenceToBatch(useGpu_));
+  }
+
+  batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_);
+
+  batchValue_->copyFromSeq(*output_.value);
+
+  {
+    REGISTER_TIMER_INFO("RecurrentFwBatch", getName().c_str());
+    /* forward one batch */
+    for (size_t n = 0; n < batchValue_->getNumBatch(); n++) {
+      MatrixPtr batchValue = batchValue_->getBatchValue(n);
+
+      // The first batch has no predecessor; it only gets the activation.
+      if (n != 0) {
+        // Only as many rows of the previous batch as the current batch has.
+        MatrixPtr preBatchValue =
+            batchValue_->getBatchValue(n - 1, batchValue->getHeight());
+
+        // batchValue += preBatchValue * W (gemm_compute accumulates).
+        packed_weight_->gemm_compute(preBatchValue, batchValue);
+      }
+      Argument arg;
+      arg.value = batchValue;
+      activation_->forward(arg).check();
+    }
+  }
+  batchValue_->copyBackSeq(*output_.value);
+}
+
+// Batch-wise backward pass using the packed transposed weight.
+//
+// Gradients are reorganised into batches that share the index built in
+// forwardBatch, propagated from the last batch to the first, and copied back
+// into sequence order. The weight gradient is accumulated either batch by
+// batch or sequence by sequence, whichever takes fewer multiplications
+// (numBatch < numSequences selects the batch-wise path).
+void MKLPackedRecurrentLayer::backwardBatch(int batchSize,
+                                            size_t numSequences,
+                                            const int* starts) {
+  // Created lazily on first use.
+  if (!batchGrad_) {
+    batchGrad_.reset(new SequenceToBatch(useGpu_));
+  }
+  batchGrad_->shareIndexWith(*batchValue_);
+
+  size_t numBatch = batchGrad_->getNumBatch();
+  bool backwardByBatch = numBatch < numSequences;
+
+  batchGrad_->copyFromSeq(*output_.grad);
+  {
+    REGISTER_TIMER_INFO("RecurrentBwData", getName().c_str());
+    /* backward one batch */
+    for (int n = (int)numBatch - 1; n >= 0; n--) {
+      MatrixPtr batchGrad = batchGrad_->getBatchValue(n);
+      MatrixPtr batchValue =
+          batchValue_->getBatchValue(n, batchGrad->getHeight());
+
+      Argument arg;
+      arg.value = batchValue;
+      arg.grad = batchGrad;
+      activation_->backward(arg).check();
+
+      if (n != 0) {
+        // batchValue is reused here for the gradient slice of batch n-1:
+        // grad(n-1) += grad(n) * W^T via the transposed pack.
+        batchValue = batchGrad_->getBatchValue(n - 1, batchGrad->getHeight());
+        packed_weightT_->gemm_compute(batchGrad, batchValue);
+      }
+
+      if (backwardByBatch && weight_->getWGrad()) {
+        if (n != 0) {
+          /* backward weight */
+          // WGrad += value(n-1)^T * grad(n)
+          batchValue =
+              batchValue_->getBatchValue(n - 1, batchGrad->getHeight());
+          weight_->getWGrad()->mul(
+              *batchValue->getTranspose(), *batchGrad, 1, 1);
+        }
+      }
+    }
+  }
+
+  batchGrad_->copyBackSeq(*output_.grad);
+
+  if (!backwardByBatch && weight_->getWGrad()) {
+    REGISTER_TIMER_INFO("RecurrentBwWeight", getName().c_str());
+    // Per-sequence weight gradient: pair every output row with the gradient
+    // row of the step that consumed it (shifted by one; the direction of
+    // the shift depends on reversed_).
+    for (size_t seq = 0; seq < numSequences; ++seq) {
+      int len = starts[seq + 1] - starts[seq];
+      weight_->getWGrad()->mul(
+          *output_.value
+               ->subMatrix(reversed_ ? starts[seq] + 1 : starts[seq], len - 1)
+               ->getTranspose(),
+          *output_.grad->subMatrix(reversed_ ? starts[seq] : starts[seq] + 1,
+                                   len - 1),
+          1,
+          1);
+    }
+  }
+}
+
+}  // namespace paddle
--- a/paddle/gserver/layers/MKLPackedRecurrentLayer.h
+++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLPackedWeight.h"
+#include "RecurrentLayer.h"
+
+DECLARE_bool(rnn_use_batch);
+
+namespace paddle {
+
+/**
+ * @brief MKLPackedRecurrentLayer is almost the same as RecurrentLayer,
+ * but is optimized with the MKL cblas packed gemm.
+ * More details:
+ * https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/mkl/mkl_packed.md
+ */
+
+// RecurrentLayer variant that overrides init, backward and the two batch
+// routines and keeps MKL-packed copies of the recurrent weight.
+class MKLPackedRecurrentLayer : public RecurrentLayer {
+public:
+  explicit MKLPackedRecurrentLayer(const LayerConfig& config)
+      : RecurrentLayer(config) {}
+
+  /// Initialises the base layer and creates the packed weight(s).
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  /// Runs the base backward pass and then re-packs the weight(s).
+  void backward(const UpdateCallback& callback) override;
+
+protected:
+  /// Batch-wise forward pass computed with packed_weight_.
+  void forwardBatch(int batchSize,
+                    size_t numSequences,
+                    const int* starts) override;
+
+  /// Batch-wise backward pass computed with packed_weightT_.
+  void backwardBatch(int batchSize,
+                     size_t numSequences,
+                     const int* starts) override;
+
+protected:
+  /// packed_weight_ contains the same data as
+  /// RecurrentLayer::weight_ but is packed
+  std::unique_ptr<MKLPackedWeight> packed_weight_;
+  /// packed_weightT_ is the packed transpose of the same weight; it is only
+  /// created when needGradient_ is set
+  std::unique_ptr<MKLPackedWeight> packed_weightT_;
+};
+
+}  // namespace paddle
--- a/paddle/gserver/layers/MKLPackedWeight.h
+++ b/paddle/gserver/layers/MKLPackedWeight.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/math/MathFunctions.h"
+#include "paddle/parameter/Parameter.h"
+#include "paddle/parameter/Weight.h"
+
+namespace paddle {
+
+/**
+ * @brief Holds an MKL "packed" copy of a weight matrix for use with
+ * cblas_sgemm_compute.
+ *
+ * The packed buffer is allocated on the first pack() and freed in the
+ * destructor. pack() must be called again whenever the underlying weight
+ * values change, because the packed buffer is a separate copy.
+ */
+class MKLPackedWeight {
+protected:
+  /// Raw pointer to the unpacked weight data, taken from weight->getData();
+  /// this class never frees it
+  real *weight_;
+  /// Packed buffer allocated by cblas_sgemm_alloc; owned by this object
+  real *packedWeight_;
+  /// Height of the unpacked weight matrix
+  size_t height_;
+  /// Width of the unpacked weight matrix
+  size_t width_;
+  /// Whether the weight is packed (and multiplied) as its transpose
+  bool transW_;
+
+public:
+  /**
+   * @param weight  Matrix whose data is to be packed.
+   * @param transW  Pack the transpose of the matrix instead of the matrix.
+   */
+  explicit MKLPackedWeight(MatrixPtr weight, bool transW = false)
+      : weight_(weight->getData()),
+        packedWeight_(nullptr),
+        height_(weight->getHeight()),
+        width_(weight->getWidth()),
+        transW_(transW) {}
+
+  // The object owns a raw cblas buffer that the destructor frees, so a copy
+  // would free the same buffer twice.
+  MKLPackedWeight(const MKLPackedWeight &) = delete;
+  MKLPackedWeight &operator=(const MKLPackedWeight &) = delete;
+
+  ~MKLPackedWeight() { free_(); }
+
+  /// (Re-)pack the current contents of the weight matrix.
+  void pack() { pack_(weight_); }
+
+  /**
+   * @brief dst += src * W (or src * W^T when transW was set).
+   *
+   * The product is accumulated into dst (beta is 1.0). pack() must have been
+   * called before.
+   */
+  void gemm_compute(const MatrixPtr src, MatrixPtr dst) {
+    cblas_sgemm_compute(CblasRowMajor,
+                        CblasNoTrans,
+                        CblasPacked,
+                        src->getHeight(),
+                        transW_ ? height_ : width_,
+                        transW_ ? width_ : height_,
+                        src->getData(),
+                        src->getWidth(),
+                        packedWeight_,
+                        width_,
+                        1.0,
+                        dst->getData(),
+                        dst->getWidth());
+  }
+
+protected:
+  /// Allocate the packed buffer on first use, then pack src into it.
+  void pack_(real *src) {
+    if (!packedWeight_) {
+      packedWeight_ = cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_);
+    }
+    cblas_sgemm_pack(CblasRowMajor,
+                     CblasBMatrix,
+                     transW_ ? CblasTrans : CblasNoTrans,
+                     1,
+                     transW_ ? height_ : width_,
+                     transW_ ? width_ : height_,
+                     1.0,
+                     src,
+                     width_,
+                     packedWeight_);
+  }
+
+  /// Release the packed buffer, if any.
+  void free_() {
+    if (packedWeight_) {
+      cblas_sgemm_free(packedWeight_);
+      // Cleared so that a repeated free_() or a later pack_() never touches
+      // the released buffer.
+      packedWeight_ = nullptr;
+    }
+  }
+};
+
+}  // namespace paddle
--- a/paddle/gserver/layers/RecurrentLayer.cpp
+++ b/paddle/gserver/layers/RecurrentLayer.cpp
@@ -12,119 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <gflags/gflags.h>
-#include "Layer.h"
-#include "SequenceToBatch.h"
-#include "paddle/utils/Stat.h"
+#include "RecurrentLayer.h"

 DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation.");

 namespace paddle {

-/**
- * @brief RecurrentLayer takes 1 input layer. The output size is the same with
- * input layer.
- * For each sequence [start, end] it performs the following computation:
- * \f[
- *    out_{i} = act(in_{i})     \      \      \text{for} \ i = start \\
- *    out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end
- *
- * \f]
- * If reversed is true, the order is reversed:
- * \f[
- *   out_{i} = act(in_{i})           \    \   \text{for} \ i = end  \\
- *   out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end
- * \f]
- * There are two methods to calculate rnn. One way is to compute rnn one
- * sequence by one sequence. The other way is to reorganize the input
- * into batches, then compute rnn one batch by one batch. Users can select
- * them by rnn_use_batch flag.
- */
-
-class RecurrentLayer : public Layer {
-public:
-  explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback) override;
-
-  void resetState() override;
-
-  void setState(LayerStatePtr state) override;
-
-  LayerStatePtr getState() override;
-
-protected:
-  /**
-   * @brief If user do not set --rnn_use_batch=true, it will
-   * compute rnn forward one sequence by one sequence in default.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void forwardSequence(int batchSize, size_t numSequences, const int* starts);
-  /**
-   * @brief Compute rnn forward by one sequence.
-   * @param start The start position of this sequence (or sample).
-   * @param length The length of this sequence (or sample), namely the words
-   * number of this sequence.
-   */
-  void forwardOneSequence(int start, int length);
-  /**
-   * @brief Compute rnn backward one sequence by onesequence.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void backwardSequence(int batchSize, size_t numSequences, const int* starts);
-  /**
-   * @brief Compute rnn backward by one sequence.
-   * @param start The start position of this sequence (or sample).
-   * @param length The length of this sequence (or sample), namely the words
-   * number of this sequence.
-   */
-  void backwardOneSequence(int start, int length);
-
-  /**
-   * @brief Reorganize input into batches and compute rnn forward batch
-   * by batch. It will convert batch shape to sequence after finishing forward.
-   * The batch info can refer to SequenceToBatch class.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void forwardBatch(int batchSize, size_t numSequences, const int* starts);
-
-  /**
-   * @brief Reorganize input into batches and compute rnn forward batch
-   * by batch.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void backwardBatch(int batchSize, size_t numSequences, const int* starts);
-
-protected:
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> bias_;
-
-  /// frameOutput_[i] is used to hold the i-th sample of output_
-  std::vector<Argument> frameOutput_;
-  MatrixPtr prevOutput_;
-  /// Whether compute rnn by reverse.
-  bool reversed_;
-  /// If compute batch by batch, batchValue_ will be used to save the
-  /// reorganized input value.
-  std::unique_ptr<SequenceToBatch> batchValue_;
-  /// If compute batch by batch, batchGrad_ will be used to save the
-  /// gradient with respect to reorganized input value.
-  std::unique_ptr<SequenceToBatch> batchGrad_;
-};
-
 REGISTER_LAYER(recurrent, RecurrentLayer);

 bool RecurrentLayer::init(const LayerMap& layerMap,
@@ -260,7 +153,6 @@ void RecurrentLayer::backward(const UpdateCallback& callback) {
    bias_->getWGrad()->collectBias(*output_.grad, 1);
    bias_->getParameterPtr()->incUpdate(callback);
  }
-
  weight_->getParameterPtr()->incUpdate(callback);
 }


--- a/paddle/gserver/layers/RecurrentLayer.h
+++ b/paddle/gserver/layers/RecurrentLayer.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <gflags/gflags.h>
+#include "Layer.h"
+#include "SequenceToBatch.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief RecurrentLayer takes 1 input layer. The output size is the same as
+ * that of the input layer.
+ * For each sequence [start, end] it performs the following computation:
+ * \f[
+ *    out_{i} = act(in_{i})     \      \      \text{for} \ i = start \\
+ *    out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end
+ *
+ * \f]
+ * If reversed is true, the order is reversed:
+ * \f[
+ *   out_{i} = act(in_{i})           \    \   \text{for} \ i = end  \\
+ *   out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end
+ * \f]
+ * There are two methods to calculate rnn. One way is to compute rnn one
+ * sequence by one sequence. The other way is to reorganize the input
+ * into batches, then compute rnn one batch by one batch. Users can select
+ * them by rnn_use_batch flag.
+ */
+
+class RecurrentLayer : public Layer {
+public:
+  explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback& callback) override;
+
+  void resetState() override;
+
+  void setState(LayerStatePtr state) override;
+
+  LayerStatePtr getState() override;
+
+protected:
+  /**
+   * @brief Compute rnn forward one sequence at a time. This is the default
+   * path, used unless the user sets --rnn_use_batch=true.
+   * @param batchSize Total words number of all samples in this batch.
+   * @param numSequences The sample number.
+   * @param starts Start position of each sample.
+   */
+  void forwardSequence(int batchSize, size_t numSequences, const int* starts);
+  /**
+   * @brief Compute rnn forward by one sequence.
+   * @param start The start position of this sequence (or sample).
+   * @param length The length of this sequence (or sample), namely the words
+   * number of this sequence.
+   */
+  void forwardOneSequence(int start, int length);
+  /**
+   * @brief Compute rnn backward one sequence at a time.
+   * @param batchSize Total words number of all samples in this batch.
+   * @param numSequences The sample number.
+   * @param starts Start position of each sample.
+   */
+  void backwardSequence(int batchSize, size_t numSequences, const int* starts);
+  /**
+   * @brief Compute rnn backward by one sequence.
+   * @param start The start position of this sequence (or sample).
+   * @param length The length of this sequence (or sample), namely the words
+   * number of this sequence.
+   */
+  void backwardOneSequence(int start, int length);
+
+  /**
+   * @brief Reorganize input into batches and compute rnn forward batch
+   * by batch. It will convert batch shape to sequence after finishing forward.
+   * The batch info can refer to SequenceToBatch class.
+   * @note Declared virtual, so a derived layer can replace this batch path.
+   * @param batchSize Total words number of all samples in this batch.
+   * @param numSequences The sample number.
+   * @param starts Start position of each sample.
+   */
+  virtual void forwardBatch(int batchSize,
+                            size_t numSequences,
+                            const int* starts);
+
+  /**
+   * @brief Reorganize input into batches and compute rnn backward batch
+   * by batch.
+   * @note Declared virtual, so a derived layer can replace this batch path.
+   * @param batchSize Total words number of all samples in this batch.
+   * @param numSequences The sample number.
+   * @param starts Start position of each sample.
+   */
+  virtual void backwardBatch(int batchSize,
+                             size_t numSequences,
+                             const int* starts);
+
+protected:
+  std::unique_ptr<Weight> weight_;
+  std::unique_ptr<Weight> bias_;
+
+  /// frameOutput_[i] is used to hold the i-th sample of output_
+  std::vector<Argument> frameOutput_;
+  MatrixPtr prevOutput_;
+  /// Whether to compute rnn in reverse order.
+  bool reversed_;
+  /// If compute batch by batch, batchValue_ will be used to save the
+  /// reorganized input value.
+  std::unique_ptr<SequenceToBatch> batchValue_;
+  /// If compute batch by batch, batchGrad_ will be used to save the
+  /// gradient with respect to reorganized input value.
+  std::unique_ptr<SequenceToBatch> batchGrad_;
+};
+
+}  // namespace paddle
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1472,7 +1472,8 @@ TEST(Layer, RecurrentLayer) {
    for (auto reversed : {false, true}) {
      config.layerConfig.set_reversed(reversed);
      config.testState = !reversed;
-      testLayerGrad(config, "recurrent", 50, /* trans= */ false, useGpu);
+      testLayerGrad(
+          config, "recurrent", 50, /* trans= */ false, useGpu, false, 1.0);
    }
  }
 }
@@ -1494,7 +1495,8 @@ TEST(Layer, LstmLayer) {
    for (auto reversed : {false, true}) {
      config.layerConfig.set_reversed(reversed);
      config.testState = !reversed;
-      testLayerGrad(config, "lstmemory", 100, /* trans= */ false, useGpu);
+      testLayerGrad(
+          config, "lstmemory", 100, /* trans= */ false, useGpu, false, 0.02);
    }
  }
  for (auto useGpu : {true}) {

--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -222,6 +222,7 @@ TEST(Layer, RecurrentLayer) {
 #define protected public
 #include "paddle/gserver/layers/GatedRecurrentLayer.h"
 #include "paddle/gserver/layers/LstmLayer.h"
+#include "paddle/gserver/layers/RecurrentLayer.h"
 template <class T>
 class TestRecurrentLayer {
 public:
@@ -420,12 +421,151 @@ TEST(Layer, LstmLayer) {
  }
 }

+#ifdef PADDLE_WITH_MKLML
+
+#include "paddle/gserver/layers/MKLPackedRecurrentLayer.h"
+
+LayerPtr initMKLPackedLayer(LayerConfig layerConfig,  // taken by value; completed below
+                            bool reversed,            // forwarded to set_reversed()
+                            int layerSize,            // forwarded to set_size()
+                            LayerPtr dataLayer,       // input layer, registered in the layer map
+                            ParameterPtr para,        // registered in the parameter map
+                            ParameterPtr bias = nullptr) {  // optional bias parameter
+  LayerMap layerMap;
+  ParameterMap parameterMap;
+  layerMap[dataLayer->getName()] = dataLayer;
+  parameterMap[para->getName()] = para;
+  if (bias) {  // only register a bias when the caller supplied one
+    parameterMap[bias->getName()] = bias;
+    layerConfig.set_bias_parameter_name("bias_0");  // hard-coded name; caller's bias must be "bias_0"
+  }
+
+  layerConfig.set_size(layerSize);
+  layerConfig.set_reversed(reversed);
+  layerConfig.add_inputs();  // exactly one input, configured just below
+  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
+  input.set_input_layer_name("layer_0");  // hard-coded; must match dataLayer's name
+  input.set_input_parameter_name("para_0");  // hard-coded; must match para's name
+
+  LayerPtr testLayer = Layer::create(layerConfig);
+  layerMap[testLayer->getName()] = testLayer;
+
+  testLayer->init(layerMap, parameterMap);
+  testLayer->setNeedGradient(true);
+
+  return testLayer;  // created, initialised and gradient-enabled layer
+}
+
+void checkMKLPackedLayer(LayerConfig layerConfig1,  // config of the first layer
+                         LayerConfig layerConfig2,  // config of the layer compared against it
+                         bool reversed,
+                         int layerSize,
+                         int batchSize,
+                         bool useBatch1,  // FLAGS_rnn_use_batch value while layer 1 runs
+                         bool useBatch2) {  // FLAGS_rnn_use_batch value while layer 2 runs
+  LayerPtr dataLayer;
+  ParameterPtr para, bias;
+
+  if (layerConfig1.type() == "recurrent") {  // input/parameter sizes depend on layer 1's type
+    dataLayer = creatDataLayer("layer_0", batchSize, layerSize, false);
+    para = creatParameter("para_0", 0, layerSize * layerSize, false);
+    bias = nullptr;
+  } else if (layerConfig1.type() == "gated_recurrent") {
+    dataLayer = creatDataLayer("layer_0", batchSize, layerSize * 3, false);
+    para = creatParameter("para_0", 0, layerSize * layerSize * 3, false);
+    bias = creatParameterBias("bias_0", 1, layerSize * 3, false);
+  }  // NOTE(review): any other type leaves dataLayer/para unset, yet they are used below
+
+  LayerPtr testLayer1 = initMKLPackedLayer(  // both layers get the same data layer and parameters
+      layerConfig1, reversed, layerSize, dataLayer, para, bias);
+  LayerPtr testLayer2 = initMKLPackedLayer(
+      layerConfig2, reversed, layerSize, dataLayer, para, bias);
+
+  const VectorPtr& weightGrad =  // taken from layer 1; presumably shared with layer 2 (same para) - confirm
+      (testLayer1->getParameters()[0])->getBuf(PARAMETER_GRADIENT);
+  const MatrixPtr& inputGrad = testLayer1->getPrev(0)->getOutputGrad();
+  CpuVector wgt_grad1(weightGrad->getSize());  // snapshots of the gradients after each backward
+  CpuVector wgt_grad2(weightGrad->getSize());
+  CpuMatrix input_grad1(inputGrad->getHeight(), inputGrad->getWidth());
+  CpuMatrix input_grad2(inputGrad->getHeight(), inputGrad->getWidth());
+
+  for (int i = 0; i < 2; i++) {  // the whole forward/backward comparison is run twice
+    FLAGS_rnn_use_batch = useBatch1;
+
+    testLayer1->forward(PASS_GC);
+
+    FLAGS_rnn_use_batch = useBatch2;
+    testLayer2->forward(PASS_GC);
+
+    testLayer1->getOutputGrad()->randomizeUniform();
+    testLayer2->getOutputGrad()->copyFrom(*testLayer1->getOutputGrad());  // same output gradient for both
+
+    weightGrad->zero();  // clear the gradient buffers before layer 1's backward
+    inputGrad->zero();
+    FLAGS_rnn_use_batch = useBatch1;
+    testLayer1->backward(nullptr);
+
+    wgt_grad1.copyFrom(*weightGrad);  // save layer 1's result before the buffers are zeroed again
+    input_grad1.copyFrom(*inputGrad);
+
+    weightGrad->zero();
+    inputGrad->zero();
+    FLAGS_rnn_use_batch = useBatch2;
+    testLayer2->backward(nullptr);
+
+    wgt_grad2.copyFrom(*weightGrad);
+    input_grad2.copyFrom(*inputGrad);
+
+    checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue());  // forward outputs
+    checkError(wgt_grad1, wgt_grad2);  // weight gradients
+    checkError(input_grad1, input_grad2);  // input gradients
+  }
+}
+
+TEST(MKLPackedLayer, RecurrentLayer) {  // compares "recurrent" against "mkl_packed_recurrent"
+  LayerConfig layerConfig1;
+  LayerConfig layerConfig2;
+
+  layerConfig1.set_name("paddle-rnn");
+  layerConfig1.set_type("recurrent");
+  layerConfig1.set_active_type("relu");
+
+  layerConfig2.set_name("mkl-packed-rnn");
+  layerConfig2.set_type("mkl_packed_recurrent");
+  layerConfig2.set_active_type("relu");  // same activation as layerConfig1
+
+  FLAGS_use_gpu = false;
+
+  for (auto layerSize : {32, 64, 128, 256, 512}) {  // full cross product of all settings below
+    for (auto batchSize : {1, 5, 100, 500}) {
+      for (auto reversed : {true, false}) {
+        for (auto paddle_use_batch : {true, false}) {  // rnn_use_batch for the "recurrent" layer
+          for (auto MKLPacked_use_batch : {true, false}) {  // rnn_use_batch for the MKL packed layer
+            LOG(INFO) << " layerSize=" << layerSize
+                      << " batchSize=" << batchSize << " reversed=" << reversed
+                      << " paddle_use_batch=" << paddle_use_batch
+                      << " MKLPacked_use_batch=" << MKLPacked_use_batch;
+
+            checkMKLPackedLayer(layerConfig1,  // checks outputs and gradients of both layers
+                                layerConfig2,
+                                reversed,
+                                layerSize,
+                                batchSize,
+                                paddle_use_batch,
+                                MKLPacked_use_batch);
+          }
+        }
+      }
+    }
+  }
+}
+#endif
+
 int main(int argc, char** argv) {
-  if (version::isWithGpu()) {
  testing::InitGoogleTest(&argc, argv);
  initMain(argc, argv);
-    return RUN_ALL_TESTS();
-  } else {
-    return 0;
+  if (!version::isWithGpu()) {
+    testing::GTEST_FLAG(filter) = "-Layer.*";  // negative filter: skip all Layer.* tests when isWithGpu() is false
  }
+  return RUN_ALL_TESTS();
 }
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
 file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
 string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
+set(DEPS_OPS "")
 set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/pybind/pybind.h)
 file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt.  DO NOT EDIT!\n\n")
 function(op_library TARGET)
@@ -48,6 +49,10 @@ function(op_library TARGET)
        message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
    endif()

+    list(LENGTH op_library_DEPS op_library_DEPS_len)
+    if (${op_library_DEPS_len} GREATER 0)
+        set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
+    endif()
    if (WITH_GPU)
        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
                ${op_common_deps})
@@ -56,106 +61,28 @@ function(op_library TARGET)
                ${op_common_deps})
    endif()

-    # net_op doesn't need pybind
-    if ("${TARGET}" STREQUAL "net_op")
-        set(pybind_flag 1)
-    endif()
-
-    if ("${TARGET}" STREQUAL "compare_op")
+    # Define operators that don't need pybind here.
+    foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
+        if ("${TARGET}" STREQUAL "${manual_pybind_op}")
            set(pybind_flag 1)
-        file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
-    endif()
-
-    # conv_op contains several operators
-    if ("${TARGET}" STREQUAL "conv_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
-    endif()
-
-    # conv_cudnn_op contains several operators
-    if ("${TARGET}" STREQUAL "conv_cudnn_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(conv2d_cudnn);\n")
-    endif()
-
-    # pool_op contains several operators
-    if ("${TARGET}" STREQUAL "pool_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
-    endif()
-
-    # pool_cudnn_op contains several operators
-    if ("${TARGET}" STREQUAL "pool_cudnn_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
-    endif()
-
-    if ("${TARGET}" STREQUAL "logical_op")
-        set(pybind_flag 1)
-        file(APPEND ${pybind_file} "USE_OP(logical_and);\n")
-    endif()
-
-    # pool_with_index_op contains several operators
-    if ("${TARGET}" STREQUAL "pool_with_index_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
-    endif()
-
-    # conv_transpose_op contains several operators
-    if ("${TARGET}" STREQUAL "conv_transpose_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n")
-    endif()
-
-    # conv_transpose_cudnn_op contains two operators
-    if ("${TARGET}" STREQUAL "conv_transpose_cudnn_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(conv2d_transpose_cudnn);\n")
-    endif()
-
-    # save_restore_op contains several operators
-    if ("${TARGET}" STREQUAL "save_restore_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(save);\n")
-    endif()
-
-    # activation_op contains several operators
-    if ("${TARGET}" STREQUAL "activation_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(sigmoid);\n")
-    endif()
-
-    # nccl_op contains several operators
-    if ("${TARGET}" STREQUAL "nccl_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
-    endif()
-
-    # reduce_op contains several operators
-    if ("${TARGET}" STREQUAL "reduce_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
        endif()
+    endforeach()

-    if ("${TARGET}" STREQUAL "tensor_array_read_write_op")
-        set(pybind_flag 1)
-        file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(write_to_array);\n")
+    # For how USE_OP registration works, see paddle/framework/op_registry.h.
+    # Note that it is enough to add just one operator per *_op.cc file to pybind.
+    # For detailed pybind information, see the generated paddle/pybind/pybind.h.
+    file(READ ${TARGET}.cc TARGET_CONTENT)
+    string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}")
+    string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}")
+    if (one_register STREQUAL "")
+        string(REPLACE "_op" "" TARGET "${TARGET}")
+    else ()
+        string(REPLACE "REGISTER_OP(" "" TARGET "${one_register}")
+        string(REPLACE "," "" TARGET "${TARGET}")
    endif()

    # pybind USE_NO_KERNEL_OP
    # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel
-    file(READ ${TARGET}.cc TARGET_CONTENT)
    string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
    string(REPLACE "_op" "" TARGET "${TARGET}")
    if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
@@ -166,7 +93,6 @@ function(op_library TARGET)
    # pybind USE_CPU_ONLY_OP
    list(LENGTH cu_srcs cu_srcs_len)
    list(LENGTH cu_cc_srcs cu_cc_srcs_len)
-
    if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
        file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
        set(pybind_flag 1)
@@ -181,58 +107,31 @@ endfunction()
 add_subdirectory(math)
 add_subdirectory(nccl)

-set(DEPS_OPS
-    cond_op
-    cross_entropy_op
-    recurrent_op
-    softmax_with_cross_entropy_op
-    softmax_op
-    sequence_softmax_op
-    sum_op
-    pool_op
-    maxout_op
-    unpool_op
-    pool_with_index_op
-    conv_op
-    conv_transpose_op
-    nccl_op
-    sequence_conv_op
-    sequence_pool_op
-    lod_rank_table_op
-    lod_tensor_to_array_op
-    array_to_lod_tensor_op
-    max_sequence_len_op
-    lstm_op
-    tensor_array_read_write_op
-    gru_op
-    adagrad_op
-    sgd_op
-    save_op
-    load_op
-    send_op
-    recv_op)
+if(WITH_GPU)
+    op_library(nccl_op DEPS nccl_common)
+    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
+else()
+    set(DEPS_OPS ${DEPS_OPS} nccl_op)
+endif()

 if(WITH_DISTRIBUTE)
-add_subdirectory(detail)
-op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
-set_source_files_properties(
-    send_op.cc
-    PROPERTIES
-    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-
-op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
-set_source_files_properties(
-    recv_op.cc
-    PROPERTIES
-    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-
-cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
+    add_subdirectory(detail)
+    set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
+    set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+    op_library(send_op DEPS ${DISTRIBUTE_DEPS})
+    set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
+    set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
+else()
+    set(DEPS_OPS ${DEPS_OPS} send_op recv_op)
 endif()

-op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
+op_library(cond_op DEPS framework_proto tensor net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(softmax_op DEPS softmax)
+op_library(detection_output_op DEPS softmax)
 op_library(sequence_softmax_op DEPS softmax)
 op_library(sum_op DEPS selected_rows_functor)
 op_library(sgd_op DEPS selected_rows_functor)
@@ -242,21 +141,17 @@ op_library(pool_op DEPS pooling)
 op_library(maxout_op DEPS maxouting)
 op_library(unpool_op DEPS unpooling)
 op_library(pool_with_index_op DEPS pooling)
-op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
-op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)
-op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op)
-op_library(max_sequence_len_op SRCS max_sequence_len_op.cc DEPS lod_rank_table)
-op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
-if(WITH_GPU)
-op_library(nccl_op DEPS nccl_common)
-endif()
+op_library(lod_rank_table_op DEPS lod_rank_table)
+op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
+op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
+op_library(max_sequence_len_op DEPS lod_rank_table)
 op_library(sequence_conv_op DEPS context_project)
 op_library(sequence_pool_op DEPS sequence_pooling)
 op_library(lstm_op DEPS sequence2batch lstm_compute)
 op_library(conv_transpose_op DEPS vol2col)
 op_library(gru_op DEPS sequence2batch gru_compute)
-op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
-
+op_library(recurrent_op DEPS executor)
+op_library(cos_sim_op DEPS cos_sim_functor)
 # FIXME(typhoonzero): save/load depends lodtensor serialization functions
 op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
@@ -265,9 +160,10 @@ list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
    op_library(${src})
 endforeach()
+file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")

-set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")

+set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")


 cc_test(gather_test SRCS gather_test.cc DEPS tensor)

--- a/paddle/operators/adagrad_op.cc
+++ b/paddle/operators/adagrad_op.cc
@@ -105,48 +105,18 @@ struct SparseAdagradFunctor<platform::CPUDeviceContext, T> {
                  const framework::Tensor& learning_rate, T epsilon,
                  framework::Tensor* moment, framework::Tensor* param) {
    // 1. g_m.rows = set(g.rows)
-    auto grad_rows = grad.rows();
-    std::set<int64_t> row_set(grad_rows.begin(), grad_rows.end());
-    std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
-
    auto grad_width = grad.value().dims()[1];
-    std::unique_ptr<framework::SelectedRows> grad_merge{
-        new framework::SelectedRows()};
-    grad_merge->set_rows(merge_rows);
-    grad_merge->set_height(grad.height());
-    grad_merge->mutable_value()->mutable_data<T>(
-        framework::make_ddim(
-            {static_cast<int64_t>(merge_rows.size()), grad_width}),
-        context.GetPlace());
-
-    math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
-    constant_functor(context, grad_merge->mutable_value(), 0.0);
-
-    auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
-    auto* grad_data = grad.value().data<T>();
-
-    for (size_t i = 0; i < grad_rows.size(); i++) {
-      size_t grad_merge_i = FindPos(merge_rows, grad_rows[i]);
-      for (int64_t j = 0; j < grad_width; j++) {
-        grad_merge_data[grad_merge_i * grad_width + j] +=
-            grad_data[i * grad_width + j];
-      }
-    }
+    math::scatter::MergeAdd<platform::CPUDeviceContext, T> merge_func;
+    auto grad_merge = merge_func(context, grad);
+    auto& merge_rows = grad_merge.rows();
+    auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();

    // 2. m += g_m * g_m
-    std::unique_ptr<framework::SelectedRows> grad_square{
-        new framework::SelectedRows()};
-    grad_square->set_rows(grad_merge->rows());
-    grad_square->set_height(grad_merge->height());
-    grad_square->mutable_value()->mutable_data<T>(grad_merge->value().dims(),
-                                                  context.GetPlace());
-    auto gs =
-        framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
-    auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
-    gs.device(*context.eigen_device()) = gm * gm;
+    math::scatter::Mul<platform::CPUDeviceContext, T> sqare_func;
+    auto grad_square = sqare_func(context, grad_merge, grad_merge);

    math::SelectedRowsAddToTensor<platform::CPUDeviceContext, T> functor;
-    functor(context, *grad_square, moment);
+    functor(context, grad_square, moment);

    // 3. update parameter
    auto* lr = learning_rate.data<T>();

--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/operators/adagrad_op.cu
@@ -78,62 +78,30 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
                  const framework::Tensor& learning_rate, T epsilon,
                  framework::Tensor* moment, framework::Tensor* param) {
    // 1. g_m.rows = set(g.rows)
-    auto grad_rows = grad.rows();
-    std::set<int64_t> row_set(grad_rows.begin(), grad_rows.end());
-    std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
-
    auto grad_width = grad.value().dims()[1];
-    std::unique_ptr<framework::SelectedRows> grad_merge{
-        new framework::SelectedRows()};
-    grad_merge->set_rows(merge_rows);
-    grad_merge->set_height(grad.height());
-    grad_merge->mutable_value()->mutable_data<T>(
-        framework::make_ddim(
-            {static_cast<int64_t>(merge_rows.size()), grad_width}),
-        context.GetPlace());
-
-    math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
-    constant_functor(context, grad_merge->mutable_value(), 0.0);
-
-    auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
-    auto* grad_data = grad.value().data<T>();
-
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid1(1, grad_rows.size());
-
-    MergeGradKernel<
-        T, 256><<<grid1, threads, 0,
-                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(grad_data, grad.rows().data(),
-                                   grad_merge_data, grad_merge->rows().data(),
-                                   grad_merge->rows().size(), grad_width);
-
+    math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func;
+    auto grad_merge = merge_func(context, grad);
+    auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
+    auto& merge_rows = grad_merge.rows();
    // 2. m += g_m * g_m
-    std::unique_ptr<framework::SelectedRows> grad_square{
-        new framework::SelectedRows()};
-    grad_square->set_rows(grad_merge->rows());
-    grad_square->set_height(grad_merge->height());
-    grad_square->mutable_value()->mutable_data<T>(grad_merge->value().dims(),
-                                                  context.GetPlace());
-    auto gs =
-        framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
-    auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
-    gs.device(*context.eigen_device()) = gm * gm;
+    math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func;
+    auto grad_square = sqare_func(context, grad_merge, grad_merge);

    math::SelectedRowsAddToTensor<platform::CUDADeviceContext, T> functor;
-    functor(context, *grad_square, moment);
+    functor(context, grad_square, moment);

    // 3. update parameter
    auto* lr = learning_rate.data<T>();
    auto* param_data = param->data<T>();
    auto* moment_data = moment->data<T>();

+    const int block_size = 256;
+    dim3 threads(block_size, 1);
    dim3 grid2(1, merge_rows.size());
    SparseAdagradFunctorKernel<
        T, 256><<<grid2, threads, 0,
                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(grad_merge_data, grad_merge->rows().data(),
+                      .stream()>>>(grad_merge_data, grad_merge.rows().data(),
                                   lr, param_data, moment_data, grad_width,
                                   epsilon);
  }

--- a/paddle/operators/adam_op.h
+++ b/paddle/operators/adam_op.h
@@ -16,11 +16,14 @@ limitations under the License. */
 #include <math.h>  // for sqrt in CPU and CUDA
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/detail/safe_ref.h"
+#include "paddle/operators/math/selected_rows_functor.h"
 #include "paddle/platform/for_range.h"

 namespace paddle {
 namespace operators {

+namespace scatter = paddle::operators::math::scatter;
+
 template <typename T>
 struct AdamFunctor {
  T beta1_;
@@ -79,6 +82,69 @@ struct AdamFunctor {
  }
 };

+template <typename T>
+struct SparseAdamFunctor {  // Adam update applied only to the rows listed in rows_
+  T beta1_;
+  T beta2_;
+  T epsilon_;
+
+  const T* beta1_pow_;  // dereferenced inside operator(), not at construction
+  const T* beta2_pow_;
+  const T* moment1_;
+  T* moment1_out_;
+  const T* moment2_;
+  T* moment2_out_;
+  const T* lr_;
+  const T* grad_;  // packed rows: row i of grad_ belongs to row rows_[i] of param/moments
+  const T* param_;
+  T* param_out_;
+
+  const int64_t* rows_;  // rows_[i]: row of the param/moment buffers updated by call i
+  int64_t row_numel_;  // number of elements in one row
+
+  SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
+                    const T* beta2_pow, const T* mom1, T* mom1_out,
+                    const T* mom2, T* mom2_out, const T* lr, const T* grad,
+                    const T* param, T* param_out, const int64_t* rows,
+                    int64_t row_numel)
+      : beta1_(beta1),
+        beta2_(beta2),
+        epsilon_(epsilon),
+        beta1_pow_(beta1_pow),
+        beta2_pow_(beta2_pow),
+        moment1_(mom1),
+        moment1_out_(mom1_out),
+        moment2_(mom2),
+        moment2_out_(mom2_out),
+        lr_(lr),
+        grad_(grad),
+        param_(param),
+        param_out_(param_out),
+        rows_(rows),
+        row_numel_(row_numel) {}
+
+  inline HOSTDEVICE void operator()(size_t i) const {  // i indexes rows_ (one call per row)
+    T beta1_pow = *beta1_pow_;
+    T beta2_pow = *beta2_pow_;
+    for (int64_t j = 0; j < row_numel_; ++j) {
+      T g = grad_[i * row_numel_ + j];  // gradient uses the packed index i
+      T mom1 = moment1_[rows_[i] * row_numel_ + j];  // state uses the mapped row rows_[i]
+      T mom2 = moment2_[rows_[i] * row_numel_ + j];
+      T lr = *lr_;
+      T p = param_[rows_[i] * row_numel_ + j];
+
+      lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);  // same factor for every j; depends only on the beta pows
+      mom1 = beta1_ * mom1 + (1 - beta1_) * g;
+      mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
+      p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
+
+      moment1_out_[rows_[i] * row_numel_ + j] = mom1;  // rows not listed in rows_ are never written here
+      moment2_out_[rows_[i] * row_numel_ + j] = mom2;
+      param_out_[rows_[i] * row_numel_ + j] = p;
+    }  // for col id
+  }
+};
+
 template <typename DeviceContext, typename T>
 class AdamOpKernel : public framework::OpKernel<T> {
 public:
@@ -90,7 +156,8 @@ class AdamOpKernel : public framework::OpKernel<T> {
    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
    auto& param = Ref(ctx.Input<LoDTensor>("Param"), "Must set Param");
-    auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
+    // auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
+    auto* grad_var = ctx.InputVar("Grad");
    auto& mom1 = Ref(ctx.Input<LoDTensor>("Moment1"), "Must set Moment1");
    auto& mom2 = Ref(ctx.Input<LoDTensor>("Moment2"), "Must set Moment2");
    auto& lr =
@@ -108,9 +175,11 @@ class AdamOpKernel : public framework::OpKernel<T> {
    auto& mom2_out =
        Ref(ctx.Output<LoDTensor>("Moment2Out"), "Must set Moment1Out");

-    AdamFunctor<T> functor(beta1, beta2, epsilon, beta1_pow.template data<T>(),
-                           beta2_pow.template data<T>(),
-                           mom1.template data<T>(),
+    if (grad_var->IsType<framework::LoDTensor>()) {
+      auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
+      AdamFunctor<T> functor(
+          beta1, beta2, epsilon, beta1_pow.template data<T>(),
+          beta2_pow.template data<T>(), mom1.template data<T>(),
          mom1_out.template mutable_data<T>(ctx.GetPlace()),
          mom2.template data<T>(),
          mom2_out.template mutable_data<T>(ctx.GetPlace()),
@@ -118,8 +187,36 @@ class AdamOpKernel : public framework::OpKernel<T> {
          param.template data<T>(),
          param_out.template mutable_data<T>(ctx.GetPlace()));
      platform::ForRange<DeviceContext> for_range(
-        static_cast<const DeviceContext&>(ctx.device_context()), param.numel());
+          static_cast<const DeviceContext&>(ctx.device_context()),
+          param.numel());
      for_range(functor);
+    } else if (grad_var->IsType<framework::SelectedRows>()) {
+      auto& grad =
+          Ref(ctx.Input<framework::SelectedRows>("Grad"), "Must set Grad");
+      // merge duplicated rows if any.
+      scatter::MergeAdd<DeviceContext, T> merge_func;
+      auto grad_merge =
+          merge_func(ctx.template device_context<DeviceContext>(), grad);
+      auto& grad_tensor = grad_merge.value();
+      const T* grad_data = grad_tensor.template data<T>();
+      auto* rows = grad_merge.rows().data();
+      auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
+
+      SparseAdamFunctor<T> functor(
+          beta1, beta2, epsilon, beta1_pow.template data<T>(),
+          beta2_pow.template data<T>(), mom1.template data<T>(),
+          mom1_out.template mutable_data<T>(ctx.GetPlace()),
+          mom2.template data<T>(),
+          mom2_out.template mutable_data<T>(ctx.GetPlace()),
+          lr.template data<T>(), grad_data, param.template data<T>(),
+          param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel);
+      platform::ForRange<DeviceContext> for_range(
+          static_cast<const DeviceContext&>(ctx.device_context()),
+          grad_merge.rows().size());
+      for_range(functor);
+    } else {
+      PADDLE_THROW("Variable type not supported by adam_op");
+    }
  }
 };


--- a/paddle/operators/array_operator.h
+++ b/paddle/operators/array_operator.h
@@ -35,8 +35,8 @@ class ArrayOp : public framework::OperatorBase {
    PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);

    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);

    size_t offset;
    if (platform::is_gpu_place(i_tensor.place())) {

--- a/paddle/operators/array_to_lod_tensor_op.cc
+++ b/paddle/operators/array_to_lod_tensor_op.cc
@@ -106,8 +106,9 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
        }
        auto slice = out->Slice(out_offset, out_offset + len);

-        platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-        auto &dev_ctx = *pool.Borrow(place);
+        platform::DeviceContextPool &pool =
+            platform::DeviceContextPool::Instance();
+        auto &dev_ctx = *pool.Get(place);

        framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place,
                            dev_ctx, &slice);

--- a/paddle/operators/assign_op.cc
+++ b/paddle/operators/assign_op.cc
@@ -82,8 +82,8 @@ class AssignOp : public framework::OperatorBase {
        out != nullptr,
        "The Output(Out) should not be null if the Input(X) is set.");

-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);

    framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
  }

--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -50,10 +50,6 @@ class BatchNormOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), "");
    PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), "");

-    const float epsilon = ctx->Attrs().Get<float>("epsilon");
-    PADDLE_ENFORCE_GE(epsilon, 0.0, "epsilon should be larger than 0");
-    PADDLE_ENFORCE_LE(epsilon, 0.001, "epsilon should not be too large");
-
    // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
    PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0],
                      "Mean and MeanOut should share the same memory");
@@ -91,7 +87,12 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddAttr<bool>("is_test", "").SetDefault(false);
    AddAttr<float>("momentum", "").SetDefault(0.9);
-    AddAttr<float>("epsilon", "").SetDefault(1e-5);
+    AddAttr<float>("epsilon", "")
+        .SetDefault(1e-5)
+        .AddCustomChecker([](const float &epsilon) {
+          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
+                         "'epsilon' should be between 0.0 and 0.001.");
+        });
    AddAttr<std::string>("data_layout", "").SetDefault("NCHW");
    AddInput("X", "The input tensor");
    AddInput("Scale",

--- a/paddle/operators/beam_search_decode_op.cc
+++ b/paddle/operators/beam_search_decode_op.cc
@@ -57,8 +57,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope& scope,
           const platform::Place& dev_place) const override {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
-    auto& dev_ctx = *pool.Borrow(dev_place);
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& dev_ctx = *pool.Get(dev_place);

    framework::ExecutionContext ctx(*this, scope, dev_ctx);


--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -195,8 +195,8 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope,

 void CondOp::Run(const Scope& scope, const platform::Place& place) const {
  // get device context from pool
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
-  auto& dev_ctx = *pool.Borrow(place);
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& dev_ctx = *pool.Get(place);

  PrepareDataForSubnet(scope, dev_ctx);
  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);

--- a/paddle/operators/conv_cudnn_op.cu.cc
+++ b/paddle/operators/conv_cudnn_op.cu.cc
@@ -315,6 +315,10 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle

+REGISTER_OP_KERNEL(conv2d, CUDNN, paddle::platform::CUDAPlace,
+                   paddle::operators::CudnnConvOpKernel<float>,
+                   paddle::operators::CudnnConvOpKernel<double>);
+
 REGISTER_OP_CUDA_KERNEL(conv2d_cudnn,
                        paddle::operators::CudnnConvOpKernel<float>,
                        paddle::operators::CudnnConvOpKernel<double>);

--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -31,8 +31,6 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
  int groups = ctx->Attrs().Get<int>("groups");
  std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
-  int input_channels = in_dims[1];
-  int output_channels = filter_dims[0];

  PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
                 "Conv intput should be 4-D or 5-D tensor.");
@@ -45,9 +43,13 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE_EQ(
      paddings.size(), strides.size(),
      "Conv paddings dimension and Conv strides dimension should be the same.");
+
+  int input_channels = in_dims[1];
  PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups,
                    "The number of input channels should be equal to filter "
                    "channels * groups.");
+
+  int output_channels = filter_dims[0];
  PADDLE_ENFORCE_EQ(
      output_channels % groups, 0,
      "The number of output channels should be divided by groups.");

--- a/paddle/operators/cos_sim_op.h
+++ b/paddle/operators/cos_sim_op.h
@@ -13,19 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/cos_sim_functor.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/platform/for_range.h"

 namespace paddle {
 namespace operators {

 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;

 template <typename DeviceContext, typename T>
 class CosSimKernel : public framework::OpKernel<T> {
@@ -41,28 +37,25 @@ class CosSimKernel : public framework::OpKernel<T> {
    out_x_norm->mutable_data<T>(context.GetPlace());
    out_y_norm->mutable_data<T>(context.GetPlace());

-    // convert Tensor to Eigen Tensor
    int rows_x = in_x->dims()[0];
    int rows_y = in_y->dims()[0];
-    auto x = EigenMatrix<T>::Reshape(*in_x, 1);
-    auto y = EigenMatrix<T>::Reshape(*in_y, 1);
-    auto z = EigenVector<T>::Flatten(*out_z);
-    auto x_norm = EigenVector<T>::Flatten(*out_x_norm);
-    auto y_norm = EigenVector<T>::Flatten(*out_y_norm);

-    // compute
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    auto row_along = Eigen::array<int, 1>({{1}});
-    x_norm.device(place) = x.square().sum(row_along).sqrt();
-    y_norm.device(place) = y.square().sum(row_along).sqrt();
+    int cols = framework::product(in_x->dims()) / rows_x;
+
    if (rows_x == rows_y) {
-      auto xy = (x * y).sum(Eigen::array<int, 1>({{1}}));
-      z.device(place) = xy / x_norm / y_norm;
+      math::CosSimFunctor<T, true> functor(
+          in_x->data<T>(), in_y->data<T>(), out_x_norm->data<T>(),
+          out_y_norm->data<T>(), out_z->data<T>(), cols);
+      platform::ForRange<DeviceContext> for_range(
+          static_cast<const DeviceContext&>(context.device_context()), rows_x);
+      for_range(functor);
    } else {
-      Eigen::DSizes<int, 2> bcast(rows_x, 1);
-      auto xy = (x * y.broadcast(bcast)).sum(row_along);
-      z.device(place) = xy / x_norm / y_norm.broadcast(bcast);
+      math::CosSimFunctor<T, false> functor(
+          in_x->data<T>(), in_y->data<T>(), out_x_norm->data<T>(),
+          out_y_norm->data<T>(), out_z->data<T>(), cols);
+      platform::ForRange<DeviceContext> for_range(
+          static_cast<const DeviceContext&>(context.device_context()), rows_x);
+      for_range(functor);
    }
  }
 };
@@ -81,62 +74,54 @@ class CosSimGradKernel : public framework::OpKernel<T> {
    auto* out_grad_y = context.Output<Tensor>(framework::GradVarName("Y"));
    auto* in_grad_z = context.Input<Tensor>(framework::GradVarName("Out"));

-    // convert Tensor to Eigen Tensor
-    auto x = EigenMatrix<T>::Reshape(*in_x, 1);
-    auto y = EigenMatrix<T>::Reshape(*in_y, 1);
-    auto z = EigenMatrix<T>::Reshape(*in_z, 1);
-    auto x_norm = EigenMatrix<T>::Reshape(*in_x_norm, 1);
-    auto y_norm = EigenMatrix<T>::Reshape(*in_y_norm, 1);
-    auto dz = EigenMatrix<T>::Reshape(*in_grad_z, 1);
-
    // compute gradident
    int rows_x = in_x->dims()[0];
    int rows_y = in_y->dims()[0];
    int cols = framework::product(in_x->dims()) / rows_x;
-    Eigen::DSizes<int, 2> bcast_cols(1, cols);
-    auto z_bcast = z.broadcast(bcast_cols);
-    auto dz_bcast = dz.broadcast(bcast_cols);
-    auto x_snorm_bcast = x_norm.square().eval().broadcast(bcast_cols);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
+
    if (rows_x == rows_y) {
-      auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_cols);
-      auto norm_prod_bcast = (x_norm * y_norm).eval().broadcast(bcast_cols);
-      // compute dx
      if (out_grad_x) {
-        out_grad_x->mutable_data<T>(context.GetPlace());
-        auto dx = EigenMatrix<T>::Reshape(*out_grad_x, 1);
-        auto grad = y / norm_prod_bcast - z_bcast * x / x_snorm_bcast;
-        dx.device(place) = dz_bcast * grad;
+        math::CosSimGradFunctor<T> functor(
+            in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(),
+            in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
+            out_grad_x->mutable_data<T>(context.GetPlace()), cols);
+        platform::ForRange<DeviceContext> for_range(
+            static_cast<const DeviceContext&>(context.device_context()),
+            rows_x);
+        for_range(functor);
      }
-      // compute dy
      if (out_grad_y) {
-        out_grad_y->mutable_data<T>(context.GetPlace());
-        auto dy = EigenMatrix<T>::Reshape(*out_grad_y, 1);
-        auto grad = x / norm_prod_bcast - z_bcast * y / y_snorm_bcast;
-        dy.device(place) = dz_bcast * grad;
+        math::CosSimGradFunctor<T> functor(
+            in_y_norm->data<T>(), in_x_norm->data<T>(), in_y->data<T>(),
+            in_x->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
+            out_grad_y->mutable_data<T>(context.GetPlace()), cols);
+        platform::ForRange<DeviceContext> for_range(
+            static_cast<const DeviceContext&>(context.device_context()),
+            rows_x);
+        for_range(functor);
      }
    } else {
-      Eigen::DSizes<int, 2> bcast_rows(rows_x, 1);
-      Eigen::DSizes<int, 2> bcast_rows_cols(rows_x, cols);
-      auto y_bcast = y.broadcast(bcast_rows);
-      auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_rows_cols);
-      auto norm_prod_bcast = (x_norm * y_norm.eval().broadcast(bcast_rows))
-                                 .eval()
-                                 .broadcast(bcast_cols);
-      // compute dx
      if (out_grad_x) {
-        out_grad_x->mutable_data<T>(context.GetPlace());
-        auto dx = EigenMatrix<T>::Reshape(*out_grad_x, 1);
-        auto grad = y_bcast / norm_prod_bcast - z_bcast * x / x_snorm_bcast;
-        dx.device(place) = dz_bcast * grad;
+        math::CosSimDxFunctor<T> functor(
+            in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(),
+            in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
+            out_grad_x->mutable_data<T>(context.GetPlace()), cols);
+        platform::ForRange<DeviceContext> for_range(
+            static_cast<const DeviceContext&>(context.device_context()),
+            rows_x);
+        for_range(functor);
      }
-      // compute dy
      if (out_grad_y) {
        out_grad_y->mutable_data<T>(context.GetPlace());
-        auto dy = EigenVector<T>::Flatten(*out_grad_y);
-        auto grad = x / norm_prod_bcast - z_bcast * y_bcast / y_snorm_bcast;
-        dy.device(place) = (dz_bcast * grad).sum(Eigen::array<int, 1>({{0}}));
+        math::SetConstant<DeviceContext, T> set_zero;
+        auto& dev_ctx = context.template device_context<DeviceContext>();
+        set_zero(dev_ctx, out_grad_y, static_cast<T>(0));
+
+        math::CosSimDyFunctor<DeviceContext, T> functor;
+        functor(dev_ctx, in_x_norm->data<T>(), in_y_norm->data<T>(),
+                in_x->data<T>(), in_y->data<T>(), in_z->data<T>(),
+                in_grad_z->data<T>(), static_cast<size_t>(rows_x),
+                static_cast<size_t>(cols), out_grad_y->data<T>());
      }
    }
  }

--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -114,15 +114,15 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
  CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
-             "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, "
-             "where N is the batch size and D is the number of classes. "
+             "(Tensor, default Tensor<float>), a 2-D tensor with shape [N x D],"
+             " where N is the batch size and D is the number of classes. "
             "This input is a probability computed by the previous operator, "
             "which is almost always the result of a softmax operator.");
    AddInput("Label",
             "(Tensor), the ground truth which is a 2-D tensor. When "
             "soft_label is set to false, Label is a Tensor<int64> with shape "
             "[N x 1]. When soft_label is set to true, Label is a "
-             "Tensor<float/double> with shape [N x K].");
+             "Tensor<float/double> with shape [N x D].");
    AddOutput("Y",
              "(Tensor, default Tensor<float>), a 2-D tensor with shape "
              "[N x 1]. The cross entropy loss.");

--- a/paddle/operators/detection_output_op.cc
+++ b/paddle/operators/detection_output_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/detection_output_op.h"
+namespace paddle {
+namespace operators {
+
+// Declares the inputs, output, attributes and documentation of the
+// detection_output operator.
+class DetectionOutputOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  DetectionOutputOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // NOTE: adjacent string literals are concatenated verbatim, so every
+    // fragment below ends with an explicit separator.
+    AddInput("Loc",
+             "(Tensor) The input tensor of detection_output operator. "
+             "The input predict locations. "
+             "The format of input tensor is kNCHW. Where K is priorbox point "
+             "numbers, "
+             "N is How many boxes are there on each point, "
+             "C is 4, H and W both are 1.");
+    AddInput("Conf",
+             "(Tensor) The input tensor of detection_output operator. "
+             "The input priorbox confidence. "
+             "The format of input tensor is kNCHW. Where K is priorbox point "
+             "numbers, "
+             "N is How many boxes are there on each point, "
+             "C is the number of classes, H and W both are 1.");
+    AddInput("PriorBox",
+             "(Tensor) The input tensor of detection_output operator. "
+             "The format of input tensor is the position and variance "
+             "of the boxes");
+    AddOutput("Out",
+              "(Tensor) The output tensor of detection_output operator.");
+    AddAttr<int>("background_label_id", "(int), The background class index.");
+    AddAttr<int>("num_classes", "(int), The number of the classification.");
+    AddAttr<float>("nms_threshold",
+                   "(float), The Non-maximum suppression threshold.");
+    AddAttr<float>("confidence_threshold",
+                   "(float), The classification confidence threshold.");
+    AddAttr<int>("top_k", "(int), The bbox number kept of the layer’s output.");
+    AddAttr<int>("nms_top_k",
+                 "(int), The bbox number kept of the NMS’s output.");
+    AddComment(R"DOC(
+          detection output for SSD(single shot multibox detector)
+          Apply the NMS to the output of network and compute the predict
+          bounding box location. The output’s shape of this layer could
+          be zero if there is no valid bounding box.
+        )DOC");
+  }
+};
+
+// Operator definition for detection_output: validates that all inputs and
+// the output are present and sets a placeholder output shape.
+class DetectionOutputOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    // Each message names the input that is actually being checked.
+    PADDLE_ENFORCE(ctx->HasInput("Loc"),
+                   "Input(Loc) of DetectionOutputOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Conf"),
+                   "Input(Conf) of DetectionOutputOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
+                   "Input(PriorBox) of DetectionOutputOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of DetectionOutputOp should not be null.");
+    // The shape set here is fixed at {1, 7}; the kernel resizes "Out" to
+    // {num_kept, 7} (or {0, 0}) once the detections are known.
+    std::vector<int64_t> output_shape({1, 7});
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Register detection_output as an operator without a gradient, together with
+// its CPU kernels for float and double.
+REGISTER_OP_WITHOUT_GRADIENT(detection_output, ops::DetectionOutputOp,
+                             ops::DetectionOutputOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    detection_output,
+    ops::DetectionOutputKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::DetectionOutputKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/operators/detection_output_op.cu.cc
+++ b/paddle/operators/detection_output_op.cu.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/detection_output_op.h"
+
+namespace ops = paddle::operators;
+// Register the CUDA kernels of detection_output for float and double.
+REGISTER_OP_CUDA_KERNEL(
+    detection_output,
+    ops::DetectionOutputKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::DetectionOutputKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/operators/detection_output_op.h
+++ b/paddle/operators/detection_output_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/operators/math/detection_util.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/softmax.h"
+#include "paddle/operators/strided_memcpy.h"
+namespace paddle {
+namespace operators {
+// Transposes every slice along dim 0 of `src` with axis order {0, 1, 3, 4, 2}
+// and strided-copies the results, one after another, into `dst`.
+//
+// `dst` is only read through dims() and data<T>() here and is never
+// allocated, so the caller is expected to have sized and allocated it.
+template <typename DeviceContext, typename T>
+inline void transpose_fun(const framework::ExecutionContext& context,
+                          const framework::Tensor& src,
+                          framework::Tensor* dst) {
+  int slice_count = src.dims()[0];
+  int write_pos = 0;  // element offset into dst where the next slice goes
+  for (int slice_idx = 0; slice_idx < slice_count; ++slice_idx) {
+    framework::Tensor slice = src.Slice(slice_idx, slice_idx + 1);
+
+    // The transposed dims are the slice dims permuted by `perm`.
+    std::vector<int> perm({0, 1, 3, 4, 2});
+    std::vector<int64_t> permuted_dims;
+    for (int axis : perm) {
+      permuted_dims.push_back(slice.dims()[axis]);
+    }
+
+    framework::Tensor transposed;
+    transposed.mutable_data<T>(framework::make_ddim(permuted_dims),
+                               context.GetPlace());
+
+    math::Transpose<DeviceContext, T, 5> transpose5d;
+    transpose5d(context.template device_context<DeviceContext>(), slice,
+                &transposed, perm);
+
+    // Copy the transposed slice into dst using dst's own strides.
+    auto to_stride = framework::stride(dst->dims());
+    auto from_stride = framework::stride(transposed.dims());
+    StridedMemcpy<T>(context.device_context(), transposed.data<T>(),
+                     from_stride, transposed.dims(), to_stride,
+                     dst->data<T>() + write_pos);
+
+    // Advance by the extent of the last (transposed) axis.
+    write_pos += transposed.dims()[4] * from_stride[4];
+  }
+}
+// Kernel of the detection_output operator.
+//
+// Compute() rearranges "Loc" and "Conf" with transpose_fun, applies softmax
+// to the confidences, decodes one box per prior from "PriorBox" and the
+// location predictions, selects the detections to keep with
+// math::GetDetectionIndices and writes them to "Out" with
+// math::GetDetectionOutput. "Out" is resized to {num_kept, 7}, or to {0, 0}
+// when nothing is kept.
+template <typename DeviceContext, typename T>
+class DetectionOutputKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_loc = context.Input<framework::Tensor>("Loc");
+    const framework::Tensor* in_conf = context.Input<framework::Tensor>("Conf");
+    const framework::Tensor* in_priorbox =
+        context.Input<framework::Tensor>("PriorBox");
+    auto* out = context.Output<framework::Tensor>("Out");
+    int num_classes = context.template Attr<int>("num_classes");
+    int top_k = context.template Attr<int>("top_k");
+    int nms_top_k = context.template Attr<int>("nms_top_k");
+    int background_label_id = context.template Attr<int>("background_label_id");
+    float nms_threshold = context.template Attr<float>("nms_threshold");
+    float confidence_threshold =
+        context.template Attr<float>("confidence_threshold");
+    // The batch size is taken from dim 1 of the 5-D "Conf" input.
+    size_t batch_size = in_conf->dims()[1];
+    int conf_sum_size = in_conf->numel();
+    // for softmax
+    std::vector<int64_t> conf_shape_softmax_vec(
+        {conf_sum_size / num_classes, num_classes});
+    framework::DDim conf_shape_softmax(
+        framework::make_ddim(conf_shape_softmax_vec));
+    // for knchw => nhwc
+    std::vector<int64_t> loc_shape_vec({1, in_loc->dims()[1], in_loc->dims()[3],
+                                        in_loc->dims()[4],
+                                        in_loc->dims()[2] * in_loc->dims()[0]});
+    std::vector<int64_t> conf_shape_vec(
+        {1, in_conf->dims()[1], in_conf->dims()[3], in_conf->dims()[4],
+         in_conf->dims()[2] * in_conf->dims()[0]});
+    framework::DDim loc_shape(framework::make_ddim(loc_shape_vec));
+    framework::DDim conf_shape(framework::make_ddim(conf_shape_vec));
+    framework::Tensor loc_tensor;
+    framework::Tensor conf_tensor;
+    loc_tensor.mutable_data<T>(loc_shape, context.GetPlace());
+    conf_tensor.mutable_data<T>(conf_shape, context.GetPlace());
+    // for cpu
+    framework::Tensor loc_cpu;
+    framework::Tensor conf_cpu;
+    framework::Tensor priorbox_cpu;
+    const T* priorbox_data = in_priorbox->data<T>();
+    transpose_fun<DeviceContext, T>(context, *in_loc, &loc_tensor);
+    transpose_fun<DeviceContext, T>(context, *in_conf, &conf_tensor);
+    // View the confidences as {total / num_classes, num_classes} and apply
+    // softmax in place.
+    conf_tensor.Resize(conf_shape_softmax);
+    math::SoftmaxFunctor<DeviceContext, T>()(
+        context.template device_context<DeviceContext>(), &conf_tensor,
+        &conf_tensor);
+    T* loc_data = loc_tensor.data<T>();
+    T* conf_data = conf_tensor.data<T>();
+    // The decoding and NMS below read the data through raw host pointers,
+    // so on a GPU place everything is first copied to CPU tensors.
+    // NOTE(review): the host pointers are used right after CopyFrom; confirm
+    // whether these copies are synchronous or need an explicit wait.
+    if (platform::is_gpu_place(context.GetPlace())) {
+      loc_cpu.mutable_data<T>(loc_tensor.dims(), platform::CPUPlace());
+      framework::CopyFrom(loc_tensor, platform::CPUPlace(),
+                          context.device_context(), &loc_cpu);
+      loc_data = loc_cpu.data<T>();
+      conf_cpu.mutable_data<T>(conf_tensor.dims(), platform::CPUPlace());
+      framework::CopyFrom(conf_tensor, platform::CPUPlace(),
+                          context.device_context(), &conf_cpu);
+      conf_data = conf_cpu.data<T>();
+      priorbox_cpu.mutable_data<T>(in_priorbox->dims(), platform::CPUPlace());
+      framework::CopyFrom(*in_priorbox, platform::CPUPlace(),
+                          context.device_context(), &priorbox_cpu);
+      priorbox_data = priorbox_cpu.data<T>();
+    }
+    // get decode bboxes
+    // Each prior occupies 8 consecutive values in PriorBox (presumably 4 box
+    // coordinates followed by 4 variances -- confirm against
+    // math::GetBBoxFromPriorData / math::GetBBoxVarFromPriorData).
+    size_t num_priors = in_priorbox->numel() / 8;
+    std::vector<std::vector<operators::math::BBox<T>>> all_decoded_bboxes;
+    for (size_t n = 0; n < batch_size; ++n) {
+      std::vector<operators::math::BBox<T>> decoded_bboxes;
+      for (size_t i = 0; i < num_priors; ++i) {
+        size_t prior_offset = i * 8;
+        // 4 location predictions per prior, laid out batch-major.
+        size_t loc_pred_offset = n * num_priors * 4 + i * 4;
+        std::vector<math::BBox<T>> prior_bbox_vec;
+        math::GetBBoxFromPriorData<T>(priorbox_data + prior_offset, 1,
+                                      prior_bbox_vec);
+        std::vector<std::vector<T>> prior_bbox_var;
+        math::GetBBoxVarFromPriorData<T>(priorbox_data + prior_offset, 1,
+                                         prior_bbox_var);
+        std::vector<T> loc_pred_data;
+        for (size_t j = 0; j < 4; ++j)
+          loc_pred_data.push_back(*(loc_data + loc_pred_offset + j));
+        math::BBox<T> bbox = math::DecodeBBoxWithVar<T>(
+            prior_bbox_vec[0], prior_bbox_var[0], loc_pred_data);
+        decoded_bboxes.push_back(bbox);
+      }
+      all_decoded_bboxes.push_back(decoded_bboxes);
+    }
+    std::vector<std::map<size_t, std::vector<size_t>>> all_indices;
+    int num_kept = math::GetDetectionIndices<T>(
+        conf_data, num_priors, num_classes, background_label_id, batch_size,
+        confidence_threshold, nms_top_k, nms_threshold, top_k,
+        all_decoded_bboxes, &all_indices);
+
+    // Nothing kept: give "Out" an empty shape and do not allocate it.
+    if (num_kept <= 0) {
+      std::vector<int64_t> out_shape_vec({0, 0});
+      framework::DDim out_shape(framework::make_ddim(out_shape_vec));
+      out->Resize(out_shape);
+      return;
+    }
+    std::vector<int64_t> out_shape_vec({num_kept, 7});
+    framework::DDim out_shape(framework::make_ddim(out_shape_vec));
+    out->mutable_data<T>(out_shape, context.GetPlace());
+    framework::Tensor out_cpu;
+    T* out_data = out->data<T>();
+    // On a GPU place the result is assembled in a CPU staging tensor first.
+    if (platform::is_gpu_place(context.GetPlace())) {
+      out_cpu.mutable_data<T>(out->dims(), platform::CPUPlace());
+      out_data = out_cpu.data<T>();
+    }
+    math::GetDetectionOutput<T>(conf_data, num_kept, num_priors, num_classes,
+                                batch_size, all_indices, all_decoded_bboxes,
+                                out_data);
+    if (platform::is_gpu_place(context.GetPlace())) {
+      // Copy back to the place this kernel runs on (where `out` was allocated
+      // above) rather than to a default-constructed platform::CUDAPlace,
+      // which need not be the device this kernel is executing on.
+      framework::CopyFrom(out_cpu, context.GetPlace(),
+                          context.device_context(), out);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@@ -25,8 +25,6 @@ class DropoutOp : public framework::OperatorWithKernel {

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
-    PADDLE_ENFORCE_GE(ctx->Attrs().Get<float>("dropout_prob"), 0);
-    PADDLE_ENFORCE_LE(ctx->Attrs().Get<float>("dropout_prob"), 1);

    auto x_dims = ctx->GetInputDim("X");
    ctx->SetOutputDim("Out", x_dims);
@@ -47,7 +45,11 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate();

    AddAttr<float>("dropout_prob", "Probability of setting units to zero.")
-        .SetDefault(.5f);
+        .SetDefault(.5f)
+        .AddCustomChecker([](const float& drop_p) {
+          PADDLE_ENFORCE(drop_p >= 0.0f && drop_p <= 1.0f,
+                         "'dropout_prob' must be between 0.0 and 1.0.");
+        });
    AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
    AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);

@@ -78,8 +80,6 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) must not be null.");

-    PADDLE_ENFORCE_GE(ctx->Attrs().Get<float>("dropout_prob"), 0);
-    PADDLE_ENFORCE_LE(ctx->Attrs().Get<float>("dropout_prob"), 1);
    auto x_dims = ctx->GetInputDim("X");
    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
    PADDLE_ENFORCE_EQ(x_dims, out_dims,

--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
@@ -30,16 +30,15 @@ struct MaskGenerator {
  __host__ __device__ MaskGenerator(AttrType dropout_prob, int seed)
      : dropout_prob(dropout_prob), seed(seed) {}

-  __host__ __device__ T operator()(const unsigned int n) const {
+  inline __host__ __device__ T operator()(const unsigned int n) const {
    thrust::minstd_rand rng;
    rng.seed(seed);
    thrust::uniform_real_distribution<AttrType> dist(0, 1);
    rng.discard(n);
    if (dist(rng) < dropout_prob) {
      return static_cast<T>(0);
-    } else {
-      return static_cast<T>(1);
    }
+    return static_cast<T>(1);
  }
 };


--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -49,8 +49,8 @@ class FeedOp : public framework::OperatorBase {
    auto *out_item = out_var->GetMutable<framework::FeedFetchType>();

    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);

    framework::CopyFrom(feed_item, place, dev_ctx, out_item);
    out_item->set_lod(feed_item.lod());

--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -52,8 +52,8 @@ class FetchOp : public framework::OperatorBase {

    // FIXME(yuyang18): Should we assume the fetch operator always generate
    // CPU outputs?
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);

    CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
    dev_ctx.Wait();

--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/operators/fill_constant_op.cc
@@ -49,8 +49,8 @@ class FillConstantOp : public framework::OperatorBase {
      out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
    }

-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(dev_place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
    math::set_constant(dev_ctx, &out, value);
  }
 };

--- a/paddle/operators/fill_op.cc
+++ b/paddle/operators/fill_op.cc
@@ -69,8 +69,9 @@ class FillOp : public framework::OperatorBase {

    if (!force_cpu && platform::is_gpu_place(place)) {
      // Copy tensor to out
-      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-      auto &dev_ctx = *pool.Borrow(place);
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);
      framework::CopyFrom(tensor, place, dev_ctx, &out);
    }
  }

--- a/paddle/operators/gru_op.h
+++ b/paddle/operators/gru_op.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include "paddle/operators/math/detail/activation_functions.h"
 #include "paddle/operators/math/gru_compute.h"
 #include "paddle/operators/math/math_function.h"
 #include "paddle/operators/math/sequence2batch.h"
@@ -70,7 +71,7 @@ class GRUKernel : public framework::OpKernel<T> {
    }

    int frame_size = hidden_dims[1];
-    math::hl_gru_value<T> gru_value;
+    math::GRUMetaValue<T> gru_value;
    gru_value.gate_weight = const_cast<T*>(weight_data);
    gru_value.state_weight =
        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
@@ -89,6 +90,10 @@ class GRUKernel : public framework::OpKernel<T> {
    }
    auto batch_starts = batch_gate->lod()[0];
    size_t num_batch = batch_starts.size() - 1;
+    auto active_node = math::detail::GetActivationType(
+        context.Attr<std::string>("activation"));
+    auto active_gate = math::detail::GetActivationType(
+        context.Attr<std::string>("gate_activation"));
    for (size_t n = 0; n < num_batch; n++) {
      int bstart = static_cast<int>(batch_starts[n]);
      int bend = static_cast<int>(batch_starts[n + 1]);
@@ -101,9 +106,8 @@ class GRUKernel : public framework::OpKernel<T> {
      gru_value.gate_value = gate_t.data<T>();
      gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
      math::GRUUnitFunctor<DeviceContext, T>::compute(
-          dev_ctx, gru_value, frame_size, cur_batch_size,
-          math::ActiveType(context.Attr<std::string>("activation")),
-          math::ActiveType(context.Attr<std::string>("gate_activation")));
+          dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
+          active_gate);
      gru_value.prev_out_value = gru_value.output_value;
    }

@@ -170,12 +174,12 @@ class GRUGradKernel : public framework::OpKernel<T> {
    batch_hidden_grad.set_lod(batch_hidden->lod());
    to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse);

-    math::hl_gru_value<T> gru_value;
+    math::GRUMetaValue<T> gru_value;
    gru_value.gate_weight = const_cast<T*>(weight_data);
    gru_value.state_weight =
        const_cast<T*>(weight_data + 2 * frame_size * frame_size);

-    math::hl_gru_grad<T> gru_grad;
+    math::GRUMetaGrad<T> gru_grad;
    if (weight_grad) {
      gru_grad.gate_weight_grad =
          weight_grad->mutable_data<T>(context.GetPlace());
@@ -189,6 +193,10 @@ class GRUGradKernel : public framework::OpKernel<T> {

    auto batch_starts = batch_hidden_grad.lod()[0];
    size_t num_batch = batch_starts.size() - 1;
+    auto active_node = math::detail::GetActivationType(
+        context.Attr<std::string>("activation"));
+    auto active_gate = math::detail::GetActivationType(
+        context.Attr<std::string>("gate_activation"));
    for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
      int bstart = static_cast<int>(batch_starts[n]);
      int bend = static_cast<int>(batch_starts[n + 1]);
@@ -219,9 +227,8 @@ class GRUGradKernel : public framework::OpKernel<T> {
      }

      math::GRUUnitGradFunctor<DeviceContext, T>::compute(
-          dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size,
-          math::ActiveType(context.Attr<std::string>("activation")),
-          math::ActiveType(context.Attr<std::string>("gate_activation")));
+          dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, active_node,
+          active_gate);
    }
    if (input_grad) {
      input_grad->mutable_data<T>(context.GetPlace());

--- a/paddle/operators/load_op.cc
+++ b/paddle/operators/load_op.cc
@@ -38,10 +38,10 @@ class LoadOp : public framework::OperatorBase {
                   out_var_name);

    auto *tensor = out_var->GetMutable<framework::LoDTensor>();
-    framework::DeserializeFromStream(fin, tensor);
+    DeserializeFromStream(fin, tensor);

-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);

    if (platform::is_gpu_place(place)) {
      // copy CPU to GPU

--- a/paddle/operators/lod_tensor_to_array_op.cc
+++ b/paddle/operators/lod_tensor_to_array_op.cc
@@ -88,8 +88,9 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
        auto slice = out[i].Slice(static_cast<int>(offset),
                                  static_cast<int>(offset + len));

-        platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-        auto &dev_ctx = *pool.Borrow(place);
+        platform::DeviceContextPool &pool =
+            platform::DeviceContextPool::Instance();
+        auto &dev_ctx = *pool.Get(place);

        framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin),
                                    static_cast<int>(each_range.end)),

--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -9,13 +9,14 @@ if(WITH_GPU)
    nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context)
    nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
    nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
-    nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context)
+    nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
    nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
-    nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
+    nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor)
    nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
    nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
    nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
+    nv_library(cos_sim_functor SRCS cos_sim_functor.cc cos_sim_functor.cu DEPS device_context)
 else()
    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
    cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
@@ -23,13 +24,14 @@ else()
    cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context)
    cc_library(pooling SRCS pooling.cc DEPS device_context)
    cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
-    cc_library(vol2col SRCS vol2col.cc DEPS device_context)
+    cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor)
    cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
-    cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
+    cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor)
    cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
    cc_library(maxouting SRCS maxouting.cc DEPS device_context)
    cc_library(unpooling SRCS unpooling.cc DEPS device_context)
    cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
+    cc_library(cos_sim_functor SRCS cos_sim_functor.cc DEPS device_context)
 endif()

 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)

--- a/paddle/operators/math/cos_sim_functor.cc
+++ b/paddle/operators/math/cos_sim_functor.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/cos_sim_functor.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// CPU specialization: gradient of cosine similarity w.r.t. a single Y row
+// that is shared by every row of X.
+//
+// For each row r and column i this adds
+//   dz[r] * (x[r][i] / (x_norm[r] * y_norm[0]) - z[r] * y[i] / y_norm[0]^2)
+// into dy[i]. dy is only ever incremented here, so the caller is presumably
+// expected to zero it beforehand -- TODO confirm against the calling kernel.
+//
+// ctx is unused; it only selects this specialization.
+template <typename T>
+struct CosSimDyFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx, const T* x_norm,
+                  const T* y_norm, const T* x, const T* y, const T* z,
+                  const T* dz, const size_t rows, const size_t cols,
+                  T* dy) const {
+    // Nothing to accumulate; also keeps y_norm[0] untouched for empty input,
+    // exactly as the per-row read did before the hoist below.
+    if (rows == 0) return;
+
+    // y_norm[0] does not depend on the row, so derive its terms only once
+    // instead of once per row.
+    const T y_norm_data = y_norm[0];
+    const T y_norm_square = y_norm_data * y_norm_data;
+    const T reciprocal_y_norm_square = 1 / y_norm_square;
+
+    for (size_t row_id = 0; row_id < rows; ++row_id) {
+      const T xy_norm_prod = x_norm[row_id] * y_norm_data;
+      const T reciprocal_xy_norm_prod = 1 / xy_norm_prod;
+      const T dz_data = dz[row_id];
+      const T z_data = z[row_id];
+      const T* x_data = x + cols * row_id;
+
+      for (size_t i = 0; i < cols; ++i) {
+        dy[i] += dz_data * (x_data[i] * reciprocal_xy_norm_prod -
+                            z_data * y[i] * reciprocal_y_norm_square);
+      }
+    }
+  }
+};
+
+template struct CosSimDyFunctor<platform::CPUDeviceContext, float>;
+template struct CosSimDyFunctor<platform::CPUDeviceContext, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/cos_sim_functor.cu
+++ b/paddle/operators/math/cos_sim_functor.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/cos_sim_functor.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// Kernel for the cosine-similarity gradient w.r.t. a single Y row shared by
+// all rows of X.
+//
+// Each thread walks rows of X in a grid-stride loop along the x dimension
+// (start = global x index, step = blockDim.x * gridDim.x). For every row it
+// visits, it atomically adds that row's contribution to each dy[col], since
+// several threads update the same dy elements concurrently.
+// dy is only incremented, never cleared, by this kernel.
+template <typename T>
+__global__ void CosSimDyKernel(const T* x_norm, const T* y_norm, const T* x,
+                               const T* y, const T* z, const T* dz,
+                               const size_t rows, const size_t cols, T* dy) {
+  const int stride = blockDim.x * gridDim.x;
+  const T y_norm_data = y_norm[0];
+  int row_id = blockIdx.x * blockDim.x + threadIdx.x;
+  while (row_id < rows) {
+    const T* x_row = x + cols * row_id;
+    const T inv_xy_norm = 1 / (x_norm[row_id] * y_norm_data);
+    const T inv_y_norm_sq = 1 / (y_norm_data * y_norm_data);
+    const T dz_row = dz[row_id];
+    const T z_row = z[row_id];
+
+    for (size_t col = 0; col < cols; ++col) {
+      const T contribution =
+          dz_row * (x_row[col] * inv_xy_norm - z_row * y[col] * inv_y_norm_sq);
+      platform::CudaAtomicAdd(dy + col, contribution);
+    }
+    row_id += stride;
+  }
+}
+
+// CUDA specialization: launches CosSimDyKernel on ctx's stream to accumulate
+// the cosine-similarity gradient w.r.t. the shared Y row into dy.
+//
+// All pointers are handed straight to the kernel, so they must be
+// device-accessible. The launch is asynchronous with respect to the host.
+template <typename T>
+struct CosSimDyFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx, const T* x_norm,
+                  const T* y_norm, const T* x, const T* y, const T* z,
+                  const T* dz, const size_t rows, const size_t cols,
+                  T* dy) const {
+    // A grid dimension of zero is an invalid launch configuration, and there
+    // is nothing to accumulate anyway.
+    if (rows == 0) return;
+
+    const int block_size = 512;
+    dim3 threads(block_size, 1);
+    // The kernel indexes rows with blockIdx.x / gridDim.x only, so the block
+    // count must go in the x dimension. Placing it in y makes every block
+    // walk all rows and adds each contribution to dy once per block.
+    dim3 grid((rows + block_size - 1) / block_size, 1);
+    CosSimDyKernel<T><<<grid, threads, 0, ctx.stream()>>>(
+        x_norm, y_norm, x, y, z, dz, rows, cols, dy);
+  }
+};
+
+template struct CosSimDyFunctor<platform::CUDADeviceContext, float>;
+template struct CosSimDyFunctor<platform::CUDADeviceContext, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/cos_sim_functor.h
+++ b/paddle/operators/math/cos_sim_functor.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <math.h>
+#include <stdlib.h>
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// Computes, for one row of X, its L2 norm, the L2 norm of the matching Y row
+// and their cosine similarity.
+//
+// same_row == true : Y is indexed per row (row `row_id` of X pairs with row
+//                    `row_id` of Y) and y_norm_[row_id] is written.
+// same_row == false: every row of X pairs with the first `cols` elements of
+//                    Y, and only the call for row 0 writes y_norm_[0].
+//
+// No guard against zero norms is applied before the division.
+template <typename T, bool same_row>
+struct CosSimFunctor {
+  CosSimFunctor(const T* x, const T* y, T* x_norm, T* y_norm, T* z, int cols)
+      : x_norm_(x_norm),
+        y_norm_(y_norm),
+        x_(x),
+        y_(y),
+        z_(z),
+        cols_(static_cast<size_t>(cols)) {}
+
+  // Processes the row `row_id`; writes x_norm_, y_norm_ and z_ for it.
+  inline HOSTDEVICE void operator()(size_t row_id) const {
+    const T* x_row = x_ + cols_ * row_id;
+    // With a shared Y there is only one row to read from.
+    const T* y_row = same_row ? y_ + cols_ * row_id : y_;
+
+    T sum_xx = 0, sum_xy = 0, sum_yy = 0;
+    for (size_t col = 0; col < cols_; ++col) {
+      const T x_val = x_row[col];
+      const T y_val = y_row[col];
+      sum_xx += x_val * x_val;
+      sum_yy += y_val * y_val;
+      sum_xy += x_val * y_val;
+    }
+    const T x_len = sqrt(sum_xx);
+    const T y_len = sqrt(sum_yy);
+
+    if (same_row) {
+      y_norm_[row_id] = y_len;
+    } else if (row_id == 0) {
+      // The shared Y norm is identical for every row; store it once.
+      y_norm_[0] = y_len;
+    }
+    x_norm_[row_id] = x_len;
+    z_[row_id] = sum_xy / (x_len * y_len);
+  }
+
+  T* x_norm_;
+  T* y_norm_;
+  const T* x_;
+  const T* y_;
+  T* z_;
+  const size_t cols_;
+};
+
+// Gradient of cosine similarity for one row, where X and Y are both indexed
+// per row (x_, y_ and y_norm_ are all offset by row_id).
+//
+// Writes, for each column i of the row:
+//   dx[i] = dz * (y[i] / (|x| * |y|) - z * x[i] / |x|^2)
+// using the norms and similarity z supplied at construction.
+// Which operand is passed as "x" and which as "y" is up to the caller.
+template <typename T>
+struct CosSimGradFunctor {
+  CosSimGradFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y,
+                    const T* z, const T* dz, T* dx, int cols)
+      : x_norm_(x_norm),
+        y_norm_(y_norm),
+        x_(x),
+        y_(y),
+        z_(z),
+        dz_(dz),
+        dx_(dx),
+        cols_(static_cast<size_t>(cols)) {}
+
+  // Fills the dx_ entries belonging to row `row_id`.
+  inline HOSTDEVICE void operator()(size_t row_id) const {
+    const T x_len = x_norm_[row_id];
+    const T inv_x_norm_sq = 1 / (x_len * x_len);
+    const T inv_xy_norm = 1 / (x_len * y_norm_[row_id]);
+    const T dz_row = dz_[row_id];
+    const T z_row = z_[row_id];
+
+    const size_t offset = cols_ * row_id;
+    T* dx_row = dx_ + offset;
+    const T* x_row = x_ + offset;
+    const T* y_row = y_ + offset;
+
+    for (size_t col = 0; col < cols_; ++col) {
+      dx_row[col] = dz_row * (y_row[col] * inv_xy_norm -
+                              z_row * x_row[col] * inv_x_norm_sq);
+    }
+  }
+
+  const T* x_norm_;
+  const T* y_norm_;
+  const T* x_;
+  const T* y_;
+  const T* z_;
+  const T* dz_;
+  T* dx_;
+  const size_t cols_;
+};
+
+// Gradient of cosine similarity w.r.t. one row of X when a single Y row is
+// shared by all rows: y_ is read without a row offset and only y_norm_[0] is
+// used.
+//
+// Writes, for each column i of the row:
+//   dx[i] = dz * (y[i] / (|x| * |y|) - z * x[i] / |x|^2)
+template <typename T>
+struct CosSimDxFunctor {
+  CosSimDxFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y,
+                  const T* z, const T* dz, T* dx, int cols)
+      : x_norm_(x_norm),
+        y_norm_(y_norm),
+        x_(x),
+        y_(y),
+        z_(z),
+        dz_(dz),
+        dx_(dx),
+        cols_(static_cast<size_t>(cols)) {}
+
+  // Fills the dx_ entries belonging to row `row_id`.
+  inline HOSTDEVICE void operator()(size_t row_id) const {
+    const T x_len = x_norm_[row_id];
+    const T inv_xy_norm = 1 / (x_len * y_norm_[0]);
+    const T inv_x_norm_sq = 1 / (x_len * x_len);
+    const T dz_row = dz_[row_id];
+    const T z_row = z_[row_id];
+
+    const size_t offset = cols_ * row_id;
+    const T* x_row = x_ + offset;
+    T* dx_row = dx_ + offset;
+
+    for (size_t col = 0; col < cols_; ++col) {
+      dx_row[col] = dz_row * (y_[col] * inv_xy_norm -
+                              z_row * x_row[col] * inv_x_norm_sq);
+    }
+  }
+  const T* x_norm_;
+  const T* y_norm_;
+  const T* x_;
+  const T* y_;
+  const T* z_;
+  const T* dz_;
+  T* dx_;
+  const size_t cols_;
+};
+
+// Primary template for the device-specific functor that computes the
+// cosine-similarity gradient w.r.t. Y. Only the call operator is declared
+// here; its definition is not provided in this header.
+//
+// Parameters of operator():
+//   ctx            device context of type DeviceContext
+//   x_norm, y_norm norm arrays
+//   x, y           input data
+//   z              similarity values
+//   dz             gradient w.r.t. z
+//   rows, cols     dimensions of x
+//   dy             output gradient buffer
+template <typename DeviceContext, typename T>
+struct CosSimDyFunctor {
+  void operator()(const DeviceContext& ctx, const T* x_norm, const T* y_norm,
+                  const T* x, const T* y, const T* z, const T* dz,
+                  const size_t rows, const size_t cols, T* dy) const;
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/detail/gru_cpu_kernel.h
+++ b/paddle/operators/math/detail/gru_cpu_kernel.h
@@ -28,7 +28,7 @@ template <class OpResetOutput, typename T>
 void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
                                       T *gate_value, T *reset_output_value,
                                       T *prev_output_value, int frame_size,
-                                       activation_mode_t active_gate) {
+                                       ActivationType active_gate) {
  T r_value_update_gate;
  T r_value_reset_gate;
  T r_value_reset_output;
@@ -56,7 +56,7 @@ template <class OpFinalOutput, typename T>
 void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
                                       T *gate_value, T *prev_output_value,
                                       T *output_value, int frame_size,
-                                       activation_mode_t active_node) {
+                                       ActivationType active_node) {
  T r_value_update_gate;
  T r_value_frame_state;
  T r_prev_out = 0;
@@ -83,7 +83,7 @@ template <class OpResetOutput, typename T>
 void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
                                     T *gate_value, T *reset_output_value,
                                     T *prev_output_value, int frame_size,
-                                     activation_mode_t active_gate) {
+                                     ActivationType active_gate) {
 #ifdef __AVX__
  __m256 r_value_update_gate;
  __m256 r_value_reset_gate;
@@ -113,7 +113,7 @@ template <class OpFinalOutput, typename T>
 void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
                                     T *gate_value, T *prev_output_value,
                                     T *output_value, int frame_size,
-                                     activation_mode_t active_node) {
+                                     ActivationType active_node) {
 #ifdef __AVX__
  __m256 r_value_update_gate;
  __m256 r_value_frame_state;
@@ -140,9 +140,8 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,

 template <class OpResetOutput, typename T>
 inline void forward_reset_output(OpResetOutput op_reset_output,
-                                 hl_gru_value<T> value, int frame_size,
-                                 int batch_size,
-                                 activation_mode_t active_gate) {
+                                 GRUMetaValue<T> value, int frame_size,
+                                 int batch_size, ActivationType active_gate) {
  for (int b = 0; b < batch_size; b++) {
    if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
      hl_avx_gru_forward_reset_output(
@@ -164,9 +163,8 @@ inline void forward_reset_output(OpResetOutput op_reset_output,

 template <class OpFinalOutput, typename T>
 inline void forward_final_output(OpFinalOutput op_final_output,
-                                 hl_gru_value<T> value, int frame_size,
-                                 int batch_size,
-                                 activation_mode_t active_node) {
+                                 GRUMetaValue<T> value, int frame_size,
+                                 int batch_size, ActivationType active_node) {
  for (int b = 0; b < batch_size; b++) {
    if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
      hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
@@ -191,7 +189,7 @@ void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
                                      T *gate_grad, T *prev_out_value,
                                      T *prev_out_grad, T *output_grad,
                                      int frame_size,
-                                      activation_mode_t active_node) {
+                                      ActivationType active_node) {
  T r_update_gate_value;
  T r_update_gate_grad;
  T r_frame_state_value;
@@ -232,7 +230,7 @@ void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
                                      T *gate_grad, T *prev_out_value,
                                      T *prev_out_grad, T *reset_output_grad,
                                      int frame_size,
-                                      activation_mode_t active_gate) {
+                                      ActivationType active_gate) {
  T r_update_gate_value;
  T r_update_gate_grad;
  T r_reset_gate_value;
@@ -277,7 +275,7 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
                                    T *gate_grad, T *prev_out_value,
                                    T *prev_out_grad, T *output_grad,
                                    int frame_size,
-                                    activation_mode_t active_node) {
+                                    ActivationType active_node) {
 #ifdef __AVX__
  __m256 r_update_gate_value;
  __m256 r_update_gate_grad;
@@ -320,7 +318,7 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
                                    T *gate_grad, T *prev_out_value,
                                    T *prev_out_grad, T *reset_output_grad,
                                    int frame_size,
-                                    activation_mode_t active_gate) {
+                                    ActivationType active_gate) {
 #ifdef __AVX__
  __m256 r_update_gate_value;
  __m256 r_update_gate_grad;
@@ -364,9 +362,9 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,

 template <class OpStateGrad, typename T>
 inline void backward_state_grad(OpStateGrad op_state_grad,
-                                hl_gru_value<T> value, hl_gru_grad<T> grad,
+                                GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                                int frame_size, int batch_size,
-                                activation_mode_t active_node) {
+                                ActivationType active_node) {
  for (int b = 0; b < batch_size; b++) {
    if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
      hl_avx_gru_backward_state_grad(
@@ -393,9 +391,9 @@ inline void backward_state_grad(OpStateGrad op_state_grad,

 template <class OpResetGrad, typename T>
 inline void backward_reset_grad(OpResetGrad op_reset_grad,
-                                hl_gru_value<T> value, hl_gru_grad<T> grad,
+                                GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                                int frame_size, int batch_size,
-                                activation_mode_t active_gate) {
+                                ActivationType active_gate) {
  for (int b = 0; b < batch_size; b++) {
    if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
      hl_avx_gru_backward_reset_grad(

--- a/paddle/operators/math/detail/gru_gpu_kernel.h
+++ b/paddle/operators/math/detail/gru_gpu_kernel.h
@@ -19,8 +19,6 @@ limitations under the License. */
 #include "paddle/platform/cuda_helper.h"
 #include "paddle/platform/device_context.h"

-#include <glog/logging.h>
-
 namespace paddle {
 namespace operators {
 namespace math {
@@ -35,7 +33,7 @@ __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output,
                                        T *gate_value, T *reset_output_value,
                                        T *prev_output_value, int frame_size,
                                        int batch_size,
-                                        activation_mode_t active_gate) {
+                                        ActivationType active_gate) {
  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (frame_idx >= frame_size) return;

@@ -74,7 +72,7 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
                                        T *gate_value, T *prev_output_value,
                                        T *output_value, int frame_size,
                                        int batch_size,
-                                        activation_mode_t active_node) {
+                                        ActivationType active_node) {
  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (frame_idx >= frame_size) return;
  int batch_idx = 0;
@@ -111,7 +109,7 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
                                       T *gate_grad, T *prev_out_value,
                                       T *prev_out_grad, T *output_grad,
                                       int frame_size, int batch_size,
-                                       activation_mode_t active_node) {
+                                       ActivationType active_node) {
  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (frame_idx >= frame_size) return;
  int batch_idx = 0;
@@ -159,7 +157,7 @@ __global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value,
                                       T *gate_grad, T *prev_out_value,
                                       T *prev_out_grad, T *reset_output_grad,
                                       int frame_size, int batch_size,
-                                       activation_mode_t active_gate) {
+                                       ActivationType active_gate) {
  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (frame_idx >= frame_size) return;
  int batch_idx = 0;

--- a/paddle/operators/math/detail/gru_kernel.h
+++ b/paddle/operators/math/detail/gru_kernel.h
@@ -30,7 +30,7 @@ class gru_resetOutput {
 public:
  HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate,
                             T &prev_out, T &value_reset_output,
-                             activation_mode_t act_gate) {
+                             ActivationType act_gate) {
    value_update_gate = activation(value_update_gate, act_gate);
    value_reset_gate = activation(value_reset_gate, act_gate);
    value_reset_output = prev_out * value_reset_gate;
@@ -43,7 +43,7 @@ class gru_resetOutput {
  HOSTDEVICE void operator()(__m256 &value_update_gate,
                             __m256 &value_reset_gate, __m256 &prev_out,
                             __m256 &value_reset_output,
-                             activation_mode_t act_gate) {
+                             ActivationType act_gate) {
    value_update_gate = activation(value_update_gate, act_gate);
    value_reset_gate = activation(value_reset_gate, act_gate);
    value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate);
@@ -57,7 +57,7 @@ class gru_finalOutput {
 public:
  HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state,
                             T &prev_out, T &value_output,
-                             activation_mode_t act_input) {
+                             ActivationType act_input) {
    value_frame_state = activation(value_frame_state, act_input);
    value_output = prev_out - (value_update_gate * prev_out) +
                   (value_update_gate * value_frame_state);
@@ -69,8 +69,7 @@ class gru_finalOutput {
  static const bool avx = true;
  HOSTDEVICE void operator()(__m256 &value_update_gate,
                             __m256 &value_frame_state, __m256 &prev_out,
-                             __m256 &value_output,
-                             activation_mode_t act_input) {
+                             __m256 &value_output, ActivationType act_input) {
    value_frame_state = activation(value_frame_state, act_input);
    value_output = _mm256_add_ps(
        _mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)),
@@ -89,7 +88,7 @@ class gru_stateGrad {
  HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
                             T &value_frame_state, T &grad_frame_state,
                             T &value_prev_out, T &grad_prev_out,
-                             T &grad_output, activation_mode_t act_input) {
+                             T &grad_output, ActivationType act_input) {
    grad_update_gate = (grad_output * value_frame_state);
    grad_update_gate -= (grad_output * value_prev_out);
    grad_prev_out -= (grad_output * value_update_gate);
@@ -107,7 +106,7 @@ class gru_stateGrad {
                             __m256 &value_frame_state,
                             __m256 &grad_frame_state, __m256 &value_prev_out,
                             __m256 &grad_prev_out, __m256 &grad_output,
-                             activation_mode_t act_input) {
+                             ActivationType act_input) {
    grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state);
    grad_update_gate = _mm256_sub_ps(
        grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out));
@@ -128,7 +127,7 @@ class gru_resetGrad {
  HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
                             T &value_reset_gate, T &grad_reset_gate,
                             T &value_prev_out, T &grad_prev_out,
-                             T &grad_reset_output, activation_mode_t act_gate) {
+                             T &grad_reset_output, ActivationType act_gate) {
    grad_reset_gate = (grad_reset_output * value_prev_out);
    grad_prev_out += (grad_reset_output * value_reset_gate);
    grad_update_gate =
@@ -144,7 +143,7 @@ class gru_resetGrad {
                             __m256 &grad_update_gate, __m256 &value_reset_gate,
                             __m256 &grad_reset_gate, __m256 &value_prev_out,
                             __m256 &grad_prev_out, __m256 &grad_reset_output,
-                             activation_mode_t act_gate) {
+                             ActivationType act_gate) {
    grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out);
    grad_prev_out = _mm256_add_ps(
        grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate));

--- a/paddle/operators/math/detection_util.h
+++ b/paddle/operators/math/detection_util.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <utility>
+#include <vector>
+#include "paddle/framework/selected_rows.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+// Axis-aligned bounding box stored as its two corner points
+// (x_min, y_min) and (x_max, y_max), plus a "difficult object" flag.
+//
+// T is the coordinate type.
+template <typename T>
+struct BBox {
+  // Builds a box from explicit corner coordinates; is_difficult starts false.
+  BBox(T x_min, T y_min, T x_max, T y_max)
+      : x_min(x_min),
+        y_min(y_min),
+        x_max(x_max),
+        y_max(y_max),
+        is_difficult(false) {}
+
+  // Builds an empty box. Every member is value-initialized so that copying
+  // or storing a default-constructed box never reads an indeterminate value.
+  BBox() : x_min(), y_min(), x_max(), y_max(), is_difficult(false) {}
+
+  // Horizontal extent of the box (x_max - x_min).
+  T get_width() const { return x_max - x_min; }
+
+  // Vertical extent of the box (y_max - y_min).
+  T get_height() const { return y_max - y_min; }
+
+  // X coordinate of the box center.
+  T get_center_x() const { return (x_min + x_max) / 2; }
+
+  // Y coordinate of the box center.
+  T get_center_y() const { return (y_min + y_max) / 2; }
+
+  // Area of the box (width * height).
+  T get_area() const { return get_width() * get_height(); }
+
+  // coordinate of bounding box
+  T x_min;
+  T y_min;
+  T x_max;
+  T y_max;
+  // whether difficult object (e.g. object with heavy occlusion is difficult)
+  bool is_difficult;
+};
+// KNCHW ==> NHWC
+// template <typename T>
+// Forward declarations of the detection helpers defined later in this header.
+
+// Appends num_bboxes boxes read from prior_data to bbox_vec. Each prior
+// occupies 8 consecutive values; the first four are the box corners.
+template <typename T>
+void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes,
+                          std::vector<BBox<T>>& bbox_vec);
+// Appends num four-element groups to var_vec, taken from values 4..7 of each
+// 8-value prior record in prior_data.
+template <typename T>
+void GetBBoxVarFromPriorData(const T* prior_data, const size_t num,
+                             std::vector<std::vector<T>>& var_vec);
+// Decodes a box from a prior box, its four variance values and four predicted
+// location offsets (center shift plus exponential width/height scaling).
+template <typename T>
+BBox<T> DecodeBBoxWithVar(BBox<T>& prior_bbox,
+                          const std::vector<T>& prior_bbox_var,
+                          const std::vector<T>& loc_pred_data);
+// Comparator: true when pair1.first is greater than pair2.first.
+template <typename T1, typename T2>
+bool SortScorePairDescend(const std::pair<T1, T2>& pair1,
+                          const std::pair<T1, T2>& pair2);
+// NOTE(review): this overload is only declared; no definition for it appears
+// in this header.
+template <typename T>
+bool SortScorePairDescend(const std::pair<T, BBox<T>>& pair1,
+                          const std::pair<T, BBox<T>>& pair2);
+// Ratio of the intersection area to the union area of two boxes; 0 when the
+// boxes do not overlap.
+template <typename T>
+T jaccard_overlap(const BBox<T>& bbox1, const BBox<T>& bbox2);
+
+// Non-maximum suppression for one class: appends to indices the prior indices
+// that score above conf_threshold and overlap every already kept box by at
+// most nms_threshold, visiting candidates in descending score order.
+template <typename T>
+void ApplyNmsFast(const std::vector<BBox<T>>& bboxes, const T* conf_score_data,
+                  size_t class_idx, size_t top_k, T conf_threshold,
+                  T nms_threshold, size_t num_priors, size_t num_classes,
+                  std::vector<size_t>* indices);
+// Runs ApplyNmsFast for every non-background class of every image, keeps at
+// most top_k detections per image (when top_k > 0), appends one
+// label -> prior-indices map per image and returns the total number kept.
+template <typename T>
+int GetDetectionIndices(
+    const T* conf_data, const size_t num_priors, const size_t num_classes,
+    const size_t background_label_id, const size_t batch_size,
+    const T conf_threshold, const size_t nms_top_k, const T nms_threshold,
+    const size_t top_k,
+    const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes,
+    std::vector<std::map<size_t, std::vector<size_t>>>* all_detection_indices);
+// Returns a box whose four coordinates are clamped to the range [0, 1].
+template <typename T>
+BBox<T> ClipBBox(const BBox<T>& bbox);
+// Writes 7 values per kept detection into out_data: image index, label,
+// confidence and the four clipped box coordinates.
+template <typename T>
+void GetDetectionOutput(
+    const T* conf_data, const size_t num_kept, const size_t num_priors,
+    const size_t num_classes, const size_t batch_size,
+    const std::vector<std::map<size_t, std::vector<size_t>>>& all_indices,
+    const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes, T* out_data);
+// Appends num_bboxes boxes parsed from prior_data to bbox_vec.
+//
+// prior_data is read as consecutive records of 8 values; the first four
+// values of each record are taken as x_min, y_min, x_max and y_max. Boxes
+// already present in bbox_vec are left untouched.
+template <typename T>
+void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes,
+                          std::vector<BBox<T>>& bbox_vec) {
+  const size_t kPriorStride = 8;  // number of values per prior record
+  const size_t out_offset = bbox_vec.size();
+  bbox_vec.resize(out_offset + num_bboxes);
+  for (size_t i = 0; i < num_bboxes; ++i) {
+    const T* record = prior_data + i * kPriorStride;
+    // The four-argument constructor also sets is_difficult to false, so the
+    // stored box has no uninitialized member.
+    bbox_vec[out_offset + i] =
+        BBox<T>(record[0], record[1], record[2], record[3]);
+  }
+}
+// Appends num variance groups read from prior_data to var_vec.
+//
+// Each prior record is 8 values wide; values 4..7 of a record form one
+// four-element group. Groups already present in var_vec are left untouched.
+template <typename T>
+void GetBBoxVarFromPriorData(const T* prior_data, const size_t num,
+                             std::vector<std::vector<T>>& var_vec) {
+  const size_t first_new = var_vec.size();
+  var_vec.resize(first_new + num);
+  for (size_t k = 0; k < num; ++k) {
+    const T* record = prior_data + k * 8;
+    var_vec[first_new + k] = std::vector<T>(record + 4, record + 8);
+  }
+}
+// Decodes a predicted box from a prior box, its variances and the predicted
+// location offsets.
+//
+// The prior's center is shifted by var[0] * loc[0] * width and
+// var[1] * loc[1] * height; its width and height are scaled by
+// exp(var[2] * loc[2]) and exp(var[3] * loc[3]). prior_bbox_var and
+// loc_pred_data are indexed 0..3 without bounds checks, so both must hold at
+// least four values.
+//
+// Returns the decoded box in corner form with is_difficult set to false.
+template <typename T>
+BBox<T> DecodeBBoxWithVar(BBox<T>& prior_bbox,
+                          const std::vector<T>& prior_bbox_var,
+                          const std::vector<T>& loc_pred_data) {
+  const T prior_bbox_width = prior_bbox.get_width();
+  const T prior_bbox_height = prior_bbox.get_height();
+
+  const T decoded_bbox_center_x =
+      prior_bbox_var[0] * loc_pred_data[0] * prior_bbox_width +
+      prior_bbox.get_center_x();
+  const T decoded_bbox_center_y =
+      prior_bbox_var[1] * loc_pred_data[1] * prior_bbox_height +
+      prior_bbox.get_center_y();
+  const T decoded_bbox_width =
+      std::exp(prior_bbox_var[2] * loc_pred_data[2]) * prior_bbox_width;
+  const T decoded_bbox_height =
+      std::exp(prior_bbox_var[3] * loc_pred_data[3]) * prior_bbox_height;
+
+  // Construct through the four-argument constructor so that is_difficult is
+  // initialized instead of being returned with an indeterminate value.
+  return BBox<T>(decoded_bbox_center_x - decoded_bbox_width / 2,
+                 decoded_bbox_center_y - decoded_bbox_height / 2,
+                 decoded_bbox_center_x + decoded_bbox_width / 2,
+                 decoded_bbox_center_y + decoded_bbox_height / 2);
+}
+// Ordering predicate for (score, payload) pairs: pair1 sorts before pair2
+// when its score (the first member) is strictly greater.
+template <typename T1, typename T2>
+bool SortScorePairDescend(const std::pair<T1, T2>& pair1,
+                          const std::pair<T1, T2>& pair2) {
+  const T1& lhs_score = pair1.first;
+  const T1& rhs_score = pair2.first;
+  return lhs_score > rhs_score;
+}
+// Returns the Jaccard overlap (intersection area divided by union area) of
+// two boxes.
+//
+// Returns 0 when the boxes are disjoint, and also when the union area is not
+// positive (for example two touching zero-area boxes), where the ratio would
+// otherwise be 0 / 0.
+template <typename T>
+T jaccard_overlap(const BBox<T>& bbox1, const BBox<T>& bbox2) {
+  const T zero = static_cast<T>(0);
+  if (bbox2.x_min > bbox1.x_max || bbox2.x_max < bbox1.x_min ||
+      bbox2.y_min > bbox1.y_max || bbox2.y_max < bbox1.y_min) {
+    return zero;
+  }
+
+  const T inter_x_min = std::max(bbox1.x_min, bbox2.x_min);
+  const T inter_y_min = std::max(bbox1.y_min, bbox2.y_min);
+  const T inter_x_max = std::min(bbox1.x_max, bbox2.x_max);
+  const T inter_y_max = std::min(bbox1.y_max, bbox2.y_max);
+
+  const T inter_area =
+      (inter_x_max - inter_x_min) * (inter_y_max - inter_y_min);
+  const T union_area = bbox1.get_area() + bbox2.get_area() - inter_area;
+
+  // Guard the division: a NaN result would make every later
+  // "overlap <= threshold" comparison false.
+  if (union_area <= zero) {
+    return zero;
+  }
+  return inter_area / union_area;
+}
+
+// Greedy non-maximum suppression for a single class.
+//
+// conf_score_data holds num_priors rows of num_classes scores; the score of
+// prior i for this class is at i * num_classes + class_idx. Priors scoring
+// above conf_threshold are visited in descending score order (ties keep their
+// prior order), limited to the best top_k when top_k > 0. A prior is appended
+// to *indices when its overlap with every index already in *indices is at most
+// nms_threshold. Indices present in *indices on entry take part in that check.
+template <typename T>
+void ApplyNmsFast(const std::vector<BBox<T>>& bboxes, const T* conf_score_data,
+                  size_t class_idx, size_t top_k, T conf_threshold,
+                  T nms_threshold, size_t num_priors, size_t num_classes,
+                  std::vector<size_t>* indices) {
+  std::vector<std::pair<T, size_t>> scores;
+  for (size_t i = 0; i < num_priors; ++i) {
+    const size_t conf_offset = i * num_classes + class_idx;
+    if (conf_score_data[conf_offset] > conf_threshold) {
+      scores.push_back(std::make_pair(conf_score_data[conf_offset], i));
+    }
+  }
+  std::stable_sort(scores.begin(), scores.end(),
+                   SortScorePairDescend<T, size_t>);
+  if (top_k > 0 && top_k < scores.size()) scores.resize(top_k);
+
+  // Walk the sorted candidates by position instead of repeatedly erasing the
+  // front element, which would shift the whole vector on every iteration.
+  for (size_t s = 0; s < scores.size(); ++s) {
+    const size_t idx = scores[s].second;
+    bool keep = true;
+    for (size_t i = 0; keep && i < indices->size(); ++i) {
+      const size_t saved_idx = (*indices)[i];
+      const T overlap = jaccard_overlap<T>(bboxes[idx], bboxes[saved_idx]);
+      keep = overlap <= nms_threshold;
+    }
+    if (keep) indices->push_back(idx);
+  }
+}
+// Selects the detections to keep for every image of a batch.
+//
+// conf_data holds batch_size * num_priors * num_classes scores. For each image
+// ApplyNmsFast is run for every class except background_label_id. When
+// top_k > 0 and more than top_k detections survive, only the top_k highest
+// scoring ones are kept; equal scores keep their (label, prior) order so the
+// selection is deterministic. One label -> prior-indices map per image is
+// appended to *all_detection_indices.
+//
+// Returns the total number of kept detections over the whole batch.
+template <typename T>
+int GetDetectionIndices(
+    const T* conf_data, const size_t num_priors, const size_t num_classes,
+    const size_t background_label_id, const size_t batch_size,
+    const T conf_threshold, const size_t nms_top_k, const T nms_threshold,
+    const size_t top_k,
+    const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes,
+    std::vector<std::map<size_t, std::vector<size_t>>>* all_detection_indices) {
+  typedef std::map<size_t, std::vector<size_t>> LabelIndexMap;
+  typedef std::pair<T, std::pair<size_t, size_t>> ScoredDetection;
+
+  int total_keep_num = 0;
+  for (size_t n = 0; n < batch_size; ++n) {
+    const std::vector<BBox<T>>& decoded_bboxes = all_decoded_bboxes[n];
+    const T* image_conf = conf_data + n * num_priors * num_classes;
+
+    // Per-class NMS; every non-background class gets a (possibly empty) entry.
+    LabelIndexMap indices;
+    size_t num_detected = 0;
+    for (size_t c = 0; c < num_classes; ++c) {
+      if (c == background_label_id) continue;
+      std::vector<size_t>& class_indices = indices[c];
+      ApplyNmsFast<T>(decoded_bboxes, image_conf, c, nms_top_k, conf_threshold,
+                      nms_threshold, num_priors, num_classes, &class_indices);
+      num_detected += class_indices.size();
+    }
+
+    if (top_k == 0 || num_detected <= top_k) {
+      all_detection_indices->push_back(indices);
+      total_keep_num += static_cast<int>(num_detected);
+      continue;
+    }
+
+    // Too many detections: rank all of them by score across classes.
+    std::vector<ScoredDetection> score_index_pairs;
+    score_index_pairs.reserve(num_detected);
+    for (LabelIndexMap::const_iterator it = indices.begin();
+         it != indices.end(); ++it) {
+      const size_t label = it->first;
+      const std::vector<size_t>& label_indices = it->second;
+      for (size_t i = 0; i < label_indices.size(); ++i) {
+        const size_t idx = label_indices[i];
+        score_index_pairs.push_back(
+            std::make_pair(image_conf[idx * num_classes + label],
+                           std::make_pair(label, idx)));
+      }
+    }
+    // A stable sort makes the survivors of the top_k cut well defined when
+    // several detections share the same score.
+    std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
+                     SortScorePairDescend<T, std::pair<size_t, size_t>>);
+    score_index_pairs.resize(top_k);
+
+    LabelIndexMap new_indices;
+    for (size_t i = 0; i < score_index_pairs.size(); ++i) {
+      const size_t label = score_index_pairs[i].second.first;
+      const size_t idx = score_index_pairs[i].second.second;
+      new_indices[label].push_back(idx);
+    }
+    all_detection_indices->push_back(new_indices);
+    total_keep_num += static_cast<int>(top_k);
+  }
+  return total_keep_num;
+}
+// Returns a copy of bbox with each coordinate clamped to the range [0, 1].
+//
+// The result is built through the four-argument constructor, so its
+// is_difficult flag is false rather than left uninitialized.
+template <typename T>
+BBox<T> ClipBBox(const BBox<T>& bbox) {
+  const T one = static_cast<T>(1.0);
+  const T zero = static_cast<T>(0.0);
+  return BBox<T>(std::max(std::min(bbox.x_min, one), zero),
+                 std::max(std::min(bbox.y_min, one), zero),
+                 std::max(std::min(bbox.x_max, one), zero),
+                 std::max(std::min(bbox.y_max, one), zero));
+}
+// Serializes the kept detections of a batch into out_data.
+//
+// For every image n and every (label, prior index) pair in all_indices[n],
+// seven consecutive values are written: image index, label, confidence of
+// that prior for that label, and the clipped box x_min, y_min, x_max, y_max.
+// Records are written back to back in image order, then label order.
+// num_kept is not read by this function.
+template <typename T>
+void GetDetectionOutput(
+    const T* conf_data, const size_t num_kept, const size_t num_priors,
+    const size_t num_classes, const size_t batch_size,
+    const std::vector<std::map<size_t, std::vector<size_t>>>& all_indices,
+    const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes, T* out_data) {
+  typedef std::map<size_t, std::vector<size_t>> LabelIndexMap;
+
+  size_t written = 0;  // number of 7-value records emitted so far
+  for (size_t n = 0; n < batch_size; ++n) {
+    const LabelIndexMap& image_indices = all_indices[n];
+    for (LabelIndexMap::const_iterator entry = image_indices.begin();
+         entry != image_indices.end(); ++entry) {
+      const size_t label = entry->first;
+      const std::vector<size_t>& kept = entry->second;
+      const std::vector<BBox<T>>& decoded_bboxes = all_decoded_bboxes[n];
+      for (size_t k = 0; k < kept.size(); ++k, ++written) {
+        const size_t idx = kept[k];
+        const size_t conf_offset =
+            n * num_priors * num_classes + idx * num_classes;
+        const BBox<T> clipped_bbox = ClipBBox<T>(decoded_bboxes[idx]);
+
+        T* record = out_data + written * 7;
+        record[0] = n;
+        record[1] = label;
+        record[2] = (conf_data + conf_offset)[label];
+        record[3] = clipped_bbox.x_min;
+        record[4] = clipped_bbox.y_min;
+        record[5] = clipped_bbox.x_max;
+        record[6] = clipped_bbox.y_max;
+      }
+    }
+  }
+}
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/gru_compute.cc
+++ b/paddle/operators/math/gru_compute.cc
@@ -21,9 +21,9 @@ namespace math {
 template <typename T>
 struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
  static void compute(const platform::CPUDeviceContext &context,
-                      hl_gru_value<T> value, int frame_size, int batch_size,
-                      activation_mode_t active_node,
-                      activation_mode_t active_gate) {
+                      GRUMetaValue<T> value, int frame_size, int batch_size,
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate) {
 #ifndef __NVCC__
    if (value.prev_out_value) {
      math::gemm<platform::CPUDeviceContext, T>(
@@ -51,10 +51,10 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
 template <typename T>
 struct GRUUnitGradFunctor<platform::CPUDeviceContext, T> {
  static void compute(const platform::CPUDeviceContext &context,
-                      hl_gru_value<T> value, hl_gru_grad<T> grad,
+                      GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                      int frame_size, int batch_size,
-                      activation_mode_t active_node,
-                      activation_mode_t active_gate) {
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate) {
 #ifndef __NVCC__
    detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
                                grad, frame_size, batch_size, active_node);

--- a/paddle/operators/math/gru_compute.cu
+++ b/paddle/operators/math/gru_compute.cu
@@ -21,9 +21,9 @@ namespace math {
 template <typename T>
 struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
  static void compute(const platform::CUDADeviceContext &context,
-                      hl_gru_value<T> value, int frame_size, int batch_size,
-                      activation_mode_t active_node,
-                      activation_mode_t active_gate) {
+                      GRUMetaValue<T> value, int frame_size, int batch_size,
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate) {
    auto stream = context.stream();
    dim3 threads;
    dim3 grid;
@@ -88,10 +88,10 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
 template <typename T>
 struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
  static void compute(const platform::CUDADeviceContext &context,
-                      hl_gru_value<T> value, hl_gru_grad<T> grad,
+                      GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                      int frame_size, int batch_size,
-                      activation_mode_t active_node,
-                      activation_mode_t active_gate) {
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate) {
    auto stream = context.stream();
    dim3 threads;
    dim3 grid;

--- a/paddle/operators/math/gru_compute.h
+++ b/paddle/operators/math/gru_compute.h
@@ -11,7 +11,7 @@ limitations under the License. */

 #pragma once

-#include "paddle/operators/math/lstm_compute.h"
+#include "paddle/operators/math/detail/activation_functions.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"

@@ -19,9 +19,8 @@ namespace paddle {
 namespace operators {
 namespace math {

-// TODO(guosheng): refine code style in gru_compute
 template <typename T>
-struct hl_gru_value {
+struct GRUMetaValue {
  T *gate_weight;
  T *state_weight;
  T *gate_value;
@@ -31,7 +30,7 @@ struct hl_gru_value {
 };

 template <typename T>
-struct hl_gru_grad {
+struct GRUMetaGrad {
  T *gate_weight_grad;
  T *state_weight_grad;
  T *gate_grad;
@@ -42,18 +41,18 @@ struct hl_gru_grad {

 template <typename DeviceContext, typename T>
 struct GRUUnitFunctor {
-  static void compute(const DeviceContext &context, hl_gru_value<T> value,
+  static void compute(const DeviceContext &context, GRUMetaValue<T> value,
                      int frame_size, int batch_size,
-                      activation_mode_t active_node,
-                      activation_mode_t active_gate);
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate);
 };

 template <typename DeviceContext, typename T>
 struct GRUUnitGradFunctor {
-  static void compute(const DeviceContext &context, hl_gru_value<T> value,
-                      hl_gru_grad<T> grad, int frame_size, int batch_size,
-                      activation_mode_t active_node,
-                      activation_mode_t active_gate);
+  static void compute(const DeviceContext &context, GRUMetaValue<T> value,
+                      GRUMetaGrad<T> grad, int frame_size, int batch_size,
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate);
 };

 }  // namespace math

--- a/paddle/operators/math/lstm_compute.h
+++ b/paddle/operators/math/lstm_compute.h
@@ -22,14 +22,6 @@ namespace paddle {
 namespace operators {
 namespace math {

-typedef enum {
-  HL_ACTIVATION_SIGMOID = 0,
-  HL_ACTIVATION_RELU = 1,
-  HL_ACTIVATION_TANH = 2,
-  HL_ACTIVATION_LINEAR = 3,
-  HL_ACTIVATION_END
-} activation_mode_t;
-
 template <class T>
 struct LstmMetaValue {
  T *gate_value;
@@ -54,20 +46,6 @@ struct LstmMetaGrad {
  T *check_og_grad;
 };

-inline activation_mode_t ActiveType(const std::string &type) {
-  if (type == "sigmoid") {
-    return HL_ACTIVATION_SIGMOID;
-  } else if (type == "relu") {
-    return HL_ACTIVATION_RELU;
-  } else if (type == "tanh") {
-    return HL_ACTIVATION_TANH;
-  } else if (type == "linear" || type == "identity" || type == "") {
-    return HL_ACTIVATION_LINEAR;
-  } else {
-    PADDLE_THROW("Do not support activation type.");
-  }
-}
-
 template <typename DeviceContext, typename T>
 class LstmUnitFunctor {
 public:

--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -247,7 +247,10 @@ template struct SetConstant<platform::CPUDeviceContext, bool>;

 #define DEFINE_CPU_TRANS(RANK)                                          \
  template struct Transpose<platform::CPUDeviceContext, float, RANK>;   \
-  template struct Transpose<platform::CPUDeviceContext, double, RANK>;
+  template struct Transpose<platform::CPUDeviceContext, double, RANK>;  \
+  template struct Transpose<platform::CPUDeviceContext, int, RANK>;     \
+  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
+  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;

 DEFINE_CPU_TRANS(1);
 DEFINE_CPU_TRANS(2);
@@ -302,8 +305,29 @@ void set_constant(const platform::DeviceContext& context,
 #endif
 }

+// CPU specialization of RowwiseAdd: adds `vector` to every row of `input`
+// and stores the result in `output`.
+//
+// Enforces that `vector` has as many elements as one row of `input`
+// (numel / dims[0]) and that `output` has the same dims as `input`.
+template <typename T>
+struct RowwiseAdd<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& vector, framework::Tensor* output) {
+    auto input_dims = input.dims();
+    auto row_width = input.numel() / input_dims[0];
+    PADDLE_ENFORCE_EQ(vector.numel(), row_width);
+    PADDLE_ENFORCE_EQ(output->dims(), input_dims);
+
+    auto src = framework::EigenMatrix<T>::From(input);
+    auto bias = framework::EigenVector<T>::Flatten(vector);
+    auto dst = framework::EigenMatrix<T>::From(*output);
+
+    // Process one row (chip along dimension 0) at a time.
+    const int64_t num_rows = input_dims[0];
+    for (int64_t row = 0; row < num_rows; ++row) {
+      dst.chip(row, 0) = src.chip(row, 0) + bias;
+    }
+  }
+};
+
 template struct RowwiseAdd<platform::CPUDeviceContext, float>;
 template struct RowwiseAdd<platform::CPUDeviceContext, double>;
+
 template struct ColwiseSum<platform::CPUDeviceContext, float>;
 template struct ColwiseSum<platform::CPUDeviceContext, double>;


--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -273,6 +273,35 @@ void set_constant_with_place<platform::CUDAPlace>(
                           TensorSetConstantGPU(context, tensor, value));
 }

+// Computes c[i] = a[i] + b[i % width] for every i in [0, num).
+//
+// a and c are treated as flattened arrays of num elements and b as an array
+// of width elements. Each thread starts at its global index and advances by
+// the total number of launched threads.
+template <typename T>
+__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width,
+                                 int num) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    // Use exact integer arithmetic for the column index. Deriving the row via
+    // a floating-point reciprocal (i * (1.0 / width)) can round down at row
+    // boundaries or for large i and produce a column equal to width.
+    const int w = i % width;
+    c[i] = a[i] + b[w];
+  }
+}
+
+// CUDA specialization of RowwiseAdd: adds `vector` to every row of `input`
+// and stores the result in `output`, using RowwiseAddKernel on the context's
+// stream.
+//
+// Enforces that `vector` has as many elements as one row of `input`
+// (numel / dims[0]) and that `output` has the same dims as `input`.
+template <typename T>
+struct RowwiseAdd<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& vector, framework::Tensor* output) {
+    auto in_dims = input.dims();
+    auto size = input.numel() / in_dims[0];
+    PADDLE_ENFORCE_EQ(vector.numel(), size);
+    PADDLE_ENFORCE_EQ(output->dims(), in_dims);
+    const int threads_per_block = 512;
+    const int num_blocks =
+        (input.numel() + threads_per_block - 1) / threads_per_block;
+    // Pass the validated row width (`size`) so the kernel never indexes
+    // `vector` beyond the length checked above.
+    RowwiseAddKernel<
+        T><<<num_blocks, threads_per_block, 0, context.stream()>>>(
+        input.data<T>(), vector.data<T>(), output->data<T>(),
+        static_cast<int>(size), static_cast<int>(input.numel()));
+  }
+};
+
 template struct RowwiseAdd<platform::CUDADeviceContext, float>;
 template struct RowwiseAdd<platform::CUDADeviceContext, double>;
 template struct ColwiseSum<platform::CUDADeviceContext, float>;

--- a/paddle/operators/math/math_function_impl.h
+++ b/paddle/operators/math/math_function_impl.h
@@ -45,25 +45,6 @@ void Transpose<DeviceContext, T, Rank>::operator()(
  eigen_out.device(*dev) = eigen_in.shuffle(permute);
 }

-template <typename DeviceContext, typename T>
-void RowwiseAdd<DeviceContext, T>::operator()(const DeviceContext& context,
-                                              const framework::Tensor& input,
-                                              const framework::Tensor& vector,
-                                              framework::Tensor* output) {
-  auto in_dims = input.dims();
-  auto size = input.numel() / in_dims[0];
-  PADDLE_ENFORCE_EQ(vector.numel(), size);
-  PADDLE_ENFORCE_EQ(output->dims(), in_dims);
-
-  auto in = framework::EigenMatrix<T>::From(input);
-  auto vec = framework::EigenMatrix<T>::From(vector);
-  auto out = framework::EigenMatrix<T>::From(*output);
-  Eigen::array<int, 2> shape({{1, static_cast<int>(size)}});
-  Eigen::array<int, 2> bcast({{static_cast<int>(in_dims[0]), 1}});
-  out.device(*context.eigen_device()) =
-      in + vec.reshape(shape).broadcast(bcast);
-}
-
 template <typename DeviceContext, typename T>
 void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
                                              const framework::Tensor& input,

--- a/paddle/operators/math/selected_rows_functor.cc
+++ b/paddle/operators/math/selected_rows_functor.cc
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/operators/math/selected_rows_functor.h"
+#include <set>
+
 #include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/selected_rows_functor.h"

 namespace paddle {
 namespace operators {
@@ -179,6 +181,118 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, double>;
 template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int>;
 template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;

+// This is a separate namespace for manipulating SelectedRows-typed data,
+// e.g. merging duplicated rows or adding two SelectedRows.
+//
+// Another group of functors is called "scatter updates": they use a
+// SelectedRows to update a dense tensor with different ops, such as
+// add or mul.
+namespace scatter {
+
+// Returns the position of the first element of rows equal to value, or
+// rows.size() when no element matches.
+size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
+  for (size_t pos = 0; pos < rows.size(); ++pos) {
+    if (rows[pos] == value) return pos;
+  }
+  return rows.size();
+}
+
+// CPU specialization of MergeAdd: returns a SelectedRows in which rows of
+// `input` that share the same row id have been summed together.
+//
+// The output row ids are the distinct ids of the input in ascending order,
+// the height is copied from the input, and the value tensor has one row per
+// distinct id with the input's width (dims()[1]).
+template <typename T>
+struct MergeAdd<platform::CPUDeviceContext, T> {
+  framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
+                                     const framework::SelectedRows& input) {
+    framework::SelectedRows merged;
+    auto input_rows = input.rows();
+
+    // Distinct row ids, sorted by going through std::set.
+    std::set<int64_t> unique_ids(input_rows.begin(), input_rows.end());
+    std::vector<int64_t> merge_rows(unique_ids.begin(), unique_ids.end());
+
+    auto input_width = input.value().dims()[1];
+    merged.set_rows(merge_rows);
+    merged.set_height(input.height());
+    merged.mutable_value()->mutable_data<T>(
+        framework::make_ddim(
+            {static_cast<int64_t>(merge_rows.size()), input_width}),
+        context.GetPlace());
+
+    // Start from zeros so rows can be accumulated with +=.
+    math::SetConstant<platform::CPUDeviceContext, T> fill;
+    fill(context, merged.mutable_value(), 0.0);
+
+    auto* out_data = merged.mutable_value()->data<T>();
+    auto* input_data = input.value().data<T>();
+
+    for (size_t src_row = 0; src_row < input_rows.size(); src_row++) {
+      const size_t dst_row = FindPos(merge_rows, input_rows[src_row]);
+      auto* dst = out_data + dst_row * input_width;
+      auto* src = input_data + src_row * input_width;
+      for (int64_t col = 0; col < input_width; col++) {
+        dst[col] += src[col];
+      }
+    }
+    return merged;
+  }
+};
+
+template struct MergeAdd<platform::CPUDeviceContext, float>;
+template struct MergeAdd<platform::CPUDeviceContext, double>;
+template struct MergeAdd<platform::CPUDeviceContext, int>;
+template struct MergeAdd<platform::CPUDeviceContext, int64_t>;
+
+// CPU specialization of UpdateToTensor: combines the rows of a SelectedRows
+// (input1) with the matching rows of a dense tensor (input2), writing the
+// result into input2 in place.
+//
+// Row i of input1's value is paired with row input1.rows()[i] of input2.
+// `op` selects the element-wise operation. Enforces that input1's height
+// equals input2's first dimension and that both have the same number of
+// elements per row.
+template <typename T>
+struct UpdateToTensor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
+                  const ScatterOps& op, const framework::SelectedRows& input1,
+                  framework::Tensor* input2) {
+    auto in1_height = input1.height();
+    auto in2_dims = input2->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+
+    auto& in1_value = input1.value();
+    auto& in1_rows = input1.rows();
+
+    // Elements per row of input1; divides by the number of rows in input1.
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
+
+    auto* in1_data = in1_value.data<T>();
+    auto* input2_data = input2->data<T>();
+
+    // FIXME(typhoonzero): use macro fix the below messy code.
+    // NOTE(review): INLINE_FOR2 is defined outside this file; presumably it
+    // expands to nested loops over i in [0, rows) and j in [0, row_numel) --
+    // confirm against its definition. The switch has no default branch.
+    switch (op) {
+      case ScatterOps::ASSIGN:
+        // input2 = input1
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] =
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::ADD:
+        // input2 += input1
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] +=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::SUB:
+        // input2 -= input1
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] -=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::SUBBY:
+        // input2 = input1 - input2
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] =
+            in1_data[i * in1_row_numel + j] -
+            input2_data[in1_rows[i] * in1_row_numel + j];
+        break;
+      case ScatterOps::MUL:
+        // input2 *= input1
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] *=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::DIV:
+        // input2 /= input1
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] /=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::DIVBY:
+        // input2 = input1 / input2
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] =
+            in1_data[i * in1_row_numel + j] /
+            input2_data[in1_rows[i] * in1_row_numel + j];
+        break;
+    }
+  }
+};
+
+}  // namespace scatter
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/math/selected_rows_functor.cu
+++ b/paddle/operators/math/selected_rows_functor.cu
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <set>
+
 #include "paddle/operators/math/math_function.h"
 #include "paddle/operators/math/selected_rows_functor.h"
 #include "paddle/platform/cuda_helper.h"
@@ -222,6 +224,157 @@ template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
+
+namespace scatter {
+
+// Adds every row of `input` into the row of `out` that carries the same row
+// id, so rows with duplicated ids end up summed.
+//
+// Indexing as used below: blockIdx.y selects the input row, threadIdx.x the
+// first column a thread handles, and columns are strided by `block_size`.
+//   input          source values, `row_numel` elements per row
+//   input_rows     row id of each source row
+//   out            destination values, `row_numel` elements per row
+//   out_rows       row id of each destination row
+//   out_rows_size  number of entries in `out_rows`
+//   row_numel      number of elements in one row
+template <typename T, int block_size>
+__global__ void MergeAddKernel(const T* input, const int64_t* input_rows,
+                               T* out, const int64_t* out_rows,
+                               size_t out_rows_size, int64_t row_numel) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+  // Destination row index, shared by all threads of the block.
+  __shared__ size_t out_idx;
+
+  // Only thread 0 performs the linear search for this block's row id. The
+  // loop does not stop at the first hit, so the last matching entry wins.
+  // NOTE(review): out_idx is never written if no entry matches; this relies
+  // on every id in input_rows also being present in out_rows.
+  if (tid == 0) {
+    for (size_t i = 0; i < out_rows_size; i++) {
+      if (input_rows[ty] == out_rows[i]) {
+        out_idx = i;
+      }
+    }
+  }
+
+  // Wait until thread 0 has stored out_idx before any thread reads it.
+  __syncthreads();
+
+  input += ty * row_numel;
+  out += out_idx * row_numel;
+  // Atomic add: blocks whose input rows share a row id accumulate into the
+  // same output row.
+  for (int index = tid; index < row_numel; index += block_size) {
+    paddle::platform::CudaAtomicAdd(out + index, input[index]);
+  }
+}
+
+// CUDA specialization of MergeAdd: returns a SelectedRows in which every row
+// id of `input` appears exactly once and holds the sum of all input rows that
+// carried that id.
+template <typename T>
+struct MergeAdd<platform::CUDADeviceContext, T> {
+  // context: device context supplying the allocation place and the stream.
+  // input:   SelectedRows whose row ids may contain duplicates.
+  // Returns a new SelectedRows with the same height as `input`.
+  framework::SelectedRows operator()(const platform::CUDADeviceContext& context,
+                                     const framework::SelectedRows& input) {
+    constexpr int kBlockSize = 256;
+
+    const auto& src_rows = input.rows();
+    // Passing the ids through std::set removes duplicates and orders them.
+    std::set<int64_t> unique_rows(src_rows.begin(), src_rows.end());
+    std::vector<int64_t> merged_rows(unique_rows.begin(), unique_rows.end());
+    const auto width = input.value().dims()[1];
+
+    framework::SelectedRows out;
+    out.set_rows(merged_rows);
+    out.set_height(input.height());
+    framework::Tensor* out_value = out.mutable_value();
+    out_value->mutable_data<T>(
+        framework::make_ddim(
+            {static_cast<int64_t>(merged_rows.size()), width}),
+        context.GetPlace());
+
+    // The kernel accumulates into the output, so it has to start from zero.
+    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
+    set_zero(context, out_value, 0.0);
+
+    const T* src = input.value().data<T>();
+    T* dst = out_value->data<T>();
+
+    // One block per input row (grid y), kBlockSize threads per block.
+    dim3 block(kBlockSize, 1);
+    dim3 grid(1, src_rows.size());
+    MergeAddKernel<T, kBlockSize><<<grid, block, 0, context.stream()>>>(
+        src, input.rows().data(), dst, out.rows().data(), out.rows().size(),
+        width);
+    return out;
+  }
+};
+
+// Explicit instantiations of the CUDA MergeAdd functor for float, double,
+// int and int64_t elements.
+template struct MergeAdd<platform::CUDADeviceContext, float>;
+template struct MergeAdd<platform::CUDADeviceContext, double>;
+template struct MergeAdd<platform::CUDADeviceContext, int>;
+template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
+
+// Applies `op` between each selected row and the tensor row it addresses,
+// writing the result back into the tensor.
+//
+// Indexing as used below: blockIdx.y selects the selected row, threadIdx.x
+// the first column a thread handles, and columns are strided by `block_size`.
+//   selected_rows  source values, `row_numel` elements per row
+//   rows           for each source row, the index of the tensor row to update
+//   op             which update to perform (see the switch below)
+//   tensor_out     tensor data updated in place, `row_numel` elements per row
+//   row_numel      number of elements in one row
+//
+// `op` is taken by value: a reference parameter of a __global__ function
+// would hold the address of the caller's host object, which device code
+// must not dereference.
+//
+// The updates are not atomic. The caller in this file (UpdateToTensor) runs
+// MergeAdd first, so each tensor row is addressed by one block only.
+template <typename T, int block_size>
+__global__ void UpdateToTensorKernel(const T* selected_rows,
+                                     const int64_t* rows, const ScatterOps op,
+                                     T* tensor_out, int64_t row_numel) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+
+  selected_rows += ty * row_numel;
+  tensor_out += rows[ty] * row_numel;
+  // FIXME(typhoonzero): use macro fix the below messy code.
+  switch (op) {
+    case ScatterOps::ASSIGN:  // tensor = row
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] = selected_rows[index];
+      }
+      break;
+    case ScatterOps::ADD:  // tensor = tensor + row
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] += selected_rows[index];
+      }
+      break;
+    case ScatterOps::SUB:  // tensor = tensor - row
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] -= selected_rows[index];
+      }
+      break;
+    case ScatterOps::SUBBY:  // tensor = row - tensor
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] = selected_rows[index] - tensor_out[index];
+      }
+      break;
+    case ScatterOps::MUL:  // tensor = tensor * row
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] *= selected_rows[index];
+      }
+      break;
+    case ScatterOps::DIV:  // tensor = tensor / row
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] /= selected_rows[index];
+      }
+      break;
+    case ScatterOps::DIVBY:  // tensor = row / tensor
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] = selected_rows[index] / tensor_out[index];
+      }
+      break;
+  }
+}
+
+// CUDA specialization of UpdateToTensor: merges duplicated rows of `input1`
+// and then applies `op` between each merged row and the row of `input2` it
+// addresses, updating `input2` in place.
+template <typename T>
+struct UpdateToTensor<platform::CUDADeviceContext, T> {
+  // context: device context whose stream the kernel is launched on.
+  // op:      update to perform, forwarded to UpdateToTensorKernel.
+  // input1:  SelectedRows providing the row ids and row values.
+  // input2:  tensor updated in place; its first dimension must equal the
+  //          height of input1 and its row size must match input1's.
+  void operator()(const platform::CUDADeviceContext& context,
+                  const ScatterOps& op, const framework::SelectedRows& input1,
+                  framework::Tensor* input2) {
+    // NOTE: Use SelectedRowsAddToTensor for better performance
+    //       no additional MergeAdd called.
+    // Merge first so that every row id occurs only once in the kernel launch.
+    MergeAdd<platform::CUDADeviceContext, T> merge_func;
+    auto merged_in1 = merge_func(context, input1);
+
+    auto in1_height = merged_in1.height();
+    auto in2_dims = input2->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+
+    auto& in1_value = merged_in1.value();
+    auto& in1_rows = merged_in1.rows();
+
+    // Elements per row; NOTE(review): divides by in1_rows.size() without
+    // checking for an empty row list.
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
+
+    auto* in1_data = in1_value.template data<T>();
+    auto* in2_data = input2->data<T>();
+
+    // One block per merged row (grid y).
+    dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1);
+    dim3 grid(1, in1_rows.size());
+    UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS><<<
+        grid, threads, 0, context.stream()>>>(in1_data, in1_rows.data(), op,
+                                              in2_data, in1_row_numel);
+  }
+};
+}  // namespace scatter
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/math/selected_rows_functor.h
+++ b/paddle/operators/math/selected_rows_functor.h
@@ -12,9 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include "paddle/framework/eigen.h"
 #include "paddle/framework/selected_rows.h"
 #include "paddle/platform/device_context.h"

+// Expands to the headers of two nested loops; the statement that follows the
+// macro becomes the inner loop body and can use the loop variables `i`
+// (0 .. sizei-1) and `j` (0 .. sizej-1). Both bound expressions are
+// re-evaluated on every iteration. The arguments are parenthesised so that
+// expressions containing low-precedence operators expand as intended.
+#define INLINE_FOR2(sizei, sizej)       \
+  for (int64_t i = 0; i < (sizei); i++) \
+    for (int64_t j = 0; j < (sizej); j++)
+
 namespace paddle {
 namespace operators {
 namespace math {
@@ -52,6 +57,78 @@ struct SelectedRowsAddToTensor {
                  framework::Tensor* input2);
 };

+namespace scatter {
+// functors for manipulating SelectedRows data
+// Primary template; only operator() is declared here, its definition is
+// supplied per device context (selected_rows_functor.cu specializes it for
+// platform::CUDADeviceContext).
+template <typename DeviceContext, typename T>
+struct MergeAdd {
+  // Unary functor: merges the input SelectedRows by adding together the rows
+  // that share a row id, and returns the merged object.
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input);
+};
+
+// Element-wise sum of the value tensors of two SelectedRows.
+template <typename DeviceContext, typename T>
+struct Add {
+  // The result takes its row ids, height and value dims from input1; the
+  // values of input1 and input2 are flattened and added element by element.
+  // No check is made here that input2 has the same rows or shape as input1.
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input1,
+                                     const framework::SelectedRows& input2) {
+    framework::SelectedRows result;
+    result.set_rows(input1.rows());
+    result.set_height(input1.height());
+    framework::Tensor* result_value = result.mutable_value();
+    result_value->mutable_data<T>(input1.value().dims(), context.GetPlace());
+
+    auto lhs = framework::EigenVector<T>::Flatten(input1.value());
+    auto rhs = framework::EigenVector<T>::Flatten(input2.value());
+    auto sum = framework::EigenVector<T>::Flatten(*result_value);
+    sum.device(*context.eigen_device()) = lhs + rhs;
+    return result;
+  }
+};
+
+// Element-wise product of a SelectedRows with another SelectedRows or with a
+// scalar. In both overloads the result takes its row ids, height and value
+// dims from input1.
+template <typename DeviceContext, typename T>
+struct Mul {
+  // SelectedRows * SelectedRows: the two value tensors are flattened and
+  // multiplied element by element. No check is made here that input2 has the
+  // same rows or shape as input1.
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input1,
+                                     const framework::SelectedRows& input2) {
+    framework::SelectedRows result;
+    result.set_rows(input1.rows());
+    result.set_height(input1.height());
+    framework::Tensor* result_value = result.mutable_value();
+    result_value->mutable_data<T>(input1.value().dims(), context.GetPlace());
+
+    auto lhs = framework::EigenVector<T>::Flatten(input1.value());
+    auto rhs = framework::EigenVector<T>::Flatten(input2.value());
+    auto product = framework::EigenVector<T>::Flatten(*result_value);
+    product.device(*context.eigen_device()) = lhs * rhs;
+    return result;
+  }
+  // scalar * SelectedRows: every value of input1 is multiplied by input2.
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input1,
+                                     const T input2) {
+    framework::SelectedRows result;
+    result.set_rows(input1.rows());
+    result.set_height(input1.height());
+    framework::Tensor* result_value = result.mutable_value();
+    result_value->mutable_data<T>(input1.value().dims(), context.GetPlace());
+
+    auto values = framework::EigenVector<T>::Flatten(input1.value());
+    auto scaled = framework::EigenVector<T>::Flatten(*result_value);
+    scaled.device(*context.eigen_device()) = input2 * values;
+    return result;
+  }
+};
+
+// Update applied by UpdateToTensor between a selected row `r` and the tensor
+// row `t` it addresses (as implemented by the kernels of this functor):
+//   ASSIGN: t = r      ADD:   t = t + r
+//   SUB:    t = t - r  SUBBY: t = r - t
+//   MUL:    t = t * r
+//   DIV:    t = t / r  DIVBY: t = r / t
+enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY };
+
+// Applies `op` between the rows of the SelectedRows `input1` and the rows of
+// the tensor `input2` they address; `input2` is updated in place.
+// Declaration only; operator() is defined per device context.
+template <typename DeviceContext, typename T>
+struct UpdateToTensor {
+  void operator()(const DeviceContext& context, const ScatterOps& op,
+                  const framework::SelectedRows& input1,
+                  framework::Tensor* input2);
+};
+
+}  // namespace scatter
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/merge_lod_tensor_op.cc
+++ b/paddle/operators/merge_lod_tensor_op.cc
@@ -30,8 +30,8 @@ class MergeLoDTensorOp : public framework::OperatorBase {
  void Run(const framework::Scope &scope,
           const platform::Place &dev_place) const override {
    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(dev_place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);

    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
    auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();

--- a/paddle/operators/nccl_op_test.cu.cc
+++ b/paddle/operators/nccl_op_test.cu.cc
@@ -305,7 +305,7 @@ int main(int argc, char **argv) {
  }

  VLOG(0) << " DeviceCount " << count;
-  paddle::platform::DeviceContextPool::Create(places);
+  paddle::platform::DeviceContextPool::Init(places);

  testing::InitGoogleTest(&argc, argv);


--- a/paddle/operators/norm_op.cc
+++ b/paddle/operators/norm_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/norm_op.h"
+namespace paddle {
+namespace operators {
+
+// Declares the proto of the `norm` operator: inputs X and Scale, the
+// `epsilon` attribute (of type AttrType, default 1e-10) and output Out,
+// together with the operator's documentation text.
+template <typename AttrType>
+class NormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  NormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of norm operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddInput("Scale",
+             "(Tensor) The input tensor of norm operator. "
+             "The format of input tensor is C * 1.");
+    AddAttr<AttrType>("epsilon",
+                      "(float, default 1e-10) Constant "
+                      "for numerical stability.")
+        .SetDefault(1.0e-10f);
+    // Adjacent literals are concatenated verbatim, so each piece carries its
+    // own trailing space.
+    AddOutput("Out",
+              "(Tensor) The output tensor of norm operator. "
+              "N * M. "
+              "M = C * H * W");
+    AddComment(R"DOC(
+        Input shape: $(N, C, H, W)$
+        Scale shape: $(C, 1)$
+        Output shape: $(N, C, H, W)$
+        Where
+        forward
+          $$
+            [\frac {x_{1}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{2}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{3}}{\sqrt{\sum{x_{i}^{2}}}} \cdot  \cdot  \cdot \frac {x_{n}}{\sqrt{\sum{x_{i}^{2}}}}]
+          $$
+        backward
+          $$
+            \frac{\frac{\mathrm{d}L }{\mathrm{d}y_{1}} - \frac {x_{1}\sum {\frac{\mathrm{d} L}{\mathrm{d} y_{j}}}x_{j}}{\sum x_{j}^{2}} }{\sqrt{\sum{x_{j}^{2}}}}
+          $$
+        )DOC");
+  }
+};
+
+// Forward `norm` operator: validates that X, Scale and Out are present and
+// gives Out the same dimensions as X.
+class NormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    // The message pieces are concatenated verbatim, hence the explicit
+    // space before "should".
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of NormOp "
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Scale"),
+                   "Input(Scale) of NormOp "
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of NormOp should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim("Out", in_x_dims);
+  }
+};
+
+// Gradient operator of `norm`: validates that X and the output X@GRAD are
+// present and gives X@GRAD the same dimensions as X.
+class NormOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    // X@GRAD is an output of this operator, so the message names it as one.
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Registers the `norm` operator with its gradient operator `norm_grad`; the
+// proto maker is instantiated with a float `epsilon` attribute.
+REGISTER_OP(norm, ops::NormOp, ops::NormOpMaker<float>, norm_grad,
+            ops::NormOpGrad);
+// CPU kernels for float and double data. The double kernels pass `float` as
+// the third template argument (AttrType), i.e. they read `epsilon` as float.
+REGISTER_OP_CPU_KERNEL(
+    norm, ops::NormKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::NormKernel<paddle::platform::CPUDeviceContext, double, float>);
+REGISTER_OP_CPU_KERNEL(
+    norm_grad, ops::NormGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::NormGradKernel<paddle::platform::CPUDeviceContext, double, float>);
--- a/paddle/operators/norm_op.cu
+++ b/paddle/operators/norm_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/norm_op.h"
+
+namespace ops = paddle::operators;
+// CUDA kernels of `norm` and `norm_grad` for float and double data. The
+// double kernels pass `float` as the third template argument (AttrType),
+// i.e. they read `epsilon` as float.
+REGISTER_OP_CUDA_KERNEL(
+    norm, ops::NormKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::NormKernel<paddle::platform::CUDADeviceContext, double, float>);
+REGISTER_OP_CUDA_KERNEL(
+    norm_grad, ops::NormGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::NormGradKernel<paddle::platform::CUDADeviceContext, double, float>);
--- a/paddle/operators/norm_op.h
+++ b/paddle/operators/norm_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+// Forward kernel of the `norm` operator.
+//
+// For every sample of X the data is viewed as a (channels, height * width)
+// matrix; the squares are summed along the first (channel) axis, and the
+// input is multiplied by 1 / sqrt(sum + epsilon) and then by Scale.
+//   T        element type of the tensors
+//   AttrType type the `epsilon` attribute is read as before being cast to T
+// Assumes X has at least four dimensions (dims()[0..3] are read without a
+// rank check).
+template <typename DeviceContext, typename T, typename AttrType = T>
+class NormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* scale = context.Input<framework::Tensor>("Scale");
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto epsilon = static_cast<T>(context.Attr<AttrType>("epsilon"));
+    out->mutable_data<T>(context.GetPlace());
+    int batch_size = in_x->dims()[0];
+    int channels = in_x->dims()[1];
+    int height = in_x->dims()[2];
+    int width = in_x->dims()[3];
+    // Number of elements in one channel of one sample.
+    int fea_len = height * width;
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
+    // View X as (batch_size, channels * fea_len).
+    auto x =
+        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            *in_x, framework::make_ddim({batch_size, fea_len * channels}));
+    // Temporary tensor holding x^2 for the whole batch.
+    framework::Tensor x_square;
+    x_square.mutable_data<T>(in_x->dims(), context.GetPlace());
+    auto x_square_eigen =
+        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            x_square, framework::make_ddim({batch_size, fea_len * channels}));
+    x_square_eigen.device(*place) = x.square();
+    auto scale_eigen =
+        framework::EigenVector<T, Eigen::RowMajor, Eigen::DenseIndex>::Flatten(
+            *scale);
+    // Process one sample at a time, each viewed as (channels, fea_len).
+    for (int n = 0; n < batch_size; ++n) {
+      framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
+      auto in_x_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              in_x_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
+      auto x_square_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              x_square_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor out_batch = out->Slice(n, n + 1);
+      auto out_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              out_batch, framework::make_ddim({channels, fea_len}));
+      // Scratch buffer of fea_len elements, allocated anew for every sample.
+      framework::Tensor tmp_tensor;
+      tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
+                                 context.GetPlace());
+      auto tmp = framework::EigenVector<T, Eigen::RowMajor,
+                                        Eigen::DenseIndex>::Flatten(tmp_tensor);
+      // Sum the squares along axis 0 (channels), then take the inverse of
+      // the square root of (sum + epsilon).
+      auto dim = Eigen::array<int, 1>({{0}});
+      tmp.device(*place) = x_square_batch_eigen.sum(dim);
+      tmp.device(*place) = (tmp + epsilon).sqrt().inverse();
+      // Broadcast factors {channels, 1} applied to the inverse norm.
+      // NOTE(review): tmp comes from Flatten() while two broadcast factors
+      // are given - confirm the intended rank handling.
+      Eigen::array<int, 2> broadcast_dim_col;
+      broadcast_dim_col[1] = 1;
+      broadcast_dim_col[0] = channels;
+      out_batch_eigen.device(*place) =
+          in_x_batch_eigen * (tmp.broadcast(broadcast_dim_col));
+      // Broadcast factors {1, fea_len} applied to Scale, then multiplied
+      // into the normalized output in place.
+      Eigen::array<int, 2> broadcast_dim_row;
+      broadcast_dim_row[1] = fea_len;
+      broadcast_dim_row[0] = 1;
+      out_batch_eigen.device(*place) =
+          out_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row));
+    }
+  }
+};
+// Backward kernel of the `norm` operator; writes X@GRAD only.
+//
+// For every sample, with x and dout viewed as (channels, height * width)
+// matrices and sums taken along the first (channel) axis, the statements
+// below evaluate
+//   norm = sqrt(sum(x^2) + epsilon)
+//   dx   = (dout - x * sum(x * dout) / norm^2) / norm * Scale
+//   T        element type of the tensors
+//   AttrType type the `epsilon` attribute is read as before being cast to T
+// Assumes X has at least four dimensions (dims()[0..3] are read without a
+// rank check).
+template <typename DeviceContext, typename T, typename AttrType = T>
+class NormGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* scale = context.Input<framework::Tensor>("Scale");
+    const framework::Tensor* out_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto epsilon = static_cast<T>(context.Attr<AttrType>("epsilon"));
+    framework::Tensor* in_x_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    in_x_grad->mutable_data<T>(context.GetPlace());
+    int batch_size = in_x->dims()[0];
+    int channels = in_x->dims()[1];
+    int height = in_x->dims()[2];
+    int width = in_x->dims()[3];
+    // Number of elements in one channel of one sample.
+    int fea_len = height * width;
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
+
+    auto scale_eigen =
+        framework::EigenVector<T, Eigen::RowMajor, Eigen::DenseIndex>::Flatten(
+            *scale);
+    // View X as (batch_size, channels * fea_len).
+    auto x =
+        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            *in_x, framework::make_ddim({batch_size, fea_len * channels}));
+    // Temporary tensor holding x^2 for the whole batch.
+    framework::Tensor x_square;
+    x_square.mutable_data<T>(in_x->dims(), context.GetPlace());
+    auto x_square_eigen =
+        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            x_square, framework::make_ddim({batch_size, fea_len * channels}));
+    x_square_eigen.device(*place) = x.square();
+
+    // Process one sample at a time, each viewed as (channels, fea_len).
+    for (int n = 0; n < batch_size; ++n) {
+      framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
+      auto in_x_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              in_x_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor in_g_batch = in_x_grad->Slice(n, n + 1);
+      auto in_g_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              in_g_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
+      auto x_square_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              x_square_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor outg_batch = out_grad->Slice(n, n + 1);
+      auto outg_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              outg_batch, framework::make_ddim({channels, fea_len}));
+
+      // tmp = sum over axis 0 (channels) of x * dout.
+      framework::Tensor tmp_tensor;
+      tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
+                                 context.GetPlace());
+      auto tmp_eigen =
+          framework::EigenVector<T, Eigen::RowMajor,
+                                 Eigen::DenseIndex>::Flatten(tmp_tensor);
+      auto dim = Eigen::array<int, 1>({{0}});
+      tmp_eigen.device(*place) = (in_x_batch_eigen * outg_batch_eigen).sum(dim);
+      // norm_tmp = sqrt(sum over axis 0 of x^2 + epsilon).
+      framework::Tensor norm_tmp_tensor;
+      norm_tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
+                                      context.GetPlace());
+      auto norm_tmp_eigen =
+          framework::EigenVector<T, Eigen::RowMajor,
+                                 Eigen::DenseIndex>::Flatten(norm_tmp_tensor);
+      norm_tmp_eigen.device(*place) =
+          (x_square_batch_eigen.sum(dim) + epsilon).sqrt();
+      // Broadcast factors {channels, 1} used for the per-position vectors.
+      // NOTE(review): the vectors come from Flatten() while two broadcast
+      // factors are given - confirm the intended rank handling.
+      Eigen::array<int, 2> broadcast_dim_col;
+      broadcast_dim_col[1] = 1;
+      broadcast_dim_col[0] = channels;
+      // The gradient is built up in place; the order of the following
+      // assignments matters.
+      // dx = x * tmp
+      in_g_batch_eigen.device(*place) =
+          in_x_batch_eigen * tmp_eigen.broadcast(broadcast_dim_col);
+      // dx = dx / norm^2
+      in_g_batch_eigen.device(*place) =
+          in_g_batch_eigen /
+          (norm_tmp_eigen * norm_tmp_eigen).broadcast(broadcast_dim_col);
+      // dx = dout - dx
+      in_g_batch_eigen.device(*place) = outg_batch_eigen - in_g_batch_eigen;
+      // outg_batch_eigen + (in_g_batch_eigen * -1);
+      // dx = dx / norm
+      in_g_batch_eigen.device(*place) =
+          in_g_batch_eigen / norm_tmp_eigen.broadcast(broadcast_dim_col);
+      // dx = dx * Scale, with broadcast factors {1, fea_len} on Scale.
+      Eigen::array<int, 2> broadcast_dim_row;
+      broadcast_dim_row[1] = fea_len;
+      broadcast_dim_row[0] = 1;
+      in_g_batch_eigen.device(*place) =
+          in_g_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row));
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -272,8 +272,9 @@ class RecurrentOp : public RecurrentBase {
                   false /*create_local_scope*/);

      // get device context from pool
-      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-      auto &dev_ctx = *pool.Borrow(place);
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);

      // Copy inside::output -> outside::output
      //    outside::output[seq_offset: seq_offset + 1] = inside::output
@@ -326,8 +327,8 @@ class RecurrentGradOp : public RecurrentBase {
    auto *program = block->Program();

    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);

    for (size_t step_id = 0; step_id < seq_len; ++step_id) {
      size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;

--- a/paddle/operators/reorder_lod_tensor_by_rank_op.cc
+++ b/paddle/operators/reorder_lod_tensor_by_rank_op.cc
@@ -131,8 +131,8 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
    auto x_sliced = x.Slice(x_offset, x_offset + len);
    auto out_sliced = out->Slice(out_offset, out_offset + len);

-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
    framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced);
    out_offset += len;
    return out_offset;

--- a/paddle/operators/save_op.cc
+++ b/paddle/operators/save_op.cc
@@ -91,8 +91,8 @@ class SaveOp : public framework::OperatorBase {
    auto &tensor = var->Get<framework::LoDTensor>();

    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);

    framework::SerializeToStream(fout, tensor, dev_ctx);
  }

--- a/paddle/operators/send_op.cc
+++ b/paddle/operators/send_op.cc
@@ -79,7 +79,7 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker {
    AddComment(R"DOC(
 Recv operator

-This operator will recv tensor from send_op
+This operator will send tensor to recv_op.
 )DOC");
    AddAttr<std::vector<std::string>>("endpoints",
                                      "(string vector, default 127.0.0.1:6164)"

--- a/paddle/operators/shrink_rnn_memory_op.cc
+++ b/paddle/operators/shrink_rnn_memory_op.cc
@@ -106,8 +106,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
    dx_tensor.mutable_data(x_tensor.place(), x_tensor.type());

    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);

    if (dout_var == nullptr) {  // dx_tensor fill zero
      math::set_constant(dev_ctx, &dx_tensor, 0.0f);
@@ -116,9 +116,9 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
      auto height = dout_tensor.dims()[0];
      auto slice = dx_tensor.Slice(0, static_cast<int>(height));
      framework::CopyFrom(dout_tensor, dout_tensor.place(), dev_ctx, &slice);
-      if (dx_tensor.dims()[0] < height) {
+      if (dx_tensor.dims()[0] > height) {
        auto rest_tensor = dx_tensor.Slice(
-            static_cast<int>(height), static_cast<int>(dout_tensor.dims()[0]));
+            static_cast<int>(height), static_cast<int>(dx_tensor.dims()[0]));
        math::set_constant(dev_ctx, &rest_tensor, 0.0f);
      }
    }

--- a/paddle/operators/split_lod_tensor_op.cc
+++ b/paddle/operators/split_lod_tensor_op.cc
@@ -45,8 +45,8 @@ class SplitLoDTensorOp : public framework::OperatorBase {
    auto &x_lod = x.lod();
    auto &mask_dim = mask.dims();

-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(dev_place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);

    std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
    if (platform::is_cpu_place(mask.place())) {

--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
@@ -37,11 +37,11 @@ class SumKernel : public framework::OpKernel<T> {
    bool in_place = out_var == in_vars[0];

    if (out_var->IsType<framework::LoDTensor>()) {
-      auto *out = context.Output<Tensor>("Out");
+      auto *out = context.Output<LoDTensor>("Out");
+      if (!in_place) {
        out->mutable_data<T>(context.GetPlace());
-
+      }
      auto result = EigenVector<T>::Flatten(*out);
-
      if (!in_place) {
        math::SetConstant<DeviceContext, T> constant_functor;
        constant_functor(context.template device_context<DeviceContext>(), out,

--- a/paddle/operators/tensor_array_read_write_op.cc
+++ b/paddle/operators/tensor_array_read_write_op.cc
@@ -40,8 +40,9 @@ class WriteToArrayOp : public ArrayOp {
    if (x_tensor.memory_size() > 0) {
      auto *out_tensor = &out->at(offset);

-      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-      auto &dev_ctx = *pool.Borrow(place);
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);

      CopyFrom(x_tensor, place, dev_ctx, out_tensor);
      out_tensor->set_lod(x_tensor.lod());
@@ -129,11 +130,12 @@ class ReadFromArrayOp : public ArrayOp {
    auto &x_array = x->Get<framework::LoDTensorArray>();
    auto *out = scope.FindVar(Output("Out"));
    PADDLE_ENFORCE(out != nullptr, "Out must be set");
-    auto *out_tensor = out->GetMutable<framework::LoDTensor>();
    size_t offset = GetOffset(scope, place);
    if (offset < x_array.size()) {
-      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-      auto &dev_ctx = *pool.Borrow(place);
+      auto *out_tensor = out->GetMutable<framework::LoDTensor>();
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);
      framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor);
      out_tensor->set_lod(x_array[offset].lod());
    } else {

--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
@@ -25,12 +25,12 @@ namespace operators {
 using StepScopeVar = std::vector<framework::Scope *>;
 using LoDTensor = framework::LoDTensor;

-constexpr char kStepBlock[] = "sub_block";
-constexpr char kCondition[] = "Condition";
-constexpr char kStepScopes[] = "StepScopes";
-constexpr char kParameters[] = "X";
-constexpr char kParamGrads[] = "X@GRAD";
-constexpr char kOutputs[] = "Out";
+static constexpr char kStepBlock[] = "sub_block";
+static constexpr char kCondition[] = "Condition";
+static constexpr char kStepScopes[] = "StepScopes";
+static constexpr char kX[] = "X";
+static constexpr char kXGRAD[] = "X@GRAD";
+static constexpr char kOutputs[] = "Out";

 class WhileOp : public framework::OperatorBase {
 public:
@@ -67,7 +67,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  WhileOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput(kParameters,
+    AddInput(kX,
             "A set of variables, which are required by operators inside the "
             "block of While Op.")
        .AsDuplicable();
@@ -158,8 +158,8 @@ class WhileGradOp : public framework::OperatorBase {

      executor.Run(*program, *cur_scope_iter, block->ID(), false);

-      auto &pg_names = Outputs(kParamGrads);
-      auto &p_names = Inputs(kParameters);
+      auto &pg_names = Outputs(kXGRAD);
+      auto &p_names = Inputs(kX);
      PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
      for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
        if (pg_names[param_id] == framework::kEmptyVarName) {
@@ -213,11 +213,11 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
  std::unique_ptr<framework::OpDesc> Apply() const override {
    auto *grad = new framework::OpDesc();
    grad->SetType("while_grad");
-    grad->SetInput(kParameters, Input(kParameters));
+    grad->SetInput(kX, Input(kX));

    // Not all of IGs will be generated by inner gradient operators of while op.
    // Ignore IGs that is not generated by the inside block.
-    auto igs = InputGrad(kParameters, /*do not drop empty gradient*/ false);
+    auto igs = InputGrad(kX, /*do not drop empty gradient*/ false);
    std::unordered_set<std::string> all_outs;
    for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) {
      for (auto &oname : grad_block_[0]->Op(i)->OutputArgumentNames()) {
@@ -231,7 +231,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
      }
    }

-    grad->SetOutput(framework::GradVarName(kParameters), igs);
+    grad->SetOutput(framework::GradVarName(kX), igs);

    grad->SetInput(kOutputs, Output(kOutputs));

@@ -240,7 +240,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
    std::unordered_set<std::string> block_ins;
    auto *fwd_block = this->grad_block_[0]->ParentBlock();
    {
-      for (auto &p : Input(kParameters)) {
+      for (auto &p : Input(kX)) {
        block_ins.insert(p);
      }
      for (auto &o : Output(kOutputs)) {
@@ -288,8 +288,8 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc &op_desc,
                  framework::BlockDesc *block) const override {
-    auto p_names = op_desc.Input(kParameters);
-    auto pg_names = op_desc.Output(framework::GradVarName(kParameters));
+    auto p_names = op_desc.Input(kX);
+    auto pg_names = op_desc.Output(framework::GradVarName(kX));

    for (size_t i = 0; i < p_names.size(); ++i) {
      auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i]));
@@ -307,21 +307,21 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference {
 class WhileGradOpShapeInference : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext *ctx) const override {
-    ctx->HasInputs(kParameters);
-    ctx->HasOutputs(framework::GradVarName(kParameters));
+    ctx->HasInputs(kX);
+    ctx->HasOutputs(framework::GradVarName(kX));
    ctx->HasInputs(kOutputs);
    ctx->HasInputs(framework::GradVarName(kOutputs));

-    auto p_names = ctx->Inputs(kParameters);
-    auto pg_names = ctx->Outputs(kParamGrads);
-    auto var_types = ctx->GetInputsVarType(kParameters);
+    auto p_names = ctx->Inputs(kX);
+    auto pg_names = ctx->Outputs(kXGRAD);
+    auto var_types = ctx->GetInputsVarType(kX);
    std::vector<std::string> names_to_set;
    std::vector<framework::DDim> dims_to_set;
    for (size_t i = 0; i < p_names.size(); ++i) {
      if (pg_names[i] == framework::kEmptyVarName) {
        continue;
      }
-      auto dims = ctx->GetInputsElementDim(kParameters, i);
+      auto dims = ctx->GetInputsElementDim(kX, i);
      if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR) {
        names_to_set.push_back(pg_names[i]);
        dims_to_set.push_back(dims);

--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -17,7 +17,7 @@ namespace platform {

 DeviceContextPool* DeviceContextPool::pool = nullptr;

-const platform::DeviceContext* DeviceContextPool::Borrow(
+const platform::DeviceContext* DeviceContextPool::Get(
    const platform::Place& place) {
  auto it = device_contexts_.find(place);
  if (it == device_contexts_.end()) {
@@ -28,24 +28,6 @@ const platform::DeviceContext* DeviceContextPool::Borrow(
  return it->second;
 }

-std::vector<const platform::DeviceContext*> DeviceContextPool::Borrow(
-    const std::vector<platform::Place>& places) {
-  PADDLE_ENFORCE_GT(places.size(), 0);
-  PADDLE_ENFORCE_LE(places.size(), device_contexts_.size());
-  std::vector<const platform::DeviceContext*> borrowed_contexts;
-  for (auto& place : places) {
-    auto it = device_contexts_.find(place);
-    if (it != device_contexts_.end()) {
-      borrowed_contexts.emplace_back(it->second);
-    } else {
-      PADDLE_THROW(
-          "'Place' is not supported, Please re-compile with WITH_GPU "
-          "option");
-    }
-  }
-  return borrowed_contexts;
-}
-
 DeviceContextPool::DeviceContextPool(
    const std::vector<platform::Place>& places) {
  PADDLE_ENFORCE_GT(places.size(), 0);

--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -52,6 +52,14 @@ class CPUDeviceContext : public DeviceContext {
  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
 };

+// Trait mapping a Place type to the DeviceContext subclass used for it by
+// default. The primary template is only declared, never defined, so using it
+// with a Place type that has no specialization fails at compile time.
+template <typename Place>
+struct DefaultDeviceContextType;
+
+// CPUPlace maps to CPUDeviceContext.
+template <>
+struct DefaultDeviceContextType<platform::CPUPlace> {
+  using TYPE = CPUDeviceContext;
+};
+
 #ifdef PADDLE_WITH_CUDA

 class EigenCudaStreamDevice;
@@ -90,6 +98,11 @@ class CUDADeviceContext : public DeviceContext {
  cublasHandle_t cublas_handle_;
 };

+// CUDAPlace maps to CUDADeviceContext. This specialization sits inside the
+// PADDLE_WITH_CUDA section, so it only exists in CUDA builds.
+template <>
+struct DefaultDeviceContextType<platform::CUDAPlace> {
+  using TYPE = CUDADeviceContext;
+};
+
 class CUDNNDeviceContext : public CUDADeviceContext {
 public:
  explicit CUDNNDeviceContext(CUDAPlace place);
@@ -102,18 +115,6 @@ class CUDNNDeviceContext : public CUDADeviceContext {
  cudnnHandle_t cudnn_handle_;
 };

-class DeviceGuard {
- public:
-  explicit DeviceGuard(int device) {
-    original_device_ = platform::GetCurrentDeviceId();
-    platform::SetDeviceId(device);
-  }
-  ~DeviceGuard() { platform::SetDeviceId(original_device_); }
-
- private:
-  int original_device_;
-};
-
 #endif

 /*! \brief device context pool singleton */
@@ -121,13 +122,13 @@ class DeviceContextPool {
 public:
  explicit DeviceContextPool(const std::vector<platform::Place>& places);

-  static DeviceContextPool& Get() {
+  static DeviceContextPool& Instance() {
    PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
    return *pool;
  }

  /*! \brief  Create should only be called by Init function */
-  static DeviceContextPool& Create(const std::vector<platform::Place>& places) {
+  static DeviceContextPool& Init(const std::vector<platform::Place>& places) {
    if (pool == nullptr) {
      pool = new DeviceContextPool(places);
    }
@@ -135,13 +136,14 @@ class DeviceContextPool {
  }

  /*! \brief  Return handle of single device context. */
-  const platform::DeviceContext* Borrow(const platform::Place& place);
-
-  /*! \brief  Return handle of multi-device context. */
-  std::vector<const platform::DeviceContext*> Borrow(
-      const std::vector<platform::Place>& places);
+  const platform::DeviceContext* Get(const platform::Place& place);

-  ~DeviceContextPool() {}
+  /*! \brief  Return the device context registered for `place`, typed as the
+   *  default context class of that Place type (DefaultDeviceContextType). */
+  template <typename Place>
+  const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
+      const Place& place) {
+    // Downcast from the DeviceContext base with static_cast rather than
+    // reinterpret_cast, so the compiler verifies the class relationship and
+    // applies any base-to-derived pointer adjustment.
+    return static_cast<
+        const typename DefaultDeviceContextType<Place>::TYPE*>(Get(place));
+  }

 private:
  static DeviceContextPool* pool;

--- a/paddle/platform/device_context_test.cu
+++ b/paddle/platform/device_context_test.cu
@@ -71,35 +71,20 @@ TEST(Device, DeviceContextPool) {
  using paddle::platform::CPUPlace;
  using paddle::platform::CUDAPlace;

-  DeviceContextPool& pool = DeviceContextPool::Get();
-  auto cpu_dev_ctx1 = pool.Borrow(CPUPlace());
-  auto cpu_dev_ctx2 = pool.Borrow(CPUPlace());
-  EXPECT_TRUE(cpu_dev_ctx2 == cpu_dev_ctx1);
+  DeviceContextPool& pool = DeviceContextPool::Instance();
+  auto cpu_dev_ctx1 = pool.Get(CPUPlace());
+  auto cpu_dev_ctx2 = pool.Get(CPUPlace());
+  ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1);

  std::vector<Place> gpu_places;
  int count = paddle::platform::GetCUDADeviceCount();
  for (int i = 0; i < count; ++i) {
-    gpu_places.emplace_back(CUDAPlace(i));
-  }
-  auto dev_ctxs = pool.Borrow(gpu_places);
-  for (size_t i = 0; i < dev_ctxs.size(); ++i) {
-    auto* dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctxs[i]);
-
-    // check same as CUDAPlace(i)
-    CUDAPlace place = boost::get<CUDAPlace>(dev_ctx->GetPlace());
-    EXPECT_EQ(place.GetDeviceId(), static_cast<int>(i));
+    auto dev_ctx = pool.Get(CUDAPlace(i));
+    ASSERT_NE(dev_ctx, nullptr);
  }
 }

 int main(int argc, char** argv) {
-  int dev_count = paddle::platform::GetCUDADeviceCount();
-  if (dev_count <= 1) {
-    LOG(WARNING) << "Cannot test multi-gpu DeviceContextPool, because the CUDA "
-                    "device count is "
-                 << dev_count;
-    return 0;
-  }
-
  std::vector<paddle::platform::Place> places;

  places.emplace_back(paddle::platform::CPUPlace());
@@ -109,7 +94,7 @@ int main(int argc, char** argv) {
  }

  VLOG(0) << " DeviceCount " << count;
-  paddle::platform::DeviceContextPool::Create(places);
+  paddle::platform::DeviceContextPool::Init(places);

  testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();

--- a/paddle/platform/for_range.h
+++ b/paddle/platform/for_range.h
@@ -62,7 +62,7 @@ struct ForRange<CUDADeviceContext> {

  template <typename Function>
  inline void operator()(Function func) const {
-    constexpr size_t num_threads = 1024;
+    constexpr int num_threads = 1024;
    int block_size = limit_ <= num_threads ? limit_ : num_threads;
    int grid_size = (limit_ + num_threads - 1) / num_threads;


--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
@@ -144,7 +144,7 @@ int main(int argc, char** argv) {
  }

  VLOG(0) << " DeviceCount " << count;
-  paddle::platform::DeviceContextPool::Create(places);
+  paddle::platform::DeviceContextPool::Init(places);

  testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();

--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once

 #include <iostream>
-
+#include "paddle/platform/enforce.h"
 #include "paddle/platform/variant.h"

 namespace paddle {
@@ -64,5 +64,31 @@ bool places_are_same_class(const Place &, const Place &);

 std::ostream &operator<<(std::ostream &, const Place &);

+// Adapts a user-supplied Visitor into a boost::static_visitor over the Place
+// variant. The Visitor must expose a nested result_type and be callable with
+// a CPUPlace (and with a CUDAPlace when built with PADDLE_WITH_CUDA).
+template <typename Visitor>
+struct PlaceVisitorWrapper
+    : public boost::static_visitor<typename Visitor::result_type> {
+  // Held by reference: the wrapped visitor must outlive this wrapper.
+  const Visitor &visitor_;
+  explicit PlaceVisitorWrapper(const Visitor &visitor) : visitor_(visitor) {}
+
+  // CPU places are always forwarded to the wrapped visitor.
+  typename Visitor::result_type operator()(const CPUPlace &cpu) const {
+    return visitor_(cpu);
+  }
+
+  // CUDA places are forwarded only in CUDA builds; otherwise this throws.
+  typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
+#ifdef PADDLE_WITH_CUDA
+    return visitor_(cuda);
+#else
+    PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda device");
+    // Presumably unreachable after the throw; kept so the function has a
+    // return statement of the declared type.
+    return typename Visitor::result_type();
+#endif
+  }
+};
+
+// Dispatch `visitor` on the concrete place type stored in `place` and return
+// whatever the visitor returns.
+template <typename Visitor>
+typename Visitor::result_type VisitPlace(const Place &place,
+                                         const Visitor &visitor) {
+  const PlaceVisitorWrapper<Visitor> wrapper(visitor);
+  return boost::apply_visitor(wrapper, place);
+}
+
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/platform/profiler.cc
+++ b/paddle/platform/profiler.cc
@@ -3,7 +3,7 @@
 licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
+
    http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
@@ -18,34 +18,134 @@ limitations under the License. */
 namespace paddle {
 namespace platform {

-ProfilerState kState = ProfilerState::kDisabled;
-uint32_t kNextThreadId = 0;
-std::mutex kAllEventListsMutex;
-std::list<std::shared_ptr<EventList>> kAllEventLists;
-thread_local std::shared_ptr<EventList> kEventList;
-thread_local int32_t kThreadId;
+// The profiler state, the initial value is ProfilerState::kDisabled
+static ProfilerState g_state = ProfilerState::kDisabled;
+// The thread index of each thread. It is thread local, so it can only be
+// accessed by the specific thread.
+static thread_local int32_t g_thread_id;
+// The g_next_thread_id is a global counter for threads, by the g_thread_id and
+// g_next_thread_id, we can know how many threads have created EventList.
+static uint32_t g_next_thread_id = 0;
+// The global mutex
+static std::mutex g_all_event_lists_mutex;
+// The total event lists of all threads
+static std::list<std::shared_ptr<EventList>> g_all_event_lists;
+// The thread local event list only can be accessed by the specific thread
+static thread_local std::shared_ptr<EventList> g_event_list;
+
+// Return the current time in nanoseconds, read from a steady clock.
+inline uint64_t GetTimeInNsec() {
+  // Prefer high_resolution_clock when it is steady; otherwise fall back to
+  // steady_clock.
+  using SteadyClock =
+      std::conditional<std::chrono::high_resolution_clock::is_steady,
+                       std::chrono::high_resolution_clock,
+                       std::chrono::steady_clock>::type;
+  const auto since_epoch = SteadyClock::now().time_since_epoch();
+  const auto nanos =
+      std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch);
+  return nanos.count();
+}
+
+// Construct an event of the given kind and name for thread `thread_id`.
+// In CUDA builds, a non-null dev_ctx additionally causes a CUDA event to be
+// created and recorded on that context's stream. The CPU timestamp is taken
+// last in both cases.
+Event::Event(EventKind kind, std::string name, uint32_t thread_id,
+             DeviceContext* dev_ctx)
+    : kind_(kind),
+      name_(std::move(name)),
+      thread_id_(thread_id),
+      has_cuda_(false) {
+#ifdef PADDLE_WITH_CUDA
+  // NOTE(review): the cast is unchecked, so any non-null dev_ctx is assumed
+  // to be a CUDADeviceContext -- confirm callers never pass a CPU context.
+  auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
+  if (cuda_dev_ctx) {
+    // Remember the current device so CudaElapsedUs can check both events
+    // belong to the same one.
+    PADDLE_ENFORCE(cudaGetDevice(&device_));
+    PADDLE_ENFORCE(cudaEventCreate(&event_));
+    auto stream = cuda_dev_ctx->stream();
+    PADDLE_ENFORCE(cudaEventRecord(event_, stream));
+    has_cuda_ = true;
+  }
+#endif
+  cpu_ns_ = GetTimeInNsec();
+}
+
+// Return the printable label for this event's kind: "mark", "push" or "pop".
+// Throws via PADDLE_THROW if the stored kind is none of the known values.
+std::string Event::kind() const {
+  if (kind_ == EventKind::kMark) return "mark";
+  if (kind_ == EventKind::kPushRange) return "push";
+  if (kind_ == EventKind::kPopRange) return "pop";
+  PADDLE_THROW("Unknown EventKind.");
+}
+
+// Return the CPU time in microseconds from this event to `e`.
+double Event::CpuElapsedUs(const Event& e) const {
+  // The stored timestamps are in nanoseconds; 1000 ns make one microsecond.
+  constexpr double kNsPerUs = 1000.0;
+  return (e.cpu_ns_ - cpu_ns_) / kNsPerUs;
+}
+
+// Return the GPU time in microseconds from this event to `e`.
+// Both events must carry CUDA information and have been created on the same
+// device; both are synchronized before the elapsed time is queried.
+// In builds without CUDA this always throws.
+double Event::CudaElapsedUs(const Event& e) const {
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE(e.has_cuda() && has_cuda());
+  PADDLE_ENFORCE(e.device() == device());
+  PADDLE_ENFORCE(cudaEventSynchronize(event_));
+  PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
+  float ms;
+  PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
+  // cudaEventElapsedTime reports milliseconds; convert to microseconds.
+  return ms * 1000.0;
+#else
+  PADDLE_THROW("CUDA is not enabled");
+#endif
+}
+
+#ifdef PADDLE_WITH_CUDA
+// Call func(i) once for every CUDA device index i, with device i made the
+// current device for the duration of the call. The device that was current
+// on entry is restored afterwards.
+// NOTE(review): if func throws, the original device is not restored.
+static void ForEachDevice(std::function<void(int)> func) {
+  auto original_device = GetCurrentDeviceId();
+  int count = GetCUDADeviceCount();
+  for (int i = 0; i < count; i++) {
+    SetDeviceId(i);
+    func(i);
+  }
+  SetDeviceId(original_device);
+}
+#endif
+
+// Return the calling thread's EventList, creating it on first use.
+// On creation the global mutex is held while the thread is assigned the next
+// thread id and its list is registered in g_all_event_lists.
+inline EventList& GetEventList() {
+  // g_event_list is thread local, so this check needs no lock.
+  if (!g_event_list) {
+    std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+    g_event_list = std::make_shared<EventList>();
+    g_thread_id = g_next_thread_id++;
+    g_all_event_lists.emplace_front(g_event_list);
+  }
+  return *g_event_list;
+}
+
+// Record a kMark event named `name` in the calling thread's event list.
+// dev_ctx is forwarded to the recorded event and may be nullptr.
+void Mark(const std::string& name, DeviceContext* dev_ctx) {
+  // `name` is a const reference, so std::move could not move from it; pass it
+  // directly instead of suggesting a move that never happens.
+  GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
+}
+
+// Begin a profiled range: when the profiler is enabled, record a kPushRange
+// event named `name` in the calling thread's event list. The destructor
+// records the matching kPopRange event.
+RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx)
+    // Always initialize dev_ctx_. The destructor re-checks g_state, so if the
+    // profiler is enabled between construction and destruction it would
+    // otherwise read an uninitialized pointer.
+    : dev_ctx_(dev_ctx) {
+  if (g_state == ProfilerState::kDisabled) return;
+  name_ = name;
+  // `name` is a const reference, so it is passed as-is (std::move on it would
+  // not move anything).
+  GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx_);
+}
+
+// End the profiled range: when the profiler is enabled at destruction time,
+// record a kPopRange event carrying the stored name and device context.
+RecordEvent::~RecordEvent() {
+  if (g_state == ProfilerState::kDisabled) return;
+  // name_ is not needed after this point, so it is moved into the record call.
+  GetEventList().Record(EventKind::kPopRange, std::move(name_), g_thread_id,
+                        dev_ctx_);
+}

 void EnableProfiler(ProfilerState state) {
  PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                 "Can't enbale profling, since the input state is ",
                 "ProfilerState::kDisabled");
-  PADDLE_ENFORCE(kState == ProfilerState::kDisabled,
+  PADDLE_ENFORCE(g_state == ProfilerState::kDisabled,
                 "The profiling state should be disabled when calling ",
                 "EnableProfiler.");
-  kState = state;
+  g_state = state;
 #ifdef PADDLE_WITH_CUDA
-  auto ForEachDevice = [](std::function<void(int)> op) {
-    int count = GetCUDADeviceCount();
-    for (int i = 0; i < count; i++) {
-      DeviceGuard dev_guard(i);
-      op(i);
-    }
-  };
-  if (kState == ProfilerState::kCUDA) {
+  if (g_state == ProfilerState::kCUDA) {
    // Generate some dummy events first to reduce the startup overhead.
    for (int i = 0; i < 5; i++) {
      ForEachDevice([](int d) {
-        DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(d));
+        DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
        Mark("_cuda_startup_", dev_ctx);
        dev_ctx->Wait();
      });
@@ -53,35 +153,36 @@ void EnableProfiler(ProfilerState state) {
  }
 #endif
  // Mark the profiling start.
-  Mark("_start_profiler_");
+  Mark("_start_profiler_", nullptr);
 }

 std::vector<std::vector<Event>> DisableProfiler() {
-  PADDLE_ENFORCE(kState != ProfilerState::kDisabled,
+  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
                 "Can't disable profiling, since it's not starting.");
  // Mark the profiling stop.
-  Mark("_stop_profiler_");
-  kState = ProfilerState::kDisabled;
+  Mark("_stop_profiler_", nullptr);
+  g_state = ProfilerState::kDisabled;
  std::vector<std::vector<Event>> result;
-  std::lock_guard<std::mutex> guard(kAllEventListsMutex);
-  for (auto it = kAllEventLists.begin(); it != kAllEventLists.end(); ++it) {
-    auto& list = *it;
-    result.emplace_back(list->Reduce());
+  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
+       ++it) {
+    result.emplace_back((*it)->Reduce());
  }
  return result;
 }

-void PushEvent(const std::string name, const platform::DeviceContext* dev_ctx) {
-  GetEventList().Record(EventKind::kPushRange, std::move(name), kThreadId,
+void PushEvent(const std::string& name, DeviceContext* dev_ctx) {
+  GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id,
                        dev_ctx);
 }

-void PopEvent(const std::string name, const platform::DeviceContext* dev_ctx) {
-  GetEventList().Record(EventKind::kPopRange, std::move(name), kThreadId,
+void PopEvent(const std::string& name, DeviceContext* dev_ctx) {
+  GetEventList().Record(EventKind::kPopRange, std::move(name), g_thread_id,
                        dev_ctx);
 }

-void ParseEvents(std::vector<std::vector<Event>> events) {
+void ParseEvents(std::vector<std::vector<Event>>& events) {
+  // Event name :: counts :: ave  ::  min   ::  max :: total
  std::map<std::string, std::tuple<int, double, double>> events_table;
  for (size_t i = 0; i < events.size(); i++) {
    std::list<Event> pushed_events;

--- a/paddle/platform/profiler.h
+++ b/paddle/platform/profiler.h
@@ -24,76 +24,24 @@ namespace platform {

 enum EventKind { kMark, kPushRange, kPopRange };

-inline uint64_t GetTimeInNsec() {
-  // using std::chrono;
-  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
-                                 std::chrono::high_resolution_clock,
-                                 std::chrono::steady_clock>::type;
-  return std::chrono::duration_cast<std::chrono::nanoseconds>(
-             clock::now().time_since_epoch())
-      .count();
-}
-
 class Event {
 public:
-  // the DeviceContext is used to get the cuda stream.
+  // The DeviceContext is used to get the cuda stream.
+  // If CPU profiling mode, can pass nullptr.
  Event(EventKind kind, std::string name, uint32_t thread_id,
-        const platform::DeviceContext* dev_ctx = nullptr)
-      : kind_(kind), name_(std::move(name)), thread_id_(thread_id) {
-    has_cuda_ = false;
-#ifdef PADDLE_WITH_CUDA
-    auto* cuda_dev_ctx =
-        static_cast<const platform::CUDADeviceContext*>(dev_ctx);
-    if (cuda_dev_ctx) {
-      PADDLE_ENFORCE(cudaGetDevice(&device_));
-      PADDLE_ENFORCE(cudaEventCreate(&event_));
-      auto stream = cuda_dev_ctx->stream();
-      PADDLE_ENFORCE(cudaEventRecord(event_, stream));
-      has_cuda_ = true;
-    }
-#endif
-    cpu_ns_ = GetTimeInNsec();
-  }
-
-  std::string kind() const {
-    switch (kind_) {
-      case EventKind::kMark:
-        return "mark";
-      case EventKind::kPushRange:
-        return "push";
-      case EventKind::kPopRange:
-        return "pop";
-    }
-    PADDLE_THROW("Unknown EventKind.");
-  }
+        DeviceContext* dev_ctx);

+  std::string kind() const;
  std::string name() const { return name_; }
-
  bool has_cuda() const { return has_cuda_; }

 #ifdef PADDLE_WITH_CUDA
  cudaEvent_t event() const { return event_; }
-
  int device() const { return device_; }
 #endif

-  double CpuElapsedUs(const Event& e) const {
-    return (e.cpu_ns_ - cpu_ns_) / (1000.0);
-  }
-
-  double CudaElapsedUs(const Event& e) const {
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE(e.has_cuda() && has_cuda());
-    PADDLE_ENFORCE(e.device() == device());
-    PADDLE_ENFORCE(cudaEventSynchronize(event_));
-    PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
-    float ms;
-    PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
-    return ms * 1000.0;
-#else
-    PADDLE_THROW("CUDA is not enabled");
-#endif
-  }
+  double CpuElapsedUs(const Event& e) const;
+  double CudaElapsedUs(const Event& e) const;

 private:
  EventKind kind_;
@@ -108,11 +56,11 @@ class Event {
 };

 struct EventList {
-  constexpr static std::size_t kMB = 1024 * 1024;
-  constexpr static std::size_t kEventBlockSize = 16 * kMB;
-  constexpr static std::size_t kEventSize = sizeof(Event);
-  constexpr static std::size_t kEventAlign = alignof(Event);
-  constexpr static std::size_t kNumBlock =
+  constexpr static size_t kMB = 1024 * 1024;
+  constexpr static size_t kEventBlockSize = 16 * kMB;
+  constexpr static size_t kEventSize = sizeof(Event);
+  constexpr static size_t kEventAlign = alignof(Event);
+  constexpr static size_t kNumBlock =
      kEventBlockSize /
      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);

@@ -139,69 +87,36 @@ struct EventList {
 };

 enum ProfilerState {
-  kDisabled,
-  kCPU,
-  kCUDA,
+  kDisabled,  // disabled state
+  kCPU,       // CPU profiling state
+  kCUDA,      // GPU profiling state
 };

-// The profiler state, the initial value is ProfilerState::kDisabled
-extern ProfilerState kState;
-// The global mutex
-extern std::mutex kAllEventListsMutex;
-// The total event lists of all threads
-extern std::list<std::shared_ptr<EventList>> kAllEventLists;
-// The thread local event list only can be accessed by the specific thread
-extern thread_local std::shared_ptr<EventList> kEventList;
-// The thread index of each thread
-extern thread_local int32_t kThreadId;
-// The kNextThreadId is a global counter for threads, by the kThreadId and
-// kNextThreadId, we can know how many threads have created EventList.
-extern uint32_t kNextThreadId;
-
-inline EventList& GetEventList() {
-  if (!kEventList) {
-    std::lock_guard<std::mutex> guard(kAllEventListsMutex);
-    kEventList = std::make_shared<EventList>();
-    kThreadId = kNextThreadId++;
-    kAllEventLists.emplace_front(kEventList);
-  }
-  return *kEventList;
-}
-
-inline void Mark(const std::string name,
-                 const platform::DeviceContext* dev_ctx = nullptr) {
-  GetEventList().Record(EventKind::kMark, std::move(name), kThreadId, dev_ctx);
-}
+void Mark(const std::string& name, DeviceContext* dev_ctx);

-void PushEvent(const std::string name,
-               const platform::DeviceContext* dev_ctx = nullptr);
+void PushEvent(const std::string& name, DeviceContext* dev_ctx);

-void PopEvent(const std::string name,
-              const platform::DeviceContext* dev_ctx = nullptr);
+void PopEvent(const std::string& name, DeviceContext* dev_ctx);

 struct RecordEvent {
-  explicit RecordEvent(const std::string name,
-                       platform::DeviceContext* dev_ctx = nullptr) {
-    if (kState == ProfilerState::kDisabled) return;
-    dev_ctx_ = dev_ctx;
-    name_ = name;
-    GetEventList().Record(EventKind::kPushRange, std::move(name), kThreadId,
-                          dev_ctx_);
-  }
+  explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);

-  ~RecordEvent() {
-    if (kState == ProfilerState::kDisabled) return;
-    GetEventList().Record(EventKind::kPopRange, std::move(name_), kThreadId,
-                          dev_ctx_);
-  }
-  platform::DeviceContext* dev_ctx_;
+  ~RecordEvent();
+
+  // The device context is used by Event to get the current cuda stream.
+  DeviceContext* dev_ctx_;
+  // Event name
  std::string name_;
 };

+// Enable the profiling function.
 void EnableProfiler(ProfilerState state);
+
+// Return the event list of all threads. Assuming the returned value is called
+// event_lists, event_lists[i][j] represents the j-th Event of the i-th thread.
 std::vector<std::vector<Event>> DisableProfiler();

-void ParseEvents(std::vector<std::vector<Event>>);
+void ParseEvents(std::vector<std::vector<Event>>&);

 }  // namespace platform
 }  // namespace paddle
--- a/paddle/platform/profiler_test.cc
+++ b/paddle/platform/profiler_test.cc
@@ -19,13 +19,13 @@ TEST(Event, CpuElapsedTime) {
  using paddle::platform::Event;
  using paddle::platform::EventKind;

-  Event start_event(EventKind::kPushRange, "test", 0);
+  Event start_event(EventKind::kPushRange, "test", 0, nullptr);
  EXPECT_TRUE(start_event.has_cuda() == false);
  int counter = 0;
  while (counter != 1000) {
    counter++;
  }
-  Event stop_event(EventKind::kPopRange, "test", 0);
+  Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
  EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0);
 }

@@ -33,11 +33,11 @@ TEST(Event, CpuElapsedTime) {
 TEST(Event, CudaElapsedTime) {
  using paddle::platform::DeviceContext;
  using paddle::platform::CUDADeviceContext;
-  using paddle::platform::GPUPlace;
+  using paddle::platform::CUDAPlace;
  using paddle::platform::Event;
  using paddle::platform::EventKind;

-  DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(0));
+  DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
  Event start_event(EventKind::kPushRange, "test", 0, dev_ctx);
  EXPECT_TRUE(start_event.has_cuda() == true);
  int counter = 0;
@@ -60,10 +60,10 @@ TEST(RecordEvent, RecordEvent) {
  DeviceContext* dev_ctx = nullptr;
 #ifdef PADDLE_WITH_CUDA
  using paddle::platform::CUDADeviceContext;
-  using paddle::platform::GPUPlace;
+  using paddle::platform::CUDAPlace;
  state = ProfilerState::kCUDA;
  dev_ctx =
-      new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace(0));
+      new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0));
 #endif
  EnableProfiler(state);

@@ -98,7 +98,9 @@ TEST(RecordEvent, RecordEvent) {
  int cuda_startup_count = 0;
  int start_profiler_count = 0;
  int stop_profiler_count = 0;
+
  ParseEvents(events);
+
  for (size_t i = 0; i < events.size(); ++i) {
    for (size_t j = 0; j < events[i].size(); ++j) {
      if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count;

--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
@@ -3,6 +3,9 @@ if(WITH_PYTHON)
    SRCS pybind.cc exception.cc protobuf.cc const_value.cc
    DEPS pybind python backward proto_desc paddle_memory executor prune init
    ${GLOB_OP_LIB})
+  if(NOT APPLE AND NOT ANDROID)
+    target_link_libraries(paddle_pybind rt)
+  endif(NOT APPLE AND NOT ANDROID)
 endif(WITH_PYTHON)

 if(WITH_DOC)

--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -171,12 +171,23 @@ void BindBlockDesc(py::module &m) {
             std::string name = byte_name;
             return self.HasVar(name);
           })
+      .def("has_var_recursive",
+           [](BlockDesc &self, py::bytes byte_name) {
+             std::string name = byte_name;
+             return self.HasVarRecursive(name);
+           })
      .def("find_var",
           [](BlockDesc &self, py::bytes byte_name) {
             std::string name = byte_name;
             return self.FindVar(name);
           },
           py::return_value_policy::reference)
+      .def("find_var_recursive",
+           [](BlockDesc &self, py::bytes byte_name) {
+             std::string name = byte_name;
+             return self.FindVarRecursive(name);
+           },
+           py::return_value_policy::reference)
      .def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference)
      .def("op_size", &BlockDesc::OpSize)
      .def("op", &BlockDesc::Op, py::return_value_policy::reference)
@@ -204,7 +215,7 @@ void BindVarDsec(py::module &m) {
      .def("set_shape", &VarDesc::SetShape)
      .def("set_dtype", &VarDesc::SetDataType)
      .def("shape", &VarDesc::Shape, py::return_value_policy::reference)
-      .def("dtype", &VarDesc::GetDataType)
+      .def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference)
      .def("lod_level", &VarDesc::GetLodLevel)
      .def("set_lod_level", &VarDesc::SetLoDLevel)
      .def("type", &VarDesc::GetType)
@@ -236,14 +247,22 @@ void BindOpDesc(py::module &m) {
      .value("BLOCK", proto::AttrType::BLOCK);

  py::class_<OpDesc> op_desc(m, "OpDesc", "");
-  op_desc.def("type", &OpDesc::Type)
+  op_desc
+      .def("__init__", [](OpDesc &self) { new (&self) OpDesc(); },
+           py::return_value_policy::reference)
+      .def("copy_from", &OpDesc::CopyFrom)
+      .def("type", &OpDesc::Type)
      .def("set_type", &OpDesc::SetType)
      .def("input", &OpDesc::Input)
      .def("input_names", &OpDesc::InputNames)
-      .def("set_input", &OpDesc::SetInput)
      .def("output", &OpDesc::Output)
      .def("output_names", &OpDesc::OutputNames)
+      .def("set_input", &OpDesc::SetInput)
      .def("set_output", &OpDesc::SetOutput)
+      .def("input_arg_names", &OpDesc::InputArgumentNames)
+      .def("output_arg_names", &OpDesc::OutputArgumentNames)
+      .def("rename_input", &OpDesc::RenameInput)
+      .def("rename_output", &OpDesc::RenameOutput)
      .def("has_attr", &OpDesc::HasAttr)
      .def("attr_type", &OpDesc::GetAttrType)
      .def("attr_names", &OpDesc::AttrNames)

--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -269,22 +269,21 @@ All parameter, weight, gradient are variables in Paddle.
    }
    return ret_values;
  });
-  m.def("get_grad_op_descs",
-        [](const OpDesc &op_desc,
+  m.def(
+      "get_grad_op_desc", [](const OpDesc &op_desc,
                             const std::unordered_set<std::string> &no_grad_set,
-           std::unordered_map<std::string, std::string> &grad_to_var,
                             const std::vector<BlockDesc *> &grad_sub_block) {
+        std::unordered_map<std::string, std::string> grad_to_var;
        std::vector<std::unique_ptr<OpDesc>> grad_op_descs =
            framework::OpInfoMap::Instance()
                .Get(op_desc.Type())
                .GradOpMaker()(op_desc, no_grad_set, &grad_to_var,
                               grad_sub_block);
        std::vector<OpDesc *> grad_op_desc_ptrs(grad_op_descs.size());
-          std::transform(
-              grad_op_descs.begin(), grad_op_descs.end(),
+        std::transform(grad_op_descs.begin(), grad_op_descs.end(),
                       grad_op_desc_ptrs.begin(),
                       [](std::unique_ptr<OpDesc> &p) { return p.release(); });
-          return grad_op_desc_ptrs;
+        return std::make_pair(grad_op_desc_ptrs, grad_to_var);
      });
  m.def("prune", [](const ProgramDesc &origin,
                    const std::vector<std::array<size_t, 2>> &targets) {
@@ -301,6 +300,8 @@ All parameter, weight, gradient are variables in Paddle.
    InferenceOptimize(*(origin.Proto()), &pruned_desc);
    return new ProgramDesc(pruned_desc);
  });
+  m.def("empty_var_name", []() { return framework::kEmptyVarName; });
+  m.def("grad_var_suffix", []() { return framework::kGradVarSuffix; });
  m.def_submodule(
       "var_names",
       "The module will return special predefined variable name in Paddle")

--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -63,9 +63,10 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
        auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
            tensor.dims(), platform::CPUPlace()));

-        platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+        platform::DeviceContextPool &pool =
+            platform::DeviceContextPool::Instance();
        auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
-            pool.Borrow(tensor.place()));
+            pool.Get(tensor.place()));

        paddle::platform::GpuMemcpyAsync(
            dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
@@ -76,10 +77,10 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
      } else if (paddle::platform::is_cpu_place(tensor.place())) {
        dst_tensor = tensor;
      }
-      return py::buffer_info(
-          dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.place()),
-          sizeof(CUR_TYPE), py::format_descriptor<CUR_TYPE>::format(),
-          (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
+      return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
+                             py::format_descriptor<CUR_TYPE>::format(),
+                             (size_t)framework::arity(dst_tensor.dims()),
+                             dims_outside, strides);
    } else {
      constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
      return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
@@ -137,9 +138,9 @@ void PyCUDATensorSetFromArray(
  self.Resize(framework::make_ddim(dims));
  auto *dst = self.mutable_data<T>(place);

-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto dev_ctx =
-      static_cast<const platform::CUDADeviceContext *>(pool.Borrow(place));
+      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
  paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
                                   cudaMemcpyHostToDevice, dev_ctx->stream());
 }

--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -178,7 +178,7 @@ EOF
    # run paddle version to install python packages first
    RUN apt-get update &&\
        ${NCCL_DEPS}\
-        apt-get install -y wget python-pip dmidecode && pip install -U pip && \
+        apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip && \
        pip install /*.whl; apt-get install -f -y && \
        apt-get clean -y && \
        rm -f /*.whl && \

--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -71,9 +71,7 @@ function threads_config() {
  # auto set OMP_NUM_THREADS and MKL_NUM_THREADS
  # according to trainer_count and total processors
  # only when MKL enabled
-  if [ "@WITH_MKL@" == "OFF" ]; then
-    return 0
-  fi
+  # auto set OPENBLAS_NUM_THREADS when MKL is not used
  processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l`
  trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs`
  if [ -z $trainers ]; then
@@ -83,12 +81,19 @@ function threads_config() {
  if [ $threads -eq 0 ]; then
    threads=1
  fi
+  if [ "@WITH_MKL@" == "ON" ]; then
    if [ -z "$OMP_NUM_THREADS" ]; then
      export OMP_NUM_THREADS=$threads
    fi
    if [ -z "$MKL_NUM_THREADS" ]; then
      export MKL_NUM_THREADS=$threads
    fi
+  else
+    if [ -z "$OPENBLAS_NUM_THREADS" ]; then
+      export OPENBLAS_NUM_THREADS=$threads
+    fi
+  fi
+  
 }

 PADDLE_CONF_HOME="$HOME/.config/paddle"
@@ -150,7 +155,7 @@ fi
 case "$1" in
    "train")
        threads_config $@
-        # echo $OMP_NUM_THREADS $MKL_NUM_THREADS
+        # echo $OMP_NUM_THREADS $MKL_NUM_THREADS $OPENBLAS_NUM_THREADS
        ${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2}
        ;;
    "merge_model")

--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
@@ -44,7 +44,7 @@ __all__ = ['train', 'test', 'valid']
 DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
 LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
 SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
-DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
+DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118'
 LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
 SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
 # In official 'readme', tstid is the flag of test data

--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
@@ -36,7 +36,7 @@ def __read_gflags_from_env__():
    """
    import sys
    import core
-    read_env_flags = ['use_pinned_memory']
+    read_env_flags = ['use_pinned_memory', 'check_nan_inf']
    if core.is_compile_gpu():
        read_env_flags.append('fraction_of_gpu_memory_to_use')
    core.init_gflags([sys.argv[0]] +

--- a/python/paddle/v2/fluid/backward.py
+++ b/python/paddle/v2/fluid/backward.py
 from paddle.v2.fluid import framework as framework
+from . import core
+import collections

-__all__ = ['append_backward_ops']
+__all__ = ['append_backward']


-def append_backward_ops(loss, parameter_list=None, no_grad_set=None):
+def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
    """
-    Create and add gradient Operators in BlockDesc to compute
-    gradients of `loss` for parameters in parameter_list
+    Traverse all ops in op_descs[begin_idx : end_idx], 
+    if any op has inputs/outputs named "old_name", rename it as 'new_name'
+    """
+    if begin_idx is None:
+        begin_idx = 0
+    if end_idx is None:
+        end_idx = len(op_descs)
+    for i in range(begin_idx, end_idx):
+        op_desc = op_descs[i]
+        if isinstance(op_desc, tuple):
+            op_desc = op_desc[0]
+        op_desc.rename_input(old_name, new_name)
+        op_desc.rename_output(old_name, new_name)
+
+
+def _create_op_desc_(op_type, inputs, outputs, attrs):
+    """
+    Create a C++ OpDesc object with specified inputs, outputs and attributes.
+    """
+    op_desc = core.OpDesc()
+    op_desc.set_type(op_type)
+    for para, args in inputs.iteritems():
+        op_desc.set_input(para, args)
+    for para, args in outputs.iteritems():
+        op_desc.set_output(para, args)
+    for name, val in attrs.iteritems():
+        if isinstance(val, framework.Block):
+            op_desc.set_block_attr(name, val.desc)
+        else:
+            op_desc.set_attr(name, val)
+    return op_desc
+
+
+def _infer_var_data_type_(grad_var_name, block):
+    """
+    Infer the data type of given grad variable
+    """
+    grad_var = block.desc.find_var(grad_var_name.encode("ascii"))
+    fwd_name = _strip_grad_suffix_(grad_var_name.encode("ascii"))
+    if block.desc.has_var_recursive(fwd_name):
+        fwd_var = block.desc.find_var_recursive(fwd_name.encode("ascii"))
+        grad_var.set_dtype(fwd_var.dtype())
+    else:
+        grad_var.set_dtype(core.DataType.FP32)
+
+
+def _all_in_set_(cands, s):
+    """
+    Test if all elements of 'cands' are in set 's'
+    """
+    if len(cands) == 0:
+        return False
+    for c in cands:
+        if not c in s:
+            return False
+    return True
+

-    :param loss: an variable generated by cost function.
-    :type loss: Variable
-    :param no_grad_set: variable that should not create gradient
-    :type no_grad_set: set
-    :param parameter_list: parameters that need to compute gradient and 
-    update to optimize the lost.
-    :type: list
-    :return: list of (parameters, gradients) pair.
-    :rtype: list[Variable]
+def _strip_grad_suffix_(name):
+    """
+    Strip the grad suffix from the given variable name
+    e.g. x@GRAD ==> x
+         y@GRAD@RENAME@1 ==> y
+    """
+    pos = name.find(core.grad_var_suffix())
+    return name[:pos] if pos != -1 else name
+
+
+def _append_grad_suffix_(name):
+    """
+    Append grad suffix to the given variable name
+    e.g. x ==> x@GRAD
+    """
+    return name + core.grad_var_suffix()
+
+
+def _addup_repetitive_outputs_(op_descs):
+    """
+    In the backward part, a variable may be the output of more than one op.
+    In this case, the variable should be the accumulation of all the outputs.
+    `sum_op`s are added to implement the accumulation.
+    """
+    pending_sum_ops = []
+    var_rename_count = collections.defaultdict(int)
+    renamed_vars = collections.defaultdict(list)
+    for idx, op_desc in enumerate(op_descs):
+        for var_name in op_desc.input_arg_names():
+            if len(renamed_vars[var_name]) > 1:
+                pending_sum_ops.append(
+                    (_create_op_desc_("sum", {"X": renamed_vars[var_name]},
+                                      {"Out": [var_name]}, {}), idx))
+                renamed_vars[var_name] = [var_name]
+        for var_name in op_desc.output_arg_names():
+            if var_name == core.empty_var_name(
+            ) or var_name in op_desc.input_arg_names():
+                # empty variable or inplace op
+                continue
+            if len(renamed_vars[var_name]) == 0:
+                # it's the first time we get the variable
+                renamed_vars[var_name] = [var_name]
+            else:
+                if len(renamed_vars[var_name]) == 1:
+                    new_name = var_name + "@RENAME@" + \
+                        str(var_rename_count[var_name])
+                    var_rename_count[var_name] += 1
+                    # rename original var_name
+                    renamed_vars[var_name][0] = new_name
+                    _rename_arg_(op_descs, var_name, new_name, 0, idx)
+                    _rename_arg_(pending_sum_ops, var_name, new_name)
+
+                new_name = var_name + "@RENAME@" + \
+                    str(var_rename_count[var_name])
+                var_rename_count[var_name] += 1
+                op_desc.rename_output(var_name, new_name)
+                renamed_vars[var_name].append(new_name)
+    for var_name, inputs in renamed_vars.iteritems():
+        if len(inputs) > 1:
+            pending_sum_ops.append((_create_op_desc_(
+                "sum", {"X": inputs}, {"Out": [var_name]}, {}), len(op_descs)))
+    # sum_op descs are sorted according to their insert position
+    for p in reversed(pending_sum_ops):
+        op_descs.insert(p[1], p[0])
+
+    return op_descs
+
+
+def _remove_no_grad_branch_(op_descs, no_grad_set):
+    """
+    Remove unnecessary grad ops
+    A grad op can be removed in two cases:
+        1. all outputs of the grad op are in 'no_grad_set'
+        2. all grad inputs of the grad op are in 'no_grad_set'
+    """
+
+    def _op_can_be_removed_(op_desc, no_grad_set):
+        out_arg_names = op_desc.output_arg_names()
+        if len(out_arg_names) == 0 or _all_in_set_(out_arg_names, no_grad_set):
+            return True
+        if _all_in_set_(
+                filter(lambda name: name.find(core.grad_var_suffix()) != -1,
+                       op_desc.input_arg_names()), no_grad_set):
+            no_grad_set.union(out_arg_names)
+            return True
+        return False
+
+    # Remove ops whose outputs are all in no_grad_dict
+    op_descs = filter(
+        lambda op_desc: not _op_can_be_removed_(op_desc, no_grad_set), op_descs)
+    # Insert fill_zeros_like_op
+    to_insert = []
+    for idx, op_desc in enumerate(op_descs):
+        for arg in op_desc.input_arg_names():
+            if core.grad_var_suffix() in arg and arg in no_grad_set:
+                to_insert.append((_create_op_desc_("fill_zeros_like", {
+                    "X": [_strip_grad_suffix_(arg)]
+                }, {"Y": [arg]}, {}), idx))
+
+    map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert))
+
+    return op_descs
+
+
+def _append_backward_ops_(target,
+                          block,
+                          target_block,
+                          no_grad_dict,
+                          grad_to_var,
+                          callback=None):
+    """
+    Create all grad ops, and insert them into given block
+
+    Args:
+        target(Variable): the target variable of forward pass
+        block(Block): the block where forward ops are
+        target_block(Block): the block which is going to hold new generated grad ops
+        no_grad_dict(dict): 
+            key(int)  block index
+            val(set) a set of variable names. These variables have no gradient
+        grad_to_var(dict)(output argument):
+            key(str): grad variable name
+            val(str): corresponding forward variable name
+    """
+    # grad_op_descs holds created grad_op, and will be appended to target_block
+    grad_op_descs = []
+    program = block.program
+    for op in reversed(block.ops):
+        grad_sub_block_list = []
+        # If the op has its own sub-block, deal with the sub-block first
+        if op.has_attr("sub_block"):
+            sub_block = program.block(op.block_attr("sub_block"))
+            grad_sub_block = program.create_block(parent_idx=sub_block.idx)
+            _append_backward_ops_(target, sub_block, grad_sub_block,
+                                  no_grad_dict, grad_to_var, callback)
+            grad_sub_block_list.append(grad_sub_block.desc)
+
+        # Getting op's corresponding grad_op
+        grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
+            op.desc, no_grad_dict[block.idx], grad_sub_block_list)
+        grad_op_descs.extend(grad_op_desc)
+        grad_to_var.update(op_grad_to_var)
+
+    grad_op_descs = _addup_repetitive_outputs_(grad_op_descs)
+
+    grad_op_descs = _remove_no_grad_branch_(grad_op_descs,
+                                            no_grad_dict[block.idx])
+
+    if target_block.idx == 0:
+        grad_op_descs.insert(
+            0,
+            _create_op_desc_("fill_constant", {}, {
+                "Out": [_append_grad_suffix_(target.name)]
+            }, {"shape": [1],
+                "value": 1.0,
+                "dtype": target.dtype}))
+    # append op_desc in grad_op_descs to target_block
+    for op_desc in grad_op_descs:
+        new_op_desc = target_block.desc.append_op()
+        new_op_desc.copy_from(op_desc)
+
+
+def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
+    """
+    Create new variables required by backward pass.
+
+    Args:
+        block(Block): the block where new variables will be created
+        start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created
+        grad_to_var(dict):
+            key(str): grad variable name
+            val(str): corresponding forward variable name
+            In most cases, this dict is generated by _append_backward_ops_()
+        grad_info_map(dict)(output argument):
+            key(str): forward variable name
+            val(tuple): a tuple of (str, Block), str is the corresponding grad name, Block is the block in which the grad variable was created
+    """
+    for op_idx in range(start_op_idx, block.desc.op_size()):
+        op_desc = block.desc.op(op_idx)
+        if op_desc.has_attr("sub_block"):
+            sub_block = block.program.block(op_desc.block_attr("sub_block"))
+            _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
+        new_vars = set()
+        # create new gradient variables
+        for grad_var_name in op_desc.output_arg_names():
+            grad_var_name = grad_var_name.encode("ascii")
+            if block.desc.has_var_recursive(
+                    grad_var_name) or grad_var_name == core.empty_var_name():
+                continue
+            block.desc.var(grad_var_name)
+            new_vars.add(grad_var_name)
+            if not grad_to_var.has_key(grad_var_name):
+                continue
+            grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, block)
+        # infer_shape and infer_type
+        op_desc.infer_var_type(block.desc)
+        op_desc.infer_shape(block.desc)
+        for arg in op_desc.output_arg_names():
+            if arg in new_vars:
+                _infer_var_data_type_(arg, block)
+
+
+def append_backward(loss, parameter_list=None, no_grad_set=None):
+    """
+    Append backward part to main_program
+
+    Args:
+        loss(Variable): The variable generated by cost function.
+        parameter_list(list): Parameters that need to be updated by optimizer.
+            If None, it means all parameters need to be updated.
+        no_grad_set(set): Variables that have no gradients in Block 0. 
+            If None, the set will be generated inside the function and 
+            contains all variables with `stop_gradient=True` from all blocks.
+
+    Return:
+        (list[Variable]): list of (parameters, gradients) pair.
    """
    assert isinstance(loss, framework.Variable)

-    if no_grad_set is None:
    program = loss.block.program
+    no_grad_dict = dict()
+    if no_grad_set is None:
        assert isinstance(program, framework.Program)
-        no_grad_set = list()
        for block in program.blocks:
            assert isinstance(block, framework.Block)
+            block_no_grad_set = set()
            for var in block.vars.itervalues():
                assert isinstance(var, framework.Variable)
                if var.stop_gradient:
-                    no_grad_set.append(var.name)
-        no_grad_set = set(no_grad_set)
+                    block_no_grad_set.add(_append_grad_suffix_(var.name))
+            no_grad_dict[block.idx] = block_no_grad_set
+    elif isinstance(no_grad_set, set):
+        no_grad_dict = {
+            0: set([_append_grad_suffix_(name) for name in no_grad_set])
+        }
+    else:
+        raise ValueError("'no_grad_set' should be a set or None.")
+
+    grad_info_map = dict()
+    root_block = program.block(0)
+
+    fwd_op_num = root_block.desc.op_size()
+    current_block_idx = program.current_block_idx
+    grad_to_var = dict()
+
+    _append_backward_ops_(loss, root_block, root_block, no_grad_dict,
+                          grad_to_var)
+    _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map)
+
+    program.current_block_idx = current_block_idx
+    program.sync_with_cpp()

-    param_grad_map = loss.block.program.append_backward(loss, no_grad_set)
    if parameter_list is not None:
        parameters = parameter_list
    else:
-        params = loss.block.program.global_block().all_parameters()
+        params = program.global_block().all_parameters()
        parameters = [param.name for param in params]
    params_and_grads = []
    for param in parameters:
-        if param not in param_grad_map:
+        if param not in grad_info_map:
            raise ValueError("param %s is not in map" % param)
-        grad_info = param_grad_map[param]
-        grad_block = loss.block.program.block(grad_info[1])
+        grad_info = grad_info_map[param]
+        grad_block = grad_info[1]
        if not grad_block.has_var(grad_info[0]):
            raise ValueError("grad block[{0}] did not have grad var {1}".format(
                grad_info[1], grad_info[0]))
        # Get the param var from the global block
-        param_var = loss.block.program.global_block().var(param)
+        param_var = program.global_block().var(param)
        grad_var = grad_block.var(grad_info[0])
        if loss.block.has_var(grad_info[0]):
            params_and_grads.append((param_var, grad_var))

--- a/python/paddle/v2/fluid/data_feeder.py
+++ b/python/paddle/v2/fluid/data_feeder.py
@@ -3,7 +3,7 @@ import core
 import numpy
 import six.moves as six

-from framework import Variable
+from framework import Variable, default_main_program

 __all__ = ['DataFeeder']

@@ -53,12 +53,16 @@ class DataToLoDTensorConverter(object):


 class DataFeeder(object):
-    def __init__(self, feed_list, place):
+    def __init__(self, feed_list, place, program=None):
        self.feed_dtypes = []
        self.feed_names = []
        self.feed_shapes = []
        self.feed_lod_level = []
+        if program is None:
+            program = default_main_program()
        for each_var in feed_list:
+            if isinstance(each_var, basestring):
+                each_var = program.block(0).var(each_var)
            if not isinstance(each_var, Variable):
                raise TypeError("Feed list should contain a list of variable")
            self.feed_dtypes.append(each_var.dtype)

--- a/python/paddle/v2/fluid/distribute_transpiler.py
+++ b/python/paddle/v2/fluid/distribute_transpiler.py
@@ -95,7 +95,9 @@ class DistributeTranspiler:
        """
        if program is None:
            program = default_main_program()
+        self.program = program
        self.trainers = trainers
+        self.optimize_ops = optimize_ops
        self._optimize_distributed(
            optimize_ops,
            program,
@@ -156,9 +158,10 @@ class DistributeTranspiler:
            attrs={"endpoints": pserver_endpoints,
                   "epmap": epmap})

-    def get_trainer_program(optimize_ops, program):
+    def get_trainer_program(self):
        # remove optimize ops and add a send op to main_program
-        program.global_block().delete_ops(optimize_ops)
+        self.program.global_block().delete_ops(self.optimize_ops)
+        return self.program

    def _create_var_for_trainers(self, block, var, trainers):
        var_list = []
@@ -210,7 +213,6 @@ class DistributeTranspiler:

            if opt_op.inputs.has_key("Grad"):
                if opt_op.inputs["Grad"].name in grad_var_names:
-                    print "appending ", opt_op.type, opt_op.inputs
                    optimize_sub_program.global_block().append_op(
                        type=opt_op.type,
                        inputs=opt_op.inputs,

--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
 import numpy as np
+import contextlib
+from framework import Program, default_main_program
 from . import core
-from framework import Program, default_main_program, Parameter, Variable

-__all__ = ['Executor', 'g_scope']
+__all__ = ['Executor', 'global_scope', 'scope_guard', 'switch_scope']

 g_scope = core.Scope()


+def global_scope():
+    return g_scope
+
+
+def switch_scope(scope):
+    global g_scope
+    ex = g_scope
+    g_scope = scope
+    return ex
+
+
+@contextlib.contextmanager
+def scope_guard(scope):
+    ex = switch_scope(scope)
+    yield
+    switch_scope(ex)
+
+
 def as_numpy(tensor):
    if isinstance(tensor, list):
        return [as_numpy(t) for t in tensor]
@@ -117,7 +136,7 @@ class Executor(object):
            raise TypeError()

        if scope is None:
-            scope = g_scope
+            scope = global_scope()

        program = program.clone()
        global_block = program.global_block()

--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -663,7 +663,7 @@ class Block(object):
            end = list(self.ops).index(ops[-1])
        except Exception, e:
            raise e
-        self.desc.remove_op(start, end)
+        self.desc.remove_op(start, end + 1)

    def prepend_op(self, *args, **kwargs):
        op_desc = self.desc.prepend_op()
@@ -846,9 +846,11 @@ class Program(object):
        self.sync_with_cpp()
        return param_to_grad_info

-    def create_block(self):
+    def create_block(self, parent_idx=None):
        new_block_idx = len(self.blocks)
-        self.desc.append_block(self.current_block().desc)
+        parent = self.current_block() if parent_idx is None else self.block(
+            parent_idx)
+        self.desc.append_block(parent.desc)
        self.current_block_idx = new_block_idx
        self.blocks.append(Block(self, self.current_block_idx))
        return self.current_block()

--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
@@ -188,7 +188,7 @@ def save_inference_model(dirname,
            raise ValueError("'feed_var_names' should be a list of str.")

    if isinstance(target_vars, Variable):
-        feeded_var_names = [feeded_var_names]
+        target_vars = [target_vars]
    else:
        if not (bool(target_vars) and all(
                isinstance(var, Variable) for var in target_vars)):

--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -16,6 +16,36 @@ __all__ = [


 def split_lod_tensor(input, mask, level=0):
+    """
+    **split_lod_tensor**
+
+    This function takes in an input that contains the complete lod information,
+    and takes in a mask which is used to mask certain parts of the input.
+    The output is the true branch and the false branch with the mask applied to
+    the input at a certain level in the tensor.
+
+    Args:
+        input(tuple|list|None): The input tensor that contains complete
+                                lod information needed to construct the output.
+        mask(list): A bool column vector which masks the input.
+        level(int): The specific lod level to rank.
+
+    Returns:
+        Variable: The true branch of tensor as per the mask applied to input.
+        Variable: The false branch of tensor as per the mask applied to input.
+
+    Examples:
+        .. code-block:: python
+
+          x = layers.data(name='x', shape=[1])
+          x.persistable = True
+
+          y = layers.data(name='y', shape=[1])
+          y.persistable = True
+
+          out_true, out_false = layers.split_lod_tensor(
+                input=x, mask=y, level=0)
+    """
    helper = LayerHelper('split_lod_tensor', **locals())
    out_true = helper.create_tmp_variable(dtype=input.dtype)
    out_false = helper.create_tmp_variable(dtype=input.dtype)
@@ -32,6 +62,40 @@ def split_lod_tensor(input, mask, level=0):


 def merge_lod_tensor(in_true, in_false, x, mask, level=0):
+    """
+    **merge_lod_tensor**
+
+    This function takes in an input :math:`x`, the True branch, the False
+    branch and a binary :math:`mask`. Using this information, this function
+    merges the True and False branches of the tensor into a single Output
+    at a certain lod level indicated by :math:`level`.
+
+    Args:
+        in_true(tuple|list|None): The True branch to be merged.
+        in_false(tuple|list|None): The False branch to be merged.
+        x(tuple|list|None): The input tensor that contains complete
+                            lod information needed to construct the output.
+        mask(list): A bool column vector which masks the input.
+        level(int): The specific lod level to rank.
+
+    Returns:
+        Variable: The merged output tensor.
+
+    Examples:
+        .. code-block:: python
+
+          x = layers.data(
+                      name='x', shape=[1], dtype='float32', stop_gradient=False)
+          y = layers.data(
+                name='y', shape=[1], dtype='bool', stop_gradient=False)
+
+          level = 0
+
+          out_true, out_false = layers.split_lod_tensor(
+                input=x, mask=y, level=level)
+          out = layers.merge_lod_tensor(
+                in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
+    """
    helper = LayerHelper('merge_lod_tensor', **locals())
    out = helper.create_tmp_variable(dtype=in_true.dtype)
    helper.append_op(
@@ -397,9 +461,50 @@ class While(object):


 def lod_rank_table(x, level=0):
-    """
-    This function creates an operator for creating a LOD_RANK_TABLE
-    using the input x.
+    """LoD Rank Table Operator. Given an input variable **x** and a level number
+    of LoD, this layer creates a LoDRankTable object. A LoDRankTable object
+    contains a list of bi-element tuples. Each tuple consists of an index and
+    a length, both of which are int type. Referring to the specified level of
+    LoD, the index is the sequence index number and the length represents the
+    sequence length. Please note that the list is ranked in descending order by
+    the length. The following is an example:
+
+        .. code-block:: text
+
+            x is a LoDTensor:
+                x.lod = [[0,                2, 3],
+                         [0,             5, 6, 7]]
+                x.data = [a, b, c, d, e, f, g]
+
+            1. set level to 0:
+                Create lod rank table:
+                    lod_rank_table_obj = lod_rank_table(x, level=0)
+
+                Get:
+                    lod_rank_table_obj.items() = [(0, 2), (1, 1)]
+
+            2. set level to 1:
+                Create lod rank table:
+                    lod_rank_table_obj = lod_rank_table(x, level=1)
+
+                Get:
+                    lod_rank_table_obj.items() = [(0, 5), (1, 1), (2, 1)]
+
+    Args:
+        x (Variable): Input variable, a LoDTensor based on which to create the
+            lod rank table.
+        level (int): Specify the LoD level, on which to create the lod rank
+            table.
+
+    Returns:
+        Variable: The created LoDRankTable object.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10],
+                            dtype='float32', lod_level=1)
+            out = layers.lod_rank_table(x=x, level=0)
    """
    helper = LayerHelper("lod_rank_table", **locals())
    table = helper.create_variable(
@@ -414,9 +519,25 @@ def lod_rank_table(x, level=0):


 def max_sequence_len(rank_table):
-    """
-    This function creates an operator to calculate the length of
-    max seqence through input rank_table(should be a lod_rank_table)
+    """Max Sequence Len Operator. Given a LoDRankTable object, this layer
+    returns the max length of a batch of sequences. In fact, a LoDRankTable
+    object contains a list of tuples(<sequence index, sequence length>) and
+    the list is already sorted by sequence length in descending order, so the
+    operator just returns the sequence length of the first tuple element.
+
+    Args:
+        rank_table (Variable): Input variable which is a LoDRankTable object.
+
+    Returns:
+        Variable: The max length of sequence.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10],
+                            dtype='float32', lod_level=1)
+            rank_table = layers.lod_rank_table(x=x, level=0)
+            max_seq_len = layers.max_sequence_len(rank_table)
    """
    helper = LayerHelper("max_seqence_len", **locals())
    res = helper.create_tmp_variable(dtype="int64")
@@ -428,6 +549,30 @@ def max_sequence_len(rank_table):


 def topk(input, k):
+    """
+    **topk**
+
+    This function selects the k largest entries in the input vector and
+    outputs their values and indices as vectors. Thus topk_out[j] is the
+    j-th largest entry in input, and its index is topk_indices[j].
+
+    Args:
+        input (Variable|list): The input tensor that has all the data.
+        k (int): The number of top elements that the function will pick.
+
+    Returns:
+        Variable: The variable of type array that contains the k largest entries
+                  from input.
+        Variable: The variable of type array that contains the indices of k
+                  largest entries from input.
+
+    Examples:
+        .. code-block:: python
+
+          x = fluid.layers.data(name='x', shape=[10])
+          k = 5
+          array = fluid.layers.topk(x, k)
+    """
    helper = LayerHelper('topk', **locals())
    topk_out = helper.create_tmp_variable(dtype=input.data_type)
    topk_indices = helper.create_tmp_variable(dtype='int64')

--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -270,6 +270,7 @@ def gru_unit(input,
            attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)

    # create bias
+
    if bias is None:
        bias_size = [1, 3 * size]
        bias = helper.create_parameter(
@@ -358,7 +359,59 @@ def cos_sim(X, Y, **kwargs):

 def cross_entropy(input, label, **kwargs):
    """
-    This function computes cross_entropy using the input and label.
+    **Cross Entropy Layer**
+
+    This layer computes the cross entropy between `input` and `label`. It supports
+    both standard cross-entropy and soft-label cross-entropy loss computation.
+
+    1) One-hot cross-entropy:
+        `soft_label = False`, `Label[i, 0]` indicates the class index for sample i:
+
+        .. math::
+
+            Y[i] = -\log(X[i, Label[i]])
+
+    2) Soft-label cross-entropy:
+        `soft_label = True`, `Label[i, j]` indicates the soft label of class j
+        for sample i:
+
+        .. math::
+
+            Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
+
+       Please make sure that in this case the summation of each row of `label`
+       equals one.
+
+    3) One-hot cross-entropy with vectorized `label`:
+         As a special case of 2), when each row of `label` has only one
+         non-zero element which is equal to 1, soft-label cross-entropy degenerates
+         to a one-hot cross-entropy with one-hot label representation.
+
+    Args:
+        input (Variable|list):  a 2-D tensor with shape [N x D], where N is the
+            batch size and D is the number of classes. This input is a probability
+            computed by the previous operator, which is almost always the result
+            of a softmax operator.
+        label (Variable|list): the ground truth which is a 2-D tensor. When
+              `soft_label` is set to `False`, `label` is a tensor<int64> with shape
+              [N x 1]. When `soft_label` is set to `True`, `label` is a
+              tensor<float/double> with shape [N x D].
+        soft_label (bool, via `**kwargs`): a flag indicating whether to interpret
+              the given labels as soft labels, default `False`.
+
+    Returns:
+         A 2-D tensor with shape [N x 1], the cross entropy loss.
+
+    Raises:
+        `ValueError`: 1) the 1st dimension of `input` and `label` are not equal; 2) when \
+              `soft_label == True`, and the 2nd dimension of `input` and `label` are not \
+               equal; 3) when `soft_label == False`, and the 2nd dimension of `label` is not 1.
+
+    Examples:
+        .. code-block:: python
+
+          predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+          cost = fluid.layers.cross_entropy(input=predict, label=label)
    """
    helper = LayerHelper('cross_entropy', **kwargs)
    out = helper.create_tmp_variable(dtype=input.dtype)
@@ -373,8 +426,36 @@ def cross_entropy(input, label, **kwargs):

 def square_error_cost(input, label, **kwargs):
    """
-    This functions returns the squared error cost using the input and label.
-    The output is appending the op to do the above.
+    **Square error cost layer**
+
+    This layer accepts input predictions and target label and returns the squared error cost.
+    For predictions, :math:`X`, and target labels, :math:`Y`, the equation is:
+
+    .. math::
+
+        Out = (X - Y)^2
+
+    In the above equation:
+
+        * :math:`X`: Input predictions, a tensor.
+        * :math:`Y`: Input labels, a tensor.
+        * :math:`Out`: Output value, same shape with :math:`X`.
+
+    Args:
+       input(Variable): Input tensor, has predictions.
+       label(Variable): Label tensor, has target labels.
+
+    Returns:
+        Variable: The tensor variable storing the element-wise squared error difference \
+                  of input and label.
+
+    Examples:
+        .. code-block:: python
+
+          y = layers.data(name='y', shape=[1], dtype='float32')
+          y_predict = layers.data(name='y_predict', shape=[1], dtype='float32')
+          cost = layers.square_error_cost(input=y_predict, label=y)
+
    """
    helper = LayerHelper('square_error_cost', **kwargs)
    minus_out = helper.create_tmp_variable(dtype=input.dtype)
@@ -514,14 +595,83 @@ def conv2d(input,
           groups=None,
           param_attr=None,
           bias_attr=None,
-           act=None,
-           name=None):
+           act=None):
    """
-    This function creates the op for a 2-dimensional Convolution.
-    This is performed using the parameters of filters(size, dimensionality etc)
-    , stride and other configurations for a Convolution operation.
-    This funciton can also append an activation on top of the
-    conv-2d output, if mentioned in the input parameters.
+    **Convolution2D Layer**
+
+    The convolution2D layer calculates the output based on the input, filter
+    and strides, paddings, dilations, groups parameters. Input(Input) and Output(Output)
+    are in NCHW format, where N is batch size, C is the number of channels, H is the height
+    of the feature, and W is the width of the feature.
+    For details of the convolution layer, please refer to UFLDL's `convolution,
+    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_ .
+    If a bias attribute and an activation type are provided, bias is added to the output of the convolution,
+    and the corresponding activation function is applied to the final result.
+    For each input :math:`X`, the equation is:
+
+
+    .. math::
+
+        Out = \\sigma (W \\ast X + b)
+
+    In the above equation:
+
+        * :math:`X`: Input value, a tensor with NCHW format.
+        * :math:`W`: Filter value, a tensor with MCHW format.
+        * :math:`\\ast`: Convolution operation.
+        * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+        * :math:`\\sigma`: Activation function.
+        * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        Input:
+            Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
+
+            Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
+
+        Output:
+            Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
+        Where
+    .. math::
+
+        H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+        W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+
+    Args:
+        input(Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of filters. It is the same as the number of
+            output image channels.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: stride = 1.
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding. Default: padding = 0.
+        groups(int): The groups number of the Conv2d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1
+        param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None
+        bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
+        act(str): Activation type. Default: None
+
+    Returns:
+        Variable: The tensor variable storing the convolution and \
+                  non-linearity activation result.
+
+    Raises:
+        ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
+          conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu")
    """

    if stride is None:
@@ -1018,25 +1168,26 @@ def lstm_unit(x_t,

        .. math::

-            i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
+            i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i)

-            f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
+            f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + b_f)

-            c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)
+            c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t + W_{h_c}h_{t-1} + b_c)

-            o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
+            o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + b_o)

            h_t & = o_t tanh(c_t)

-    The inputs of lstm unit includes :math:`x_t`, :math:`h_{t-1}` and
-    :math:`c_{t-1}`. The implementation separates the linear transformation
-    and non-linear transformation apart. Here, we take :math:`i_t` as an
-    example. The linear transformation is applied by calling a `fc` layer and
-    the equation is:
+    The inputs of lstm unit include :math:`x_t`, :math:`h_{t-1}` and
+    :math:`c_{t-1}`. The 2nd dimensions of :math:`h_{t-1}` and :math:`c_{t-1}`
+    should be the same. The implementation separates the linear transformation and
+    non-linear transformation apart. Here, we take :math:`i_t` as an example.
+    The linear transformation is applied by calling a `fc` layer and the
+    equation is:

        .. math::

-            L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i
+            L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i

    The non-linear transformation is applied by calling `lstm_unit_op` and the
    equation is:
@@ -1048,9 +1199,12 @@ def lstm_unit(x_t,
    This layer has two outputs including :math:`h_t` and :math:`o_t`.

    Args:
-        x_t (Variable): The input value of current step.
-        hidden_t_prev (Variable): The hidden value of lstm unit.
-        cell_t_prev (Variable): The cell value of lstm unit.
+        x_t (Variable): The input value of current step, a 2-D tensor with shape
+            M x N, M for batch size and N for input size.
+        hidden_t_prev (Variable): The hidden value of lstm unit, a 2-D tensor
+            with shape M x S, M for batch size and S for size of lstm unit.
+        cell_t_prev (Variable): The cell value of lstm unit, a 2-D tensor with
+            shape M x S, M for batch size and S for size of lstm unit.
        forget_bias (float): The forget bias of lstm unit.
        param_attr (ParamAttr): The attributes of parameter weights, used to set
            initializer, name etc.
@@ -1063,14 +1217,15 @@ def lstm_unit(x_t,
    Raises:
        ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**\
                not be 2 or the 1st dimensions of **x_t**, **hidden_t_prev** \
-                and **cell_t_prev** not be the same.
+                and **cell_t_prev** not be the same or the 2nd dimensions of \
+                **hidden_t_prev** and **cell_t_prev** not be the same.

    Examples:

        .. code-block:: python

             x_t = fluid.layers.fc(input=x_t_data, size=10)
-             prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=20)
+             prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=30)
             prev_cell = fluid.layers.fc(input=prev_cell_data, size=30)
             hidden_value, cell_value = fluid.layers.lstm_unit(x_t=x_t,
                                                    hidden_t_prev=prev_hidden,
@@ -1089,7 +1244,11 @@ def lstm_unit(x_t,

    if x_t.shape[0] != hidden_t_prev.shape[0] or x_t.shape[
            0] != cell_t_prev.shape[0]:
-        raise ValueError("The 1s dimension of x_t, hidden_t_prev and "
+        raise ValueError("The 1st dimensions of x_t, hidden_t_prev and "
+                         "cell_t_prev must be the same.")
+
+    if hidden_t_prev.shape[1] != cell_t_prev.shape[1]:
+        raise ValueError("The 2nd dimensions of hidden_t_prev and "
                         "cell_t_prev must be the same.")

    if bias_attr is None:

--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
@@ -201,15 +201,47 @@ def fill_constant_batch_size_like(input,

 def ones(shape, dtype):
    """
-    This function performs the same function as fill_constant() declared above
-    with the constant value being 1.0.
+    **ones**
+
+    This function creates a tensor of specified *shape* and
+    *dtype*, and initializes this with 1.
+
+    It also sets *stop_gradient* to True.
+
+    Args:
+        shape(tuple|list|None): Shape of output tensor
+        dtype(np.dtype|core.DataType|str): Data type of output tensor
+
+    Returns:
+        Variable: The tensor variable storing the output
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.ones(shape=[1], dtype='int64')
    """
    return fill_constant(value=1.0, **locals())


 def zeros(shape, dtype):
    """
-    This function performs the same function as fill_constant() declared above
-    with the constant value being 0.0.
+    **zeros**
+
+    This function creates a tensor of specified *shape* and
+    *dtype*, and initializes this with 0.
+
+    It also sets *stop_gradient* to True.
+
+    Args:
+        shape(tuple|list|None): Shape of output tensor
+        dtype(np.dtype|core.DataType|str): Data type of output tensor
+
+    Returns:
+        Variable: The tensor variable storing the output
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.zeros(shape=[1], dtype='int64')
    """
    return fill_constant(value=0.0, **locals())
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
 from collections import defaultdict

 import framework
-from backward import append_backward_ops
+from backward import append_backward
 from framework import unique_name, program_guard
 from initializer import Constant
 from layer_helper import LayerHelper
@@ -194,10 +194,10 @@ class Optimizer(object):
                 no_grad_set=None):
        """Add operations to minimize `loss` by updating `parameter_list`.

-        This method combines interface `append_backward_ops()` and
+        This method combines interface `append_backward()` and
        `create_optimization_pass()` into one.
        """
-        params_grads = append_backward_ops(loss, parameter_list, no_grad_set)
+        params_grads = append_backward(loss, parameter_list, no_grad_set)

        params_grads = append_gradient_clip_ops(params_grads)


--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
@@ -170,7 +170,7 @@ def main():

    exe.run(fluid.default_startup_program())

-    embedding_param = fluid.g_scope.find_var(embedding_name).get_tensor()
+    embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor()
    embedding_param.set(
        load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place)


--- a/python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py
+++ b/python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py
@@ -38,35 +38,43 @@ train_reader = paddle.batch(

 place = fluid.CPUPlace()
 exe = fluid.Executor(place)
+
 t = fluid.DistributeTranspiler()
+# all parameter server endpoints list for splitting parameters
 pserver_endpoints = os.getenv("PSERVERS")
+# server endpoint for current node
+current_endpoint = os.getenv("SERVER_ENDPOINT")
+# run as trainer or parameter server
 training_role = os.getenv("TRAINING_ROLE",
                          "TRAINER")  # get the training role: trainer/pserver
-t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=1)
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)

 if training_role == "PSERVER":
-    pserver_prog = t.get_pserver_program(pserver_endpoints, optimize_ops)
+    if not current_endpoint:
+        print("need env SERVER_ENDPOINT")
+        exit(1)
+    pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
    exe.run(fluid.default_startup_program())
    exe.run(pserver_prog)
 elif training_role == "TRAINER":
+    trainer_prog = t.get_trainer_program()
    feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
    exe.run(fluid.default_startup_program())

    for pass_id in range(PASS_NUM):
        accuracy.reset(exe)
+        batch_id = 0
        for data in train_reader():
-            loss, acc = exe.run(fluid.default_main_program(),
+            loss, acc = exe.run(trainer_prog,
                                feed=feeder.feed(data),
                                fetch_list=[avg_cost] + accuracy.metrics)
            pass_acc = accuracy.eval(exe)
-            # print loss, acc
-            if loss < 10.0 and pass_acc > 0.9:
-                # if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good.
-                exit(0)
+            if batch_id % 100 == 0:
+                print("batch_id %d, loss: %f, acc: %f" %
+                      (batch_id, loss, pass_acc))
+            batch_id += 1

        pass_acc = accuracy.eval(exe)
        print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
 else:
    print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
-
-exit(1)
--- a/python/paddle/v2/fluid/tests/decorators.py
+++ b/python/paddle/v2/fluid/tests/decorators.py
+import paddle.v2.fluid as fluid
+
+__all__ = ['many_times', 'prog_scope']
+
+
+def many_times(times):
+    def __impl__(fn):
+        def __fn__(*args, **kwargs):
+            for _ in range(times):
+                fn(*args, **kwargs)
+
+        return __fn__
+
+    return __impl__
+
+
+def prog_scope():
+    def __impl__(fn):
+        def __fn__(*args, **kwargs):
+            prog = fluid.Program()
+            startup_prog = fluid.Program()
+            scope = fluid.core.Scope()
+            with fluid.scope_guard(scope):
+                with fluid.program_guard(prog, startup_prog):
+                    fn(*args, **kwargs)
+
+        return __fn__
+
+    return __impl__
--- a/python/paddle/v2/fluid/tests/op_test.py
+++ b/python/paddle/v2/fluid/tests/op_test.py
@@ -4,7 +4,7 @@ import random
 import itertools
 import paddle.v2.fluid.core as core
 import collections
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 from paddle.v2.fluid.op import Operator
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.framework import Program, OpProtoHolder
@@ -491,7 +491,7 @@ class OpTest(unittest.TestCase):
            op_loss.desc.infer_var_type(block.desc)
            op_loss.desc.infer_shape(block.desc)

-        param_grad_list = append_backward_ops(
+        param_grad_list = append_backward(
            loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set)

        feed_dict = {

--- a/python/paddle/v2/fluid/tests/test_adam_op.py
+++ b/python/paddle/v2/fluid/tests/test_adam_op.py
 import unittest
 import numpy as np
 from op_test import OpTest
+from paddle.v2.fluid import core
+from paddle.v2.fluid.op import Operator


 class TestAdamOp1(OpTest):
@@ -176,5 +178,124 @@ def adam_step(inputs, attributes):
    return param_out, moment1_out, moment2_out


+def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
+    '''
+    Simulate one step of the adam optimizer on the sparse rows listed in `rows`
+    :param inputs: dict of inputs
+    :param attributes: dict of attributes
+    :return tuple: tuple of output param, moment1 and moment2; rows not listed
+    in `rows` are left as zeros
+    '''
+    param = inputs['Param']
+    # grad = inputs['Grad']
+    moment1 = inputs['Moment1']
+    moment2 = inputs['Moment2']
+    lr = inputs['LearningRate']
+    beta1_pow = inputs['Beta1Pow']
+    beta2_pow = inputs['Beta2Pow']
+
+    beta1 = attributes['beta1']
+    beta2 = attributes['beta2']
+    epsilon = attributes['epsilon']
+
+    moment1_out = np.zeros(shape=[height, row_numel])
+    moment2_out = np.zeros(shape=[height, row_numel])
+    param_out = np.zeros(shape=[height, row_numel])
+
+    for idx, row_id in enumerate(rows):
+        moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1
+                                                         ) * np_grad[idx]
+        moment2_out[row_id] = beta2 * moment2[row_id] + (
+            1 - beta2) * np.square(np_grad[idx])
+        lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
+        param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / (
+            np.sqrt(moment2_out[row_id]) + epsilon))
+    return param_out, moment1_out, moment2_out
+
+
+class TestSparseAdamOp(unittest.TestCase):
+    def setup(self, scope, place):
+        beta1 = 0.78
+        beta2 = 0.836
+        epsilon = 1e-4
+
+        height = 10
+        rows = [0, 4, 7]
+        self.rows = rows
+        row_numel = 12
+        self.row_numel = row_numel
+        self.dense_inputs = {
+            "Param": np.full((height, row_numel), 5.0).astype("float32"),
+            "Moment1": np.full((height, row_numel), 5.0).astype("float32"),
+            "Moment2": np.full((height, row_numel), 5.0).astype("float32"),
+            'Beta1Pow': np.array([beta1**10]).astype("float32"),
+            'Beta2Pow': np.array([beta2**10]).astype("float32"),
+            "LearningRate": np.full((1), 2.0).astype("float32")
+        }
+        self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
+
+        grad_selected_rows = scope.var('Grad').get_selected_rows()
+        grad_selected_rows.set_height(height)
+        grad_selected_rows.set_rows(rows)
+        np_array = np.ones((len(rows), row_numel)).astype("float32")
+        np_array[0, 0] = 2.0
+        np_array[2, 8] = 4.0
+
+        grad_tensor = grad_selected_rows.get_tensor()
+        grad_tensor.set(np_array, place)
+
+        self.sparse_inputs = ["Grad"]
+
+        param_out, mom1, mom2 = adam_step_sparse(
+            self.dense_inputs, self.attrs, height, rows, row_numel, np_array)
+        self.outputs = {
+            "ParamOut": param_out,
+            "Moment1Out": mom1,
+            "Moment2Out": mom2
+        }
+
+    def check_with_place(self, place):
+        scope = core.Scope()
+        self.setup(scope, place)
+
+        op_args = dict()
+        for key, np_array in self.dense_inputs.iteritems():
+            var = scope.var(key).get_tensor()
+            var.set(np_array, place)
+            op_args[key] = key
+        for s in self.sparse_inputs:
+            op_args[s] = s
+        for s in self.outputs:
+            var = scope.var(s).get_tensor()
+            var.set(self.outputs[s], place)
+            op_args[s] = s
+        for k in self.attrs:
+            op_args[k] = self.attrs[k]
+
+        # create and run adam operator
+        adam_op = Operator("adam", **op_args)
+        adam_op.run(scope, place)
+
+        for key, np_array in self.outputs.iteritems():
+            out_var = scope.var(key).get_tensor()
+            actual = np.array(out_var)
+            actual = actual.reshape([actual.size])
+            np_array = np_array.reshape([np_array.size])
+            for idx, row_id in enumerate(self.rows):
+                j = 0
+                while j < self.row_numel:
+                    pos = row_id * self.row_numel + j
+                    self.assertLess((actual[pos] - np_array[pos]) / actual[pos],
+                                    0.00001)
+                    j += 1
+
+    def test_sparse_sgd(self):
+        places = [core.CPUPlace()]
+        if core.is_compile_gpu():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.check_with_place(place)
+
+
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_array_read_write_op.py
+++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
@@ -2,7 +2,7 @@ import unittest
 import paddle.v2.fluid.core as core
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 from paddle.v2.fluid.framework import default_main_program
 import numpy

@@ -64,7 +64,7 @@ class TestArrayReadWrite(unittest.TestCase):
        total_sum = layers.sums(input=[a_sum, x_sum])
        total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0)

-        append_backward_ops(total_sum_scaled)
+        append_backward(total_sum_scaled)

        g_vars = map(default_main_program().global_block().var,
                     [each_x.name + "@GRAD" for each_x in x])

--- a/python/paddle/v2/fluid/tests/test_conditional_block.py
+++ b/python/paddle/v2/fluid/tests/test_conditional_block.py
@@ -3,7 +3,7 @@ import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.framework import default_startup_program, default_main_program
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 import numpy


@@ -26,7 +26,7 @@ class ConditionalBlock(unittest.TestCase):
        outs = exe.run(feed={'X': x}, fetch_list=[out])[0]
        print outs
        loss = layers.mean(x=out)
-        append_backward_ops(loss=loss)
+        append_backward(loss=loss)
        outs = exe.run(
            feed={'X': x},
            fetch_list=[

--- a/python/paddle/v2/fluid/tests/test_detection_output_op.py
+++ b/python/paddle/v2/fluid/tests/test_detection_output_op.py
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestUnpoolOp(OpTest):
+    def setUp(self):
+        self.op_type = "detection_output"
+        self.init_test_case()
+
+        #loc.shape ((1, 4, 4, 1, 1))
+        #conf.shape ((1, 4, 2, 1, 1))
+
+        loc = np.array([[[[[0.1]], [[0.1]], [[0.1]], [[0.1]]],
+                         [[[0.1]], [[0.1]], [[0.1]], [[0.1]]],
+                         [[[0.1]], [[0.1]], [[0.1]], [[0.1]]],
+                         [[[0.1]], [[0.1]], [[0.1]], [[0.1]]]]])
+        conf = np.array([[[[[0.1]], [[0.9]]], [[[0.2]], [[0.8]]],
+                          [[[0.3]], [[0.7]]], [[[0.4]], [[0.6]]]]])
+        priorbox = np.array([
+            0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.6, 0.6, 0.1,
+            0.1, 0.2, 0.2, 0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2, 0.4, 0.4,
+            0.8, 0.8, 0.1, 0.1, 0.2, 0.2
+        ])
+
+        output = np.array([
+            0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031
+        ])
+        self.inputs = {
+            'Loc': loc.astype('float32'),
+            'Conf': conf.astype('float32'),
+            'PriorBox': priorbox.astype('float32')
+        }
+        self.attrs = {
+            'num_classes': self.num_classes,
+            'top_k': self.top_k,
+            'nms_top_k': self.nms_top_k,
+            'background_label_id': self.background_label_id,
+            'nms_threshold': self.nms_threshold,
+            'confidence_threshold': self.confidence_threshold,
+        }
+        self.outputs = {'Out': output.astype('float32')}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def init_test_case(self):
+        self.num_classes = 2
+        self.top_k = 10
+        self.nms_top_k = 20
+        self.background_label_id = 0
+        self.nms_threshold = 0.01
+        self.confidence_threshold = 0.01
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py
+++ b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py
+import numpy
+import random
+import collections
+import paddle.v2.fluid as fluid
+import unittest
+from decorators import *
+
+
+class Memory(object):
+    def __init__(self, shape, dtype='float32'):
+        self.ex = numpy.zeros(shape=shape, dtype=dtype)
+        self.cur = None
+
+    def update(self, val):
+        assert val.shape == self.ex.shape
+        assert val.dtype == self.ex.dtype
+        self.cur = val
+
+    def ex(self):
+        return self.ex
+
+    def next(self):
+        self.ex = self.cur
+        self.cur = None
+
+    def __next__(self):
+        self.next()
+
+    def reset(self):
+        self.ex = numpy.zeros(shape=self.ex.shape, dtype=self.ex.dtype)
+        self.cur = None
+
+
+class Output(object):
+    def __init__(self):
+        self.outs = []
+
+    def next_sequence(self):
+        self.outs.append([])
+
+    def out(self, val):
+        self.outs[-1].append(val)
+
+    def last(self):
+        return self.outs[-1][-1]
+
+
+class BaseRNN(object):
+    def __init__(self, ins, mems, params, outs, num_seq=5, max_seq_len=15):
+        self.num_seq = num_seq
+        self.inputs = collections.defaultdict(list)
+
+        for _ in xrange(num_seq):
+            seq_len = random.randint(1, max_seq_len - 1)
+            for iname in ins:
+                ishape = ins[iname].get('shape', None)
+                idtype = ins[iname].get('dtype', 'float32')
+                lst = []
+                for _ in xrange(seq_len):
+                    lst.append(numpy.random.random(size=ishape).astype(idtype))
+                self.inputs[iname].append(lst)
+
+        self.mems = dict()
+        for mname in mems:
+            mshape = mems[mname].get('shape', None)
+            mdtype = mems[mname].get('dtype', 'float32')
+            self.mems[mname] = Memory(shape=mshape, dtype=mdtype)
+
+        self.params = dict()
+        for pname in params:
+            pshape = params[pname].get('shape', None)
+            pdtype = params[pname].get('dtype', 'float32')
+            self.params[pname] = numpy.random.random(size=pshape).astype(pdtype)
+
+        self.outputs = dict()
+
+        for oname in outs:
+            self.outputs[oname] = Output()
+
+    def step(self, **kwargs):
+        raise NotImplementedError()
+
+    def exe(self):
+        retv = dict()
+        for out in self.outputs:
+            retv[out] = []
+
+        for seq_id in xrange(self.num_seq):
+            for mname in self.mems:
+                self.mems[mname].reset()
+            for out in self.outputs:
+                self.outputs[out].next_sequence()
+
+            iname0 = self.inputs.keys()[0]
+            seq_len = len(self.inputs[iname0][seq_id])
+
+            for step_id in xrange(seq_len):
+                xargs = dict()
+
+                for iname in self.inputs:
+                    xargs[iname] = self.inputs[iname][seq_id][step_id]
+
+                for mname in self.mems:
+                    xargs[mname] = self.mems[mname]
+
+                for pname in self.params:
+                    xargs[pname] = self.params[pname]
+
+                for out in self.outputs:
+                    xargs[out] = self.outputs[out]
+
+                self.step(**xargs)
+
+                for mname in self.mems:
+                    next(self.mems[mname])
+
+            for out in self.outputs:
+                retv[out].append(self.outputs[out].last())
+
+        for out in retv:
+            retv[out] = numpy.array(retv[out])
+        return retv
+
+    def to_feed(self, place):
+        """Build the feed dictionary for the fluid executor.
+
+        Each input's sequences are flattened into one tensor set on
+        ``place``; a single-level LoD made of the cumulative sequence
+        lengths is attached to it. Parameters are added as stored.
+
+        Args:
+            place: the place passed to ``Tensor.set``.
+
+        Returns:
+            dict mapping input names to tensors and parameter names to the
+            stored parameter values.
+        """
+        feed = dict()
+
+        for iname in self.inputs:
+            offsets = [0]
+            flat_steps = []
+            for sequence in self.inputs[iname]:
+                # Each offset is the running total of sequence lengths.
+                offsets.append(offsets[-1] + len(sequence))
+                flat_steps.extend(sequence)
+
+            tensor = fluid.Tensor()
+            tensor.set(numpy.array(flat_steps), place)
+            tensor.set_lod([offsets])
+            feed[iname] = tensor
+
+        for pname in self.params:
+            feed[pname] = self.params[pname]
+        return feed
+
+    def get_numeric_gradient_of_param(self, param_name, delta=0.001):
+        """Estimate d(mean output)/d(parameter) by central differences.
+
+        Every element of the parameter is perturbed in place by ``+delta``
+        and ``-delta``, the network is re-run each time, and the element is
+        restored afterwards.
+
+        Args:
+            param_name: key of the parameter in ``self.params``.
+            delta: perturbation size.
+
+        Returns:
+            numpy array with the parameter's shape and dtype.
+
+        Raises:
+            ValueError: if the parameter is not 2-D.
+        """
+        weight = self.params[param_name]
+        if len(weight.shape) != 2:
+            raise ValueError("Not support get numeric gradient of an parameter,"
+                             " which is not matrix")
+        n_rows, n_cols = weight.shape
+        grad = numpy.zeros(shape=weight.shape, dtype=weight.dtype)
+
+        for r in xrange(n_rows):
+            for c in xrange(n_cols):
+                saved = weight[r, c]
+                weight[r, c] += delta
+                plus = self._exe_mean_out_()
+                weight[r, c] -= 2 * delta
+                minus = self._exe_mean_out_()
+                # Restore the exact original value before moving on.
+                weight[r, c] = saved
+                grad[r, c] = (plus - minus) / (delta * 2)
+        return grad
+
+    def get_numeric_gradient_of_input(self,
+                                      input_name,
+                                      delta=0.001,
+                                      return_one_tensor=True):
+        """Estimate d(mean output)/d(input) by central differences.
+
+        Every element of every step item is perturbed in place by
+        ``+delta`` and ``-delta``, the network is re-run each time, and the
+        element is restored afterwards.
+
+        Args:
+            input_name: key of the input in ``self.inputs``.
+            delta: perturbation size.
+            return_one_tensor: when true, flatten all per-item gradients
+                into a single 1-D array; otherwise return the nested list
+                (one list per sequence, one array per step item).
+
+        Raises:
+            ValueError: if a step item is not 1-D.
+        """
+        per_sequence = []
+
+        for sequence in self.inputs[input_name]:
+            per_step = []
+            for vec in sequence:
+                if len(vec.shape) != 1:
+                    raise ValueError("Not support")
+                vec_grad = numpy.zeros(shape=vec.shape, dtype=vec.dtype)
+
+                for k in xrange(len(vec)):
+                    saved = vec[k]
+                    vec[k] += delta
+                    plus = self._exe_mean_out_()
+                    vec[k] -= 2 * delta
+                    minus = self._exe_mean_out_()
+                    # Restore the exact original value before moving on.
+                    vec[k] = saved
+                    vec_grad[k] = (plus - minus) / (delta * 2)
+                per_step.append(vec_grad)
+            per_sequence.append(per_step)
+
+        if return_one_tensor:
+            # Flatten steps within each sequence, then the sequences.
+            return numpy.concatenate(
+                [numpy.concatenate(steps) for steps in per_sequence])
+        return per_sequence
+
+    def _exe_mean_out_(self):
+        """Run ``exe`` and return the mean of the per-output means."""
+        per_output_means = [
+            value.mean() for value in self.exe().itervalues()
+        ]
+        return numpy.array(per_output_means).mean()
+
+
+class TestSimpleMul(unittest.TestCase):
+    """Check a memory-less DynamicRNN (one bias-free fc per step) against
+    the pure-Python ``BaseRNN`` reference, for both the forward output and
+    the gradients."""
+    DATA_NAME = 'X'
+    DATA_WIDTH = 32
+    PARAM_NAME = 'W'
+    HIDDEN_WIDTH = 10
+    OUT_NAME = 'Out'
+
+    class SimpleMul(BaseRNN):
+        """Python reference: each step outputs ``X . W``."""
+
+        def __init__(self):
+            base = TestSimpleMul
+            # Positional arguments, in order: inputs, memories (none),
+            # parameters, output names.
+            super(base.SimpleMul, self).__init__({
+                base.DATA_NAME: {
+                    'shape': [base.DATA_WIDTH]
+                }
+            }, {}, {
+                base.PARAM_NAME: {
+                    'shape': [base.DATA_WIDTH, base.HIDDEN_WIDTH]
+                }
+            }, [base.OUT_NAME])
+
+        def step(self, X, W, Out):
+            Out.out(numpy.matmul(X, W))
+
+    # Run many times locally to make sure the random seed cannot break CI.
+    # @many_times(10)
+    @prog_scope()
+    def test_forward_backward(self):
+        py_rnn = TestSimpleMul.SimpleMul()
+        dat = fluid.layers.data(
+            name=self.DATA_NAME, shape=[self.DATA_WIDTH], lod_level=1)
+        # The input gradient is fetched below, so it must be computed.
+        dat.stop_gradient = False
+
+        rnn = fluid.layers.DynamicRNN()
+        with rnn.block():
+            d = rnn.step_input(dat)
+            # The fc weight is named PARAM_NAME so the reference parameter
+            # can be fed in and its gradient fetched by name.
+            o = fluid.layers.fc(input=d,
+                                param_attr=self.PARAM_NAME,
+                                bias_attr=False,
+                                size=self.HIDDEN_WIDTH,
+                                act=None)
+            rnn.output(o)
+
+        out = rnn()
+        # Keep only the last step of each sequence, as the reference does.
+        out = fluid.layers.sequence_pool(out, pool_type='last')
+        loss = fluid.layers.mean(x=out)
+        fluid.backward.append_backward(loss)
+
+        cpu = fluid.CPUPlace()
+        exe = fluid.Executor(cpu)
+        # Fetch the forward output plus the gradients of W and X.
+        out, w_g, i_g = map(numpy.array,
+                            exe.run(feed=py_rnn.to_feed(cpu),
+                                    fetch_list=[
+                                        out, self.PARAM_NAME + "@GRAD",
+                                        self.DATA_NAME + "@GRAD"
+                                    ],
+                                    return_numpy=False))
+        out_by_python = py_rnn.exe()[self.OUT_NAME]
+        self.assertTrue(numpy.allclose(out, out_by_python))
+        # Compare the fetched gradients with numeric estimates.
+        w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME)
+        self.assertTrue(numpy.allclose(w_g_num, w_g, rtol=0.05))
+        i_g_num = py_rnn.get_numeric_gradient_of_input(
+            input_name=self.DATA_NAME)
+        i_g_num = i_g_num.reshape(i_g.shape)
+        self.assertTrue(numpy.allclose(i_g_num, i_g, rtol=0.05))
+
+
+class TestSimpleMulWithMemory(unittest.TestCase):
+    """Check a DynamicRNN with one memory (fc output added to the memory
+    each step) against the pure-Python ``BaseRNN`` reference, for both the
+    forward output and the gradients."""
+    DATA_WIDTH = 32
+    HIDDEN_WIDTH = 20
+    DATA_NAME = 'X'
+    PARAM_NAME = 'W'
+
+    class SimpleMulWithMemory(BaseRNN):
+        """Python reference: each step outputs ``X . W + Mem.ex`` and
+        stores that value back into the memory."""
+
+        def __init__(self):
+            # Positional arguments, in order: inputs, memories,
+            # parameters, output names.
+            super(TestSimpleMulWithMemory.SimpleMulWithMemory, self).__init__({
+                TestSimpleMulWithMemory.DATA_NAME: {
+                    'shape': [TestSimpleMulWithMemory.DATA_WIDTH]
+                }
+            }, {'Mem': {
+                'shape': [TestSimpleMulWithMemory.HIDDEN_WIDTH]
+            }}, {
+                TestSimpleMulWithMemory.PARAM_NAME: {
+                    'shape': [
+                        TestSimpleMulWithMemory.DATA_WIDTH,
+                        TestSimpleMulWithMemory.HIDDEN_WIDTH
+                    ]
+                }
+            }, ['Out'])
+
+        def step(self, X, Mem, W, Out):
+            o = numpy.matmul(X, W)
+            assert isinstance(Mem, Memory)
+            # NOTE(review): Mem.ex is presumably the previous step's
+            # value; confirm against the Memory class.
+            o += Mem.ex
+            Mem.update(o)
+            assert isinstance(Out, Output)
+            Out.out(o)
+
+    # many_times is used locally for debugging, to make sure the
+    # calculation is stable.
+    # @many_times(10)
+    @prog_scope()
+    def test_forward_backward(self):
+        py_rnn = TestSimpleMulWithMemory.SimpleMulWithMemory()
+        data = fluid.layers.data(
+            name=self.DATA_NAME, shape=[self.DATA_WIDTH], lod_level=1)
+        # The input gradient is fetched below, so it must be computed.
+        data.stop_gradient = False
+        rnn = fluid.layers.DynamicRNN()
+        with rnn.block():
+            d = rnn.step_input(data)
+            mem = rnn.memory(value=0.0, shape=[self.HIDDEN_WIDTH])
+            hidden = fluid.layers.fc(input=d,
+                                     size=self.HIDDEN_WIDTH,
+                                     param_attr=self.PARAM_NAME,
+                                     bias_attr=False,
+                                     act=None)
+            # Mirrors the reference step: output = fc(x) + memory, and
+            # the memory is updated with that output.
+            o = fluid.layers.elementwise_add(x=hidden, y=mem)
+            rnn.update_memory(mem, o)
+            rnn.output(o)
+
+        out = rnn()
+        last = fluid.layers.sequence_pool(input=out, pool_type='last')
+        loss = fluid.layers.mean(x=last)
+        fluid.backward.append_backward(loss)
+
+        cpu = fluid.CPUPlace()
+        exe = fluid.Executor(cpu)
+        feed = py_rnn.to_feed(cpu)
+        # Fetch the forward output plus the gradients of W and X.
+        last_np, w_g, i_g = map(numpy.array,
+                                exe.run(feed=feed,
+                                        fetch_list=[
+                                            last, self.PARAM_NAME + "@GRAD",
+                                            self.DATA_NAME + "@GRAD"
+                                        ],
+                                        return_numpy=False))
+        # The reference has exactly one output, unpacked here.
+        last_by_py, = py_rnn.exe().values()
+        w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME)
+        self.assertTrue(numpy.allclose(last_np, last_by_py))
+
+        self.assertTrue(numpy.allclose(w_g_num, w_g, rtol=0.1))
+        i_g_num = py_rnn.get_numeric_gradient_of_input(self.DATA_NAME)
+        i_g_num = i_g_num.reshape(i_g.shape)
+
+        # This RNN performs many float additions, so the result may be
+        # numerically unstable; hence the looser tolerance.
+        # rtol = 0.1
+        self.assertTrue(numpy.allclose(i_g_num, i_g, rtol=0.1))
+
+
+# Run the test cases of this module when it is executed as a script.
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -177,8 +177,8 @@ class TestBook(unittest.TestCase):
                name='x_t_data', shape=[10, 10], dtype='float32')
            x_t = layers.fc(input=x_t_data, size=10)
            prev_hidden_data = layers.data(
-                name='prev_hidden_data', shape=[10, 20], dtype='float32')
-            prev_hidden = layers.fc(input=prev_hidden_data, size=20)
+                name='prev_hidden_data', shape=[10, 30], dtype='float32')
+            prev_hidden = layers.fc(input=prev_hidden_data, size=30)
            prev_cell_data = layers.data(
                name='prev_cell', shape=[10, 30], dtype='float32')
            prev_cell = layers.fc(input=prev_cell_data, size=30)

--- a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
+++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
@@ -4,7 +4,7 @@ import numpy
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.framework import Program, program_guard
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward


 class TestCPULoDTensorArrayOps(unittest.TestCase):
@@ -170,7 +170,7 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):

            mean = layers.mean(x=result)

-            append_backward_ops(mean)
+            append_backward(mean)

        tensor = core.LoDTensor()
        tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place)

--- a/python/paddle/v2/fluid/tests/test_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_norm_op.py
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def norm(input, scale, epsilon):
+    """Reference cross-channel L2 normalization with a per-channel scale.
+
+    For every batch item and spatial position the values across the
+    channel axis are divided by ``sqrt(sum(x ** 2) + epsilon)`` and then
+    multiplied by the scale of their channel.
+
+    The previous loop-based version tiled ``scale`` in an interleaved
+    order, so channel ``c`` was not consistently multiplied by
+    ``scale[c]`` unless all scales were equal. It also failed on an empty
+    batch. Both are fixed here by broadcasting.
+
+    Args:
+        input: 4-D array; axis 1 is the axis that is normalized over.
+        scale: array-like with ``input.shape[1]`` elements, one per channel.
+        epsilon: value added to the sum of squares before the square root.
+
+    Returns:
+        Array with the same shape as ``input``.
+    """
+    s0, s1, s2, s3 = input.shape
+    # Sum of squares over the channel axis, kept as a size-1 axis so it
+    # broadcasts back against the input.
+    square_sum = (input * input).sum(axis=1, keepdims=True) + epsilon
+    inv_norm = np.reciprocal(np.sqrt(square_sum))
+    # One scale per channel, broadcast over batch and spatial axes.
+    channel_scale = np.reshape(scale, (1, s1, 1, 1))
+    return input * inv_norm * channel_scale
+
+
+class TestNormOp(OpTest):
+    """Compare the ``norm`` operator with the Python reference ``norm``."""
+
+    def init_test_case(self):
+        # Input shape and epsilon used by setUp.
+        self.shape = [2, 3, 2, 2]
+        self.epsilon = 1e-6
+
+    def setUp(self):
+        self.op_type = "norm"
+        self.init_test_case()
+        x = np.random.random(self.shape).astype("float32")
+        # One scale value per channel (axis 1 of the shape above).
+        channel_scale = np.array([10, 10, 10])
+        expected = norm(x, channel_scale, self.epsilon)
+        self.inputs = {
+            'X': x.astype('float32'),
+            'Scale': channel_scale.astype('float32')
+        }
+        self.attrs = {'epsilon': self.epsilon}
+        self.outputs = {'Out': expected.astype('float32')}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+# Run the test cases of this module when it is executed as a script.
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_optimizer.py
+++ b/python/paddle/v2/fluid/tests/test_optimizer.py
@@ -2,7 +2,7 @@ import unittest

 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.optimizer as optimizer
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward


 class TestOptimizer(unittest.TestCase):
@@ -102,7 +102,7 @@ class TestMomentumOptimizer(unittest.TestCase):
            dtype="float32", shape=[1], lod_level=0, name="mean.out")
        block.append_op(
            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
        opts = momentum_optimizer.create_optimization_pass(
@@ -151,7 +151,7 @@ class TestMomentumOptimizer(unittest.TestCase):
        learning_rate = 0.01
        momentum_optimizer = self.MockMomentum(
            learning_rate=learning_rate, momentum=0.2, use_nesterov=True)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
        opts = momentum_optimizer.create_optimization_pass(
@@ -209,7 +209,7 @@ class TestAdagradOptimizer(unittest.TestCase):
        learning_rate = 0.01
        adagrad_optimizer = self.MockAdagrad(
            learning_rate=learning_rate, epsilon=1.0e-6)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
        opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
@@ -269,7 +269,7 @@ class TestAdamOptimizer(unittest.TestCase):
        learning_rate = 0.01
        adam_optimizer = self.MockAdam(
            learning_rate=learning_rate, beta1=0.9, beta2=0.999)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
        opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
@@ -331,7 +331,7 @@ class TestAdamaxOptimizer(unittest.TestCase):
        learning_rate = 0.01
        adamax_optimizer = self.MockAdamax(
            learning_rate=learning_rate, beta1=0.9, beta2=0.999)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
        opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
@@ -390,7 +390,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
        learning_rate = 0.01
        decayed_adagrad_optimizer = self.MockDecayedAdagrad(
            learning_rate=learning_rate, decay=0.95, epsilon=1.0e-6)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
        opts = decayed_adagrad_optimizer.create_optimization_pass(

--- a/python/paddle/v2/fluid/tests/test_recurrent_op.py
+++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py
@@ -3,7 +3,7 @@ import unittest
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.framework import Program, grad_var_name
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 import numpy as np
 import paddle.v2.fluid.core as core

@@ -177,7 +177,7 @@ class RecurrentOpTest1(unittest.TestCase):
    def test_backward(self):
        self.check_forward()

-        append_backward_ops(self.output)
+        append_backward(self.output)

        ana_grad = [np.array(x) for x in self.backward()]


--- a/python/paddle/v2/fluid/tests/test_regularizer.py
+++ b/python/paddle/v2/fluid/tests/test_regularizer.py
@@ -3,7 +3,7 @@ import unittest
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.optimizer as optimizer
 import paddle.v2.fluid.regularizer as regularizer
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward


 class TestL2DecayRegularizer(unittest.TestCase):
@@ -33,7 +33,7 @@ class TestL2DecayRegularizer(unittest.TestCase):
            dtype="float32", shape=[1], lod_level=0, name="mean.out")
        block.append_op(
            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        count_ops = len(block.ops)
        params_grads = optimizer.append_regularization_ops(params_grads)
@@ -70,7 +70,7 @@ class TestL1DecayRegularizer(unittest.TestCase):
            dtype="float32", shape=[1], lod_level=0, name="mean.out")
        block.append_op(
            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        count_ops = len(block.ops)
        params_grads = optimizer.append_regularization_ops(params_grads)

--- a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
+++ b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
@@ -12,7 +12,7 @@ class TestReorderLoDTensor(unittest.TestCase):
        new_dat = fluid.layers.reorder_lod_tensor_by_rank(
            x=dat, rank_table=table)
        loss = fluid.layers.mean(x=new_dat)
-        fluid.backward.append_backward_ops(loss=loss)
+        fluid.backward.append_backward(loss=loss)

        cpu = fluid.CPUPlace()
        exe = fluid.Executor(cpu)

--- a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
+++ b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
@@ -2,7 +2,7 @@ import unittest

 from paddle.v2.fluid.framework import Program
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 import numpy as np
 import paddle.v2.fluid.core as core


--- a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
+++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
@@ -2,7 +2,7 @@ import unittest
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.executor import Executor
 import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 from paddle.v2.fluid.framework import default_main_program
 import numpy

@@ -35,7 +35,7 @@ class TestShrinkRNNMemory(unittest.TestCase):
        self.assertTrue(numpy.allclose(tensor_np[0:1], outs[2]))

        mem3_mean = layers.mean(x=mem3)
-        append_backward_ops(loss=mem3_mean)
+        append_backward(loss=mem3_mean)
        x_grad = exe.run(
            feed={'x': tensor},
            fetch_list=[main_program.global_block().var('x@GRAD')])[0]

--- a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
@@ -4,7 +4,7 @@ import numpy as np
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.framework import Program, program_guard
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward


 class TestCPULoDTensorArrayOps(unittest.TestCase):
@@ -133,7 +133,7 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
                in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
            mean = layers.mean(x=out)

-            append_backward_ops(mean)
+            append_backward(mean)

        tensor = core.LoDTensor()
        tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place)

--- a/python/paddle/v2/fluid/tests/test_while_op.py
+++ b/python/paddle/v2/fluid/tests/test_while_op.py
@@ -2,7 +2,7 @@ import unittest
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
 import paddle.v2.fluid.core as core
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 import numpy


@@ -46,7 +46,7 @@ class TestWhileOp(unittest.TestCase):
        sum_result = layers.array_read(array=mem_array, i=i)
        loss = layers.mean(x=sum_result)

-        append_backward_ops(loss)
+        append_backward(loss)

        cpu = core.CPUPlace()
        exe = Executor(cpu)