diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md
index 6cc9598947acbdacfbf4c4379987bab8ed7611b0..084d3237d9cfe9ca4837f77cf5f70a2449cfcc03 100644
--- a/benchmark/IntelOptimizedPaddle.md
+++ b/benchmark/IntelOptimizedPaddle.md
@@ -93,6 +93,15 @@ Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
 | MKLML     | 22.74  | 41.56  | 81.22  | 133.47 | 210.53 |
 | MKL-DNN   | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |
 
+- Alexnet
+
+| BatchSize | 1      | 2      | 4      | 8      | 16     |
+|-----------|--------|--------|--------|--------|--------|
+| OpenBLAS  |    |   |   |   |    |
+| MKLML     | 21.32  | 36.55  | 73.06  | 131.15 | 192.77 |
+| MKL-DNN   | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 |
+
+chart TBD
 
 ### Laptop
 TBD
diff --git a/benchmark/paddle/image/alexnet.py b/benchmark/paddle/image/alexnet.py
index 77d130ae34059d1e87040d00346ac1dadd86b0d8..cad6051f1413a5bb95f87a940f3aa81e49e5d282 100644
--- a/benchmark/paddle/image/alexnet.py
+++ b/benchmark/paddle/image/alexnet.py
@@ -19,7 +19,11 @@ args = {
     'num_samples': num_samples
 }
 define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
+    "train.list" if not is_infer else None,
+    "test.list" if is_infer else None,
+    module="provider",
+    obj="process",
+    args=args)
 
 settings(
     batch_size=batch_size,
diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh
index da034f3b9dff794e22086a5295ad2b0c2361c356..71a49231a5527ebee9f45d5f4650ce2a4f6a1c31 100755
--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ b/benchmark/paddle/image/run_openblas_infer.sh
@@ -8,15 +8,19 @@ function clock_to_seconds() {
 }
 
 function infer() {
-  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
   topology=$1
   layer_num=$2
   bs=$3
-  thread=`nproc`
-  if [ $thread -gt $bs ]; then
-    thread=$bs
+  trainers=`nproc`
+  if [ $trainers -gt $bs ]; then
+    trainers=$bs
   fi
-  log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log"
+  log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log"
+  threads=$((`nproc` / trainers))
+  if [ $threads -eq 0 ]; then
+    threads=1
+  fi
+  export OPENBLAS_NUM_THREADS=$threads
 
   models_in="models/${topology}-${layer_num}/pass-00000/"
   if [ ! -d $models_in ]; then
@@ -28,7 +32,7 @@ function infer() {
     --config="${topology}.py" \
     --use_mkldnn=False \
     --use_gpu=False \
-    --trainer_count=$thread \
+    --trainer_count=$trainers \
     --log_period=$log_period \
     --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
     --init_model_path=$models_in \
diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh
index e9df83fee2a3f796b7234b39619364f6ee4d5dc9..935cff6f2c97d25d6de556cfee25e27dbe49b5b6 100755
--- a/benchmark/paddle/image/run_openblas_train.sh
+++ b/benchmark/paddle/image/run_openblas_train.sh
@@ -1,7 +1,7 @@
 set -e
 
 function train() {
-  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  export OPENBLAS_NUM_THREADS=1
   topology=$1
   layer_num=$2
   bs=$3
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index d81481ca819c13ee0e299c204f998f3915c34bd4..ddf0b055a92d80295b24255a5462d477e0d9c796 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -252,6 +252,11 @@ first_seq
 ..  autoclass:: paddle.v2.layer.first_seq
     :noindex:
 
+sub_seq
+---------
+..  autoclass:: paddle.v2.layer.sub_seq
+    :noindex:
+
 concat
 ------
 ..  autoclass:: paddle.v2.layer.concat
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index 939731c0f3438a702e947ba1a7abeb5e3e6a8f53..004ee2d8c85ce7661886179570e693d7d61bc6d8 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -68,12 +68,6 @@ scale
     :noindex:
 
 
-reshape
----------
-..  autofunction:: paddle.v2.fluid.layers.reshape
-    :noindex:
-
-
 transpose
 ---------
 ..  autofunction:: paddle.v2.fluid.layers.transpose
diff --git a/doc/design/backward.md b/doc/design/backward.md
new file mode 100644
index 0000000000000000000000000000000000000000..20fda7a98f514a3f1c1c2d0ba7447ec954b21d5a
--- /dev/null
+++ b/doc/design/backward.md
@@ -0,0 +1,158 @@
+# Backward Building
+
+## Motivation
+
+In Neural Network, most models are solved by the backpropagation algorithm(known as **BP**) at present. Technically, BP calculates the gradient of the loss function, then propagates it back through the networks following the chain rule. However, when configuring the model structure, users do not need to define the backward part. So a mechanism is required by the framework which can complete the model's backward part automatically according to the given forward part.
+
+When implementing a specific `op`, the developer is also asked to implement its backward version, called `grad_op`. A `grad_op` takes the gradients of its corresponding `op`'s outputs and calculates the gradients of the `op`'s inputs. During the building of a model's backward part, the framework creates each forward `op`'s `grad_op`, and then strings them together in the reverse order of the forward part. In this way, gradients spread from the end to the beginning of the model, in other words, from the loss to the parameters.
+
+## Challenges
+
+The motivation of backward building is apparent. However, implementing it correctly is not so easy. In the **Fluid** design, a deep learning model is described by `Program`, `Block`, `Op` and `Variable`. The `Block` itself can be nested. It means that the `op`s and `variable`s are scattered across different blocks rather than all being gathered in a single graph. Our backward building algorithm shall visit blocks in recursive order and be able to insert `grad_op`s and newly created `variable`s into the right place.
+
+## Usage
+
+Although the whole algorithm is comprised of many functions, only one is exposed as API:
+
+```python
+def append_backward(loss, parameter_list=None, no_grad_set=None):
+    """
+    Append backward part to main_program
+
+    Args:
+        loss(Variable): The variable generated by the cost function.
+        parameter_list(list): Parameters that need to be updated by optimizers.
+            If None, it means all parameters need to be updated.
+
+        no_grad_set(set): Variables that have no gradients in Block 0. 
+            If None, the set will be generated inside the function and 
+            contains all variables with `stop_gradient=True` from all blocks.
+        
+    Return:
+        (list[Variable]): list of (parameters, gradients) pair.
+    """
+```
+
+By invoking this API, the framework appends the backward part of the program where the `loss` is. It takes three arguments. `loss` means the final loss value. It must be a scalar and is usually the output of the loss layer. It is also where the gradient is generated and backpropagation starts. `parameter_list` marks all parameters that need updating. If it's `None`, all parameters will be updated by optimizers. `no_grad_set` marks variables without gradient. If all outputs of some `grad_op` are in `no_grad_set`, the `grad_op` will not be run.
+
+This API will be invoked automatically before optimizer building. 
+As a result, in most cases, users do not need to invoke the API by themselves to append backward part.
+
+## Implementation
+
+The implementation of backward building algorithm is in `backward.py` file. The whole algorithm can be divided into two independent parts: creating `grad_op`s and creating new variables. 
+
+### Creating `grad_op`s
+
+The creating of `grad_op`s is implemented by:
+
+```python
+def _append_backward_ops_(target,
+                          block,
+                          target_block,
+                          no_grad_dict,
+                          grad_to_var):
+    """
+    Create all grad ops, and insert them into given block
+
+    Args:
+        target(Variable): the target variable of forward pass
+        block(Block): the block where forward ops are
+        target_block(Block): the block which is going to hold new generated grad ops
+        no_grad_dict(dict): 
+            key(int)  block index
+            val(set) a set of variable names. These variables have no gradient
+        grad_to_var(dict)(output argument):
+            key(str): grad variable name
+            val(str): corresponding forward variable name
+    """
+```
+
+Given a `block`, the function traverses all `op`s in this block in reverse order, gets the corresponding `grad_op` from the C++ core via `core.get_grad_op_desc()`, then appends it to `target_block`.
+
+However, some specific `op`s (e.g. `while_op`, `if_else_op`) can hold their own sub-blocks. Since these sub-blocks contain `op`s as well, the `grad_op` creation should be recursive.
+
+During the reverse traversal, we check each `op` whether it has an attribute named `sub_block`. If so, it means there is a sub-block and we need to deal with it first. After creating a new block whose father is the one in `op`'s attribute, we invoke `_append_backward_ops_()` recursively, assigning the new block to parameter `target_block` and the one in `op`'s attribute to `block`. The *pseudo-code* shows this process:
+
+```
+******* pseudo-code ********
+for op in reversed(block.ops):
+    if op has an attribute named 'sub_block':
+        Get the sub-block(`s_block`) from op's attribute.
+        Create a new block(`grad_s_block`), whose father is `s_block`.
+        Invoke _append_backward_ops_(), with `block=s_block` and `target_block=grad_s_block`
+    
+    Invoke `core.get_grad_op_desc()` to get op's grad_op.
+    Insert name correspondences between variables and their gradients of the grad_op into grad_to_var
+    Assign grad_s_block to grad_op as its 'sub_block' attribute.
+    Append grad_op to current target_block.
+```
+
+The first invoking of `_append_backward_ops_()` is initiated by `append_backward()`, in which parameters `block` and `target_block` are all assigned with root block(the block with index 0).
+
+### Corner Cases of `grad_op` Creating
+
+In the previous section, we show the regular process of `grad_op` creating. However, in some corner cases, the conventional algorithm is not enough to get the correct result and additional handling is required. These additional processes run after the algorithm mentioned above and make some special adjustments to its output `grad_op`s.
+
+#### Shared Variables
+
+If a variable is read by more than one `op` in the forward pass, its gradient is likely to be written by more than one `grad_op` in the following backward pass. To make the gradient result the sum of all `grad_op`s' outputs instead of only the last one run, we assign each output a temporary variable and then add a `sum_op` to add them up.
+
+For debugging convenience, if the final gradient name is `w@GRAD`, its corresponding temporary variables will be named `w@GRAD@RENAME@0`, `w@GRAD@RENAME@1`...
+
+See function `_addup_repetitive_outputs_` in `backward.py` for implementation details.
+
+#### No Gradient Variables
+
+In our framework, variables can be marked as *no_gradient*, which means that the gradient of this variable is unnecessary and can be considered as zero in model training. Apparently, when all the outputs of some `grad_op` are marked as *no_gradient*, the `grad_op` itself can be skipped in the backward pass.
+
+Another situation is when all the gradient inputs of some `grad_op` are marked as *no_gradient*, which means all of them can be considered as zeros. Since `grad_op`s are in essence the propagation of gradients, all the outputs are definitely zeros when all gradient inputs are zeros. Therefore the `grad_op` can also be skipped.
+
+It should be noted that all these zero gradients still need to be created and initialized by something; otherwise the following `grad_op`s that take these gradients as inputs run the risk of using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them as all zeros.
+
+These features are implemented in the function `_remove_no_grad_branch_`. It checks newly created `grad_op`s one by one, removes those that can be skipped and inserts `fill_zeros_like_op` when necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict` or generate it on the fly by scanning all variables' `no_gradient` attribute (True or False).
+
+### Creating Backward Variables
+
+Up to now, we have completed all creating and adjusting jobs of `grad_op`s. However, backward variables have not been created. Now they are only represented by `grad_op`'s input and output arguments. The backward variable creating job will be done by:
+
+```python
+def _append_backward_vars_(block, 
+                           start_op_idx, 
+                           grad_to_var, 
+                           grad_info_map):
+    """
+    Create new variables required by backward pass.
+
+    Args:
+        block(Block): the block where new variables will be created
+        start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created
+        grad_to_var(dict):
+            key(str): grad variable name
+            val(str): corresponding forward variable name
+            In most cases, this dict is generated by _append_backward_ops_()
+        grad_info_map(dict)(output argument):
+            key(str): forward variable name
+            val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index
+    """
+```
+
+Given a `block`, this function traverses all the `grad_op`s in it(The argument `start_op_idx` indicates where the grad_op sequence starts.) and creates all the uncreated outputs. The *pseudo-code* shows this process:
+
+```
+for op in block.ops[start_op_idx : ]:
+
+    if op has an attribute named 'sub_block':
+        Get the sub-block(`s_block`) from op's attribute.
+        Invoke _append_backward_vars_(), with `block=s_block`
+        
+    for var_name in op.all_output_names():
+        if block.has_var_recursive(var_name) or var_name is the name of empty variable:
+            continue
+        create a new variable named 'var_name' in block
+        if grad_to_var.has_key(var_name):
+            set grad_info_map[grad_to_var[var_name]] as a tuple of (var_name, block)
+            
+    do op's var type inference
+    do op's shape inference
+```
diff --git a/paddle/framework/images/duplicate_op.graffle b/doc/design/images/duplicate_op.graffle
similarity index 100%
rename from paddle/framework/images/duplicate_op.graffle
rename to doc/design/images/duplicate_op.graffle
diff --git a/paddle/framework/images/duplicate_op.png b/doc/design/images/duplicate_op.png
similarity index 100%
rename from paddle/framework/images/duplicate_op.png
rename to doc/design/images/duplicate_op.png
diff --git a/paddle/framework/images/duplicate_op2.graffle b/doc/design/images/duplicate_op2.graffle
similarity index 100%
rename from paddle/framework/images/duplicate_op2.graffle
rename to doc/design/images/duplicate_op2.graffle
diff --git a/paddle/framework/images/duplicate_op2.png b/doc/design/images/duplicate_op2.png
similarity index 100%
rename from paddle/framework/images/duplicate_op2.png
rename to doc/design/images/duplicate_op2.png
diff --git a/doc/design/images/profiler.png b/doc/design/images/profiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..d57b71ca88aaba5d05584a6219d84214e285a1e1
Binary files /dev/null and b/doc/design/images/profiler.png differ
diff --git a/doc/design/optimizer.md b/doc/design/optimizer.md
index 202b4b65103c0b7c536a9cb466c4120ce134d8c3..691081c268b848811bf5ee6d6a41edfe0f47eec0 100644
--- a/doc/design/optimizer.md
+++ b/doc/design/optimizer.md
@@ -79,7 +79,7 @@ class Optimizer(object):
     def minimize(self, loss, parameter_list):
         """Add operations to minimize `loss` by updating `parameter_list`.
 
-        This method combines interface `append_backward_ops()` and
+        This method combines interface `append_backward()` and
         `create_optimization_pass()` into one.
         """
         params_grads = self.create_backward_pass(loss, parameter_list)
diff --git a/doc/design/profiler.md b/doc/design/profiler.md
new file mode 100644
index 0000000000000000000000000000000000000000..b20b5efdc1f1f10ce7cec835adcc6fb374ed4e20
--- /dev/null
+++ b/doc/design/profiler.md
@@ -0,0 +1,97 @@
+## Introduction
+
+There are many performance analysis tools for [different programming languages and different software frameworks](https://en.wikipedia.org/wiki/List_of_performance_analysis_tools). Most popular deep learning frameworks use several programming languages and adapt to heterogeneous platforms. Similar to most of the deep learning frameworks, PaddlePaddle also uses C++, CUDA and Python as the basic programming languages to adapt to run on CPU and GPU devices.  The [`nvprof` tool](http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview) is usually used to analyse CUDA programs.  We have [a document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/optimization/cpu_profiling.md) on using [yep](https://pypi.python.org/pypi/yep) and [Google's perftools](https://github.com/google/pprof) to profile only the CPU and Python program. But for [PaddlePaddle fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), the operator is the basic computing unit. The developers usually want to collect the time of each operator and locate bottlenecks.  `nvprof` usually collects the timeline of CUDA-related activities on both CPU and GPU, including kernel execution, memory transfers, memory set and CUDA API calls and events or metrics for CUDA kernels. And `yep` and `Google's perftools` can't collect the timeline for CUDA programs. None of these tools can collect time at the operator level. So we design this profiling tool.
+
+## Architecture
+
+The work flow for most tasks is as follows. Each operator will run many times across all iterations. So the profiler must collect the total time of each operator during the iterations. Furthermore, the developers may sometimes want to collect a more detailed time span inside the operator or record a time span elsewhere; this requires that the profiler support recording nested time spans. And in order to speed up training, all the deep learning frameworks support parallel computing, including multiple threads on CPU and multiple GPUs. So the profiler must be able to collect the timeline for each thread. In addition, the profiler also occupies certain resources, so it must be easy for developers to enable or disable. At last, the profiler should present a human-readable report.  
+
+```python
+for i in xrange(M):  # M is  the iteration number
+  for op in operator_lists: # The `operator_lists` contains all the operators in the network.
+    op.run();
+```
+
+In summary, the profiler should have the following features:
+
+- records time span in loop.
+- supports nested time span.
+- supports multiple threads/multiple GPUs.
+- supports to be enabled and disabled by users.
+
+But how to record the time for the mixed C++ and CUDA program?  There are many C++ APIs to get the current calendar time in the host program. But for GPU, the CUDA kernels may be executed concurrently if they are in different [streams](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams) and the CUDA kernels are asynchronous with the host program if there is no synchronization after the CUDA kernels. CUDA provides [event](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#events) to monitor the device and perform accurate timing. Inspired by PyTorch and CUDA event, we also design and apply events to record the timeline. Then we summarize and present statistics based on these events.  
+
+The overall flow is shown as the following figure.
+
+<img src="./images/profiler.png" align="center"/><br/>
+
+### Event
+
+In the above work flow, a pair of events is needed before and after the piece of code to collect time. So the event has a flag to mark whether it is a starting event or an ending event. Besides these two kinds of events, sometimes only a marker with a text message is needed, for example, a marker to specify where profiling starts or ends. There are three kinds of event:
+
+```c++
+enum EventKind {
+  kMark,
+  kPushRange,
+  kPopRange};
+```
+- kMark: only a marker without time range.
+- kPushRange: mark the starting event for time range. 
+- kPopRange: mark the ending event for time range.
+
+For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used.  For many pieces of code, an event list is used to record each piece.
+
+```c++
+class Event {
+ public:
+  // The DeviceContext is used to get current  CUDA stream.
+  Event(EventKind kind, std::string name, uint32_t thread_id,
+        const platform::DeviceContext* dev_ctx = nullptr);
+  double CpuElapsedUs(const Event& e) const;
+  double CudaElapsedUs(const Event& e) const;
+
+ private:
+  EventKind kind_;
+  std::string name_;
+  uint32_t thread_id_;
+  int64_t cpu_ns_;
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t event_ = nullptr;
+  int device_ = -1;
+#endif
+};
+
+struct EventList {
+  std::forward_list<std::vector<Event>> event_blocks;
+};
+```
+
+As mentioned above, there is no need to record the timeline when disabling the profiler. So there is a global state to enable or disable the profiler. 
+
+```c++
+enum ProfilerState {
+  kDisabled, 
+  kCPU,
+  kCUDA
+};
+ProfilerState g_state;
+```
+- kDisabled: the disabled state.
+- kCPU: CPU profiling state.
+- kCUDA: GPU profiling state.
+
+A pair of starting and ending events are pushed to event lists in constructor and destructor of `RecordEvent`. So the timeline is recorded for the code in the lifecycle of an object of `RecordEvent`.
+
+```c++
+struct RecordEvent {
+  explicit RecordEvent(const std::string name,
+                       platform::DeviceContext* dev_ctx = nullptr) {
+    if (kState == ProfilerState::kDisabled) return;
+    // push the starting event to the event lists.
+  }
+  ~RecordEvent() {
+    if (kState == ProfilerState::kDisabled) return;
+    // push the ending event to the event lists.
+  }
+};
+```
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index fa1b6a372728ccac128d2e6e79a6514b8884ea3f..bae42593ddc6f7a7eb47d603752ad6efa9820b45 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -15,7 +15,7 @@
 获取PaddlePaddle的Docker镜像
 ------------------------------
 
-执行下面的命令获取最新的PaddlePaddle Docker镜像
+执行下面的命令获取最新的PaddlePaddle Docker镜像，版本为cpu_avx_mkl：
 
   .. code-block:: bash
 
@@ -27,7 +27,7 @@
 
      docker pull docker.paddlepaddle.org/paddle
 
-下载GPU版本的Docker镜像：
+下载GPU版本（cuda8.0_cudnn5_avx_mkl）的Docker镜像：
 
   .. code-block:: bash
 
@@ -54,7 +54,7 @@
 .. _docker_run:
 
 在Docker中执行PaddlePaddle训练程序
-------------------------------
+----------------------------------
 
 假设您已经在当前目录（比如在/home/work）编写了一个PaddlePaddle的程序 :code:`train.py` （可以参考
 `PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_ 
@@ -82,7 +82,7 @@
 .. _docker_run_book:
 
 使用Docker启动PaddlePaddle Book教程
-------------------------------
+-----------------------------------
 
 使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook，可以通过网页浏览。
 PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index 06012bf65e75c32957516f6b7f62e09480871b84..56a7c68e4d39c45249fa55a964dc48b7081596a6 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -16,7 +16,7 @@ After you've read above tutorials you may proceed the following steps.
 Pull PaddlePaddle Docker Image
 ------------------------------
 
-Run the following command to download the latest Docker images:
+Run the following command to download the latest Docker images; the version is cpu_avx_mkl:
 
   .. code-block:: bash
 
@@ -28,7 +28,7 @@ For users in China, we provide a faster mirror:
 
      docker pull docker.paddlepaddle.org/paddle
 
-Download GPU version images:
+Download GPU version (cuda8.0_cudnn5_avx_mkl) images:
 
   .. code-block:: bash
 
@@ -58,7 +58,7 @@ and run:
 .. _docker_run:
 
 Launch your training program in Docker
-------------------------------
+--------------------------------------
 
 Assume that you have already written a PaddlePaddle program
 named :code:`train.py` under directory :code:`/home/work` (refer to 
diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst
index a4587f82a984acf243f49834e707fcd66d5b1252..0c741e936b46eda5e7165e4ee54b545b14a28a19 100644
--- a/doc/getstarted/build_and_install/pip_install_cn.rst
+++ b/doc/getstarted/build_and_install/pip_install_cn.rst
@@ -11,14 +11,14 @@ PaddlePaddle可以使用常用的Python包管理工具
 ------------------------------
 
 
-执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件。
+执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件，版本为cpu_avx_openblas。
 
   .. code-block:: bash
 
      pip install paddlepaddle
 
 
-如果需要安装支持GPU的版本，需要执行：
+如果需要安装支持GPU的版本（cuda7.5_cudnn5_avx_openblas），需要执行：
 
   .. code-block:: bash
 
diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst
index 55e31560a0f5087ab69966a6281c6c8573c04204..285ed09805b09790beaef014f6813c227aff33ac 100644
--- a/doc/getstarted/build_and_install/pip_install_en.rst
+++ b/doc/getstarted/build_and_install/pip_install_en.rst
@@ -12,14 +12,14 @@ Install Using pip
 ------------------------------
 
 Run the following command to install PaddlePaddle on the current
-machine, it will also download requirements.
+machine; it will also download requirements. The version is cpu_avx_openblas.
 
   .. code-block:: bash
 
      pip install paddlepaddle
 
 
-If you wish to install GPU version, just run:
+If you wish to install GPU version (cuda7.5_cudnn5_avx_openblas), just run:
 
   .. code-block:: bash
 
diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst
index a9087be6f350c5656cabb0c64ba0f200d1c666cc..9f6ee25987d51dcca3a37cf0f62a70a5a5a2d89a 100644
--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
@@ -7,13 +7,13 @@
 ++++++++
 
 PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12，并安装有Python2.7。
-执行下面的命令完成快速安装：
+执行下面的命令完成快速安装，版本为cpu_avx_openblas：
 
   .. code-block:: bash
 
      pip install paddlepaddle
 
-如果需要安装支持GPU的版本，需要执行：
+如果需要安装支持GPU的版本（cuda7.5_cudnn5_avx_openblas），需要执行：
 
   .. code-block:: bash
 
diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst
index d14e3f5c0cc90792fce9cb82e65da482c44dc433..063d9d880c82550f7f5d47d3d0b1fff59865bca7 100644
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
@@ -8,13 +8,13 @@ Quick Install
 
 You can use pip to install PaddlePaddle with a single command, supports
 CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
-Simply run the following command to install:
+Simply run the following command to install; the version is cpu_avx_openblas:
 
   .. code-block:: bash
 
      pip install paddlepaddle
 
-If you need to install GPU version, run:
+If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
 
   .. code-block:: bash
 
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 7436e8c228db2caeb1421f8d78ddcf55f00deee4..b4458eb9551724021636b628c5bf8c96f6e659aa 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -5,10 +5,18 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 
-cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context)
+if (WITH_GPU)
+  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
+else()
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
+endif ()
 
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
-cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
+if (WITH_GPU)
+  nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor)
+else()
+  cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
+endif()
 
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
@@ -21,7 +29,7 @@ cc_test(variable_test SRCS variable_test.cc)
 cc_library(scope SRCS scope.cc DEPS glog)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
 
-cc_library(data_transform SRCS data_transform.cc DEPS tensor framework_proto)
+cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor framework_proto)
 cc_test(data_transform_test SRCS data_transform_test.cc DEPS data_transform device_context)
 
 cc_library(attribute SRCS attribute.cc DEPS framework_proto)
@@ -37,7 +45,7 @@ cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
 
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
-cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
+nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 
 py_proto_compile(framework_py_proto SRCS framework.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
diff --git a/paddle/framework/backward.md b/paddle/framework/backward.md
deleted file mode 100644
index ac60be572419b62f4beb644ff192d413c35e19bb..0000000000000000000000000000000000000000
--- a/paddle/framework/backward.md
+++ /dev/null
@@ -1,100 +0,0 @@
-# Operator/expression 's Backward
-
-## Motivation
-
-In Neural Network, most models are solved by the backpropagation algorithm(known as **BP**) at present. Technically, BP calculates the gradient of the loss function, then propagates it back through the networks following the chain rule. Hence we need a module that chains the gradient operators/expressions together to construct the backward pass. Every forward network needs a backward network to construct the full computation graph. The operator/expression's backward pass will be generated with respect to the forward pass. 
-
-## Implementation
-
-In this design doc, we exported only one API for generating the backward pass.
-
-```c++
-std::unique_ptr<OperatorBase> Backward(const OperatorBase& forwardOp,
-    const std::unordered_set<std::string>& no_grad_vars);
-```
-
-The implementation behind it can be divided into two parts, **Backward Operator Creating** and **Backward Operator Building**.
-
-### Backward Operator Registry
-
-A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs, and output gradients and then calculate its input gradients.
-
-|                        | forward operator | backward operator 
-| ---------------------- | ---------------- |------------------------- |		
-| **Operator::inputs_**  | Inputs       | Inputs, Outputs, OutputGradients |	
-| **Operator::outputs_** | Outputs          | InputGradients            |
-
- In most cases, there is a one-to-one relation between the forward and backward operators. These relations are recorded by a global hash map(`OpInfoMap`). To follow the philosophy of minimum core and to make operators pluggable, the registry mechanism is introduced.
-
-For example, we have `mul_op`, and we can register its information and corresponding backward operator by the following macro:
-
-```cpp
-REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
-```
-
-`mul` is the operator's type. `MulOp` and `MulOpMaker` are the operator class and the operator maker class respectively.
-
-`mul_grad` is the type of backward operator, and `MulOpGrad` is its class name.
-
-### Backward Opeartor Creating
-
-Given a certain forward operator, we can get its corresponding backward operator by calling:
-
-```cpp
-OperatorBase* bwd_op = BuildGradOp(const OperatorBase* fwd_op);
-```
-
-The function `BuildGradOp` will sequentially execute following processes:
-
-1. Get the `type_` of given forward operator, and then get the corresponding backward operator's type by looking up the `OpInfoMap`.
-
-2. Build two maps named `inputs` and `outputs` to temporarily store backward operator's inputs and outputs. Copy forward operator's `inputs_` and `outputs_` to map `inputs`, except these, are not necessary for gradient computing.
-
-3. Add forward inputs' gradient variables into map `output`, adding forward outputs' gradient variables into map `input`.
-
-4. Building backward operator with `inputs`, `outputs` and forward operator's attributes.
-
-### Backward Network Building
-
-A backward network is a series of backward operators. The main idea of building a backward network is creating backward operators in the inverted sequence and appending them together one by one. There are some corner cases that need special processing.
-
-1. Op 
-
-   When the input forward network is an Op, return its gradient Operator immediately. If all of its outputs are in no gradient set, then return a special `NOP`.
-
-2. NetOp 
-
-   In our design, the network itself is also a kind of operator(**NetOp**). So the operators contained by a big network may be some small network. When the input forward network is a NetOp, it needs to call the sub NetOp/Operators backward function recursively. During the process, we need to collect the `OutputGradients` name according to the forward NetOp.
-
-3. RnnOp
-
-   RnnOp is a nested stepnet operator.  Backward module needs to recusively call `Backward` for every stepnet.
-
-4. Sharing Variables
-
-   As illustrated in the figure 1 and figure 2, two operators share the same variable name **W@GRAD**, which will overwrite their shared input variable. 
-
-<p align="center">
-<img src="./images/duplicate_op.png" width="50%" ><br/>
-
-​	Figure 1. Sharing variables in operators. 
-
-</p>
-
-​	Sharing variable between operators or same input variable used in multiple operators can lead to duplicate gradient variables. As illustrated in figure 2, we need to rename the gradient names recursively and add a generic add operator to prevent overwriting. 
-
-<p align="center">
-<img src="images/duplicate_op2.png" width="40%" ><br/>
-
-​	Figure 2. Replace sharing variable's gradient with `Add` operator.
-
-</p>
-
-​	Because the framework finds variables according to their names, we need to rename the output links. We add an integer suffix to represent its position in the clockwise direction. 
-
-5. Part of the Gradient is Zero.
-
-   In the whole graph, there is some case of that one operator's gradient is not needed, but its input's gradient is a dependency link of other operator,  we need to fill a same shape gradient matrix in the position. In our implementation, we insert a special `fillZeroLike` operator.
-
-
-Follow these rules above, then collect the sub graph `OutputGradients`/`InputGradients` as the NetOp's and return it.
diff --git a/paddle/framework/data_transform.cc b/paddle/framework/data_transform.cc
index 35f16025a9ae44bd70e15b19b25deb08299bea88..ac6e40a3ae8f6655eecc8279543d6afe6bbb1399 100644
--- a/paddle/framework/data_transform.cc
+++ b/paddle/framework/data_transform.cc
@@ -11,8 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <functional>
 
 #include "paddle/framework/data_transform.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace framework {
@@ -22,5 +25,111 @@ DataTransformFnMap& DataTransformFnMap::Instance() {
   return data_transform_map;
 }
 
+auto KernelFP32 = OpKernelType(proto::DataType::FP32, platform::CPUPlace(),
+                               DataLayout::kNHWC, LibraryType::kPlain);
+
+auto KernelFP64 = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
+                               DataLayout::kNHWC, LibraryType::kPlain);
+
+auto KernelNHWC = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
+                               DataLayout::kNHWC, LibraryType::kPlain);
+
+auto KernelNCHW = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
+                               DataLayout::kNCHW, LibraryType::kPlain);
+
+void TransDataType(const platform::DeviceContext* ctx,
+                   const KernelTypePair& kernel_pair, const Variable& in,
+                   Variable* out) {
+  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
+  PADDLE_ENFORCE(
+      platform::places_are_same_class(kernel_pair.first.place_,
+                                      kernel_pair.second.place_),
+      "TransDataType Only Support DataType transform on same place!");
+
+  auto src = in.Get<Tensor>();
+  auto* dst = out->GetMutable<Tensor>();
+
+  auto dims = src.dims();
+  dst->Resize(dims);
+  auto dst_type = kernel_pair.second.data_type_;
+  auto src_type = kernel_pair.first.data_type_;
+
+  switch (src_type) {
+    case proto::DataType::FP32:
+      framework::VisitDataType(dst_type, CastDataType<float>(src, dst, ctx));
+      break;
+    case proto::DataType::FP64:
+      framework::VisitDataType(dst_type, CastDataType<double>(src, dst, ctx));
+      break;
+    case proto::DataType::INT32:
+      framework::VisitDataType(dst_type, CastDataType<int>(src, dst, ctx));
+      break;
+    case proto::DataType::INT64:
+      framework::VisitDataType(dst_type, CastDataType<int64_t>(src, dst, ctx));
+      break;
+    case proto::DataType::BOOL:
+      framework::VisitDataType(dst_type, CastDataType<bool>(src, dst, ctx));
+      break;
+    default:
+      PADDLE_THROW("Not support type %d", src_type);
+  }
+}
+
+void TransDataLayout(const std::vector<int>& axis,
+                     const platform::DeviceContext* ctx,
+                     const KernelTypePair& kernel_pair, const Variable& in,
+                     Variable* out) {
+  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only support Tensor transform!.");
+  PADDLE_ENFORCE(
+      platform::places_are_same_class(kernel_pair.first.place_,
+                                      kernel_pair.second.place_),
+      "TransDataLayout only support DataLayout transform on same place!");
+  PADDLE_ENFORCE(kernel_pair.first.data_type_ == kernel_pair.second.data_type_,
+                 "TransDataLayout only support Datatype are same!");
+
+  auto src = in.Get<Tensor>();
+  auto* dst = out->GetMutable<Tensor>();
+  PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!");
+
+  auto place = kernel_pair.second.place_;
+  CopyFrom(src, place, *ctx, dst);
+
+  auto src_dim = src.dims();
+  std::vector<int64_t> dst_dim;
+
+  dst_dim.resize(axis.size());
+  for (size_t i = 0; i < axis.size(); i++) {
+    dst_dim[i] = src_dim[axis[i]];
+  }
+
+  dst->Resize(make_ddim(dst_dim));
+
+  auto src_type = kernel_pair.first.data_type_;
+  framework::VisitDataType(src_type, CastDataLayout(ctx, axis, src, dst));
+
+  dst->set_layout(kernel_pair.second.data_layout_);
+}
+
 }  // namespace framework
 }  // namespace paddle
+
+namespace f = paddle::framework;
+
+namespace {
+std::vector<int> NHWC2NCHW = {0, 3, 1, 2};
+std::vector<int> NCHW2NHWC = {0, 2, 3, 1};
+}
+
+REGISTER_DATA_TRANSFORM_FN(f::KernelFP32, f::KernelFP64, f::TransDataType);
+REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW,
+                           std::bind(f::TransDataLayout, NHWC2NCHW,
+                                     std::placeholders::_1,
+                                     std::placeholders::_2,
+                                     std::placeholders::_3,
+                                     std::placeholders::_4));
+REGISTER_DATA_TRANSFORM_FN(f::KernelNCHW, f::KernelNHWC,
+                           std::bind(f::TransDataLayout, NCHW2NHWC,
+                                     std::placeholders::_1,
+                                     std::placeholders::_2,
+                                     std::placeholders::_3,
+                                     std::placeholders::_4));
diff --git a/paddle/framework/data_transform.h b/paddle/framework/data_transform.h
index 73f894a3e20ab779f8607e63a67139b0e8cce79a..56ebc80f4386958608213f30e745f2d9528e9e5e 100644
--- a/paddle/framework/data_transform.h
+++ b/paddle/framework/data_transform.h
@@ -21,17 +21,20 @@ limitations under the License. */
 #include "paddle/framework/op_kernel_type.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/framework/variable.h"
+#include "paddle/operators/math/math_function.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/macros.h"
+#include "paddle/platform/transform.h"
 
 namespace paddle {
 namespace framework {
 
-using DataTransformFN =
-    std::function<void(const std::vector<platform::DeviceContext*> ctx,
-                       const Variable& in, Variable* out)>;
 using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
 
+using DataTransformFn =
+    std::function<void(const platform::DeviceContext*, const KernelTypePair&,
+                       const Variable&, Variable*)>;
+
 struct KernelTypePairHash {
   static void HashCombine(const OpKernelType& t, std::size_t* seed) {
     OpKernelType::Hash kernel_type_hasher;
@@ -46,8 +49,69 @@ struct KernelTypePairHash {
   }
 };
 
+template <typename InType, typename OutType>
+struct CastDataTypeFunctor {
+  HOSTDEVICE inline OutType operator()(InType in) const {
+    return static_cast<OutType>(in);
+  }
+};
+
+template <typename InType>
+struct CastDataType {
+  CastDataType(const framework::Tensor& in, framework::Tensor* out,
+               const platform::DeviceContext* ctx)
+      : in_(in), out_(out), ctx_(ctx) {}
+  const framework::Tensor in_;
+  framework::Tensor* out_;
+  const platform::DeviceContext* ctx_;
+
+  template <typename OutType>
+  void operator()() {
+    auto place = ctx_->GetPlace();
+
+    auto* in_begin = in_.data<InType>();
+    auto numel = in_.numel();
+    auto* in_end = in_begin + numel;
+    auto* out_begin = out_->mutable_data<OutType>(place);
+
+    if (platform::is_cpu_place(place)) {
+      platform::Transform<platform::CPUDeviceContext> trans;
+      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
+      trans(*context, in_begin, in_end, out_begin,
+            CastDataTypeFunctor<InType, OutType>());
+    } else {
+      // TODO(dzhwinter): enhance CopyFrom CPU<->GPU with different data type?
+      PADDLE_THROW("Unsupport CPU <-> GPU!");
+    }
+  }
+};
+
+struct CastDataLayout {
+  CastDataLayout(const platform::DeviceContext* ctx,
+                 const std::vector<int>& axis, const framework::Tensor& in,
+                 framework::Tensor* out)
+      : in_(in), out_(out), ctx_(ctx), axis_(axis) {}
+  const framework::Tensor in_;
+  framework::Tensor* out_;
+  const platform::DeviceContext* ctx_;
+  const std::vector<int> axis_;
+
+  template <typename T>
+  void operator()() {
+    auto place = ctx_->GetPlace();
+
+    if (platform::is_cpu_place(place)) {
+      operators::math::Transpose<platform::CPUDeviceContext, T, 4> trans4;
+      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
+      trans4(*context, in_, out_, axis_);
+    } else {
+      PADDLE_THROW("Unsupport CPU <-> GPU!");
+    }
+  }
+};
+
 using DataTransformMap =
-    std::unordered_map<KernelTypePair, DataTransformFN, KernelTypePairHash>;
+    std::unordered_map<KernelTypePair, DataTransformFn, KernelTypePairHash>;
 
 class DataTransformFnMap {
  public:
@@ -58,25 +122,25 @@ class DataTransformFnMap {
   }
 
   void Insert(const OpKernelType& left, const OpKernelType& right,
-              const DataTransformFN& data_tranform_fn) {
+              const DataTransformFn& data_tranform_fn) {
     Insert(std::make_pair(left, right), data_tranform_fn);
   }
 
   void Insert(const KernelTypePair& kernel_type_pair,
-              const DataTransformFN& data_tranform_fn) {
+              const DataTransformFn& data_tranform_fn) {
     PADDLE_ENFORCE(!Has(kernel_type_pair),
                    "KernelTypePair %s has been registered", "");
     map_.insert({kernel_type_pair, data_tranform_fn});
   }
 
-  const DataTransformFN& Get(const KernelTypePair& key_pair) const {
+  const DataTransformFn& Get(const KernelTypePair& key_pair) const {
     auto data_transformer = GetNullable(key_pair);
     PADDLE_ENFORCE_NOT_NULL(data_transformer,
-                            "DataTransformFN should not be NULL");
+                            "DataTransformFn should not be NULL");
     return *data_transformer;
   }
 
-  const DataTransformFN* GetNullable(const KernelTypePair& key_pair) const {
+  const DataTransformFn* GetNullable(const KernelTypePair& key_pair) const {
     auto it = map_.find(key_pair);
     if (it == map_.end()) {
       return nullptr;
diff --git a/paddle/framework/data_transform_test.cc b/paddle/framework/data_transform_test.cc
index f93a47eeb567c4fc984954aa5198362c9939c556..edd305fd17ae202926b83fbec10089719baa2e16 100644
--- a/paddle/framework/data_transform_test.cc
+++ b/paddle/framework/data_transform_test.cc
@@ -11,36 +11,67 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <array>
+#include <vector>
 
-#include "paddle/framework/data_transform.h"
 #include <gtest/gtest.h>
 
+#include "paddle/framework/data_transform.h"
+#include "paddle/platform/device_context.h"
+
 namespace paddle {
 namespace framework {
-
 using namespace platform;
 
+/**
+ * @brief cross validation of different kernel type transform
+ *  We use a four-bit map to represent the different combinations.
+ *  If a field has multiple possible values, only two of them are chosen.
+ *  For DataType, only test the FP32(float), FP64(double).
+ *  e.g. 0000 -> FP32, CPUPlace, kNHWC, kPlain
+ *       1111 -> FP64, GPUPlace, kNCHW, kMKLDNN
+ */
+
+std::array<proto::DataType, 2> kDataType = {
+    {proto::DataType::FP32, proto::DataType::FP64}};
+
+std::array<Place, 2> kPlace = {{CPUPlace(), CUDAPlace(0)}};
+
+std::array<DataLayout, 2> kDataLayout = {{
+    DataLayout::kNHWC, DataLayout::kNCHW,
+}};
+
+std::array<LibraryType, 2> kLibraryType = {{
+    LibraryType::kPlain, LibraryType::kMKLDNN,
+}};
+
+OpKernelType GenFromBit(const std::vector<bool> bits) {
+  return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]],
+                      kLibraryType[bits[3]]);
+}
+
 int test_value = 0;
 
-OpKernelType kernel_type_1(proto::DataType::FP32, CPUPlace(), DataLayout::kNCHW,
-                           LibraryType::kCUDNN);
-OpKernelType kernel_type_2(proto::DataType::FP32, CUDAPlace(0),
-                           DataLayout::kNCHW, LibraryType::kCUDNN);
-OpKernelType kernel_type_3(proto::DataType::FP16, CUDAPlace(0),
-                           DataLayout::kNCHW, LibraryType::kCUDNN);
+auto kernel0 = GenFromBit({0, 0, 0, 0});
+auto kernel1 = GenFromBit({0, 0, 0, 1});
+auto kernel2 = GenFromBit({0, 0, 1, 0});
+auto kernel3 = GenFromBit({0, 0, 1, 1});
 
-void type1_to_type2(std::vector<platform::DeviceContext*> ctx,
-                    const Variable& in, Variable* out) {
+void TransDataType_t(const platform::DeviceContext* ctx,
+                     const KernelTypePair& p, const Variable& in,
+                     Variable* out) {
   test_value++;
 }
 
-void type2_to_type3(std::vector<platform::DeviceContext*> ctx,
-                    const Variable& in, Variable* out) {
+void TransDataLayout_t(const platform::DeviceContext* ctx,
+                       const KernelTypePair& p, const Variable& in,
+                       Variable* out) {
   test_value--;
 }
 
-void type1_to_type3(std::vector<platform::DeviceContext*> ctx,
-                    const Variable& in, Variable* out) {
+void TransLibraryType_t(const platform::DeviceContext* ctx,
+                        const KernelTypePair& p, const Variable& in,
+                        Variable* out) {
   test_value += 2;
 }
 
@@ -49,30 +80,89 @@ void type1_to_type3(std::vector<platform::DeviceContext*> ctx,
 
 namespace frw = paddle::framework;
 
-REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_1, frw::kernel_type_2,
-                           frw::type1_to_type2);
-REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_2, frw::kernel_type_3,
-                           frw::type2_to_type3);
-REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_1, frw::kernel_type_3,
-                           frw::type1_to_type3);
+REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel1, frw::TransDataType_t);
+REGISTER_DATA_TRANSFORM_FN(frw::kernel1, frw::kernel2, frw::TransDataLayout_t);
+REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel2, frw::TransLibraryType_t);
 
 TEST(DataTransform, Register) {
   using namespace paddle::framework;
   using namespace paddle::platform;
 
   auto& instance = DataTransformFnMap::Instance();
-  ASSERT_EQ(instance.Map().size(), 3UL);
-  std::vector<DeviceContext*> ctx;
   paddle::framework::Variable in;
   paddle::framework::Variable out;
 
-  instance.Get(std::make_pair(frw::kernel_type_1, frw::kernel_type_2))(ctx, in,
-                                                                       &out);
+  CPUDeviceContext ctx;  // on the stack: freed even if an ASSERT returns early
+  auto pair0 = std::make_pair(frw::kernel0, frw::kernel1);
+  instance.Get(pair0)(&ctx, pair0, in, &out);  // TransDataType_t: +1
   ASSERT_EQ(test_value, 1);
-  instance.Get(std::make_pair(frw::kernel_type_2, frw::kernel_type_3))(ctx, in,
-                                                                       &out);
+
+  auto pair1 = std::make_pair(frw::kernel1, frw::kernel2);
+  instance.Get(pair1)(&ctx, pair1, in, &out);  // TransDataLayout_t: -1
   ASSERT_EQ(test_value, 0);
-  instance.Get(std::make_pair(frw::kernel_type_1, frw::kernel_type_3))(ctx, in,
-                                                                       &out);
+
+  auto pair3 = std::make_pair(frw::kernel0, frw::kernel2);
+  instance.Get(pair3)(&ctx, pair3, in, &out);  // TransLibraryType_t: +2
   ASSERT_EQ(test_value, 2);
 }
+
+TEST(DataTransform, DataLayout) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  auto& instance = DataTransformFnMap::Instance();
+  Variable in;
+  Variable out;
+  Tensor* src = in.GetMutable<Tensor>();
+  src->mutable_data<double>(make_ddim({2, 3, 1, 2}), CPUPlace());
+  src->set_layout(DataLayout::kNHWC);
+
+  // Stack-allocated so the context is released when the test returns.
+  CPUDeviceContext ctx;
+  {  // kNHWC -> kNCHW, written from `in` into `out`
+    auto kernel1 = GenFromBit({1, 0, 0, 0});
+    auto kernel2 = GenFromBit({1, 0, 1, 0});
+    auto pair0 = std::make_pair(kernel1, kernel2);
+    instance.Get(pair0)(&ctx, pair0, in, &out);
+  }
+
+  Tensor dst = out.Get<Tensor>();
+
+  EXPECT_TRUE(dst.layout() == DataLayout::kNCHW);
+  EXPECT_TRUE(dst.dims() == make_ddim({2, 2, 3, 1}));
+
+  {  // kNCHW -> kNHWC, written from `out` back into `in` (aliased by `src`)
+    auto kernel1 = GenFromBit({1, 0, 1, 0});
+    auto kernel2 = GenFromBit({1, 0, 0, 0});
+    auto pair0 = std::make_pair(kernel1, kernel2);
+    instance.Get(pair0)(&ctx, pair0, out, &in);
+  }
+
+  EXPECT_TRUE(src->layout() == DataLayout::kNHWC);
+  EXPECT_TRUE(src->dims() == make_ddim({2, 3, 1, 2}));
+}
+
+TEST(DataTransform, DataType) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  auto& instance = DataTransformFnMap::Instance();
+  // Stack-allocated so the context is released when the test returns.
+  CPUDeviceContext ctx;
+
+  Variable in;
+  Variable out;
+  Tensor* src = in.GetMutable<Tensor>();
+  float* ptr = src->mutable_data<float>(make_ddim({2, 3}), CPUPlace());
+  for (int i = 0; i < 6; ++i) {
+    ptr[i] = i / 3;
+  }
+
+  // FP32 -> FP64; place, layout and library bits are identical on both sides.
+  auto fp32_kernel = GenFromBit({0, 0, 0, 0});
+  auto fp64_kernel = GenFromBit({1, 0, 0, 0});
+  auto pair0 = std::make_pair(fp32_kernel, fp64_kernel);
+  instance.Get(pair0)(&ctx, pair0, in, &out);
+  Tensor dst = out.Get<Tensor>();
+  EXPECT_TRUE(dst.data<double>() != nullptr);
+}
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 997773c1689efad4ce5a86c09ce58bd3a40185e0..bf1f0471ccbfccf13cb6f74c8088da7acd68ec0b 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -14,18 +14,17 @@ limitations under the License. */
 
 #include "paddle/framework/executor.h"
 
-#include <algorithm>
-#include <iostream>
-#include <memory>
 #include <set>
-#include <vector>
 
+#include "gflags/gflags.h"
 #include "paddle/framework/feed_fetch_type.h"
 #include "paddle/framework/lod_rank_table.h"
-#include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/framework/scope.h"
+
+DEFINE_bool(check_nan_inf, false,
+            "Checking whether operator produce NAN/INF or not. It will be "
+            "extremely slow so please use this flag wisely.");
 
 namespace paddle {
 namespace framework {
@@ -58,6 +57,19 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
   }
 }
 
+// Enforces that `tensor` holds no Inf/NAN values. Tensors with no memory or
+// with an element type other than float/double are skipped.
+static void CheckTensorNANOrInf(const std::string& name,
+                                const framework::Tensor& tensor) {
+  if (tensor.memory_size() == 0) return;  // nothing to inspect
+  const auto type_hash = tensor.type().hash_code();
+  const bool is_floating = type_hash == typeid(float).hash_code() ||
+                           type_hash == typeid(double).hash_code();
+  if (!is_floating) return;
+  PADDLE_ENFORCE(!framework::HasInf(tensor), "Tensor %s has Inf", name);
+  PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN", name);
+}
+
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                    bool create_local_scope, bool create_vars) {
   // TODO(tonyyang-svail):
@@ -101,8 +113,17 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
     auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
     VLOG(3) << op->DebugString();
     op->Run(*local_scope, place_);
+    if (FLAGS_check_nan_inf) {
+      for (auto& vname : op->OutputVars(true)) {
+        auto* var = local_scope->FindVar(vname);
+        if (var == nullptr) continue;
+        if (var->IsType<framework::LoDTensor>()) {
+          CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
+        }
+      }
+    }
   }
-  if (create_local_scope) {
+  if (create_vars && create_local_scope) {
     scope->DeleteScope(local_scope);
   }
 }
diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc
index d6601090d5b6150a5aa467210038d3693c3e67a8..682cff168d4d31e0565fc987604f97a671566fbd 100644
--- a/paddle/framework/init.cc
+++ b/paddle/framework/init.cc
@@ -71,7 +71,7 @@ bool InitDevices(const std::vector<std::string> &devices) {
     places.emplace_back(platform::CPUPlace());
     LOG(WARNING) << "Not specified CPU device, create CPU by Default.";
   }
-  platform::DeviceContextPool::Create(places);
+  platform::DeviceContextPool::Init(places);
   return true;
 }
 
diff --git a/paddle/framework/library_type.h b/paddle/framework/library_type.h
index 6baae6c2bb80a4f631cad89231ce4fcb8d94ed86..7707799cae8c4edc304cd81725270a85f01fd28d 100644
--- a/paddle/framework/library_type.h
+++ b/paddle/framework/library_type.h
@@ -20,7 +20,11 @@ namespace framework {
 // For more details about the design of LibraryType, Please refer to
 // https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md#library
 
-enum class LibraryType { kPlain = 0, kMKLDNN = 1, kCUDNN = 2 };
+enum class LibraryType {
+  kPlain = 0,
+  kMKLDNN = 1,
+  kCUDNN = 2,
+};
 
 inline std::string LibraryTypeToString(const LibraryType& library_type) {
   switch (library_type) {
@@ -31,7 +35,26 @@ inline std::string LibraryTypeToString(const LibraryType& library_type) {
     case LibraryType::kCUDNN:
       return "CUDNN";
     default:
-      PADDLE_THROW("unknown LibraryType %d", library_type);
+      PADDLE_THROW("unknown LibraryType %d", static_cast<int>(library_type));
+  }
+}
+
+// Maps a library name to its LibraryType; PADDLE_THROWs on an unknown name.
+// "CPU" and "CUDA" are accepted as aliases of "PLAIN".
+inline LibraryType StringToLibraryType(const char* ctype) {
+  const std::string s(ctype);
+  if (s == "PLAIN") {
+    return LibraryType::kPlain;
+  } else if (s == "MKLDNN") {
+    return LibraryType::kMKLDNN;
+  } else if (s == "CUDNN") {
+    return LibraryType::kCUDNN;
+  } else if (s == "CPU" || s == "CUDA") {
+    // To be compatible with register macro.
+    // CPU, CUDA, PLAIN are same library type.
+    return LibraryType::kPlain;
+  } else {
+    PADDLE_THROW("Unknown LibraryType %s", s.c_str());
   }
 }
 
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index f8a3be9a82bdbaf82550634d36122eb7bbe85e54..7b6dc09bdb5535488c8c4dbc71c9cd6a7998bd0b 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -189,62 +189,16 @@ void AppendLoD(LoD *lod, const LoD &lod_length) {
 
 void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
                        const platform::DeviceContext &dev_ctx) {
-  // TODO(typhoonzero): serialize to ostream
-  {  // the 1st field, uint32_t version
+  {  // the 1st field, uint32_t version for LoDTensor
     constexpr uint32_t version = 0;
     os.write(reinterpret_cast<const char *>(&version), sizeof(version));
   }
-  {  // the 2nd field, tensor description
-     // int32_t  size
-     // void*    protobuf message
-    proto::TensorDesc desc;
-    desc.set_data_type(framework::ToDataType(tensor.type()));
-    auto dims = framework::vectorize(tensor.dims());
-    auto *pb_dims = desc.mutable_dims();
-    pb_dims->Resize(static_cast<int>(dims.size()), 0);
-    std::copy(dims.begin(), dims.end(), pb_dims->begin());
-    int32_t size = desc.ByteSize();
-    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
-    auto out = desc.SerializeAsString();
-    os.write(out.data(), size);
-  }
-  {  // the 3rd field, tensor data
-    uint64_t size = tensor.memory_size();
-    auto *data_ptr = tensor.data<void>();
-    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
-                   "Index overflow when writing tensor");
-    if (platform::is_gpu_place(tensor.place())) {
-#ifdef PADDLE_WITH_CUDA
-      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
-      std::unique_ptr<char[]> buf(new char[kBufSize]);
-      auto &gpu_dev_ctx =
-          static_cast<const platform::CUDADeviceContext &>(dev_ctx);
-      platform::CPUPlace cpu;
-      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
-      while (size != 0) {
-        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
-        memory::Copy(cpu, buf.get(),
-                     boost::get<platform::CUDAPlace>(tensor.place()),
-                     reinterpret_cast<const void *>(data), size_to_write,
-                     gpu_dev_ctx.stream());
-        gpu_dev_ctx.Wait();
-        os.write(buf.get(), size_to_write);
-        data += size_to_write;
-        size -= size_to_write;
-      }
-#else
-      PADDLE_THROW("Unexpected branch");
-#endif
-    } else {
-      os.write(static_cast<const char *>(data_ptr),
-               static_cast<std::streamsize>(size));
-    }
-  }
-  {  // the 4th field, lod information
-     // uint64_t lod_level
-     // uint64_t lod_level_1 size in byte.
-     // int*     lod_level_1 data
-     // ...
+  {
+    // the 2nd field, LoD information
+    // uint64_t lod_level
+    // uint64_t lod_level_1 size in byte.
+    // int*     lod_level_1 data
+    // ...
     auto lod = tensor.lod();
     uint64_t size = lod.size();
     os.write(reinterpret_cast<const char *>(&size), sizeof(size));
@@ -256,49 +210,19 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
                static_cast<std::streamsize>(size));
     }
   }
+  // the 3rd field, Tensor
+  SerializeToStream(os, static_cast<Tensor>(tensor), dev_ctx);
 }
 
 void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
-  uint32_t version;
-  is.read(reinterpret_cast<char *>(&version), sizeof(version));
-  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-  proto::TensorDesc desc;
-  {  // int32_t size
-     // proto buffer
-    int32_t size;
-    is.read(reinterpret_cast<char *>(&size), sizeof(size));
-    std::unique_ptr<char[]> buf(new char[size]);
-    is.read(reinterpret_cast<char *>(buf.get()), size);
-    PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
-                   "Cannot parse tensor desc");
-  }
-  {  // read tensor
-    std::vector<int64_t> dims;
-    dims.reserve(static_cast<size_t>(desc.dims().size()));
-    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
-    tensor->Resize(framework::make_ddim(dims));
-
-    void *buf;
-    platform::Place cpu = platform::CPUPlace();
-    switch (desc.data_type()) {
-      case proto::FP32:
-        buf = tensor->mutable_data<float>(cpu);
-        break;
-      case proto::FP64:
-        buf = tensor->mutable_data<double>(cpu);
-        break;
-      case proto::INT32:
-        buf = tensor->mutable_data<int>(cpu);
-        break;
-      case proto::INT64:
-        buf = tensor->mutable_data<int64_t>(cpu);
-        break;
-      default:
-        PADDLE_THROW("DataType %d not supported", desc.data_type());
-    }
-    is.read(static_cast<char *>(buf), tensor->memory_size());
-  }
-  {  // read lod
+  {
+    // the 1st field, uint32_t version for LoDTensor
+    uint32_t version;
+    is.read(reinterpret_cast<char *>(&version), sizeof(version));
+    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  }
+  {
+    // the 2nd field, LoD information
     uint64_t lod_level;
     is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
     auto &lod = *tensor->mutable_lod();
@@ -312,6 +236,8 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
       lod[i] = tmp;
     }
   }
+  // the 3rd field, Tensor
+  DeserializeFromStream(is, static_cast<Tensor *>(tensor));
 }
 
 }  // namespace framework
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
index 02d84b68233f2fdfc66e1df2fc7ce20307cadd94..0747c8db531d6ae443d76591b945cce0c9bbea2b 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -126,6 +126,20 @@ TEST_F(LoDTensorTester, ShrinkInLevel) {
   EXPECT_NE(t1.data<float>(), lod_tensor_.data<float>());
 }
 
+TEST_F(LoDTensorTester, SerializeAndDeserialize) {
+  LoDTensor dst_tensor;
+  platform::CPUDeviceContext cpu_ctx((platform::CPUPlace()));
+  std::ostringstream oss;
+  SerializeToStream(oss, lod_tensor_, cpu_ctx);
+  std::istringstream iss(oss.str());
+  DeserializeFromStream(iss, &dst_tensor);
+  float* dst_ptr = dst_tensor.mutable_data<float>(platform::CPUPlace());
+  for (int i = 0; i < kLodTensorSize; ++i) {
+    EXPECT_EQ(dst_ptr[i], i);
+  }
+  EXPECT_EQ(dst_tensor.lod(), lod_tensor_.lod());
+}
+
 TEST(LodExpand, test) {
   LoD lod{{0, 2}};
   LoDTensor tensor;
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index b361e64438251c1df827667fb825e7f5909fb09e..3e58e6442edfe006c8aed238f67b9524783601ee 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -88,6 +88,14 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
   need_update_ = true;
 }
 
+void OpDesc::CopyFrom(const OpDesc &op_desc) {
+  desc_.set_type(op_desc.Type());
+  inputs_ = op_desc.inputs_;
+  outputs_ = op_desc.outputs_;
+  attrs_ = op_desc.attrs_;
+  need_update_ = true;
+}
+
 OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog)
     : desc_(desc), need_update_(false) {
   // restore inputs_
@@ -252,7 +260,13 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
   void operator()(int v) const { attr_->set_i(v); }
   void operator()(float v) const { attr_->set_f(v); }
   void operator()(const std::string &v) const { attr_->set_s(v); }
-  void operator()(bool b) const { attr_->set_b(b); }
+
+  // Please refer to https://github.com/PaddlePaddle/Paddle/issues/7162
+  template <class T,
+            class = typename std::enable_if<std::is_same<bool, T>::value>::type>
+  void operator()(T b) const {
+    attr_->set_b(b);
+  }
 
   void operator()(const std::vector<int> &v) const {
     VectorToRepeated(v, attr_->mutable_ints());
@@ -266,9 +280,7 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
   void operator()(const std::vector<bool> &v) const {
     VectorToRepeated(v, attr_->mutable_bools());
   }
-  void operator()(proto::BlockDesc *desc) const {
-    attr_->set_block_idx(desc->idx());
-  }
+  void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); }
   void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
 };
 
diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h
index 93d4a88f3c390551ab41e42ec2f6f30f52e306db..4cf784a0d0d319d09caa27b4e2b589bd7ac4f324 100644
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -35,6 +35,8 @@ class OpDesc {
 
   OpDesc(const proto::OpDesc &desc, ProgramDesc *prog);
 
+  void CopyFrom(const OpDesc &op_desc);
+
   proto::OpDesc *Proto();
 
   std::string Type() const { return desc_.type(); }
diff --git a/paddle/framework/op_kernel_type.h b/paddle/framework/op_kernel_type.h
index 97b542e345feab0bab701dd967558ce23375dc7f..b06002096fb109da806809f7b908d9768cf095ba 100644
--- a/paddle/framework/op_kernel_type.h
+++ b/paddle/framework/op_kernel_type.h
@@ -68,6 +68,8 @@ struct OpKernelType {
            data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
            library_type_ == o.library_type_;
   }
+
+  bool operator!=(const OpKernelType& o) const { return !(*this == o); }
 };
 
 inline std::ostream& operator<<(std::ostream& os,
@@ -78,5 +80,11 @@ inline std::ostream& operator<<(std::ostream& os,
   return os;
 }
 
+inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
+  std::ostringstream stream;
+  stream << kernel_key;
+  return stream.str();
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/op_kernel_type_test.cc b/paddle/framework/op_kernel_type_test.cc
index 8753d7cc378662ce116e447dc6a340a07e5dd2ca..649afeee8a846b0579545f2edff77e9dbe3b4dd8 100644
--- a/paddle/framework/op_kernel_type_test.cc
+++ b/paddle/framework/op_kernel_type_test.cc
@@ -26,10 +26,8 @@ TEST(OpKernelType, ToString) {
   OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
                               LibraryType::kCUDNN);
 
-  std::ostringstream stream;
-  stream << op_kernel_type;
   ASSERT_EQ(
-      stream.str(),
+      paddle::framework::KernelTypeToString(op_kernel_type),
       "data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]");
 }
 
@@ -48,4 +46,4 @@ TEST(OpKernelType, Hash) {
 
   OpKernelType::Hash hasher;
   ASSERT_NE(hasher(op_kernel_type_1), hasher(op_kernel_type_2));
-}
\ No newline at end of file
+}
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 9bb2a3b5c2931d03152cc3262c0ad8da17b8aacb..d75c0233e8e0134ddf4edc50c07490a234b65cd0 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -37,8 +37,8 @@ class Registrar {
  public:
   // In our design, various kinds of classes, e.g., operators and kernels,
   // have their corresponding registry and registrar. The action of
-  // registration is in the constructor of a global registrar variable, which,
-  // however, are not used in the code that calls package framework, and would
+  // registration is in the constructor of a global registrar variable, which
+  // is not used in the code that calls package framework, and would
   // be removed from the generated binary file by the linker. To avoid such
   // removal, we add Touch to all registrar classes and make USE_OP macros to
   // call this method. So, as long as the callee code calls USE_OP, the global
@@ -79,30 +79,31 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
   using KERNEL_TYPE =
       typename std::tuple_element<I, std::tuple<KernelTypes...>>::type;
 
-  void operator()(const char* op_type) const {
+  void operator()(const char* op_type, const char* library_type) const {
     using T = typename KERNEL_TYPE::ELEMENT_TYPE;
-    OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType());
+    OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
+                     DataLayout::kAnyLayout, StringToLibraryType(library_type));
     OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
 
     constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
     OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
         func;
-    func(op_type);
+    func(op_type, library_type);
   }
 };
 
 template <typename PlaceType, size_t I, typename... KernelType>
 struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
-  void operator()(const char* op_type) const {}
+  void operator()(const char* op_type, const char* library_type) const {}
 };
 
 // User can register many kernel in one place. The data type could be different.
 template <typename PlaceType, typename... KernelType>
 class OpKernelRegistrar : public Registrar {
  public:
-  explicit OpKernelRegistrar(const char* op_type) {
+  explicit OpKernelRegistrar(const char* op_type, const char* library_type) {
     OpKernelRegistrarFunctor<PlaceType, false, 0, KernelType...> func;
-    func(op_type);
+    func(op_type, library_type);
   }
 };
 
@@ -181,7 +182,8 @@ class OpKernelRegistrar : public Registrar {
       __reg_op_kernel_##op_type##_##DEVICE_TYPE##__,                      \
       "REGISTER_OP_KERNEL must be called in global namespace");           \
   static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__> \
-      __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__(#op_type);      \
+      __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__(#op_type,       \
+                                                          #DEVICE_TYPE);  \
   int TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE() {                \
     __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__.Touch();          \
     return 0;                                                             \
diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc
index 4cdf6e0865e0922b72bd184172f85a9c705dcd00..cef530c6e639f6e2188869fa57d114ec6b885aa8 100644
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include "paddle/framework/op_registry.h"
 #include <gtest/gtest.h>
 
@@ -182,3 +196,71 @@ TEST(OperatorRegistrar, Test) {
   using namespace paddle::framework;
   OperatorRegistrar<CosineOpComplete, CosineOpProtoAndCheckerMaker> reg("cos");
 }
+
+namespace paddle {
+namespace framework {
+
+class OpKernelTestMaker : public OpProtoAndCheckerMaker {
+ public:
+  OpKernelTestMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddComment("NoGradOp, same input output. no Grad");
+  }
+};
+
+class OpWithKernelTest : public OperatorWithKernel {
+ public:
+  using OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(InferShapeContext* ctx) const override {}
+
+  framework::OpKernelType GetActualKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(proto::DataType::FP32, ctx.device_context());
+  }
+};
+
+template <typename DeviceContext, typename T>
+class OpKernelTest : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const {}
+};
+
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_OP_WITHOUT_GRADIENT(op_with_kernel,
+                             paddle::framework::OpWithKernelTest,
+                             paddle::framework::OpKernelTestMaker);
+REGISTER_OP_CPU_KERNEL(
+    op_with_kernel,
+    paddle::framework::OpKernelTest<paddle::platform::CPUDeviceContext, float>);
+
+REGISTER_OP_CUDA_KERNEL(op_with_kernel,
+                        paddle::framework::OpKernelTest<
+                            paddle::platform::CUDADeviceContext, float>);
+
+TEST(OperatorRegistrar, CPU) {
+  paddle::framework::proto::OpDesc op_desc;
+  paddle::platform::CPUPlace cpu_place;
+  paddle::framework::Scope scope;
+
+  op_desc.set_type("op_with_kernel");
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+
+  op->Run(scope, cpu_place);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(OperatorRegistrar, CUDA) {
+  paddle::framework::proto::OpDesc op_desc;
+  paddle::platform::CUDAPlace cuda_place(0);
+  paddle::framework::Scope scope;
+
+  op_desc.set_type("op_with_kernel");
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+
+  op->Run(scope, cuda_place);
+}
+#endif
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 886f73e7b81c35cac573bd041e6462eb2111bf85..fc7091f1c89f8b3f998f6d1b68f032b76bad2197 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -384,12 +384,30 @@ class RuntimeInferShapeContext : public InferShapeContext {
   const Scope& scope_;
 };
 
+const platform::DeviceContext* GetDeviceContext(
+    framework::KernelTypePair& kernel_pair) {
+  auto& actual_kernel_key = kernel_pair.first;
+  auto& expected_kernel_key = kernel_pair.second;
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  // For a CPU/GPU pair in either order, return the GPU place's context.
+  if (platform::is_gpu_place(actual_kernel_key.place_) &&
+      platform::is_cpu_place(expected_kernel_key.place_)) {
+    return pool.Get(actual_kernel_key.place_);
+  } else if (platform::is_cpu_place(actual_kernel_key.place_) &&
+             platform::is_gpu_place(expected_kernel_key.place_)) {
+    return pool.Get(expected_kernel_key.place_);
+  } else {
+    PADDLE_THROW(
+        "Currently, model parallelism is only supported between CPU and CUDA");
+  }
+}
+
 void OperatorWithKernel::Run(const Scope& scope,
                              const platform::Place& place) const {
   RuntimeInferShapeContext infer_shape_ctx(*this, scope);
   this->InferShape(&infer_shape_ctx);
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
-  auto dev_ctx = pool.Borrow(place);
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto dev_ctx = pool.Get(place);
 
   // check if op[type] has kernel registered.
   auto& all_op_kernels = AllOpKernels();
@@ -413,37 +431,47 @@ void OperatorWithKernel::Run(const Scope& scope,
   }
 
   if (actual_kernel_key == expected_kernel_key) {
-    kernel_iter->second->Compute(ctx);
+    PADDLE_ENFORCE_EQ(actual_kernel_key.place_, expected_kernel_key.place_,
+                      "Currently, model parallelism is only supported between "
+                      "CPU and other devices. For example, multi-GPU model "
+                      "parallelism will failed.");
   } else {
-    Scope& op_scope = scope.NewScope();
-    auto input_vars = this->InputVars();
-    for (auto var_name : input_vars) {
-      op_scope.Var(var_name);
-    }
-
-    // TODO(qijun) get appropriate DeviceContext from DeviceContext pool
-    platform::DeviceContext* trans_dev_ctx = nullptr;
-    std::vector<platform::DeviceContext*> trans_dev_ctx_vec{trans_dev_ctx};
+    auto kernel_pair = std::make_pair(actual_kernel_key, expected_kernel_key);
+    const DataTransformFn* trans_fun =
+        DataTransformFnMap::Instance().GetNullable(kernel_pair);
+    if (trans_fun) {
+      auto input_vars = this->InputVars();
+      // TODO(qijun) filter the input vars that do not need to be transformed
+
+      // filter vars that has been transformed
+      std::vector<std::string> need_trans;
+      for (auto var_name : input_vars) {
+        auto var_name_trans =
+            var_name + framework::KernelTypeToString(expected_kernel_key);
+        if (!scope.FindVar(var_name_trans)) {
+          const_cast<Scope&>(scope).Var(var_name_trans);
+          need_trans.push_back(var_name);
+        }
+      }
 
-    // TODO(qijun) get appropriate DataTransformFN from global map
-    framework::DataTransformFN trans_fun = nullptr;
+      if (!need_trans.empty()) {
+        auto trans_dev_ctx = GetDeviceContext(kernel_pair);
 
-    // Wait for transform starting
-    dev_ctx->Wait();
+        // Wait for transform starting
+        dev_ctx->Wait();
 
-    for (auto var_name : input_vars) {
-      trans_fun(trans_dev_ctx_vec, *(scope.FindVar(var_name)),
-                op_scope.FindVar(var_name));
-    }
-    // Wait for data transform finishing
-    for (auto ctx : trans_dev_ctx_vec) {
-      ctx->Wait();
+        for (auto var_name : need_trans) {
+          (*trans_fun)(trans_dev_ctx, kernel_pair, *(scope.FindVar(var_name)),
+                       scope.FindVar(var_name + framework::KernelTypeToString(
+                                                    expected_kernel_key)));
+        }
+        // Wait for data transform finishing
+        trans_dev_ctx->Wait();
+      }
     }
-
-    // Create a new ExecutionContext
-    ExecutionContext op_ctx(*this, op_scope, *dev_ctx);
-    kernel_iter->second->Compute(op_ctx);
   }
+
+  kernel_iter->second->Compute(ctx);
 }
 
 OpKernelType OperatorWithKernel::GetActualKernelType(
diff --git a/paddle/framework/selected_rows.cc b/paddle/framework/selected_rows.cc
index c74459c9dd7006a24615b1d6df041583088fb25c..82adfa7123a3cf40d929021602c45fe7d2e34ffa 100644
--- a/paddle/framework/selected_rows.cc
+++ b/paddle/framework/selected_rows.cc
@@ -12,5 +12,58 @@ limitations under the License. */
 #include "paddle/framework/selected_rows.h"
 
 namespace paddle {
-namespace framework {}  // namespace framework
+namespace framework {
+// Writes `selected_rows` to `os` as four consecutive fields: a version
+// number, the row indices, the height, and finally the value Tensor.
+void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
+                       const platform::DeviceContext& dev_ctx) {
+  {  // the 1st field, uint32_t version
+    constexpr uint32_t version = 0;
+    os.write(reinterpret_cast<const char*>(&version), sizeof(version));
+  }
+  {  // the 2nd field, rows information: a uint64_t count, then each row
+    auto& rows = selected_rows.rows();
+    uint64_t size = rows.size();
+    os.write(reinterpret_cast<const char*>(&size), sizeof(size));
+    for (uint64_t i = 0; i < size; ++i) {
+      os.write(reinterpret_cast<const char*>(&rows[i]), sizeof(rows[i]));
+    }
+  }
+  {  // the 3rd field, the height of SelectedRows
+    int64_t height = selected_rows.height();
+    os.write(reinterpret_cast<const char*>(&height), sizeof(height));
+  }
+  // the 4th field, Tensor data
+  SerializeToStream(os, selected_rows.value(), dev_ctx);
+}
+
+// Reads a SelectedRows from `is` in the layout written by SerializeToStream:
+// version, rows, height, then the value Tensor.
+void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows) {
+  // Bind by reference: a by-value copy would be deserialized and then dropped.
+  auto& tensor = *selected_rows->mutable_value();
+  {  // the 1st field, uint32_t version for SelectedRows
+    uint32_t version;
+    is.read(reinterpret_cast<char*>(&version), sizeof(version));
+    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  }
+  {  // the 2nd field, rows information
+    uint64_t size;
+    is.read(reinterpret_cast<char*>(&size), sizeof(size));
+    auto& rows = *selected_rows->mutable_rows();
+    rows.resize(size);
+    for (uint64_t i = 0; i < size; ++i) {
+      is.read(reinterpret_cast<char*>(&rows[i]), sizeof(int64_t));
+    }
+  }
+  {  // the 3rd field, the height of the SelectedRows
+    int64_t height;
+    is.read(reinterpret_cast<char*>(&height), sizeof(int64_t));
+    selected_rows->set_height(height);
+  }
+  // the 4th field, tensor which contains the data
+  DeserializeFromStream(is, &tensor);
+}
+
+}  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/selected_rows.h b/paddle/framework/selected_rows.h
index 0332b91323e3a4b4b80e02302ad3dcafe0986cde..699e392688e9889f050592172f8bfc45f855d0b1 100644
--- a/paddle/framework/selected_rows.h
+++ b/paddle/framework/selected_rows.h
@@ -59,5 +59,14 @@ class SelectedRows {
   int64_t height_;
 };
 
+/*
+ * Serialize/Deserialize SelectedRows to/from a std::ostream/std::istream.
+ * You can pass an ofstream or ostringstream to serialize to a file
+ * or to an in-memory string. GPU tensor will be copied to CPU.
+ */
+void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
+                       const platform::DeviceContext& dev_ctx);
+void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/selected_rows_test.cc b/paddle/framework/selected_rows_test.cc
index 4ee13a65d72e44693573397bb686b355effb2227..75487c4010391aa9e519d73058184fa936dabb84 100644
--- a/paddle/framework/selected_rows_test.cc
+++ b/paddle/framework/selected_rows_test.cc
@@ -43,5 +43,19 @@ TEST_F(SelectedRowsTester, complete_dims) {
   ASSERT_EQ(selected_rows_->GetCompleteDims(), make_ddim({10, 100}));
 }
 
+TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
+  SelectedRows dst_tensor;
+  platform::CPUDeviceContext cpu_ctx(place_);
+  std::ostringstream oss;
+
+  SerializeToStream(oss, *selected_rows_, cpu_ctx);
+
+  std::istringstream iss(oss.str());
+  DeserializeFromStream(iss, &dst_tensor);
+
+  ASSERT_EQ(selected_rows_->rows(), dst_tensor.rows());
+  ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index b9f6884f7c4b6eeacda722762d485dea97fdcb63..341a6949beeb2dfa64b23d2079bd8f48750a94f8 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -178,7 +178,7 @@ class Tensor {
   DDim dims_;
 
   /**
-   * @brief the layout of memory block, default is NCHW.
+   * @brief the layout of memory block, default is NHWC.
    *
    * @note the memory allocation order, describe how weight/data is stored
    *       For example, in 4-D Tensor(rank=4), there are three commonly
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index ca76a9fcb9079bab22f7b192c45903852c91797f..a1b4a03289eca4c8b9d8c23ede4221853cb31f79 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -15,12 +15,13 @@
 #include <gtest/gtest.h>
 #include <string>
 
+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+
 TEST(Tensor, Dims) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor tt;
+  framework::Tensor tt;
   tt.Resize({2, 3, 4});
-  DDim dims = tt.dims();
+  framework::DDim dims = tt.dims();
   ASSERT_EQ(arity(dims), 3);
   for (int i = 0; i < 3; ++i) {
     EXPECT_EQ(i + 2, dims[i]);
@@ -28,12 +29,12 @@ TEST(Tensor, Dims) {
 }
 
 TEST(Tensor, DataAssert) {
-  paddle::framework::Tensor src_tensor;
+  framework::Tensor src_tensor;
 
   bool caught = false;
   try {
     src_tensor.data<double>();
-  } catch (paddle::platform::EnforceNotMet err) {
+  } catch (platform::EnforceNotMet err) {
     caught = true;
     std::string msg =
         "holder_ should not be null\nTensor holds no memory. Call "
@@ -50,61 +51,65 @@ TEST(Tensor, DataAssert) {
    because Memory::Alloc() and Memory::Free() have not been ready.
 */
 TEST(Tensor, MutableData) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
   {
-    Tensor src_tensor;
+    framework::Tensor src_tensor;
     float* p1 = nullptr;
     float* p2 = nullptr;
     // initialization
-    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
+                                        platform::CPUPlace());
     EXPECT_NE(p1, nullptr);
     // set src_tensor a new dim with large size
     // momery is supposed to be re-allocated
-    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CPUPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 4}),
+                                        platform::CPUPlace());
     EXPECT_NE(p2, nullptr);
     EXPECT_NE(p1, p2);
     // set src_tensor a new dim with same size
     // momery block is supposed to be unchanged
-    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CPUPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
+                                        platform::CPUPlace());
     EXPECT_EQ(p1, p2);
     // set src_tensor a new dim with smaller size
     // momery block is supposed to be unchanged
-    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CPUPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
+                                        platform::CPUPlace());
     EXPECT_EQ(p1, p2);
   }
 
 #ifdef PADDLE_WITH_CUDA
   {
-    Tensor src_tensor;
+    framework::Tensor src_tensor;
     float* p1 = nullptr;
     float* p2 = nullptr;
     // initialization
-    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CUDAPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
+                                        platform::CUDAPlace());
     EXPECT_NE(p1, nullptr);
     // set src_tensor a new dim with large size
     // momery is supposed to be re-allocated
-    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CUDAPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 4}),
+                                        platform::CUDAPlace());
     EXPECT_NE(p2, nullptr);
     EXPECT_NE(p1, p2);
     // set src_tensor a new dim with same size
     // momery block is supposed to be unchanged
-    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CUDAPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
+                                        platform::CUDAPlace());
     EXPECT_EQ(p1, p2);
     // set src_tensor a new dim with smaller size
     // momery block is supposed to be unchanged
-    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CUDAPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
+                                        platform::CUDAPlace());
     EXPECT_EQ(p1, p2);
   }
 #endif
 }
 
 TEST(Tensor, ShareDataWith) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
   {
-    Tensor src_tensor;
-    Tensor dst_tensor;
+    framework::Tensor src_tensor;
+    framework::Tensor dst_tensor;
     // Try to share data form uninitialized tensor
     bool caught = false;
     try {
@@ -121,16 +126,18 @@ TEST(Tensor, ShareDataWith) {
     }
     ASSERT_TRUE(caught);
 
-    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
+    src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
+                                 platform::CPUPlace());
     dst_tensor.ShareDataWith(src_tensor);
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
 
 #ifdef PADDLE_WITH_CUDA
   {
-    Tensor src_tensor;
-    Tensor dst_tensor;
-    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CUDAPlace());
+    framework::Tensor src_tensor;
+    framework::Tensor dst_tensor;
+    src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
+                                 platform::CUDAPlace());
     dst_tensor.ShareDataWith(src_tensor);
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
@@ -138,13 +145,12 @@ TEST(Tensor, ShareDataWith) {
 }
 
 TEST(Tensor, Slice) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
   {
-    Tensor src_tensor;
-    src_tensor.mutable_data<int>(make_ddim({5, 3, 4}), CPUPlace());
-    Tensor slice_tensor = src_tensor.Slice(1, 3);
-    DDim slice_dims = slice_tensor.dims();
+    framework::Tensor src_tensor;
+    src_tensor.mutable_data<int>(framework::make_ddim({5, 3, 4}),
+                                 platform::CPUPlace());
+    framework::Tensor slice_tensor = src_tensor.Slice(1, 3);
+    framework::DDim slice_dims = slice_tensor.dims();
     ASSERT_EQ(arity(slice_dims), 3);
     EXPECT_EQ(slice_dims[0], 2);
     EXPECT_EQ(slice_dims[1], 3);
@@ -153,11 +159,12 @@ TEST(Tensor, Slice) {
     uintptr_t src_data_address =
         reinterpret_cast<uintptr_t>(src_tensor.data<int>());
     uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
-        src_tensor.mutable_data<int>(src_tensor.dims(), CPUPlace()));
+        src_tensor.mutable_data<int>(src_tensor.dims(), platform::CPUPlace()));
     uintptr_t slice_data_address =
         reinterpret_cast<uintptr_t>(slice_tensor.data<int>());
-    uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
-        slice_tensor.mutable_data<int>(slice_tensor.dims(), CPUPlace()));
+    uintptr_t slice_mutable_data_address =
+        reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<int>(
+            slice_tensor.dims(), platform::CPUPlace()));
     EXPECT_EQ(src_data_address, src_mutable_data_address);
     EXPECT_EQ(slice_data_address, slice_mutable_data_address);
     EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
@@ -165,22 +172,25 @@ TEST(Tensor, Slice) {
 
 #ifdef PADDLE_WITH_CUDA
   {
-    Tensor src_tensor;
-    src_tensor.mutable_data<double>(make_ddim({6, 9}), CUDAPlace());
-    Tensor slice_tensor = src_tensor.Slice(2, 6);
-    DDim slice_dims = slice_tensor.dims();
+    framework::Tensor src_tensor;
+    src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
+                                    platform::CUDAPlace());
+    framework::Tensor slice_tensor = src_tensor.Slice(2, 6);
+    framework::DDim slice_dims = slice_tensor.dims();
     ASSERT_EQ(arity(slice_dims), 2);
     EXPECT_EQ(slice_dims[0], 4);
     EXPECT_EQ(slice_dims[1], 9);
 
     uintptr_t src_data_address =
         reinterpret_cast<uintptr_t>(src_tensor.data<double>());
-    uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
-        src_tensor.mutable_data<double>(src_tensor.dims(), CUDAPlace()));
+    uintptr_t src_mutable_data_address =
+        reinterpret_cast<uintptr_t>(src_tensor.mutable_data<double>(
+            src_tensor.dims(), platform::CUDAPlace()));
     uintptr_t slice_data_address =
         reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
-    uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
-        slice_tensor.mutable_data<double>(slice_tensor.dims(), CUDAPlace()));
+    uintptr_t slice_mutable_data_address =
+        reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<double>(
+            slice_tensor.dims(), platform::CUDAPlace()));
     EXPECT_EQ(src_data_address, src_mutable_data_address);
     EXPECT_EQ(slice_data_address, slice_mutable_data_address);
     EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
@@ -189,23 +199,19 @@ TEST(Tensor, Slice) {
 }
 
 TEST(Tensor, ReshapeToMatrix) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor src;
-  int* src_ptr = src.mutable_data<int>({2, 3, 4, 9}, CPUPlace());
+  framework::Tensor src;
+  int* src_ptr = src.mutable_data<int>({2, 3, 4, 9}, platform::CPUPlace());
   for (int i = 0; i < 2 * 3 * 4 * 9; ++i) {
     src_ptr[i] = i;
   }
-  Tensor res = ReshapeToMatrix(src, 2);
+  framework::Tensor res = framework::ReshapeToMatrix(src, 2);
   ASSERT_EQ(res.dims()[0], 2 * 3);
   ASSERT_EQ(res.dims()[1], 4 * 9);
 }
 
 TEST(Tensor, Layout) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor src;
-  ASSERT_EQ(src.layout(), DataLayout::kNHWC);
-  src.set_layout(DataLayout::kAnyLayout);
-  ASSERT_EQ(src.layout(), DataLayout::kAnyLayout);
+  framework::Tensor src;
+  ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC);
+  src.set_layout(framework::DataLayout::kAnyLayout);
+  ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
 }
diff --git a/paddle/framework/tensor_util.cc b/paddle/framework/tensor_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7efc649d0bcda67c663d148e83bcbb6789b0f371
--- /dev/null
+++ b/paddle/framework/tensor_util.cc
@@ -0,0 +1,119 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/tensor_util.h"
+
+namespace paddle {
+namespace framework {
+template <typename Predicate, typename DevCtx>
+struct AnyDTypeVisitor {  // per-dtype functor: runs predicate_ over tensor_
+  Predicate predicate_;
+  const Tensor& tensor_;
+  const DevCtx& ctx_;
+  Tensor* out_;  // receives the single bool result
+
+  AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx,
+                  Tensor* out)
+      : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
+
+  template <typename T>
+  void operator()() const {
+    auto t = EigenVector<T>::Flatten(tensor_);
+    auto o = EigenScalar<bool>::From(*out_);
+    // Store in *out_ whether predicate_(t) is true for any element of t.
+    o.device(*ctx_.eigen_device()) = predicate_(t).any();
+  }
+};
+
+template <typename Predicate, typename DevCtx>
+inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
+                    const DevCtx& ctx, framework::Tensor* out) {
+  VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor<Predicate, DevCtx>(
+                                               predicate, tensor, ctx, out));
+}
+
+template <typename Predicate>
+struct AnyVisitor : public boost::static_visitor<bool> {  // visits a Place
+  const framework::Tensor& tensor_;
+  Predicate predicate_;
+
+  AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
+      : tensor_(tensor), predicate_(std::move(predicate)) {}
+
+  template <typename Place>
+  bool operator()(const Place& place) const {
+    framework::Tensor out;  // one-element bool tensor allocated on `place`
+    out.Resize({1});
+    out.mutable_data<bool>(place);
+    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
+    AnyImpl(predicate_, tensor_, *ctx, &out);
+    return this->GetResult(out, place);  // overload chosen by the place type
+  }
+
+  bool GetResult(const framework::Tensor& out,
+                 const platform::CUDAPlace& gpu) const {
+    platform::CPUPlace cpu;
+    framework::Tensor tmp;  // one-element bool tensor on the CPU
+    tmp.Resize({1});
+    tmp.mutable_data<bool>(cpu);
+    auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
+    gpuctx->Wait();
+    CopyFrom(out, cpu, *gpuctx, &tmp);  // copy the device result into tmp
+    gpuctx->Wait();  // NOTE(review): presumably waits for the copy - confirm
+    return GetResult(tmp, cpu);
+  }
+
+  bool GetResult(const framework::Tensor& out,
+                 const platform::CPUPlace& cpu) const {
+    return *out.data<bool>();  // CPU tensor: read the bool directly
+  }
+};
+
+template <typename Predicate>
+inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
+  AnyVisitor<Predicate> visitor(tensor, predicate);
+  auto place = tensor.place();
+  return platform::VisitPlace(place, visitor);
+}
+
+struct HasNANPredicate {
+  template <typename T>
+  auto operator()(const T& eigen_vec) const
+      -> decltype(std::declval<T>().isnan()) {
+    // Element-wise test on eigen_vec: true where the element is NaN.
+    return eigen_vec.isnan();
+  }
+};
+
+bool HasNAN(const framework::Tensor& tensor) {
+  HasNANPredicate predicate;
+  return Any(tensor, predicate);
+}
+
+struct HasInfPredicate {
+  template <typename T>
+  auto operator()(const T& eigen_vec) const
+      -> decltype(std::declval<T>().isinf()) {
+    // Element-wise test on eigen_vec: true where the element is Inf.
+    return eigen_vec.isinf();
+  }
+};
+
+bool HasInf(const framework::Tensor& tensor) {
+  HasInfPredicate predicate;
+  return Any(tensor, predicate);
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/tensor_util.cu b/paddle/framework/tensor_util.cu
new file mode 120000
index 0000000000000000000000000000000000000000..b00e6e59d93328bf3142597ea4de0dc225501e56
--- /dev/null
+++ b/paddle/framework/tensor_util.cu
@@ -0,0 +1 @@
+./tensor_util.cc
\ No newline at end of file
diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h
index ea4e4f22ea82ccc9f8b683d2fd773a7bc37f78a3..6a21f8db1e3966fd23eee0da2346b2d61f9321fb 100644
--- a/paddle/framework/tensor_util.h
+++ b/paddle/framework/tensor_util.h
@@ -13,7 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/framework.pb.h"
 #include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace framework {
@@ -205,5 +209,109 @@ inline void CopyToVector(const Tensor& src, std::vector<T>* dst) {
                src_ptr, size);
 }
 
+// Returns true if a tensor contains NAN, i.e., Not A Number.
+bool HasNAN(const framework::Tensor& tensor);
+
+// Returns true if a tensor contains Inf, i.e., Infinity.
+bool HasInf(const framework::Tensor& tensor);
+
+inline void SerializeToStream(std::ostream& os, const Tensor& tensor,
+                              const platform::DeviceContext& dev_ctx) {
+  // TODO(typhoonzero): serialize to ostream
+  {  // the 1st field, uint32_t version
+    constexpr uint32_t version = 0;
+    os.write(reinterpret_cast<const char*>(&version), sizeof(version));
+  }
+  {  // the 2nd field, tensor description
+     // int32_t  size
+     // void*    protobuf message
+    proto::TensorDesc desc;
+    desc.set_data_type(framework::ToDataType(tensor.type()));
+    auto dims = framework::vectorize(tensor.dims());
+    auto* pb_dims = desc.mutable_dims();
+    pb_dims->Resize(static_cast<int>(dims.size()), 0);
+    std::copy(dims.begin(), dims.end(), pb_dims->begin());
+    int32_t size = desc.ByteSize();
+    os.write(reinterpret_cast<const char*>(&size), sizeof(size));
+    auto out = desc.SerializeAsString();
+    os.write(out.data(), size);
+  }
+  {  // the 3rd field, tensor data
+    uint64_t size = tensor.memory_size();
+    auto* data_ptr = tensor.data<void>();
+    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
+                   "Index overflow when writing tensor");
+    if (platform::is_gpu_place(tensor.place())) {
+#ifdef PADDLE_WITH_CUDA
+      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
+      std::unique_ptr<char[]> buf(new char[kBufSize]);
+      auto& gpu_dev_ctx =
+          static_cast<const platform::CUDADeviceContext&>(dev_ctx);
+      platform::CPUPlace cpu;
+      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
+      while (size != 0) {
+        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
+        memory::Copy(cpu, buf.get(),
+                     boost::get<platform::CUDAPlace>(tensor.place()),
+                     reinterpret_cast<const void*>(data), size_to_write,
+                     gpu_dev_ctx.stream());
+        gpu_dev_ctx.Wait();
+        os.write(buf.get(), size_to_write);
+        data += size_to_write;
+        size -= size_to_write;
+      }
+#else
+      PADDLE_THROW("Unexpected branch");
+#endif
+    } else {
+      os.write(static_cast<const char*>(data_ptr),
+               static_cast<std::streamsize>(size));
+    }
+  }
+}
+
+inline void DeserializeFromStream(std::istream& is, Tensor* tensor) {
+  uint32_t version;  // 1st field: format version
+  is.read(reinterpret_cast<char*>(&version), sizeof(version));
+  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  proto::TensorDesc desc;
+  {  // int32_t size
+     // proto buffer; NOTE(review): size is used unchecked for the allocation
+    int32_t size;
+    is.read(reinterpret_cast<char*>(&size), sizeof(size));
+    std::unique_ptr<char[]> buf(new char[size]);
+    is.read(reinterpret_cast<char*>(buf.get()), size);
+    PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
+                   "Cannot parse tensor desc");
+  }
+  {  // read tensor: shape comes from desc, data is read into CPU memory
+    std::vector<int64_t> dims;
+    dims.reserve(static_cast<size_t>(desc.dims().size()));
+    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
+    tensor->Resize(framework::make_ddim(dims));
+
+    void* buf;
+    platform::Place cpu = platform::CPUPlace();
+    // TODO(Yancey1989): use VisitDataType instead of DataType switch
+    switch (desc.data_type()) {
+      case proto::FP32:
+        buf = tensor->mutable_data<float>(cpu);
+        break;
+      case proto::FP64:
+        buf = tensor->mutable_data<double>(cpu);
+        break;
+      case proto::INT32:
+        buf = tensor->mutable_data<int>(cpu);
+        break;
+      case proto::INT64:
+        buf = tensor->mutable_data<int64_t>(cpu);
+        break;
+      default:
+        PADDLE_THROW("DataType %d not supported", desc.data_type());
+    }
+    is.read(static_cast<char*>(buf), tensor->memory_size());  // raw data bytes
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc
index f388c19f28ed28335818733f946d8eaf18464627..0dc5166fcabf77b48b8681ab1f050e2bc88f44ab 100644
--- a/paddle/framework/tensor_util_test.cc
+++ b/paddle/framework/tensor_util_test.cc
@@ -13,6 +13,7 @@
 
 #include "paddle/framework/tensor_util.h"
 #include <gtest/gtest.h>
+#include <cmath>
 #include <string>
 
 namespace paddle {
@@ -230,5 +231,78 @@ TEST(CopyToVector, Tensor) {
 #endif
 }
 
+TEST(HasNAN, CPU) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  Tensor src;
+  float* buf = src.mutable_data<float>({3}, CPUPlace());
+  buf[0] = 0.0;
+  buf[1] = NAN;
+  buf[2] = 0.0;
+
+  ASSERT_TRUE(HasNAN(src));
+}
+
+TEST(HasInf, CPU) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  Tensor src;
+  double* buf = src.mutable_data<double>({3}, CPUPlace());
+  buf[0] = 1.0;
+  buf[1] = INFINITY;
+  buf[2] = 0.0;
+  ASSERT_TRUE(HasInf(src));
+}
+
+TEST(Tensor, SerializeAndDeserialize) {  // round-trips a 2x3 int tensor
+  framework::Tensor src_tensor;
+  int array[6] = {1, 2, 3, 4, 5, 6};
+  src_tensor.Resize({2, 3});
+  int* src_ptr = src_tensor.mutable_data<int>(platform::CPUPlace());
+  for (int i = 0; i < 6; ++i) {
+    src_ptr[i] = array[i];
+  }
+  {  // CPU: serialize src_tensor, deserialize it, compare with the source
+    framework::Tensor dst_tensor;
+    auto place = new platform::CPUPlace();
+    platform::CPUDeviceContext cpu_ctx(*place);
+    std::ostringstream oss;
+    SerializeToStream(oss, src_tensor, cpu_ctx);
+
+    std::istringstream iss(oss.str());
+    DeserializeFromStream(iss, &dst_tensor);
+    int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
+    for (int i = 0; i < 6; ++i) {  // verify all 6 elements, as the GPU case
+      ASSERT_EQ(dst_ptr[i], array[i]);
+    }
+    delete place;
+  }
+#ifdef PADDLE_WITH_CUDA
+  {  // GPU: copy to the device first, then serialize from the device tensor
+    Tensor gpu_tensor;
+    gpu_tensor.Resize({2, 3});
+    Tensor dst_tensor;
+
+    auto gpu_place = new platform::CUDAPlace();
+    platform::CUDADeviceContext gpu_ctx(*gpu_place);
+
+    CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+
+    std::ostringstream oss;
+    SerializeToStream(oss, gpu_tensor, gpu_ctx);
+
+    std::istringstream iss(oss.str());
+    DeserializeFromStream(iss, &dst_tensor);
+
+    int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
+    for (int i = 0; i < 6; ++i) {
+      ASSERT_EQ(dst_ptr[i], array[i]);
+    }
+
+    delete gpu_place;
+  }
+#endif
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/tensor_util_test.cu b/paddle/framework/tensor_util_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ebd35fdf6c2a1388fec23057070f723c8ef9da9c
--- /dev/null
+++ b/paddle/framework/tensor_util_test.cu
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "gtest/gtest.h"
+#include "paddle/framework/tensor_util.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace framework {
+
+static __global__ void FillNAN(float* buf) {
+  buf[0] = 0.0;
+  buf[1] = 0.1;
+  buf[2] = NAN;
+}
+static __global__ void FillInf(float* buf) {
+  buf[0] = 0.0;
+  buf[1] = INFINITY;
+  buf[2] = 0.5;
+}
+
+TEST(HasNAN, GPU) {
+  Tensor tensor;
+  platform::CUDAPlace gpu(0);
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto* cuda_ctx = pool.GetByPlace(gpu);
+  float* buf = tensor.mutable_data<float>({3}, gpu);
+  FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+  cuda_ctx->Wait();
+  ASSERT_TRUE(HasNAN(tensor));
+}
+
+TEST(HasInf, GPU) {
+  Tensor tensor;
+  platform::CUDAPlace gpu(0);
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto* cuda_ctx = pool.GetByPlace(gpu);
+  float* buf = tensor.mutable_data<float>({3}, gpu);
+  FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+  cuda_ctx->Wait();
+  ASSERT_TRUE(HasInf(tensor));
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h
index 5f6b2d458f7ee764c22d203f285b78023b6012f3..bcd8190755083ec30687675602a1c95a9c15c69e 100644
--- a/paddle/framework/threadpool.h
+++ b/paddle/framework/threadpool.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <condition_variable>
 #include <functional>
+#include <future>
 #include <mutex>
 #include <queue>
 #include <thread>
@@ -25,10 +26,11 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-typedef std::function<void()> Task;
-
 class ThreadPool {
  public:
+  typedef std::packaged_task<void()> Task;
+  typedef std::function<void()> Fun;
+
   /**
    * @brief   Get a instance of threadpool, the thread number will
    *          be specified as the number of hardware thread contexts
@@ -61,13 +63,18 @@ class ThreadPool {
   /**
    * @brief   Push a function to the queue, and will be scheduled and
    *          executed if a thread is available.
-   * @param[in] Task  will be pushed to the task queue.
+   * @param[in] fn    function to run; wrapped in a Task and pushed to the queue.
+   * @return    std::future<void> that can be waited on (f.wait()) until the
+   *            task has finished.
    */
-  void Run(const Task& fn) {
+  std::future<void> Run(const Fun& fn) {
     std::unique_lock<std::mutex> lock(mutex_);
-    tasks_.push(fn);
+    Task task(std::bind(fn));
+    std::future<void> f = task.get_future();
+    tasks_.push(std::move(task));
     lock.unlock();
     scheduled_.notify_one();
+    return f;
   }
 
   /**
@@ -110,7 +117,7 @@ class ThreadPool {
         break;
       }
       // pop a task from the task queue
-      auto task = tasks_.front();
+      auto task = std::move(tasks_.front());
       tasks_.pop();
 
       --available_;
diff --git a/paddle/framework/threadpool_test.cc b/paddle/framework/threadpool_test.cc
index 012d92a5edc415f0bb2f8a0ea38ffeb9549d54fa..50b6238cd8786be9d8cf2d5f821daadea12bd208 100644
--- a/paddle/framework/threadpool_test.cc
+++ b/paddle/framework/threadpool_test.cc
@@ -20,16 +20,21 @@ limitations under the License. */
 namespace framework = paddle::framework;
 
 void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
+  std::vector<std::future<void>> fs;
   for (int i = 0; i < cnt; ++i) {
-    pool->Run([&sum]() { sum.fetch_add(1); });
+    auto f = pool->Run([&sum]() { sum.fetch_add(1); });
+    fs.push_back(std::move(f));
+  }
+  for (auto& f : fs) {
+    f.wait();
   }
 }
 
 TEST(ThreadPool, ConcurrentInit) {
   framework::ThreadPool* pool;
-  int concurrent_cnt = 50;
+  int n = 50;
   std::vector<std::thread> threads;
-  for (int i = 0; i < concurrent_cnt; ++i) {
+  for (int i = 0; i < n; ++i) {
     std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); });
     threads.push_back(std::move(t));
   }
@@ -38,13 +43,13 @@ TEST(ThreadPool, ConcurrentInit) {
   }
 }
 
-TEST(ThreadPool, ConcurrentStart) {
+TEST(ThreadPool, ConcurrentRun) {
   framework::ThreadPool* pool = framework::ThreadPool::GetInstance();
   std::atomic<int> sum(0);
   std::vector<std::thread> threads;
-  int concurrent_cnt = 50;
+  int n = 50;
   // sum = (n * (n + 1)) / 2
-  for (int i = 1; i <= concurrent_cnt; ++i) {
+  for (int i = 1; i <= n; ++i) {
     std::thread t(do_sum, pool, std::ref(sum), i);
     threads.push_back(std::move(t));
   }
@@ -52,5 +57,5 @@ TEST(ThreadPool, ConcurrentStart) {
     t.join();
   }
   pool->Wait();
-  EXPECT_EQ(sum, ((concurrent_cnt + 1) * concurrent_cnt) / 2);
+  EXPECT_EQ(sum, ((n + 1) * n) / 2);
 }
diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc
index bd8973eeb369aabd2c52d4fccf799657c564ee78..7d002b9ea0b597730685ee03b021c4982f787f49 100644
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@@ -74,7 +74,7 @@ const proto::TensorDesc &VarDesc::tensor_desc() const {
     case proto::VarDesc::LOD_TENSOR_ARRAY:
       return desc_.tensor_array().tensor();
     default:
-      PADDLE_THROW("Unexpected branch.");
+      PADDLE_THROW("The type of var '", this->Name(), "' is unsupported.");
   }
 }
 
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index de7b70e271b38ebe3a4c38704d0cced47d010788..cbdbf5335d32d55a0221728758025c9d2cb3e7d1 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -126,14 +126,165 @@ public:
       inputData += inputChannels * inputHeight * inputWidth;
       outputData += outputChannels * outputHeight * outputWidth;
     }
+  }
+};
+
 #ifdef PADDLE_MOBILE_INFERENCE
-    if (Device == DEVICE_TYPE_CPU) {
-      memory_.reset();
+
+/*
+ * \brief Forward calculation of convolution, optimized for mobile.
+ */
+template <DeviceType Device>
+class GemmConvMobileFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    // TODO(hedaoyuan): Need to define some index macros,
+    // to avoid using 0 and 1.
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    real beta;
+    if (outputs[0].getArgType() == ADD_TO) {
+      beta = 1.0;
+    } else {
+      beta = 0.0;
     }
-#endif
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+
+    real* inputData = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* outputData = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+
+    TensorShape colShape;
+    real* colData = NULL;
+
+    size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
+    size_t colWidth = outputHeight * outputWidth;
+    // Max col matrix height 256, Max col matrix width 2048
+    size_t stepColHeight = std::min(colHeight, static_cast<size_t>(256));
+    size_t stepColWidth = std::min(colWidth, static_cast<size_t>(2048));
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+
+      resizeBuffer<Device>(stepColHeight * stepColWidth * sizeof(real));
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
+
+    Im2ColMobileFunctor<real> im2col;
+    size_t inputOffset = imShape.getElements();
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+
+    int nStride = colWidth;
+    int kStride = colHeight;
+    for (size_t i = 0; i < batchSize; i++) {
+      for (size_t g = 0; g < groups_; g++) {
+        if (needIm2col) {
+          real beta_ = beta;
+          for (size_t colHeightStart = 0; colHeightStart < colHeight;
+               colHeightStart += stepColHeight) {
+            for (size_t colWidthStart = 0; colWidthStart < colWidth;
+                 colWidthStart += stepColWidth) {
+              int N = std::min(colWidth - colWidthStart, stepColWidth);
+              int K = std::min(colHeight - colHeightStart, stepColHeight);
+              // im2col
+              im2col(inputData + g * inputOffset,
+                     imShape,
+                     colData,
+                     colShape,
+                     strideH(),
+                     strideW(),
+                     paddingH(),
+                     paddingW(),
+                     dilationH(),
+                     dilationW(),
+                     colHeightStart,
+                     K,
+                     colWidthStart,
+                     N);
+
+              // gemm
+              int M = outputChannels / groups_;
+              BlasGemm<Device, real>::compute(
+                  false,
+                  false,
+                  M,
+                  N,
+                  K,
+                  1.0f,
+                  filterData + g * filterOffset + colHeightStart,
+                  kStride,
+                  colData,
+                  N,
+                  beta_,
+                  outputData + g * outputOffset + colWidthStart,
+                  nStride);
+            }
+            beta_ = 1.0;
+          }
+        } else {
+          int M = outputChannels / groups_;
+          int N = outputHeight * outputWidth;
+          int K = inputChannels / groups_ * filterHeight * filterWidth;
+          BlasGemm<Device, real>::compute(false,
+                                          false,
+                                          M,
+                                          N,
+                                          K,
+                                          1.0f,
+                                          filterData + g * filterOffset,
+                                          K,
+                                          inputData + g * inputOffset,
+                                          N,
+                                          beta,
+                                          outputData + g * outputOffset,
+                                          N);
+        }
+      }
+      inputData += inputChannels * inputHeight * inputWidth;
+      outputData += outputChannels * outputHeight * outputWidth;
+    }
+
+    memory_.reset();
   }
 };
 
+#endif
+
 /*
  * \brief Backward input calculation of convolution.
  */
@@ -348,7 +499,11 @@ public:
   }
 };
 
+#ifdef PADDLE_MOBILE_INFERENCE
+REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvMobileFunction);
+#else
 REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
+#endif
 REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
 REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
index 0c37fc972484bfbede01d23652e384071bf883af..36a9bcf84e4b14965c83627821b71d1c7c0da1b2 100644
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -98,4 +98,54 @@ public:
                   int dilationWidth = 1);
 };
 
+template <class T>
+class Im2ColMobileFunctor {  // im2col restricted to a sub-block of the matrix
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth,
+                  int colHeightStart,  // first col-matrix row to produce
+                  int colHeightSize,   // number of rows to produce
+                  int colWidthStart,   // first col-matrix column to produce
+                  int colWidthSize) {  // number of columns to produce
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputWidth = colShape[4];
+
+    for (int colh = 0; colh < colHeightSize; colh++) {
+      int wOffset = (colHeightStart + colh) % filterWidth;  // filter column
+      int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight;
+      int c_im = (colHeightStart + colh) / filterWidth / filterHeight;
+
+      for (int colw = 0; colw < colWidthSize; colw++) {
+        int h = (colWidthStart + colw) / outputWidth;  // output row
+        int w = (colWidthStart + colw) % outputWidth;  // output column
+
+        int imRowIdx = h * strideHeight + hOffset * dilationHeight;
+        int imColIdx = w * strideWidth + wOffset * dilationWidth;
+        if ((imRowIdx - paddingHeight) < 0 ||
+            (imRowIdx - paddingHeight) >= inputHeight ||
+            (imColIdx - paddingWidth) < 0 ||
+            (imColIdx - paddingWidth) >= inputWidth) {
+          colData[colh * colWidthSize + colw] = static_cast<T>(0);  // padding
+        } else {
+          imRowIdx += c_im * inputHeight - paddingHeight;  // row incl. channel
+          imColIdx -= paddingWidth;
+          colData[colh * colWidthSize + colw] =
+              imData[imRowIdx * inputWidth + imColIdx];
+        }
+      }
+    }
+  }
+};
+
 }  // namespace paddle
diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
index 1f085538d81904dbd5b5d6bcd014adaed22e37d7..3ba866dcdd845403d52f7a85adfef08cbb11c305 100644
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -138,4 +138,86 @@ TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
 
 #endif
 
+template <class T>
+void TestIm2ColMobileFunctor() {
+  for (size_t channels : {32}) {
+    for (size_t inputHeight : {33, 100}) {
+      for (size_t inputWidth : {32, 96}) {
+        for (size_t filterHeight : {5}) {
+          for (size_t filterWidth : {7}) {
+            for (size_t stride : {2}) {
+              for (size_t padding : {1}) {
+                for (size_t dilation : {1, 3}) {
+                  size_t filterSizeH = (filterHeight - 1) * dilation + 1;
+                  size_t filterSizeW = (filterWidth - 1) * dilation + 1;
+                  if (inputHeight + 2 * padding < filterSizeH ||
+                      inputWidth + 2 * padding < filterSizeW)
+                    break;
+                  if (padding >= filterSizeH || padding >= filterSizeW) break;
+                  size_t outputHeight =
+                      (inputHeight - filterSizeH + 2 * padding) / stride + 1;
+                  size_t outputWidth =
+                      (inputWidth - filterSizeW + 2 * padding) / stride + 1;
+
+                  TensorShape imShape =
+                      TensorShape({channels, inputHeight, inputWidth});
+                  TensorShape colShape1 = TensorShape({channels,
+                                                       filterHeight,
+                                                       filterWidth,
+                                                       outputHeight,
+                                                       outputWidth});
+
+                  size_t height = channels * filterHeight * filterWidth;
+                  size_t width = outputHeight * outputWidth;
+                  VectorPtr input1 =
+                      Vector::create(imShape.getElements(), false);
+                  VectorPtr input2 =
+                      Vector::create(imShape.getElements(), false);
+                  MatrixPtr output1 =
+                      Matrix::create(height, width, false, false);
+                  MatrixPtr output2 =
+                      Matrix::create(height, width, false, false);
+                  input1->uniform(0.001, 1);
+                  input2->copyFrom(*input1);
+
+                  Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> im2Col1;
+                  Im2ColMobileFunctor<T> im2Col2;
+                  im2Col1(input1->getData(),
+                          imShape,
+                          output1->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  im2Col2(input2->getData(),
+                          imShape,
+                          output2->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation,
+                          0,
+                          height,
+                          0,
+                          width);
+
+                  autotest::TensorCheckEqual(*output1, *output2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(Im2ColFunctor, Mobile) { TestIm2ColMobileFunctor<float>(); }
+
 }  // namespace paddle
diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt
index 41ead3c5ecef248830cfb0f8be360f21dcd58e7b..3d6ced713f00bd72622d8aeed3967642b6774ffe 100644
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -34,6 +34,16 @@ else()
     message(STATUS "Compile with MKLDNNLayers and MKLDNNActivations")
 endif()
 
+if(NOT WITH_MKLML)
+    file(GLOB_RECURSE MKL_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.h")
+    file(GLOB_RECURSE MKL_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.cpp")
+    list(REMOVE_ITEM GSERVER_HEADER ${MKL_HEADER})
+    list(REMOVE_ITEM GSERVER_SOURCES ${MKL_SOURCES})
+    message(STATUS "Skip compiling with MKLPackedLayers")
+else()
+    message(STATUS "Compile with MKLPackedLayers")
+endif()
+
 if(NOT WITH_GPU)
     list(REMOVE_ITEM GSERVER_HEADER
         layers/CudnnConvBaseLayer.h
diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
index 741984bb68d3881f6ac26eaca7790190ed6e572a..ac217f1363dbd0360645bbe07cd71a17cc931a79 100644
--- a/paddle/gserver/layers/MKLDNNLRNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
@@ -29,7 +29,7 @@ bool MKLDNNLRNLayer::init(const LayerMap& layerMap,
   }
 
   /* the size of inputs for norm-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1UL);
+  CHECK_EQ(config_.inputs_size(), 1);
   const NormConfig& conf = config_.inputs(0).norm_conf();
   localSize_ = conf.size();
   alpha_ = conf.scale();
diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dd75555fae134664d92ba9f8ffdea8af78166b7e
--- /dev/null
+++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp
@@ -0,0 +1,132 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLPackedRecurrentLayer.h"
+
+namespace paddle {
+
+REGISTER_LAYER(mkl_packed_recurrent, MKLPackedRecurrentLayer);
+
+bool MKLPackedRecurrentLayer::init(const LayerMap& layerMap,
+                                   const ParameterMap& parameterMap) {
+  if (!RecurrentLayer::init(layerMap, parameterMap)) return false;
+  packed_weight_.reset(new MKLPackedWeight(weight_->getW()));
+  packed_weight_->pack();  // packed W, consumed by forwardBatch
+  if (needGradient_) {     // transposed pack is only used by backwardBatch
+    packed_weightT_.reset(new MKLPackedWeight(weight_->getW(), true));
+    packed_weightT_->pack();
+  }
+  return true;
+}
+
+void MKLPackedRecurrentLayer::backward(const UpdateCallback& callback) {
+  RecurrentLayer::backward(callback);  // base pass; ends with incUpdate()
+  packed_weight_->pack();  // re-pack: the weight may have just been updated
+  if (needGradient_) {
+    packed_weightT_->pack();  // keep the transposed pack in sync as well
+  }
+}
+
+void MKLPackedRecurrentLayer::forwardBatch(int batchSize,
+                                           size_t numSequences,
+                                           const int* starts) {
+  if (!batchValue_) {  // created on first use
+    batchValue_.reset(new SequenceToBatch(useGpu_));
+  }
+
+  batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_);
+
+  batchValue_->copyFromSeq(*output_.value);  // sequence layout -> batches
+
+  {
+    REGISTER_TIMER_INFO("RecurrentFwBatch", getName().c_str());
+    /* value[n] = act(value[n] + value[n-1] * W), one batch at a time */
+    for (size_t n = 0; n < batchValue_->getNumBatch(); n++) {
+      MatrixPtr batchValue = batchValue_->getBatchValue(n);
+
+      if (n != 0) {  // batch 0 has no predecessor
+        MatrixPtr preBatchValue =
+            batchValue_->getBatchValue(n - 1, batchValue->getHeight());
+
+        packed_weight_->gemm_compute(preBatchValue, batchValue);
+      }
+      Argument arg;
+      arg.value = batchValue;
+      activation_->forward(arg).check();
+    }
+  }
+  batchValue_->copyBackSeq(*output_.value);  // batches -> sequence layout
+}
+
+void MKLPackedRecurrentLayer::backwardBatch(int batchSize,
+                                            size_t numSequences,
+                                            const int* starts) {
+  if (!batchGrad_) {  // created on first use
+    batchGrad_.reset(new SequenceToBatch(useGpu_));
+  }
+  batchGrad_->shareIndexWith(*batchValue_);  // reuse the forward batch index
+
+  size_t numBatch = batchGrad_->getNumBatch();
+  bool backwardByBatch = numBatch < numSequences;  // picks the W-grad path
+
+  batchGrad_->copyFromSeq(*output_.grad);
+  {
+    REGISTER_TIMER_INFO("RecurrentBwData", getName().c_str());
+    /* walk the batches in reverse order */
+    for (int n = (int)numBatch - 1; n >= 0; n--) {
+      MatrixPtr batchGrad = batchGrad_->getBatchValue(n);
+      MatrixPtr batchValue =
+          batchValue_->getBatchValue(n, batchGrad->getHeight());
+
+      Argument arg;
+      arg.value = batchValue;
+      arg.grad = batchGrad;
+      activation_->backward(arg).check();
+
+      if (n != 0) {  // grad[n-1] += grad[n] * W^T; batchValue now holds a grad
+        batchValue = batchGrad_->getBatchValue(n - 1, batchGrad->getHeight());
+        packed_weightT_->gemm_compute(batchGrad, batchValue);
+      }
+
+      if (backwardByBatch && weight_->getWGrad()) {
+        if (n != 0) {
+          /* backward weight: mul() of value[n-1]^T and grad[n] into WGrad */
+          batchValue =
+              batchValue_->getBatchValue(n - 1, batchGrad->getHeight());
+          weight_->getWGrad()->mul(
+              *batchValue->getTranspose(), *batchGrad, 1, 1);
+        }
+      }
+    }
+  }
+
+  batchGrad_->copyBackSeq(*output_.grad);
+
+  if (!backwardByBatch && weight_->getWGrad()) {  // per-sequence W grad
+    REGISTER_TIMER_INFO("RecurrentBwWeight", getName().c_str());
+    for (size_t seq = 0; seq < numSequences; ++seq) {
+      int len = starts[seq + 1] - starts[seq];  // rows in this sequence
+      weight_->getWGrad()->mul(
+          *output_.value
+               ->subMatrix(reversed_ ? starts[seq] + 1 : starts[seq], len - 1)
+               ->getTranspose(),
+          *output_.grad->subMatrix(reversed_ ? starts[seq] : starts[seq] + 1,
+                                   len - 1),
+          1,
+          1);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..bded523a8fbd6ff18f28859bd2a1bf3c1a25e2a0
--- /dev/null
+++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLPackedWeight.h"
+#include "RecurrentLayer.h"
+
+DECLARE_bool(rnn_use_batch);
+
+namespace paddle {
+
+/**
+ * @brief MKLPackedRecurrentLayer is almost the same as RecurrentLayer,
+ * but its batch forward/backward use MKL cblas packed gemm.
+ * More details:
+ * https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/mkl/mkl_packed.md
+ */
+
+class MKLPackedRecurrentLayer : public RecurrentLayer {
+public:
+  explicit MKLPackedRecurrentLayer(const LayerConfig& config)
+      : RecurrentLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void backward(const UpdateCallback& callback) override;
+
+protected:
+  void forwardBatch(int batchSize,
+                    size_t numSequences,
+                    const int* starts) override;
+
+  void backwardBatch(int batchSize,
+                     size_t numSequences,
+                     const int* starts) override;
+
+protected:
+  /// packed_weight_ holds the same data as
+  /// RecurrentLayer::weight_, but in packed form
+  std::unique_ptr<MKLPackedWeight> packed_weight_;
+  /// packed_weightT_ is the packed transpose of that same weight
+  std::unique_ptr<MKLPackedWeight> packed_weightT_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h
new file mode 100644
index 0000000000000000000000000000000000000000..15d5093beb43e2f086601c2616ace033da34f341
--- /dev/null
+++ b/paddle/gserver/layers/MKLPackedWeight.h
@@ -0,0 +1,86 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/math/MathFunctions.h"
+#include "paddle/parameter/Parameter.h"
+#include "paddle/parameter/Weight.h"
+
+namespace paddle {
+
+class MKLPackedWeight {  // owns an MKL-packed copy of a weight matrix
+protected:
+  /// Unpacked weight data, borrowed from the matrix given to the constructor
+  real *weight_;
+  /// Buffer from cblas_sgemm_alloc holding the packed weight; freed in free_()
+  real *packedWeight_;
+  size_t height_;  // rows of the unpacked weight
+  size_t width_;   // columns of the unpacked weight
+  bool transW_;    // pack (and multiply by) the transposed weight
+
+public:
+  explicit MKLPackedWeight(MatrixPtr weight, bool transW = false) {
+    packedWeight_ = nullptr;  // allocated on the first pack()
+    weight_ = weight->getData();
+    height_ = weight->getHeight();
+    width_ = weight->getWidth();
+    transW_ = transW;
+  }
+
+  ~MKLPackedWeight() { free_(); }
+
+  void pack() { pack_(weight_); }  // (re)pack from the current weight data
+
+  void gemm_compute(const MatrixPtr src, MatrixPtr dst) {
+    cblas_sgemm_compute(CblasRowMajor,
+                        CblasNoTrans,
+                        CblasPacked,                 // B is the packed weight
+                        src->getHeight(),            // m
+                        transW_ ? height_ : width_,  // n
+                        transW_ ? width_ : height_,  // k
+                        src->getData(),
+                        src->getWidth(),
+                        packedWeight_,
+                        width_,
+                        1.0,  // beta = 1: the product is added to dst
+                        dst->getData(),
+                        dst->getWidth());
+  }
+
+protected:
+  void pack_(real *src) {
+    if (!packedWeight_) {  // allocate once, reuse on later packs
+      packedWeight_ = cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_);
+    }
+    cblas_sgemm_pack(CblasRowMajor,
+                     CblasBMatrix,
+                     transW_ ? CblasTrans : CblasNoTrans,
+                     1,                           // m
+                     transW_ ? height_ : width_,  // n
+                     transW_ ? width_ : height_,  // k
+                     1.0,                         // alpha
+                     src,
+                     width_,  // leading dimension of src
+                     packedWeight_);
+  }
+
+  void free_() {
+    if (packedWeight_) {  // nothing to release if pack() was never called
+      cblas_sgemm_free(packedWeight_);
+    }
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp
index e4c2b483d2fa4032735858dab17647592791a9c7..6bd42c06cadf755e8703f3fc299d0e6248dd1478 100644
--- a/paddle/gserver/layers/RecurrentLayer.cpp
+++ b/paddle/gserver/layers/RecurrentLayer.cpp
@@ -12,119 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <gflags/gflags.h>
-#include "Layer.h"
-#include "SequenceToBatch.h"
-#include "paddle/utils/Stat.h"
+#include "RecurrentLayer.h"
 
 DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation.");
 
 namespace paddle {
 
-/**
- * @brief RecurrentLayer takes 1 input layer. The output size is the same with
- * input layer.
- * For each sequence [start, end] it performs the following computation:
- * \f[
- *    out_{i} = act(in_{i})     \      \      \text{for} \ i = start \\
- *    out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end
- *
- * \f]
- * If reversed is true, the order is reversed:
- * \f[
- *   out_{i} = act(in_{i})           \    \   \text{for} \ i = end  \\
- *   out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end
- * \f]
- * There are two methods to calculate rnn. One way is to compute rnn one
- * sequence by one sequence. The other way is to reorganize the input
- * into batches, then compute rnn one batch by one batch. Users can select
- * them by rnn_use_batch flag.
- */
-
-class RecurrentLayer : public Layer {
-public:
-  explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback) override;
-
-  void resetState() override;
-
-  void setState(LayerStatePtr state) override;
-
-  LayerStatePtr getState() override;
-
-protected:
-  /**
-   * @brief If user do not set --rnn_use_batch=true, it will
-   * compute rnn forward one sequence by one sequence in default.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void forwardSequence(int batchSize, size_t numSequences, const int* starts);
-  /**
-   * @brief Compute rnn forward by one sequence.
-   * @param start The start position of this sequence (or sample).
-   * @param length The length of this sequence (or sample), namely the words
-   * number of this sequence.
-   */
-  void forwardOneSequence(int start, int length);
-  /**
-   * @brief Compute rnn backward one sequence by onesequence.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void backwardSequence(int batchSize, size_t numSequences, const int* starts);
-  /**
-   * @brief Compute rnn backward by one sequence.
-   * @param start The start position of this sequence (or sample).
-   * @param length The length of this sequence (or sample), namely the words
-   * number of this sequence.
-   */
-  void backwardOneSequence(int start, int length);
-
-  /**
-   * @brief Reorganize input into batches and compute rnn forward batch
-   * by batch. It will convert batch shape to sequence after finishing forward.
-   * The batch info can refer to SequenceToBatch class.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void forwardBatch(int batchSize, size_t numSequences, const int* starts);
-
-  /**
-   * @brief Reorganize input into batches and compute rnn forward batch
-   * by batch.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void backwardBatch(int batchSize, size_t numSequences, const int* starts);
-
-protected:
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> bias_;
-
-  /// frameOutput_[i] is used to hold the i-th sample of output_
-  std::vector<Argument> frameOutput_;
-  MatrixPtr prevOutput_;
-  /// Whether compute rnn by reverse.
-  bool reversed_;
-  /// If compute batch by batch, batchValue_ will be used to save the
-  /// reorganized input value.
-  std::unique_ptr<SequenceToBatch> batchValue_;
-  /// If compute batch by batch, batchGrad_ will be used to save the
-  /// gradient with respect to reorganized input value.
-  std::unique_ptr<SequenceToBatch> batchGrad_;
-};
-
 REGISTER_LAYER(recurrent, RecurrentLayer);
 
 bool RecurrentLayer::init(const LayerMap& layerMap,
@@ -260,7 +153,6 @@ void RecurrentLayer::backward(const UpdateCallback& callback) {
     bias_->getWGrad()->collectBias(*output_.grad, 1);
     bias_->getParameterPtr()->incUpdate(callback);
   }
-
   weight_->getParameterPtr()->incUpdate(callback);
 }
 
diff --git a/paddle/gserver/layers/RecurrentLayer.h b/paddle/gserver/layers/RecurrentLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..f40dbe150fa93becfc26f6ea9e55e40eaf208860
--- /dev/null
+++ b/paddle/gserver/layers/RecurrentLayer.h
@@ -0,0 +1,130 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <gflags/gflags.h>
+#include "Layer.h"
+#include "SequenceToBatch.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief RecurrentLayer takes 1 input layer. The output size is the same as
+ * the input layer.
+ * For each sequence [start, end] it performs the following computation:
+ * \f[
+ *    out_{i} = act(in_{i})     \      \      \text{for} \ i = start \\
+ *    out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end
+ *
+ * \f]
+ * If reversed is true, the order is reversed:
+ * \f[
+ *   out_{i} = act(in_{i})           \    \   \text{for} \ i = end  \\
+ *   out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end
+ * \f]
+ * There are two methods to calculate rnn. One way is to compute rnn one
+ * sequence by one sequence. The other way is to reorganize the input
+ * into batches, then compute rnn one batch by one batch. Users can select
+ * between them with the rnn_use_batch flag.
+ */
+
+class RecurrentLayer : public Layer {
+public:
+  explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback& callback) override;
+
+  void resetState() override;
+
+  void setState(LayerStatePtr state) override;
+
+  LayerStatePtr getState() override;
+
+protected:
+  /**
+   * @brief If the user does not set --rnn_use_batch=true, rnn forward is
+   * computed one sequence by one sequence by default.
+   * @param batchSize Total words number of all samples in this batch.
+   * @param numSequences The sample number.
+   * @param starts The start position of each sample.
+   */
+  void forwardSequence(int batchSize, size_t numSequences, const int* starts);
+  /**
+   * @brief Compute rnn forward by one sequence.
+   * @param start The start position of this sequence (or sample).
+   * @param length The length of this sequence (or sample), namely the words
+   * number of this sequence.
+   */
+  void forwardOneSequence(int start, int length);
+  /**
+   * @brief Compute rnn backward one sequence by one sequence.
+   * @param batchSize Total words number of all samples in this batch.
+   * @param numSequences The sample number.
+   * @param starts The start position of each sample.
+   */
+  void backwardSequence(int batchSize, size_t numSequences, const int* starts);
+  /**
+   * @brief Compute rnn backward by one sequence.
+   * @param start The start position of this sequence (or sample).
+   * @param length The length of this sequence (or sample), namely the words
+   * number of this sequence.
+   */
+  void backwardOneSequence(int start, int length);
+
+  /**
+   * @brief Reorganize input into batches and compute rnn forward batch
+   * by batch. It will convert batch shape to sequence after finishing forward.
+   * The batch info can refer to SequenceToBatch class.
+   * @param batchSize Total words number of all samples in this batch.
+   * @param numSequences The sample number.
+   * @param starts The start position of each sample.
+   */
+  virtual void forwardBatch(int batchSize,
+                            size_t numSequences,
+                            const int* starts);
+
+  /**
+   * @brief Reorganize the output gradient into batches and compute rnn
+   * backward batch by batch.
+   * @param batchSize Total words number of all samples in this batch.
+   * @param numSequences The sample number.
+   * @param starts The start position of each sample.
+   */
+  virtual void backwardBatch(int batchSize,
+                             size_t numSequences,
+                             const int* starts);
+
+protected:
+  std::unique_ptr<Weight> weight_;
+  std::unique_ptr<Weight> bias_;
+
+  /// frameOutput_[i] is used to hold the i-th sample of output_
+  std::vector<Argument> frameOutput_;
+  MatrixPtr prevOutput_;
+  /// Whether to compute rnn in reverse order.
+  bool reversed_;
+  /// If compute batch by batch, batchValue_ will be used to save the
+  /// reorganized input value.
+  std::unique_ptr<SequenceToBatch> batchValue_;
+  /// If compute batch by batch, batchGrad_ will be used to save the
+  /// gradient with respect to reorganized input value.
+  std::unique_ptr<SequenceToBatch> batchGrad_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index a2f07937b8834e3f3fa7a6bf2ae10f29a8d84f29..ba83667ebc9a89c37f77a7f71e6df90b54723cc0 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1472,7 +1472,8 @@ TEST(Layer, RecurrentLayer) {
     for (auto reversed : {false, true}) {
       config.layerConfig.set_reversed(reversed);
       config.testState = !reversed;
-      testLayerGrad(config, "recurrent", 50, /* trans= */ false, useGpu);
+      testLayerGrad(
+          config, "recurrent", 50, /* trans= */ false, useGpu, false, 1.0);
     }
   }
 }
@@ -1494,7 +1495,8 @@ TEST(Layer, LstmLayer) {
     for (auto reversed : {false, true}) {
       config.layerConfig.set_reversed(reversed);
       config.testState = !reversed;
-      testLayerGrad(config, "lstmemory", 100, /* trans= */ false, useGpu);
+      testLayerGrad(
+          config, "lstmemory", 100, /* trans= */ false, useGpu, false, 0.02);
     }
   }
   for (auto useGpu : {true}) {
diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp
index 16ab0e6aecb6a895b20389992a44dc542eb3b00a..0e130843339a1030f86f4d48891499bac20e9ca2 100644
--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -222,6 +222,7 @@ TEST(Layer, RecurrentLayer) {
 #define protected public
 #include "paddle/gserver/layers/GatedRecurrentLayer.h"
 #include "paddle/gserver/layers/LstmLayer.h"
+#include "paddle/gserver/layers/RecurrentLayer.h"
 template <class T>
 class TestRecurrentLayer {
 public:
@@ -420,12 +421,151 @@ TEST(Layer, LstmLayer) {
   }
 }
 
+#ifdef PADDLE_WITH_MKLML
+
+#include "paddle/gserver/layers/MKLPackedRecurrentLayer.h"
+
+LayerPtr initMKLPackedLayer(LayerConfig layerConfig,  // by value: edited below
+                            bool reversed,
+                            int layerSize,
+                            LayerPtr dataLayer,
+                            ParameterPtr para,
+                            ParameterPtr bias = nullptr) {
+  LayerMap layerMap;
+  ParameterMap parameterMap;
+  layerMap[dataLayer->getName()] = dataLayer;
+  parameterMap[para->getName()] = para;
+  if (bias) {  // bias is optional; register it only when supplied
+    parameterMap[bias->getName()] = bias;
+    layerConfig.set_bias_parameter_name("bias_0");
+  }
+
+  layerConfig.set_size(layerSize);
+  layerConfig.set_reversed(reversed);
+  layerConfig.add_inputs();
+  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
+  input.set_input_layer_name("layer_0");      // name used by the caller
+  input.set_input_parameter_name("para_0");   // name used by the caller
+
+  LayerPtr testLayer = Layer::create(layerConfig);
+  layerMap[testLayer->getName()] = testLayer;
+
+  testLayer->init(layerMap, parameterMap);
+  testLayer->setNeedGradient(true);
+
+  return testLayer;
+}
+
+void checkMKLPackedLayer(LayerConfig layerConfig1,
+                         LayerConfig layerConfig2,
+                         bool reversed,
+                         int layerSize,
+                         int batchSize,
+                         bool useBatch1,
+                         bool useBatch2) {
+  LayerPtr dataLayer;
+  ParameterPtr para, bias;
+
+  if (layerConfig1.type() == "recurrent") {  // plain rnn: no bias
+    dataLayer = creatDataLayer("layer_0", batchSize, layerSize, false);
+    para = creatParameter("para_0", 0, layerSize * layerSize, false);
+    bias = nullptr;
+  } else if (layerConfig1.type() == "gated_recurrent") {  // 3x-wide buffers
+    dataLayer = creatDataLayer("layer_0", batchSize, layerSize * 3, false);
+    para = creatParameter("para_0", 0, layerSize * layerSize * 3, false);
+    bias = creatParameterBias("bias_0", 1, layerSize * 3, false);
+  }
+
+  LayerPtr testLayer1 = initMKLPackedLayer(
+      layerConfig1, reversed, layerSize, dataLayer, para, bias);
+  LayerPtr testLayer2 = initMKLPackedLayer(
+      layerConfig2, reversed, layerSize, dataLayer, para, bias);
+
+  const VectorPtr& weightGrad =
+      (testLayer1->getParameters()[0])->getBuf(PARAMETER_GRADIENT);
+  const MatrixPtr& inputGrad = testLayer1->getPrev(0)->getOutputGrad();
+  CpuVector wgt_grad1(weightGrad->getSize());
+  CpuVector wgt_grad2(weightGrad->getSize());
+  CpuMatrix input_grad1(inputGrad->getHeight(), inputGrad->getWidth());
+  CpuMatrix input_grad2(inputGrad->getHeight(), inputGrad->getWidth());
+
+  for (int i = 0; i < 2; i++) {  // two forward/backward rounds
+    FLAGS_rnn_use_batch = useBatch1;
+
+    testLayer1->forward(PASS_GC);
+
+    FLAGS_rnn_use_batch = useBatch2;
+    testLayer2->forward(PASS_GC);
+
+    testLayer1->getOutputGrad()->randomizeUniform();
+    testLayer2->getOutputGrad()->copyFrom(*testLayer1->getOutputGrad());
+
+    weightGrad->zero();  // both layers were built on the same para/dataLayer
+    inputGrad->zero();
+    FLAGS_rnn_use_batch = useBatch1;
+    testLayer1->backward(nullptr);
+
+    wgt_grad1.copyFrom(*weightGrad);  // snapshot before layer 2 runs
+    input_grad1.copyFrom(*inputGrad);
+
+    weightGrad->zero();
+    inputGrad->zero();
+    FLAGS_rnn_use_batch = useBatch2;
+    testLayer2->backward(nullptr);
+
+    wgt_grad2.copyFrom(*weightGrad);
+    input_grad2.copyFrom(*inputGrad);
+
+    checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue());
+    checkError(wgt_grad1, wgt_grad2);
+    checkError(input_grad1, input_grad2);
+  }
+}
+
+TEST(MKLPackedLayer, RecurrentLayer) {
+  LayerConfig layerConfig1;  // reference: the plain "recurrent" layer
+  LayerConfig layerConfig2;  // under test: "mkl_packed_recurrent"
+
+  layerConfig1.set_name("paddle-rnn");
+  layerConfig1.set_type("recurrent");
+  layerConfig1.set_active_type("relu");
+
+  layerConfig2.set_name("mkl-packed-rnn");
+  layerConfig2.set_type("mkl_packed_recurrent");
+  layerConfig2.set_active_type("relu");
+
+  FLAGS_use_gpu = false;  // this comparison runs on CPU
+
+  for (auto layerSize : {32, 64, 128, 256, 512}) {
+    for (auto batchSize : {1, 5, 100, 500}) {
+      for (auto reversed : {true, false}) {
+        for (auto paddle_use_batch : {true, false}) {
+          for (auto MKLPacked_use_batch : {true, false}) {
+            LOG(INFO) << " layerSize=" << layerSize
+                      << " batchSize=" << batchSize << " reversed=" << reversed
+                      << " paddle_use_batch=" << paddle_use_batch
+                      << " MKLPacked_use_batch=" << MKLPacked_use_batch;
+
+            checkMKLPackedLayer(layerConfig1,
+                                layerConfig2,
+                                reversed,
+                                layerSize,
+                                batchSize,
+                                paddle_use_batch,
+                                MKLPacked_use_batch);
+          }
+        }
+      }
+    }
+  }
+}
+#endif
+
 int main(int argc, char** argv) {
-  if (version::isWithGpu()) {
-    testing::InitGoogleTest(&argc, argv);
-    initMain(argc, argv);
-    return RUN_ALL_TESTS();
-  } else {
-    return 0;
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  if (!version::isWithGpu()) {
+    testing::GTEST_FLAG(filter) = "-Layer.*";  // exclude the Layer.* tests
   }
+  return RUN_ALL_TESTS();
 }
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 5aaaf993323c2d4dbef688d0977ec6374fde6512..77b52eb1760c7b79aa47efb447a4c99b6ab5e027 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -1,5 +1,6 @@
 file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
 string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
+set(DEPS_OPS "")
 set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/pybind/pybind.h)
 file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt.  DO NOT EDIT!\n\n")
 function(op_library TARGET)
@@ -48,6 +49,10 @@ function(op_library TARGET)
         message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
     endif()
 
+    list(LENGTH op_library_DEPS op_library_DEPS_len)
+    if (${op_library_DEPS_len} GREATER 0)
+        set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
+    endif()
     if (WITH_GPU)
         nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
                 ${op_common_deps})
@@ -56,106 +61,28 @@ function(op_library TARGET)
                 ${op_common_deps})
     endif()
 
-    # net_op doesn't need pybind
-    if ("${TARGET}" STREQUAL "net_op")
-        set(pybind_flag 1)
-    endif()
-
-    if ("${TARGET}" STREQUAL "compare_op")
-        set(pybind_flag 1)
-        file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
-    endif()
-
-    # conv_op contains several operators
-    if ("${TARGET}" STREQUAL "conv_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
-    endif()
-
-    # conv_cudnn_op contains several operators
-    if ("${TARGET}" STREQUAL "conv_cudnn_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(conv2d_cudnn);\n")
-    endif()
-
-    # pool_op contains several operators
-    if ("${TARGET}" STREQUAL "pool_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
-    endif()
-
-    # pool_cudnn_op contains several operators
-    if ("${TARGET}" STREQUAL "pool_cudnn_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
-    endif()
-
-    if ("${TARGET}" STREQUAL "logical_op")
-        set(pybind_flag 1)
-        file(APPEND ${pybind_file} "USE_OP(logical_and);\n")
-    endif()
-
-    # pool_with_index_op contains several operators
-    if ("${TARGET}" STREQUAL "pool_with_index_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
-    endif()
-
-    # conv_transpose_op contains several operators
-    if ("${TARGET}" STREQUAL "conv_transpose_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n")
-    endif()
-
-    # conv_transpose_cudnn_op contains two operators
-    if ("${TARGET}" STREQUAL "conv_transpose_cudnn_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(conv2d_transpose_cudnn);\n")
-    endif()
-
-    # save_restore_op contains several operators
-    if ("${TARGET}" STREQUAL "save_restore_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(save);\n")
-    endif()
-
-    # activation_op contains several operators
-    if ("${TARGET}" STREQUAL "activation_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(sigmoid);\n")
-    endif()
-
-    # nccl_op contains several operators
-    if ("${TARGET}" STREQUAL "nccl_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
-    endif()
-
-    # reduce_op contains several operators
-    if ("${TARGET}" STREQUAL "reduce_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
-    endif()
+    # Define operators that don't need pybind here.
+    foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
+        if ("${TARGET}" STREQUAL "${manual_pybind_op}")
+            set(pybind_flag 1)
+        endif()
+    endforeach()
 
-    if ("${TARGET}" STREQUAL "tensor_array_read_write_op")
-        set(pybind_flag 1)
-        file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(write_to_array);\n")
+    # For the registration of USE_OP, please refer to paddle/framework/op_registry.h.
+    # Note that it's enough to add just one operator to pybind per *_op.cc file.
+    # For detailed pybind information, please see the generated paddle/pybind/pybind.h.
+    file(READ ${TARGET}.cc TARGET_CONTENT)
+    string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}")
+    string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}")
+    if (one_register STREQUAL "")
+        string(REPLACE "_op" "" TARGET "${TARGET}")
+    else ()
+        string(REPLACE "REGISTER_OP(" "" TARGET "${one_register}")
+        string(REPLACE "," "" TARGET "${TARGET}")
     endif()
 
     # pybind USE_NO_KERNEL_OP
     # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel
-    file(READ ${TARGET}.cc TARGET_CONTENT)
     string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
     string(REPLACE "_op" "" TARGET "${TARGET}")
     if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
@@ -166,7 +93,6 @@ function(op_library TARGET)
     # pybind USE_CPU_ONLY_OP
     list(LENGTH cu_srcs cu_srcs_len)
     list(LENGTH cu_cc_srcs cu_cc_srcs_len)
-
     if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
         file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
         set(pybind_flag 1)
@@ -181,58 +107,31 @@ endfunction()
 add_subdirectory(math)
 add_subdirectory(nccl)
 
-set(DEPS_OPS
-    cond_op
-    cross_entropy_op
-    recurrent_op
-    softmax_with_cross_entropy_op
-    softmax_op
-    sequence_softmax_op
-    sum_op
-    pool_op
-    maxout_op
-    unpool_op
-    pool_with_index_op
-    conv_op
-    conv_transpose_op
-    nccl_op
-    sequence_conv_op
-    sequence_pool_op
-    lod_rank_table_op
-    lod_tensor_to_array_op
-    array_to_lod_tensor_op
-    max_sequence_len_op
-    lstm_op
-    tensor_array_read_write_op
-    gru_op
-    adagrad_op
-    sgd_op
-    save_op
-    load_op
-    send_op
-    recv_op)
+if(WITH_GPU)
+    op_library(nccl_op DEPS nccl_common)
+    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
+else()
+    set(DEPS_OPS ${DEPS_OPS} nccl_op)
+endif()
 
 if(WITH_DISTRIBUTE)
-add_subdirectory(detail)
-op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
-set_source_files_properties(
-    send_op.cc
-    PROPERTIES
-    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-
-op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
-set_source_files_properties(
-    recv_op.cc
-    PROPERTIES
-    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-
-cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
+    add_subdirectory(detail)
+    set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
+    set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+    op_library(send_op DEPS ${DISTRIBUTE_DEPS})
+    set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
+    set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
+else()
+    set(DEPS_OPS ${DEPS_OPS} send_op recv_op)
 endif()
 
-op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
+op_library(cond_op DEPS framework_proto tensor net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(softmax_op DEPS softmax)
+op_library(detection_output_op DEPS softmax)
 op_library(sequence_softmax_op DEPS softmax)
 op_library(sum_op DEPS selected_rows_functor)
 op_library(sgd_op DEPS selected_rows_functor)
@@ -242,21 +141,17 @@ op_library(pool_op DEPS pooling)
 op_library(maxout_op DEPS maxouting)
 op_library(unpool_op DEPS unpooling)
 op_library(pool_with_index_op DEPS pooling)
-op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
-op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)
-op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op)
-op_library(max_sequence_len_op SRCS max_sequence_len_op.cc DEPS lod_rank_table)
-op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
-if(WITH_GPU)
-op_library(nccl_op DEPS nccl_common)
-endif()
+op_library(lod_rank_table_op DEPS lod_rank_table)
+op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
+op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
+op_library(max_sequence_len_op DEPS lod_rank_table)
 op_library(sequence_conv_op DEPS context_project)
 op_library(sequence_pool_op DEPS sequence_pooling)
 op_library(lstm_op DEPS sequence2batch lstm_compute)
 op_library(conv_transpose_op DEPS vol2col)
 op_library(gru_op DEPS sequence2batch gru_compute)
-op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
-
+op_library(recurrent_op DEPS executor)
+op_library(cos_sim_op DEPS cos_sim_functor)
 # FIXME(typhoonzero): save/load depends lodtensor serialization functions
 op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
@@ -265,9 +160,10 @@ list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
     op_library(${src})
 endforeach()
+file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
 
-set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 
+set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 
 
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
@@ -276,6 +172,6 @@ cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
 cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
 if(WITH_GPU)
-  cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
+    cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc
index 052c793a01907abdc7784d1290f43543ae81bdb1..c83318a272302a474c37ce86365201acf56b9cad 100644
--- a/paddle/operators/adagrad_op.cc
+++ b/paddle/operators/adagrad_op.cc
@@ -105,48 +105,18 @@ struct SparseAdagradFunctor<platform::CPUDeviceContext, T> {
                   const framework::Tensor& learning_rate, T epsilon,
                   framework::Tensor* moment, framework::Tensor* param) {
     // 1. g_m.rows = set(g.rows)
-    auto grad_rows = grad.rows();
-    std::set<int64_t> row_set(grad_rows.begin(), grad_rows.end());
-    std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
-
     auto grad_width = grad.value().dims()[1];
-    std::unique_ptr<framework::SelectedRows> grad_merge{
-        new framework::SelectedRows()};
-    grad_merge->set_rows(merge_rows);
-    grad_merge->set_height(grad.height());
-    grad_merge->mutable_value()->mutable_data<T>(
-        framework::make_ddim(
-            {static_cast<int64_t>(merge_rows.size()), grad_width}),
-        context.GetPlace());
-
-    math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
-    constant_functor(context, grad_merge->mutable_value(), 0.0);
-
-    auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
-    auto* grad_data = grad.value().data<T>();
-
-    for (size_t i = 0; i < grad_rows.size(); i++) {
-      size_t grad_merge_i = FindPos(merge_rows, grad_rows[i]);
-      for (int64_t j = 0; j < grad_width; j++) {
-        grad_merge_data[grad_merge_i * grad_width + j] +=
-            grad_data[i * grad_width + j];
-      }
-    }
+    math::scatter::MergeAdd<platform::CPUDeviceContext, T> merge_func;
+    auto grad_merge = merge_func(context, grad);
+    auto& merge_rows = grad_merge.rows();
+    auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
 
     // 2. m += g_m * g_m
-    std::unique_ptr<framework::SelectedRows> grad_square{
-        new framework::SelectedRows()};
-    grad_square->set_rows(grad_merge->rows());
-    grad_square->set_height(grad_merge->height());
-    grad_square->mutable_value()->mutable_data<T>(grad_merge->value().dims(),
-                                                  context.GetPlace());
-    auto gs =
-        framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
-    auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
-    gs.device(*context.eigen_device()) = gm * gm;
+    math::scatter::Mul<platform::CPUDeviceContext, T> sqare_func;
+    auto grad_square = sqare_func(context, grad_merge, grad_merge);
 
     math::SelectedRowsAddToTensor<platform::CPUDeviceContext, T> functor;
-    functor(context, *grad_square, moment);
+    functor(context, grad_square, moment);
 
     // 3. update parameter
     auto* lr = learning_rate.data<T>();
diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu
index 75bc7affd6c78beb783e01682b4538f2c259df26..4e579387924a5b0499f29609bc6b1322030a3c0d 100644
--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/operators/adagrad_op.cu
@@ -78,62 +78,30 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
                   const framework::Tensor& learning_rate, T epsilon,
                   framework::Tensor* moment, framework::Tensor* param) {
     // 1. g_m.rows = set(g.rows)
-    auto grad_rows = grad.rows();
-    std::set<int64_t> row_set(grad_rows.begin(), grad_rows.end());
-    std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
-
     auto grad_width = grad.value().dims()[1];
-    std::unique_ptr<framework::SelectedRows> grad_merge{
-        new framework::SelectedRows()};
-    grad_merge->set_rows(merge_rows);
-    grad_merge->set_height(grad.height());
-    grad_merge->mutable_value()->mutable_data<T>(
-        framework::make_ddim(
-            {static_cast<int64_t>(merge_rows.size()), grad_width}),
-        context.GetPlace());
-
-    math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
-    constant_functor(context, grad_merge->mutable_value(), 0.0);
-
-    auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
-    auto* grad_data = grad.value().data<T>();
-
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid1(1, grad_rows.size());
-
-    MergeGradKernel<
-        T, 256><<<grid1, threads, 0,
-                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(grad_data, grad.rows().data(),
-                                   grad_merge_data, grad_merge->rows().data(),
-                                   grad_merge->rows().size(), grad_width);
-
+    math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func;
+    auto grad_merge = merge_func(context, grad);
+    auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
+    auto& merge_rows = grad_merge.rows();
     // 2. m += g_m * g_m
-    std::unique_ptr<framework::SelectedRows> grad_square{
-        new framework::SelectedRows()};
-    grad_square->set_rows(grad_merge->rows());
-    grad_square->set_height(grad_merge->height());
-    grad_square->mutable_value()->mutable_data<T>(grad_merge->value().dims(),
-                                                  context.GetPlace());
-    auto gs =
-        framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
-    auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
-    gs.device(*context.eigen_device()) = gm * gm;
+    math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func;
+    auto grad_square = sqare_func(context, grad_merge, grad_merge);
 
     math::SelectedRowsAddToTensor<platform::CUDADeviceContext, T> functor;
-    functor(context, *grad_square, moment);
+    functor(context, grad_square, moment);
 
     // 3. update parameter
     auto* lr = learning_rate.data<T>();
     auto* param_data = param->data<T>();
     auto* moment_data = moment->data<T>();
 
+    const int block_size = 256;
+    dim3 threads(block_size, 1);
     dim3 grid2(1, merge_rows.size());
     SparseAdagradFunctorKernel<
         T, 256><<<grid2, threads, 0,
                   reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(grad_merge_data, grad_merge->rows().data(),
+                      .stream()>>>(grad_merge_data, grad_merge.rows().data(),
                                    lr, param_data, moment_data, grad_width,
                                    epsilon);
   }
diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h
index c4e2c8bb88ec9c74bd782570c10fb217178c8e48..9cc34bdded780e61e8700eb4fa4a295c84fb48bc 100644
--- a/paddle/operators/adam_op.h
+++ b/paddle/operators/adam_op.h
@@ -16,11 +16,14 @@ limitations under the License. */
 #include <math.h>  // for sqrt in CPU and CUDA
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/detail/safe_ref.h"
+#include "paddle/operators/math/selected_rows_functor.h"
 #include "paddle/platform/for_range.h"
 
 namespace paddle {
 namespace operators {
 
+namespace scatter = paddle::operators::math::scatter;
+
 template <typename T>
 struct AdamFunctor {
   T beta1_;
@@ -79,6 +82,69 @@ struct AdamFunctor {
   }
 };
 
+template <typename T>
+struct SparseAdamFunctor {
+  T beta1_;
+  T beta2_;
+  T epsilon_;
+
+  const T* beta1_pow_;
+  const T* beta2_pow_;
+  const T* moment1_;
+  T* moment1_out_;
+  const T* moment2_;
+  T* moment2_out_;
+  const T* lr_;
+  const T* grad_;
+  const T* param_;
+  T* param_out_;
+
+  const int64_t* rows_;
+  int64_t row_numel_;
+
+  SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
+                    const T* beta2_pow, const T* mom1, T* mom1_out,
+                    const T* mom2, T* mom2_out, const T* lr, const T* grad,
+                    const T* param, T* param_out, const int64_t* rows,
+                    int64_t row_numel)
+      : beta1_(beta1),
+        beta2_(beta2),
+        epsilon_(epsilon),
+        beta1_pow_(beta1_pow),
+        beta2_pow_(beta2_pow),
+        moment1_(mom1),
+        moment1_out_(mom1_out),
+        moment2_(mom2),
+        moment2_out_(mom2_out),
+        lr_(lr),
+        grad_(grad),
+        param_(param),
+        param_out_(param_out),
+        rows_(rows),
+        row_numel_(row_numel) {}
+
+  inline HOSTDEVICE void operator()(size_t i) const {
+    T beta1_pow = *beta1_pow_;
+    T beta2_pow = *beta2_pow_;
+    for (int64_t j = 0; j < row_numel_; ++j) {
+      T g = grad_[i * row_numel_ + j];
+      T mom1 = moment1_[rows_[i] * row_numel_ + j];
+      T mom2 = moment2_[rows_[i] * row_numel_ + j];
+      T lr = *lr_;
+      T p = param_[rows_[i] * row_numel_ + j];
+
+      lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
+      mom1 = beta1_ * mom1 + (1 - beta1_) * g;
+      mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
+      p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
+
+      moment1_out_[rows_[i] * row_numel_ + j] = mom1;
+      moment2_out_[rows_[i] * row_numel_ + j] = mom2;
+      param_out_[rows_[i] * row_numel_ + j] = p;
+    }  // for col id
+  }
+};
+
 template <typename DeviceContext, typename T>
 class AdamOpKernel : public framework::OpKernel<T> {
  public:
@@ -90,7 +156,8 @@ class AdamOpKernel : public framework::OpKernel<T> {
     T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
     T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
     auto& param = Ref(ctx.Input<LoDTensor>("Param"), "Must set Param");
-    auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
+    // Grad may be a LoDTensor or SelectedRows; fetch the variable and dispatch on its type below.
+    auto* grad_var = ctx.InputVar("Grad");
     auto& mom1 = Ref(ctx.Input<LoDTensor>("Moment1"), "Must set Moment1");
     auto& mom2 = Ref(ctx.Input<LoDTensor>("Moment2"), "Must set Moment2");
     auto& lr =
@@ -108,18 +175,48 @@ class AdamOpKernel : public framework::OpKernel<T> {
     auto& mom2_out =
         Ref(ctx.Output<LoDTensor>("Moment2Out"), "Must set Moment1Out");
 
-    AdamFunctor<T> functor(beta1, beta2, epsilon, beta1_pow.template data<T>(),
-                           beta2_pow.template data<T>(),
-                           mom1.template data<T>(),
-                           mom1_out.template mutable_data<T>(ctx.GetPlace()),
-                           mom2.template data<T>(),
-                           mom2_out.template mutable_data<T>(ctx.GetPlace()),
-                           lr.template data<T>(), grad.template data<T>(),
-                           param.template data<T>(),
-                           param_out.template mutable_data<T>(ctx.GetPlace()));
-    platform::ForRange<DeviceContext> for_range(
-        static_cast<const DeviceContext&>(ctx.device_context()), param.numel());
-    for_range(functor);
+    if (grad_var->IsType<framework::LoDTensor>()) {
+      auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
+      AdamFunctor<T> functor(
+          beta1, beta2, epsilon, beta1_pow.template data<T>(),
+          beta2_pow.template data<T>(), mom1.template data<T>(),
+          mom1_out.template mutable_data<T>(ctx.GetPlace()),
+          mom2.template data<T>(),
+          mom2_out.template mutable_data<T>(ctx.GetPlace()),
+          lr.template data<T>(), grad.template data<T>(),
+          param.template data<T>(),
+          param_out.template mutable_data<T>(ctx.GetPlace()));
+      platform::ForRange<DeviceContext> for_range(
+          static_cast<const DeviceContext&>(ctx.device_context()),
+          param.numel());
+      for_range(functor);
+    } else if (grad_var->IsType<framework::SelectedRows>()) {
+      auto& grad =
+          Ref(ctx.Input<framework::SelectedRows>("Grad"), "Must set Grad");
+      // merge duplicated rows if any.
+      scatter::MergeAdd<DeviceContext, T> merge_func;
+      auto grad_merge =
+          merge_func(ctx.template device_context<DeviceContext>(), grad);
+      auto& grad_tensor = grad_merge.value();
+      const T* grad_data = grad_tensor.template data<T>();
+      auto* rows = grad_merge.rows().data();
+      auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
+
+      SparseAdamFunctor<T> functor(
+          beta1, beta2, epsilon, beta1_pow.template data<T>(),
+          beta2_pow.template data<T>(), mom1.template data<T>(),
+          mom1_out.template mutable_data<T>(ctx.GetPlace()),
+          mom2.template data<T>(),
+          mom2_out.template mutable_data<T>(ctx.GetPlace()),
+          lr.template data<T>(), grad_data, param.template data<T>(),
+          param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel);
+      platform::ForRange<DeviceContext> for_range(
+          static_cast<const DeviceContext&>(ctx.device_context()),
+          grad_merge.rows().size());
+      for_range(functor);
+    } else {
+      PADDLE_THROW("Variable type not supported by adam_op");
+    }
   }
 };
 
diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h
index 060ffac8273724fad48753d9a7e2b0affbb6f25e..e0eef5d9f93d70930ee82d663de9610cc0176e33 100644
--- a/paddle/operators/array_operator.h
+++ b/paddle/operators/array_operator.h
@@ -35,8 +35,8 @@ class ArrayOp : public framework::OperatorBase {
     PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
 
     // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
 
     size_t offset;
     if (platform::is_gpu_place(i_tensor.place())) {
diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc
index 0aa04c268bd65dc246341a2f335c362496050bdb..49366fee8df5a44a97b7b4e87cbf0b7c813a414a 100644
--- a/paddle/operators/array_to_lod_tensor_op.cc
+++ b/paddle/operators/array_to_lod_tensor_op.cc
@@ -106,8 +106,9 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
         }
         auto slice = out->Slice(out_offset, out_offset + len);
 
-        platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-        auto &dev_ctx = *pool.Borrow(place);
+        platform::DeviceContextPool &pool =
+            platform::DeviceContextPool::Instance();
+        auto &dev_ctx = *pool.Get(place);
 
         framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place,
                             dev_ctx, &slice);
diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc
index 0560040509026e84eb543331996a6846751b8506..7d77be3be1034bb38f6c92c181aa525214073eec 100644
--- a/paddle/operators/assign_op.cc
+++ b/paddle/operators/assign_op.cc
@@ -82,8 +82,8 @@ class AssignOp : public framework::OperatorBase {
         out != nullptr,
         "The Output(Out) should not be null if the Input(X) is set.");
 
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
 
     framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
   }
diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc
index 49cb0fa4d9d8ada649105a9b14ad960f6cda1a7e..98db28ddee7c6cdb37fe7732649d4fc38de7b873 100644
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -50,10 +50,6 @@ class BatchNormOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), "");
     PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), "");
 
-    const float epsilon = ctx->Attrs().Get<float>("epsilon");
-    PADDLE_ENFORCE_GE(epsilon, 0.0, "epsilon should be larger than 0");
-    PADDLE_ENFORCE_LE(epsilon, 0.001, "epsilon should not be too large");
-
     // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
     PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0],
                       "Mean and MeanOut should share the same memory");
@@ -91,7 +87,12 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddAttr<bool>("is_test", "").SetDefault(false);
     AddAttr<float>("momentum", "").SetDefault(0.9);
-    AddAttr<float>("epsilon", "").SetDefault(1e-5);
+    AddAttr<float>("epsilon", "")
+        .SetDefault(1e-5)
+        .AddCustomChecker([](const float &epsilon) {
+          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
+                         "'epsilon' should be between 0.0 and 0.001.");
+        });
     AddAttr<std::string>("data_layout", "").SetDefault("NCHW");
     AddInput("X", "The input tensor");
     AddInput("Scale",
diff --git a/paddle/operators/beam_search_decode_op.cc b/paddle/operators/beam_search_decode_op.cc
index 52c28e7f532f9751589176c8d37362620167cf63..72e05607b0b612807d552b4c45b58f9d9ce9c2af 100644
--- a/paddle/operators/beam_search_decode_op.cc
+++ b/paddle/operators/beam_search_decode_op.cc
@@ -57,8 +57,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope& scope,
            const platform::Place& dev_place) const override {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
-    auto& dev_ctx = *pool.Borrow(dev_place);
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& dev_ctx = *pool.Get(dev_place);
 
     framework::ExecutionContext ctx(*this, scope, dev_ctx);
 
diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc
index 455fbd8ca3f5083fac51776524daca6f6a029667..e333002bfd1ab40c62882f09cd207a12a0939648 100644
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -195,8 +195,8 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
 
 void CondOp::Run(const Scope& scope, const platform::Place& place) const {
   // get device context from pool
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
-  auto& dev_ctx = *pool.Borrow(place);
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& dev_ctx = *pool.Get(place);
 
   PrepareDataForSubnet(scope, dev_ctx);
   std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc
index 08ff0db08603e28b0fd5db1b0deccfbfdd87c6e5..0aa7dd48cafc3e2387ac902882d84ce9029cfcd0 100644
--- a/paddle/operators/conv_cudnn_op.cu.cc
+++ b/paddle/operators/conv_cudnn_op.cu.cc
@@ -315,6 +315,10 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
+REGISTER_OP_KERNEL(conv2d, CUDNN, paddle::platform::CUDAPlace,
+                   paddle::operators::CudnnConvOpKernel<float>,
+                   paddle::operators::CudnnConvOpKernel<double>);
+
 REGISTER_OP_CUDA_KERNEL(conv2d_cudnn,
                         paddle::operators::CudnnConvOpKernel<float>,
                         paddle::operators::CudnnConvOpKernel<double>);
diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc
index ab52a41b539236f1691ce8bc02d31e336ee4ccbb..e65a5dce52c3c51d3d6bee1684c1e97230203d38 100644
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -31,8 +31,6 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
   std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
   int groups = ctx->Attrs().Get<int>("groups");
   std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
-  int input_channels = in_dims[1];
-  int output_channels = filter_dims[0];
 
   PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
                  "Conv intput should be 4-D or 5-D tensor.");
@@ -45,9 +43,13 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
   PADDLE_ENFORCE_EQ(
       paddings.size(), strides.size(),
       "Conv paddings dimension and Conv strides dimension should be the same.");
+
+  int input_channels = in_dims[1];
   PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups,
                     "The number of input channels should be equal to filter "
                     "channels * groups.");
+
+  int output_channels = filter_dims[0];
   PADDLE_ENFORCE_EQ(
       output_channels % groups, 0,
       "The number of output channels should be divided by groups.");
diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h
index e2b6282c0913e8ad16f8e3f6c3054f9567822d15..eadcca55f9bfc3e59f329df8ff419ad4c5a29007 100644
--- a/paddle/operators/cos_sim_op.h
+++ b/paddle/operators/cos_sim_op.h
@@ -13,19 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/cos_sim_functor.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/platform/for_range.h"
 
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 template <typename DeviceContext, typename T>
 class CosSimKernel : public framework::OpKernel<T> {
@@ -41,28 +37,25 @@ class CosSimKernel : public framework::OpKernel<T> {
     out_x_norm->mutable_data<T>(context.GetPlace());
     out_y_norm->mutable_data<T>(context.GetPlace());
 
-    // convert Tensor to Eigen Tensor
     int rows_x = in_x->dims()[0];
     int rows_y = in_y->dims()[0];
-    auto x = EigenMatrix<T>::Reshape(*in_x, 1);
-    auto y = EigenMatrix<T>::Reshape(*in_y, 1);
-    auto z = EigenVector<T>::Flatten(*out_z);
-    auto x_norm = EigenVector<T>::Flatten(*out_x_norm);
-    auto y_norm = EigenVector<T>::Flatten(*out_y_norm);
 
-    // compute
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    auto row_along = Eigen::array<int, 1>({{1}});
-    x_norm.device(place) = x.square().sum(row_along).sqrt();
-    y_norm.device(place) = y.square().sum(row_along).sqrt();
+    int cols = framework::product(in_x->dims()) / rows_x;
+
     if (rows_x == rows_y) {
-      auto xy = (x * y).sum(Eigen::array<int, 1>({{1}}));
-      z.device(place) = xy / x_norm / y_norm;
+      math::CosSimFunctor<T, true> functor(
+          in_x->data<T>(), in_y->data<T>(), out_x_norm->data<T>(),
+          out_y_norm->data<T>(), out_z->data<T>(), cols);
+      platform::ForRange<DeviceContext> for_range(
+          static_cast<const DeviceContext&>(context.device_context()), rows_x);
+      for_range(functor);
     } else {
-      Eigen::DSizes<int, 2> bcast(rows_x, 1);
-      auto xy = (x * y.broadcast(bcast)).sum(row_along);
-      z.device(place) = xy / x_norm / y_norm.broadcast(bcast);
+      math::CosSimFunctor<T, false> functor(
+          in_x->data<T>(), in_y->data<T>(), out_x_norm->data<T>(),
+          out_y_norm->data<T>(), out_z->data<T>(), cols);
+      platform::ForRange<DeviceContext> for_range(
+          static_cast<const DeviceContext&>(context.device_context()), rows_x);
+      for_range(functor);
     }
   }
 };
@@ -81,62 +74,54 @@ class CosSimGradKernel : public framework::OpKernel<T> {
     auto* out_grad_y = context.Output<Tensor>(framework::GradVarName("Y"));
     auto* in_grad_z = context.Input<Tensor>(framework::GradVarName("Out"));
 
-    // convert Tensor to Eigen Tensor
-    auto x = EigenMatrix<T>::Reshape(*in_x, 1);
-    auto y = EigenMatrix<T>::Reshape(*in_y, 1);
-    auto z = EigenMatrix<T>::Reshape(*in_z, 1);
-    auto x_norm = EigenMatrix<T>::Reshape(*in_x_norm, 1);
-    auto y_norm = EigenMatrix<T>::Reshape(*in_y_norm, 1);
-    auto dz = EigenMatrix<T>::Reshape(*in_grad_z, 1);
-
     // compute gradident
     int rows_x = in_x->dims()[0];
     int rows_y = in_y->dims()[0];
     int cols = framework::product(in_x->dims()) / rows_x;
-    Eigen::DSizes<int, 2> bcast_cols(1, cols);
-    auto z_bcast = z.broadcast(bcast_cols);
-    auto dz_bcast = dz.broadcast(bcast_cols);
-    auto x_snorm_bcast = x_norm.square().eval().broadcast(bcast_cols);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
+
     if (rows_x == rows_y) {
-      auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_cols);
-      auto norm_prod_bcast = (x_norm * y_norm).eval().broadcast(bcast_cols);
-      // compute dx
       if (out_grad_x) {
-        out_grad_x->mutable_data<T>(context.GetPlace());
-        auto dx = EigenMatrix<T>::Reshape(*out_grad_x, 1);
-        auto grad = y / norm_prod_bcast - z_bcast * x / x_snorm_bcast;
-        dx.device(place) = dz_bcast * grad;
+        math::CosSimGradFunctor<T> functor(
+            in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(),
+            in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
+            out_grad_x->mutable_data<T>(context.GetPlace()), cols);
+        platform::ForRange<DeviceContext> for_range(
+            static_cast<const DeviceContext&>(context.device_context()),
+            rows_x);
+        for_range(functor);
       }
-      // compute dy
       if (out_grad_y) {
-        out_grad_y->mutable_data<T>(context.GetPlace());
-        auto dy = EigenMatrix<T>::Reshape(*out_grad_y, 1);
-        auto grad = x / norm_prod_bcast - z_bcast * y / y_snorm_bcast;
-        dy.device(place) = dz_bcast * grad;
+        math::CosSimGradFunctor<T> functor(
+            in_y_norm->data<T>(), in_x_norm->data<T>(), in_y->data<T>(),
+            in_x->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
+            out_grad_y->mutable_data<T>(context.GetPlace()), cols);
+        platform::ForRange<DeviceContext> for_range(
+            static_cast<const DeviceContext&>(context.device_context()),
+            rows_x);
+        for_range(functor);
       }
     } else {
-      Eigen::DSizes<int, 2> bcast_rows(rows_x, 1);
-      Eigen::DSizes<int, 2> bcast_rows_cols(rows_x, cols);
-      auto y_bcast = y.broadcast(bcast_rows);
-      auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_rows_cols);
-      auto norm_prod_bcast = (x_norm * y_norm.eval().broadcast(bcast_rows))
-                                 .eval()
-                                 .broadcast(bcast_cols);
-      // compute dx
       if (out_grad_x) {
-        out_grad_x->mutable_data<T>(context.GetPlace());
-        auto dx = EigenMatrix<T>::Reshape(*out_grad_x, 1);
-        auto grad = y_bcast / norm_prod_bcast - z_bcast * x / x_snorm_bcast;
-        dx.device(place) = dz_bcast * grad;
+        math::CosSimDxFunctor<T> functor(
+            in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(),
+            in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
+            out_grad_x->mutable_data<T>(context.GetPlace()), cols);
+        platform::ForRange<DeviceContext> for_range(
+            static_cast<const DeviceContext&>(context.device_context()),
+            rows_x);
+        for_range(functor);
       }
-      // compute dy
       if (out_grad_y) {
         out_grad_y->mutable_data<T>(context.GetPlace());
-        auto dy = EigenVector<T>::Flatten(*out_grad_y);
-        auto grad = x / norm_prod_bcast - z_bcast * y_bcast / y_snorm_bcast;
-        dy.device(place) = (dz_bcast * grad).sum(Eigen::array<int, 1>({{0}}));
+        math::SetConstant<DeviceContext, T> set_zero;
+        auto& dev_ctx = context.template device_context<DeviceContext>();
+        set_zero(dev_ctx, out_grad_y, static_cast<T>(0));
+
+        math::CosSimDyFunctor<DeviceContext, T> functor;
+        functor(dev_ctx, in_x_norm->data<T>(), in_y_norm->data<T>(),
+                in_x->data<T>(), in_y->data<T>(), in_z->data<T>(),
+                in_grad_z->data<T>(), static_cast<size_t>(rows_x),
+                static_cast<size_t>(cols), out_grad_y->data<T>());
       }
     }
   }
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
index a9c5c7046f8642072f2fdef14e1800ce7b05915b..fe39cb481aa63fa401603d97778b337282511ab5 100644
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -114,15 +114,15 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
   CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
-             "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, "
-             "where N is the batch size and D is the number of classes. "
+             "(Tensor, default Tensor<float>), a 2-D tensor with shape [N x D],"
+             " where N is the batch size and D is the number of classes. "
              "This input is a probability computed by the previous operator, "
              "which is almost always the result of a softmax operator.");
     AddInput("Label",
              "(Tensor), the ground truth which is a 2-D tensor. When "
              "soft_label is set to false, Label is a Tensor<int64> with shape "
              "[N x 1]. When soft_label is set to true, Label is a "
-             "Tensor<float/double> with shape [N x K].");
+             "Tensor<float/double> with shape [N x D].");
     AddOutput("Y",
               "(Tensor, default Tensor<float>), a 2-D tensor with shape "
               "[N x 1]. The cross entropy loss.");
diff --git a/paddle/operators/detection_output_op.cc b/paddle/operators/detection_output_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea44cd32678d7e8a5836c1886cf9c1b4961970aa
--- /dev/null
+++ b/paddle/operators/detection_output_op.cc
@@ -0,0 +1,89 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/detection_output_op.h"
+namespace paddle {
+namespace operators {
+
+// Declares the inputs, output and attributes of the detection_output op.
+class DetectionOutputOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  DetectionOutputOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // Adjacent string literals concatenate verbatim; keep separating spaces.
+    AddInput("Loc",
+             "(Tensor) The input tensor of detection_output operator. "
+             "The input predict locations. "
+             "The format of input tensor is kNCHW. Where K is priorbox point "
+             "numbers, N is How many boxes are there on each point, "
+             "C is 4, H and W both are 1.");
+    AddInput("Conf",
+             "(Tensor) The input tensor of detection_output operator. "
+             "The input priorbox confidence. "
+             "The format of input tensor is kNCHW. Where K is priorbox point "
+             "numbers, N is How many boxes are there on each point, "
+             "C is the number of classes, H and W both are 1.");
+    AddInput("PriorBox",
+             "(Tensor) The input tensor of detection_output operator. "
+             "The format of input tensor is the position and variance "
+             "of the boxes");
+    AddOutput("Out",
+              "(Tensor) The output tensor of detection_output operator.");
+    AddAttr<int>("background_label_id", "(int), The background class index.");
+    AddAttr<int>("num_classes", "(int), The number of the classification.");
+    AddAttr<float>("nms_threshold",
+                   "(float), The Non-maximum suppression threshold.");
+    AddAttr<float>("confidence_threshold",
+                   "(float), The classification confidence threshold.");
+    AddAttr<int>("top_k", "(int), The bbox number kept of the layer’s output.");
+    AddAttr<int>("nms_top_k",
+                 "(int), The bbox number kept of the NMS’s output.");
+    AddComment(R"DOC(
+          detection output for SSD(single shot multibox detector)
+          Apply the NMS to the output of network and compute the predict
+          bounding box location. The output’s shape of this layer could
+          be zero if there is no valid bounding box.
+        )DOC");
+  }
+};
+
+// Operator definition for detection_output; validates the op's variables.
+class DetectionOutputOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Checks that Loc, Conf, PriorBox and Out exist and sets Out to [1, 7].
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Loc"),
+                   "Input(Loc) of DetectionOutputOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Conf"),
+                   "Input(Conf) of DetectionOutputOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
+                   "Input(PriorBox) of DetectionOutputOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of DetectionOutputOp should not be null.");
+    // Static shape only; the kernel resizes Out to [num_kept, 7].
+    std::vector<int64_t> output_shape({1, 7});
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // alias used by the registrations below
+REGISTER_OP_WITHOUT_GRADIENT(detection_output, ops::DetectionOutputOp,
+                             ops::DetectionOutputOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    detection_output,
+    ops::DetectionOutputKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::DetectionOutputKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/detection_output_op.cu.cc b/paddle/operators/detection_output_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4a6560e0492c559afd06e5152c34fab545b7ce61
--- /dev/null
+++ b/paddle/operators/detection_output_op.cu.cc
@@ -0,0 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/detection_output_op.h"
+
+namespace ops = paddle::operators;  // alias used by the registration below
+REGISTER_OP_CUDA_KERNEL(
+    detection_output,
+    ops::DetectionOutputKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::DetectionOutputKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/detection_output_op.h b/paddle/operators/detection_output_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f8abd5b6406f05747b87fcfd464baeb705a7f7f2
--- /dev/null
+++ b/paddle/operators/detection_output_op.h
@@ -0,0 +1,167 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/operators/math/detection_util.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/softmax.h"
+#include "paddle/operators/strided_memcpy.h"
+namespace paddle {
+namespace operators {
+// Permutes each leading-axis slice of `src` with axis order {0, 1, 3, 4, 2}
+// and strided-copies it into `dst`, advancing the write offset per slice.
+template <typename DeviceContext, typename T>
+inline void transpose_fun(const framework::ExecutionContext& context,
+                          const framework::Tensor& src,
+                          framework::Tensor* dst) {
+  std::vector<int> perm({0, 1, 3, 4, 2});
+  math::Transpose<DeviceContext, T, 5> permute;
+  int write_pos = 0;
+  int slice_count = src.dims()[0];
+  for (int k = 0; k < slice_count; ++k) {
+    framework::Tensor slice = src.Slice(k, k + 1);
+    const framework::DDim& d = slice.dims();
+    std::vector<int64_t> permuted_dims({d[0], d[1], d[3], d[4], d[2]});
+    framework::Tensor permuted;
+    permuted.mutable_data<T>(framework::make_ddim(permuted_dims),
+                             context.GetPlace());
+    permute(context.template device_context<DeviceContext>(), slice, &permuted,
+            perm);
+    auto out_stride = framework::stride(dst->dims());
+    auto in_stride = framework::stride(permuted.dims());
+    StridedMemcpy<T>(context.device_context(), permuted.data<T>(), in_stride,
+                     permuted.dims(), out_stride, dst->data<T>() + write_pos);
+    write_pos += permuted.dims()[4] * in_stride[4];
+  }
+}
+// Kernel for detection_output: transposes Loc/Conf, applies softmax to Conf,
+// decodes boxes against PriorBox on the host and writes kept rows to Out.
+template <typename DeviceContext, typename T>
+class DetectionOutputKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_loc = context.Input<framework::Tensor>("Loc");
+    const framework::Tensor* in_conf = context.Input<framework::Tensor>("Conf");
+    auto* in_priorbox = context.Input<framework::Tensor>("PriorBox");
+    auto* out = context.Output<framework::Tensor>("Out");
+    int num_classes = context.template Attr<int>("num_classes");
+    int top_k = context.template Attr<int>("top_k");
+    int nms_top_k = context.template Attr<int>("nms_top_k");
+    int background_label_id = context.template Attr<int>("background_label_id");
+    float nms_threshold = context.template Attr<float>("nms_threshold");
+    float confidence_threshold =
+        context.template Attr<float>("confidence_threshold");
+    size_t batch_size = in_conf->dims()[1];
+    int conf_sum_size = in_conf->numel();
+    // 2-D view [numel / num_classes, num_classes] used for the softmax below
+    std::vector<int64_t> conf_shape_softmax_vec(
+        {conf_sum_size / num_classes, num_classes});
+    framework::DDim conf_shape_softmax(
+        framework::make_ddim(conf_shape_softmax_vec));
+    // transposed layouts: {1, d1, d3, d4, d2 * d0} of the input dims
+    std::vector<int64_t> loc_shape_vec({1, in_loc->dims()[1], in_loc->dims()[3],
+                                        in_loc->dims()[4],
+                                        in_loc->dims()[2] * in_loc->dims()[0]});
+    std::vector<int64_t> conf_shape_vec(
+        {1, in_conf->dims()[1], in_conf->dims()[3], in_conf->dims()[4],
+         in_conf->dims()[2] * in_conf->dims()[0]});
+    framework::DDim loc_shape(framework::make_ddim(loc_shape_vec));
+    framework::DDim conf_shape(framework::make_ddim(conf_shape_vec));
+    framework::Tensor loc_tensor;
+    framework::Tensor conf_tensor;
+    loc_tensor.mutable_data<T>(loc_shape, context.GetPlace());
+    conf_tensor.mutable_data<T>(conf_shape, context.GetPlace());
+    // host-side staging tensors, filled only when the op runs on a GPU place
+    framework::Tensor loc_cpu;
+    framework::Tensor conf_cpu;
+    framework::Tensor priorbox_cpu;
+    const T* priorbox_data = in_priorbox->data<T>();
+    transpose_fun<DeviceContext, T>(context, *in_loc, &loc_tensor);
+    transpose_fun<DeviceContext, T>(context, *in_conf, &conf_tensor);
+    conf_tensor.Resize(conf_shape_softmax);
+    math::SoftmaxFunctor<DeviceContext, T>()(
+        context.template device_context<DeviceContext>(), &conf_tensor,
+        &conf_tensor);
+    T* loc_data = loc_tensor.data<T>();
+    T* conf_data = conf_tensor.data<T>();
+    if (platform::is_gpu_place(context.GetPlace())) {
+      loc_cpu.mutable_data<T>(loc_tensor.dims(), platform::CPUPlace());
+      framework::CopyFrom(loc_tensor, platform::CPUPlace(),
+                          context.device_context(), &loc_cpu);
+      loc_data = loc_cpu.data<T>();
+      conf_cpu.mutable_data<T>(conf_tensor.dims(), platform::CPUPlace());
+      framework::CopyFrom(conf_tensor, platform::CPUPlace(),
+                          context.device_context(), &conf_cpu);
+      conf_data = conf_cpu.data<T>();
+      priorbox_cpu.mutable_data<T>(in_priorbox->dims(), platform::CPUPlace());
+      framework::CopyFrom(*in_priorbox, platform::CPUPlace(),
+                          context.device_context(), &priorbox_cpu);
+      priorbox_data = priorbox_cpu.data<T>();
+    }
+    // decode one box per (batch item, prior); each prior spans 8 values
+    size_t num_priors = in_priorbox->numel() / 8;
+    std::vector<std::vector<operators::math::BBox<T>>> all_decoded_bboxes;
+    for (size_t n = 0; n < batch_size; ++n) {
+      std::vector<operators::math::BBox<T>> decoded_bboxes;
+      for (size_t i = 0; i < num_priors; ++i) {
+        size_t prior_offset = i * 8;
+        size_t loc_pred_offset = n * num_priors * 4 + i * 4;
+        std::vector<math::BBox<T>> prior_bbox_vec;
+        math::GetBBoxFromPriorData<T>(priorbox_data + prior_offset, 1,
+                                      prior_bbox_vec);
+        std::vector<std::vector<T>> prior_bbox_var;
+        math::GetBBoxVarFromPriorData<T>(priorbox_data + prior_offset, 1,
+                                         prior_bbox_var);
+        const T* loc_pred = loc_data + loc_pred_offset;
+        std::vector<T> loc_pred_data(loc_pred, loc_pred + 4);
+        math::BBox<T> bbox = math::DecodeBBoxWithVar<T>(
+            prior_bbox_vec[0], prior_bbox_var[0], loc_pred_data);
+        decoded_bboxes.push_back(bbox);
+      }
+      all_decoded_bboxes.push_back(decoded_bboxes);
+    }
+    std::vector<std::map<size_t, std::vector<size_t>>> all_indices;
+    int num_kept = math::GetDetectionIndices<T>(
+        conf_data, num_priors, num_classes, background_label_id, batch_size,
+        confidence_threshold, nms_top_k, nms_threshold, top_k,
+        all_decoded_bboxes, &all_indices);
+    // no detection was kept: give Out an empty shape and stop
+    if (num_kept <= 0) {
+      std::vector<int64_t> empty_shape_vec({0, 0});
+      out->Resize(framework::make_ddim(empty_shape_vec));
+      return;
+    }
+    std::vector<int64_t> out_shape_vec({num_kept, 7});
+    framework::DDim out_shape(framework::make_ddim(out_shape_vec));
+    out->mutable_data<T>(out_shape, context.GetPlace());
+    framework::Tensor out_cpu;
+    T* out_data = out->data<T>();
+    if (platform::is_gpu_place(context.GetPlace())) {
+      out_cpu.mutable_data<T>(out->dims(), platform::CPUPlace());
+      out_data = out_cpu.data<T>();
+    }
+    math::GetDetectionOutput<T>(conf_data, num_kept, num_priors, num_classes,
+                                batch_size, all_indices, all_decoded_bboxes,
+                                out_data);
+    if (platform::is_gpu_place(context.GetPlace())) {
+      // copy back to the place this op runs on, not a default CUDAPlace()
+      framework::CopyFrom(out_cpu, context.GetPlace(),
+                          context.device_context(), out);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc
index d3130c1a7e7cd913dd2d79665df8fef82de83cd4..35cb18797ff66cb87a6658e73ce02b0bfae29baa 100644
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@@ -25,8 +25,6 @@ class DropoutOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
-    PADDLE_ENFORCE_GE(ctx->Attrs().Get<float>("dropout_prob"), 0);
-    PADDLE_ENFORCE_LE(ctx->Attrs().Get<float>("dropout_prob"), 1);
 
     auto x_dims = ctx->GetInputDim("X");
     ctx->SetOutputDim("Out", x_dims);
@@ -47,7 +45,11 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate();
 
     AddAttr<float>("dropout_prob", "Probability of setting units to zero.")
-        .SetDefault(.5f);
+        .SetDefault(.5f)
+        .AddCustomChecker([](const float& drop_p) {
+          PADDLE_ENFORCE(drop_p >= 0.0f && drop_p <= 1.0f,
+                         "'dropout_prob' must be between 0.0 and 1.0.");
+        });
     AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
     AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
 
@@ -78,8 +80,6 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) must not be null.");
 
-    PADDLE_ENFORCE_GE(ctx->Attrs().Get<float>("dropout_prob"), 0);
-    PADDLE_ENFORCE_LE(ctx->Attrs().Get<float>("dropout_prob"), 1);
     auto x_dims = ctx->GetInputDim("X");
     auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
     PADDLE_ENFORCE_EQ(x_dims, out_dims,
diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu
index c0b4aaa9196eb23871e2d39cea4f33f53562ffe4..c56930336e865079f1b96df0f35b0a051fe63a27 100644
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
@@ -30,16 +30,15 @@ struct MaskGenerator {
   __host__ __device__ MaskGenerator(AttrType dropout_prob, int seed)
       : dropout_prob(dropout_prob), seed(seed) {}
 
-  __host__ __device__ T operator()(const unsigned int n) const {
+  inline __host__ __device__ T operator()(const unsigned int n) const {
     thrust::minstd_rand rng;
     rng.seed(seed);
     thrust::uniform_real_distribution<AttrType> dist(0, 1);
     rng.discard(n);
     if (dist(rng) < dropout_prob) {
       return static_cast<T>(0);
-    } else {
-      return static_cast<T>(1);
     }
+    return static_cast<T>(1);
   }
 };
 
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
index cecbb7226aedabc3bee63c518d0ab9da496fca11..48da52c3b68879a1da8550a5448090f9f1e715d3 100644
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -49,8 +49,8 @@ class FeedOp : public framework::OperatorBase {
     auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
 
     // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
 
     framework::CopyFrom(feed_item, place, dev_ctx, out_item);
     out_item->set_lod(feed_item.lod());
diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc
index fa20a06540efef05d6a15669b1f8dfb8bd5927bc..387d1e0a747f71d85826b52d140c2838112227f6 100644
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -52,8 +52,8 @@ class FetchOp : public framework::OperatorBase {
 
     // FIXME(yuyang18): Should we assume the fetch operator always generate
     // CPU outputs?
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
 
     CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
     dev_ctx.Wait();
diff --git a/paddle/operators/fill_constant_op.cc b/paddle/operators/fill_constant_op.cc
index fe0706c4a9da864025737584b72c02cca83c956b..dcd43a30c86b62d79f52ac640f14b295a062146c 100644
--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/operators/fill_constant_op.cc
@@ -49,8 +49,8 @@ class FillConstantOp : public framework::OperatorBase {
       out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
     }
 
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(dev_place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
     math::set_constant(dev_ctx, &out, value);
   }
 };
diff --git a/paddle/operators/fill_op.cc b/paddle/operators/fill_op.cc
index 57b4ec69384a55cef6d6d8d0c0145caae837b3f6..084ba1db62de0a6bf6829f8e9f4c274fb777e879 100644
--- a/paddle/operators/fill_op.cc
+++ b/paddle/operators/fill_op.cc
@@ -69,8 +69,9 @@ class FillOp : public framework::OperatorBase {
 
     if (!force_cpu && platform::is_gpu_place(place)) {
       // Copy tensor to out
-      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-      auto &dev_ctx = *pool.Borrow(place);
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);
       framework::CopyFrom(tensor, place, dev_ctx, &out);
     }
   }
diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h
index c6228864d7ec042ff99e4521d1d707ba091e8ed5..b1957fb9ce6add8628cb206abf2c569d3f615c85 100644
--- a/paddle/operators/gru_op.h
+++ b/paddle/operators/gru_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/operators/math/detail/activation_functions.h"
 #include "paddle/operators/math/gru_compute.h"
 #include "paddle/operators/math/math_function.h"
 #include "paddle/operators/math/sequence2batch.h"
@@ -70,7 +71,7 @@ class GRUKernel : public framework::OpKernel<T> {
     }
 
     int frame_size = hidden_dims[1];
-    math::hl_gru_value<T> gru_value;
+    math::GRUMetaValue<T> gru_value;
     gru_value.gate_weight = const_cast<T*>(weight_data);
     gru_value.state_weight =
         const_cast<T*>(weight_data + 2 * frame_size * frame_size);
@@ -89,6 +90,10 @@ class GRUKernel : public framework::OpKernel<T> {
     }
     auto batch_starts = batch_gate->lod()[0];
     size_t num_batch = batch_starts.size() - 1;
+    auto active_node = math::detail::GetActivationType(
+        context.Attr<std::string>("activation"));
+    auto active_gate = math::detail::GetActivationType(
+        context.Attr<std::string>("gate_activation"));
     for (size_t n = 0; n < num_batch; n++) {
       int bstart = static_cast<int>(batch_starts[n]);
       int bend = static_cast<int>(batch_starts[n + 1]);
@@ -101,9 +106,8 @@ class GRUKernel : public framework::OpKernel<T> {
       gru_value.gate_value = gate_t.data<T>();
       gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
       math::GRUUnitFunctor<DeviceContext, T>::compute(
-          dev_ctx, gru_value, frame_size, cur_batch_size,
-          math::ActiveType(context.Attr<std::string>("activation")),
-          math::ActiveType(context.Attr<std::string>("gate_activation")));
+          dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
+          active_gate);
       gru_value.prev_out_value = gru_value.output_value;
     }
 
@@ -170,12 +174,12 @@ class GRUGradKernel : public framework::OpKernel<T> {
     batch_hidden_grad.set_lod(batch_hidden->lod());
     to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse);
 
-    math::hl_gru_value<T> gru_value;
+    math::GRUMetaValue<T> gru_value;
     gru_value.gate_weight = const_cast<T*>(weight_data);
     gru_value.state_weight =
         const_cast<T*>(weight_data + 2 * frame_size * frame_size);
 
-    math::hl_gru_grad<T> gru_grad;
+    math::GRUMetaGrad<T> gru_grad;
     if (weight_grad) {
       gru_grad.gate_weight_grad =
           weight_grad->mutable_data<T>(context.GetPlace());
@@ -189,6 +193,10 @@ class GRUGradKernel : public framework::OpKernel<T> {
 
     auto batch_starts = batch_hidden_grad.lod()[0];
     size_t num_batch = batch_starts.size() - 1;
+    auto active_node = math::detail::GetActivationType(
+        context.Attr<std::string>("activation"));
+    auto active_gate = math::detail::GetActivationType(
+        context.Attr<std::string>("gate_activation"));
     for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
       int bstart = static_cast<int>(batch_starts[n]);
       int bend = static_cast<int>(batch_starts[n + 1]);
@@ -219,9 +227,8 @@ class GRUGradKernel : public framework::OpKernel<T> {
       }
 
       math::GRUUnitGradFunctor<DeviceContext, T>::compute(
-          dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size,
-          math::ActiveType(context.Attr<std::string>("activation")),
-          math::ActiveType(context.Attr<std::string>("gate_activation")));
+          dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, active_node,
+          active_gate);
     }
     if (input_grad) {
       input_grad->mutable_data<T>(context.GetPlace());
diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc
index 5425375c1ffa16c223ec1b05506ffb4b4394d6e1..08b972a233aab8596a5ce7f74ea903df3b8ef0f2 100644
--- a/paddle/operators/load_op.cc
+++ b/paddle/operators/load_op.cc
@@ -38,10 +38,10 @@ class LoadOp : public framework::OperatorBase {
                    out_var_name);
 
     auto *tensor = out_var->GetMutable<framework::LoDTensor>();
-    framework::DeserializeFromStream(fin, tensor);
+    DeserializeFromStream(fin, tensor);
 
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
 
     if (platform::is_gpu_place(place)) {
       // copy CPU to GPU
diff --git a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/operators/lod_tensor_to_array_op.cc
index ed99915bb7fc312cac955b50ffbc3237aee503b0..8d164b4abc54722a95a176dfe8ed341f8c5125d1 100644
--- a/paddle/operators/lod_tensor_to_array_op.cc
+++ b/paddle/operators/lod_tensor_to_array_op.cc
@@ -88,8 +88,9 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
         auto slice = out[i].Slice(static_cast<int>(offset),
                                   static_cast<int>(offset + len));
 
-        platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-        auto &dev_ctx = *pool.Borrow(place);
+        platform::DeviceContextPool &pool =
+            platform::DeviceContextPool::Instance();
+        auto &dev_ctx = *pool.Get(place);
 
         framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin),
                                     static_cast<int>(each_range.end)),
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index bf47879f772a3013bd7ce78c6f8a6aefe65298f9..7ebcfb9ab9f30e3b0f13d3646a59d008335b232d 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -9,13 +9,14 @@ if(WITH_GPU)
     nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context)
     nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
     nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
-    nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context)
+    nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
     nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
-    nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
+    nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor)
     nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
     nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
     nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
     nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
+    nv_library(cos_sim_functor SRCS cos_sim_functor.cc cos_sim_functor.cu DEPS device_context)
 else()
     cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
     cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
@@ -23,13 +24,14 @@ else()
     cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context)
     cc_library(pooling SRCS pooling.cc DEPS device_context)
     cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
-    cc_library(vol2col SRCS vol2col.cc DEPS device_context)
+    cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor)
     cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
-    cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
+    cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor)
     cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
     cc_library(maxouting SRCS maxouting.cc DEPS device_context)
     cc_library(unpooling SRCS unpooling.cc DEPS device_context)
     cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
+    cc_library(cos_sim_functor SRCS cos_sim_functor.cc DEPS device_context)
 endif()
 
 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
diff --git a/paddle/operators/math/cos_sim_functor.cc b/paddle/operators/math/cos_sim_functor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6af9f0fcd967b4e8e9e338c155d5a8ee2866fbfa
--- /dev/null
+++ b/paddle/operators/math/cos_sim_functor.cc
@@ -0,0 +1,48 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/cos_sim_functor.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// CPU specialization: accumulates (+=) the dy contribution of every x row.
+template <typename T>
+struct CosSimDyFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx, const T* x_norm,
+                  const T* y_norm, const T* x, const T* y, const T* z,
+                  const T* dz, const size_t rows, const size_t cols,
+                  T* dy) const {
+    // Only y_norm[0] is read, so the terms derived from it are hoisted.
+    const T y_norm_data = y_norm[0];
+    const T reciprocal_y_norm_square = 1 / (y_norm_data * y_norm_data);
+    for (size_t row_id = 0; row_id < rows; ++row_id) {
+      const T reciprocal_xy_norm_prod = 1 / (x_norm[row_id] * y_norm_data);
+      const T dz_data = dz[row_id];
+      const T z_data = z[row_id];
+      const T* x_data = x + cols * row_id;
+      for (size_t i = 0; i < cols; ++i) {
+        dy[i] += dz_data * (x_data[i] * reciprocal_xy_norm_prod -
+                            z_data * y[i] * reciprocal_y_norm_square);
+      }
+    }
+  }
+};
+
+template struct CosSimDyFunctor<platform::CPUDeviceContext, float>;
+template struct CosSimDyFunctor<platform::CPUDeviceContext, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/cos_sim_functor.cu b/paddle/operators/math/cos_sim_functor.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6eb0a4ea4c5b86f84c93b97615255adf55e9e042
--- /dev/null
+++ b/paddle/operators/math/cos_sim_functor.cu
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/cos_sim_functor.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// Each thread handles rows row_id, row_id + stride, ... along grid x; every
+// thread atomically adds its row's terms to the one dy vector of cols values.
+template <typename T>
+__global__ void CosSimDyKernel(const T* x_norm, const T* y_norm, const T* x,
+                               const T* y, const T* z, const T* dz,
+                               const size_t rows, const size_t cols, T* dy) {
+  int stride = blockDim.x * gridDim.x;
+  T y_norm_data = y_norm[0];
+  // Depends only on y, so it is computed once per thread.
+  T inv_yy_norm = 1 / (y_norm_data * y_norm_data);
+  for (int row_id = blockIdx.x * blockDim.x + threadIdx.x; row_id < rows;
+       row_id += stride) {
+    const T* x_row = x + cols * row_id;
+    T grad_out = dz[row_id];
+    T cos_val = z[row_id];
+    T inv_xy_norm = 1 / (x_norm[row_id] * y_norm_data);
+    for (size_t col = 0; col < cols; ++col) {
+      T delta = grad_out *
+                (x_row[col] * inv_xy_norm - cos_val * y[col] * inv_yy_norm);
+      platform::CudaAtomicAdd(dy + col, delta);
+    }
+  }
+}
+
+// Launches CosSimDyKernel on ctx.stream() using blocks of 512 threads.
+template <typename T>
+struct CosSimDyFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx, const T* x_norm,
+                  const T* y_norm, const T* x, const T* y, const T* z,
+                  const T* dz, const size_t rows, const size_t cols,
+                  T* dy) const {
+    const int block_size = 512;
+    dim3 grid((rows + block_size - 1) / block_size, 1);  // blocks along x
+    CosSimDyKernel<T><<<grid, dim3(block_size, 1), 0, ctx.stream()>>>(
+        x_norm, y_norm, x, y, z, dz, rows, cols, dy);
+  }
+};
+
+template struct CosSimDyFunctor<platform::CUDADeviceContext, float>;
+template struct CosSimDyFunctor<platform::CUDADeviceContext, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/cos_sim_functor.h b/paddle/operators/math/cos_sim_functor.h
new file mode 100644
index 0000000000000000000000000000000000000000..aae8ab5b7a937c016e8a45e34b22aba7a1df3066
--- /dev/null
+++ b/paddle/operators/math/cos_sim_functor.h
@@ -0,0 +1,166 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <math.h>
+#include <stdlib.h>
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// Computes, for one row of x, its L2 norm, the L2 norm of the matching y
+// row, and their cosine similarity z = <x, y> / (|x| * |y|).
+//
+// same_row == true:  y is indexed by row like x, and y_norm_[row_id] is
+//                    written for every row.
+// same_row == false: the first cols_ values of y are used for every row,
+//                    and only the row_id == 0 call writes y_norm_[0].
+//
+// No check is made for a zero norm before the division.
+template <typename T, bool same_row>
+struct CosSimFunctor {
+  CosSimFunctor(const T* x, const T* y, T* x_norm, T* y_norm, T* z, int cols)
+      : x_norm_(x_norm),
+        y_norm_(y_norm),
+        x_(x),
+        y_(y),
+        z_(z),
+        cols_(static_cast<size_t>(cols)) {}
+
+  inline HOSTDEVICE void operator()(size_t row_id) const {
+    const T* x_row = x_ + cols_ * row_id;
+    // Pick the y row once instead of duplicating the reduction per mode.
+    const T* y_row = same_row ? y_ + cols_ * row_id : y_;
+
+    T sum_xx = 0, sum_xy = 0, sum_yy = 0;
+    for (size_t i = 0; i < cols_; ++i) {
+      const T xv = x_row[i];
+      const T yv = y_row[i];
+      sum_xx += xv * xv;
+      sum_yy += yv * yv;
+      sum_xy += xv * yv;
+    }
+    const T x_len = sqrt(sum_xx);
+    const T y_len = sqrt(sum_yy);
+
+    if (same_row) {
+      y_norm_[row_id] = y_len;
+    } else if (row_id == 0) {
+      // All rows share one y here, so its norm is stored only once.
+      y_norm_[0] = y_len;
+    }
+    x_norm_[row_id] = x_len;
+    z_[row_id] = sum_xy / (x_len * y_len);
+  }
+
+  T* x_norm_;
+  T* y_norm_;
+  const T* x_;
+  const T* y_;
+  T* z_;
+  const size_t cols_;
+};
+
+// Gradient w.r.t. x when x and y are both indexed by row. For each row:
+//   dx = dz * (y / (|x| * |y|) - z * x / |x|^2)
+template <typename T>
+struct CosSimGradFunctor {
+  CosSimGradFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y,
+                    const T* z, const T* dz, T* dx, int cols)
+      : x_norm_(x_norm),
+        y_norm_(y_norm),
+        x_(x),
+        y_(y),
+        z_(z),
+        dz_(dz),
+        dx_(dx),
+        cols_(static_cast<size_t>(cols)) {}
+
+  inline HOSTDEVICE void operator()(size_t row_id) const {
+    const size_t offset = cols_ * row_id;
+    const T* x_row = x_ + offset;
+    const T* y_row = y_ + offset;
+    T* dx_row = dx_ + offset;
+    const T grad_out = dz_[row_id];
+    const T cos_val = z_[row_id];
+    const T inv_xy_norm = 1 / (x_norm_[row_id] * y_norm_[row_id]);
+    const T inv_xx_norm = 1 / (x_norm_[row_id] * x_norm_[row_id]);
+    // dx is overwritten, not accumulated into.
+    for (size_t col = 0; col < cols_; ++col) {
+      dx_row[col] = grad_out * (y_row[col] * inv_xy_norm -
+                                cos_val * x_row[col] * inv_xx_norm);
+    }
+  }
+
+  const T* x_norm_;
+  const T* y_norm_;
+  const T* x_;
+  const T* y_;
+  const T* z_;
+  const T* dz_;
+  T* dx_;
+  const size_t cols_;
+};
+
+// Gradient w.r.t. x when every row of x is paired with the same y vector:
+// the first cols_ values of y_ and the norm y_norm_[0] serve all rows.
+template <typename T>
+struct CosSimDxFunctor {
+  CosSimDxFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y,
+                  const T* z, const T* dz, T* dx, int cols)
+      : x_norm_(x_norm),
+        y_norm_(y_norm),
+        x_(x),
+        y_(y),
+        z_(z),
+        dz_(dz),
+        dx_(dx),
+        cols_(static_cast<size_t>(cols)) {}
+
+  inline HOSTDEVICE void operator()(size_t row_id) const {
+    const size_t offset = cols_ * row_id;
+    const T* x_row = x_ + offset;
+    T* dx_row = dx_ + offset;
+    const T grad_out = dz_[row_id];
+    const T cos_val = z_[row_id];
+    const T inv_xy_norm = 1 / (x_norm_[row_id] * y_norm_[0]);
+    const T inv_xx_norm = 1 / (x_norm_[row_id] * x_norm_[row_id]);
+    for (size_t col = 0; col < cols_; ++col) {
+      dx_row[col] = grad_out * (y_[col] * inv_xy_norm -
+                                cos_val * x_row[col] * inv_xx_norm);
+    }
+  }
+  const T* x_norm_;
+  const T* y_norm_;
+  const T* x_;
+  const T* y_;
+  const T* z_;
+  const T* dz_;
+  T* dx_;
+  const size_t cols_;
+};
+
+template <typename DeviceContext, typename T>
+struct CosSimDyFunctor {  // specialized for CPU and CUDA contexts in .cc/.cu
+  void operator()(const DeviceContext& ctx, const T* x_norm, const T* y_norm,
+                  const T* x, const T* y, const T* z, const T* dz,
+                  const size_t rows, const size_t cols, T* dy) const;
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/detail/gru_cpu_kernel.h b/paddle/operators/math/detail/gru_cpu_kernel.h
index 4c67dec9cbeb48f400f79f5ed7ba3c939fa2540c..a61b232f4275d93cae1d9a71d49a779216c3555b 100644
--- a/paddle/operators/math/detail/gru_cpu_kernel.h
+++ b/paddle/operators/math/detail/gru_cpu_kernel.h
@@ -28,7 +28,7 @@ template <class OpResetOutput, typename T>
 void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
                                        T *gate_value, T *reset_output_value,
                                        T *prev_output_value, int frame_size,
-                                       activation_mode_t active_gate) {
+                                       ActivationType active_gate) {
   T r_value_update_gate;
   T r_value_reset_gate;
   T r_value_reset_output;
@@ -56,7 +56,7 @@ template <class OpFinalOutput, typename T>
 void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
                                        T *gate_value, T *prev_output_value,
                                        T *output_value, int frame_size,
-                                       activation_mode_t active_node) {
+                                       ActivationType active_node) {
   T r_value_update_gate;
   T r_value_frame_state;
   T r_prev_out = 0;
@@ -83,7 +83,7 @@ template <class OpResetOutput, typename T>
 void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
                                      T *gate_value, T *reset_output_value,
                                      T *prev_output_value, int frame_size,
-                                     activation_mode_t active_gate) {
+                                     ActivationType active_gate) {
 #ifdef __AVX__
   __m256 r_value_update_gate;
   __m256 r_value_reset_gate;
@@ -113,7 +113,7 @@ template <class OpFinalOutput, typename T>
 void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
                                      T *gate_value, T *prev_output_value,
                                      T *output_value, int frame_size,
-                                     activation_mode_t active_node) {
+                                     ActivationType active_node) {
 #ifdef __AVX__
   __m256 r_value_update_gate;
   __m256 r_value_frame_state;
@@ -140,9 +140,8 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
 
 template <class OpResetOutput, typename T>
 inline void forward_reset_output(OpResetOutput op_reset_output,
-                                 hl_gru_value<T> value, int frame_size,
-                                 int batch_size,
-                                 activation_mode_t active_gate) {
+                                 GRUMetaValue<T> value, int frame_size,
+                                 int batch_size, ActivationType active_gate) {
   for (int b = 0; b < batch_size; b++) {
     if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
       hl_avx_gru_forward_reset_output(
@@ -164,9 +163,8 @@ inline void forward_reset_output(OpResetOutput op_reset_output,
 
 template <class OpFinalOutput, typename T>
 inline void forward_final_output(OpFinalOutput op_final_output,
-                                 hl_gru_value<T> value, int frame_size,
-                                 int batch_size,
-                                 activation_mode_t active_node) {
+                                 GRUMetaValue<T> value, int frame_size,
+                                 int batch_size, ActivationType active_node) {
   for (int b = 0; b < batch_size; b++) {
     if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
       hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
@@ -191,7 +189,7 @@ void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
                                       T *gate_grad, T *prev_out_value,
                                       T *prev_out_grad, T *output_grad,
                                       int frame_size,
-                                      activation_mode_t active_node) {
+                                      ActivationType active_node) {
   T r_update_gate_value;
   T r_update_gate_grad;
   T r_frame_state_value;
@@ -232,7 +230,7 @@ void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
                                       T *gate_grad, T *prev_out_value,
                                       T *prev_out_grad, T *reset_output_grad,
                                       int frame_size,
-                                      activation_mode_t active_gate) {
+                                      ActivationType active_gate) {
   T r_update_gate_value;
   T r_update_gate_grad;
   T r_reset_gate_value;
@@ -277,7 +275,7 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
                                     T *gate_grad, T *prev_out_value,
                                     T *prev_out_grad, T *output_grad,
                                     int frame_size,
-                                    activation_mode_t active_node) {
+                                    ActivationType active_node) {
 #ifdef __AVX__
   __m256 r_update_gate_value;
   __m256 r_update_gate_grad;
@@ -320,7 +318,7 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
                                     T *gate_grad, T *prev_out_value,
                                     T *prev_out_grad, T *reset_output_grad,
                                     int frame_size,
-                                    activation_mode_t active_gate) {
+                                    ActivationType active_gate) {
 #ifdef __AVX__
   __m256 r_update_gate_value;
   __m256 r_update_gate_grad;
@@ -364,9 +362,9 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
 
 template <class OpStateGrad, typename T>
 inline void backward_state_grad(OpStateGrad op_state_grad,
-                                hl_gru_value<T> value, hl_gru_grad<T> grad,
+                                GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                                 int frame_size, int batch_size,
-                                activation_mode_t active_node) {
+                                ActivationType active_node) {
   for (int b = 0; b < batch_size; b++) {
     if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
       hl_avx_gru_backward_state_grad(
@@ -393,9 +391,9 @@ inline void backward_state_grad(OpStateGrad op_state_grad,
 
 template <class OpResetGrad, typename T>
 inline void backward_reset_grad(OpResetGrad op_reset_grad,
-                                hl_gru_value<T> value, hl_gru_grad<T> grad,
+                                GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                                 int frame_size, int batch_size,
-                                activation_mode_t active_gate) {
+                                ActivationType active_gate) {
   for (int b = 0; b < batch_size; b++) {
     if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
       hl_avx_gru_backward_reset_grad(
diff --git a/paddle/operators/math/detail/gru_gpu_kernel.h b/paddle/operators/math/detail/gru_gpu_kernel.h
index d2edcb7f258b387530799b967fc0fff61acc5b83..1783d46096858c874b27ce75760342082835b180 100644
--- a/paddle/operators/math/detail/gru_gpu_kernel.h
+++ b/paddle/operators/math/detail/gru_gpu_kernel.h
@@ -19,8 +19,6 @@ limitations under the License. */
 #include "paddle/platform/cuda_helper.h"
 #include "paddle/platform/device_context.h"
 
-#include <glog/logging.h>
-
 namespace paddle {
 namespace operators {
 namespace math {
@@ -35,7 +33,7 @@ __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output,
                                         T *gate_value, T *reset_output_value,
                                         T *prev_output_value, int frame_size,
                                         int batch_size,
-                                        activation_mode_t active_gate) {
+                                        ActivationType active_gate) {
   const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (frame_idx >= frame_size) return;
 
@@ -74,7 +72,7 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
                                         T *gate_value, T *prev_output_value,
                                         T *output_value, int frame_size,
                                         int batch_size,
-                                        activation_mode_t active_node) {
+                                        ActivationType active_node) {
   const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (frame_idx >= frame_size) return;
   int batch_idx = 0;
@@ -111,7 +109,7 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
                                        T *gate_grad, T *prev_out_value,
                                        T *prev_out_grad, T *output_grad,
                                        int frame_size, int batch_size,
-                                       activation_mode_t active_node) {
+                                       ActivationType active_node) {
   const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (frame_idx >= frame_size) return;
   int batch_idx = 0;
@@ -159,7 +157,7 @@ __global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value,
                                        T *gate_grad, T *prev_out_value,
                                        T *prev_out_grad, T *reset_output_grad,
                                        int frame_size, int batch_size,
-                                       activation_mode_t active_gate) {
+                                       ActivationType active_gate) {
   const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (frame_idx >= frame_size) return;
   int batch_idx = 0;
diff --git a/paddle/operators/math/detail/gru_kernel.h b/paddle/operators/math/detail/gru_kernel.h
index acd84be01db9ddaf06d165d8be353b253f324dd2..4d8245cb5d03b33edbda5d8350be02b4fa87ab95 100644
--- a/paddle/operators/math/detail/gru_kernel.h
+++ b/paddle/operators/math/detail/gru_kernel.h
@@ -30,7 +30,7 @@ class gru_resetOutput {
  public:
   HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate,
                              T &prev_out, T &value_reset_output,
-                             activation_mode_t act_gate) {
+                             ActivationType act_gate) {
     value_update_gate = activation(value_update_gate, act_gate);
     value_reset_gate = activation(value_reset_gate, act_gate);
     value_reset_output = prev_out * value_reset_gate;
@@ -43,7 +43,7 @@ class gru_resetOutput {
   HOSTDEVICE void operator()(__m256 &value_update_gate,
                              __m256 &value_reset_gate, __m256 &prev_out,
                              __m256 &value_reset_output,
-                             activation_mode_t act_gate) {
+                             ActivationType act_gate) {
     value_update_gate = activation(value_update_gate, act_gate);
     value_reset_gate = activation(value_reset_gate, act_gate);
     value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate);
@@ -57,7 +57,7 @@ class gru_finalOutput {
  public:
   HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state,
                              T &prev_out, T &value_output,
-                             activation_mode_t act_input) {
+                             ActivationType act_input) {
     value_frame_state = activation(value_frame_state, act_input);
     value_output = prev_out - (value_update_gate * prev_out) +
                    (value_update_gate * value_frame_state);
@@ -69,8 +69,7 @@ class gru_finalOutput {
   static const bool avx = true;
   HOSTDEVICE void operator()(__m256 &value_update_gate,
                              __m256 &value_frame_state, __m256 &prev_out,
-                             __m256 &value_output,
-                             activation_mode_t act_input) {
+                             __m256 &value_output, ActivationType act_input) {
     value_frame_state = activation(value_frame_state, act_input);
     value_output = _mm256_add_ps(
         _mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)),
@@ -89,7 +88,7 @@ class gru_stateGrad {
   HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
                              T &value_frame_state, T &grad_frame_state,
                              T &value_prev_out, T &grad_prev_out,
-                             T &grad_output, activation_mode_t act_input) {
+                             T &grad_output, ActivationType act_input) {
     grad_update_gate = (grad_output * value_frame_state);
     grad_update_gate -= (grad_output * value_prev_out);
     grad_prev_out -= (grad_output * value_update_gate);
@@ -107,7 +106,7 @@ class gru_stateGrad {
                              __m256 &value_frame_state,
                              __m256 &grad_frame_state, __m256 &value_prev_out,
                              __m256 &grad_prev_out, __m256 &grad_output,
-                             activation_mode_t act_input) {
+                             ActivationType act_input) {
     grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state);
     grad_update_gate = _mm256_sub_ps(
         grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out));
@@ -128,7 +127,7 @@ class gru_resetGrad {
   HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
                              T &value_reset_gate, T &grad_reset_gate,
                              T &value_prev_out, T &grad_prev_out,
-                             T &grad_reset_output, activation_mode_t act_gate) {
+                             T &grad_reset_output, ActivationType act_gate) {
     grad_reset_gate = (grad_reset_output * value_prev_out);
     grad_prev_out += (grad_reset_output * value_reset_gate);
     grad_update_gate =
@@ -144,7 +143,7 @@ class gru_resetGrad {
                              __m256 &grad_update_gate, __m256 &value_reset_gate,
                              __m256 &grad_reset_gate, __m256 &value_prev_out,
                              __m256 &grad_prev_out, __m256 &grad_reset_output,
-                             activation_mode_t act_gate) {
+                             ActivationType act_gate) {
     grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out);
     grad_prev_out = _mm256_add_ps(
         grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate));
diff --git a/paddle/operators/math/detection_util.h b/paddle/operators/math/detection_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..e3a3ef2badc37924d866ded8ee7a7338fbc4b2d2
--- /dev/null
+++ b/paddle/operators/math/detection_util.h
@@ -0,0 +1,300 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <map>
+#include "paddle/framework/selected_rows.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+// Axis-aligned box stored as its min/max corner coordinates.
+template <typename T>
+struct BBox {
+  BBox(T x_min, T y_min, T x_max, T y_max)
+      : x_min(x_min),
+        y_min(y_min),
+        x_max(x_max),
+        y_max(y_max),
+        is_difficult(false) {}
+
+  // Value-initializes every member so a default-constructed box never
+  // carries indeterminate coordinates or an indeterminate is_difficult flag.
+  BBox() : x_min(), y_min(), x_max(), y_max(), is_difficult(false) {}
+
+  // Derived geometry; nothing here checks that max >= min.
+  T get_width() const { return x_max - x_min; }
+  T get_height() const { return y_max - y_min; }
+  T get_center_x() const { return (x_min + x_max) / 2; }
+  T get_center_y() const { return (y_min + y_max) / 2; }
+  T get_area() const { return get_width() * get_height(); }
+
+  // coordinate of bounding box
+  T x_min;
+  T y_min;
+  T x_max;
+  T y_max;
+  // whether difficult object (e.g. object with heavy occlusion is difficult)
+  bool is_difficult;
+};
+// KNCHW ==> NHWC
+// Forward declarations of the helper templates used in this header.
+template <typename T>
+void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes,
+                          std::vector<BBox<T>>& bbox_vec);
+template <typename T>
+void GetBBoxVarFromPriorData(const T* prior_data, const size_t num,
+                             std::vector<std::vector<T>>& var_vec);
+template <typename T>
+BBox<T> DecodeBBoxWithVar(BBox<T>& prior_bbox,
+                          const std::vector<T>& prior_bbox_var,
+                          const std::vector<T>& loc_pred_data);
+template <typename T1, typename T2>
+bool SortScorePairDescend(const std::pair<T1, T2>& pair1,
+                          const std::pair<T1, T2>& pair2);
+template <typename T>  // NOTE(review): declared only, never defined below
+bool SortScorePairDescend(const std::pair<T, BBox<T>>& pair1,
+                          const std::pair<T, BBox<T>>& pair2);
+template <typename T>
+T jaccard_overlap(const BBox<T>& bbox1, const BBox<T>& bbox2);
+
+template <typename T>
+void ApplyNmsFast(const std::vector<BBox<T>>& bboxes, const T* conf_score_data,
+                  size_t class_idx, size_t top_k, T conf_threshold,
+                  T nms_threshold, size_t num_priors, size_t num_classes,
+                  std::vector<size_t>* indices);
+template <typename T>
+int GetDetectionIndices(
+    const T* conf_data, const size_t num_priors, const size_t num_classes,
+    const size_t background_label_id, const size_t batch_size,
+    const T conf_threshold, const size_t nms_top_k, const T nms_threshold,
+    const size_t top_k,
+    const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes,
+    std::vector<std::map<size_t, std::vector<size_t>>>* all_detection_indices);
+template <typename T>
+BBox<T> ClipBBox(const BBox<T>& bbox);
+template <typename T>
+void GetDetectionOutput(
+    const T* conf_data, const size_t num_kept, const size_t num_priors,
+    const size_t num_classes, const size_t batch_size,
+    const std::vector<std::map<size_t, std::vector<size_t>>>& all_indices,
+    const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes, T* out_data);
+// Appends num_bboxes boxes to bbox_vec. Every prior spans 8 values of
+// prior_data, of which the first four are x_min, y_min, x_max, y_max.
+template <typename T>
+void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes,
+                          std::vector<BBox<T>>& bbox_vec) {
+  const size_t out_offset = bbox_vec.size();
+  bbox_vec.resize(out_offset + num_bboxes);
+  for (size_t i = 0; i < num_bboxes; ++i) {
+    const T* p = prior_data + i * 8;
+    // Use the 4-argument constructor so that is_difficult is explicitly
+    // false in every stored box.
+    bbox_vec[out_offset + i] = BBox<T>(p[0], p[1], p[2], p[3]);
+  }
+}
+// Appends num vectors to var_vec. Every prior spans 8 values of prior_data;
+// the values at offsets 4..7 are copied, in order, into one 4-element
+// vector. Existing entries of var_vec are left untouched.
+template <typename T>
+void GetBBoxVarFromPriorData(const T* prior_data, const size_t num,
+                             std::vector<std::vector<T>>& var_vec) {
+  const size_t base = var_vec.size();
+  var_vec.resize(base + num);
+  for (size_t i = 0; i < num; ++i) {
+    // Offsets 0..3 of each prior are not read by this function.
+    const T* first = prior_data + i * 8 + 4;
+    var_vec[base + i].assign(first, first + 4);
+  }
+}
+// Decodes a box from a prior box, its four var values and four predicted
+// offsets: the center shifts by var * offset * prior size, and the size is
+// the prior size scaled by exp(var * offset). Both vectors are indexed
+// 0..3 without a bounds check.
+template <typename T>
+BBox<T> DecodeBBoxWithVar(BBox<T>& prior_bbox,
+                          const std::vector<T>& prior_bbox_var,
+                          const std::vector<T>& loc_pred_data) {
+  const T prior_w = prior_bbox.get_width();
+  const T prior_h = prior_bbox.get_height();
+
+  // Center of the decoded box.
+  const T center_x = prior_bbox_var[0] * loc_pred_data[0] * prior_w +
+                     prior_bbox.get_center_x();
+  const T center_y = prior_bbox_var[1] * loc_pred_data[1] * prior_h +
+                     prior_bbox.get_center_y();
+  // Size of the decoded box, then the half extents used for the corners.
+  const T width = std::exp(prior_bbox_var[2] * loc_pred_data[2]) * prior_w;
+  const T height = std::exp(prior_bbox_var[3] * loc_pred_data[3]) * prior_h;
+  const T half_w = width / 2;
+  const T half_h = height / 2;
+
+  // Build the result with the 4-argument constructor so that is_difficult
+  // is explicitly set to false.
+  return BBox<T>(center_x - half_w, center_y - half_h, center_x + half_w,
+                 center_y + half_h);
+}
+template <typename T1, typename T2>
+bool SortScorePairDescend(const std::pair<T1, T2>& pair1,
+                          const std::pair<T1, T2>& pair2) {
+  return pair1.first > pair2.first;  // true when pair1's .first is larger
+}
+// Intersection-over-union of two boxes. Disjoint boxes, and pairs whose
+// union area is not positive (which would divide by zero), yield 0.
+template <typename T>
+T jaccard_overlap(const BBox<T>& bbox1, const BBox<T>& bbox2) {
+  const T zero = static_cast<T>(0);
+  if (bbox2.x_min > bbox1.x_max || bbox2.x_max < bbox1.x_min ||
+      bbox2.y_min > bbox1.y_max || bbox2.y_max < bbox1.y_min) {
+    return zero;
+  }
+
+  const T inter_x_min = std::max(bbox1.x_min, bbox2.x_min);
+  const T inter_y_min = std::max(bbox1.y_min, bbox2.y_min);
+  const T inter_x_max = std::min(bbox1.x_max, bbox2.x_max);
+  const T inter_y_max = std::min(bbox1.y_max, bbox2.y_max);
+  const T inter_area =
+      (inter_x_max - inter_x_min) * (inter_y_max - inter_y_min);
+  const T union_area = bbox1.get_area() + bbox2.get_area() - inter_area;
+
+  // Degenerate (zero-area) boxes that touch would give 0 / 0 == NaN.
+  return union_area > zero ? inter_area / union_area : zero;
+}
+
+// Greedy non-maximum suppression for one class. Priors whose score for
+// class_idx exceeds conf_threshold are visited from highest to lowest score
+// (at most top_k of them when top_k > 0); a prior is appended to *indices
+// unless its overlap with a box already listed there exceeds nms_threshold.
+// Entries present in *indices on entry take part in that test as well.
+template <typename T>
+void ApplyNmsFast(const std::vector<BBox<T>>& bboxes, const T* conf_score_data,
+                  size_t class_idx, size_t top_k, T conf_threshold,
+                  T nms_threshold, size_t num_priors, size_t num_classes,
+                  std::vector<size_t>* indices) {
+  std::vector<std::pair<T, size_t>> scores;
+  for (size_t i = 0; i < num_priors; ++i) {
+    const T score = conf_score_data[i * num_classes + class_idx];
+    if (score > conf_threshold) scores.push_back(std::make_pair(score, i));
+  }
+  std::stable_sort(scores.begin(), scores.end(),
+                   SortScorePairDescend<T, size_t>);
+  if (top_k > 0 && top_k < scores.size()) scores.resize(top_k);
+  // Iterate by index rather than erasing the front element each step, which
+  // would shift the whole vector every time.
+  for (size_t s = 0; s < scores.size(); ++s) {
+    const size_t idx = scores[s].second;
+    bool keep = true;
+    for (size_t k = 0; keep && k < indices->size(); ++k) {
+      keep = jaccard_overlap<T>(bboxes[idx], bboxes[(*indices)[k]]) <=
+             nms_threshold;
+    }
+    if (keep) indices->push_back(idx);
+  }
+}
+// Per image: runs ApplyNmsFast for every class except background_label_id,
+// then keeps only the top_k highest scores if top_k > 0 and more were found.
+// Appends one label->priors map per image; returns the total number kept.
+template <typename T>
+int GetDetectionIndices(
+    const T* conf_data, const size_t num_priors, const size_t num_classes,
+    const size_t background_label_id, const size_t batch_size,
+    const T conf_threshold, const size_t nms_top_k, const T nms_threshold,
+    const size_t top_k,
+    const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes,
+    std::vector<std::map<size_t, std::vector<size_t>>>* all_detection_indices) {
+  typedef std::map<size_t, std::vector<size_t>> LabelIndices;
+  int total_keep_num = 0;
+  for (size_t n = 0; n < batch_size; ++n) {
+    const T* image_conf = conf_data + n * num_priors * num_classes;
+    LabelIndices indices;
+    size_t num_detected = 0;
+    for (size_t c = 0; c < num_classes; ++c) {
+      if (c == background_label_id) continue;
+      ApplyNmsFast<T>(all_decoded_bboxes[n], image_conf, c, nms_top_k,
+                      conf_threshold, nms_threshold, num_priors, num_classes,
+                      &(indices[c]));
+      num_detected += indices[c].size();
+    }
+    if (top_k == 0 || num_detected <= top_k) {
+      all_detection_indices->push_back(indices);
+      total_keep_num += num_detected;
+      continue;
+    }
+    std::vector<std::pair<T, std::pair<size_t, size_t>>> ranked;
+    for (LabelIndices::const_iterator it = indices.begin();
+         it != indices.end(); ++it) {
+      const size_t label = it->first;
+      for (size_t i = 0; i < it->second.size(); ++i) {
+        const size_t idx = it->second[i];
+        ranked.push_back(std::make_pair(image_conf[idx * num_classes + label],
+                                        std::make_pair(label, idx)));
+      }
+    }
+    std::sort(ranked.begin(), ranked.end(),
+              SortScorePairDescend<T, std::pair<size_t, size_t>>);
+    ranked.resize(top_k);
+    LabelIndices top_indices;
+    for (size_t i = 0; i < ranked.size(); ++i) {
+      top_indices[ranked[i].second.first].push_back(ranked[i].second.second);
+    }
+    all_detection_indices->push_back(top_indices);
+    total_keep_num += top_k;
+  }
+  return total_keep_num;
+}
+// Clamps every coordinate of bbox to [0, 1]; is_difficult is copied as is.
+template <typename T>
+BBox<T> ClipBBox(const BBox<T>& bbox) {
+  const T zero = static_cast<T>(0.0), one = static_cast<T>(1.0);
+  BBox<T> clipped_bbox = bbox;  // start from a copy to keep is_difficult
+  clipped_bbox.x_min = std::max(std::min(bbox.x_min, one), zero);
+  clipped_bbox.y_min = std::max(std::min(bbox.y_min, one), zero);
+  clipped_bbox.x_max = std::max(std::min(bbox.x_max, one), zero);
+  clipped_bbox.y_max = std::max(std::min(bbox.y_max, one), zero);
+  return clipped_bbox;
+}
+// Emits one record per kept detection into out_data (num_kept is unused):
+//   [image index, label, score, clipped x_min, y_min, x_max, y_max].
+template <typename T>
+void GetDetectionOutput(
+    const T* conf_data, const size_t num_kept, const size_t num_priors,
+    const size_t num_classes, const size_t batch_size,
+    const std::vector<std::map<size_t, std::vector<size_t>>>& all_indices,
+    const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes, T* out_data) {
+  typedef std::map<size_t, std::vector<size_t>> LabelIndices;
+  T* record = out_data;
+  for (size_t n = 0; n < batch_size; ++n) {
+    for (LabelIndices::const_iterator it = all_indices[n].begin();
+         it != all_indices[n].end(); ++it) {
+      const size_t label = it->first;
+      const std::vector<BBox<T>>& decoded_bboxes = all_decoded_bboxes[n];
+      for (size_t i = 0; i < it->second.size(); ++i) {
+        const size_t idx = it->second[i];
+        const BBox<T> box = ClipBBox<T>(decoded_bboxes[idx]);
+        record[0] = n;
+        record[1] = label;
+        record[2] = conf_data[(n * num_priors + idx) * num_classes + label];
+        record[3] = box.x_min;
+        record[4] = box.y_min;
+        record[5] = box.x_max;
+        record[6] = box.y_max;
+        record += 7;
+      }
+    }
+  }
+}
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/gru_compute.cc b/paddle/operators/math/gru_compute.cc
index d570c68cd458914c8951c4ce50a02e3c5b1acab0..101ab859624869bf34d171cd42d46d0c5bdac29c 100644
--- a/paddle/operators/math/gru_compute.cc
+++ b/paddle/operators/math/gru_compute.cc
@@ -21,9 +21,9 @@ namespace math {
 template <typename T>
 struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
   static void compute(const platform::CPUDeviceContext &context,
-                      hl_gru_value<T> value, int frame_size, int batch_size,
-                      activation_mode_t active_node,
-                      activation_mode_t active_gate) {
+                      GRUMetaValue<T> value, int frame_size, int batch_size,
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate) {
 #ifndef __NVCC__
     if (value.prev_out_value) {
       math::gemm<platform::CPUDeviceContext, T>(
@@ -51,10 +51,10 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
 template <typename T>
 struct GRUUnitGradFunctor<platform::CPUDeviceContext, T> {
   static void compute(const platform::CPUDeviceContext &context,
-                      hl_gru_value<T> value, hl_gru_grad<T> grad,
+                      GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                       int frame_size, int batch_size,
-                      activation_mode_t active_node,
-                      activation_mode_t active_gate) {
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate) {
 #ifndef __NVCC__
     detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
                                 grad, frame_size, batch_size, active_node);
diff --git a/paddle/operators/math/gru_compute.cu b/paddle/operators/math/gru_compute.cu
index dd518cd1e4bea52f0d463150114feed3ceea0ccb..d5a0e630ea0eadea990988c3170395c842a91900 100644
--- a/paddle/operators/math/gru_compute.cu
+++ b/paddle/operators/math/gru_compute.cu
@@ -21,9 +21,9 @@ namespace math {
 template <typename T>
 struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
   static void compute(const platform::CUDADeviceContext &context,
-                      hl_gru_value<T> value, int frame_size, int batch_size,
-                      activation_mode_t active_node,
-                      activation_mode_t active_gate) {
+                      GRUMetaValue<T> value, int frame_size, int batch_size,
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate) {
     auto stream = context.stream();
     dim3 threads;
     dim3 grid;
@@ -88,10 +88,10 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
 template <typename T>
 struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
   static void compute(const platform::CUDADeviceContext &context,
-                      hl_gru_value<T> value, hl_gru_grad<T> grad,
+                      GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                       int frame_size, int batch_size,
-                      activation_mode_t active_node,
-                      activation_mode_t active_gate) {
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate) {
     auto stream = context.stream();
     dim3 threads;
     dim3 grid;
diff --git a/paddle/operators/math/gru_compute.h b/paddle/operators/math/gru_compute.h
index ca1343cb2c5c1eb8da92c2f06b25902c1c2fe8b3..bf69147b506661692a6d71823043cd3506ea8b5d 100644
--- a/paddle/operators/math/gru_compute.h
+++ b/paddle/operators/math/gru_compute.h
@@ -11,7 +11,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/operators/math/lstm_compute.h"
+#include "paddle/operators/math/detail/activation_functions.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 
@@ -19,9 +19,8 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-// TODO(guosheng): refine code style in gru_compute
 template <typename T>
-struct hl_gru_value {
+struct GRUMetaValue {
   T *gate_weight;
   T *state_weight;
   T *gate_value;
@@ -31,7 +30,7 @@ struct hl_gru_value {
 };
 
 template <typename T>
-struct hl_gru_grad {
+struct GRUMetaGrad {
   T *gate_weight_grad;
   T *state_weight_grad;
   T *gate_grad;
@@ -42,18 +41,18 @@ struct hl_gru_grad {
 
 template <typename DeviceContext, typename T>
 struct GRUUnitFunctor {
-  static void compute(const DeviceContext &context, hl_gru_value<T> value,
+  static void compute(const DeviceContext &context, GRUMetaValue<T> value,
                       int frame_size, int batch_size,
-                      activation_mode_t active_node,
-                      activation_mode_t active_gate);
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate);
 };
 
 template <typename DeviceContext, typename T>
 struct GRUUnitGradFunctor {
-  static void compute(const DeviceContext &context, hl_gru_value<T> value,
-                      hl_gru_grad<T> grad, int frame_size, int batch_size,
-                      activation_mode_t active_node,
-                      activation_mode_t active_gate);
+  static void compute(const DeviceContext &context, GRUMetaValue<T> value,
+                      GRUMetaGrad<T> grad, int frame_size, int batch_size,
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate);
 };
 
 }  // namespace math
diff --git a/paddle/operators/math/lstm_compute.h b/paddle/operators/math/lstm_compute.h
index 954762f92286fe13bd2c08ec03c3ac96bb663cca..e1ad6b64d201ef99d83eaa2a821356008dcc9c8e 100644
--- a/paddle/operators/math/lstm_compute.h
+++ b/paddle/operators/math/lstm_compute.h
@@ -22,14 +22,6 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-typedef enum {
-  HL_ACTIVATION_SIGMOID = 0,
-  HL_ACTIVATION_RELU = 1,
-  HL_ACTIVATION_TANH = 2,
-  HL_ACTIVATION_LINEAR = 3,
-  HL_ACTIVATION_END
-} activation_mode_t;
-
 template <class T>
 struct LstmMetaValue {
   T *gate_value;
@@ -54,20 +46,6 @@ struct LstmMetaGrad {
   T *check_og_grad;
 };
 
-inline activation_mode_t ActiveType(const std::string &type) {
-  if (type == "sigmoid") {
-    return HL_ACTIVATION_SIGMOID;
-  } else if (type == "relu") {
-    return HL_ACTIVATION_RELU;
-  } else if (type == "tanh") {
-    return HL_ACTIVATION_TANH;
-  } else if (type == "linear" || type == "identity" || type == "") {
-    return HL_ACTIVATION_LINEAR;
-  } else {
-    PADDLE_THROW("Do not support activation type.");
-  }
-}
-
 template <typename DeviceContext, typename T>
 class LstmUnitFunctor {
  public:
diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc
index 2b35e4532a9c9f72f473020d472244234af24248..dcf4b85e1aadf88e4b1ca70ac7e8b5416fc58cd8 100644
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -245,9 +245,12 @@ template struct SetConstant<platform::CPUDeviceContext, int>;
 template struct SetConstant<platform::CPUDeviceContext, int64_t>;
 template struct SetConstant<platform::CPUDeviceContext, bool>;
 
-#define DEFINE_CPU_TRANS(RANK)                                        \
-  template struct Transpose<platform::CPUDeviceContext, float, RANK>; \
-  template struct Transpose<platform::CPUDeviceContext, double, RANK>;
+#define DEFINE_CPU_TRANS(RANK)                                          \
+  template struct Transpose<platform::CPUDeviceContext, float, RANK>;   \
+  template struct Transpose<platform::CPUDeviceContext, double, RANK>;  \
+  template struct Transpose<platform::CPUDeviceContext, int, RANK>;     \
+  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
+  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;
 
 DEFINE_CPU_TRANS(1);
 DEFINE_CPU_TRANS(2);
@@ -302,8 +305,29 @@ void set_constant(const platform::DeviceContext& context,
 #endif
 }
 
+// CPU specialization of RowwiseAdd: adds `vector` to every row of `input`
+// and stores the result in `output`.
+//
+// `vector` must hold exactly input.numel() / input.dims()[0] elements and
+// `output` must already have the same dims as `input`; both conditions are
+// enforced below. `context` is not used by this implementation.
+template <typename T>
+struct RowwiseAdd<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& vector, framework::Tensor* output) {
+    auto in_dims = input.dims();
+    // Number of elements in one row, i.e. everything but the first dim.
+    auto size = input.numel() / in_dims[0];
+    PADDLE_ENFORCE_EQ(vector.numel(), size);
+    PADDLE_ENFORCE_EQ(output->dims(), in_dims);
+
+    // Eigen views over the tensors; `vector` is flattened to one dimension.
+    auto in = framework::EigenMatrix<T>::From(input);
+    auto vec = framework::EigenVector<T>::Flatten(vector);
+    auto out = framework::EigenMatrix<T>::From(*output);
+
+    // Process one row at a time: chip(i, 0) selects index i of dimension 0.
+    for (int64_t i = 0; i < in_dims[0]; ++i) {
+      out.chip(i, 0) = in.chip(i, 0) + vec;
+    }
+  }
+};
+
 template struct RowwiseAdd<platform::CPUDeviceContext, float>;
 template struct RowwiseAdd<platform::CPUDeviceContext, double>;
+
 template struct ColwiseSum<platform::CPUDeviceContext, float>;
 template struct ColwiseSum<platform::CPUDeviceContext, double>;
 
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
index 927838a0948d2df5701b8e9189f59cdd66396b52..d47a7f818ded61baf31e46ea3b8ae3101324111f 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -273,6 +273,35 @@ void set_constant_with_place<platform::CUDAPlace>(
                            TensorSetConstantGPU(context, tensor, value));
 }
 
+// Element-wise kernel computing c[i] = a[i] + b[i % width] for every i in
+// [0, num): `a` and `c` are treated as flat buffers of `num` elements made
+// of consecutive rows of `width` elements, and `b` is a vector of `width`
+// elements that is added to each row.
+//
+// Each thread starts at its global index and advances by the total number
+// of launched threads until it passes `num`.
+template <typename T>
+__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width,
+                                 int num) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    // Use exact integer arithmetic for the column index. Multiplying by a
+    // floating-point reciprocal (i * (1.0 / width)) can round just below a
+    // whole number (e.g. 49 * (1.0 / 49)); truncating that yields the wrong
+    // row and a column index that runs past the end of `b`.
+    int w = i % width;
+    c[i] = a[i] + b[w];
+  }
+}
+
+// CUDA specialization of RowwiseAdd: adds `vector` to every row of `input`
+// and stores the result in `output` by launching RowwiseAddKernel on the
+// context's stream.
+//
+// `vector` must hold exactly input.numel() / input.dims()[0] elements and
+// `output` must already have the same dims as `input`; both conditions are
+// enforced below.
+template <typename T>
+struct RowwiseAdd<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& vector, framework::Tensor* output) {
+    auto in_dims = input.dims();
+    // Number of elements in one row, i.e. everything but the first dim.
+    auto size = input.numel() / in_dims[0];
+    PADDLE_ENFORCE_EQ(vector.numel(), size);
+    PADDLE_ENFORCE_EQ(output->dims(), in_dims);
+    // `blocks` is the number of threads per block and `grids` the number of
+    // blocks needed to give every element its own thread.
+    int blocks = 512;
+    int grids = (input.numel() + blocks - 1) / blocks;
+    // Pass the validated row length as the kernel's width so that it always
+    // matches the length of `vector`, instead of re-reading in_dims[1]
+    // (which only equals the row length for 2-D inputs).
+    RowwiseAddKernel<T><<<grids, blocks, 0, context.stream()>>>(
+        input.data<T>(), vector.data<T>(), output->data<T>(),
+        static_cast<int>(size), static_cast<int>(input.numel()));
+  }
+};
+
 template struct RowwiseAdd<platform::CUDADeviceContext, float>;
 template struct RowwiseAdd<platform::CUDADeviceContext, double>;
 template struct ColwiseSum<platform::CUDADeviceContext, float>;
diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h
index ddd798dace17012b7d9a949567a90d48067e6b15..de591626df28e2bc3391b609f909612411398247 100644
--- a/paddle/operators/math/math_function_impl.h
+++ b/paddle/operators/math/math_function_impl.h
@@ -45,25 +45,6 @@ void Transpose<DeviceContext, T, Rank>::operator()(
   eigen_out.device(*dev) = eigen_in.shuffle(permute);
 }
 
-template <typename DeviceContext, typename T>
-void RowwiseAdd<DeviceContext, T>::operator()(const DeviceContext& context,
-                                              const framework::Tensor& input,
-                                              const framework::Tensor& vector,
-                                              framework::Tensor* output) {
-  auto in_dims = input.dims();
-  auto size = input.numel() / in_dims[0];
-  PADDLE_ENFORCE_EQ(vector.numel(), size);
-  PADDLE_ENFORCE_EQ(output->dims(), in_dims);
-
-  auto in = framework::EigenMatrix<T>::From(input);
-  auto vec = framework::EigenMatrix<T>::From(vector);
-  auto out = framework::EigenMatrix<T>::From(*output);
-  Eigen::array<int, 2> shape({{1, static_cast<int>(size)}});
-  Eigen::array<int, 2> bcast({{static_cast<int>(in_dims[0]), 1}});
-  out.device(*context.eigen_device()) =
-      in + vec.reshape(shape).broadcast(bcast);
-}
-
 template <typename DeviceContext, typename T>
 void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
                                               const framework::Tensor& input,
diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/operators/math/selected_rows_functor.cc
index ab758d1e7fd8ab361948b28e8cb735b9a742a339..8a1ebb58c26578f076bf243adfbd51d10c682b99 100644
--- a/paddle/operators/math/selected_rows_functor.cc
+++ b/paddle/operators/math/selected_rows_functor.cc
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/selected_rows_functor.h"
+#include <set>
+
 #include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/selected_rows_functor.h"
 
 namespace paddle {
 namespace operators {
@@ -179,6 +181,118 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, double>;
 template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int>;
 template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
 
+// This is a separate namespace for manipulating SelectedRows-typed
+// data, e.g. merging duplicated rows or adding two SelectedRows.
+//
+// Another group of functors is called "scatter updates", which means
+// using a SelectedRows to update a dense tensor with different ops, such
+// as add or mul.
+namespace scatter {
+
+// Returns the index of the first element of `rows` equal to `value`, or
+// rows.size() when `value` does not occur in `rows`.
+size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
+  const auto first = rows.begin();
+  const auto hit = std::find(first, rows.end(), value);
+  return hit - first;
+}
+
+// CPU specialization of MergeAdd.
+//
+// Returns a new SelectedRows in which every distinct row id of `input`
+// appears exactly once (in ascending order, as produced by std::set) and
+// whose value row is the element-wise sum of all `input` value rows that
+// carry that id. The height is copied from `input`.
+template <typename T>
+struct MergeAdd<platform::CPUDeviceContext, T> {
+  framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
+                                     const framework::SelectedRows& input) {
+    framework::SelectedRows out;
+    auto input_rows = input.rows();
+    // De-duplicate (and sort) the row ids.
+    std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
+    std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
+
+    // Reads dims()[1] as the row width -- assumes the value tensor is 2-D;
+    // TODO confirm against callers.
+    auto input_width = input.value().dims()[1];
+    out.set_rows(merge_rows);
+    out.set_height(input.height());
+    // Allocate a {number of distinct rows, input_width} output value.
+    out.mutable_value()->mutable_data<T>(
+        framework::make_ddim(
+            {static_cast<int64_t>(merge_rows.size()), input_width}),
+        context.GetPlace());
+
+    // Zero the output so the rows can be accumulated with += below.
+    math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
+    constant_functor(context, out.mutable_value(), 0.0);
+
+    auto* out_data = out.mutable_value()->data<T>();
+    auto* input_data = input.value().data<T>();
+
+    // Add each input row into the output row that holds its id.
+    // NOTE(review): FindPos scans merge_rows linearly for every input row;
+    // merge_rows is sorted, so a binary search would also work here.
+    for (size_t i = 0; i < input_rows.size(); i++) {
+      size_t out_i = FindPos(merge_rows, input_rows[i]);
+      for (int64_t j = 0; j < input_width; j++) {
+        out_data[out_i * input_width + j] += input_data[i * input_width + j];
+      }
+    }
+    return out;
+  }
+};
+
+template struct MergeAdd<platform::CPUDeviceContext, float>;
+template struct MergeAdd<platform::CPUDeviceContext, double>;
+template struct MergeAdd<platform::CPUDeviceContext, int>;
+template struct MergeAdd<platform::CPUDeviceContext, int64_t>;
+
+// CPU specialization of UpdateToTensor.
+//
+// Applies `op` element-wise between the rows stored in the SelectedRows
+// `input1` and the rows of the dense tensor `input2` that they address,
+// writing the result back into `input2`. Row i of input1's value is
+// combined with row input1.rows()[i] of `input2`. Rows of `input2` that are
+// not listed in input1.rows() are left untouched.
+//
+// input1.height() must equal input2->dims()[0], and both must have the same
+// number of elements per row; both conditions are enforced below.
+// Entries of input1.rows() are processed in order without merging, so a row
+// id that occurs more than once is updated more than once.
+template <typename T>
+struct UpdateToTensor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
+                  const ScatterOps& op, const framework::SelectedRows& input1,
+                  framework::Tensor* input2) {
+    auto in1_height = input1.height();
+    auto in2_dims = input2->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+
+    auto& in1_value = input1.value();
+    auto& in1_rows = input1.rows();
+
+    // Elements per row of the sparse value; must match the dense tensor.
+    // NOTE(review): divides by in1_rows.size() -- presumably callers never
+    // pass an empty SelectedRows; confirm.
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
+
+    auto* in1_data = in1_value.data<T>();
+    auto* input2_data = input2->data<T>();
+
+    // INLINE_FOR2(n, m) opens two nested loops over i in [0, n) and
+    // j in [0, m); the statement after it is the inner loop body. Here i
+    // walks the sparse rows and j the elements within a row.
+    // An `op` value without a matching case leaves `input2` unchanged.
+    // FIXME(typhoonzero): use macro fix the below messy code.
+    switch (op) {
+      case ScatterOps::ASSIGN:
+        // tensor = selected_rows
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] =
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::ADD:
+        // tensor += selected_rows
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] +=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::SUB:
+        // tensor -= selected_rows
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] -=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::SUBBY:
+        // tensor = selected_rows - tensor
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] =
+            in1_data[i * in1_row_numel + j] -
+            input2_data[in1_rows[i] * in1_row_numel + j];
+        break;
+      case ScatterOps::MUL:
+        // tensor *= selected_rows
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] *=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::DIV:
+        // tensor /= selected_rows
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] /=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::DIVBY:
+        // tensor = selected_rows / tensor
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] =
+            in1_data[i * in1_row_numel + j] /
+            input2_data[in1_rows[i] * in1_row_numel + j];
+        break;
+    }
+  }
+};
+
+}  // namespace scatter
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu
index 9fddd97a36f7fdb6628d6eeb192cb216fdae3e5b..0ee456f9bc61436bd0f2f8ef20dd1654e7e56d56 100644
--- a/paddle/operators/math/selected_rows_functor.cu
+++ b/paddle/operators/math/selected_rows_functor.cu
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <set>
+
 #include "paddle/operators/math/math_function.h"
 #include "paddle/operators/math/selected_rows_functor.h"
 #include "paddle/platform/cuda_helper.h"
@@ -222,6 +224,157 @@ template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
+
+namespace scatter {
+
+// Accumulates one input row into the output row that carries the same id.
+//
+// blockIdx.y selects the input row handled by the block: its id is
+// input_rows[blockIdx.y] and its data starts at
+// input + blockIdx.y * row_numel. The threads of the block then add that
+// row into the matching row of `out`, each thread handling every
+// block_size-th element.
+//
+// NOTE(review): the element loop strides by the template parameter
+// block_size, so it assumes the launch uses blockDim.x == block_size.
+template <typename T, int block_size>
+__global__ void MergeAddKernel(const T* input, const int64_t* input_rows,
+                               T* out, const int64_t* out_rows,
+                               size_t out_rows_size, int64_t row_numel) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+  // Index of the output row for this block, shared by all of its threads.
+  __shared__ size_t out_idx;
+
+  // Thread 0 looks up the position of this row's id in out_rows. The scan
+  // does not stop at the first hit, so the last match wins; if nothing
+  // matches, out_idx is never assigned.
+  if (tid == 0) {
+    for (size_t i = 0; i < out_rows_size; i++) {
+      if (input_rows[ty] == out_rows[i]) {
+        out_idx = i;
+      }
+    }
+  }
+
+  // Make out_idx visible to every thread of the block before it is used.
+  __syncthreads();
+
+  input += ty * row_numel;
+  out += out_idx * row_numel;
+  // Atomic add: blocks whose input rows share an id write to the same
+  // output row.
+  for (int index = tid; index < row_numel; index += block_size) {
+    paddle::platform::CudaAtomicAdd(out + index, input[index]);
+  }
+}
+
+// CUDA specialization of MergeAdd.
+//
+// Returns a new SelectedRows in which every distinct row id of `input`
+// appears exactly once (in ascending order, as produced by std::set) and
+// whose value row is the element-wise sum of all `input` value rows that
+// carry that id. The height is copied from `input`. The accumulation is
+// done by MergeAddKernel on the context's stream.
+template <typename T>
+struct MergeAdd<platform::CUDADeviceContext, T> {
+  framework::SelectedRows operator()(const platform::CUDADeviceContext& context,
+                                     const framework::SelectedRows& input) {
+    framework::SelectedRows out;
+    auto input_rows = input.rows();
+    // De-duplicate (and sort) the row ids.
+    std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
+    std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
+
+    // Reads dims()[1] as the row width -- assumes the value tensor is 2-D;
+    // TODO confirm against callers.
+    auto input_width = input.value().dims()[1];
+
+    out.set_rows(merge_rows);
+    out.set_height(input.height());
+    // Allocate a {number of distinct rows, input_width} output value.
+    out.mutable_value()->mutable_data<T>(
+        framework::make_ddim(
+            {static_cast<int64_t>(merge_rows.size()), input_width}),
+        context.GetPlace());
+
+    // Zero the output so the kernel can accumulate into it.
+    math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
+    constant_functor(context, out.mutable_value(), 0.0);
+
+    auto* out_data = out.mutable_value()->data<T>();
+    auto* input_data = input.value().data<T>();
+
+    // One block of block_size threads per input row (indexed by blockIdx.y).
+    // The same constant is used for the launch and for the kernel's
+    // template argument so the kernel's stride always matches blockDim.x.
+    const int block_size = 256;
+    dim3 threads(block_size, 1);
+    dim3 grid1(1, input_rows.size());
+
+    // NOTE(review): rows().data() is handed to the kernel as-is; this
+    // presumes the row-id storage is readable from the device -- confirm.
+    MergeAddKernel<T, block_size><<<grid1, threads, 0, context.stream()>>>(
+        input_data, input.rows().data(), out_data, out.rows().data(),
+        out.rows().size(), input_width);
+    return out;
+  }
+};
+
+template struct MergeAdd<platform::CUDADeviceContext, float>;
+template struct MergeAdd<platform::CUDADeviceContext, double>;
+template struct MergeAdd<platform::CUDADeviceContext, int>;
+template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
+
+// Applies `op` between one SelectedRows value row and the dense tensor row
+// it addresses, writing the result into the dense tensor.
+//
+// blockIdx.y selects the sparse row: its data starts at
+// selected_rows + blockIdx.y * row_numel and it updates the dense row that
+// starts at tensor_out + rows[blockIdx.y] * row_numel. Each thread handles
+// every block_size-th element of the row.
+//
+// `op` is taken by value: arguments of a __global__ function are copied to
+// the device at launch, whereas a reference parameter would make the
+// device dereference an address of the host caller's variable.
+//
+// NOTE(review): the updates are plain (non-atomic) read-modify-writes and
+// the element loop strides by block_size, so this assumes the ids in `rows`
+// are unique and that the launch uses blockDim.x == block_size.
+template <typename T, int block_size>
+__global__ void UpdateToTensorKernel(const T* selected_rows,
+                                     const int64_t* rows, ScatterOps op,
+                                     T* tensor_out, int64_t row_numel) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+
+  selected_rows += ty * row_numel;
+  tensor_out += rows[ty] * row_numel;
+  // An `op` value without a matching case leaves tensor_out unchanged.
+  // FIXME(typhoonzero): use macro fix the below messy code.
+  switch (op) {
+    case ScatterOps::ASSIGN:
+      // tensor = selected_rows
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] = selected_rows[index];
+      }
+      break;
+    case ScatterOps::ADD:
+      // tensor += selected_rows
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] += selected_rows[index];
+      }
+      break;
+    case ScatterOps::SUB:
+      // tensor -= selected_rows
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] -= selected_rows[index];
+      }
+      break;
+    case ScatterOps::SUBBY:
+      // tensor = selected_rows - tensor
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] = selected_rows[index] - tensor_out[index];
+      }
+      break;
+    case ScatterOps::MUL:
+      // tensor *= selected_rows
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] *= selected_rows[index];
+      }
+      break;
+    case ScatterOps::DIV:
+      // tensor /= selected_rows
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] /= selected_rows[index];
+      }
+      break;
+    case ScatterOps::DIVBY:
+      // tensor = selected_rows / tensor
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] = selected_rows[index] / tensor_out[index];
+      }
+      break;
+  }
+}
+
+// CUDA specialization of UpdateToTensor.
+//
+// First merges duplicated rows of `input1` with MergeAdd, then launches
+// UpdateToTensorKernel on the context's stream to apply `op` between each
+// merged row and the row of `input2` it addresses, writing the result into
+// `input2`.
+//
+// input1.height() must equal input2->dims()[0], and both must have the same
+// number of elements per row; both conditions are enforced below.
+template <typename T>
+struct UpdateToTensor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const ScatterOps& op, const framework::SelectedRows& input1,
+                  framework::Tensor* input2) {
+    // NOTE: Use SelectedRowsAddToTensor for better performance
+    //       no additional MergeAdd called.
+    MergeAdd<platform::CUDADeviceContext, T> merge_func;
+    auto merged_in1 = merge_func(context, input1);
+
+    auto in1_height = merged_in1.height();
+    auto in2_dims = input2->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+
+    auto& in1_value = merged_in1.value();
+    auto& in1_rows = merged_in1.rows();
+
+    // Elements per row of the merged value; must match the dense tensor.
+    // NOTE(review): divides by in1_rows.size() -- presumably callers never
+    // pass an empty SelectedRows; confirm.
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
+
+    auto* in1_data = in1_value.template data<T>();
+    auto* in2_data = input2->data<T>();
+
+    // One block of PADDLE_CUDA_NUM_THREADS threads per merged row (indexed
+    // by blockIdx.y); the same constant is the kernel's block_size.
+    // NOTE(review): in1_rows.data() is handed to the kernel as-is; this
+    // presumes the row-id storage is readable from the device -- confirm.
+    dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1);
+    dim3 grid(1, in1_rows.size());
+    UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS><<<
+        grid, threads, 0, context.stream()>>>(in1_data, in1_rows.data(), op,
+                                              in2_data, in1_row_numel);
+  }
+};
+}  // namespace scatter
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/selected_rows_functor.h b/paddle/operators/math/selected_rows_functor.h
index 1149075abf16547a120ac8928c45b4972409fc72..09d4631905f90f78772368ad71b11826877bdc34 100644
--- a/paddle/operators/math/selected_rows_functor.h
+++ b/paddle/operators/math/selected_rows_functor.h
@@ -12,9 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include "paddle/framework/eigen.h"
 #include "paddle/framework/selected_rows.h"
 #include "paddle/platform/device_context.h"
 
+// Expands to the headers of two nested loops: `i` runs over [0, sizei) and
+// `j` over [0, sizej), both declared as int64_t. The statement that follows
+// the macro invocation becomes the body of the inner loop.
+//
+// The bounds are wrapped in static_cast<int64_t>(...) so that expression
+// arguments expand safely and unsigned bounds (e.g. a container's size())
+// are not compared against the signed loop index. Each bound expression is
+// re-evaluated on every iteration of its loop.
+#define INLINE_FOR2(sizei, sizej)                            \
+  for (int64_t i = 0; i < static_cast<int64_t>(sizei); i++) \
+    for (int64_t j = 0; j < static_cast<int64_t>(sizej); j++)
+
 namespace paddle {
 namespace operators {
 namespace math {
@@ -52,6 +57,78 @@ struct SelectedRowsAddToTensor {
                   framework::Tensor* input2);
 };
 
+namespace scatter {
+// Functors for manipulating SelectedRows data.
+
+// Unary functor: returns a SelectedRows built from `input` in which
+// duplicated rows have been merged by adding them together.
+// Only the declaration lives here; operator() is defined per DeviceContext
+// in the corresponding source files.
+template <typename DeviceContext, typename T>
+struct MergeAdd {
+  // unary functor, merge by adding duplicated rows in
+  // the input SelectedRows object.
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input);
+};
+
+// Binary functor: element-wise sum of the values of two SelectedRows.
+//
+// The result takes its rows, height and value dims from `input1`; its value
+// is flatten(input1.value()) + flatten(input2.value()), evaluated on the
+// context's Eigen device.
+// NOTE(review): input2's rows and height are never inspected -- presumably
+// callers guarantee both inputs have the same rows and value shape;
+// confirm.
+template <typename DeviceContext, typename T>
+struct Add {
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input1,
+                                     const framework::SelectedRows& input2) {
+    framework::SelectedRows out;
+    out.set_rows(input1.rows());
+    out.set_height(input1.height());
+    // Allocate the output value with the same dims as input1's value.
+    out.mutable_value()->mutable_data<T>(input1.value().dims(),
+                                         context.GetPlace());
+    // Treat all three values as flat 1-D vectors for the element-wise op.
+    auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
+    auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
+    auto e_in2 = framework::EigenVector<T>::Flatten(input2.value());
+    e_out.device(*context.eigen_device()) = e_in1 + e_in2;
+    return out;
+  }
+};
+
+// Functor multiplying a SelectedRows either by another SelectedRows
+// (element-wise) or by a scalar.
+//
+// In both overloads the result takes its rows, height and value dims from
+// `input1`, and the product is evaluated on the context's Eigen device.
+template <typename DeviceContext, typename T>
+struct Mul {
+  // multiply two SelectedRows
+  // The value is flatten(input1.value()) * flatten(input2.value()).
+  // NOTE(review): input2's rows and height are never inspected --
+  // presumably callers guarantee both inputs have the same rows and value
+  // shape; confirm.
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input1,
+                                     const framework::SelectedRows& input2) {
+    framework::SelectedRows out;
+    out.set_rows(input1.rows());
+    out.set_height(input1.height());
+    // Allocate the output value with the same dims as input1's value.
+    out.mutable_value()->mutable_data<T>(input1.value().dims(),
+                                         context.GetPlace());
+    // Treat all three values as flat 1-D vectors for the element-wise op.
+    auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
+    auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
+    auto e_in2 = framework::EigenVector<T>::Flatten(input2.value());
+    e_out.device(*context.eigen_device()) = e_in1 * e_in2;
+    return out;
+  }
+  // multiply scalar to SelectedRows
+  // The value is input2 * flatten(input1.value()).
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input1,
+                                     const T input2) {
+    framework::SelectedRows out;
+    out.set_rows(input1.rows());
+    out.set_height(input1.height());
+    // Allocate the output value with the same dims as input1's value.
+    out.mutable_value()->mutable_data<T>(input1.value().dims(),
+                                         context.GetPlace());
+    auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
+    auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
+    e_out.device(*context.eigen_device()) = input2 * e_in1;
+    return out;
+  }
+};
+
+// Operation applied by UpdateToTensor. With `t` the addressed element of
+// the dense tensor and `s` the matching element of the SelectedRows value:
+//   ASSIGN: t = s      ADD: t += s     SUB: t -= s     SUBBY: t = s - t
+//   MUL:    t *= s     DIV: t /= s     DIVBY: t = s / t
+enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY };
+
+// Scatter update: applies `op` between the rows of the SelectedRows
+// `input1` and the rows of the dense tensor `input2` that they address, and
+// stores the result in `input2` (e.g. for DIVBY,
+// tensor = selected_rows_in / tensor).
+// Only the declaration lives here; operator() is defined per DeviceContext
+// in the corresponding source files.
+template <typename DeviceContext, typename T>
+struct UpdateToTensor {
+  void operator()(const DeviceContext& context, const ScatterOps& op,
+                  const framework::SelectedRows& input1,
+                  framework::Tensor* input2);
+};
+
+}  // namespace scatter
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc
index 2287f347910e83c25d2155b80670f9d991c1e5b2..3f999e404f8afe6bded09c820509fa0f36d30bf6 100644
--- a/paddle/operators/merge_lod_tensor_op.cc
+++ b/paddle/operators/merge_lod_tensor_op.cc
@@ -30,8 +30,8 @@ class MergeLoDTensorOp : public framework::OperatorBase {
   void Run(const framework::Scope &scope,
            const platform::Place &dev_place) const override {
     // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(dev_place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
 
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc
index 34a6e1a58d4bb8b267f3b6b3a0ac689c17036c37..6546096069d4c3fbc4908a16c2dba2ac6d7e6421 100644
--- a/paddle/operators/nccl_op_test.cu.cc
+++ b/paddle/operators/nccl_op_test.cu.cc
@@ -305,7 +305,7 @@ int main(int argc, char **argv) {
   }
 
   VLOG(0) << " DeviceCount " << count;
-  paddle::platform::DeviceContextPool::Create(places);
+  paddle::platform::DeviceContextPool::Init(places);
 
   testing::InitGoogleTest(&argc, argv);
 
diff --git a/paddle/operators/norm_op.cc b/paddle/operators/norm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b198b76cd49ca7c05b047d42df149d2b1e461b8e
--- /dev/null
+++ b/paddle/operators/norm_op.cc
@@ -0,0 +1,95 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/norm_op.h"
+namespace paddle {
+namespace operators {
+
+// Declares the proto of the `norm` operator: the inputs X and Scale, the
+// `epsilon` attribute (stored as AttrType, default 1e-10) and the output
+// Out, together with the user-facing documentation string.
+template <typename AttrType>
+class NormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  NormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of norm operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddInput("Scale",
+             "(Tensor) The input tensor of norm operator. "
+             "The format of input tensor is C * 1.");
+    AddAttr<AttrType>("epsilon",
+                      "(float, default 1e-10) Constant "
+                      "for numerical stability.")
+        .SetDefault(1.0e-10f);
+    // Adjacent string literals are concatenated verbatim, so every piece
+    // but the last ends with a space to keep the sentences separated.
+    AddOutput("Out",
+              "(Tensor) The output tensor of norm operator. "
+              "N * M. "
+              "M = C * H * W");
+    AddComment(R"DOC(
+        Input shape: $(N, C, H, W)$
+        Scale shape: $(C, 1)$
+        Output shape: $(N, C, H, W)$
+        Where
+        forward
+          $$
+            [\frac {x_{1}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{2}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{3}}{\sqrt{\sum{x_{i}^{2}}}} \cdot  \cdot  \cdot \frac {x_{n}}{\sqrt{\sum{x_{i}^{2}}}}]
+          $$
+        backward
+          $$
+            \frac{\frac{\mathrm{d}L }{\mathrm{d}y_{1}} - \frac {x_{1}\sum {\frac{\mathrm{d} L}{\mathrm{d} y_{j}}}x_{j}}{\sum x_{j}^{2}} }{\sqrt{\sum{x_{j}^{2}}}}
+          $$
+        )DOC");
+  }
+};
+
+// Forward `norm` operator. Its shape inference requires the inputs X and
+// Scale and the output Out to be present, and gives Out the same dims as X.
+class NormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    // The message pieces are concatenated verbatim, so the first piece ends
+    // with a space to avoid "NormOpshould".
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of NormOp "
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Scale"),
+                   "Input(Scale) of NormOp "
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of NormOp should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim("Out", in_x_dims);
+  }
+};
+
+// Gradient operator of `norm`. Its shape inference requires the input X
+// and the output X@GRAD to be present, and gives X@GRAD the same dims as X.
+class NormOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    // X@GRAD is an output of this operator, so report it as one.
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Register the `norm` operator with its proto maker (the epsilon attribute
+// is typed as float) and `norm_grad` as its gradient operator.
+REGISTER_OP(norm, ops::NormOp, ops::NormOpMaker<float>, norm_grad,
+            ops::NormOpGrad);
+// CPU kernels of the forward operator.
+// NOTE(review): NormKernel is declared in norm_op.h; presumably the second
+// template argument is the element type and the optional third one the
+// attribute type -- confirm there.
+REGISTER_OP_CPU_KERNEL(
+    norm, ops::NormKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::NormKernel<paddle::platform::CPUDeviceContext, double, float>);
+// CPU kernels of the gradient operator, instantiated the same way.
+REGISTER_OP_CPU_KERNEL(
+    norm_grad, ops::NormGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::NormGradKernel<paddle::platform::CPUDeviceContext, double, float>);
diff --git a/paddle/operators/norm_op.cu b/paddle/operators/norm_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2941c89b93141388d3746cf128848960ed6f3625
--- /dev/null
+++ b/paddle/operators/norm_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/norm_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    norm, ops::NormKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::NormKernel<paddle::platform::CUDADeviceContext, double, float>);
+REGISTER_OP_CUDA_KERNEL(
+    norm_grad, ops::NormGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::NormGradKernel<paddle::platform::CUDADeviceContext, double, float>);
diff --git a/paddle/operators/norm_op.h b/paddle/operators/norm_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..7bee48919e9fdf85595bbc7ad540ca45a1dbfe5c
--- /dev/null
+++ b/paddle/operators/norm_op.h
@@ -0,0 +1,175 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T, typename AttrType = T>
+class NormKernel : public framework::OpKernel<T> {
+ public:  // intent: Out = Scale * X / sqrt(sum_c(X^2) + epsilon)
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* scale = context.Input<framework::Tensor>("Scale");
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto epsilon = static_cast<T>(context.Attr<AttrType>("epsilon"));
+    out->mutable_data<T>(context.GetPlace());
+    int batch_size = in_x->dims()[0];  // assumes X is 4-D (N, C, H, W)
+    int channels = in_x->dims()[1];
+    int height = in_x->dims()[2];
+    int width = in_x->dims()[3];
+    int fea_len = height * width;  // elements per channel
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
+    auto x =
+        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            *in_x, framework::make_ddim({batch_size, fea_len * channels}));
+    // Scratch tensor that holds the element-wise square of X.
+    framework::Tensor x_square;
+    x_square.mutable_data<T>(in_x->dims(), context.GetPlace());
+    auto x_square_eigen =
+        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            x_square, framework::make_ddim({batch_size, fea_len * channels}));
+    x_square_eigen.device(*place) = x.square();
+    auto scale_eigen =
+        framework::EigenVector<T, Eigen::RowMajor, Eigen::DenseIndex>::Flatten(
+            *scale);
+    for (int n = 0; n < batch_size; ++n) {  // one sample at a time
+      framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
+      auto in_x_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              in_x_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
+      auto x_square_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              x_square_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor out_batch = out->Slice(n, n + 1);
+      auto out_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              out_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor tmp_tensor;
+      tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
+                                 context.GetPlace());
+      auto tmp = framework::EigenVector<T, Eigen::RowMajor,
+                                        Eigen::DenseIndex>::Flatten(tmp_tensor);
+      // tmp = 1 / sqrt(sum over channels of x^2 + epsilon), one per position.
+      auto dim = Eigen::array<int, 1>({{0}});
+      tmp.device(*place) = x_square_batch_eigen.sum(dim);
+      tmp.device(*place) = (tmp + epsilon).sqrt().inverse();
+      Eigen::array<int, 2> broadcast_dim_col;
+      broadcast_dim_col[1] = 1;
+      broadcast_dim_col[0] = channels;
+      out_batch_eigen.device(*place) =
+          in_x_batch_eigen * (tmp.broadcast(broadcast_dim_col));  // normalize
+      Eigen::array<int, 2> broadcast_dim_row;
+      broadcast_dim_row[1] = fea_len;
+      broadcast_dim_row[0] = 1;
+      out_batch_eigen.device(*place) =
+          out_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row));  // scale
+    }
+  }
+};
+template <typename DeviceContext, typename T, typename AttrType = T>
+class NormGradKernel : public framework::OpKernel<T> {
+ public:  // writes X@GRAD from X, Scale and Out@GRAD
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* scale = context.Input<framework::Tensor>("Scale");
+    const framework::Tensor* out_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto epsilon = static_cast<T>(context.Attr<AttrType>("epsilon"));
+    framework::Tensor* in_x_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    in_x_grad->mutable_data<T>(context.GetPlace());
+    int batch_size = in_x->dims()[0];  // assumes X is 4-D (N, C, H, W)
+    int channels = in_x->dims()[1];
+    int height = in_x->dims()[2];
+    int width = in_x->dims()[3];
+    int fea_len = height * width;  // elements per channel
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
+
+    auto scale_eigen =
+        framework::EigenVector<T, Eigen::RowMajor, Eigen::DenseIndex>::Flatten(
+            *scale);
+    auto x =
+        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            *in_x, framework::make_ddim({batch_size, fea_len * channels}));
+    // Scratch tensor that holds the element-wise square of X.
+    framework::Tensor x_square;
+    x_square.mutable_data<T>(in_x->dims(), context.GetPlace());
+    auto x_square_eigen =
+        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            x_square, framework::make_ddim({batch_size, fea_len * channels}));
+    x_square_eigen.device(*place) = x.square();
+
+    for (int n = 0; n < batch_size; ++n) {  // one sample at a time
+      framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
+      auto in_x_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              in_x_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor in_g_batch = in_x_grad->Slice(n, n + 1);
+      auto in_g_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              in_g_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
+      auto x_square_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              x_square_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor outg_batch = out_grad->Slice(n, n + 1);
+      auto outg_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              outg_batch, framework::make_ddim({channels, fea_len}));
+
+      framework::Tensor tmp_tensor;
+      tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
+                                 context.GetPlace());
+      auto tmp_eigen =
+          framework::EigenVector<T, Eigen::RowMajor,
+                                 Eigen::DenseIndex>::Flatten(tmp_tensor);
+      auto dim = Eigen::array<int, 1>({{0}});  // reduce over the channel axis
+      tmp_eigen.device(*place) = (in_x_batch_eigen * outg_batch_eigen).sum(dim);
+      framework::Tensor norm_tmp_tensor;
+      norm_tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
+                                      context.GetPlace());
+      auto norm_tmp_eigen =
+          framework::EigenVector<T, Eigen::RowMajor,
+                                 Eigen::DenseIndex>::Flatten(norm_tmp_tensor);
+      norm_tmp_eigen.device(*place) =
+          (x_square_batch_eigen.sum(dim) + epsilon).sqrt();  // the norm
+      Eigen::array<int, 2> broadcast_dim_col;
+      broadcast_dim_col[1] = 1;
+      broadcast_dim_col[0] = channels;
+      in_g_batch_eigen.device(*place) =
+          in_x_batch_eigen * tmp_eigen.broadcast(broadcast_dim_col);
+      in_g_batch_eigen.device(*place) =
+          in_g_batch_eigen /
+          (norm_tmp_eigen * norm_tmp_eigen).broadcast(broadcast_dim_col);
+      in_g_batch_eigen.device(*place) = outg_batch_eigen - in_g_batch_eigen;
+      // in_g now holds dOut - x * sum_c(x * dOut) / norm^2; divide by norm next.
+      in_g_batch_eigen.device(*place) =
+          in_g_batch_eigen / norm_tmp_eigen.broadcast(broadcast_dim_col);
+      Eigen::array<int, 2> broadcast_dim_row;
+      broadcast_dim_row[1] = fea_len;
+      broadcast_dim_row[0] = 1;
+      in_g_batch_eigen.device(*place) =
+          in_g_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row));  // scale
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index 71769e67c7032f2d808d1394883cbe93f826b2f6..056fa46949cd623845956521b068109085a8795e 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -272,8 +272,9 @@ class RecurrentOp : public RecurrentBase {
                    false /*create_local_scope*/);
 
       // get device context from pool
-      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-      auto &dev_ctx = *pool.Borrow(place);
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);
 
       // Copy inside::output -> outside::output
       //    outside::output[seq_offset: seq_offset + 1] = inside::output
@@ -326,8 +327,8 @@ class RecurrentGradOp : public RecurrentBase {
     auto *program = block->Program();
 
     // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
 
     for (size_t step_id = 0; step_id < seq_len; ++step_id) {
       size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
diff --git a/paddle/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/operators/reorder_lod_tensor_by_rank_op.cc
index 1063388e2539d47ca6ab56cd5fb0b946b6fb9147..8d652ff806461cea3d0e8d3bd70704b4b6bc2173 100644
--- a/paddle/operators/reorder_lod_tensor_by_rank_op.cc
+++ b/paddle/operators/reorder_lod_tensor_by_rank_op.cc
@@ -131,8 +131,8 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
     auto x_sliced = x.Slice(x_offset, x_offset + len);
     auto out_sliced = out->Slice(out_offset, out_offset + len);
 
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
     framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced);
     out_offset += len;
     return out_offset;
diff --git a/paddle/operators/save_op.cc b/paddle/operators/save_op.cc
index d045a8b5b8d852278fd5140b9cf8707462b93c93..4b1cbe88836e340c94f797806243a6768410ed3d 100644
--- a/paddle/operators/save_op.cc
+++ b/paddle/operators/save_op.cc
@@ -91,8 +91,8 @@ class SaveOp : public framework::OperatorBase {
     auto &tensor = var->Get<framework::LoDTensor>();
 
     // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
 
     framework::SerializeToStream(fout, tensor, dev_ctx);
   }
diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc
index 6e829386832a875769669b02e6f41755618d348a..95c207221a7b34732eca4cfd07fed0a8f1671981 100644
--- a/paddle/operators/send_op.cc
+++ b/paddle/operators/send_op.cc
@@ -79,7 +79,7 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Recv operator
 
-This operator will recv tensor from send_op
+This operator will send tensor to recv_op.
 )DOC");
     AddAttr<std::vector<std::string>>("endpoints",
                                       "(string vector, default 127.0.0.1:6164)"
diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc
index e8a4773547861478b1771562ecd9e8b0bb3c3739..b37269b471b4d71b42c41641fd14c7a64d2719d6 100644
--- a/paddle/operators/shrink_rnn_memory_op.cc
+++ b/paddle/operators/shrink_rnn_memory_op.cc
@@ -106,8 +106,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
     dx_tensor.mutable_data(x_tensor.place(), x_tensor.type());
 
     // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
 
     if (dout_var == nullptr) {  // dx_tensor fill zero
       math::set_constant(dev_ctx, &dx_tensor, 0.0f);
@@ -116,9 +116,9 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
       auto height = dout_tensor.dims()[0];
       auto slice = dx_tensor.Slice(0, static_cast<int>(height));
       framework::CopyFrom(dout_tensor, dout_tensor.place(), dev_ctx, &slice);
-      if (dx_tensor.dims()[0] < height) {
+      if (dx_tensor.dims()[0] > height) {
         auto rest_tensor = dx_tensor.Slice(
-            static_cast<int>(height), static_cast<int>(dout_tensor.dims()[0]));
+            static_cast<int>(height), static_cast<int>(dx_tensor.dims()[0]));
         math::set_constant(dev_ctx, &rest_tensor, 0.0f);
       }
     }
diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc
index 89826ca6ee98d579f8b7c8795b6dc33cfa158ee1..2d8787d740c70f1d4696fdec381b572ecf031f57 100644
--- a/paddle/operators/split_lod_tensor_op.cc
+++ b/paddle/operators/split_lod_tensor_op.cc
@@ -45,8 +45,8 @@ class SplitLoDTensorOp : public framework::OperatorBase {
     auto &x_lod = x.lod();
     auto &mask_dim = mask.dims();
 
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-    auto &dev_ctx = *pool.Borrow(dev_place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
 
     std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
     if (platform::is_cpu_place(mask.place())) {
diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h
index eaa36aa1aea53e0b37ef6c578d8bb1cda230ded0..552b48f608b7e0248f03dbea940a83f112a67712 100644
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
@@ -37,11 +37,11 @@ class SumKernel : public framework::OpKernel<T> {
     bool in_place = out_var == in_vars[0];
 
     if (out_var->IsType<framework::LoDTensor>()) {
-      auto *out = context.Output<Tensor>("Out");
-      out->mutable_data<T>(context.GetPlace());
-
+      auto *out = context.Output<LoDTensor>("Out");
+      if (!in_place) {
+        out->mutable_data<T>(context.GetPlace());
+      }
       auto result = EigenVector<T>::Flatten(*out);
-
       if (!in_place) {
         math::SetConstant<DeviceContext, T> constant_functor;
         constant_functor(context.template device_context<DeviceContext>(), out,
diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc
index 9529aab573a73f55f7a67d39cde99aca023f473e..d5ff3e3fce29b1a888b2cd4d307c2655669e3e4c 100644
--- a/paddle/operators/tensor_array_read_write_op.cc
+++ b/paddle/operators/tensor_array_read_write_op.cc
@@ -40,8 +40,9 @@ class WriteToArrayOp : public ArrayOp {
     if (x_tensor.memory_size() > 0) {
       auto *out_tensor = &out->at(offset);
 
-      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-      auto &dev_ctx = *pool.Borrow(place);
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);
 
       CopyFrom(x_tensor, place, dev_ctx, out_tensor);
       out_tensor->set_lod(x_tensor.lod());
@@ -129,11 +130,12 @@ class ReadFromArrayOp : public ArrayOp {
     auto &x_array = x->Get<framework::LoDTensorArray>();
     auto *out = scope.FindVar(Output("Out"));
     PADDLE_ENFORCE(out != nullptr, "Out must be set");
-    auto *out_tensor = out->GetMutable<framework::LoDTensor>();
     size_t offset = GetOffset(scope, place);
     if (offset < x_array.size()) {
-      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-      auto &dev_ctx = *pool.Borrow(place);
+      auto *out_tensor = out->GetMutable<framework::LoDTensor>();
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);
       framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor);
       out_tensor->set_lod(x_array[offset].lod());
     } else {
diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc
index 728ef6079465d57f54dab383aac5e2bb750fe113..65d827e0e0c5cfc3897c1fd0b971b766201cc1e2 100644
--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
@@ -25,12 +25,12 @@ namespace operators {
 using StepScopeVar = std::vector<framework::Scope *>;
 using LoDTensor = framework::LoDTensor;
 
-constexpr char kStepBlock[] = "sub_block";
-constexpr char kCondition[] = "Condition";
-constexpr char kStepScopes[] = "StepScopes";
-constexpr char kParameters[] = "X";
-constexpr char kParamGrads[] = "X@GRAD";
-constexpr char kOutputs[] = "Out";
+static constexpr char kStepBlock[] = "sub_block";
+static constexpr char kCondition[] = "Condition";
+static constexpr char kStepScopes[] = "StepScopes";
+static constexpr char kX[] = "X";
+static constexpr char kXGRAD[] = "X@GRAD";
+static constexpr char kOutputs[] = "Out";
 
 class WhileOp : public framework::OperatorBase {
  public:
@@ -67,7 +67,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   WhileOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput(kParameters,
+    AddInput(kX,
              "A set of variables, which are required by operators inside the "
              "block of While Op.")
         .AsDuplicable();
@@ -158,8 +158,8 @@ class WhileGradOp : public framework::OperatorBase {
 
       executor.Run(*program, *cur_scope_iter, block->ID(), false);
 
-      auto &pg_names = Outputs(kParamGrads);
-      auto &p_names = Inputs(kParameters);
+      auto &pg_names = Outputs(kXGRAD);
+      auto &p_names = Inputs(kX);
       PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
       for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
         if (pg_names[param_id] == framework::kEmptyVarName) {
@@ -213,11 +213,11 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
   std::unique_ptr<framework::OpDesc> Apply() const override {
     auto *grad = new framework::OpDesc();
     grad->SetType("while_grad");
-    grad->SetInput(kParameters, Input(kParameters));
+    grad->SetInput(kX, Input(kX));
 
     // Not all of IGs will be generated by inner gradient operators of while op.
     // Ignore IGs that is not generated by the inside block.
-    auto igs = InputGrad(kParameters, /*do not drop empty gradient*/ false);
+    auto igs = InputGrad(kX, /*do not drop empty gradient*/ false);
     std::unordered_set<std::string> all_outs;
     for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) {
       for (auto &oname : grad_block_[0]->Op(i)->OutputArgumentNames()) {
@@ -231,7 +231,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
       }
     }
 
-    grad->SetOutput(framework::GradVarName(kParameters), igs);
+    grad->SetOutput(framework::GradVarName(kX), igs);
 
     grad->SetInput(kOutputs, Output(kOutputs));
 
@@ -240,7 +240,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
     std::unordered_set<std::string> block_ins;
     auto *fwd_block = this->grad_block_[0]->ParentBlock();
     {
-      for (auto &p : Input(kParameters)) {
+      for (auto &p : Input(kX)) {
         block_ins.insert(p);
       }
       for (auto &o : Output(kOutputs)) {
@@ -288,8 +288,8 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference {
  public:
   void operator()(const framework::OpDesc &op_desc,
                   framework::BlockDesc *block) const override {
-    auto p_names = op_desc.Input(kParameters);
-    auto pg_names = op_desc.Output(framework::GradVarName(kParameters));
+    auto p_names = op_desc.Input(kX);
+    auto pg_names = op_desc.Output(framework::GradVarName(kX));
 
     for (size_t i = 0; i < p_names.size(); ++i) {
       auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i]));
@@ -307,21 +307,21 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference {
 class WhileGradOpShapeInference : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext *ctx) const override {
-    ctx->HasInputs(kParameters);
-    ctx->HasOutputs(framework::GradVarName(kParameters));
+    ctx->HasInputs(kX);
+    ctx->HasOutputs(framework::GradVarName(kX));
     ctx->HasInputs(kOutputs);
     ctx->HasInputs(framework::GradVarName(kOutputs));
 
-    auto p_names = ctx->Inputs(kParameters);
-    auto pg_names = ctx->Outputs(kParamGrads);
-    auto var_types = ctx->GetInputsVarType(kParameters);
+    auto p_names = ctx->Inputs(kX);
+    auto pg_names = ctx->Outputs(kXGRAD);
+    auto var_types = ctx->GetInputsVarType(kX);
     std::vector<std::string> names_to_set;
     std::vector<framework::DDim> dims_to_set;
     for (size_t i = 0; i < p_names.size(); ++i) {
       if (pg_names[i] == framework::kEmptyVarName) {
         continue;
       }
-      auto dims = ctx->GetInputsElementDim(kParameters, i);
+      auto dims = ctx->GetInputsElementDim(kX, i);
       if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR) {
         names_to_set.push_back(pg_names[i]);
         dims_to_set.push_back(dims);
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index e450ef32a432a9dede05835341a166454a551889..ea07f2e002cb76d09a11f7a5305c2d45b780e7bd 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -17,7 +17,7 @@ namespace platform {
 
 DeviceContextPool* DeviceContextPool::pool = nullptr;
 
-const platform::DeviceContext* DeviceContextPool::Borrow(
+const platform::DeviceContext* DeviceContextPool::Get(
     const platform::Place& place) {
   auto it = device_contexts_.find(place);
   if (it == device_contexts_.end()) {
@@ -28,24 +28,6 @@ const platform::DeviceContext* DeviceContextPool::Borrow(
   return it->second;
 }
 
-std::vector<const platform::DeviceContext*> DeviceContextPool::Borrow(
-    const std::vector<platform::Place>& places) {
-  PADDLE_ENFORCE_GT(places.size(), 0);
-  PADDLE_ENFORCE_LE(places.size(), device_contexts_.size());
-  std::vector<const platform::DeviceContext*> borrowed_contexts;
-  for (auto& place : places) {
-    auto it = device_contexts_.find(place);
-    if (it != device_contexts_.end()) {
-      borrowed_contexts.emplace_back(it->second);
-    } else {
-      PADDLE_THROW(
-          "'Place' is not supported, Please re-compile with WITH_GPU "
-          "option");
-    }
-  }
-  return borrowed_contexts;
-}
-
 DeviceContextPool::DeviceContextPool(
     const std::vector<platform::Place>& places) {
   PADDLE_ENFORCE_GT(places.size(), 0);
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index 063f281f6758bd5bb70d73d1e97d2145fb2aac86..2b366e6383d23e2d31a194edd04412892a8311eb 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -52,6 +52,14 @@ class CPUDeviceContext : public DeviceContext {
   std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
 };
 
+template <typename Place>
+struct DefaultDeviceContextType;
+
+template <>
+struct DefaultDeviceContextType<platform::CPUPlace> {
+  using TYPE = CPUDeviceContext;
+};
+
 #ifdef PADDLE_WITH_CUDA
 
 class EigenCudaStreamDevice;
@@ -90,6 +98,11 @@ class CUDADeviceContext : public DeviceContext {
   cublasHandle_t cublas_handle_;
 };
 
+template <>
+struct DefaultDeviceContextType<platform::CUDAPlace> {
+  using TYPE = CUDADeviceContext;
+};
+
 class CUDNNDeviceContext : public CUDADeviceContext {
  public:
   explicit CUDNNDeviceContext(CUDAPlace place);
@@ -102,18 +115,6 @@ class CUDNNDeviceContext : public CUDADeviceContext {
   cudnnHandle_t cudnn_handle_;
 };
 
-class DeviceGuard {
- public:
-  explicit DeviceGuard(int device) {
-    original_device_ = platform::GetCurrentDeviceId();
-    platform::SetDeviceId(device);
-  }
-  ~DeviceGuard() { platform::SetDeviceId(original_device_); }
-
- private:
-  int original_device_;
-};
-
 #endif
 
 /*! \brief device context pool singleton */
@@ -121,13 +122,13 @@ class DeviceContextPool {
  public:
   explicit DeviceContextPool(const std::vector<platform::Place>& places);
 
-  static DeviceContextPool& Get() {
+  static DeviceContextPool& Instance() {
     PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
     return *pool;
   }
 
   /*! \brief  Create should only called by Init function */
-  static DeviceContextPool& Create(const std::vector<platform::Place>& places) {
+  static DeviceContextPool& Init(const std::vector<platform::Place>& places) {
     if (pool == nullptr) {
       pool = new DeviceContextPool(places);
     }
@@ -135,13 +136,14 @@ class DeviceContextPool {
   }
 
   /*! \brief  Return handle of single device context. */
-  const platform::DeviceContext* Borrow(const platform::Place& place);
-
-  /*! \brief  Return handle of multi-device context. */
-  std::vector<const platform::DeviceContext*> Borrow(
-      const std::vector<platform::Place>& places);
+  const platform::DeviceContext* Get(const platform::Place& place);
 
-  ~DeviceContextPool() {}
+  template <typename Place>
+  const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
+      const Place& place) {
+    return reinterpret_cast<
+        const typename DefaultDeviceContextType<Place>::TYPE*>(Get(place));
+  }
 
  private:
   static DeviceContextPool* pool;
diff --git a/paddle/platform/device_context_test.cu b/paddle/platform/device_context_test.cu
index 91011bf71ccf4295075aa2ed73dffa92cb22d773..ca10cf34639376798bf5ba05970c9c734e5a1ef8 100644
--- a/paddle/platform/device_context_test.cu
+++ b/paddle/platform/device_context_test.cu
@@ -71,35 +71,20 @@ TEST(Device, DeviceContextPool) {
   using paddle::platform::CPUPlace;
   using paddle::platform::CUDAPlace;
 
-  DeviceContextPool& pool = DeviceContextPool::Get();
-  auto cpu_dev_ctx1 = pool.Borrow(CPUPlace());
-  auto cpu_dev_ctx2 = pool.Borrow(CPUPlace());
-  EXPECT_TRUE(cpu_dev_ctx2 == cpu_dev_ctx1);
+  DeviceContextPool& pool = DeviceContextPool::Instance();
+  auto cpu_dev_ctx1 = pool.Get(CPUPlace());
+  auto cpu_dev_ctx2 = pool.Get(CPUPlace());
+  ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1);
 
   std::vector<Place> gpu_places;
   int count = paddle::platform::GetCUDADeviceCount();
   for (int i = 0; i < count; ++i) {
-    gpu_places.emplace_back(CUDAPlace(i));
-  }
-  auto dev_ctxs = pool.Borrow(gpu_places);
-  for (size_t i = 0; i < dev_ctxs.size(); ++i) {
-    auto* dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctxs[i]);
-
-    // check same as CUDAPlace(i)
-    CUDAPlace place = boost::get<CUDAPlace>(dev_ctx->GetPlace());
-    EXPECT_EQ(place.GetDeviceId(), static_cast<int>(i));
+    auto dev_ctx = pool.Get(CUDAPlace(i));
+    ASSERT_NE(dev_ctx, nullptr);
   }
 }
 
 int main(int argc, char** argv) {
-  int dev_count = paddle::platform::GetCUDADeviceCount();
-  if (dev_count <= 1) {
-    LOG(WARNING) << "Cannot test multi-gpu DeviceContextPool, because the CUDA "
-                    "device count is "
-                 << dev_count;
-    return 0;
-  }
-
   std::vector<paddle::platform::Place> places;
 
   places.emplace_back(paddle::platform::CPUPlace());
@@ -109,7 +94,7 @@ int main(int argc, char** argv) {
   }
 
   VLOG(0) << " DeviceCount " << count;
-  paddle::platform::DeviceContextPool::Create(places);
+  paddle::platform::DeviceContextPool::Init(places);
 
   testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
diff --git a/paddle/platform/for_range.h b/paddle/platform/for_range.h
index 5427aa28238d6b46eb72d1fb49303dce3d871d7d..694a66d9ac4eb6ad02daf1931806fa1287de7cab 100644
--- a/paddle/platform/for_range.h
+++ b/paddle/platform/for_range.h
@@ -62,7 +62,7 @@ struct ForRange<CUDADeviceContext> {
 
   template <typename Function>
   inline void operator()(Function func) const {
-    constexpr size_t num_threads = 1024;
+    constexpr int num_threads = 1024;
     int block_size = limit_ <= num_threads ? limit_ : num_threads;
     int grid_size = (limit_ + num_threads - 1) / num_threads;
 
diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu
index 8f815863a72b118a0a258a2aa4276a33b95b0e46..ef6d845874745af1150e4425f8d6be416cc44ece 100644
--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
@@ -144,7 +144,7 @@ int main(int argc, char** argv) {
   }
 
   VLOG(0) << " DeviceCount " << count;
-  paddle::platform::DeviceContextPool::Create(places);
+  paddle::platform::DeviceContextPool::Init(places);
 
   testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
diff --git a/paddle/platform/place.h b/paddle/platform/place.h
index d25eaa689f4a4baa951db5c61bbf99288e365ee1..76b5c502cc48431a4e9b13b07505978884576e1d 100644
--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <iostream>
-
+#include "paddle/platform/enforce.h"
 #include "paddle/platform/variant.h"
 
 namespace paddle {
@@ -64,5 +64,31 @@ bool places_are_same_class(const Place &, const Place &);
 
 std::ostream &operator<<(std::ostream &, const Place &);
 
+template <typename Visitor>
+struct PlaceVisitorWrapper
+    : public boost::static_visitor<typename Visitor::result_type> {
+  const Visitor &visitor_;
+  explicit PlaceVisitorWrapper(const Visitor &visitor) : visitor_(visitor) {}
+
+  typename Visitor::result_type operator()(const CPUPlace &cpu) const {
+    return visitor_(cpu);
+  }
+
+  typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
+#ifdef PADDLE_WITH_CUDA
+    return visitor_(cuda);
+#else
+    PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda device");
+    return typename Visitor::result_type();
+#endif
+  }
+};
+
+template <typename Visitor>
+typename Visitor::result_type VisitPlace(const Place &place,
+                                         const Visitor &visitor) {
+  return boost::apply_visitor(PlaceVisitorWrapper<Visitor>(visitor), place);
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc
index 3d95097048c8cc7db720fc4b8a00dab6256513e5..64b8bd1485a8077474bb7e60d55b0924c576a83b 100644
--- a/paddle/platform/profiler.cc
+++ b/paddle/platform/profiler.cc
@@ -3,7 +3,7 @@
 licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
+
     http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
@@ -18,34 +18,134 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-ProfilerState kState = ProfilerState::kDisabled;
-uint32_t kNextThreadId = 0;
-std::mutex kAllEventListsMutex;
-std::list<std::shared_ptr<EventList>> kAllEventLists;
-thread_local std::shared_ptr<EventList> kEventList;
-thread_local int32_t kThreadId;
+// The profiler state, the initial value is ProfilerState::kDisabled
+static ProfilerState g_state = ProfilerState::kDisabled;
+// The thread index of each thread. It is assigned from g_next_thread_id when
+// the thread creates its EventList in GetEventList().
+static thread_local int32_t g_thread_id;
+// The g_next_thread_id is a global counter for threads, by the g_thread_id and
+// g_next_thread_id, we can know how many threads have created EventList.
+static uint32_t g_next_thread_id = 0;
+// The global mutex
+static std::mutex g_all_event_lists_mutex;
+// The total event lists of all threads
+static std::list<std::shared_ptr<EventList>> g_all_event_lists;
+// The thread-local event list; it can only be accessed by its owning thread
+static thread_local std::shared_ptr<EventList> g_event_list;
+
+inline uint64_t GetTimeInNsec() {
+  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
+                                 std::chrono::high_resolution_clock,
+                                 std::chrono::steady_clock>::type;
+  return std::chrono::duration_cast<std::chrono::nanoseconds>(
+             clock::now().time_since_epoch())
+      .count();
+}
+
+Event::Event(EventKind kind, std::string name, uint32_t thread_id,
+             DeviceContext* dev_ctx)
+    : kind_(kind),
+      name_(std::move(name)),
+      thread_id_(thread_id),
+      has_cuda_(false) {
+#ifdef PADDLE_WITH_CUDA
+  auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
+  if (cuda_dev_ctx) {
+    PADDLE_ENFORCE(cudaGetDevice(&device_));
+    PADDLE_ENFORCE(cudaEventCreate(&event_));
+    auto stream = cuda_dev_ctx->stream();
+    PADDLE_ENFORCE(cudaEventRecord(event_, stream));
+    has_cuda_ = true;
+  }
+#endif
+  cpu_ns_ = GetTimeInNsec();
+}
+
+std::string Event::kind() const {
+  switch (kind_) {
+    case EventKind::kMark:
+      return "mark";
+    case EventKind::kPushRange:
+      return "push";
+    case EventKind::kPopRange:
+      return "pop";
+  }
+  PADDLE_THROW("Unknown EventKind.");
+}
+
+double Event::CpuElapsedUs(const Event& e) const {
+  return (e.cpu_ns_ - cpu_ns_) / (1000.0);
+}
+
+double Event::CudaElapsedUs(const Event& e) const {
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE(e.has_cuda() && has_cuda());
+  PADDLE_ENFORCE(e.device() == device());
+  PADDLE_ENFORCE(cudaEventSynchronize(event_));
+  PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
+  float ms;
+  PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
+  return ms * 1000.0;
+#else
+  PADDLE_THROW("CUDA is not enabled");
+#endif
+}
+
+#ifdef PADDLE_WITH_CUDA
+static void ForEachDevice(std::function<void(int)> func) {
+  auto original_device = GetCurrentDeviceId();
+  int count = GetCUDADeviceCount();
+  for (int i = 0; i < count; i++) {
+    SetDeviceId(i);
+    func(i);
+  }
+  SetDeviceId(original_device);
+}
+#endif
+
+inline EventList& GetEventList() {
+  if (!g_event_list) {
+    std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+    g_event_list = std::make_shared<EventList>();
+    g_thread_id = g_next_thread_id++;
+    g_all_event_lists.emplace_front(g_event_list);
+  }
+  return *g_event_list;
+}
+
+void Mark(const std::string& name, DeviceContext* dev_ctx) {
+  GetEventList().Record(EventKind::kMark, std::move(name), g_thread_id,
+                        dev_ctx);
+}
+
+RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) {
+  if (g_state == ProfilerState::kDisabled) return;
+  dev_ctx_ = dev_ctx;
+  name_ = name;
+  GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id,
+                        dev_ctx_);
+}
+
+RecordEvent::~RecordEvent() {
+  if (g_state == ProfilerState::kDisabled) return;
+  GetEventList().Record(EventKind::kPopRange, std::move(name_), g_thread_id,
+                        dev_ctx_);
+}
 
 void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                  "Can't enbale profling, since the input state is ",
                  "ProfilerState::kDisabled");
-  PADDLE_ENFORCE(kState == ProfilerState::kDisabled,
+  PADDLE_ENFORCE(g_state == ProfilerState::kDisabled,
                  "The profiling state should be disabled when calling ",
                  "EnableProfiler.");
-  kState = state;
+  g_state = state;
 #ifdef PADDLE_WITH_CUDA
-  auto ForEachDevice = [](std::function<void(int)> op) {
-    int count = GetCUDADeviceCount();
-    for (int i = 0; i < count; i++) {
-      DeviceGuard dev_guard(i);
-      op(i);
-    }
-  };
-  if (kState == ProfilerState::kCUDA) {
+  if (g_state == ProfilerState::kCUDA) {
     // Generate some dummy evenets first to reduce the startup overhead.
     for (int i = 0; i < 5; i++) {
       ForEachDevice([](int d) {
-        DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(d));
+        DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
         Mark("_cuda_startup_", dev_ctx);
         dev_ctx->Wait();
       });
@@ -53,35 +153,36 @@ void EnableProfiler(ProfilerState state) {
   }
 #endif
   // Mark the profiling start.
-  Mark("_start_profiler_");
+  Mark("_start_profiler_", nullptr);
 }
 
 std::vector<std::vector<Event>> DisableProfiler() {
-  PADDLE_ENFORCE(kState != ProfilerState::kDisabled,
+  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
                  "Can't disable profiling, since it's not starting.");
   // Mark the profiling stop.
-  Mark("_stop_profiler_");
-  kState = ProfilerState::kDisabled;
+  Mark("_stop_profiler_", nullptr);
+  g_state = ProfilerState::kDisabled;
   std::vector<std::vector<Event>> result;
-  std::lock_guard<std::mutex> guard(kAllEventListsMutex);
-  for (auto it = kAllEventLists.begin(); it != kAllEventLists.end(); ++it) {
-    auto& list = *it;
-    result.emplace_back(list->Reduce());
+  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
+       ++it) {
+    result.emplace_back((*it)->Reduce());
   }
   return result;
 }
 
-void PushEvent(const std::string name, const platform::DeviceContext* dev_ctx) {
-  GetEventList().Record(EventKind::kPushRange, std::move(name), kThreadId,
+void PushEvent(const std::string& name, DeviceContext* dev_ctx) {
+  GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id,
                         dev_ctx);
 }
 
-void PopEvent(const std::string name, const platform::DeviceContext* dev_ctx) {
-  GetEventList().Record(EventKind::kPopRange, std::move(name), kThreadId,
+void PopEvent(const std::string& name, DeviceContext* dev_ctx) {
+  GetEventList().Record(EventKind::kPopRange, std::move(name), g_thread_id,
                         dev_ctx);
 }
 
-void ParseEvents(std::vector<std::vector<Event>> events) {
+void ParseEvents(std::vector<std::vector<Event>>& events) {
+  // Event name :: counts :: ave  ::  min   ::  max :: total
   std::map<std::string, std::tuple<int, double, double>> events_table;
   for (size_t i = 0; i < events.size(); i++) {
     std::list<Event> pushed_events;
diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h
index 5f21ff8c1c39323b5477607fc74e08b43420de23..eb36355f86d65021809976f4e6119800664790f7 100644
--- a/paddle/platform/profiler.h
+++ b/paddle/platform/profiler.h
@@ -24,76 +24,24 @@ namespace platform {
 
 enum EventKind { kMark, kPushRange, kPopRange };
 
-inline uint64_t GetTimeInNsec() {
-  // using std::chrono;
-  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
-                                 std::chrono::high_resolution_clock,
-                                 std::chrono::steady_clock>::type;
-  return std::chrono::duration_cast<std::chrono::nanoseconds>(
-             clock::now().time_since_epoch())
-      .count();
-}
-
 class Event {
  public:
-  // the DeviceContext is used to get the cuda stream.
+  // The DeviceContext is used to get the cuda stream.
+  // In CPU profiling mode, nullptr can be passed.
   Event(EventKind kind, std::string name, uint32_t thread_id,
-        const platform::DeviceContext* dev_ctx = nullptr)
-      : kind_(kind), name_(std::move(name)), thread_id_(thread_id) {
-    has_cuda_ = false;
-#ifdef PADDLE_WITH_CUDA
-    auto* cuda_dev_ctx =
-        static_cast<const platform::CUDADeviceContext*>(dev_ctx);
-    if (cuda_dev_ctx) {
-      PADDLE_ENFORCE(cudaGetDevice(&device_));
-      PADDLE_ENFORCE(cudaEventCreate(&event_));
-      auto stream = cuda_dev_ctx->stream();
-      PADDLE_ENFORCE(cudaEventRecord(event_, stream));
-      has_cuda_ = true;
-    }
-#endif
-    cpu_ns_ = GetTimeInNsec();
-  }
-
-  std::string kind() const {
-    switch (kind_) {
-      case EventKind::kMark:
-        return "mark";
-      case EventKind::kPushRange:
-        return "push";
-      case EventKind::kPopRange:
-        return "pop";
-    }
-    PADDLE_THROW("Unknown EventKind.");
-  }
+        DeviceContext* dev_ctx);
 
+  std::string kind() const;
   std::string name() const { return name_; }
-
   bool has_cuda() const { return has_cuda_; }
 
 #ifdef PADDLE_WITH_CUDA
   cudaEvent_t event() const { return event_; }
-
   int device() const { return device_; }
 #endif
 
-  double CpuElapsedUs(const Event& e) const {
-    return (e.cpu_ns_ - cpu_ns_) / (1000.0);
-  }
-
-  double CudaElapsedUs(const Event& e) const {
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE(e.has_cuda() && has_cuda());
-    PADDLE_ENFORCE(e.device() == device());
-    PADDLE_ENFORCE(cudaEventSynchronize(event_));
-    PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
-    float ms;
-    PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
-    return ms * 1000.0;
-#else
-    PADDLE_THROW("CUDA is not enabled");
-#endif
-  }
+  double CpuElapsedUs(const Event& e) const;
+  double CudaElapsedUs(const Event& e) const;
 
  private:
   EventKind kind_;
@@ -108,11 +56,11 @@ class Event {
 };
 
 struct EventList {
-  constexpr static std::size_t kMB = 1024 * 1024;
-  constexpr static std::size_t kEventBlockSize = 16 * kMB;
-  constexpr static std::size_t kEventSize = sizeof(Event);
-  constexpr static std::size_t kEventAlign = alignof(Event);
-  constexpr static std::size_t kNumBlock =
+  constexpr static size_t kMB = 1024 * 1024;
+  constexpr static size_t kEventBlockSize = 16 * kMB;
+  constexpr static size_t kEventSize = sizeof(Event);
+  constexpr static size_t kEventAlign = alignof(Event);
+  constexpr static size_t kNumBlock =
       kEventBlockSize /
       ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
 
@@ -139,69 +87,36 @@ struct EventList {
 };
 
 enum ProfilerState {
-  kDisabled,
-  kCPU,
-  kCUDA,
+  kDisabled,  // disabled state
+  kCPU,       // CPU profiling state
+  kCUDA,      // GPU profiling state
 };
 
-// The profiler state, the initial value is ProfilerState::kDisabled
-extern ProfilerState kState;
-// The global mutex
-extern std::mutex kAllEventListsMutex;
-// The total event lists of all threads
-extern std::list<std::shared_ptr<EventList>> kAllEventLists;
-// The thread local event list only can be accessed by the specific thread
-extern thread_local std::shared_ptr<EventList> kEventList;
-// The thread index of each thread
-extern thread_local int32_t kThreadId;
-// The kNextThreadId is a global counter for threads, by the kThreadId and
-// kNextThreadId, we can know how many threads have created EventList.
-extern uint32_t kNextThreadId;
-
-inline EventList& GetEventList() {
-  if (!kEventList) {
-    std::lock_guard<std::mutex> guard(kAllEventListsMutex);
-    kEventList = std::make_shared<EventList>();
-    kThreadId = kNextThreadId++;
-    kAllEventLists.emplace_front(kEventList);
-  }
-  return *kEventList;
-}
-
-inline void Mark(const std::string name,
-                 const platform::DeviceContext* dev_ctx = nullptr) {
-  GetEventList().Record(EventKind::kMark, std::move(name), kThreadId, dev_ctx);
-}
+void Mark(const std::string& name, DeviceContext* dev_ctx);
 
-void PushEvent(const std::string name,
-               const platform::DeviceContext* dev_ctx = nullptr);
+void PushEvent(const std::string& name, DeviceContext* dev_ctx);
 
-void PopEvent(const std::string name,
-              const platform::DeviceContext* dev_ctx = nullptr);
+void PopEvent(const std::string& name, DeviceContext* dev_ctx);
 
 struct RecordEvent {
-  explicit RecordEvent(const std::string name,
-                       platform::DeviceContext* dev_ctx = nullptr) {
-    if (kState == ProfilerState::kDisabled) return;
-    dev_ctx_ = dev_ctx;
-    name_ = name;
-    GetEventList().Record(EventKind::kPushRange, std::move(name), kThreadId,
-                          dev_ctx_);
-  }
+  explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
 
-  ~RecordEvent() {
-    if (kState == ProfilerState::kDisabled) return;
-    GetEventList().Record(EventKind::kPopRange, std::move(name_), kThreadId,
-                          dev_ctx_);
-  }
-  platform::DeviceContext* dev_ctx_;
+  ~RecordEvent();
+
+  // The device context is used by Event to get the current cuda stream.
+  DeviceContext* dev_ctx_;
+  // Event name
   std::string name_;
 };
 
+// Enable the profiling function.
 void EnableProfiler(ProfilerState state);
+
+// Return the event lists of all threads. If the returned value is named
+// event_lists, event_lists[i][j] represents the j-th Event of the i-th thread.
 std::vector<std::vector<Event>> DisableProfiler();
 
-void ParseEvents(std::vector<std::vector<Event>>);
+void ParseEvents(std::vector<std::vector<Event>>&);
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc
index b2f1dea46596cd9f302dea8586931016bd56f75d..ff5e658f5bd55062ef0c25fce9920e725df9959b 100644
--- a/paddle/platform/profiler_test.cc
+++ b/paddle/platform/profiler_test.cc
@@ -19,13 +19,13 @@ TEST(Event, CpuElapsedTime) {
   using paddle::platform::Event;
   using paddle::platform::EventKind;
 
-  Event start_event(EventKind::kPushRange, "test", 0);
+  Event start_event(EventKind::kPushRange, "test", 0, nullptr);
   EXPECT_TRUE(start_event.has_cuda() == false);
   int counter = 0;
   while (counter != 1000) {
     counter++;
   }
-  Event stop_event(EventKind::kPopRange, "test", 0);
+  Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
   EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0);
 }
 
@@ -33,11 +33,11 @@ TEST(Event, CpuElapsedTime) {
 TEST(Event, CudaElapsedTime) {
   using paddle::platform::DeviceContext;
   using paddle::platform::CUDADeviceContext;
-  using paddle::platform::GPUPlace;
+  using paddle::platform::CUDAPlace;
   using paddle::platform::Event;
   using paddle::platform::EventKind;
 
-  DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(0));
+  DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
   Event start_event(EventKind::kPushRange, "test", 0, dev_ctx);
   EXPECT_TRUE(start_event.has_cuda() == true);
   int counter = 0;
@@ -60,10 +60,10 @@ TEST(RecordEvent, RecordEvent) {
   DeviceContext* dev_ctx = nullptr;
 #ifdef PADDLE_WITH_CUDA
   using paddle::platform::CUDADeviceContext;
-  using paddle::platform::GPUPlace;
+  using paddle::platform::CUDAPlace;
   state = ProfilerState::kCUDA;
   dev_ctx =
-      new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace(0));
+      new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0));
 #endif
   EnableProfiler(state);
 
@@ -98,7 +98,9 @@ TEST(RecordEvent, RecordEvent) {
   int cuda_startup_count = 0;
   int start_profiler_count = 0;
   int stop_profiler_count = 0;
+
   ParseEvents(events);
+
   for (size_t i = 0; i < events.size(); ++i) {
     for (size_t j = 0; j < events[i].size(); ++j) {
       if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count;
diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt
index 6afed7eec7001b646d55cef0bc3f59782b80b15f..7b374307071d2da91a677361b404448f1a3816b0 100644
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
@@ -3,6 +3,9 @@ if(WITH_PYTHON)
     SRCS pybind.cc exception.cc protobuf.cc const_value.cc
     DEPS pybind python backward proto_desc paddle_memory executor prune init
     ${GLOB_OP_LIB})
+  if(NOT APPLE AND NOT ANDROID)
+    target_link_libraries(paddle_pybind rt)
+  endif(NOT APPLE AND NOT ANDROID)
 endif(WITH_PYTHON)
 
 if(WITH_DOC)
diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc
index f105370f226e2cceaac685f280d55134d4291028..07292d47e9c165c67fe4a30ee7d851c350beb2e0 100644
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -171,12 +171,23 @@ void BindBlockDesc(py::module &m) {
              std::string name = byte_name;
              return self.HasVar(name);
            })
+      .def("has_var_recursive",
+           [](BlockDesc &self, py::bytes byte_name) {
+             std::string name = byte_name;
+             return self.HasVarRecursive(name);
+           })
       .def("find_var",
            [](BlockDesc &self, py::bytes byte_name) {
              std::string name = byte_name;
              return self.FindVar(name);
            },
            py::return_value_policy::reference)
+      .def("find_var_recursive",
+           [](BlockDesc &self, py::bytes byte_name) {
+             std::string name = byte_name;
+             return self.FindVarRecursive(name);
+           },
+           py::return_value_policy::reference)
       .def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference)
       .def("op_size", &BlockDesc::OpSize)
       .def("op", &BlockDesc::Op, py::return_value_policy::reference)
@@ -204,7 +215,7 @@ void BindVarDsec(py::module &m) {
       .def("set_shape", &VarDesc::SetShape)
       .def("set_dtype", &VarDesc::SetDataType)
       .def("shape", &VarDesc::Shape, py::return_value_policy::reference)
-      .def("dtype", &VarDesc::GetDataType)
+      .def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference)
       .def("lod_level", &VarDesc::GetLodLevel)
       .def("set_lod_level", &VarDesc::SetLoDLevel)
       .def("type", &VarDesc::GetType)
@@ -236,14 +247,22 @@ void BindOpDesc(py::module &m) {
       .value("BLOCK", proto::AttrType::BLOCK);
 
   py::class_<OpDesc> op_desc(m, "OpDesc", "");
-  op_desc.def("type", &OpDesc::Type)
+  op_desc
+      .def("__init__", [](OpDesc &self) { new (&self) OpDesc(); },
+           py::return_value_policy::reference)
+      .def("copy_from", &OpDesc::CopyFrom)
+      .def("type", &OpDesc::Type)
       .def("set_type", &OpDesc::SetType)
       .def("input", &OpDesc::Input)
       .def("input_names", &OpDesc::InputNames)
-      .def("set_input", &OpDesc::SetInput)
       .def("output", &OpDesc::Output)
       .def("output_names", &OpDesc::OutputNames)
+      .def("set_input", &OpDesc::SetInput)
       .def("set_output", &OpDesc::SetOutput)
+      .def("input_arg_names", &OpDesc::InputArgumentNames)
+      .def("output_arg_names", &OpDesc::OutputArgumentNames)
+      .def("rename_input", &OpDesc::RenameInput)
+      .def("rename_output", &OpDesc::RenameOutput)
       .def("has_attr", &OpDesc::HasAttr)
       .def("attr_type", &OpDesc::GetAttrType)
       .def("attr_names", &OpDesc::AttrNames)
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 07e38476e68b79f5b3192c619c89cd0e061cc686..04485ce7c1ab87f8655b0e6cbaecc36b3382f647 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -269,23 +269,22 @@ All parameter, weight, gradient are variables in Paddle.
     }
     return ret_values;
   });
-  m.def("get_grad_op_descs",
-        [](const OpDesc &op_desc,
-           const std::unordered_set<std::string> &no_grad_set,
-           std::unordered_map<std::string, std::string> &grad_to_var,
-           const std::vector<BlockDesc *> &grad_sub_block) {
-          std::vector<std::unique_ptr<OpDesc>> grad_op_descs =
-              framework::OpInfoMap::Instance()
-                  .Get(op_desc.Type())
-                  .GradOpMaker()(op_desc, no_grad_set, &grad_to_var,
-                                 grad_sub_block);
-          std::vector<OpDesc *> grad_op_desc_ptrs(grad_op_descs.size());
-          std::transform(
-              grad_op_descs.begin(), grad_op_descs.end(),
-              grad_op_desc_ptrs.begin(),
-              [](std::unique_ptr<OpDesc> &p) { return p.release(); });
-          return grad_op_desc_ptrs;
-        });
+  m.def(
+      "get_grad_op_desc", [](const OpDesc &op_desc,
+                             const std::unordered_set<std::string> &no_grad_set,
+                             const std::vector<BlockDesc *> &grad_sub_block) {
+        std::unordered_map<std::string, std::string> grad_to_var;
+        std::vector<std::unique_ptr<OpDesc>> grad_op_descs =
+            framework::OpInfoMap::Instance()
+                .Get(op_desc.Type())
+                .GradOpMaker()(op_desc, no_grad_set, &grad_to_var,
+                               grad_sub_block);
+        std::vector<OpDesc *> grad_op_desc_ptrs(grad_op_descs.size());
+        std::transform(grad_op_descs.begin(), grad_op_descs.end(),
+                       grad_op_desc_ptrs.begin(),
+                       [](std::unique_ptr<OpDesc> &p) { return p.release(); });
+        return std::make_pair(grad_op_desc_ptrs, grad_to_var);
+      });
   m.def("prune", [](const ProgramDesc &origin,
                     const std::vector<std::array<size_t, 2>> &targets) {
     ProgramDesc prog_with_targets(origin);
@@ -301,6 +300,8 @@ All parameter, weight, gradient are variables in Paddle.
     InferenceOptimize(*(origin.Proto()), &pruned_desc);
     return new ProgramDesc(pruned_desc);
   });
+  m.def("empty_var_name", []() { return framework::kEmptyVarName; });
+  m.def("grad_var_suffix", []() { return framework::kGradVarSuffix; });
   m.def_submodule(
        "var_names",
        "The module will return special predefined variable name in Paddle")
diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h
index 67244d82602906231ac1fc870adccc7e82869407..6b4290972bade585d1a0c2ae919a2e712bdf308c 100644
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -63,9 +63,10 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
         auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
             tensor.dims(), platform::CPUPlace()));
 
-        platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+        platform::DeviceContextPool &pool =
+            platform::DeviceContextPool::Instance();
         auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
-            pool.Borrow(tensor.place()));
+            pool.Get(tensor.place()));
 
         paddle::platform::GpuMemcpyAsync(
             dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
@@ -76,10 +77,10 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
       } else if (paddle::platform::is_cpu_place(tensor.place())) {
         dst_tensor = tensor;
       }
-      return py::buffer_info(
-          dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.place()),
-          sizeof(CUR_TYPE), py::format_descriptor<CUR_TYPE>::format(),
-          (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
+      return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
+                             py::format_descriptor<CUR_TYPE>::format(),
+                             (size_t)framework::arity(dst_tensor.dims()),
+                             dims_outside, strides);
     } else {
       constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
       return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
@@ -137,9 +138,9 @@ void PyCUDATensorSetFromArray(
   self.Resize(framework::make_ddim(dims));
   auto *dst = self.mutable_data<T>(place);
 
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto dev_ctx =
-      static_cast<const platform::CUDADeviceContext *>(pool.Borrow(place));
+      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
   paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
                                    cudaMemcpyHostToDevice, dev_ctx->stream());
 }
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index e43b9c218a3ecb9e7f20fb7e8b14a85a29947eef..92039ec6b05d224e702f0ba5dc05c057a492287e 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -178,7 +178,7 @@ EOF
     # run paddle version to install python packages first
     RUN apt-get update &&\
         ${NCCL_DEPS}\
-        apt-get install -y wget python-pip dmidecode && pip install -U pip && \
+        apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip && \
         pip install /*.whl; apt-get install -f -y && \
         apt-get clean -y && \
         rm -f /*.whl && \
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index a94bc01b358c508132eb85920a2d4c0aa934dd51..8a352b0078d701f797f7202c85bd0e08201ac9b8 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -71,9 +71,7 @@ function threads_config() {
   # auto set OMP_NUM_THREADS and MKL_NUM_THREADS
   # according to trainer_count and total processors
   # only when MKL enabled
-  if [ "@WITH_MKL@" == "OFF" ]; then
-    return 0
-  fi
+  # auto set OPENBLAS_NUM_THREADS when MKL is not used
   processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l`
   trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs`
   if [ -z $trainers ]; then
@@ -83,12 +81,19 @@ function threads_config() {
   if [ $threads -eq 0 ]; then
     threads=1
   fi
-  if [ -z "$OMP_NUM_THREADS" ]; then
-    export OMP_NUM_THREADS=$threads
-  fi
-  if [ -z "$MKL_NUM_THREADS" ]; then
-    export MKL_NUM_THREADS=$threads
+  if [ "@WITH_MKL@" == "ON" ]; then
+    if [ -z "$OMP_NUM_THREADS" ]; then
+      export OMP_NUM_THREADS=$threads
+    fi
+    if [ -z "$MKL_NUM_THREADS" ]; then
+      export MKL_NUM_THREADS=$threads
+    fi
+  else
+    if [ -z "$OPENBLAS_NUM_THREADS" ]; then
+      export OPENBLAS_NUM_THREADS=$threads
+    fi
   fi
+  
 }
 
 PADDLE_CONF_HOME="$HOME/.config/paddle"
@@ -150,7 +155,7 @@ fi
 case "$1" in
     "train")
         threads_config $@
-        # echo $OMP_NUM_THREADS $MKL_NUM_THREADS
+        # echo $OMP_NUM_THREADS $MKL_NUM_THREADS $OPENBLAS_NUM_THREADS
         ${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2}
         ;;
     "merge_model")
diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
index 634388094c804827657dc83d5c205e680625b156..7bdddeaabec733ef26b3f766c6437f5c53d65044 100644
--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
@@ -44,7 +44,7 @@ __all__ = ['train', 'test', 'valid']
 DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
 LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
 SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
-DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
+DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118'
 LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
 SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
 # In official 'readme', tstid is the flag of test data
diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py
index c72b5730695dbc4f772015f1fb8dec6814cd1837..225b41c5043b5792abb90bbad53cbbfce9a3156e 100644
--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
@@ -36,7 +36,7 @@ def __read_gflags_from_env__():
     """
     import sys
     import core
-    read_env_flags = ['use_pinned_memory']
+    read_env_flags = ['use_pinned_memory', 'check_nan_inf']
     if core.is_compile_gpu():
         read_env_flags.append('fraction_of_gpu_memory_to_use')
     core.init_gflags([sys.argv[0]] +
diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py
index f188582178f667125ec95cd230100fdb10ce7e88..ac60bf543600008fd5339c1a378951374afc4ad6 100644
--- a/python/paddle/v2/fluid/backward.py
+++ b/python/paddle/v2/fluid/backward.py
@@ -1,54 +1,338 @@
 from paddle.v2.fluid import framework as framework
+from . import core
+import collections
 
-__all__ = ['append_backward_ops']
+__all__ = ['append_backward']
 
 
-def append_backward_ops(loss, parameter_list=None, no_grad_set=None):
+def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
     """
-    Create and add gradient Operators in BlockDesc to compute
-    gradients of `loss` for parameters in parameter_list
+    Traverse all ops in op_descs[begin_idx : end_idx], 
+    if any op has inputs/outputs named "old_name", rename it as 'new_name'
+    """
+    if begin_idx is None:
+        begin_idx = 0
+    if end_idx is None:
+        end_idx = len(op_descs)
+    for i in range(begin_idx, end_idx):
+        op_desc = op_descs[i]
+        if isinstance(op_desc, tuple):
+            op_desc = op_desc[0]
+        op_desc.rename_input(old_name, new_name)
+        op_desc.rename_output(old_name, new_name)
+
+
+def _create_op_desc_(op_type, inputs, outputs, attrs):
+    """
+    Create a C++ OpDesc object with specified inputs, outputs and attributes.
+    """
+    op_desc = core.OpDesc()
+    op_desc.set_type(op_type)
+    for para, args in inputs.iteritems():
+        op_desc.set_input(para, args)
+    for para, args in outputs.iteritems():
+        op_desc.set_output(para, args)
+    for name, val in attrs.iteritems():
+        if isinstance(val, framework.Block):
+            op_desc.set_block_attr(name, val.desc)
+        else:
+            op_desc.set_attr(name, val)
+    return op_desc
+
+
+def _infer_var_data_type_(grad_var_name, block):
+    """
+    Infer the data type of given grad variable
+    """
+    grad_var = block.desc.find_var(grad_var_name.encode("ascii"))
+    fwd_name = _strip_grad_suffix_(grad_var_name.encode("ascii"))
+    if block.desc.has_var_recursive(fwd_name):
+        fwd_var = block.desc.find_var_recursive(fwd_name.encode("ascii"))
+        grad_var.set_dtype(fwd_var.dtype())
+    else:
+        grad_var.set_dtype(core.DataType.FP32)
+
+
+def _all_in_set_(cands, s):
+    """
+    Test if all elements of 'cands' are in set 's'
+    """
+    if len(cands) == 0:
+        return False
+    for c in cands:
+        if not c in s:
+            return False
+    return True
+
+
+def _strip_grad_suffix_(name):
+    """
+    Strip the grad suffix from the given variable name
+    e.g. x@GRAD ==> x
+         y@GRAD@RENAME@1 ==> y
+    """
+    pos = name.find(core.grad_var_suffix())
+    return name[:pos] if pos != -1 else name
+
+
+def _append_grad_suffix_(name):
+    """
+    Append grad suffix to the given variable name
+    e.g. x ==> x@GRAD
+    """
+    return name + core.grad_var_suffix()
+
+
+def _addup_repetitive_outputs_(op_descs):
+    """
+    In the backward pass, a variable may be the output of more than one op.
+    In this case, the variable should be the accumulation of all the outputs.
+    `sum_op`s are added to implement the accumulation.
+    """
+    pending_sum_ops = []
+    var_rename_count = collections.defaultdict(int)
+    renamed_vars = collections.defaultdict(list)
+    for idx, op_desc in enumerate(op_descs):
+        for var_name in op_desc.input_arg_names():
+            if len(renamed_vars[var_name]) > 1:
+                pending_sum_ops.append(
+                    (_create_op_desc_("sum", {"X": renamed_vars[var_name]},
+                                      {"Out": [var_name]}, {}), idx))
+                renamed_vars[var_name] = [var_name]
+        for var_name in op_desc.output_arg_names():
+            if var_name == core.empty_var_name(
+            ) or var_name in op_desc.input_arg_names():
+                # empty variable or inplace op
+                continue
+            if len(renamed_vars[var_name]) == 0:
+                # it's the first time we get the variable
+                renamed_vars[var_name] = [var_name]
+            else:
+                if len(renamed_vars[var_name]) == 1:
+                    new_name = var_name + "@RENAME@" + \
+                        str(var_rename_count[var_name])
+                    var_rename_count[var_name] += 1
+                    # rename original var_name
+                    renamed_vars[var_name][0] = new_name
+                    _rename_arg_(op_descs, var_name, new_name, 0, idx)
+                    _rename_arg_(pending_sum_ops, var_name, new_name)
+
+                new_name = var_name + "@RENAME@" + \
+                    str(var_rename_count[var_name])
+                var_rename_count[var_name] += 1
+                op_desc.rename_output(var_name, new_name)
+                renamed_vars[var_name].append(new_name)
+    for var_name, inputs in renamed_vars.iteritems():
+        if len(inputs) > 1:
+            pending_sum_ops.append((_create_op_desc_(
+                "sum", {"X": inputs}, {"Out": [var_name]}, {}), len(op_descs)))
+    # sum_op descs are sorted according to their insert position
+    for p in reversed(pending_sum_ops):
+        op_descs.insert(p[1], p[0])
+
+    return op_descs
+
 
-    :param loss: an variable generated by cost function.
-    :type loss: Variable
-    :param no_grad_set: variable that should not create gradient
-    :type no_grad_set: set
-    :param parameter_list: parameters that need to compute gradient and 
-    update to optimize the lost.
-    :type: list
-    :return: list of (parameters, gradients) pair.
-    :rtype: list[Variable]
+def _remove_no_grad_branch_(op_descs, no_grad_set):
+    """
+    Remove unnecessary grad ops
+    A grad op can be removed in two cases:
+        1. all outputs of the grad op are in 'no_grad_set'
+        2. all grad inputs of the grad op are in 'no_grad_set'
+    """
+
+    def _op_can_be_removed_(op_desc, no_grad_set):
+        out_arg_names = op_desc.output_arg_names()
+        if len(out_arg_names) == 0 or _all_in_set_(out_arg_names, no_grad_set):
+            return True
+        if _all_in_set_(
+                filter(lambda name: name.find(core.grad_var_suffix()) != -1,
+                       op_desc.input_arg_names()), no_grad_set):
+            no_grad_set.update(out_arg_names)  # outputs get no grad either
+            return True
+        return False
+
+    # Remove grad ops that are unnecessary given no_grad_set
+    op_descs = filter(
+        lambda op_desc: not _op_can_be_removed_(op_desc, no_grad_set), op_descs)
+    # Insert fill_zeros_like_op
+    to_insert = []
+    for idx, op_desc in enumerate(op_descs):
+        for arg in op_desc.input_arg_names():
+            if core.grad_var_suffix() in arg and arg in no_grad_set:
+                to_insert.append((_create_op_desc_("fill_zeros_like", {
+                    "X": [_strip_grad_suffix_(arg)]
+                }, {"Y": [arg]}, {}), idx))
+
+    map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert))
+
+    return op_descs
+
+
+def _append_backward_ops_(target,
+                          block,
+                          target_block,
+                          no_grad_dict,
+                          grad_to_var,
+                          callback=None):
+    """
+    Create all grad ops, and insert them into given block
+
+    Args:
+        target(Variable): the target variable of forward pass
+        block(Block): the block where forward ops are
+        target_block(Block): the block which is going to hold new generated grad ops
+        no_grad_dict(dict): 
+            key(int): block index
+            val(set): a set of variable names. These variables have no gradient
+        grad_to_var(dict)(output argument):
+            key(str): grad variable name
+            val(str): corresponding forward variable name
+    """
+    # grad_op_descs holds created grad_op, and will be appended to target_block
+    grad_op_descs = []
+    program = block.program
+    for op in reversed(block.ops):
+        grad_sub_block_list = []
+        # If the op has its own sub-block, deal with the sub-block first
+        if op.has_attr("sub_block"):
+            sub_block = program.block(op.block_attr("sub_block"))
+            grad_sub_block = program.create_block(parent_idx=sub_block.idx)
+            _append_backward_ops_(target, sub_block, grad_sub_block,
+                                  no_grad_dict, grad_to_var, callback)
+            grad_sub_block_list.append(grad_sub_block.desc)
+
+        # Getting op's corresponding grad_op
+        grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
+            op.desc, no_grad_dict[block.idx], grad_sub_block_list)
+        grad_op_descs.extend(grad_op_desc)
+        grad_to_var.update(op_grad_to_var)
+
+    grad_op_descs = _addup_repetitive_outputs_(grad_op_descs)
+
+    grad_op_descs = _remove_no_grad_branch_(grad_op_descs,
+                                            no_grad_dict[block.idx])
+
+    if target_block.idx == 0:
+        grad_op_descs.insert(
+            0,
+            _create_op_desc_("fill_constant", {}, {
+                "Out": [_append_grad_suffix_(target.name)]
+            }, {"shape": [1],
+                "value": 1.0,
+                "dtype": target.dtype}))
+    # append op_desc in grad_op_descs to target_block
+    for op_desc in grad_op_descs:
+        new_op_desc = target_block.desc.append_op()
+        new_op_desc.copy_from(op_desc)
+
+
+def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
+    """
+    Create new variables required by backward pass.
+
+    Args:
+        block(Block): the block where new variables will be created
+        start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created
+        grad_to_var(dict):
+            key(str): grad variable name
+            val(str): corresponding forward variable name
+            In most cases, this dict is generated by _append_backward_ops_()
+        grad_info_map(dict)(output argument):
+            key(str): forward variable name
+            val(tuple): a tuple of (str, Block), str is the corresponding grad name, Block is the block holding it
+    """
+    for op_idx in range(start_op_idx, block.desc.op_size()):
+        op_desc = block.desc.op(op_idx)
+        if op_desc.has_attr("sub_block"):
+            sub_block = block.program.block(op_desc.block_attr("sub_block"))
+            _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
+        new_vars = set()
+        # create new gradient variables
+        for grad_var_name in op_desc.output_arg_names():
+            grad_var_name = grad_var_name.encode("ascii")
+            if block.desc.has_var_recursive(
+                    grad_var_name) or grad_var_name == core.empty_var_name():
+                continue
+            block.desc.var(grad_var_name)
+            new_vars.add(grad_var_name)
+            if not grad_to_var.has_key(grad_var_name):
+                continue
+            grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, block)
+        # infer_shape and infer_type
+        op_desc.infer_var_type(block.desc)
+        op_desc.infer_shape(block.desc)
+        for arg in op_desc.output_arg_names():
+            if arg in new_vars:
+                _infer_var_data_type_(arg, block)
+
+
+def append_backward(loss, parameter_list=None, no_grad_set=None):
+    """
+    Append backward part to main_program
+
+    Args:
+        loss(Variable): The variable generated by cost function.
+        parameter_list(list): Parameters that need to be updated by optimizer.
+            If None, it means all parameters need to be updated.
+        no_grad_set(set): Variables that have no gradients in Block 0. 
+            If None, the set will be generated inside the function and 
+            contains all variables with `stop_gradient=True` from all blocks.
+
+    Return:
+        (list[Variable]): list of (parameters, gradients) pair.
     """
     assert isinstance(loss, framework.Variable)
 
+    program = loss.block.program
+    no_grad_dict = dict()
     if no_grad_set is None:
-        program = loss.block.program
         assert isinstance(program, framework.Program)
-        no_grad_set = list()
         for block in program.blocks:
             assert isinstance(block, framework.Block)
+            block_no_grad_set = set()
             for var in block.vars.itervalues():
                 assert isinstance(var, framework.Variable)
                 if var.stop_gradient:
-                    no_grad_set.append(var.name)
-        no_grad_set = set(no_grad_set)
+                    block_no_grad_set.add(_append_grad_suffix_(var.name))
+            no_grad_dict[block.idx] = block_no_grad_set
+    elif isinstance(no_grad_set, set):
+        no_grad_dict = {
+            0: set([_append_grad_suffix_(name) for name in no_grad_set])
+        }
+    else:
+        raise ValueError("'no_grad_set' should be a set or None.")
+
+    grad_info_map = dict()
+    root_block = program.block(0)
+
+    fwd_op_num = root_block.desc.op_size()
+    current_block_idx = program.current_block_idx
+    grad_to_var = dict()
+
+    _append_backward_ops_(loss, root_block, root_block, no_grad_dict,
+                          grad_to_var)
+    _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map)
+
+    program.current_block_idx = current_block_idx
+    program.sync_with_cpp()
 
-    param_grad_map = loss.block.program.append_backward(loss, no_grad_set)
     if parameter_list is not None:
         parameters = parameter_list
     else:
-        params = loss.block.program.global_block().all_parameters()
+        params = program.global_block().all_parameters()
         parameters = [param.name for param in params]
     params_and_grads = []
     for param in parameters:
-        if param not in param_grad_map:
+        if param not in grad_info_map:
             raise ValueError("param %s is not in map" % param)
-        grad_info = param_grad_map[param]
-        grad_block = loss.block.program.block(grad_info[1])
+        grad_info = grad_info_map[param]
+        grad_block = grad_info[1]
         if not grad_block.has_var(grad_info[0]):
             raise ValueError("grad block[{0}] did not have grad var {1}".format(
                 grad_info[1], grad_info[0]))
         # Get the param var from the global block
-        param_var = loss.block.program.global_block().var(param)
+        param_var = program.global_block().var(param)
         grad_var = grad_block.var(grad_info[0])
         if loss.block.has_var(grad_info[0]):
             params_and_grads.append((param_var, grad_var))
diff --git a/python/paddle/v2/fluid/data_feeder.py b/python/paddle/v2/fluid/data_feeder.py
index 30a542af212926c93381aade426e25f2117e4662..24036c3e75b9594ba58cccb02825ab8020d1e107 100644
--- a/python/paddle/v2/fluid/data_feeder.py
+++ b/python/paddle/v2/fluid/data_feeder.py
@@ -3,7 +3,7 @@ import core
 import numpy
 import six.moves as six
 
-from framework import Variable
+from framework import Variable, default_main_program
 
 __all__ = ['DataFeeder']
 
@@ -53,12 +53,16 @@ class DataToLoDTensorConverter(object):
 
 
 class DataFeeder(object):
-    def __init__(self, feed_list, place):
+    def __init__(self, feed_list, place, program=None):
         self.feed_dtypes = []
         self.feed_names = []
         self.feed_shapes = []
         self.feed_lod_level = []
+        if program is None:
+            program = default_main_program()
         for each_var in feed_list:
+            if isinstance(each_var, basestring):
+                each_var = program.block(0).var(each_var)
             if not isinstance(each_var, Variable):
                 raise TypeError("Feed list should contain a list of variable")
             self.feed_dtypes.append(each_var.dtype)
diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py
index 111937f59c3ab05e5917a79ca7e1f81f59747fc3..49ece7b725e318d7526d58fe54c97cbe20200a7d 100644
--- a/python/paddle/v2/fluid/distribute_transpiler.py
+++ b/python/paddle/v2/fluid/distribute_transpiler.py
@@ -95,7 +95,9 @@ class DistributeTranspiler:
         """
         if program is None:
             program = default_main_program()
+        self.program = program
         self.trainers = trainers
+        self.optimize_ops = optimize_ops
         self._optimize_distributed(
             optimize_ops,
             program,
@@ -156,9 +158,10 @@ class DistributeTranspiler:
             attrs={"endpoints": pserver_endpoints,
                    "epmap": epmap})
 
-    def get_trainer_program(optimize_ops, program):
+    def get_trainer_program(self):
         # remove optimize ops and add a send op to main_program
-        program.global_block().delete_ops(optimize_ops)
+        self.program.global_block().delete_ops(self.optimize_ops)
+        return self.program
 
     def _create_var_for_trainers(self, block, var, trainers):
         var_list = []
@@ -210,7 +213,6 @@ class DistributeTranspiler:
 
             if opt_op.inputs.has_key("Grad"):
                 if opt_op.inputs["Grad"].name in grad_var_names:
-                    print "appending ", opt_op.type, opt_op.inputs
                     optimize_sub_program.global_block().append_op(
                         type=opt_op.type,
                         inputs=opt_op.inputs,
diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py
index 2c91afb363bf72f2791e60c6df0d9130ccd698c5..1d6c594b41a2c295e3818fb119362d1daba1de33 100644
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
@@ -1,12 +1,31 @@
 import numpy as np
+import contextlib
+from framework import Program, default_main_program
 from . import core
-from framework import Program, default_main_program, Parameter, Variable
 
-__all__ = ['Executor', 'g_scope']
+__all__ = ['Executor', 'global_scope', 'scope_guard', 'switch_scope']
 
 g_scope = core.Scope()
 
 
+def global_scope():
+    return g_scope
+
+
+def switch_scope(scope):
+    global g_scope
+    ex = g_scope
+    g_scope = scope
+    return ex
+
+
+@contextlib.contextmanager
+def scope_guard(scope):
+    ex = switch_scope(scope)
+    yield
+    switch_scope(ex)
+
+
 def as_numpy(tensor):
     if isinstance(tensor, list):
         return [as_numpy(t) for t in tensor]
@@ -117,7 +136,7 @@ class Executor(object):
             raise TypeError()
 
         if scope is None:
-            scope = g_scope
+            scope = global_scope()
 
         program = program.clone()
         global_block = program.global_block()
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index add854306ea7fa527943de871d2716cd2aa9f530..b66a8bce5f4f15539007876c113afd3f878b00bc 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -663,7 +663,7 @@ class Block(object):
             end = list(self.ops).index(ops[-1])
         except Exception, e:
             raise e
-        self.desc.remove_op(start, end)
+        self.desc.remove_op(start, end + 1)
 
     def prepend_op(self, *args, **kwargs):
         op_desc = self.desc.prepend_op()
@@ -846,9 +846,11 @@ class Program(object):
         self.sync_with_cpp()
         return param_to_grad_info
 
-    def create_block(self):
+    def create_block(self, parent_idx=None):
         new_block_idx = len(self.blocks)
-        self.desc.append_block(self.current_block().desc)
+        parent = self.current_block() if parent_idx is None else self.block(
+            parent_idx)
+        self.desc.append_block(parent.desc)
         self.current_block_idx = new_block_idx
         self.blocks.append(Block(self, self.current_block_idx))
         return self.current_block()
diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py
index 69a732fc45a1946f260cdd9a9c2da150b87c3ddd..c47ce82aba7fa5ac42ac26cd25fa3ebc93e96cb2 100644
--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
@@ -188,7 +188,7 @@ def save_inference_model(dirname,
             raise ValueError("'feed_var_names' should be a list of str.")
 
     if isinstance(target_vars, Variable):
-        feeded_var_names = [feeded_var_names]
+        target_vars = [target_vars]
     else:
         if not (bool(target_vars) and all(
                 isinstance(var, Variable) for var in target_vars)):
diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py
index 22a37c22c3fc777cadcdee6632bbf1fb558fef70..acc22bef98b6eac4291bb2181e6d5cd7dbe2a768 100644
--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -16,6 +16,36 @@ __all__ = [
 
 
 def split_lod_tensor(input, mask, level=0):
+    """
+    **split_lod_tensor**
+
+    This function takes in an input that contains the complete lod information,
+    and takes in a mask which is used to mask certain parts of the input.
+    The output is the true branch and the false branch with the mask applied to
+    the input at a certain level in the tensor.
+
+    Args:
+        input(tuple|list|None): The input tensor that contains complete
+                                lod information needed to construct the output.
+        mask(list): A bool column vector which masks the input.
+        level(int): The specific lod level to rank.
+
+    Returns:
+        Variable: The true branch of tensor as per the mask applied to input.
+        Variable: The false branch of tensor as per the mask applied to input.
+
+    Examples:
+        .. code-block:: python
+
+          x = layers.data(name='x', shape=[1])
+          x.persistable = True
+
+          y = layers.data(name='y', shape=[1])
+          y.persistable = True
+
+          out_true, out_false = layers.split_lod_tensor(
+                input=x, mask=y, level=0)
+    """
     helper = LayerHelper('split_lod_tensor', **locals())
     out_true = helper.create_tmp_variable(dtype=input.dtype)
     out_false = helper.create_tmp_variable(dtype=input.dtype)
@@ -32,6 +62,40 @@ def split_lod_tensor(input, mask, level=0):
 
 
 def merge_lod_tensor(in_true, in_false, x, mask, level=0):
+    """
+    **merge_lod_tensor**
+
+    This function takes in an input :math:`x`, the True branch, the False
+    branch and a binary :math:`mask`. Using this information, this function
+    merges the True and False branches of the tensor into a single Output
+    at a certain lod level indicated by :math:`level`.
+
+    Args:
+        in_true(tuple|list|None): The True branch to be merged.
+        in_false(tuple|list|None): The False branch to be merged.
+        x(tuple|list|None): The input tensor that contains complete
+                            lod information needed to construct the output.
+        mask(list): A bool column vector which masks the input.
+        level(int): The specific lod level to rank.
+
+    Returns:
+        Variable: The merged output tensor.
+
+    Examples:
+        .. code-block:: python
+
+          x = layers.data(
+                      name='x', shape=[1], dtype='float32', stop_gradient=False)
+          y = layers.data(
+                name='y', shape=[1], dtype='bool', stop_gradient=False)
+
+          level = 0
+
+          out_true, out_false = layers.split_lod_tensor(
+                input=x, mask=y, level=level)
+          out = layers.merge_lod_tensor(
+                in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
+    """
     helper = LayerHelper('merge_lod_tensor', **locals())
     out = helper.create_tmp_variable(dtype=in_true.dtype)
     helper.append_op(
@@ -397,9 +461,50 @@ class While(object):
 
 
 def lod_rank_table(x, level=0):
-    """
-    This function creates an operator for creating a LOD_RANK_TABLE
-    using the input x.
+    """LoD Rank Table Operator. Given an input variable **x** and a level number
+    of LoD, this layer creates a LoDRankTable object. A LoDRankTable object
+    contains a list of bi-element tuples. Each tuple consists of an index and
+    a length, both of which are int type. Referring to the specified LoD level,
+    the index is the sequence index number and the length represents the
+    sequence length. Please note that the list is ranked in descending order by
+    the length. The following is an example:
+
+        .. code-block:: text
+
+            x is a LoDTensor:
+                x.lod = [[0,                2, 3],
+                         [0,             5, 6, 7]]
+                x.data = [a, b, c, d, e, f, g]
+
+            1. set level to 0:
+                Create lod rank table:
+                    lod_rank_table_obj = lod_rank_table(x, level=0)
+
+                Get:
+                    lod_rank_table_obj.items() = [(0, 2), (1, 1)]
+
+            2. set level to 1:
+                Create lod rank table:
+                    lod_rank_table_obj = lod_rank_table(x, level=1)
+
+                Get:
+                    lod_rank_table_obj.items() = [(0, 5), (1, 1), (2, 1)]
+
+    Args:
+        x (Variable): Input variable, a LoDTensor based on which to create the
+            lod rank table.
+        level (int): Specify the LoD level, on which to create the lod rank
+            table.
+
+    Returns:
+        Variable: The created LoDRankTable object.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10],
+                            dtype='float32', lod_level=1)
+            out = layers.lod_rank_table(x=x, level=0)
     """
     helper = LayerHelper("lod_rank_table", **locals())
     table = helper.create_variable(
@@ -414,9 +519,25 @@ def lod_rank_table(x, level=0):
 
 
 def max_sequence_len(rank_table):
-    """
-    This function creates an operator to calculate the length of
-    max seqence through input rank_table(should be a lod_rank_table)
+    """Max Sequence Len Operator. Given a LoDRankTable object, this layer
+    returns the max length of a batch of sequences. In fact, a LoDRankTable
+    object contains a list of tuples(<sequence index, sequence length>) and
+    the list is already sorted by sequence length in descending order, so the
+    operator just returns the sequence length of the first tuple element.
+
+    Args:
+        rank_table (Variable): Input variable which is a LoDRankTable object.
+
+    Returns:
+        Variable: The max length of sequence.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10],
+                            dtype='float32', lod_level=1)
+            rank_table = layers.lod_rank_table(x=x, level=0)
+            max_seq_len = layers.max_sequence_len(rank_table)
     """
     helper = LayerHelper("max_seqence_len", **locals())
     res = helper.create_tmp_variable(dtype="int64")
@@ -428,6 +549,30 @@ def max_sequence_len(rank_table):
 
 
 def topk(input, k):
+    """
+    **topk**
+
+    This function selects the k largest entries in the input vector and outputs
+    their values and indices as vectors. Thus topk_out[j] is the j-th largest
+    entry in input, and its index is topk_indices[j].
+
+    Args:
+        input (Variable|list): The input tensor that has all the data.
+        k (int): The number of top elements that the function will pick.
+
+    Returns:
+        Variable: The variable of type array that contains the k largest entries
+                  from input.
+        Variable: The variable of type array that contains the indices of k
+                  largest entries from input.
+
+    Examples:
+        .. code-block:: python
+
+          x = fluid.layers.data(name='x', shape=[10])
+          k = 5
+          array = fluid.layers.topk(x, k)
+    """
     helper = LayerHelper('topk', **locals())
     topk_out = helper.create_tmp_variable(dtype=input.data_type)
     topk_indices = helper.create_tmp_variable(dtype='int64')
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 2a462ee6cb47496b17a8f584e56ac2c8934b319a..1c1c09dd28b7563cfb340370a495d7e7c66988cc 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -151,7 +151,7 @@ def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'):
 
     Args:
        input(Variable): Input to the function
-       size(tuple|list|None): Shape of the look up table parameter 
+       size(tuple|list|None): Shape of the look up table parameter
        is_sparse(bool): Boolean flag that specifying whether the input is sparse
        param_attr(ParamAttr): Parameters for this layer
        dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc
@@ -270,6 +270,7 @@ def gru_unit(input,
             attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
 
     # create bias
+
     if bias is None:
         bias_size = [1, 3 * size]
         bias = helper.create_parameter(
@@ -358,7 +359,59 @@ def cos_sim(X, Y, **kwargs):
 
 def cross_entropy(input, label, **kwargs):
     """
-    This function computes cross_entropy using the input and label.
+    **Cross Entropy Layer**
+
+    This layer computes the cross entropy between `input` and `label`. It supports
+    both standard cross-entropy and soft-label cross-entropy loss computation.
+
+    1) One-hot cross-entropy:
+       `soft_label = False`, `Label[i, 0]` indicates the class index for sample i:
+
+        .. math::
+
+            Y[i] = -\log(X[i, Label[i]])
+
+    2) Soft-label cross-entropy:
+       `soft_label = True`, `Label[i, j]` indicates the soft label of class j
+       for sample i:
+
+        .. math::
+
+            Y[i] = \sum_j{-Label[i, j] * \log(X[i, j])}
+
+       Please make sure that in this case the summation of each row of `label`
+       equals one.
+
+    3) One-hot cross-entropy with vectorized `label`:
+       As a special case of 2), when each row of 'label' has only one
+       non-zero element which is equal to 1, soft-label cross-entropy degenerates
+       to a one-hot cross-entropy with one-hot label representation.
+
+    Args:
+        input (Variable|list):  a 2-D tensor with shape [N x D], where N is the
+            batch size and D is the number of classes. This input is a probability
+            computed by the previous operator, which is almost always the result
+            of a softmax operator.
+        label (Variable|list): the ground truth which is a 2-D tensor. When
+              `soft_label` is set to `False`, `label` is a tensor<int64> with shape
+              [N x 1]. When `soft_label` is set to `True`, `label` is a
+              tensor<float/double> with shape [N x D].
+        soft_label (bool, via `**kwargs`): a flag indicating whether to interpret
+              the given labels as soft labels, default `False`.
+
+    Returns:
+         A 2-D tensor with shape [N x 1], the cross entropy loss.
+
+    Raises:
+        `ValueError`: 1) the 1st dimension of `input` and `label` are not equal; 2) when \
+              `soft_label == True`, and the 2nd dimension of `input` and `label` are not \
+               equal; 3) when `soft_label == False`, and the 2nd dimension of `label` is not 1.
+
+    Examples:
+        .. code-block:: python
+
+          predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+          cost = fluid.layers.cross_entropy(input=predict, label=label)
     """
     helper = LayerHelper('cross_entropy', **kwargs)
     out = helper.create_tmp_variable(dtype=input.dtype)
@@ -373,8 +426,36 @@ def cross_entropy(input, label, **kwargs):
 
 def square_error_cost(input, label, **kwargs):
     """
-    This functions returns the squared error cost using the input and label.
-    The output is appending the op to do the above.
+    **Square error cost layer**
+
+    This layer accepts input predictions and target label and returns the squared error cost.
+    For predictions, :math:`X`, and target labels, :math:`Y`, the equation is:
+
+    .. math::
+
+        Out = (X - Y)^2
+
+    In the above equation:
+
+        * :math:`X`: Input predictions, a tensor.
+        * :math:`Y`: Input labels, a tensor.
+        * :math:`Out`: Output value, same shape as :math:`X`.
+
+    Args:
+       input(Variable): Input tensor, has predictions.
+       label(Variable): Label tensor, has target labels.
+
+    Returns:
+        Variable: The tensor variable storing the element-wise squared error difference \
+                  of input and label.
+
+    Examples:
+        .. code-block:: python
+
+          y = layers.data(name='y', shape=[1], dtype='float32')
+          y_predict = layers.data(name='y_predict', shape=[1], dtype='float32')
+          cost = layers.square_error_cost(input=y_predict, label=y)
+
     """
     helper = LayerHelper('square_error_cost', **kwargs)
     minus_out = helper.create_tmp_variable(dtype=input.dtype)
@@ -514,14 +595,83 @@ def conv2d(input,
            groups=None,
            param_attr=None,
            bias_attr=None,
-           act=None,
-           name=None):
+           act=None):
     """
-    This function creates the op for a 2-dimensional Convolution.
-    This is performed using the parameters of filters(size, dimensionality etc)
-    , stride and other configurations for a Convolution operation.
-    This funciton can also append an activation on top of the
-    conv-2d output, if mentioned in the input parameters.
+    **Convolution2D Layer**
+
+    The convolution2D layer calculates the output based on the input, filter
+    and the strides, paddings, dilations and groups parameters. Input(Input) and
+    Output(Output) are in NCHW format, where N is the batch size, C is the number
+    of channels, H is the height of the feature, and W is the width of the feature.
+    For details of the convolution layer, please refer to UFLDL's `convolution
+    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_ .
+    If a bias attribute and an activation type are provided, the bias is added to the
+    output of the convolution and the activation function is applied to the result.
+    For each input :math:`X`, the equation is:
+
+
+    .. math::
+
+        Out = \sigma (W \\ast X + b)
+
+    In the above equation:
+
+        * :math:`X`: Input value, a tensor with NCHW format.
+        * :math:`W`: Filter value, a tensor with MCHW format.
+        * :math:`\\ast`: Convolution operation.
+        * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+        * :math:`\\sigma`: Activation function.
+        * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        Input:
+            Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
+
+            Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
+
+        Output:
+            Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
+        Where
+    .. math::
+
+        H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+        W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+
+    Args:
+        input(Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of filters. It is the same as the number
+            of channels of the output image.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: stride = 1.
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding. Default: padding = 0.
+        groups(int): The groups number of the Conv2d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1
+        param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None
+        bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
+        act(str): Activation type. Default: None
+
+    Returns:
+        Variable: The tensor variable storing the convolution and \
+                  non-linearity activation result.
+
+    Raises:
+        ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
+          conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu")
     """
 
     if stride is None:
@@ -577,9 +727,9 @@ def conv2d(input,
 
 def sequence_pool(input, pool_type, **kwargs):
     """
-    This function add the operator for sequence pooling. 
-    It pools features of all time-steps of each instance, and is applied 
-    on top of the input using pool_type mentioned in the parameters. 
+    This function adds the operator for sequence pooling.
+    It pools features of all time-steps of each instance, and is applied
+    on top of the input using pool_type mentioned in the parameters.
 
     It supports four pool_type:
 
@@ -608,7 +758,7 @@ def sequence_pool(input, pool_type, **kwargs):
 
     Args:
         input(variable): The input variable which is a LoDTensor.
-        pool_type (string): The pooling type of sequence_pool. 
+        pool_type (string): The pooling type of sequence_pool.
             It supports average, sum, sqrt and max.
 
     Returns:
@@ -618,7 +768,7 @@ def sequence_pool(input, pool_type, **kwargs):
 
         .. code-block:: python
 
-             x = fluid.layers.data(name='x', shape=[7, 1], 
+             x = fluid.layers.data(name='x', shape=[7, 1],
                               dtype='float32', lod_level=1)
              avg_x = fluid.layers.sequence_pool(input=x, pool_type='average')
              sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum')
@@ -666,7 +816,7 @@ def sequence_first_step(input, **kwargs):
 
         .. code-block:: python
 
-             x = fluid.layers.data(name='x', shape=[7, 1], 
+             x = fluid.layers.data(name='x', shape=[7, 1],
                               dtype='float32', lod_level=1)
              x_first_step = fluid.layers.sequence_first_step(input=x)
     """
@@ -699,7 +849,7 @@ def sequence_last_step(input, **kwargs):
 
         .. code-block:: python
 
-             x = fluid.layers.data(name='x', shape=[7, 1], 
+             x = fluid.layers.data(name='x', shape=[7, 1],
                               dtype='float32', lod_level=1)
              x_last_step = fluid.layers.sequence_last_step(input=x)
     """
@@ -1018,25 +1168,26 @@ def lstm_unit(x_t,
 
         .. math::
 
-            i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
+            i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i)
 
-            f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
+            f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + b_f)
 
-            c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)
+            c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t + W_{h_c}h_{t-1} + b_c)
 
-            o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
+            o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + b_o)
 
             h_t & = o_t tanh(c_t)
 
-    The inputs of lstm unit includes :math:`x_t`, :math:`h_{t-1}` and
-    :math:`c_{t-1}`. The implementation separates the linear transformation
-    and non-linear transformation apart. Here, we take :math:`i_t` as an
-    example. The linear transformation is applied by calling a `fc` layer and
-    the equation is:
+    The inputs of lstm unit include :math:`x_t`, :math:`h_{t-1}` and
+    :math:`c_{t-1}`. The 2nd dimensions of :math:`h_{t-1}` and :math:`c_{t-1}`
+    should be the same. The implementation separates the linear transformation
+    from the non-linear transformation. Here, we take :math:`i_t` as an example.
+    The linear transformation is applied by calling a `fc` layer and the
+    equation is:
 
         .. math::
 
-            L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i
+            L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i
 
     The non-linear transformation is applied by calling `lstm_unit_op` and the
     equation is:
@@ -1048,9 +1199,12 @@ def lstm_unit(x_t,
     This layer has two outputs including :math:`h_t` and :math:`o_t`.
 
     Args:
-        x_t (Variable): The input value of current step.
-        hidden_t_prev (Variable): The hidden value of lstm unit.
-        cell_t_prev (Variable): The cell value of lstm unit.
+        x_t (Variable): The input value of current step, a 2-D tensor with shape
+            M x N, M for batch size and N for input size.
+        hidden_t_prev (Variable): The hidden value of lstm unit, a 2-D tensor
+            with shape M x S, M for batch size and S for size of lstm unit.
+        cell_t_prev (Variable): The cell value of lstm unit, a 2-D tensor with
+            shape M x S, M for batch size and S for size of lstm unit.
         forget_bias (float): The forget bias of lstm unit.
         param_attr (ParamAttr): The attributes of parameter weights, used to set
             initializer, name etc.
@@ -1063,14 +1217,15 @@ def lstm_unit(x_t,
     Raises:
         ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**\
                 not be 2 or the 1st dimensions of **x_t**, **hidden_t_prev** \
-                and **cell_t_prev** not be the same.
+                and **cell_t_prev** not be the same or the 2nd dimensions of \
+                **hidden_t_prev** and **cell_t_prev** not be the same.
 
     Examples:
 
         .. code-block:: python
 
              x_t = fluid.layers.fc(input=x_t_data, size=10)
-             prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=20)
+             prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=30)
              prev_cell = fluid.layers.fc(input=prev_cell_data, size=30)
              hidden_value, cell_value = fluid.layers.lstm_unit(x_t=x_t,
                                                     hidden_t_prev=prev_hidden,
@@ -1089,7 +1244,11 @@ def lstm_unit(x_t,
 
     if x_t.shape[0] != hidden_t_prev.shape[0] or x_t.shape[
             0] != cell_t_prev.shape[0]:
-        raise ValueError("The 1s dimension of x_t, hidden_t_prev and "
+        raise ValueError("The 1st dimensions of x_t, hidden_t_prev and "
+                         "cell_t_prev must be the same.")
+
+    if hidden_t_prev.shape[1] != cell_t_prev.shape[1]:
+        raise ValueError("The 2nd dimensions of hidden_t_prev and "
                          "cell_t_prev must be the same.")
 
     if bias_attr is None:
@@ -1118,17 +1277,17 @@ def lstm_unit(x_t,
 
 def reduce_sum(input, dim=None, keep_dim=False):
     """
-    Computes the sum of tensor elements over the given dimension. 
+    Computes the sum of tensor elements over the given dimension.
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int|None): The dimension along which the sum is performed. If 
-            :attr:`None`, sum all elements of :attr:`input` and return a 
-            Tensor variable with a single element, otherwise must be in the 
-            range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, 
+        dim (int|None): The dimension along which the sum is performed. If
+            :attr:`None`, sum all elements of :attr:`input` and return a
+            Tensor variable with a single element, otherwise must be in the
+            range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`,
             the dimension to reduce is :math:`rank + dim`.
-        keep_dim (bool): Whether to reserve the reduced dimension in the 
-            output Tensor. The result tensor will have one fewer dimension 
+        keep_dim (bool): Whether to reserve the reduced dimension in the
+            output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
 
     Returns:
@@ -1162,17 +1321,17 @@ def reduce_sum(input, dim=None, keep_dim=False):
 
 def reduce_mean(input, dim=None, keep_dim=False):
     """
-    Computes the mean of tensor elements over the given dimension. 
+    Computes the mean of tensor elements over the given dimension.
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int|None): The dimension along which the mean is computed. If 
-            :attr:`None`, compute the mean over all elements of :attr:`input` 
-            and return a Tensor variable with a single element, otherwise 
-            must be in the range :math:`[-rank(input), rank(input))`. If 
+        dim (int|None): The dimension along which the mean is computed. If
+            :attr:`None`, compute the mean over all elements of :attr:`input`
+            and return a Tensor variable with a single element, otherwise
+            must be in the range :math:`[-rank(input), rank(input))`. If
             :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
-        keep_dim (bool): Whether to reserve the reduced dimension in the 
-            output Tensor. The result tensor will have one fewer dimension 
+        keep_dim (bool): Whether to reserve the reduced dimension in the
+            output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
 
     Returns:
@@ -1206,22 +1365,22 @@ def reduce_mean(input, dim=None, keep_dim=False):
 
 def reduce_max(input, dim=None, keep_dim=False):
     """
-    Computes the maximum of tensor elements over the given dimension. 
+    Computes the maximum of tensor elements over the given dimension.
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int|None): The dimension along which the maximum is computed. 
-            If :attr:`None`, compute the maximum over all elements of 
-            :attr:`input` and return a Tensor variable with a single element, 
-            otherwise must be in the range :math:`[-rank(input), rank(input))`. 
+        dim (int|None): The dimension along which the maximum is computed.
+            If :attr:`None`, compute the maximum over all elements of
+            :attr:`input` and return a Tensor variable with a single element,
+            otherwise must be in the range :math:`[-rank(input), rank(input))`.
             If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
-        keep_dim (bool): Whether to reserve the reduced dimension in the 
-            output Tensor. The result tensor will have one fewer dimension 
+        keep_dim (bool): Whether to reserve the reduced dimension in the
+            output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
 
     Returns:
         Variable: The reduced Tensor variable.
-    
+
     Examples:
         .. code-block:: python
 
@@ -1250,22 +1409,22 @@ def reduce_max(input, dim=None, keep_dim=False):
 
 def reduce_min(input, dim=None, keep_dim=False):
     """
-    Computes the minimum of tensor elements over the given dimension. 
+    Computes the minimum of tensor elements over the given dimension.
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int|None): The dimension along which the minimum is computed. 
-            If :attr:`None`, compute the minimum over all elements of 
-            :attr:`input` and return a Tensor variable with a single element, 
-            otherwise must be in the range :math:`[-rank(input), rank(input))`. 
+        dim (int|None): The dimension along which the minimum is computed.
+            If :attr:`None`, compute the minimum over all elements of
+            :attr:`input` and return a Tensor variable with a single element,
+            otherwise must be in the range :math:`[-rank(input), rank(input))`.
             If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
-        keep_dim (bool): Whether to reserve the reduced dimension in the 
-            output Tensor. The result tensor will have one fewer dimension 
+        keep_dim (bool): Whether to reserve the reduced dimension in the
+            output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
 
     Returns:
         Variable: The reduced Tensor variable.
-    
+
     Examples:
         .. code-block:: python
 
diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py
index e5820d24cd2b34ef53cbb91e2be66efc1b74d315..9ce25a9e0831a49ef3bbc5026181856e6c4cdfcc 100644
--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
@@ -201,15 +201,47 @@ def fill_constant_batch_size_like(input,
 
 def ones(shape, dtype):
     """
-    This function performs the same function as fill_constant() declared above
-    with the constant value being 1.0.
+    **ones**
+
+    This function creates a tensor of specified *shape* and
+    *dtype*, and initializes this with 1.
+
+    It also sets *stop_gradient* to True.
+
+    Args:
+        shape(tuple|list|None): Shape of output tensor
+        dtype(np.dtype|core.DataType|str): Data type of output tensor
+
+    Returns:
+        Variable: The tensor variable storing the output
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.ones(shape=[1], dtype='int64')
     """
     return fill_constant(value=1.0, **locals())
 
 
 def zeros(shape, dtype):
     """
-    This function performs the same function as fill_constant() declared above
-    with the constant value being 0.0.
+    **zeros**
+
+    This function creates a tensor of specified *shape* and
+    *dtype*, and initializes this with 0.
+
+    It also sets *stop_gradient* to True.
+
+    Args:
+        shape(tuple|list|None): Shape of output tensor
+        dtype(np.dtype|core.DataType|str): Data type of output tensor
+
+    Returns:
+        Variable: The tensor variable storing the output
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.zeros(shape=[1], dtype='int64')
     """
     return fill_constant(value=0.0, **locals())
diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py
index c56a531ed531cf0219e94854ba66c7399e003292..ff3e5315a2c2b115e4ba563f60de4139f248e93a 100644
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -1,7 +1,7 @@
 from collections import defaultdict
 
 import framework
-from backward import append_backward_ops
+from backward import append_backward
 from framework import unique_name, program_guard
 from initializer import Constant
 from layer_helper import LayerHelper
@@ -194,10 +194,10 @@ class Optimizer(object):
                  no_grad_set=None):
         """Add operations to minimize `loss` by updating `parameter_list`.
 
-        This method combines interface `append_backward_ops()` and
+        This method combines interface `append_backward()` and
         `create_optimization_pass()` into one.
         """
-        params_grads = append_backward_ops(loss, parameter_list, no_grad_set)
+        params_grads = append_backward(loss, parameter_list, no_grad_set)
 
         params_grads = append_gradient_clip_ops(params_grads)
 
diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
index c3591a613acafb268a5bd70618cd4555450bac29..8acd470c5ed5fa8eeda396f1e9182db4ecdd7016 100644
--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
@@ -170,7 +170,7 @@ def main():
 
     exe.run(fluid.default_startup_program())
 
-    embedding_param = fluid.g_scope.find_var(embedding_name).get_tensor()
+    embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor()
     embedding_param.set(
         load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place)
 
diff --git a/python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py
similarity index 76%
rename from python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py
rename to python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py
index 2680502efb91061be37a77fbe5b451960fdd15f7..20b4a8b34cd085ae51e6169f0d4eac58b7f3ffb2 100644
--- a/python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py
@@ -38,35 +38,43 @@ train_reader = paddle.batch(
 
 place = fluid.CPUPlace()
 exe = fluid.Executor(place)
+
 t = fluid.DistributeTranspiler()
+# list of all parameter server endpoints, used for splitting parameters
 pserver_endpoints = os.getenv("PSERVERS")
+# server endpoint for current node
+current_endpoint = os.getenv("SERVER_ENDPOINT")
+# run as trainer or parameter server
 training_role = os.getenv("TRAINING_ROLE",
                           "TRAINER")  # get the training role: trainer/pserver
-t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=1)
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
 
 if training_role == "PSERVER":
-    pserver_prog = t.get_pserver_program(pserver_endpoints, optimize_ops)
+    if not current_endpoint:
+        print("need env SERVER_ENDPOINT")
+        exit(1)
+    pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
     exe.run(fluid.default_startup_program())
     exe.run(pserver_prog)
 elif training_role == "TRAINER":
+    trainer_prog = t.get_trainer_program()
     feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
     exe.run(fluid.default_startup_program())
 
     for pass_id in range(PASS_NUM):
         accuracy.reset(exe)
+        batch_id = 0
         for data in train_reader():
-            loss, acc = exe.run(fluid.default_main_program(),
+            loss, acc = exe.run(trainer_prog,
                                 feed=feeder.feed(data),
                                 fetch_list=[avg_cost] + accuracy.metrics)
             pass_acc = accuracy.eval(exe)
-            # print loss, acc
-            if loss < 10.0 and pass_acc > 0.9:
-                # if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good.
-                exit(0)
+            if batch_id % 100 == 0:
+                print("batch_id %d, loss: %f, acc: %f" %
+                      (batch_id, loss, pass_acc))
+            batch_id += 1
 
         pass_acc = accuracy.eval(exe)
         print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
 else:
     print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
-
-exit(1)
diff --git a/python/paddle/v2/fluid/tests/decorators.py b/python/paddle/v2/fluid/tests/decorators.py
new file mode 100644
index 0000000000000000000000000000000000000000..154619b0e93455922700a12d734967b4d20c4f13
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/decorators.py
@@ -0,0 +1,50 @@
+import functools
+
+import paddle.v2.fluid as fluid
+
+__all__ = ['many_times', 'prog_scope']
+
+
+def many_times(times):
+    """
+    Build a decorator that makes every call of the decorated function run
+    it `times` times in a row with the same arguments.
+
+    The results of the individual runs are discarded; the wrapper returns
+    None.
+    """
+
+    def __impl__(fn):
+        @functools.wraps(fn)  # keep fn's name and docstring on the wrapper
+        def __fn__(*args, **kwargs):
+            for _ in range(times):
+                fn(*args, **kwargs)
+
+        return __fn__
+
+    return __impl__
+
+
+def prog_scope():
+    """
+    Build a decorator that runs the decorated function inside a new
+    `fluid.core.Scope` (via `fluid.scope_guard`) and a new pair of
+    `fluid.Program` objects (via `fluid.program_guard`).
+
+    All three objects are created anew on every call. The result of the
+    decorated function is discarded; the wrapper returns None.
+    """
+
+    def __impl__(fn):
+        @functools.wraps(fn)  # keep fn's name and docstring on the wrapper
+        def __fn__(*args, **kwargs):
+            prog = fluid.Program()
+            startup_prog = fluid.Program()
+            scope = fluid.core.Scope()
+            with fluid.scope_guard(scope):
+                with fluid.program_guard(prog, startup_prog):
+                    fn(*args, **kwargs)
+
+        return __fn__
+
+    return __impl__
diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py
index 8dbfbd547a6677517f028997e6269709aac43b67..b77d2b1268f27c5ec3c34839aaad9b75f0132c2e 100644
--- a/python/paddle/v2/fluid/tests/op_test.py
+++ b/python/paddle/v2/fluid/tests/op_test.py
@@ -4,7 +4,7 @@ import random
 import itertools
 import paddle.v2.fluid.core as core
 import collections
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 from paddle.v2.fluid.op import Operator
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.framework import Program, OpProtoHolder
@@ -491,7 +491,7 @@ class OpTest(unittest.TestCase):
             op_loss.desc.infer_var_type(block.desc)
             op_loss.desc.infer_shape(block.desc)
 
-        param_grad_list = append_backward_ops(
+        param_grad_list = append_backward(
             loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set)
 
         feed_dict = {
diff --git a/python/paddle/v2/fluid/tests/test_adam_op.py b/python/paddle/v2/fluid/tests/test_adam_op.py
index a0d6655d4cbcff8ed3d55df0f4e68fc6591fbb11..7dbc2fa0858a68c5da9e8d48dcb187494357e940 100644
--- a/python/paddle/v2/fluid/tests/test_adam_op.py
+++ b/python/paddle/v2/fluid/tests/test_adam_op.py
@@ -1,6 +1,8 @@
 import unittest
 import numpy as np
 from op_test import OpTest
+from paddle.v2.fluid import core
+from paddle.v2.fluid.op import Operator
 
 
 class TestAdamOp1(OpTest):
@@ -176,5 +178,124 @@ def adam_step(inputs, attributes):
     return param_out, moment1_out, moment2_out
 
 
+def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
+    '''
+    Simulate one step of the adam optimizer for a sparse gradient
+    :param inputs: dict of inputs
+    :param attributes: dict of attributes
+    :param height, row_numel: row count and row width of the outputs
+    :param rows, np_grad: np_grad[i] is the gradient of row rows[i]
+    :return tuple: output param, moment1, moment2 (zero outside `rows`)
+    '''
+    param = inputs['Param']
+    moment1 = inputs['Moment1']
+    moment2 = inputs['Moment2']
+    lr = inputs['LearningRate']
+    beta1_pow = inputs['Beta1Pow']
+    beta2_pow = inputs['Beta2Pow']
+
+    beta1 = attributes['beta1']
+    beta2 = attributes['beta2']
+    epsilon = attributes['epsilon']
+
+    moment1_out = np.zeros(shape=[height, row_numel])
+    moment2_out = np.zeros(shape=[height, row_numel])
+    param_out = np.zeros(shape=[height, row_numel])
+
+    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)  # same for every row
+    for idx, row_id in enumerate(rows):
+        moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1
+                                                         ) * np_grad[idx]
+        moment2_out[row_id] = beta2 * moment2[row_id] + (
+            1 - beta2) * np.square(np_grad[idx])
+        param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / (
+            np.sqrt(moment2_out[row_id]) + epsilon))
+    return param_out, moment1_out, moment2_out
+
+
+class TestSparseAdamOp(unittest.TestCase):
+    """Checks the adam op when Grad is a sparse SelectedRows variable."""
+
+    def setup(self, scope, place):
+        # Put the sparse Grad into `scope`; keep op inputs and expected outputs.
+        beta1 = 0.78
+        beta2 = 0.836
+        epsilon = 1e-4
+
+        height = 10
+        self.rows = rows = [0, 4, 7]
+        self.row_numel = row_numel = 12
+        self.dense_inputs = {
+            "Param": np.full((height, row_numel), 5.0).astype("float32"),
+            "Moment1": np.full((height, row_numel), 5.0).astype("float32"),
+            "Moment2": np.full((height, row_numel), 5.0).astype("float32"),
+            'Beta1Pow': np.array([beta1**10]).astype("float32"),
+            'Beta2Pow': np.array([beta2**10]).astype("float32"),
+            "LearningRate": np.full((1), 2.0).astype("float32")
+        }
+        self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
+
+        grad_selected_rows = scope.var('Grad').get_selected_rows()
+        grad_selected_rows.set_height(height)
+        grad_selected_rows.set_rows(rows)
+        np_array = np.ones((len(rows), row_numel)).astype("float32")
+        np_array[0, 0] = 2.0
+        np_array[2, 8] = 4.0
+
+        grad_tensor = grad_selected_rows.get_tensor()
+        grad_tensor.set(np_array, place)
+
+        self.sparse_inputs = ["Grad"]
+
+        param_out, mom1, mom2 = adam_step_sparse(
+            self.dense_inputs, self.attrs, height, rows, row_numel, np_array)
+        self.outputs = {
+            "ParamOut": param_out,
+            "Moment1Out": mom1,
+            "Moment2Out": mom2
+        }
+
+    def check_with_place(self, place):
+        scope = core.Scope()
+        self.setup(scope, place)
+
+        op_args = dict()
+        for key, np_array in self.dense_inputs.items():
+            var = scope.var(key).get_tensor()
+            var.set(np_array, place)
+            op_args[key] = key
+        for s in self.sparse_inputs:
+            op_args[s] = s
+        for s in self.outputs:
+            var = scope.var(s).get_tensor()
+            var.set(self.outputs[s], place)
+            op_args[s] = s
+        op_args.update(self.attrs)
+
+        # create and run adam operator
+        adam_op = Operator("adam", **op_args)
+        adam_op.run(scope, place)
+
+        for key, np_array in self.outputs.items():
+            out_var = scope.var(key).get_tensor()
+            actual = np.array(out_var)
+            actual = actual.reshape([actual.size])
+            np_array = np_array.reshape([np_array.size])
+            # only the rows listed in the sparse gradient are compared
+            for row_id in self.rows:
+                for j in range(self.row_numel):
+                    pos = row_id * self.row_numel + j
+                    # abs(): a negative deviation must fail the check too
+                    diff = abs(actual[pos] - np_array[pos])
+                    self.assertLess(diff / abs(actual[pos]), 0.00001)
+
+    def test_sparse_sgd(self):
+        places = [core.CPUPlace()]
+        if core.is_compile_gpu():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.check_with_place(place)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_array_read_write_op.py b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
index f6120aedecf1015c279b8f218f5e37f2e598ab91..01321de8eac34d562d99726b1f4125d1932ab40f 100644
--- a/python/paddle/v2/fluid/tests/test_array_read_write_op.py
+++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
@@ -2,7 +2,7 @@ import unittest
 import paddle.v2.fluid.core as core
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 from paddle.v2.fluid.framework import default_main_program
 import numpy
 
@@ -64,7 +64,7 @@ class TestArrayReadWrite(unittest.TestCase):
         total_sum = layers.sums(input=[a_sum, x_sum])
         total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0)
 
-        append_backward_ops(total_sum_scaled)
+        append_backward(total_sum_scaled)
 
         g_vars = map(default_main_program().global_block().var,
                      [each_x.name + "@GRAD" for each_x in x])
diff --git a/python/paddle/v2/fluid/tests/test_conditional_block.py b/python/paddle/v2/fluid/tests/test_conditional_block.py
index 2b9d8f351a2836cd723d629d4790de1e068d0ea3..7d815123f3454d1457f59202219f9a93bf3d8c31 100644
--- a/python/paddle/v2/fluid/tests/test_conditional_block.py
+++ b/python/paddle/v2/fluid/tests/test_conditional_block.py
@@ -3,7 +3,7 @@ import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.framework import default_startup_program, default_main_program
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 import numpy
 
 
@@ -26,7 +26,7 @@ class ConditionalBlock(unittest.TestCase):
         outs = exe.run(feed={'X': x}, fetch_list=[out])[0]
         print outs
         loss = layers.mean(x=out)
-        append_backward_ops(loss=loss)
+        append_backward(loss=loss)
         outs = exe.run(
             feed={'X': x},
             fetch_list=[
diff --git a/python/paddle/v2/fluid/tests/test_detection_output_op.py b/python/paddle/v2/fluid/tests/test_detection_output_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..080a9743b0182cb7e6dd0030fc306a7f82510a05
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_detection_output_op.py
@@ -0,0 +1,57 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestUnpoolOp(OpTest):
+    """Checks the "detection_output" op against one hard-coded sample.
+
+    NOTE(review): the class name mentions "Unpool" although op_type is
+    "detection_output"; the name is left as is to keep the test id stable.
+    """
+
+    def init_test_case(self):
+        self.num_classes = 2
+        self.top_k = 10
+        self.nms_top_k = 20
+        self.background_label_id = 0
+        self.nms_threshold = 0.01
+        self.confidence_threshold = 0.01
+
+    def setUp(self):
+        self.op_type = "detection_output"
+        self.init_test_case()
+
+        # loc.shape is (1, 4, 4, 1, 1); every entry equals 0.1
+        loc = np.full((1, 4, 4, 1, 1), 0.1)
+        # conf.shape is (1, 4, 2, 1, 1)
+        conf = np.array([[0.1, 0.9], [0.2, 0.8], [0.3, 0.7], [0.4, 0.6]])
+        conf = conf.reshape((1, 4, 2, 1, 1))
+        priorbox = np.array([
+            0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.6, 0.6, 0.1,
+            0.1, 0.2, 0.2, 0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2, 0.4, 0.4,
+            0.8, 0.8, 0.1, 0.1, 0.2, 0.2
+        ])
+        expected = np.array([
+            0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031
+        ])
+        self.inputs = {
+            'Loc': loc.astype('float32'),
+            'Conf': conf.astype('float32'),
+            'PriorBox': priorbox.astype('float32')
+        }
+        self.attrs = dict(
+            num_classes=self.num_classes,
+            top_k=self.top_k,
+            nms_top_k=self.nms_top_k,
+            background_label_id=self.background_label_id,
+            nms_threshold=self.nms_threshold,
+            confidence_threshold=self.confidence_threshold)
+        self.outputs = {'Out': expected.astype('float32')}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py
new file mode 100644
index 0000000000000000000000000000000000000000..c02c59284e1ca2e28ba2f6c5ec13b241c15fc288
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py
@@ -0,0 +1,347 @@
+import numpy
+import random
+import collections
+import paddle.v2.fluid as fluid
+import unittest
+from decorators import *
+
+
+class Memory(object):
+    """State carried between steps: ``ex`` is the previous value, ``cur``
+    the value staged by update() until next() promotes it to ``ex``."""
+
+    def __init__(self, shape, dtype='float32'):
+        self.ex = numpy.zeros(shape=shape, dtype=dtype)
+        self.cur = None
+
+    def update(self, val):
+        assert val.shape == self.ex.shape
+        assert val.dtype == self.ex.dtype
+        self.cur = val
+
+    def next(self):  # method the Python 2 next() builtin looks up
+        self.ex = self.cur
+        self.cur = None
+
+    def __next__(self):  # Python 3 spelling; delegates to next()
+        self.next()
+
+    def reset(self):
+        self.ex = numpy.zeros(shape=self.ex.shape, dtype=self.ex.dtype)
+        self.cur = None
+
+
+class Output(object):  # step outputs, stored as one list per sequence
+    def __init__(self):
+        self.outs = []
+
+    def next_sequence(self):  # start an empty list for a new sequence
+        self.outs.append([])
+
+    def out(self, val):  # record val in the most recent sequence
+        self.outs[-1].append(val)
+
+    def last(self):  # newest value of the most recent sequence
+        return self.outs[-1][-1]
+
+
+class BaseRNN(object):  # numpy reference RNN; subclasses implement step()
+    def __init__(self, ins, mems, params, outs, num_seq=5, max_seq_len=15):
+        self.num_seq = num_seq
+        self.inputs = collections.defaultdict(list)
+        # inputs[name][seq_id][step_id]: one random array per time step.
+        for _ in xrange(num_seq):
+            seq_len = random.randint(1, max_seq_len - 1)
+            for iname in ins:
+                ishape = ins[iname].get('shape', None)
+                idtype = ins[iname].get('dtype', 'float32')
+                lst = []
+                for _ in xrange(seq_len):
+                    lst.append(numpy.random.random(size=ishape).astype(idtype))
+                self.inputs[iname].append(lst)
+        # One zero-initialised Memory per entry of mems.
+        self.mems = dict()
+        for mname in mems:
+            mshape = mems[mname].get('shape', None)
+            mdtype = mems[mname].get('dtype', 'float32')
+            self.mems[mname] = Memory(shape=mshape, dtype=mdtype)
+        # Parameters are random arrays of the requested shape and dtype.
+        self.params = dict()
+        for pname in params:
+            pshape = params[pname].get('shape', None)
+            pdtype = params[pname].get('dtype', 'float32')
+            self.params[pname] = numpy.random.random(size=pshape).astype(pdtype)
+        # One Output collector per name in outs.
+        self.outputs = dict()
+
+        for oname in outs:
+            self.outputs[oname] = Output()
+
+    def step(self, **kwargs):  # one time step; subclasses override
+        raise NotImplementedError()
+
+    def exe(self):  # returns {output name: array of last-step outputs}
+        retv = dict()
+        for out in self.outputs:
+            retv[out] = []
+        # Every sequence starts from reset memories and a fresh output list.
+        for seq_id in xrange(self.num_seq):
+            for mname in self.mems:
+                self.mems[mname].reset()
+            for out in self.outputs:
+                self.outputs[out].next_sequence()
+            # The sequence length is read from the first input name.
+            iname0 = self.inputs.keys()[0]
+            seq_len = len(self.inputs[iname0][seq_id])
+
+            for step_id in xrange(seq_len):
+                xargs = dict()  # keyword arguments passed to step()
+
+                for iname in self.inputs:
+                    xargs[iname] = self.inputs[iname][seq_id][step_id]
+
+                for mname in self.mems:
+                    xargs[mname] = self.mems[mname]
+
+                for pname in self.params:
+                    xargs[pname] = self.params[pname]
+
+                for out in self.outputs:
+                    xargs[out] = self.outputs[out]
+
+                self.step(**xargs)
+                # Move each memory's new value into "ex" for the next step.
+                for mname in self.mems:
+                    next(self.mems[mname])
+            # Only the final step's output of each sequence is returned.
+            for out in self.outputs:
+                retv[out].append(self.outputs[out].last())
+
+        for out in retv:
+            retv[out] = numpy.array(retv[out])
+        return retv
+
+    def to_feed(self, place):  # returns {name: Tensor or parameter array}
+        feed_dict = dict()
+        # Concatenate all steps of an input; lod holds cumulative sequence ends.
+        for iname in self.inputs:
+            lod = [0]
+            np_flatten = []
+            for seq_id in xrange(len(self.inputs[iname])):
+                seq_len = len(self.inputs[iname][seq_id])
+                lod.append(lod[-1] + seq_len)
+                np_flatten.extend(self.inputs[iname][seq_id])
+
+            t = fluid.Tensor()
+            t.set(numpy.array(np_flatten), place)
+            t.set_lod([lod])
+            feed_dict[iname] = t
+
+        for pname in self.params:
+            feed_dict[pname] = self.params[pname]
+        return feed_dict
+
+    def get_numeric_gradient_of_param(self, param_name, delta=0.001):
+        p = self.params[param_name]
+        if len(p.shape) != 2:
+            raise ValueError("Not support get numeric gradient of an parameter,"
+                             " which is not matrix")
+        g = numpy.zeros(shape=p.shape, dtype=p.dtype)
+        # Central difference of _exe_mean_out_() per entry; p is restored.
+        for i in xrange(p.shape[0]):
+            for j in xrange(p.shape[1]):
+                o = p[i][j]
+                p[i][j] += delta
+                pos = self._exe_mean_out_()
+                p[i][j] -= 2 * delta
+                neg = self._exe_mean_out_()
+                p[i][j] = o
+                g[i][j] = (pos - neg) / (delta * 2)
+        return g
+
+    def get_numeric_gradient_of_input(self,
+                                      input_name,
+                                      delta=0.001,
+                                      return_one_tensor=True):
+        ipt = self.inputs[input_name]
+        grad = []
+        # Central difference of _exe_mean_out_() per element of each 1-D step.
+        for seq in ipt:
+            seq_grad = []
+            for item in seq:
+                item_grad = numpy.zeros(shape=item.shape, dtype=item.dtype)
+                if len(item.shape) != 1:
+                    raise ValueError("Not support")
+
+                for i in xrange(len(item)):
+                    o = item[i]
+                    item[i] += delta
+                    pos = self._exe_mean_out_()
+                    item[i] -= 2 * delta
+                    neg = self._exe_mean_out_()
+                    item[i] = o
+                    item_grad[i] = (pos - neg) / (delta * 2)
+                seq_grad.append(item_grad)
+            grad.append(seq_grad)
+
+        if not return_one_tensor:
+            return grad
+        # Concatenate steps, then sequences, into one array.
+        for i in xrange(len(grad)):
+            grad[i] = numpy.concatenate(grad[i])
+        grad = numpy.concatenate(grad)
+        return grad
+
+    def _exe_mean_out_(self):  # scalar: mean of the per-output means of exe()
+        outs = self.exe()
+        return numpy.array([o.mean() for o in outs.itervalues()]).mean()
+
+
+class TestSimpleMul(unittest.TestCase):
+    DATA_NAME = 'X'
+    DATA_WIDTH = 32
+    PARAM_NAME = 'W'
+    HIDDEN_WIDTH = 10
+    OUT_NAME = 'Out'
+
+    class SimpleMul(BaseRNN):  # numpy reference: matmul(X, W) per step
+        def __init__(self):
+            base = TestSimpleMul
+            super(base.SimpleMul, self).__init__({
+                base.DATA_NAME: {
+                    'shape': [base.DATA_WIDTH]
+                }
+            }, {}, {
+                base.PARAM_NAME: {
+                    'shape': [base.DATA_WIDTH, base.HIDDEN_WIDTH]
+                }
+            }, [base.OUT_NAME])
+
+        def step(self, X, W, Out):
+            Out.out(numpy.matmul(X, W))
+
+    # Run many times locally to make sure no random seed can break CI.
+    # @many_times(10)
+    @prog_scope()
+    def test_forward_backward(self):
+        py_rnn = TestSimpleMul.SimpleMul()
+        dat = fluid.layers.data(
+            name=self.DATA_NAME, shape=[self.DATA_WIDTH], lod_level=1)
+        dat.stop_gradient = False
+        # Fluid counterpart of SimpleMul.step(): a bias-free fc per time step.
+        rnn = fluid.layers.DynamicRNN()
+        with rnn.block():
+            d = rnn.step_input(dat)
+            o = fluid.layers.fc(input=d,
+                                param_attr=self.PARAM_NAME,
+                                bias_attr=False,
+                                size=self.HIDDEN_WIDTH,
+                                act=None)
+            rnn.output(o)
+        # The loss is the mean over the 'last'-pooled RNN output.
+        out = rnn()
+        out = fluid.layers.sequence_pool(out, pool_type='last')
+        loss = fluid.layers.mean(x=out)
+        fluid.backward.append_backward(loss)
+
+        cpu = fluid.CPUPlace()
+        exe = fluid.Executor(cpu)
+        out, w_g, i_g = map(numpy.array,
+                            exe.run(feed=py_rnn.to_feed(cpu),
+                                    fetch_list=[
+                                        out, self.PARAM_NAME + "@GRAD",
+                                        self.DATA_NAME + "@GRAD"
+                                    ],
+                                    return_numpy=False))
+        out_by_python = py_rnn.exe()[self.OUT_NAME]
+        self.assertTrue(numpy.allclose(out, out_by_python))
+        w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME)
+        self.assertTrue(numpy.allclose(w_g_num, w_g, rtol=0.05))
+        i_g_num = py_rnn.get_numeric_gradient_of_input(
+            input_name=self.DATA_NAME)
+        i_g_num = i_g_num.reshape(i_g.shape)
+        self.assertTrue(numpy.allclose(i_g_num, i_g, rtol=0.05))
+
+
+class TestSimpleMulWithMemory(unittest.TestCase):
+    DATA_WIDTH = 32
+    HIDDEN_WIDTH = 20
+    DATA_NAME = 'X'
+    PARAM_NAME = 'W'
+
+    class SimpleMulWithMemory(BaseRNN):  # reference: matmul(X, W) + Mem.ex
+        def __init__(self):
+            super(TestSimpleMulWithMemory.SimpleMulWithMemory, self).__init__({
+                TestSimpleMulWithMemory.DATA_NAME: {
+                    'shape': [TestSimpleMulWithMemory.DATA_WIDTH]
+                }
+            }, {'Mem': {
+                'shape': [TestSimpleMulWithMemory.HIDDEN_WIDTH]
+            }}, {
+                TestSimpleMulWithMemory.PARAM_NAME: {
+                    'shape': [
+                        TestSimpleMulWithMemory.DATA_WIDTH,
+                        TestSimpleMulWithMemory.HIDDEN_WIDTH
+                    ]
+                }
+            }, ['Out'])
+
+        def step(self, X, Mem, W, Out):
+            o = numpy.matmul(X, W)
+            assert isinstance(Mem, Memory)
+            o += Mem.ex
+            Mem.update(o)
+            assert isinstance(Out, Output)
+            Out.out(o)
+
+    # many_times is for local debugging: it checks the calculation is stable.
+    # @many_times(10)
+    @prog_scope()
+    def test_forward_backward(self):
+        py_rnn = TestSimpleMulWithMemory.SimpleMulWithMemory()
+        data = fluid.layers.data(
+            name=self.DATA_NAME, shape=[self.DATA_WIDTH], lod_level=1)
+        data.stop_gradient = False
+        rnn = fluid.layers.DynamicRNN()
+        with rnn.block():
+            d = rnn.step_input(data)
+            mem = rnn.memory(value=0.0, shape=[self.HIDDEN_WIDTH])
+            hidden = fluid.layers.fc(input=d,
+                                     size=self.HIDDEN_WIDTH,
+                                     param_attr=self.PARAM_NAME,
+                                     bias_attr=False,
+                                     act=None)
+            o = fluid.layers.elementwise_add(x=hidden, y=mem)
+            rnn.update_memory(mem, o)
+            rnn.output(o)
+        # The loss is the mean over the 'last'-pooled RNN output.
+        out = rnn()
+        last = fluid.layers.sequence_pool(input=out, pool_type='last')
+        loss = fluid.layers.mean(x=last)
+        fluid.backward.append_backward(loss)
+
+        cpu = fluid.CPUPlace()
+        exe = fluid.Executor(cpu)
+        feed = py_rnn.to_feed(cpu)
+        last_np, w_g, i_g = map(numpy.array,
+                                exe.run(feed=feed,
+                                        fetch_list=[
+                                            last, self.PARAM_NAME + "@GRAD",
+                                            self.DATA_NAME + "@GRAD"
+                                        ],
+                                        return_numpy=False))
+        last_by_py, = py_rnn.exe().values()
+        w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME)
+        self.assertTrue(numpy.allclose(last_np, last_by_py))
+
+        self.assertTrue(numpy.allclose(w_g_num, w_g, rtol=0.1))
+        i_g_num = py_rnn.get_numeric_gradient_of_input(self.DATA_NAME)
+        i_g_num = i_g_num.reshape(i_g.shape)
+
+        # This RNN performs many float additions, so the numbers can be
+        # unstable; hence the loose rtol = 0.1.
+        self.assertTrue(numpy.allclose(i_g_num, i_g, rtol=0.1))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
index 9d2dcca56dd1361b9e2448be9f1d5403f8ee17e3..77f0f11f1bcd5fa88700a33eec5a2abc2666ed02 100644
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -177,8 +177,8 @@ class TestBook(unittest.TestCase):
                 name='x_t_data', shape=[10, 10], dtype='float32')
             x_t = layers.fc(input=x_t_data, size=10)
             prev_hidden_data = layers.data(
-                name='prev_hidden_data', shape=[10, 20], dtype='float32')
-            prev_hidden = layers.fc(input=prev_hidden_data, size=20)
+                name='prev_hidden_data', shape=[10, 30], dtype='float32')
+            prev_hidden = layers.fc(input=prev_hidden_data, size=30)
             prev_cell_data = layers.data(
                 name='prev_cell', shape=[10, 30], dtype='float32')
             prev_cell = layers.fc(input=prev_cell_data, size=30)
diff --git a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
index 5fdabbcf889448114ac4e55e7944cb6c57ba5f3c..c552cb033f1ec8f5843490083edee7b2762b5703 100644
--- a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
+++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
@@ -4,7 +4,7 @@ import numpy
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.framework import Program, program_guard
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 
 
 class TestCPULoDTensorArrayOps(unittest.TestCase):
@@ -170,7 +170,7 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
 
             mean = layers.mean(x=result)
 
-            append_backward_ops(mean)
+            append_backward(mean)
 
         tensor = core.LoDTensor()
         tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place)
diff --git a/python/paddle/v2/fluid/tests/test_norm_op.py b/python/paddle/v2/fluid/tests/test_norm_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d56320489b24c5547e045cb51b778851ff94a32
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_norm_op.py
@@ -0,0 +1,57 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def norm(input, scale, epsilon):
+    """Numpy reference used to build the expected output of the norm test.
+
+    Each value is divided by sqrt(sum over axis 1 of input**2 + epsilon),
+    taken at the same batch index and spatial position, and multiplied by
+    the scale entry of its own channel.
+
+    Args:
+        input: 4-D array; axis 1 is the axis that is normalized.
+        scale: array holding one factor per index along axis 1.
+        epsilon: added to the sum of squares before the square root.
+
+    Returns:
+        Array of the same shape as ``input``.
+    """
+    s0, s1, s2, s3 = input.shape  # also rejects inputs that are not 4-D
+    square_sum = (input * input).sum(axis=1, keepdims=True) + epsilon
+    inv_norm = np.reciprocal(np.sqrt(square_sum))
+    # Broadcast scale along axis 1. The tile()+reshape() used before only
+    # lined up with the channels when every scale entry was equal, and the
+    # per-sample loop left ``out`` unbound for an empty batch.
+    channel_scale = np.asarray(scale).reshape(1, s1, 1, 1)
+    return input * inv_norm * channel_scale
+
+
+class TestNormOp(OpTest):
+    """Checks the "norm" op against the numpy reference norm()."""
+
+    def init_test_case(self):
+        self.shape = [2, 3, 2, 2]
+        self.epsilon = 1e-6
+
+    def setUp(self):
+        self.op_type = "norm"
+        self.init_test_case()
+        x = np.random.random(self.shape).astype("float32")
+        scale = np.array([10, 10, 10])
+        expected = norm(x, scale, self.epsilon)
+        self.inputs = {'X': x.astype('float32'),
+                       'Scale': scale.astype('float32')}
+        self.attrs = {'epsilon': self.epsilon}
+        self.outputs = {'Out': expected.astype('float32')}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_optimizer.py b/python/paddle/v2/fluid/tests/test_optimizer.py
index 29694be58bce0eb41b05439da35ef07a542ef12a..1eadb7d912629024ee21e30b0a5fa4910bb96e06 100644
--- a/python/paddle/v2/fluid/tests/test_optimizer.py
+++ b/python/paddle/v2/fluid/tests/test_optimizer.py
@@ -2,7 +2,7 @@ import unittest
 
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.optimizer as optimizer
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 
 
 class TestOptimizer(unittest.TestCase):
@@ -102,7 +102,7 @@ class TestMomentumOptimizer(unittest.TestCase):
             dtype="float32", shape=[1], lod_level=0, name="mean.out")
         block.append_op(
             type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
         opts = momentum_optimizer.create_optimization_pass(
@@ -151,7 +151,7 @@ class TestMomentumOptimizer(unittest.TestCase):
         learning_rate = 0.01
         momentum_optimizer = self.MockMomentum(
             learning_rate=learning_rate, momentum=0.2, use_nesterov=True)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
         opts = momentum_optimizer.create_optimization_pass(
@@ -209,7 +209,7 @@ class TestAdagradOptimizer(unittest.TestCase):
         learning_rate = 0.01
         adagrad_optimizer = self.MockAdagrad(
             learning_rate=learning_rate, epsilon=1.0e-6)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
         opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
@@ -269,7 +269,7 @@ class TestAdamOptimizer(unittest.TestCase):
         learning_rate = 0.01
         adam_optimizer = self.MockAdam(
             learning_rate=learning_rate, beta1=0.9, beta2=0.999)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
         opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
@@ -331,7 +331,7 @@ class TestAdamaxOptimizer(unittest.TestCase):
         learning_rate = 0.01
         adamax_optimizer = self.MockAdamax(
             learning_rate=learning_rate, beta1=0.9, beta2=0.999)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
         opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
@@ -390,7 +390,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
         learning_rate = 0.01
         decayed_adagrad_optimizer = self.MockDecayedAdagrad(
             learning_rate=learning_rate, decay=0.95, epsilon=1.0e-6)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
         opts = decayed_adagrad_optimizer.create_optimization_pass(
diff --git a/python/paddle/v2/fluid/tests/test_recurrent_op.py b/python/paddle/v2/fluid/tests/test_recurrent_op.py
index e38c763ddbcc5c8410f41d062c05499333a3ee55..84f4e36fa7312fbcb96cc66ff26e234c3016df30 100644
--- a/python/paddle/v2/fluid/tests/test_recurrent_op.py
+++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py
@@ -3,7 +3,7 @@ import unittest
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.framework import Program, grad_var_name
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 import numpy as np
 import paddle.v2.fluid.core as core
 
@@ -177,7 +177,7 @@ class RecurrentOpTest1(unittest.TestCase):
     def test_backward(self):
         self.check_forward()
 
-        append_backward_ops(self.output)
+        append_backward(self.output)
 
         ana_grad = [np.array(x) for x in self.backward()]
 
diff --git a/python/paddle/v2/fluid/tests/test_regularizer.py b/python/paddle/v2/fluid/tests/test_regularizer.py
index 24baf55e90c98f39bab926e8c85a791eee5ed4a4..890c881a126a32344128652691c6cad45e02e82d 100644
--- a/python/paddle/v2/fluid/tests/test_regularizer.py
+++ b/python/paddle/v2/fluid/tests/test_regularizer.py
@@ -3,7 +3,7 @@ import unittest
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.optimizer as optimizer
 import paddle.v2.fluid.regularizer as regularizer
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 
 
 class TestL2DecayRegularizer(unittest.TestCase):
@@ -33,7 +33,7 @@ class TestL2DecayRegularizer(unittest.TestCase):
             dtype="float32", shape=[1], lod_level=0, name="mean.out")
         block.append_op(
             type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         count_ops = len(block.ops)
         params_grads = optimizer.append_regularization_ops(params_grads)
@@ -70,7 +70,7 @@ class TestL1DecayRegularizer(unittest.TestCase):
             dtype="float32", shape=[1], lod_level=0, name="mean.out")
         block.append_op(
             type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         count_ops = len(block.ops)
         params_grads = optimizer.append_regularization_ops(params_grads)
diff --git a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
index 8f5774835e02191a068e86ea56f3f877c464a391..7c136f6360ce73a7c532b5486e544796e6853bcb 100644
--- a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
+++ b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
@@ -12,7 +12,7 @@ class TestReorderLoDTensor(unittest.TestCase):
         new_dat = fluid.layers.reorder_lod_tensor_by_rank(
             x=dat, rank_table=table)
         loss = fluid.layers.mean(x=new_dat)
-        fluid.backward.append_backward_ops(loss=loss)
+        fluid.backward.append_backward(loss=loss)
 
         cpu = fluid.CPUPlace()
         exe = fluid.Executor(cpu)
diff --git a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
index 9999165ed509aa40f31f26aa676f381561bd0016..d1bb20f37a3785f70bee072b9df282bba4012c16 100644
--- a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
+++ b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
@@ -2,7 +2,7 @@ import unittest
 
 from paddle.v2.fluid.framework import Program
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 import numpy as np
 import paddle.v2.fluid.core as core
 
diff --git a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
index 86db4c64b493d94cc675ed4bcee7e2925fef1977..be1588fc2d09fa58882425eb3d080ef1560ebc79 100644
--- a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
+++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
@@ -2,7 +2,7 @@ import unittest
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.executor import Executor
 import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 from paddle.v2.fluid.framework import default_main_program
 import numpy
 
@@ -35,7 +35,7 @@ class TestShrinkRNNMemory(unittest.TestCase):
         self.assertTrue(numpy.allclose(tensor_np[0:1], outs[2]))
 
         mem3_mean = layers.mean(x=mem3)
-        append_backward_ops(loss=mem3_mean)
+        append_backward(loss=mem3_mean)
         x_grad = exe.run(
             feed={'x': tensor},
             fetch_list=[main_program.global_block().var('x@GRAD')])[0]
diff --git a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
index 8cdd59ff3cc7deb57252fc5218d239f86016cb9c..2e4defd55d75c2012f39bea30a6c4de12528e77c 100644
--- a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
@@ -4,7 +4,7 @@ import numpy as np
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.framework import Program, program_guard
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 
 
 class TestCPULoDTensorArrayOps(unittest.TestCase):
@@ -133,7 +133,7 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
                 in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
             mean = layers.mean(x=out)
 
-            append_backward_ops(mean)
+            append_backward(mean)
 
         tensor = core.LoDTensor()
         tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place)
diff --git a/python/paddle/v2/fluid/tests/test_while_op.py b/python/paddle/v2/fluid/tests/test_while_op.py
index 033b03a4957131e1155c61e8ed2f10eefb23fda4..7c5593cc5e5a66d4ccb237e3706ff3e544adf033 100644
--- a/python/paddle/v2/fluid/tests/test_while_op.py
+++ b/python/paddle/v2/fluid/tests/test_while_op.py
@@ -2,7 +2,7 @@ import unittest
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
 import paddle.v2.fluid.core as core
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 import numpy
 
 
@@ -46,7 +46,7 @@ class TestWhileOp(unittest.TestCase):
         sum_result = layers.array_read(array=mem_array, i=i)
         loss = layers.mean(x=sum_result)
 
-        append_backward_ops(loss)
+        append_backward(loss)
 
         cpu = core.CPUPlace()
         exe = Executor(cpu)