diff --git a/Dockerfile b/Dockerfile index 870304a6acc99e715dffbfabd8058be000b6872c..c257dbfc2987323f8ed2a24dfffa8b3c15e09399 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,8 @@ # A image for building paddle binaries # Use cuda devel base image for both cpu and gpu environment + +# When you modify it, please be aware of cudnn-runtime version +# and libcudnn.so.x in paddle/scripts/docker/build.sh FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04 MAINTAINER PaddlePaddle Authors @@ -46,7 +49,11 @@ ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin RUN curl -s -q https://glide.sh/get | sh # Install TensorRT -# The unnecessary files has been removed to make the library small. It only contains include and lib now. +# following TensorRT.tar.gz is not the default official one, we do two miny changes: +# 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, +# and its size is only one-third of the official one. +# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. +# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ tar -xz -C /usr/local && \ cp -rf /usr/local/TensorRT/include /usr && \ diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake index 0c07d36bed65400164853b99f18ec0335341cd94..ac19b1651893f18b14c62a0986df75bed25d7e80 100644 --- a/cmake/tensorrt.cmake +++ b/cmake/tensorrt.cmake @@ -30,4 +30,6 @@ if(TENSORRT_FOUND) message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ") + include_directories(${TENSORRT_INCLUDE_DIR}) + list(APPEND EXTERNAL_LIBS ${TENSORRT_LIBRARY}) endif() diff --git a/doc/fluid/api/data.rst b/doc/fluid/api/data.rst new file mode 100644 index 0000000000000000000000000000000000000000..b56c7332cc284649c7e04328e51a7faa78593a39 --- /dev/null +++ b/doc/fluid/api/data.rst @@ -0,0 +1,10 @@ +================================== +Data Reader Interface and DataSets +================================== + +.. toctree:: + :maxdepth: 1 + + data/data_reader.rst + data/image.rst + data/dataset.rst diff --git a/doc/fluid/api/data/data_reader.rst b/doc/fluid/api/data/data_reader.rst new file mode 100644 index 0000000000000000000000000000000000000000..d7c896a6270b488ca4449e5211d0d0879eda6ac5 --- /dev/null +++ b/doc/fluid/api/data/data_reader.rst @@ -0,0 +1,72 @@ +===================== +Data Reader Interface +===================== + + +DataTypes +========= + +.. autofunction:: paddle.v2.data_type.dense_array + :noindex: + +.. autofunction:: paddle.v2.data_type.integer_value + :noindex: + +.. autofunction:: paddle.v2.data_type.integer_value_sequence + :noindex: + +.. autofunction:: paddle.v2.data_type.integer_value_sub_sequence + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_binary_vector + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_float_vector + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_float_vector_sequence + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_non_value_slot + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_value_slot + :noindex: + +.. 
autoclass:: paddle.v2.data_type.InputType + :members: + :noindex: + +DataFeeder +========== + +.. automodule:: paddle.v2.data_feeder + :members: + :noindex: + +Reader +====== + +.. automodule:: paddle.v2.reader + :members: + :noindex: + +.. automodule:: paddle.v2.reader.creator + :members: + :noindex: + +minibatch +========= + +.. automodule:: paddle.v2.minibatch + :members: + :noindex: diff --git a/doc/fluid/api/data/dataset.rst b/doc/fluid/api/data/dataset.rst new file mode 100644 index 0000000000000000000000000000000000000000..e7c8be4452bf55e0967d750c2e624e8e316e9330 --- /dev/null +++ b/doc/fluid/api/data/dataset.rst @@ -0,0 +1,82 @@ +Dataset +======= + +.. automodule:: paddle.dataset + :members: + :noindex: + +mnist ++++++ + +.. automodule:: paddle.dataset.mnist + :members: + :noindex: + +cifar ++++++ + +.. automodule:: paddle.dataset.cifar + :members: + :noindex: + +conll05 ++++++++ + +.. automodule:: paddle.dataset.conll05 + :members: get_dict,get_embedding,test + :noindex: + +imdb +++++ + +.. automodule:: paddle.dataset.imdb + :members: + :noindex: + +imikolov +++++++++ + +.. automodule:: paddle.dataset.imikolov + :members: + :noindex: + +movielens ++++++++++ + +.. automodule:: paddle.dataset.movielens + :members: + :noindex: + +.. autoclass:: paddle.dataset.movielens.MovieInfo + :noindex: + +.. autoclass:: paddle.dataset.movielens.UserInfo + :noindex: + +sentiment ++++++++++ + +.. automodule:: paddle.dataset.sentiment + :members: + :noindex: + +uci_housing ++++++++++++ + +.. automodule:: paddle.dataset.uci_housing + :members: + :noindex: + +wmt14 ++++++ + +.. automodule:: paddle.dataset.wmt14 + :members: + :noindex: + +wmt16 ++++++ + +.. automodule:: paddle.dataset.wmt16 + :members: + :noindex: diff --git a/doc/fluid/api/data/image.rst b/doc/fluid/api/data/image.rst new file mode 100644 index 0000000000000000000000000000000000000000..97651ffa6be56cf3ecaca2caca38a353fa5c1f49 --- /dev/null +++ b/doc/fluid/api/data/image.rst @@ -0,0 +1,5 @@ +Image Interface +=============== + +.. automodule:: paddle.v2.image + :members: diff --git a/doc/fluid/api/evaluator.rst b/doc/fluid/api/evaluator.rst index ae9daeb7918d773d7330f419de96c6972a836710..f80b87c7d2704a144c02028c4925530a67d11289 100644 --- a/doc/fluid/api/evaluator.rst +++ b/doc/fluid/api/evaluator.rst @@ -5,17 +5,24 @@ evaluator ========= -Accuracy --------- +ChunkEvaluator +-------------- -.. autoclass:: paddle.fluid.evaluator.Accuracy +.. autoclass:: paddle.fluid.evaluator.ChunkEvaluator :members: :noindex: -ChunkEvaluator +EditDistance -------------- -.. autoclass:: paddle.fluid.evaluator.ChunkEvaluator +.. autoclass:: paddle.fluid.evaluator.EditDistance :members: :noindex: +DetectionMAP +-------------- + +.. autoclass:: paddle.fluid.evaluator.DetectionMAP + :members: + :noindex: + diff --git a/doc/fluid/api/index_en.rst b/doc/fluid/api/index_en.rst index b0710d8b19956eb235890fdb2a2d764084416aa5..06c686d9508635abd41571983e00be174e94743e 100644 --- a/doc/fluid/api/index_en.rst +++ b/doc/fluid/api/index_en.rst @@ -16,3 +16,4 @@ Fluid profiler.rst regularizer.rst io.rst + data.rst diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst index f186c9c85a640da49d95a1a62c721b09b3007d83..2f02c5de097945a45a3e053427104bd17bea1279 100644 --- a/doc/fluid/api/initializer.rst +++ b/doc/fluid/api/initializer.rst @@ -67,8 +67,7 @@ XavierInitializer .. 
autoclass:: paddle.fluid.initializer.XavierInitializer :members: :noindex: - MSRA - ------ + MSRAInitializer ----------------- diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst index 2f820595c35c2bccd6a5c8a20c60d796c04c8e97..7a92caf9b7139cf091eff834dbed3586b23ac3af 100644 --- a/doc/fluid/api/optimizer.rst +++ b/doc/fluid/api/optimizer.rst @@ -47,10 +47,51 @@ DecayedAdagrad :members: :noindex: +SGDOptimizer +------------ + +.. autoclass:: paddle.fluid.optimizer.SGDOptimizer + :members: + :noindex: + +MomentumOptimizer +----------------- + +.. autoclass:: paddle.fluid.optimizer.MomentumOptimizer + :members: + :noindex: + +AdagradOptimizer +---------------- + +.. autoclass:: paddle.fluid.optimizer.AdagradOptimizer + :members: + :noindex: + +AdamOptimizer +------------- + +.. autoclass:: paddle.fluid.optimizer.AdamOptimizer + :members: + :noindex: + +AdamaxOptimizer +--------------- + +.. autoclass:: paddle.fluid.optimizer.AdamaxOptimizer + :members: + :noindex: + +DecayedAdagradOptimizer +----------------------- + +.. autoclass:: paddle.fluid.optimizer.DecayedAdagradOptimizer + :members: + :noindex: + Adadelta -------------- .. autoclass:: paddle.fluid.optimizer.AdadeltaOptimizer :members: :noindex: - diff --git a/doc/fluid/api/regularizer.rst b/doc/fluid/api/regularizer.rst index dc9740c46392567d314121ac401540b0e7382703..837c67111c6e98e6a3859be802addc20a1c64f2b 100644 --- a/doc/fluid/api/regularizer.rst +++ b/doc/fluid/api/regularizer.rst @@ -25,3 +25,16 @@ L2Decay :members: :noindex: +L1DecayRegularizer +--------------------- + +.. autoclass:: paddle.fluid.regularizer.L1DecayRegularizer + :members: + :noindex: + +L2DecayRegularizer +--------------------- + +.. autoclass:: paddle.fluid.regularizer.L2DecayRegularizer + :members: + :noindex: diff --git a/doc/fluid/design/algorithm/parameter_average.md b/doc/fluid/design/algorithm/parameter_average.md index 940d37fb31dcd0c50ea6c4c42b052d7cb23a9c47..340bc302d57429a9bf10a9d23ed9b0cdc7a2a568 100644 --- a/doc/fluid/design/algorithm/parameter_average.md +++ b/doc/fluid/design/algorithm/parameter_average.md @@ -49,9 +49,9 @@ In the new design, we propose to create a new operation for averaging parameter - the optimizer - the window_size to keep the updates -The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU. +The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU. -The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. 
The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API. +The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API. ### Python API implementation for ParameterAverageOptimizer @@ -59,8 +59,8 @@ Based on Polyak and Juditsky (1992), we can generalize the averaging of updates - Any optimizer (RMSProp , AdaGrad etc.) - A window size. The op keeps accumulating updated parameter values over a window of N batches and takes an average. Move the averaged value to a buffer when window is full to avoid loss of precision. -Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions. -We will have a wrapper written in Python that will support the functionality and implement the actual core computation in C++ core as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.cc) +Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions. +We will have a wrapper written in Python that will support the functionality and implement the actual core computation in C++ core as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.cc) #### Creation of the ParameterAverageOptimizer operator There are two ways for creating the ParameterAverageOptimizer op: @@ -71,4 +71,4 @@ The proposal is to add the op immediately while building the computation graph. #### High-level API -In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions. +In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions. 
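To make the windowed averaging described above concrete, here is a minimal NumPy sketch of the idea. It is illustrative only: the class name, the buffering scheme, and the weighting are assumptions for exposition, not the actual ParameterAverageOptimizer kernel.

```python
import numpy as np

class ParameterAverager:
    """Sketch of windowed parameter averaging (illustrative, not Paddle's op).

    Snapshots of the updated parameter are accumulated over a window of N
    batches; when the window fills, its average is folded into a buffer to
    avoid the precision loss of one very long running sum.
    """

    def __init__(self, window_size):
        self.window_size = window_size
        self.window = []        # parameter snapshots in the current window
        self.buffer_sum = None  # sum of the averages of completed windows
        self.num_windows = 0

    def update(self, param):
        # Record a copy of the parameter after each batch update.
        self.window.append(np.array(param, copy=True))
        if len(self.window) == self.window_size:
            window_avg = np.mean(self.window, axis=0)
            self.buffer_sum = (window_avg if self.buffer_sum is None
                               else self.buffer_sum + window_avg)
            self.num_windows += 1
            self.window = []

    def average(self):
        # Weighted mean over completed windows plus the partial window.
        parts, weights = [], []
        if self.num_windows:
            parts.append(self.buffer_sum / self.num_windows)
            weights.append(self.num_windows * self.window_size)
        if self.window:
            parts.append(np.mean(self.window, axis=0))
            weights.append(len(self.window))
        total = sum(weights)
        return sum(p * (w / total) for p, w in zip(parts, weights))

avg = ParameterAverager(window_size=2)
for theta in [1.0, 2.0, 3.0, 4.0, 5.0]:
    avg.update(theta)
print(avg.average())  # 3.0, the mean of all five snapshots
```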
diff --git a/doc/fluid/design/concepts/block.md b/doc/fluid/design/concepts/block.md index 3b626bd89cd83a9428997abccfeeebbbbdbb3d38..3757cd055c818be1e63ee8c0f000f4dd299b59f4 100644 --- a/doc/fluid/design/concepts/block.md +++ b/doc/fluid/design/concepts/block.md @@ -113,7 +113,7 @@ if (cond) { ``` -An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](./if_else_op.md) is as follows: +An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](../execution/if_else_op.md) is as follows: ```python import paddle as pd @@ -140,7 +140,7 @@ The difference is that variables in the C++ program contain scalar values, where ### Blocks with `for` and `RNNOp` -The following RNN model in PaddlePaddle from the [RNN design doc](./rnn.md) : +The following RNN model in PaddlePaddle from the [RNN design doc](../dynamic_rnn/rnn.md) : ```python x = sequence([10, 20, 30]) # shape=[None, 1] diff --git a/doc/fluid/design/concepts/executor.md b/doc/fluid/design/concepts/executor.md index 2d4b371cc56db82ce5747da6db07f05aa7f7e6c1..3fcddf4dd90f826ee1a16713f4371fb010f8eac5 100644 --- a/doc/fluid/design/concepts/executor.md +++ b/doc/fluid/design/concepts/executor.md @@ -1,7 +1,7 @@ # Executor Design Doc ## Motivation -In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message +In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree). The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains the intrinsics (operators in this case) and variables which will be used, executor explicitly executes the stored precompiled code. diff --git a/doc/fluid/design/concepts/program.md b/doc/fluid/design/concepts/program.md index bd2456787c4e336d357a65255a8274a7c9e465cc..cfcd21ecdb9d2844bf93ed98a56db09651077c40 100644 --- a/doc/fluid/design/concepts/program.md +++ b/doc/fluid/design/concepts/program.md @@ -4,7 +4,7 @@ A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`. -A simple example PaddlePaddle program can be found in [graph.md](./graph.md): +A simple example PaddlePaddle program can be found in [graph.md](../others/graph.md): ```python x = layer.data("images") diff --git a/doc/fluid/design/concurrent/concurrent_programming.md b/doc/fluid/design/concurrent/concurrent_programming.md index 1859f983e9133674e69ecd506d7683ea926b2b8f..0428e74f9e00a87f6b0972057f48479b8ae56ad6 100644 --- a/doc/fluid/design/concurrent/concurrent_programming.md +++ b/doc/fluid/design/concurrent/concurrent_programming.md @@ -1,6 +1,6 @@ # Design Doc: Concurrent Programming with Fluid -With PaddlePaddle Fluid, users describe a program other than a model. 
The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, but not the program that trains/uses the model. +With PaddlePaddle Fluid, users describe a program other than a model. The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, but not the program that trains/uses the model. Many know that when we program TensorFlow, we can specify the device on which each operator runs. This allows us to create a concurrent/parallel AI application. An interesting questions is **how does a `ProgramDesc` represents a concurrent program?** @@ -28,19 +28,19 @@ The following table compares concepts in Fluid and Go control-flow and built-in functions -intrinsics/operators +intrinsics/operators goroutines, channels -class ThreadPool +class ThreadPool runtime -class Executor +class Executor @@ -78,7 +78,7 @@ message ProgramDesc { } ``` -Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in above `ProgramDesc` message. +Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in above `ProgramDesc` message. The default `main` function is defined as follows: @@ -146,7 +146,7 @@ An explanation of the above program: - `fluid.k8s` is a package that provides access to Kubernetes API. - `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod). -- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed, +- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed, 1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and 2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread @@ -175,7 +175,7 @@ where 1. listens on the current pod's IP address, as returned by `fliud.k8s.self_addr()`, 2. once a connection is established, 1. creates a scope of two parameters, "input" and "output", - 2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h) and saves it into "input", + 2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h) and saves it into "input", 3. 
creates an Executor instance and calls `Executor.Run(block)`, where the block is generated by running the lambda specified as the second parameter of `fluid.listen_and_do`. ## Summarization diff --git a/doc/fluid/design/dist_train/distributed_architecture.md b/doc/fluid/design/dist_train/distributed_architecture.md index 229cb47c17d633be6848bb35e58d33ec9b47ec3b..371bbeebf7559eccc77ba0eea4f6f87a1bc5b54a 100644 --- a/doc/fluid/design/dist_train/distributed_architecture.md +++ b/doc/fluid/design/dist_train/distributed_architecture.md @@ -177,7 +177,7 @@ The local training architecture will be the same as the distributed training arc ### Training Data In PaddlePaddle v0.10.0, training data is typically read -with [data reader](../reader/README.md) from Python. This approach is +with [data reader](./README.md) from Python. This approach is no longer efficient when training distributedly since the Python process no longer runs on the same node with the trainer processes, the Python reader will need to read from the distributed filesystem diff --git a/doc/fluid/design/dist_train/parameter_server.md b/doc/fluid/design/dist_train/parameter_server.md index 73c85da5e89eee0ac7857a0b808bc64ae673fdad..563b70bc0e852bec953eb40dda3c46b3d45d7e68 100644 --- a/doc/fluid/design/dist_train/parameter_server.md +++ b/doc/fluid/design/dist_train/parameter_server.md @@ -65,7 +65,7 @@ For embedding layers, the gradient may have many rows containing only 0 when tra if the gradient uses a dense tensor to do parameter optimization, it could spend unnecessary memory, slow down the calculations and waste the bandwidth while doing distributed training. -In Fluid, we introduce [SelectedRows](../selected_rows.md) to represent a list of rows containing +In Fluid, we introduce [SelectedRows](../modules/selected_rows.md) to represent a list of rows containing non-zero gradient data. So when we do parameter optimization both locally and remotely, we only need to send those non-zero rows to the optimizer operators: diff --git a/doc/fluid/design/dynamic_rnn/rnn.md b/doc/fluid/design/dynamic_rnn/rnn.md index 7b61b050f640814d6949cf6847b431da53d59581..b39ae0675c45e56852293d97f45e91861cf31667 100644 --- a/doc/fluid/design/dynamic_rnn/rnn.md +++ b/doc/fluid/design/dynamic_rnn/rnn.md @@ -22,7 +22,7 @@ There are several important concepts here: There could be local variables defined in each step-net. PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.

Figure 2 illustrates the RNN's data flow

@@ -93,7 +93,7 @@ For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences.

The following figure illustrates feeding in text into the lower level, one sentence at a step, and feeding the step outputs into the top level. The final top-level output is about the whole text.


```python
@@ -149,5 +149,5 @@ If the `output_all_steps` is set to False, it will only output the final time step


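Before the detailed design that follows, a toy Python sketch of the step-scope execution described for `RNNOp` above may help: each time step gets a fresh scope for its local variables, while the memory is carried forward from the previous step's scope. Representing scopes as plain dicts and the step net as a callable are assumptions for illustration, not the PaddlePaddle runtime.

```python
def run_rnn_block(step_net, inputs, boot_memory):
    """Toy interpreter: one scope (a dict) per time step."""
    step_scopes, memory = [], boot_memory
    for x in inputs:
        scope = {"x": x, "pre_memory": memory}  # step-local variables
        scope["memory"], scope["output"] = step_net(scope["x"], scope["pre_memory"])
        step_scopes.append(scope)
        memory = scope["memory"]  # carried into the next step's scope
    return [s["output"] for s in step_scopes]

# Step net that accumulates its input into the memory.
outputs = run_rnn_block(lambda x, m: (m + x, m + x),
                        inputs=[10, 20, 30], boot_memory=0)
print(outputs)  # [10, 30, 60]
```

If `output_all_steps` were False, only `step_scopes[-1]["output"]` would be returned.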
diff --git a/doc/fluid/design/dynamic_rnn/rnn_design_en.md b/doc/fluid/design/dynamic_rnn/rnn_design_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..9493908f4f73b3e7d91f5f6364a2a3660257d508
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/rnn_design_en.md
@@ -0,0 +1,175 @@
+# Variable-Length Sequence Support in RNN Design
+To learn from variable-length sequences, existing mainstream frameworks such as TensorFlow, PyTorch, Caffe2, and MXNet all use padding.
+
+Different-length sequences in a mini-batch are padded with zeros and transformed to the same length.
+
+The existing RNN implementation in PaddlePaddle is `RecurrentLayerGroup`,
+which supports variable-length sequences without padding.
+This doc designs Fluid's RNN based on that idea.
+
+## Multi-level sequence data format `LODTensor`
+At present, Paddle stores the data of one mini-batch in a one-dimensional array.
+
+`Argument.sequenceStartPositions` is used to store the start position of each sentence.
+
+In Paddle, `Argument.subSequenceStartPositions` is used to store 2 levels of sequence information; higher-level sequences cannot be supported.
+
+In order to support the storage of `N-level` sequences, we define sequence information as the following data structure.
+
+```c++
+std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+```
+
+Or, defined more clearly:
+
+```c++
+typedef std::vector<int> level_t;
+std::vector<level_t> lod_start_pos;
+```
+Each `level_t` here stores one level of offset information, consistent with Paddle's current practice.
+
+In order to transmit sequence information more transparently, we have introduced a new tensor called `LODTensor`[1].
+Its tensor-related interfaces all inherit directly from `Tensor`, but it also adds sequence-related interfaces.
+Thus, an ordinary `Op` can use a `LODTensor` directly as a `Tensor`,
+while an `Op` that operates on sequences will additionally use the `LODTensor` interfaces for variable-length sequence operations.
+
+The definition of `LODTensor` is as follows:
+
+```c++
+class LODTensor : public Tensor {
+public:
+  size_t Levels() const { return lod_start_pos_->size(); }
+  size_t Elements(int level = 0) const {
+    return (*lod_start_pos_)[level].size();
+  }
+  // slice of level[elem_begin: elem_end]
+  // NOTE low performance in slicing lod_start_pos_.
+  // TODO should call Tensor's Slice.
+  LODTensor LODSlice(int level, int elem_begin, int elem_end) const;
+
+  // slice with tensor's data shared with this.
+  LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const;
+
+  // copy other's lod_start_pos_, to share LOD info.
+  // NOTE the LOD info should not be changed.
+  void ShareConstLODFrom(const LODTensor &other) {
+    lod_start_pos_ = other.lod_start_pos_;
+  }
+  // copy other's lod_start_pos_'s content, free to mutate.
+  void ShareMutableLODFrom(const LODTensor &other) {
+    lod_start_pos_ = std::make_shared<std::vector<std::vector<int>>>(
+        other.lod_start_pos_->begin(), other.lod_start_pos_->end());
+  }
+
+private:
+  std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+};
+```
+Here, `lod_start_pos_` is held in a `shared_ptr` to reduce the cost of storage and copying.
+`LODTensor` can be thought of as an extension of `Tensor` that is almost completely compatible with the original `Tensor`.
+
+## How to support the framework
+### Replace `Tensor` with `LoDTensor`
+To implement the passing of `LODTensor`, most uses of `Tensor` in the framework need to be replaced with `LODTensor`.
+The simple implementation is to directly **replace all previous `Tensor` with `LODTensor`**; in that case you can directly modify the `Tensor` interface created in `pybind.cc`.
+
+In addition, the user may need to be aware of the existence of sequences (for example, visualization needs to parse the output sequences of the model), so some of the sequence-operation APIs also need to be exposed to the Python layer.
+
+### Transmit `lod_start_pos` along with the Op call chain
+`lod_start_pos` is passed along with the Op call chain.
+The framework needs to support the following features to implement this transmission:
+
+1. Implement the transfer as a `shared_ptr`
+   - A consumer must not modify the contents of `lod_start_pos`
+   - A producer may modify `lod_start_pos`
+   - By convention, a consumer only needs to copy the `shared_ptr` passed over
+   - A producer needs to create its own independent memory to store its own independent modifications, and expose a `shared_ptr` to subsequent consumers
+   - Since the transfer process is implemented by copying a `shared_ptr`, the framework only needs to pass `lod_start_pos` once.
+
+2. An ordinary Op is transparent and does not need to sense `lod_start_pos`
+3. A producer Op that needs to modify `lod_start_pos` can update its `lod_start_pos` data during `Run`
+
+## Sorted by length
+After sorting by length, the batch size of each forward time step naturally decrements, and the sorted batch can be plugged directly into the Net for batch computation.
+
+For example, the original input:
+
+```
+origin:
+xxxx
+xx
+xxx
+
+-> sorted:
+xxxx
+xxx
+xx
+```
+
+After `SegmentInputs`, there will be 4 time steps; the input of each time step is as follows (arranged vertically)
+
+```
+0 1 2 3
+x x x x
+x x x
+x x
+```
+
+To track the changes before and after sorting, we use
+
+```c++
+struct SortedSeqItem {
+   void *start{nullptr};
+   void *end{nullptr};
+};
+
+std::vector<SortedSeqItem> sorted_seqs;
+```
+to record the position of each sequence after sorting, and add a new interface
+
+```c++
+std::vector<SortedSeqItem> SortBySeqLen(const LODTensor& tensor);
+```
+Because the input sequences are reordered, the following existing interfaces need to be modified:
+
+- InitMemories: memory needs to be rearranged according to `sorted_seqs`
+- SegmentInputs
+- ConcatOutputs
+
+In addition, because `sorted_seqs` needs to be shared with `RecurrentGradientOp`, it will become a new output of `RecurrentOp`.
+It is passed in as an input to `RecurrentGradientOp`.
+
+## InitMemories
+Due to the reordering, the order of the elements in the `boot_memories` batch also needs to be rearranged accordingly.
+
+## SegmentInputs
+
+`SegmentInputs` relies on the information in `sorted_seqs` to cut the original sequences horizontally into the input of each step, in the sorted sequence order.
+
+The transition is as follows:
+```
+origin:
+xxxx
+xx
+xxx
+
+   |
+   |
+  \ /
+   !
+0 1 2 3
+x x x x
+x x x
+x x
+```
+## ConcatOutputs
+`ConcatOutputs` needs to
+
+- restore the output of each time step back to the original input sequence order (so that the order of the Infer phase is not disturbed)
+- concatenate each sequence into a regular mini-batch representation
+
+## References
+1. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
diff --git a/doc/fluid/design/index_cn.rst b/doc/fluid/design/index_cn.rst
index e9f55214f411abb11bef180d7af4716ad85a0b09..31b62a5eb3cd9b5b68d51abcd001fd5b8c39a914 100644
--- a/doc/fluid/design/index_cn.rst
+++ b/doc/fluid/design/index_cn.rst
@@ -9,7 +9,7 @@
   concepts/index_cn.rst
   data_type/index_cn.rst
   memory/index_cn.rst
-  muti_devices/index_cn.rst
+  multi_devices/index_cn.rst
   dynamic_rnn/index_cn.rst
   concurrent/index_cn.rst
   algorithm/index_cn.rst
diff --git a/doc/fluid/design/index_en.rst b/doc/fluid/design/index_en.rst
index 2802dc3a31d540c5a19bf9042053496aad152f98..2bfee02ad4626633b08ddff747e2886faf9ba99f 100644
--- a/doc/fluid/design/index_en.rst
+++ b/doc/fluid/design/index_en.rst
@@ -9,7 +9,7 @@ Design
   concepts/index_en.rst
   data_type/index_en.rst
   memory/index_en.rst
-  muti_devices/index_en.rst
+  multi_devices/index_en.rst
   dynamic_rnn/index_en.rst
   concurrent/index_en.rst
   algorithm/index_en.rst
diff --git a/doc/fluid/design/modules/python_api.md b/doc/fluid/design/modules/python_api.md
index f83ad3b6a4e8b4d82d8fe8d4154a2739a9b9628b..265732a348ea77d21005e335390d99abcdfbd045 100644
--- a/doc/fluid/design/modules/python_api.md
+++ b/doc/fluid/design/modules/python_api.md
@@ -36,7 +36,7 @@ Please be aware that these Python classes need to maintain some construction-time

### Program

-A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s. The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` onlys stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array. For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks.
+A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), which is composed of an array of `BlockDesc`s. The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` only stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array. For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks.

Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`.

@@ -70,7 +70,7 @@ class Program(objects):

### Block

-A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) includes
+A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md) includes

1. a map from variable names to an instance of the Python `Variable` class, and
1. a list of `Operator` instances.
diff --git a/doc/fluid/design/modules/regularization.md b/doc/fluid/design/modules/regularization.md
index 8cd5ff71d193f03e1ac923724b52f28c6057d25d..519a9143033386678351ff78a465e5ba6e220c52 100644
--- a/doc/fluid/design/modules/regularization.md
+++ b/doc/fluid/design/modules/regularization.md
@@ -32,9 +32,9 @@ In the new design, we propose to create new operations for regularization. For now
- L2_regularization_op
- L1_regularization_op

-These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels.
As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties. +These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties. -The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API. +The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API. ### Computation Graph @@ -48,7 +48,7 @@ The Python API will modify this computation graph to add regularization operator     ### Python API implementation for Regularization -Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions. +Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions. #### Creation of Regularization ops There are two possibilities for creating the regularization ops: @@ -63,4 +63,4 @@ Since we want to create the regularization ops in a lazy manner, the regularizat #### High-level API -In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. 
Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers). +In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers). diff --git a/doc/fluid/design/motivation/fluid_compiler.md b/doc/fluid/design/motivation/fluid_compiler.md index 2a6beafc52e815fa067b273bb5887ddcf6ab15ae..6dd3840a0734e8593890dcf8044746197350c6f5 100644 --- a/doc/fluid/design/motivation/fluid_compiler.md +++ b/doc/fluid/design/motivation/fluid_compiler.md @@ -23,7 +23,7 @@ func paddlepaddle() { } ``` -This program consists of a [block](block.md) of three operators -- +This program consists of a [block](../concepts/block.md) of three operators -- `read`, `assign`, and `mult`. Its `ProgramDesc` message looks like the following @@ -39,7 +39,7 @@ message ProgramDesc { } } ``` - + ## Transpilers We can write a transpiler program that takes a `ProgramDesc`, e.g., @@ -93,7 +93,7 @@ specific hardware platform, for example, the `mult` operator, the generated code should call its CUDA kernel: ```c++ -paddle::Tensor fluid_cuda_mult(const paddle::Tensor& a, +paddle::Tensor fluid_cuda_mult(const paddle::Tensor& a, const paddle::Tensor& b) { paddle::Tensor t; paddle::operator::Mult m(a, b, ...); @@ -107,4 +107,4 @@ where `cuda_context` could be a global variable of type ## Multi-Block Code Generation Most Fluid application programs may have more than one blocks. To -execute them, we need to trace [scopes](scope.md). +execute them, we need to trace [scopes](../concepts/scope.md). diff --git a/doc/fluid/design/motivation/refactorization.md b/doc/fluid/design/motivation/refactorization.md index f199cc892f5e84f0a12abe3b8e5cace9849e7fa8..4e1d660cef6369f04db8e1e83360f6af25259f96 100644 --- a/doc/fluid/design/motivation/refactorization.md +++ b/doc/fluid/design/motivation/refactorization.md @@ -11,7 +11,7 @@ The goals of refactoring include: 1. PaddlePaddle represents the computation, training and inference of Deep Learning models, by computation graphs. - 1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a concrete example. + 1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/others/graph.md) for a concrete example. 1. Users write Python programs to describe the graphs and run them (locally or remotely). @@ -28,7 +28,7 @@ The goals of refactoring include: 1. the C++ library `libpaddle.so` for local execution, 1. the master process of a distributed training job for training, or 1. the server process of a Kubernetes serving job for distributed serving. - 1. 
*Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message. + 1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L70), according to the protobuf message. ## Description and Realization of Computation Graph @@ -48,16 +48,16 @@ At runtime, the C++ program realizes the graph and runs it. Data -VarDesc +VarDesc -Variable +Variable Operation -OpDesc +OpDesc -Operator +Operator Block @@ -85,7 +85,7 @@ The word *graph* is interchangeable with *block* in this document. A graph cons 1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following: - 1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block, + 1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/scope.md) for each run of a block, 1. realize local variables defined in the BlockDesc message in the new scope, 1. a scope is similar to the stack frame in programming languages, @@ -195,7 +195,7 @@ Maintaining a map, whose key is the type name and the value is the corresponding ## Related Concepts ### Op_Maker -It's constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)) +It's constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L37)) ### Register Macros ```cpp @@ -236,7 +236,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) * `Tensor` is an n-dimension array with type. * Only dims and data pointers are stored in `Tensor`. * All operations on `Tensor` are written in `Operator` or global functions. - * Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) + * Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md) * `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`. * `step_scopes` in RNN is a variable and not a tensor. * `Scope` is where variables are stored. 
diff --git a/doc/fluid/design/muti_devices/index_cn.rst b/doc/fluid/design/multi_devices/index_cn.rst similarity index 100% rename from doc/fluid/design/muti_devices/index_cn.rst rename to doc/fluid/design/multi_devices/index_cn.rst diff --git a/doc/fluid/design/muti_devices/index_en.rst b/doc/fluid/design/multi_devices/index_en.rst similarity index 100% rename from doc/fluid/design/muti_devices/index_en.rst rename to doc/fluid/design/multi_devices/index_en.rst diff --git a/doc/fluid/design/muti_devices/kernel_hint_design.md b/doc/fluid/design/multi_devices/kernel_hint_design.md similarity index 80% rename from doc/fluid/design/muti_devices/kernel_hint_design.md rename to doc/fluid/design/multi_devices/kernel_hint_design.md index 58e44b64169d8c942174de86986403570b271641..6edc14ca73b1abf824981b59511a9aca4e0f3b47 100644 --- a/doc/fluid/design/muti_devices/kernel_hint_design.md +++ b/doc/fluid/design/multi_devices/kernel_hint_design.md @@ -1,7 +1,7 @@ # Kernel Hint Design ## Problem -In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this. +In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this. In the current design, we use KernelType to describe one kernel. @@ -14,7 +14,7 @@ struct KernelType { ``` `place_` `data_type_` and `layout_` can be got from the input tensors of the operator, `GetActualKernelType(inputs)` use inputs to infer the proper kernel key that fit the incoming data, but users can not directly configure it. -The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) also provides a virtual method `GetExpectedKernelType` that user can overload and use to choose the KernelType they want to use. +The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md) also provides a virtual method `GetExpectedKernelType` that user can overload and use to choose the KernelType they want to use. So we should send the information user defined in proto to `GetExpectedKernelType` for choosing a kernel. 
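The hint mechanism described in the kernel-hint design above can be sketched in a few lines of Python. The dispatch function, its field names, and the precedence of `force_cpu` over `use_cudnn` are illustrative assumptions, not the C++ `GetExpectedKernelType` implementation.

```python
def expected_kernel_type(op_attrs, input_place, input_dtype, input_layout):
    """Combine the kernel key inferred from the inputs with user hints."""
    place, library = input_place, "plain"
    if op_attrs.get("force_cpu", False):
        place = "CPU"  # the user insists on a CPU kernel
    elif op_attrs.get("use_cudnn", False) and input_place == "GPU":
        library = "cudnn"  # prefer the cuDNN kernel when running on GPU
    return (place, input_dtype, input_layout, library)

print(expected_kernel_type({"use_cudnn": True}, "GPU", "float32", "NCHW"))
# ('GPU', 'float32', 'NCHW', 'cudnn')
```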
diff --git a/doc/fluid/design/muti_devices/kernel_selection.md b/doc/fluid/design/multi_devices/kernel_selection.md similarity index 100% rename from doc/fluid/design/muti_devices/kernel_selection.md rename to doc/fluid/design/multi_devices/kernel_selection.md diff --git a/doc/fluid/design/muti_devices/operator_kernel_type.md b/doc/fluid/design/multi_devices/operator_kernel_type.md similarity index 97% rename from doc/fluid/design/muti_devices/operator_kernel_type.md rename to doc/fluid/design/multi_devices/operator_kernel_type.md index f86e6b7a564ed23f2bddbec25da1c110014f941d..8c1bc8f76a337006497e5ab5e5a710f9f49261b8 100644 --- a/doc/fluid/design/muti_devices/operator_kernel_type.md +++ b/doc/fluid/design/multi_devices/operator_kernel_type.md @@ -8,7 +8,7 @@ struct OpKernelType { proto::DataType data_type_; }; ``` -For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) in github. +For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L348-L374) in github. It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys do not provide enough information. We need a more complete representation of `OpKernelType`. diff --git a/doc/fluid/design/network/sequence_decoder.md b/doc/fluid/design/network/sequence_decoder.md index f13d30ca9fe09c9525c711436f605bb280e11000..b95773c50ca0dcbd1b93529332e035d4de90faa8 100644 --- a/doc/fluid/design/network/sequence_decoder.md +++ b/doc/fluid/design/network/sequence_decoder.md @@ -11,7 +11,7 @@ In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` imp There are a lot of heuristic tricks in the sequence generation tasks, so the flexibility of sequence decoder is very important to users. -During the refactoring of PaddlePaddle, some new concepts are proposed such as: [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** . +During the refactoring of PaddlePaddle, some new concepts are proposed such as: [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** . For example, the RNN states, candidates IDs and probabilities of beam search can be represented all as `LoDTensors`; the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated. 
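As a rough picture of the "pack" step in the sequence-decoder design above, the sketch below keeps each step's selected candidate IDs in a plain Python list (standing in for the `TensorArray`) together with back-pointers, then walks the back-pointers to reconstruct the translated sentences. All names and the back-pointer encoding are illustrative assumptions.

```python
def pack_beam(step_ids, step_parents):
    """step_ids[t][k]: token chosen by beam k at step t;
    step_parents[t][k]: index of the step t-1 beam it extends."""
    sentences = []
    for k in range(len(step_ids[-1])):
        tokens, beam = [], k
        for t in range(len(step_ids) - 1, -1, -1):  # follow back-pointers
            tokens.append(step_ids[t][beam])
            beam = step_parents[t][beam]
        sentences.append(tokens[::-1])
    return sentences

# Two beams over three decoding steps.
ids = [[3, 7], [5, 2], [9, 4]]
parents = [[0, 0], [0, 1], [1, 0]]
print(pack_beam(ids, parents))  # [[7, 2, 9], [3, 5, 4]]
```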
diff --git a/doc/fluid/design/onnx/images/project_structure.png b/doc/fluid/design/onnx/images/project_structure.png new file mode 100644 index 0000000000000000000000000000000000000000..ab1c2ff23cfff586516876684348bb15bd2084fc Binary files /dev/null and b/doc/fluid/design/onnx/images/project_structure.png differ diff --git a/doc/fluid/design/onnx/onnx_convertor.md b/doc/fluid/design/onnx/onnx_convertor.md new file mode 100644 index 0000000000000000000000000000000000000000..bc1665d7c33eb54cb63e5306a439c1ca67016d1e --- /dev/null +++ b/doc/fluid/design/onnx/onnx_convertor.md @@ -0,0 +1,131 @@ +# Background + +[ONNX (Open Neural Network Exchange)](https://github.com/onnx/onnx) bridges different deep learning frameworks by providing an open source graph format for models. The models trained in other frameworks can be converted into the ONNX format to execute inference by utilizing the built-in operators in ONNX - this is called a **frontend**. With the inverse conversion (called a **backend**), different frameworks can share any models supported by ONNX in principle. Now most mainstream frameworks have joined the ONNX community, e.g. Caffe2, PyTorch, and MXNet etc. And there is a momentum driving more and more vendors to begin supporting ONNX or even choose ONNX as the only machine learning runtime in their devices. + +Therefore, it is necessary to enable the conversion between PaddlePaddle and ONNX. This design doc is aimed at implementing a convertor, mainly for converting between **Fluid** models and ONNX (it is very likely that we may support older v2 models in the future). A complete convertor should be bidirectional - with a frontend AND a backend, but considering the importance, the we will start with the frontend i.e. Fluid models to ONNX models. + + +# How it works + +ONNX has a [working list of operators](https://github.com/onnx/onnx/blob/master/docs/Operators.md) which is versioned. + +When prioritizing implementation of a frontend over a backend, choice of coverage of Fluid -> ONNX operators comes down to choices of models to be supported (see section `Supported models`). Eventually, this will allow us to reach a really-wide coverage of all operators. + +Here are a few major considerations when it comes to converting models: + +- **Op-level conversion**: How to map the inputs, attributes, and outputs of each Paddle operator to those of the ONNX operator. In several cases, these require transformations. For each direction (frontend vs. backend), a different conversion mapping is needed. +- **Parameters (weights) initialization**: Setting initial parameters on different nodes. +- **Tensor data type mapping** (Note: Some ONNX data types are not supported in Fluid) +- **Network representation adaption**: Fluid `ProgramDesc` include nested blocks. Since ONNX is free of nesting, the `ProgramDesc` ops need to be traversed to only include ops from the global scope in the root block. The variables used as inputs and outputs should also be in this scope. +- **Model validation**: There are two kinds of validations that are necessary: + 1. We need to ensure that the inference outputs of the ops in run inside a model are the same as those when running the ONNX converted ops through an alternative ONNX backend. + 2. Checking to see if the generated nodes on the graph are validated by the internal ONNX checkers. +- **Versioning**: ONNX versions its op listing over versions. In fact, it has versioning on 3 different levels: ops, graphs, and ONNX models. 
This requires that we are conscious about versioning the convertor and updating tests and op convertor logic for each release. It also implies that we release pre-trained ONNX models upon each version release.
+
+One thing that makes this conversion more feasible in Fluid's case is the use of a static IR - the `ProgramDesc` - as opposed to the dynamic graph built in frameworks like PyTorch.
+
+
+# Project structure
+
+<img src="./images/project_structure.png"/>
+ +The project contains four important parts: + +* **fluid**: The directory that contains wrappers for fluid related APIs. Fluid has provided some low-level APIs to parse or generate the inference model. However, directly using these low-level APIs makes the code tediously long. This module wraps low-level APIs to provide simplified interfaces. + +* **onnx**: This is a Python package provided by ONNX containing helpers for creating nodes, graphs, and eventually binary protobuf models with initializer parameters. + +* **onnx_fluid**: Contains two-way mapping (Fluid -> ONNX ops and ONNX -> Fluid ops). Called from `convert.py`, the program uses this mapping along with modifier functions to construct ONNX nodes with the help of ONNX's `make_node` helper. It also contains mapping between datatypes and tensor deprecation / amplification logic. + +* **convert.py**: The interface exposed to users. This will traverse the global program blocks/variables and construct the write-able model. + + +# Usage +The converter should be designed to very easy-to-use. Bidirectional conversion between a Fluid inference model and an ONNX binary model will be supported. Model validation will also provided to verify the correctness of converted model. + +* Convert Fluid inference model to ONNX binary model + + ``` + python convert.py --fluid_model --onnx_model validate True + ``` + +* Validate the converted model + + ``` + python validate.py --fluid_model --onnx_model + ``` + +The conversion and model validation will be completed consecutively, finally output a readable model structure description. And for the converse conversion, users only need to exchange the input and output. + + +# Challenges and mitigation + +## Cycles + +Cycles are unsupported in ONNX. In Paddle, the `while` op is the most prominent example of a cycle. + +*Resolution*: We won't support models with `while`s which can't be substituted until ONNX adds support for such ops. + +## Sequences + +Sequence processing operators like `sequence_expand`, `sequence_reshape`, `sequence_concat`, and `sequence_pool` are not supported by ONNX as well, because they do not support non-padded datatypes like LoDTensors. + +*Resolution*: Since the runtimes using our ONNX exported graphs won't be using LoDTensors in the first place, such sequence operators should be mapped to ONNX ops that will do the necessary transposing ops with the knowledge of the padding and shape of the Tensors. + +## Ops that can't easily be mapped + +There are ops that just aren't possible to map today: + +**Control flow operators** + +Paddle supports control flow ops like `If/Else` and `Switch` (if we ignore the CSP operations like `select` for now). ONNX has `If` support in the experimental phase. + +*Resolution*: Map Paddle's `If/Else` to ONNX's `If`, but ignore other control flow operators until ONNX brings support for them. + + +**Non-existent in Fluid** + +There are several ONNX operators that are not available in Fluid today, e.g. `InstanceNormalization`, `RandomUniform`, `Unsqueeze`, etc. + +*Resolution*: For the initial phase, we can choose to not support ops that our models don't care for and are subsequently not available in Fluid. However, for ops that we think might be necessary for Fluid users also, we must implement them on our side and support the ONNX conversion to them. This list is TBD. + + +**Concurrency** + +ONNX does not have any considerations for concurrency right now. + +*Resolution*: There are two ways to approach this: + +a. 
We choose to not support concurrent models. +b. We only support `go_op`s (basically threads) shallowly. This could mean that we enqueue `go_op` ops prior to gradient calculations OR even prior to the entire graph, and that's it - since `go_op`s do not have support for backprop anyways. One of the core target use cases of `go_op`: batch reading - can be handled through this approach. + + +**Overloaded in Fluid** + +There are ops in ONNX whose job can't be accomplished by a single corresponding Paddle operator (e.g. ), but a collection of operators. + +*Resolution*: Chain multiple Paddle operators. + + +## Lack of LoDTensors + +As stated above, ONNX only supports simple Tensor values. + +*Resolution*: Deprecate to plain old numpy-able tensors. + + +## Reconstruction from deprecated ONNX ops + +For higher-level Fluid ops, such as a few offered by the `nn` layer that do not have direct corresponding mappings but can be converted to ONNX by chaining a series of ops without cycles, it would be useful to map them back to the higher-level Fluid ops once converted back from the deprecated ONNX graphs. + +*Resolution*: Graphs that have the deprecation from Paddle -> ONNX. When converting back from ONNX, if we encounter the identical graphs by doing a forward search, we can replace the subgraphs with the matching ONNX op. + + +# Supported models + +As mentioned above, potential risks may come from the conversion of sequence-related models, including the LodTensor, ```if/else``` and ```while``` operator. So a good choice is to focus on some important feedforward models first, then implement some simple recurrent models. + +- Feedforward models: common models selected in PaddleBook, e.g. VGG, ResNet and some other models proposed by application teams. +- Recurrent models: language model, stacked LSTMs etc. diff --git a/doc/fluid/dev/contribute_to_paddle_cn.md b/doc/fluid/dev/contribute_to_paddle_cn.md new file mode 120000 index 0000000000000000000000000000000000000000..955216ca62e71b4d3666e1662aa86c9495d2e7d6 --- /dev/null +++ b/doc/fluid/dev/contribute_to_paddle_cn.md @@ -0,0 +1 @@ +../../v2/dev/contribute_to_paddle_cn.md \ No newline at end of file diff --git a/doc/fluid/dev/contribute_to_paddle_en.md b/doc/fluid/dev/contribute_to_paddle_en.md new file mode 120000 index 0000000000000000000000000000000000000000..f9fc68c37e17a8a365b0d7fae86c16b0d094631f --- /dev/null +++ b/doc/fluid/dev/contribute_to_paddle_en.md @@ -0,0 +1 @@ +../../v2/dev/contribute_to_paddle_en.md \ No newline at end of file diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst index ad798003f560e7fb0e6db6083fdd152fd3417584..37e608160db0ad5a92297987937bbbfa8f842ea8 100644 --- a/doc/fluid/dev/index_cn.rst +++ b/doc/fluid/dev/index_cn.rst @@ -4,6 +4,8 @@ .. toctree:: :maxdepth: 1 + contribute_to_paddle_cn.md + write_docs_cn.md api_doc_std_cn.md new_op_cn.md new_op_kernel.md diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst index 80c899a82fa452c5cd8f38dad89c15d3041b09e3..d7f83035010f13c30514673ecbee301f194dc175 100644 --- a/doc/fluid/dev/index_en.rst +++ b/doc/fluid/dev/index_en.rst @@ -4,6 +4,8 @@ Development .. 
toctree:: :maxdepth: 1 + contribute_to_paddle_en.md + write_docs_en.md api_doc_std_en.md new_op_en.md new_op_kernel.md diff --git a/doc/fluid/dev/name_convention.md b/doc/fluid/dev/name_convention.md index 75830ef28c67dc4694d899efe503084b7b5852e1..6b4244d0f506c8cd6c08739141eabad27c581ca7 100644 --- a/doc/fluid/dev/name_convention.md +++ b/doc/fluid/dev/name_convention.md @@ -4,7 +4,7 @@ To make the operator document itself more clear, we recommend operator names obe ## OpProtoMaker names -When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L170) (TODO: OpProtoMaker Doc)need to be defined. All the Input/Output and Attributes will write into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L61) , and will be used in client language to create operator. +When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L170) (TODO: OpProtoMaker Doc)need to be defined. All the Input/Output and Attributes will write into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L61) , and will be used in client language to create operator. - Input/Output. - Input/Output names follow the **CamelCase**. e.g. `X`, `Y`, `Matrix`, `LastAxisInMatrix`. Input/Output much more like Variables, we prefer to meaningful English words. diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md index 0c3f88d9c31e05bec399c64bf6ade56e62e01f68..587d819f79fcf82549826359fbf04ad3af404446 100644 --- a/doc/fluid/dev/new_op_cn.md +++ b/doc/fluid/dev/new_op_cn.md @@ -54,10 +54,10 @@ -实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。** +实现新的op都添加至目录[paddle/fluid/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。** -下面以矩阵乘操作,即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。 +下面以矩阵乘操作,即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。 ## 实现C++类 @@ -85,17 +85,17 @@ The equation is: Out = X * Y }; ``` -[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`,构造函数含有2个参数: +[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127)继承自`framework::OpProtoAndCheckerMaker`,构造函数含有2个参数: - `framework::OpProto` : 前者存储Op的输入输出和参数属性,将用于Python API接口的生成。 - `framework::OpAttrChecker` :后者用于检查参数属性的合法性。 构造函数里通过`AddInput`添加输入参数,通过`AddOutput`添加输出参数,通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`中。 -上面的代码在`MulOp`中添加两个输入`X`和`Y`,添加了一个输出`Out`,并解释了各自含义,命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md)。 +上面的代码在`MulOp`中添加两个输入`X`和`Y`,添加了一个输出`Out`,并解释了各自含义,命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md)。 -再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)为例: +再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55)为例: ```cpp template @@ -103,21 +103,21 @@ class ScaleOpMaker 
: public framework::OpProtoAndCheckerMaker { public: ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "The input tensor of scale operator.").NotInGradient(); - AddOutput("Out", "The output tensor of scale operator.").NotInGradient(); - AddComment(R"DOC(Scale operator -The equation is: Out = scale*X + AddInput("X", "(Tensor) Input tensor of scale operator."); + AddOutput("Out", "(Tensor) Output tensor of scale operator."); + AddComment(R"DOC( +Scale operator +$$Out = scale*X$$ )DOC"); - AddAttr("scale", "scale of scale operator.").SetDefault(1.0); + AddAttr("scale", + "(float, default 1.0)" + "The scaling factor of the scale operator.") + .SetDefault(1.0); } }; ``` -这个例子有两处不同: - -- `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中,如果Op的某个输入不参与反向梯度的计算,请显示地调用`.NotInGradient()`进行设置。 - -- `AddAttr("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。 +这个例子有`AddAttr("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。 ### 定义Operator类 @@ -147,7 +147,7 @@ class MulOp : public framework::OperatorWithKernel { }; ``` -[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员: +[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员: ```cpp using framework::OperatorWithKernel::OperatorWithKernel; @@ -173,7 +173,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, `MulKernel`继承自`framework::OpKernel`,带有下面两个模板参数: -- `typename DeviceContext`: 表示设备类型,不同设备(CPU、CUDA)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 +- `typename DeviceContext`: 表示设备类型,不同设备(CPU、CUDA)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。 - `typename T` : 表示数据类型,如`float`, `double`等。 @@ -201,10 +201,9 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, 需要注意:**不同设备(CPU、CUDA)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。** -`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 - -为了使`OpKernel`的计算过程书写更加简单,并且CPU、CUDA的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。 +`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。 +为了使`OpKernel`的计算过程书写更加简单,并且CPU、CUDA的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_cn.md)。 到此,前向Op实现完成。接下来,需要在`.cc`文件中注册该op和kernel。 反向Op类的定义,反向OpKernel的定义与前向Op类似,这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。 @@ -215,7 +214,9 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, ```cpp namespace ops = paddle::operators; - REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); + REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, + paddle::framework::DefaultGradOpDescMaker) + REGISTER_OPERATOR(mul_grad, ops::MulGradOp) REGISTER_OP_CPU_KERNEL(mul, 
ops::MulKernel); REGISTER_OP_CPU_KERNEL(mul_grad, ops::MulGradKernel); @@ -223,8 +224,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, 在上面的代码中: - - `REGISTER_OP` : 注册`ops::MulOp`类,类型名为`mul`,该类的`ProtoMaker`为`ops::MulOpMaker`,注册`ops::MulOpGrad`,类型名为`mul_grad`。 - - `REGISTER_OP_WITHOUT_GRADIENT` : 用于注册没有反向的Op。 + - `REGISTER_OPERATOR` : 注册`ops::MulOp`类,类型名为`mul`,该类的`ProtoMaker`为`ops::MulOpMaker`,注册`ops::MulOpGrad`,类型名为`mul_grad`。 - `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::MulGradKernel`类。 @@ -255,7 +255,7 @@ make mul_op ## 实现单元测试 -单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。 +单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py)。 ### 前向Operator单测 @@ -315,7 +315,7 @@ Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp ### 编译和执行 -`python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。 +`python/paddle/fluid/tests/unittests/` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。 请注意,**不同于Op的编译测试,运行单元测试测时需要编译整个工程**,并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON`。编译成功后,执行下面的命令来运行单元测试: @@ -331,7 +331,6 @@ ctest -R test_mul_op ## 注意事项 -- 为每个Op创建单独的`*_op.h`(如有)、`*_op.cc`和`*_op.cu`(如有)。不允许一个文件中包含多个Op,这将会导致编译出错。 -- 注册Op时的类型名,需要和该Op的名字一样。即不允许在`A_op.cc`里面,注册`REGISTER_OP(B, ...)`等,这将会导致单元测试出错。 +- 注册Op时的类型名,需要和该Op的名字一样。即不允许在`A_op.cc`里面,注册`REGISTER_OPERATOR(B, ...)`等,这将会导致单元测试出错。 - 如果Op没有实现CUDA Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。 - 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。 diff --git a/doc/fluid/dev/new_op_en.md b/doc/fluid/dev/new_op_en.md index a566a09131f86251b70d5435d0a483aa2a705b35..f8de271ed4e5e0fb4018478bffd4b525d4319738 100644 --- a/doc/fluid/dev/new_op_en.md +++ b/doc/fluid/dev/new_op_en.md @@ -26,13 +26,6 @@ Here are the base types needed. For details, please refer to the design docs. Operators can be categorized into two groups: operator with kernel(s) and operator without kernel(s). An operator with kernel(s) inherits from `OperatorWithKernel` while the one without kernel(s) inherits from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information: - Information | Where is it defined --------------- | :---------------------- -OpProtoMake definition | `.cc`files, Backward Op does not need an OpProtoMake interface. -Op definition | `.cc` files -Kernel implementation | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files. -Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation. - @@ -61,10 +54,10 @@ Registering the Op | Ops are registered in `.cc` files; For Kernel reg
-New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions.** +New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions.** -Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel. +Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel. ## Implementing C++ Types @@ -92,17 +85,17 @@ The equation is: Out = X * Y }; ``` -[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)is inherited from`framework::OpProtoAndCheckerMaker`, consisting of 2 variables in the constructor: +[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127)is inherited from`framework::OpProtoAndCheckerMaker`, consisting of 2 variables in the constructor: - `framework::OpProto` stores Operator input and variable attribute, used for generating Python API interfaces. - `framework::OpAttrChecker` is used to validate variable attributes. The constructor utilizes `AddInput`, `AddOutput`, and `AddComment`, so that the corresponding information will be added to `OpProto`. -The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance to Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). +The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance to Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md). -An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37) is implemented as follows: +An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55) is implemented as follows: ```cpp template @@ -120,11 +113,7 @@ The equation is: Out = scale*X }; ``` -There are two changes in this example: - -- `AddInput("X","...").NotInGradient()` expresses that input `X` is not involved in `ScaleOp`'s corresponding computation. If an input to an operator is not participating in back-propagation, please explicitly set `.NotInGradient()`. - -- `AddAttr("scale", "...").SetDefault(1.0);` adds `scale`constant as an attribute, and sets the default value to 1.0. +Note `AddAttr("scale", "...").SetDefault(1.0);` adds `scale`constant as an attribute, and sets the default value to 1.0. 
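+
+For intuition, an attribute registered this way eventually surfaces as a keyword argument in the Python API. A rough sketch, assuming Fluid's `fluid.layers.scale` wrapper (values are illustrative):
+
+```python
+import numpy as np
+import paddle.fluid as fluid
+
+x = fluid.layers.data(name='x', shape=[3], dtype='float32')
+# scale=2.0 feeds the attribute declared via AddAttr; omitting it falls
+# back to the SetDefault(1.0) value.
+y = fluid.layers.scale(x=x, scale=2.0)
+
+exe = fluid.Executor(fluid.CPUPlace())
+exe.run(fluid.default_startup_program())
+out, = exe.run(feed={'x': np.ones((4, 3), dtype='float32')},
+               fetch_list=[y])
+print(out)  # every element should be 2.0
+```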
### Defining Operator @@ -154,7 +143,7 @@ class MulOp : public framework::OperatorWithKernel { }; ``` -[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22) is inherited from `OperatorWithKernel`. Its `public` member +[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L24) is inherited from `OperatorWithKernel`. Its `public` member ```cpp using framework::OperatorWithKernel::OperatorWithKernel; @@ -180,7 +169,7 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w `MulKernel` inherits `framework::OpKernel`, which includes the following templates: -- `typename DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). +- `typename DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43). - `typename T` denotes data type, such as `float` or `double`. @@ -209,9 +198,9 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w Note that **different devices (CPU, CUDA)share one Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions can support both devices.** -`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). +`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.cc). -To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md). +To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_en.md). This concludes the forward implementation of an operator. Next its operation and kernel need to be registered in a `.cc` file. 
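+
+As a sanity reference, the computation `MulKernel`'s `Compute` performs is an ordinary matrix product. A small numpy sketch of the same forward step (illustrative only, not the actual Eigen implementation):
+
+```python
+import numpy as np
+
+def mul_forward(x, y):
+    # CPU-side equivalent of MulKernel::Compute: Out = X * Y.
+    return np.dot(x, y)
+
+x = np.random.rand(2, 3).astype('float32')
+y = np.random.rand(3, 4).astype('float32')
+out = mul_forward(x, y)
+assert out.shape == (2, 4)  # agrees with the shape set in MulOp's InferShape
+```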
@@ -224,7 +213,9 @@ The definition of its corresponding backward operator, if applicable, is similar ```cpp namespace ops = paddle::operators; - REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); + REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, + paddle::framework::DefaultGradOpDescMaker) + REGISTER_OPERATOR(mul_grad, ops::MulGradOp) REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); REGISTER_OP_CPU_KERNEL(mul_grad, @@ -233,9 +224,8 @@ The definition of its corresponding backward operator, if applicable, is similar In that code block, - - `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`. + - `REGISTER_OPERATOR` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`. - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient. - - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`. @@ -275,7 +265,7 @@ Unit tests for an operator include 3. a scaling test for the backward operator. -Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py). +Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py). ### Testing Forward Operators @@ -339,7 +329,7 @@ Some key points in checking gradient above include: ### Compiling and Running -Any new unit testing file of the format `test_*.py` added to the director `python/paddle/v2/framework/tests` is automatically added to the project to compile. +Any new unit testing file of the format `test_*.py` added to the director `python/paddle/fluid/tests/unittests/` is automatically added to the project to compile. Note that **unlike the compile test for Ops, running unit tests requires compiling the entire project** and requires compiling with flag `WITH_TESTING` on i.e. `cmake paddle_dir -DWITH_TESTING=ON`. @@ -357,7 +347,6 @@ ctest -R test_mul_op ## Remarks -- Every `*_op.h` (if applicable), `*_op.cc`, and `*_op.cu` (if applicable) must be created for a unique Op. Compiling will fail if multiple operators are included per file. -- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OP(B, ...)` in `A_op.cc` will cause unit testing failures. +- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OPERATOR(B, ...)` in `A_op.cc` will cause unit testing failures. - If the operator does not implement a CUDA kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail. - If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`. diff --git a/doc/fluid/dev/new_op_kernel.md b/doc/fluid/dev/new_op_kernel.md index 55dea8d0a39232ede59d4663d6e1a47fbfc60853..87e617d44041bde9c9051151878ffb4304689b3c 100644 --- a/doc/fluid/dev/new_op_kernel.md +++ b/doc/fluid/dev/new_op_kernel.md @@ -4,13 +4,13 @@ PaddlePaddle Fluid have hundreds of operators. Each operator could have one or more kernels. 
A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU. -[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) explains how to add an operator, and its kernels. The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). An operator chooses the right kernel at runtime. This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md). +[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md) explains how to add an operator, and its kernels. The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/multi_devices/operator_kernel_type.md). An operator chooses the right kernel at runtime. This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md). ## Write Kernels for A New Device ### Add A New Device - For some historical reaons, we misuse the word *library* for *device*. For example, we call the deivce type by *library type*. An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/library_type.h#L24). We will correct this ASAP. + For some historical reaons, we misuse the word *library* for *device*. For example, we call the deivce type by *library type*. An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/library_type.h#L24). We will correct this ASAP. To register a new device, we need to add an enum value to `LibraryType`: @@ -23,9 +23,9 @@ enum class LibraryType { ``` -### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53) +### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L53) -If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53). For example `CUDAPlace`: +If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L53). For example `CUDAPlace`: ```cpp struct CUDAPlace { @@ -45,8 +45,8 @@ struct CUDAPlace { typedef boost::variant Place; ``` -### Add [device context]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37)) -After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37) for it. +### Add [device context]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h#L37)) +After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h#L37) for it. ```cpp class DeviceContext { @@ -58,9 +58,9 @@ class DeviceContext { }; ``` -### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device. 
+### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L351) for your Device. -A detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) +A detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md) ```cpp class OpKernelBase { @@ -101,7 +101,7 @@ REGISTER_OP_KERNEL( kernel0, kernel1 are kernels that have the same `op_type`, `library_type`, `place_type` but different `data_types`. -take [`conv2d`]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/conv_cudnn_op.cu.cc#L318)) as an example: +take [`conv2d`]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/conv_cudnn_op.cu.cc#L318)) as an example: ```cpp REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace, diff --git a/doc/fluid/dev/support_new_device.md b/doc/fluid/dev/support_new_device.md index 8983df900460127fc130043c52373dab505363ba..051a463cfcf97df2e2d5b6a880923ca70fefbd6e 100644 --- a/doc/fluid/dev/support_new_device.md +++ b/doc/fluid/dev/support_new_device.md @@ -13,7 +13,7 @@ So, how to support a new Device/Library in Fluid becomes a challenge. ## Basic: Integrate A New Device/Library -For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/read_source.md). +For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/read_source.md). There are mainly three parts that we have to consider while integrating a new device/library: @@ -28,7 +28,7 @@ There are mainly three parts that we have to consider while integrating a new de Please note that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices. #### Place -Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`. +Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`. ``` | CPUPlace @@ -44,7 +44,7 @@ typedef boost::variant Place; #### DeviceContext -Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different libraries, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`. +Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/fluid/paddle/platform/device_context.h#L30) to manage the resources in different libraries, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`. 
``` @@ -73,7 +73,7 @@ class CUDADeviceContext : public DeviceContext { Place GetPlace() const override { return place_; } private: CUDAPlace place_; - cudaStream_t stream_; + cudaStream_t stream_; cublasHandle_t cublas_handle_; std::unique_ptr eigen_device_; // binds with stream_ }; @@ -84,7 +84,7 @@ private: #### memory module -Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/memory/memory.h#L36): +Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/memory/memory.h#L36): ``` template @@ -102,7 +102,7 @@ To implement these interfaces, we have to implement MemoryAllocator for differen #### Tensor -[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h#L36) holds data with some shape in a specific Place. +[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/tensor.h#L36) holds data with some shape in a specific Place. ```cpp class Tensor { @@ -161,7 +161,7 @@ t.mutable_data(place); Fluid implements computing units based on different DeviceContexts. Some computing units are shared between operators. This common part will be put in operators/math directory as basic Functors. -Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example: +Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/math/maxouting.h#L27) as an example: The interface is defined in the header file. @@ -210,7 +210,7 @@ The implementation of `OpKernel` is similar to math functors, the extra thing we Fluid provides different register interfaces in op_registry.h -Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/crop_op.cc#L134) operator as an example: +Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/crop_op.cc#L134) operator as an example: In .cc file: @@ -236,5 +236,5 @@ Generally, we will implement OpKernel for all Device/Library of an Operator. 
We For more details, please refer to following docs: -- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md) -- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) +- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/multi_devices/operator_kernel_type.md) +- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md) diff --git a/doc/fluid/dev/write_docs_cn.rst b/doc/fluid/dev/write_docs_cn.rst new file mode 120000 index 0000000000000000000000000000000000000000..2c281eaaf43bbfad84c3be9ed1d1bd0dbc77fa9b --- /dev/null +++ b/doc/fluid/dev/write_docs_cn.rst @@ -0,0 +1 @@ +../../v2/dev/write_docs_cn.rst \ No newline at end of file diff --git a/doc/fluid/dev/write_docs_en.rst b/doc/fluid/dev/write_docs_en.rst new file mode 120000 index 0000000000000000000000000000000000000000..cb2b9b0ff1f1d9e0e5201d160f6b7d9d451374e2 --- /dev/null +++ b/doc/fluid/dev/write_docs_en.rst @@ -0,0 +1 @@ +../../v2/dev/write_docs_en.rst \ No newline at end of file diff --git a/doc/fluid/getstarted/quickstart_cn.rst b/doc/fluid/getstarted/quickstart_cn.rst deleted file mode 120000 index 93a9e4e37a8495c553cec257c27363ca8d062d39..0000000000000000000000000000000000000000 --- a/doc/fluid/getstarted/quickstart_cn.rst +++ /dev/null @@ -1 +0,0 @@ -../../v2/getstarted/quickstart_cn.rst \ No newline at end of file diff --git a/doc/fluid/getstarted/quickstart_cn.rst b/doc/fluid/getstarted/quickstart_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..135beb75d0330f39d062753aa2aa83a077f36bb1 --- /dev/null +++ b/doc/fluid/getstarted/quickstart_cn.rst @@ -0,0 +1,45 @@ +快速开始 +======== + +快速安装 +-------- + +PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。 +执行下面的命令完成快速安装,版本为cpu_avx_openblas: + + .. code-block:: bash + + pip install paddlepaddle + +如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +更详细的安装和编译方法参考: :ref:`install_steps` 。 + +快速使用 +-------- + +创建一个 housing.py 并粘贴此Python代码: + + .. code-block:: python + + import paddle.dataset.uci_housing as uci_housing + import paddle.fluid as fluid + + with fluid.scope_guard(fluid.core.Scope()): + # initialize executor with cpu + exe = fluid.Executor(place=fluid.CPUPlace()) + # load inference model + [inference_program, feed_target_names,fetch_targets] = \ + fluid.io.load_inference_model(uci_housing.fluid_model(), exe) + # run inference + result = exe.run(inference_program, + feed={feed_target_names[0]: uci_housing.predict_reader()}, + fetch_list=fetch_targets) + # print predicted price is $12,273.97 + print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000) + +执行 :code:`python housing.py` 瞧! 
它应该打印出预测住房数据的清单。
diff --git a/doc/fluid/getstarted/quickstart_en.rst b/doc/fluid/getstarted/quickstart_en.rst
deleted file mode 120000
index 6e1894faa1176bb9e77f616e07df36191e54b782..0000000000000000000000000000000000000000
--- a/doc/fluid/getstarted/quickstart_en.rst
+++ /dev/null
@@ -1 +0,0 @@
-../../v2/getstarted/quickstart_en.rst
\ No newline at end of file
diff --git a/doc/fluid/getstarted/quickstart_en.rst b/doc/fluid/getstarted/quickstart_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..df6619cfd039fc1fdca8cde57db9cc6aebf8f029
--- /dev/null
+++ b/doc/fluid/getstarted/quickstart_en.rst
@@ -0,0 +1,49 @@
+Quick Start
+============
+
+Quick Install
+-------------
+
+You can use pip to install PaddlePaddle with a single command. CentOS 6 or
+above, Ubuntu 14.04 or above, and MacOS 10.12 are supported, with Python 2.7
+installed. Simply run the following command to install; the version is
+cpu_avx_openblas:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+If you need to install the GPU version (cuda7.5_cudnn5_avx_openblas), run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+For more details about installation and build: :ref:`install_steps` .
+
+Quick Use
+---------
+
+Create a new file called housing.py, and paste this Python code:
+
+
+  .. code-block:: python
+
+     import paddle.dataset.uci_housing as uci_housing
+     import paddle.fluid as fluid
+
+     with fluid.scope_guard(fluid.core.Scope()):
+         # initialize executor with cpu
+         exe = fluid.Executor(place=fluid.CPUPlace())
+         # load inference model
+         [inference_program, feed_target_names, fetch_targets] = \
+             fluid.io.load_inference_model(uci_housing.fluid_model(), exe)
+         # run inference
+         result = exe.run(inference_program,
+                          feed={feed_target_names[0]: uci_housing.predict_reader()},
+                          fetch_list=fetch_targets)
+         # print the predicted price, e.g., $12,273.97
+         print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000)
+
+Run :code:`python housing.py` and voila! It should print out a list of predictions
+for the test housing data.
diff --git a/doc/v2/dev/index_en.rst b/doc/v2/dev/index_en.rst
index 36516b7953224e799e1065fd7930509eec0aa650..cbff313fc5b9468b58159cf2b04e8464f9bebc78 100644
--- a/doc/v2/dev/index_en.rst
+++ b/doc/v2/dev/index_en.rst
@@ -6,6 +6,7 @@ PaddlePaddle adheres to the following three sections of code and document specif
 PaddlePaddle uses git for version control and Docker is used for building and testing environment. The code includes Cuda, C++, Python, Shell and other programming languages,which comply with Google C++ Style, Pep-8, and the code base includes style checking by an automatic inspection tool. Code comments need to follow the Doxygen specification. The code that does not meet the style requirements will fail to compile. We provide the following guidelines for the use of Git, build tests and code development.
+
 .. toctree::
    :maxdepth: 1
diff --git a/doc/v2/faq/local/index_en.rst b/doc/v2/faq/local/index_en.rst
index 4cb43031933a8bbe9aebae04bc3e9c74a6d21b95..fa95b1753dbe293811d7a8601497ad521fa3ecda 100644
--- a/doc/v2/faq/local/index_en.rst
+++ b/doc/v2/faq/local/index_en.rst
@@ -1,5 +1,248 @@
 #############################
-Local Training and Prediction
+Parameter Setting
 #############################
-TBD
+.. contents::
+
+1. Reduce Memory Consumption
+----------------------------
+
+Training a neural network may demand dozens of gigabytes of host memory or several gigabytes of device memory, which makes it a rather memory-consuming task. The memory consumed by the PaddlePaddle framework mainly includes:
+
+* cache memory for the DataProvider (host memory only),
+* memory for neurons' activation information (both host and device memory),
+* memory for parameters (both host and device memory),
+* other memory demands.
+
+"Other memory demands" mainly covers the running demands of the PaddlePaddle framework itself, such as string allocation and temporary variables, which are not considered here.
+
+Reduce DataProvider Cache Memory
+++++++++++++++++++++++++++++++++
+
+PyDataProvider works asynchronously: it loads data, together with the fetch and shuffle procedure, into host memory:
+
+.. graphviz::
+
+    digraph {
+        rankdir=LR;
+        "Data Files" -> "Host Memory Pool" -> "PaddlePaddle Training"
+    }
+
+Thus reducing the DataProvider cache memory lowers memory occupancy and also speeds up data loading before training. However, the size of the memory pool affects the granularity of the shuffle: when shrinking the pool, a shuffle operation is needed before each data file is read, to preserve the randomness of the data.
+
+.. literalinclude:: src/reduce_min_pool_size.py
+
+In this way, the memory consumption can be significantly reduced and the training procedure accelerated. More details are demonstrated in :ref:`api_pydataprovider2`.
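+
+The included file is not reproduced here; a minimal sketch of limiting the pool, assuming the :code:`@provider` decorator's :code:`pool_size` and :code:`min_pool_size` arguments (the numbers are illustrative):
+
+.. code-block:: python
+
+    from paddle.trainer.PyDataProvider2 import provider, dense_vector
+
+    # A smaller pool trades shuffle granularity for lower host memory usage.
+    @provider(input_types=[dense_vector(784)],
+              pool_size=1024,
+              min_pool_size=512)
+    def process(settings, filename):
+        for line in open(filename):
+            yield [float(x) for x in line.split()]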
+
+The Neurons Activation Memory
++++++++++++++++++++++++++++++
+
+Each neuron activation computed during training produces a certain amount of temporary data, such as the activation values themselves (e.g., the output value of a neuron). These data are used to update parameters during back propagation. The memory they consume is mainly related to two factors: the batch size and the length of each sequence. Therefore, the activation memory is in proportion to the amount of information contained in each mini-batch.
+
+Two practical remedies:
+
+* Reduce the batch size. Setting a smaller value in the network configuration (e.g., batch_size=1000) can help. But since the batch size is a hyper-parameter of the neural network itself, changing it may affect the training result.
+* Shorten the sequence length, or cut off excessively long sequences. For example, if the lengths of the sequences in a dataset mostly vary between 100 and 200, a single sequence of length 10,000 quite possibly leads to OOM (out of memory), especially in RNN models such as LSTM.
+
+The Parameters Memory
++++++++++++++++++++++
+
+The PaddlePaddle framework supports almost all popular optimizers, and different optimizers have different memory requirements. For example, :code:`adadelta` consumes approximately 5 times as much memory as the scale of the weight parameters, which means :code:`adadelta` needs at least :code:`500M` of memory if the model's parameters take :code:`100M`.
+
+Some optimization algorithms, such as :code:`momentum`, are worth giving a shot.
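+
+For example, switching optimizers is a one-line change in the v2 API (a sketch; the constructor arguments are illustrative):
+
+.. code-block:: python
+
+    import paddle.v2 as paddle
+
+    # Momentum keeps roughly one extra buffer per parameter, while
+    # adadelta-style optimizers keep several, so this can shrink the
+    # optimizer state considerably.
+    optimizer = paddle.optimizer.Momentum(momentum=0.9, learning_rate=1e-3)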
+
+2. Tricks To Speed Up Training
+------------------------------
+
+The training procedure of PaddlePaddle may be sped up by considering the following aspects:
+
+* Reduce the time consumption of data loading
+* Speed up training epochs
+* Introduce more computing resources by using distributed training frameworks
+
+Reduce The Time Consumption of Data Loading
++++++++++++++++++++++++++++++++++++++++++++
+
+The :code:`pydataprovider` holds big potential to speed up data loading if you shrink its cache pool and enable memory caching. The principle of reducing the :code:`DataProvider` cache pool is the same as the memory-reduction method above: set a smaller cache pool.
+
+.. literalinclude:: src/reduce_min_pool_size.py
+
+Besides, the :code:`@provider` interface provides a parameter :code:`cache` to control caching. If it is set to :code:`CacheType.CACHE_PASS_IN_MEM`, the data after the first :code:`pass` (a pass means all data has been fed into the network for training once) will be cached in memory, and during the following passes no new data is read from the :code:`python` side; the cached data in memory is used instead. This strategy also cuts the time spent in data loading.
+
+
+Accelerating Training Epochs
+++++++++++++++++++++++++++++
+
+Sparse training is supported in PaddlePaddle. The feature types that can be trained this way are :code:`sparse_binary_vector`, :code:`sparse_vector` and :code:`integer_value`. Meanwhile, the layer that interacts with the training data needs to turn its parameter to sparse updating mode by setting :code:`sparse_update=True` (a short sketch follows the two included files below).
+Take :code:`word2vec` as an example: one needs to predict the middle word using the two words before it and the two words after it. The DataProvider of this task is:
+
+.. literalinclude:: src/word2vec_dataprovider.py
+
+The configuration of this task is:
+
+.. literalinclude:: src/word2vec_config.py
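+
+The two included files are not shown here; a minimal sketch of turning on sparse updating for an embedding layer in the v2 API (all names and sizes are illustrative):
+
+.. code-block:: python
+
+    import paddle.v2 as paddle
+
+    dict_size = 1953  # illustrative vocabulary size
+
+    word = paddle.layer.data(
+        name='word', type=paddle.data_type.integer_value(dict_size))
+    # sparse_update=True switches this parameter to sparse updating, so
+    # only the rows touched by the current mini-batch are updated.
+    emb = paddle.layer.embedding(
+        input=word,
+        size=32,
+        param_attr=paddle.attr.Param(name='emb', sparse_update=True))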
+
+Introduce More Computing Resources
+++++++++++++++++++++++++++++++++++
+
+More computing resources can be introduced in the following ways:
+
+* Single-machine CPU training
+
+  * Use multi-threading by setting :code:`trainer_count`.
+
+* Single-machine GPU training
+
+  * Set :code:`use_gpu` to train on a single GPU.
+  * Set :code:`use_gpu` and :code:`trainer_count` to enable multi-GPU training.
+
+* Cluster training
+
+  * Refer to :ref:`cluster_train`.
+
+3. Assign GPU Devices
+---------------------
+
+Assume a computing platform consisting of 4 GPUs, numbered from 0 to 3:
+
+* Method 1: specify the GPUs to be used as computing devices via
+  `CUDA_VISIBLE_DEVICES `_
+
+.. code-block:: bash
+
+    env CUDA_VISIBLE_DEVICES=2,3 paddle train --use_gpu=true --trainer_count=2
+
+* Method 2: assign via --gpu_id:
+
+.. code-block:: bash
+
+    paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
+
+
+4. How to Fix Training Termination Caused By :code:`Floating point exception`
+------------------------------------------------------------------------------
+
+The Paddle binary catches floating-point exceptions at runtime and terminates when a NaN or Inf occurs. Floating-point exceptions are mostly caused by float overflow or division by zero. There are three main reasons why such exceptions may be raised:
+
+* Parameters or gradients during training are oversized, leading to float overflow during calculation.
+* The model fails to converge and diverges to large values.
+* Parameters may converge to singular values due to bad training data. If the scale of the input data is too large and there are millions of parameter values, a float overflow error may arise in matrix multiplication.
+
+Two ways to solve this problem:
+
+1. Set :code:`gradient_clipping_threshold`:
+
+.. code-block:: python
+
+    optimizer = paddle.optimizer.RMSProp(
+        learning_rate=1e-3,
+        gradient_clipping_threshold=10.0,
+        regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+
+For details, refer to the example `nmt_without_attention `_ .
+
+2. Set :code:`error_clipping_threshold`:
+
+.. code-block:: python
+
+    decoder_inputs = paddle.layer.fc(
+        act=paddle.activation.Linear(),
+        size=decoder_size * 3,
+        bias_attr=False,
+        input=[context, current_word],
+        layer_attr=paddle.attr.ExtraLayerAttribute(
+            error_clipping_threshold=100.0))
+
+For details, refer to the example `machine translation `_ .
+
+The main differences between these two methods are:
+
+1. Both clip the gradient, but at different stages: the former applies when the :code:`optimizer` updates the network parameters, while the latter applies during the back-propagation through activation functions.
+2. The clipping targets differ: the former clips the gradients of trainable parameters, while the latter clips the gradients propagated to prior layers.
+
+Moreover, such problems may also be fixed with smaller learning rates or data normalization.
+
+5. Fetch Multiple Layers' Prediction Results With the Infer Interface
+----------------------------------------------------------------------
+
+* Pass the layers to be used as :code:`output_layer` into the :code:`paddle.inference.Inference()` interface:
+
+.. code-block:: python
+
+    inferer = paddle.inference.Inference(output_layer=[layer1, layer2], parameters=parameters)
+
+* Assign certain fields to output. Taking :code:`value` as an example, it can be done with the following code:
+
+.. code-block:: python
+
+    out = inferer.infer(input=data_batch, field=["value"])
+
+It is important to note that:
+
+* If 2 layers are assigned as output layers, then the output result consists of 2 matrices.
+* Assume the output of the first layer A is a matrix of size N1 * M1, and the output of the second layer B is a matrix of size N2 * M2.
+* By default, paddle.v2 will transversely join A and B; if N1 is not equal to N2, it raises the following error:
+
+.. code-block:: python
+
+    ValueError: all the input array dimensions except for the concatenation axis must match exactly
+
+The transverse joining of matrices from multiple layers mainly happens when:
+
+* a sequence layer and a non-sequence layer are both output;
+* multiple output layers process multiple sequences with different lengths.
+
+Such issues can be avoided by calling the infer interface with :code:`flatten_result=False`. The infer interface then returns a python list, in which
+
+* the number of elements equals the number of output layers in the network;
+* each element in the list is the result matrix of one layer, of type numpy.ndarray;
+* the height of each matrix equals the number of samples in non-sequential mode, or the number of elements in the input sequence in sequential mode. Their widths are both equal to the layer size in the configuration.
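+
+A short sketch of that workaround, reusing the :code:`inferer` defined above:
+
+.. code-block:: python
+
+    # flatten_result=False keeps per-layer results separate instead of
+    # concatenating them, so differing heights no longer raise an error.
+    outs = inferer.infer(input=data_batch, flatten_result=False)
+    for i, mat in enumerate(outs):
+        print 'layer %d output shape: %s' % (i, mat.shape)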
+
+6. Fetch the Output of a Certain Layer During Training
+------------------------------------------------------
+
+In an event_handler, the interface :code:`event.gm.getLayerOutputs("layer_name")` returns the forward output of :code:`layer_name` for the current mini-batch, organized as a :code:`numpy.ndarray`. The output can be used in custom measurements in the following way:
+
+.. code-block:: python
+
+    def score_diff(right_score, left_score):
+        return np.average(np.abs(right_score - left_score))
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 25 == 0:
+                diff = score_diff(
+                    event.gm.getLayerOutputs("right_score")["right_score"][
+                        "value"],
+                    event.gm.getLayerOutputs("left_score")["left_score"][
+                        "value"])
+                logger.info(("Pass %d Batch %d : Cost %.6f, "
+                             "average absolute diff scores: %.6f") %
+                            (event.pass_id, event.batch_id, event.cost, diff))
+
+Note: this function cannot get the content of a :code:`paddle.layer.recurrent_group` step, but the output of :code:`paddle.layer.recurrent_group` itself can be fetched.
+
+7. Fetch Parameters' Weights and Gradients During Training
+----------------------------------------------------------
+
+In certain situations, knowing the weights of the mini-batch currently being trained can provide more insight into many problems. Their values can be acquired by printing them in :code:`event_handler` (note that to obtain such parameters when training on GPU, you should use :code:`paddle.event.EndForwardBackward`). Detailed code is as follows:
+
+.. code-block:: python
+
+    ...
+    parameters = paddle.parameters.create(cost)
+    ...
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndForwardBackward):
+            if event.batch_id % 25 == 0:
+                for p in parameters.keys():
+                    logger.info("Param %s, Grad %s",
+                                parameters.get(p), parameters.get_grad(p))
+
+Note that both "fetching the output of a certain layer during training" and "fetching the weights and gradients of parameters during training" need to copy training data from the C++ environment to numpy, which has some influence on training performance. Do not use these two functions when the performance of the training procedure matters.
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md b/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
deleted file mode 120000
index c44cd9a731bed7067cdf19aa2f714abdce6c736a..0000000000000000000000000000000000000000
--- a/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
+++ /dev/null
@@ -1 +0,0 @@
-k8s_aws_en.md
\ No newline at end of file
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md b/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..afc753aa42f19631c49a451a797f28365e65ed1d
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
@@ -0,0 +1,672 @@
+# Kubernetes on AWS
+
+我们将向你展示怎样在AWS的Kubernetes集群上运行分布式PaddlePaddle训练,让我们从核心概念开始。
+
+## PaddlePaddle分布式训练的核心概念
+
+### 分布式训练任务
+
+一个分布式训练任务可以看做是一个Kubernetes任务。
+每一个Kubernetes任务都有相应的配置文件,此配置文件指定了像任务的pod个数之类的环境变量信息。
+
+在分布式训练任务中,我们可以如下操作:
+
+1. 在分布式文件系统中,准备分块数据和配置文件(在此次教学中,我们会用到亚马逊分布式存储服务(EFS))
+2. 
创建和提交一个kubernetes任务配置到集群中开始训练 + +### Parameter Server和Trainer + +在paddlepaddle集群中有两个角色:参数服务器(pserver)者和trainer, 每一个参数服务器过程都会保存一部分模型的参数。每一个trainer都保存一份完整的模型参数,并可以利用本地数据更新模型。在这个训练过程中,trainer发送模型更新到参数服务器中,参数服务器职责就是聚合这些更新,以便于trainer可以把全局模型同步到本地。 + +为了能够和pserver通信,trainer需要每一个pserver的IP地址。在Kubernetes中利用服务发现机制(比如:DNS、hostname)要比静态的IP地址要好一些,因为任何一个pod都会被杀掉然后新的pod被重启到另一个不同IP地址的node上。现在我们可以先用静态的IP地址方式,这种方式是可以更改的。 + +参数服务器和trainer一块被打包成一个docker镜像,这个镜像会运行在被Kubernetes集群调度的pod中。 + +### 训练者ID + +每一个训练过程都需要一个训练ID,以0作为基础值,作为命令行参数传递。训练过程因此用这个ID去读取数据分片。 + +### 训练 + +PaddlePaddle容器的入口是一个shell脚本,这个脚本可以读取Kubernetes内预置的环境变量。这里可以定义任务identity,在任务中identity可以用来远程访问包含所有pod的Kubernetes apiserver服务。 + +每一个pod通过ip来排序。每一个pod的序列作为“pod id”。因为我们会在每一个pod中运行训练和参数服务,可以用“pod id”作为训练ID。入口脚本详细工作流程如下: + +1. 查找apiserver得到pod信息,通过ip排序来分配一个trainer_id。 +2. 从EFS持久化卷中复制训练数据到容器中。 +3. 从环境变量中解析paddle pserver和 paddle trainer的启动参数,然后开始启动流程。 +4. 以trainer_id来训练将自动把结果写入到EFS卷中。 + + +## AWS的Kubernetes中的PaddlePaddle + +### 选择AWS服务区域 +这个教程需要多个AWS服务工作在一个区域中。在AWS创建任何东西之前,请检查链接https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/ 选择一个可以提供如下服务的区域:EC2, EFS, VPS, CloudFormation, KMS, VPC, S3。在教程中我们使用“Oregon(us-west-2)”作为例子。 + +### 创建aws账户和IAM账户 + +在每一个aws账户下可以创建多个IAM用户。允许为每一个IAM用户赋予权限,作为IAM用户可以创建/操作aws集群 + +注册aws账户,请遵循用户指南。在AWS账户下创建IAM用户和用户组,请遵循用户指南 + +请注意此教程需要如下的IAM用户权限: + +- AmazonEC2FullAccess +- AmazonS3FullAccess +- AmazonRoute53FullAccess +- AmazonRoute53DomainsFullAccess +- AmazonElasticFileSystemFullAccess +- AmazonVPCFullAccess +- IAMUserSSHKeys +- IAMFullAccess +- NetworkAdministrator +- AWSKeyManagementServicePowerUser + + +### 下载kube-aws and kubectl + +#### kube-aws + +在AWS中[kube-aws](https://github.com/coreos/kube-aws)是一个自动部署集群的CLI工具 + +##### kube-aws完整性验证 +提示:如果你用的是非官方版本(e.g RC release)的kube-aws,可以跳过这一步骤。引入coreos的应用程序签名公钥: + +``` +gpg2 --keyserver pgp.mit.edu --recv-key FC8A365E +``` + +指纹验证: + +``` +gpg2 --fingerprint FC8A365E +``` +正确的指纹是: `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E` + +我们可以从发布页面中下载kube-aws,教程使用0.9.1版本 [release page](https://github.com/coreos/kube-aws/releases). + +验证tar包的GPG签名: + +``` +PLATFORM=linux-amd64 + # Or +PLATFORM=darwin-amd64 + +gpg2 --verify kube-aws-${PLATFORM}.tar.gz.sig kube-aws-${PLATFORM}.tar.gz +``` +##### 安装kube-aws +解压: + +``` +tar zxvf kube-aws-${PLATFORM}.tar.gz +``` + +添加到环境变量: + +``` +mv ${PLATFORM}/kube-aws /usr/local/bin +``` + + +#### kubectl + +[kubectl](https://Kubernetes.io/docs/user-guide/kubectl-overview/) 是一个操作Kubernetes集群的命令行接口 + +利用`curl`工具从Kubernetes发布页面中下载`kubectl` + +``` +# OS X +curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/darwin/amd64/kubectl + +# Linux +curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/linux/amd64/kubectl +``` + +为了能是kubectl运行必须将之添加到环境变量中 (e.g. 
`/usr/local/bin`): + +``` +chmod +x ./kubectl +sudo mv ./kubectl /usr/local/bin/kubectl +``` + +### 配置AWS证书 + +首先检查这里 [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) 安装AWS命令行工具 + +然后配置aws账户信息: + +``` +aws configure +``` + + +添加如下信息: + + +``` +AWS Access Key ID: YOUR_ACCESS_KEY_ID +AWS Secrete Access Key: YOUR_SECRETE_ACCESS_KEY +Default region name: us-west-2 +Default output format: json +``` + +`YOUR_ACCESS_KEY_ID`, and `YOUR_SECRETE_ACCESS_KEY` 是创建aws账户和IAM账户的IAM的key和密码 [Create AWS Account and IAM Account](#create-aws-account-and-iam-account) + +描述任何运行在你账户中的实例来验证凭据是否工作: + +``` +aws ec2 describe-instances +``` + +### 定义集群参数 + +#### EC2秘钥对 + +秘钥对将认证ssh访问你的EC2实例。秘钥对的公钥部分将配置到每一个COREOS节点中。 + +遵循 [EC2 Keypair User Guide](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) Keypair用户指南来创建EC2秘钥对 + +你可以使用创建好的秘钥对名称来配置集群. + +在同一工作区中秘钥对为EC2实例唯一码。在教程中使用 us-west-2 ,所以请确认在这个区域(Oregon)中创建秘钥对。 + +在浏览器中下载一个`key-name.pem`文件用来访问EC2实例,我们待会会用到. + + +#### KMS秘钥 + +亚马逊的KMS秘钥在TLS秘钥管理服务中用来加密和解密集群。如果你已经有可用的KMS秘钥,你可以跳过创建新秘钥这一步,提供现存秘钥的ARN字符串。 + +利用aws命令行创建kms秘钥: + +``` +aws kms --region=us-west-2 create-key --description="kube-aws assets" +{ + "KeyMetadata": { + "CreationDate": 1458235139.724, + "KeyState": "Enabled", + "Arn": "arn:aws:kms:us-west-2:aaaaaaaaaaaaa:key/xxxxxxxxxxxxxxxxxxx", + "AWSAccountId": "xxxxxxxxxxxxx", + "Enabled": true, + "KeyUsage": "ENCRYPT_DECRYPT", + "KeyId": "xxxxxxxxx", + "Description": "kube-aws assets" + } +} +``` + +我们稍后用到`Arn` 的值. + +在IAM用户许可中添加多个内联策略. + +进入[IAM Console](https://console.aws.amazon.com/iam/home?region=us-west-2#/home)。点击`Users`按钮,点击刚才创建的用户,然后点击`Add inline policy`按钮,选择`Custom Policy` + +粘贴内联策略: + +``` + (Caution: node_0, node_1, node_2 directories represents PaddlePaddle node and train_id, not the Kubernetes node){ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "Stmt1482205552000", + "Effect": "Allow", + "Action": [ + "kms:Decrypt", + "kms:Encrypt" + ], + "Resource": [ + "arn:aws:kms:*:AWS_ACCOUNT_ID:key/*" + ] + }, + { + "Sid": "Stmt1482205746000", + "Effect": "Allow", + "Action": [ + "cloudformation:CreateStack", + "cloudformation:UpdateStack", + "cloudformation:DeleteStack", + "cloudformation:DescribeStacks", + "cloudformation:DescribeStackResource", + "cloudformation:GetTemplate", + "cloudformation:DescribeStackEvents" + ], + "Resource": [ + "arn:aws:cloudformation:us-west-2:AWS_ACCOUNT_ID:stack/MY_CLUSTER_NAME/*" + ] + } + ] +} +``` +`Version` : 值必须是"2012-10-17". +`AWS_ACCOUNT_ID`: 你可以从命令行中获取: + +``` +aws sts get-caller-identity --output text --query Account +``` + +`MY_CLUSTER_NAME`: 选择一个你喜欢的MY_CLUSTER_NAME,稍后会用到。 +请注意,堆栈名称必须是正则表达式:[a-zA-Z][-a-zA-Z0-9*]*, 在名称中不能有"_"或者"-",否则kube-aws在下面步骤中会抛出异常 + +#### 外部DNS名称 + +当集群被创建后,基于DNS名称控制器将会暴露安全的TLS API. + +DNS名称含有CNAME指向到集群DNS名称或者记录指向集群的IP地址。 + +我们稍后会用到DNS名称,如果没有DNS名称的话,你可以选择一个(比如:`paddle`)还可以修改`/etc/hosts`用本机的DNS名称和集群IP关联。还可以在AWS上增加一个名称服务来关联paddle集群IP,稍后步骤中会查找集群IP. 
+ +#### S3 bucket + +在启动Kubernetes集群前需要创建一个S3 bucket + +在AWS上创建s3 bucket会有许多的bugs,所以使用[s3 console](https://console.aws.amazon.com/s3/home?region=us-west-2)。 + +链接到 `Create Bucket`,确保在us-west-2 (Oregon)上创建一个唯一的BUCKET_NAME。 + +#### 初始化assets + +在本机创建一个目录用来存放产生的assets: + +``` +$ mkdir my-cluster +$ cd my-cluster +``` + +利用KMS Arn、秘钥对名称和前一步产生的DNS名称来初始化集群的CloudFormation栈: + +``` +kube-aws init \ +--cluster-name=MY_CLUSTER_NAME \ +--external-dns-name=MY_EXTERNAL_DNS_NAME \ +--region=us-west-2 \ +--availability-zone=us-west-2a \ +--key-name=KEY_PAIR_NAME \ +--kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx" +``` + +`MY_CLUSTER_NAME`: the one you picked in [KMS key](#kms-key) + +`MY_EXTERNAL_DNS_NAME`: see [External DNS name](#external-dns-name) + +`KEY_PAIR_NAME`: see [EC2 key pair](#ec2-key-pair) + +`--kms-key-arn`: the "Arn" in [KMS key](#kms-key) + +这里的`us-west-2a`用于参数`--availability-zone`,但必须在AWS账户的有效可用区中 + +如果不能切换到其他的有效可用区(e.g., `us-west-2a`, or `us-west-2b`),请检查`us-west-2a`是支持`aws ec2 --region us-west-2 describe-availability-zones`。 + +现在在asset目录中就有了集群的主配置文件cluster.yaml。 + +默认情况下kube-aws会创建一个工作节点,修改`cluster.yaml`让`workerCount`从1个节点变成3个节点. + +#### 呈现asset目录内容 + +在这个简单的例子中,你可以使用kuber-aws生成TLS身份和证书 + +``` +kube-aws render credentials --generate-ca +``` + +下一步在asset目录中生成一组集群assets. + +``` +kube-aws render stack +``` +asserts(模板和凭证)用于创建、更新和当前目录被创建的Kubernetes集群相关联 + +### 启动Kubernetes集群 + +#### 创建一个在CloudFormation模板上定义好的实例 + +现在让我们创建集群(在命令行中选择任意的 `PREFIX`) + +``` +kube-aws up --s3-uri s3://BUCKET_NAME/PREFIX +``` + +`BUCKET_NAME`: t在[S3 bucket](#s3-bucket)上使用的bucket名称 + + +#### 配置DNS + +你可以执行命令 `kube-aws status`来查看创建后集群的API. + +``` +$ kube-aws status +Cluster Name: paddle-cluster +Controller DNS Name: paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com +``` +如果你用DNS名称,在ip上设置任何记录或是安装CNAME点到`Controller DNS Name` (`paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com`) + +##### 查询IP地址 + +用命令`dig`去检查负载均衡器的域名来获取ip地址. + +``` +$ dig paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com + +;; QUESTION SECTION: +;paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. IN A + +;; ANSWER SECTION: +paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.241.164.52 +paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.67.102.112 +``` + +在上面的例子中,`54.241.164.52`, `54.67.102.112`这两个ip都将是工作状态 + +*如果你有DNS名称*,设置记录到ip上,然后你可以跳过“Access the cluster”这一步 + +*如果没有自己的DNS名称* + +编辑/etc/hosts文件用DNS关联IP + +##### 更新本地的DNS关联 +编辑`/etc/hosts`文件用DNS关联IP +##### 在VPC上添加route53私有名称服务 + - 打开[Route53 Console](https://console.aws.amazon.com/route53/home) + - 根据配置创建域名zone + - domain名称为: "paddle" + - Type: "Private hosted zone for amazon VPC" + - VPC ID: `` + + ![route53 zone setting](src/route53_create_zone.png) + - 添加记录 + - 点击zone中刚创建的“paddle” + - 点击按钮“Create record set” + - Name : leave blank + - type: "A" + - Value: `` + + ![route53 create recordset](src/route53_create_recordset.png) + - 检查名称服务 + - 连接通过kube-aws via ssh创建的任何实例 + - 运行命令"host paddle",看看是否ip为返回的kube-controller的私有IP + +#### 进入集群 + +集群运行后如下命令会看到: + +``` +$ kubectl --kubeconfig=kubeconfig get nodes +NAME STATUS AGE +ip-10-0-0-134.us-west-2.compute.internal Ready 6m +ip-10-0-0-238.us-west-2.compute.internal Ready 6m +ip-10-0-0-50.us-west-2.compute.internal Ready 6m +ip-10-0-0-55.us-west-2.compute.internal Ready 6m +``` + + +### 集群安装弹性文件系统 + +训练数据存放在AWS上的EFS分布式文件系统中. + +1. 
### Elastic File System for the cluster

Training data is stored on an EFS distributed file system on AWS.

1. Create a security group for EFS in the [security group console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId):
   1. Look up the security group id of `paddle-cluster-sg-worker` (`sg-055ee37d` in the image below)
![worker security group](src/worker_security_group.png)
   2. Add a security group `paddle-efs` with an `ALL TCP` inbound rule whose source is the group id of `paddle-cluster-sg-worker`. Attach it to VPC `paddle-cluster-vpc`, and make sure the availability zone is the one you used in [Initialize assets](#initialize-assets).
![add security group](src/add_security_group.png)
2. Create an Elastic File System in the [EFS console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2#/wizard/1) inside the `paddle-cluster-vpc` private network. Make sure the subnet is `paddle-cluster-Subnet0` and the security group is `paddle-efs`.
![create efs](src/create_efs.png)
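Optionally, a hypothetical CLI check (assuming your AWS CLI version includes the `efs` service) can list the file system and let you derive its DNS name, which we will need below:

```
# List EFS file systems and note the FileSystemId (e.g., fs-2cbf7385);
# the mount DNS name has the form <FileSystemId>.efs.<region>.amazonaws.com
aws efs describe-file-systems --region us-west-2
```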
### Start PaddlePaddle training on AWS

#### Configure a Kubernetes volume that points to EFS

First, create a [PersistentVolume](https://kubernetes.io/docs/user-guide/persistent-volumes/) on top of EFS.

Save the following as `pv.yaml`:
```
apiVersion: v1
kind: PersistentVolume
metadata:
  name: efsvol
spec:
  capacity:
    storage: 100Gi
  accessModes:
    - ReadWriteMany
  nfs:
    server: EFS_DNS_NAME
    path: "/"
```

`EFS_DNS_NAME`: the DNS name of the `paddle-efs` file system we created; it looks like `fs-2cbf7385.efs.us-west-2.amazonaws.com`.

Run the following command to create the persistent volume:
```
kubectl --kubeconfig=kubeconfig create -f pv.yaml
```
Next, create a [PersistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/) to claim the persistent volume.

Save the following as `pvc.yaml`:
```
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: efsvol
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 50Gi
```

Run the following command to create the persistent volume claim:
```
kubectl --kubeconfig=kubeconfig create -f pvc.yaml
```

#### Prepare training data

Start a Kubernetes job to download the training data onto the persistent volume we just created, and split it evenly into 3 shards.

Save the following as `paddle-data-job.yaml`:
```
apiVersion: batch/v1
kind: Job
metadata:
  name: paddle-data
spec:
  template:
    metadata:
      name: pi
    spec:
      containers:
      - name: paddle-data
        image: paddlepaddle/paddle-tutorial:k8s_data
        imagePullPolicy: Always
        volumeMounts:
        - mountPath: "/efs"
          name: efs
        env:
        - name: OUT_DIR
          value: /efs/paddle-cluster-job
        - name: SPLIT_COUNT
          value: "3"
      volumes:
        - name: efs
          persistentVolumeClaim:
            claimName: efsvol
      restartPolicy: Never
```

Run the following command to start the job:
```
kubectl --kubeconfig=kubeconfig create -f paddle-data-job.yaml
```
The job takes about 7 minutes. Use the following command to check its status; the job has succeeded once the `SUCCESSFUL` column of the `paddle-data` job shows `1`:
```
$ kubectl --kubeconfig=kubeconfig get jobs
NAME          DESIRED   SUCCESSFUL   AGE
paddle-data   1         1            6m
```
The data preparation is performed by the image `paddlepaddle/paddle-tutorial:k8s_data`; see [here](src/k8s_data/README.md) for the source code used to build this Docker image.

#### Start training

Now we can start the PaddlePaddle training job. Save the following as `paddle-cluster-job.yaml`:
```
apiVersion: batch/v1
kind: Job
metadata:
  name: paddle-cluster-job
spec:
  parallelism: 3
  completions: 3
  template:
    metadata:
      name: paddle-cluster-job
    spec:
      volumes:
      - name: efs
        persistentVolumeClaim:
          claimName: efsvol
      containers:
      - name: trainer
        image: paddlepaddle/paddle-tutorial:k8s_train
        command: ["bin/bash",  "-c", "/root/start.sh"]
        env:
        - name: JOB_NAME
          value: paddle-cluster-job
        - name: JOB_PATH
          value: /home/jobpath
        - name: JOB_NAMESPACE
          value: default
        - name: TRAIN_CONFIG_DIR
          value: quick_start
        - name: CONF_PADDLE_NIC
          value: eth0
        - name: CONF_PADDLE_PORT
          value: "7164"
        - name: CONF_PADDLE_PORTS_NUM
          value: "2"
        - name: CONF_PADDLE_PORTS_NUM_SPARSE
          value: "2"
        - name: CONF_PADDLE_GRADIENT_NUM
          value: "3"
        - name: TRAINER_COUNT
          value: "3"
        volumeMounts:
        - mountPath: "/home/jobpath"
          name: efs
        ports:
        - name: jobport0
          hostPort: 7164
          containerPort: 7164
        - name: jobport1
          hostPort: 7165
          containerPort: 7165
        - name: jobport2
          hostPort: 7166
          containerPort: 7166
        - name: jobport3
          hostPort: 7167
          containerPort: 7167
      restartPolicy: Never
```

`parallelism: 3, completions: 3` means the job starts 3 PaddlePaddle pods at the same time, and the job is complete once all 3 pods finish.

`env` declares the container's environment variables, which specify the PaddlePaddle parameters here.

`ports` exposes TCP ports 7164 - 7167 for connections with the `pserver`, from `CONF_PADDLE_PORT` (7164) to `CONF_PADDLE_PORT + CONF_PADDLE_PORTS_NUM + CONF_PADDLE_PORTS_NUM_SPARSE - 1` (7167). Multiple ports are used for dense and sparse parameter updates to reduce latency.

Run the following command to start the job.
```
kubectl --kubeconfig=kubeconfig create -f paddle-cluster-job.yaml
```

Check the pods:

```
$ kubectl --kubeconfig=kubeconfig get pods
NAME                       READY     STATUS    RESTARTS   AGE
paddle-cluster-job-cm469   1/1       Running   0          9m
paddle-cluster-job-fnt03   1/1       Running   0          9m
paddle-cluster-job-jx4xr   1/1       Running   0          9m
```

Check the console output of a specific pod:
```
kubectl --kubeconfig=kubeconfig log -f POD_NAME
```

`POD_NAME`: the name of any one pod (e.g., `paddle-cluster-job-cm469`).

Run `kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job` to check the status of the training job, which completes in about 20 minutes.

The details of `pserver` and `trainer` are encapsulated in the Docker image `paddlepaddle/paddle-tutorial:k8s_train`; see [here](src/k8s_train/README.md) for the source code used to build this image.

#### Check training output

The training output (model snapshots and logs) is saved on EFS. We can SSH into an EC2 worker node to inspect the mounted EFS and the training output.

1. SSH into an EC2 worker node
```
chmod 400 key-name.pem
ssh -i key-name.pem core@INSTANCE_IP
```

`INSTANCE_IP`: the public IP address of a Kubernetes worker node on EC2. Go to the [EC2 console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#Instances:sort=instanceId) and check the `public IP` of any `paddle-cluster-kube-aws-worker` instance.

2. Mount EFS
```
mkdir efs
sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 EFS_DNS_NAME:/ efs
```

`EFS_DNS_NAME`: the DNS name of the `paddle-efs` file system we created; it looks like `fs-2cbf7385.efs.us-west-2.amazonaws.com`.

The `efs` folder contains a node structure similar to the following (caution: the node_0, node_1, node_2 directories represent the PaddlePaddle node and train_id, not the Kubernetes node):
```
-- paddle-cluster-job
    |-- ...
    |-- output
    |   |-- node_0
    |   |   |-- server.log
    |   |   `-- train.log
    |   |-- node_1
    |   |   |-- server.log
    |   |   `-- train.log
    |   |-- node_2
    |   |   |-- server.log
    |   |   `-- train.log
    |   |-- pass-00000
    |   |   |-- ___fc_layer_0__.w0
    |   |   |-- ___fc_layer_0__.wbias
    |   |   |-- done
    |   |   |-- path.txt
    |   |   `-- trainer_config.lr.py
    |   |-- pass-00001...
```
`server.log` is the log of the `pserver`, and `train.log` is the log of the `trainer`. Model snapshots and descriptions are stored in `pass-0000*`.

### Tear down the Kubernetes cluster

#### Delete EFS

Go to the [EFS Console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2) and delete the EFS volume we created.

#### Delete the security group

Go to the [Security Group Console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId) and delete the security group `paddle-efs`.

#### Delete the S3 bucket

Go to the [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2#) and delete the S3 bucket.

#### Destroy the cluster

```
kube-aws destroy
```

The command returns immediately, but it takes about 5 minutes to destroy the cluster.

You can go to the [CloudFormation Console](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks?filter=active) to check the progress of the destruction.
diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
index b05b66415fbb829f471b1491b9881f65137bfe17..67c7b774e9c476a3035037a421c84ebf17a31b09 100644
--- a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
+++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
@@ -134,7 +134,7 @@

**Unequal-length inputs** means that the multiple input sequences of recurrent_group may have subsequences of different lengths at each time step, but the output must be declared consistent with the sequence information of one particular input. \ :red:`targetInlink`\ specifies which input the output's sequence information should be consistent with; by default it is the first input.

The configurations of Example 3 are \ `single-layer unequal-length RNN `_\ and \ `hierarchical unequal-length RNN `_\ .

In Example 3, the data is exactly the same for the single-layer RNN and the hierarchical RNN.
diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst
index e5aa05c117393e81c557ba67609f787b38587efd..ae997f0805db5b01a34867c9e8b188c931721920 100644
--- a/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst
+++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst
@@ -1,4 +1,226 @@
+.. _algo_hrnn_rnn_api_compare:
+
+################################################
 API comparison between RNN and hierarchical RNN
-================================================
+################################################
+
This article takes PaddlePaddle's hierarchical RNN unit test as an example.
We will use several examples to illustrate the usage of single-layer and hierarchical RNNs. Each example has two model configurations, one for a single-layer RNN and the other for a hierarchical RNN. Although the implementations differ, the two model configurations of each pair are equivalent in effect. All of the examples in this article only describe the API interface of the hierarchical RNN; we do not use hierarchical RNNs to solve practical problems here. If you want to understand the use of hierarchical RNNs in specific problems, please refer to \ :ref:`algo_hrnn_demo`\ . The unit test file used in this article's examples is \ `test_RecurrentGradientMachine.cpp `_\ .

Example 1: Hierarchical RNN without Memory between subsequences
===============================================================

The classical case for a hierarchical RNN is to perform sequence operations on each of the inner time series separately, with the operations on the inner layers independent of one another, that is, without Memory.

In this example, both the single-layer and the hierarchical RNN configurations use an LSTM as an encoder to compress a word-segmented sentence into a vector. The difference is that the hierarchical RNN treats multiple sentences as a whole and encodes them simultaneously. The two configurations are completely consistent in their semantics. This pair of semantically identical example configurations is as follows:

* Single-layer RNN\: `sequence_layer_group.conf `_
* Hierarchical RNN\: `sequence_nest_layer_group.conf `_


Reading hierarchical sequence data
----------------------------------

Firstly, the original data in this example is as follows\:

- The original data in this example has 10 samples. Each sample includes two components: a label (always 2 here) and a word-segmented sentence. This data is used by the single-layer RNN as well.

.. literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg
  :language: text


- The data for the hierarchical RNN has 4 samples. Samples are separated by blank lines, while the content of the data is the same as the original data. As for the hierarchical LSTM, the first sample encodes two sentences into two vectors simultaneously. The numbers of sentences handled simultaneously by these 4 samples are \ :code:`[2, 3, 2, 3]`\ .

.. literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg.nest
  :language: text

Secondly, for these two different input data formats, the corresponding DataProviders are contrasted below (`sequenceGen.py `_)\:

.. literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py
   :language: python
   :lines: 21-39
   :linenos:

- This is the DataProvider code for an ordinary single-layer time series. Its description is as follows:

  * The DataProvider returns two parts, "words" and "label", as shown in line 19 of the above code.

    - "words" is a list of word table indices corresponding to each word in the sentence of the original data. Its data type is integer_value_sequence, that is, a list of integers. So "words" is a single-layer time series in the data.
    - "label" is the categorical label of each sentence, whose data type is integer_value.
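Since the included file may not render here, the following is a condensed, hypothetical sketch of such a single-layer provider (assuming the PyDataProvider2 API; the dictionary handling is simplified and the vocabulary size is illustrative):

.. code-block:: python

   from paddle.trainer.PyDataProvider2 import (
       provider, integer_value_sequence, integer_value)

   # Assumed sketch: one integer sequence ("words") plus one integer label.
   @provider(input_types=[integer_value_sequence(10000), integer_value(3)])
   def process(settings, file_name):
       with open(file_name) as f:
           for line in f:
               label, sentence = line.strip().split('\t')
               # map each word to a (hypothetical) word-table index
               words = [hash(w) % 10000 for w in sentence.split()]
               yield words, int(label)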
.. literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py
   :language: python
   :lines: 42-71
   :linenos:

- For the same data, the DataProvider code for the hierarchical time series is described as follows:

  - The DataProvider returns two lists of data, "sentences" and "labels", corresponding to the sentences and labels of each group in the original hierarchical time series data.
  - "sentences" comes from the hierarchical time series original data. It contains every sentence of each group, and each sentence is represented by a list of word table indices, so its data type is integer_value_sub_sequence, which is a hierarchical time series.
  - "labels" is the categorical label of each sentence, so it is a single-layer time series.


Model configuration
-------------------

Firstly, let's look at the configuration of the single-layer RNN. The highlighted part, lines 9 to 15, is the usage of the single-layer RNN. Here we use the pre-defined RNN process function of PaddlePaddle, in which the RNN passes through an LSTM network at each time step.

.. literalinclude:: ../../../../paddle/gserver/tests/sequence_layer_group.conf
   :language: python
   :lines: 38-63
   :linenos:
   :emphasize-lines: 9-15


Secondly, let's look at the semantically identical model configuration of the hierarchical RNN\:

* Most layers in PaddlePaddle do not care whether the input is a time series or not, e.g. \ :code:`embedding_layer`\ . In these layers, every operation is processed on each time step.

* In the highlighted part, lines 7 to 26 of this configuration, we transform the hierarchical time series data into single-layer time series data, then process each single-layer time series.

  * The transformation is done with the function \ :code:`recurrent_group`\ . Input sequences need to be passed in when transforming. As we want to transform hierarchical time series into single-layer sequences, we need to mark the input data as \ :code:`SubsequenceInput`\ .

  * In this example, we disassemble every group of the original data into sentences using \ :code:`recurrent_group`\ . Each of the disassembled sentences passes through an LSTM network, which is equivalent to the single-layer RNN configuration.

* Similar to the single-layer RNN configuration, we only use the last vector of the LSTM encoding. So we apply the operation \ :code:`last_seq`\ to the output of \ :code:`recurrent_group`\ . But unlike the single-layer RNN, we use the last element of every subsequence, so we need to set \ :code:`agg_level=AggregateLevel.TO_SEQUENCE`\ .

* At this point, \ :code:`lstm_last`\ has the same result as the \ :code:`lstm_last`\ in the single-layer RNN configuration.

.. literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_layer_group.conf
   :language: python
   :lines: 38-64
   :linenos:
   :emphasize-lines: 7-26

Example 2: Hierarchical RNN with Memory between subsequences
============================================================

This example implements two fully-equivalent fully-connected RNNs, one using a single-layer RNN and one using a hierarchical RNN.

* For the single-layer RNN, the input is a full time series, e.g. \ :code:`[4, 5, 2, 0, 9, 8, 1, 4]`\ .

* For the hierarchical RNN, the input is a hierarchical time series whose elements are an arbitrary partition of the single-layer RNN's data, e.g. \ :code:`[[4, 5, 2], [0, 9], [8, 1, 4]]`.
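The equivalence of the two inputs is easy to check, as in this small sketch (values taken from the example above):

.. code-block:: python

   # Same elements, different nesting: flattening the hierarchical input
   # recovers the single-layer input exactly.
   flat = [4, 5, 2, 0, 9, 8, 1, 4]
   nested = [[4, 5, 2], [0, 9], [8, 1, 4]]
   assert [x for sub in nested for x in sub] == flat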
Model configuration
-------------------

We select the differing parts of the single-layer and hierarchical RNN configurations, to compare and analyze why they have the same semantics.

- Single-layer RNN: passes through a simple recurrent_group. For each time step, the current input y and the last time step's output rnn_state pass through a fully-connected layer.

.. literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn.conf
   :language: python
   :lines: 36-48

- Hierarchical RNN: the outer layer's memory is an element.

  - The recurrent_group of the inner layer's inner_step is nearly the same as the single-layer sequence, except for the case of boot_layer=outer_mem, which means using the outer layer's outer_mem as the initial state of the inner layer's memory. In the outer layer's out_step, outer_mem is the last vector of a subsequence, that is, the whole hierarchical group uses the last vector of the previous subsequence as the initial state of the next subsequence's memory.
  - From the perspective of the input data, the sentences for the single-layer and the hierarchical RNN are the same. The only difference is that the hierarchical RNN disassembles the sequence into subsequences. So in the hierarchical RNN configuration, we must use the last element of the previous subsequence as a boot_layer for the memory of the next subsequence, so that it is no different from "every time step uses the output of the last time step" in the single-layer RNN configuration.

.. literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn.conf
   :language: python
   :lines: 39-66

.. warning::
   Currently PaddlePaddle only supports the case in which the lengths of the time series of the Memory are the same at each time step.

Example 3: Hierarchical RNN with unequal-length inputs
======================================================

.. role:: red

.. raw:: html

   <style> .red {color:red} </style>

**Unequal-length inputs** means that the multiple input sequences of recurrent_group may have subsequences of unequal lengths at each time step, but the output of the sequence needs to be consistent with one of the input sequences. \ :red:`targetInlink`\ specifies which input the output sequence should be consistent with; by default it is the first input.

The configurations of Example 3 are \ `sequence_rnn_multi_unequalength_inputs `_\ and \ `sequence_nest_rnn_multi_unequalength_inputs `_\ .

The data for Example 3's single-layer RNN and hierarchical RNN configurations is exactly the same.

* For the single-layer RNN, the data has two samples, \ :code:`[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]`\ and \ :code:`[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]`\ . Each sample of the single-layer RNN data has two groups of features.

* On the basis of the single-layer data, the hierarchical RNN's data randomly adds some partitions. For example, the first sample is transformed to \ :code:`[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]`\ .

* Note that PaddlePaddle currently only supports multi-input hierarchical RNNs in which all inputs have the same number of subsequences. In this example, the two features both have 3 subsequences. Although the length of each subsequence may differ, the number of subsequences must be the same.
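A small runnable sketch of these constraints (values taken from the example above; the layout rule is the one :code:`targetInlink` enforces):

.. code-block:: python

   # Two inputs of one hierarchical sample: equal subsequence counts (3 each),
   # unequal subsequence lengths; the output layout follows the targetInlink input.
   feat1 = [[0, 2], [2, 5], [0, 1, 2]]
   feat2 = [[1, 5], [4], [2, 3, 6, 1]]
   assert len(feat1) == len(feat2)                             # counts must match
   assert [len(s) for s in feat1] != [len(s) for s in feat2]   # lengths may differ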
Model configuration
-------------------

Similar to Example 2, Example 3's configurations use a single-layer and a hierarchical RNN to implement two fully-equivalent fully-connected RNNs.

* Single-layer RNN\:

.. literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
   :language: python
   :lines: 42-59
   :linenos:

* Hierarchical RNN\:

.. literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
   :language: python
   :lines: 41-80
   :linenos:

In the above code, the usage of the single-layer and hierarchical RNNs is similar to Example 2, the difference being that they process 2 inputs simultaneously. For the hierarchical RNN, the subsequence lengths of the 2 inputs are not equal, but we use the parameter \ :code:`targetInlink` \ to set the output format of the outer layer's \ :code:`recurrent_group` \ , so the shape of the outer layer's output is the same as the shape of \ :code:`emb2`\ .


Glossary
========

.. _glossary_memory:

Memory
------

Memory is a concept in PaddlePaddle's implementation of RNNs. An RNN, or recurrent neural network, usually requires some dependency between time steps, that is, the neural network of the current time step depends on one of the neurons of the neural network of a previous time step, as the following figure shows:

.. graphviz:: src/glossary_rnn.dot

The dotted connections in the figure are the network connections across time steps. When implementing RNNs, PaddlePaddle represents this cross-time-step connection with a special neural network unit called Memory. Memory caches the output of a neuron of the previous time step and passes it to a neuron of the next time step. An RNN implemented with Memory looks as follows:

.. graphviz:: src/glossary_rnn_with_memory.dot

With this method, PaddlePaddle can easily determine which outputs should cross time steps and which should not.

.. _glossary_timestep:

time step
---------

See time series.


.. _glossary_sequence:

time series
-----------

A time series is a series of featured data in which the order is meaningful: it is a list of features, not a set of features. Each element of this list, i.e. each featured datum of the series, is called a time step. It must be noted that the concepts of time series and time steps are not necessarily related to "time"; as long as the "order" of a series of featured data is meaningful, it can be the input of a time series.

For example, in a text classification task, we regard a sentence as a time series. Each word of the sentence becomes the index of that word in the word table, so the sentence can be represented as a list of those indices, e.g. :code:`[9, 2, 3, 5, 3]`.

For a more detailed and accurate definition of time series, please refer to `Wikipedia of Time series `_ or the `Chinese Wikipedia of time series `_ .

In addition, Paddle always refers to a time series as a :code:`Sequence`. The two are the same concept in Paddle's documentation and APIs.

.. _glossary_RNN:

RNN
---

In PaddlePaddle's documentation, RNN usually stands for :code:`Recurrent neural network`. For more information, please refer to `Wikipedia Recurrent neural network `_ or the `Chinese Wikipedia `_ .

In PaddlePaddle, an RNN usually means that, for the input data of a time series, the neural networks of adjacent time steps are related. For example, the input of a certain neuron is the output of a certain neuron of the neural network of the previous time step, or, for each time step, the network structure of the neural network contains a directed cycle.

.. _glossary_hierarchical_RNN:

hierarchical RNN
----------------

A hierarchical RNN, as the name suggests, has a nested relationship among its RNNs.
The input data is a time series, but each of its inner elements is also a time series: namely a 2-dimensional array, or an array of arrays. A hierarchical RNN is a neural network that can process this type of input data.

For example, consider the task of classifying a paragraph of sentences. We can treat the paragraph as an array of sentences, and each sentence as an array of words; this is the type of input data for a hierarchical RNN. We encode each sentence of the paragraph into a vector using an LSTM, then encode the resulting vectors into a paragraph vector using another LSTM, and finally use the paragraph vector to perform classification. This is the neural network structure of this hierarchical RNN.
-TBD
diff --git a/doc/v2/howto/rnn/index_en.rst b/doc/v2/howto/rnn/index_en.rst
index e1b20ef2e7bf4c521b613e54577ff6a3feaa8936..6e8b5c61b23ca2725dc0c9761c8dd4165033973c 100644
--- a/doc/v2/howto/rnn/index_en.rst
+++ b/doc/v2/howto/rnn/index_en.rst
@@ -1,10 +1,32 @@
 RNN Models
 ==========
+Recurrent neural networks (RNN) are an important tool for modeling sequential data. PaddlePaddle provides a flexible interface for building complex recurrent neural networks. We will demonstrate how to use PaddlePaddle to build RNN models in the following 4 parts.
+
+In the first part, we will guide you through configuring recurrent neural networks in PaddlePaddle, from simple to complex. First, we will use a vanilla recurrent neural network as an example to show how to configure a recurrent neural network architecture. Then we will use the sequence-to-sequence model as an example to demonstrate how to configure complex recurrent neural network models step by step.

 .. toctree::
   :maxdepth: 1

   rnn_config_en.rst
+
+Recurrent Group is the key unit for building complex recurrent neural network models. The second part describes the related concepts and basic principles of Recurrent Group, and gives a detailed description of the Recurrent Group API. It also introduces sequence-level RNNs (hierarchical sequences as input) and the usage of Recurrent Group with them.
+
+.. toctree::
+  :maxdepth: 1
+
+  recurrent_group_en.md
+
+In the third part, two-level sequences are demonstrated briefly, and then the layers supporting two-level sequences as input are listed and described.
+
+.. toctree::
+  :maxdepth: 1
+
+  hierarchical_layer_en.rst
+
+In the last part, the unit test of the hierarchical RNN is presented as an example of how to use hierarchical RNNs. The unit test uses a two-level sequence RNN and a single-layer sequence RNN that produce the same results as the network configurations.
+
+.. toctree::
+  :maxdepth: 1
+
+  hrnn_rnn_api_compare_en.rst
+
diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc
index b8847e4b909cbab67b2ddb6885b45b73d402de19..9f753478d8ecf12441d4b1745a9f6750a1038e31 100644
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -146,6 +146,7 @@ void BlockDesc::RemoveOp(size_t s, size_t e) {
   if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
     return;
   }
+  need_update_ = true;
   ops_.erase(ops_.begin() + s, ops_.begin() + e);
 }
diff --git a/paddle/fluid/framework/blocking_queue.h b/paddle/fluid/framework/blocking_queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..a19558c0ae59005bee575e8c469c7f95d8780ab1
--- /dev/null
+++ b/paddle/fluid/framework/blocking_queue.h
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors.
All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <condition_variable>  // NOLINT
+#include <deque>
+#include <mutex>  // NOLINT
+#include <vector>
+
+namespace paddle {
+namespace framework {
+
+template <typename T>
+class BlockingQueue {
+ public:
+  void Push(const T &item) {
+    {
+      std::lock_guard<std::mutex> g(mutex_);
+      q_.emplace_back(item);
+    }
+    cv_.notify_one();
+  }
+
+  template <typename U>
+  void Extend(const U &items) {
+    {
+      std::lock_guard<std::mutex> g(mutex_);
+      for (auto &item : items) {
+        q_.emplace_back(item);
+      }
+    }
+    cv_.notify_all();
+  }
+
+  std::deque<T> PopAll(size_t ms, bool *timeout) {
+    auto time =
+        std::chrono::system_clock::now() + std::chrono::milliseconds(ms);
+    std::unique_lock<std::mutex> lock(mutex_);
+    *timeout = !cv_.wait_until(lock, time, [this] { return !q_.empty(); });
+    std::deque<T> ret;
+    if (!*timeout) {
+      std::swap(ret, q_);
+    }
+    return ret;
+  }
+
+  T Pop() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    cv_.wait(lock, [=] { return !q_.empty(); });
+    T rc(std::move(q_.front()));
+    q_.pop_front();
+    return rc;
+  }
+
+ private:
+  std::mutex mutex_;
+  std::condition_variable cv_;
+  std::deque<T> q_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/concurrency_test.cc b/paddle/fluid/framework/concurrency_test.cc
index e98e9d94bf71fe9ac226ab3ad7f587b37a5c6e33..bbf67f5ba92150f70cf45d49e3f4ca0a16393541 100644
--- a/paddle/fluid/framework/concurrency_test.cc
+++ b/paddle/fluid/framework/concurrency_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
*/ -#include +#include // NOLINT #include "gtest/gtest.h" #include "paddle/fluid/framework/block_desc.h" @@ -40,10 +40,10 @@ namespace paddle { namespace framework { template -LoDTensor *CreateVariable(Scope &scope, p::CPUPlace &place, std::string name, - T value) { +LoDTensor *CreateVariable(Scope *scope, const p::CPUPlace &place, + std::string name, T value) { // Create LoDTensor of dim [1] - auto var = scope.Var(name); + auto var = scope->Var(name); auto tensor = var->GetMutable(); tensor->Resize({1}); T *expect = tensor->mutable_data(place); @@ -77,9 +77,9 @@ void AddCase(ProgramDesc *program, Scope *scope, p::CPUPlace *place, BlockDesc *caseBlock = program->AppendBlock(*casesBlock); func(caseBlock, scope); - CreateVariable(*scope, *place, caseCondName, false); - CreateVariable(*scope, *place, caseCondXVarName, caseId); - CreateVariable(*scope, *place, caseVarName, caseId); + CreateVariable(scope, *place, caseCondName, false); + CreateVariable(scope, *place, caseCondXVarName, caseId); + CreateVariable(scope, *place, caseVarName, caseId); scope->Var("step_scope"); @@ -96,21 +96,21 @@ void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program, std::string quitChanName) { BlockDesc *whileBlock = program->AppendBlock(*parentBlock); - CreateVariable(*scope, *place, "whileExitCond", true); - CreateVariable(*scope, *place, "caseToExecute", -1); - CreateVariable(*scope, *place, "case1var", 0); + CreateVariable(scope, *place, "whileExitCond", true); + CreateVariable(scope, *place, "caseToExecute", -1); + CreateVariable(scope, *place, "case1var", 0); - CreateVariable(*scope, *place, "xtemp", 0); + CreateVariable(scope, *place, "xtemp", 0); // TODO(thuan): Need to create fibXToSend, since channel send moves the actual // data, // which causes the data to be no longer accessible to do the fib calculation // TODO(abhinav): Change channel send to do a copy instead of a move! - CreateVariable(*scope, *place, "fibXToSend", 0); + CreateVariable(scope, *place, "fibXToSend", 0); - CreateVariable(*scope, *place, "fibX", 0); - CreateVariable(*scope, *place, "fibY", 1); - CreateVariable(*scope, *place, "quitVar", 0); + CreateVariable(scope, *place, "fibX", 0); + CreateVariable(scope, *place, "fibY", 1); + CreateVariable(scope, *place, "quitVar", 0); BlockDesc *casesBlock = program->AppendBlock(*whileBlock); std::function f = [](BlockDesc *caseBlock) {}; @@ -138,7 +138,7 @@ void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program, // Exit the while loop after we receive from quit channel. 
// We assign a false to "whileExitCond" variable, which will // break out of while_op loop - CreateVariable(*scope, *place, "whileFalse", false); + CreateVariable(scope, *place, "whileFalse", false); AddOp("assign", {{"X", {"whileFalse"}}}, {{"Out", {"whileExitCond"}}}, {}, caseBlock); }; @@ -174,9 +174,9 @@ TEST(Concurrency, Go_Op) { // Create Variables, x0 will be put into channel, // result will be pulled from channel - CreateVariable(scope, place, "Status", false); - CreateVariable(scope, place, "x0", 99); - CreateVariable(scope, place, "result", 0); + CreateVariable(&scope, place, "Status", false); + CreateVariable(&scope, place, "x0", 99); + CreateVariable(&scope, place, "result", 0); framework::Executor executor(place); ProgramDesc program; @@ -226,9 +226,9 @@ TEST(Concurrency, Select) { // Initialize scope variables p::CPUDeviceContext ctx(place); - CreateVariable(scope, place, "Status", false); - CreateVariable(scope, place, "result", 0); - CreateVariable(scope, place, "currentXFib", 0); + CreateVariable(&scope, place, "Status", false); + CreateVariable(&scope, place, "result", 0); + CreateVariable(&scope, place, "currentXFib", 0); framework::Executor executor(place); ProgramDesc program; @@ -246,7 +246,7 @@ TEST(Concurrency, Select) { {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block); // Create Go Op routine, which loops 10 times over fibonacci sequence - CreateVariable(scope, place, "xReceiveVar", 0); + CreateVariable(&scope, place, "xReceiveVar", 0); BlockDesc *goOpBlock = program.AppendBlock(program.Block(0)); for (int i = 0; i < 10; ++i) { @@ -264,7 +264,7 @@ TEST(Concurrency, Select) { goOpBlock); } - CreateVariable(scope, place, "quitSignal", 0); + CreateVariable(&scope, place, "quitSignal", 0); AddOp("channel_send", {{"Channel", {quitChanName}}, {"X", {"quitSignal"}}}, {{"Status", {"Status"}}}, {}, goOpBlock); diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index a66525303da58601f85c40c41854edaf22c3d4ea..df4caa45eba2470f7528d2fbd99cca39cae0b596 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -103,9 +103,7 @@ static void BuildVar(const std::string& param_name, } TEST(Operator, CPUtoGPU) { - using namespace paddle::framework; - using namespace paddle::platform; - InitDevices(true); + paddle::framework::InitDevices(true); paddle::framework::Scope scope; paddle::platform::CPUPlace cpu_place; @@ -118,8 +116,9 @@ TEST(Operator, CPUtoGPU) { auto cpu_op = paddle::framework::OpRegistry::CreateOp(cpu_op_desc); // prepare input - auto* in_t = scope.Var("IN1")->GetMutable(); - auto* src_ptr = in_t->mutable_data({2, 3}, CPUPlace()); + auto* in_t = scope.Var("IN1")->GetMutable(); + auto* src_ptr = + in_t->mutable_data({2, 3}, paddle::platform::CPUPlace()); for (int i = 0; i < 2 * 3; ++i) { src_ptr[i] = static_cast(i); } @@ -128,7 +127,7 @@ TEST(Operator, CPUtoGPU) { auto* output = scope.Var("OUT1"); cpu_op->Run(scope, cpu_place); - auto* output_ptr = output->Get().data(); + auto* output_ptr = output->Get().data(); for (int i = 0; i < 2 * 3; ++i) { ASSERT_EQ(output_ptr[i], static_cast(i) * 2); } @@ -153,12 +152,14 @@ TEST(Operator, CPUtoGPU) { VLOG(3) << "after gpu_op run"; // auto* output2_ptr = output2->Get().data(); - DeviceContextPool& pool = DeviceContextPool::Instance(); + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); auto dev_ctx = pool.Get(cuda_place); 
paddle::framework::Tensor output_tensor; - TensorCopy(output2->Get(), paddle::platform::CPUPlace(), *dev_ctx, - &output_tensor); + paddle::framework::TensorCopy(output2->Get(), + paddle::platform::CPUPlace(), *dev_ctx, + &output_tensor); dev_ctx->Wait(); float* output2_ptr = output_tensor.data(); diff --git a/paddle/fluid/framework/data_layout.h b/paddle/fluid/framework/data_layout.h index 39222fc4ed6656dac4773c0c8829608bb954b4c6..9c5e2cf7ccdcea2822da42210ff1fdb915a9a4ec 100644 --- a/paddle/fluid/framework/data_layout.h +++ b/paddle/fluid/framework/data_layout.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 4ca447d50a7262f44e5feb3739dce653604a6ed8..60ec60a427ba9046ce690eb75c27cd322fdd726d 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/data_layout_transform.h" +#include #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index ba15be9fc77b8405cb4bbca3f62a8be44a3f604e..06b638663dd334837a3bcb7737e507fcbc871c7a 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -14,6 +14,7 @@ #pragma once +#include #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" diff --git a/paddle/fluid/framework/data_layout_transform_test.cc b/paddle/fluid/framework/data_layout_transform_test.cc index dd17cac0e10db0d058d399cc725e18dcb14be507..a0d08826b854fea9256382f0e065fd59dda8c8b3 100644 --- a/paddle/fluid/framework/data_layout_transform_test.cc +++ b/paddle/fluid/framework/data_layout_transform_test.cc @@ -18,27 +18,28 @@ #include "paddle/fluid/platform/device_context.h" TEST(DataTransform, DataLayoutFunction) { - using namespace paddle::framework; - using namespace paddle::platform; - - auto place = CPUPlace(); - Tensor in = Tensor(); - Tensor out = Tensor(); - in.mutable_data(make_ddim({2, 3, 1, 2}), place); - in.set_layout(DataLayout::kNHWC); - - auto kernel_nhwc = OpKernelType(proto::VarType::FP32, place, - DataLayout::kNHWC, LibraryType::kPlain); - auto kernel_ncwh = OpKernelType(proto::VarType::FP32, place, - DataLayout::kNCHW, LibraryType::kPlain); - - TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out); - - EXPECT_TRUE(out.layout() == DataLayout::kNCHW); - EXPECT_TRUE(out.dims() == make_ddim({2, 2, 3, 1})); + auto place = paddle::platform::CPUPlace(); + paddle::framework::Tensor in = paddle::framework::Tensor(); + paddle::framework::Tensor out = paddle::framework::Tensor(); + in.mutable_data(paddle::framework::make_ddim({2, 3, 1, 2}), place); + in.set_layout(paddle::framework::DataLayout::kNHWC); + + auto kernel_nhwc = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::FP32, place, + paddle::framework::DataLayout::kNHWC, + paddle::framework::LibraryType::kPlain); + auto kernel_ncwh = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::FP32, place, + paddle::framework::DataLayout::kNCHW, + paddle::framework::LibraryType::kPlain); + + paddle::framework::TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out); + + EXPECT_TRUE(out.layout() == paddle::framework::DataLayout::kNCHW); + 
EXPECT_TRUE(out.dims() == paddle::framework::make_ddim({2, 2, 3, 1})); TransDataLayout(kernel_ncwh, kernel_nhwc, in, &out); - EXPECT_TRUE(in.layout() == DataLayout::kNHWC); - EXPECT_TRUE(in.dims() == make_ddim({2, 3, 1, 2})); + EXPECT_TRUE(in.layout() == paddle::framework::DataLayout::kNHWC); + EXPECT_TRUE(in.dims() == paddle::framework::make_ddim({2, 3, 1, 2})); } diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index bfad9ac1e9cad1936ed961ad1da55787d2faa23e..9c277a27da5af34fc9fb18ca073e369c05ecdf22 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -63,16 +63,16 @@ void DataTransform(const OpKernelType& expected_kernel_type, } void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, - Variable& out_var) { + Variable* out_var) { if (in_var.IsType()) { auto& in_lod_tensor = in_var.Get(); - auto* tran_lod_tensor = out_var.GetMutable(); + auto* tran_lod_tensor = out_var->GetMutable(); tran_lod_tensor->set_lod(in_lod_tensor.lod()); tran_lod_tensor->set_layout(in_lod_tensor.layout()); tran_lod_tensor->ShareDataWith(tensor); } else if (in_var.IsType()) { auto& in_selected_rows = in_var.Get(); - auto* trans_selected_rows = out_var.GetMutable(); + auto* trans_selected_rows = out_var->GetMutable(); trans_selected_rows->set_height(in_selected_rows.height()); trans_selected_rows->set_rows(in_selected_rows.rows()); trans_selected_rows->mutable_value()->ShareDataWith(tensor); diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h index 9ec67e6f3d6358cd658e198602f5e802a0ba4cc9..dee5d8c7c1126013742460df1d94bb364220ad09 100644 --- a/paddle/fluid/framework/data_transform.h +++ b/paddle/fluid/framework/data_transform.h @@ -35,7 +35,7 @@ void DataTransform(const OpKernelType& expected_kernel_type, const Tensor& input_tensor, Tensor* out); void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, - Variable& out_var); + Variable* out_var); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index 4c1b3e7581fe716271c62389c6053a24158913d2..2a528eb3aa562568c92059250f2c9bc5a75ec103 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/platform/enforce.h" @@ -22,18 +23,21 @@ namespace paddle { namespace framework { inline proto::VarType::Type ToDataType(std::type_index type) { - using namespace paddle::framework::proto; if (typeid(platform::float16).hash_code() == type.hash_code()) { return proto::VarType::FP16; - } else if (typeid(float).hash_code() == type.hash_code()) { + } else if (typeid(const float).hash_code() == type.hash_code()) { + // CPPLint complains Using C-style cast. 
Use static_cast() instead + // One fix to this is to replace float with const float because + // typeid(T) == typeid(const T) + // http://en.cppreference.com/w/cpp/language/typeid return proto::VarType::FP32; - } else if (typeid(double).hash_code() == type.hash_code()) { + } else if (typeid(const double).hash_code() == type.hash_code()) { return proto::VarType::FP64; - } else if (typeid(int).hash_code() == type.hash_code()) { + } else if (typeid(const int).hash_code() == type.hash_code()) { return proto::VarType::INT32; - } else if (typeid(int64_t).hash_code() == type.hash_code()) { + } else if (typeid(const int64_t).hash_code() == type.hash_code()) { return proto::VarType::INT64; - } else if (typeid(bool).hash_code() == type.hash_code()) { + } else if (typeid(const bool).hash_code() == type.hash_code()) { return proto::VarType::BOOL; } else { PADDLE_THROW("Not supported"); @@ -41,7 +45,6 @@ inline proto::VarType::Type ToDataType(std::type_index type) { } inline std::type_index ToTypeIndex(proto::VarType::Type type) { - using namespace paddle::framework::proto; switch (type) { case proto::VarType::FP16: return typeid(platform::float16); @@ -62,7 +65,6 @@ inline std::type_index ToTypeIndex(proto::VarType::Type type) { template inline void VisitDataType(proto::VarType::Type type, Visitor visitor) { - using namespace paddle::framework::proto; switch (type) { case proto::VarType::FP16: visitor.template operator()(); @@ -88,7 +90,6 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) { } inline std::string DataTypeToString(const proto::VarType::Type type) { - using namespace paddle::framework::proto; switch (type) { case proto::VarType::FP16: return "float16"; diff --git a/paddle/fluid/framework/data_type_transform.h b/paddle/fluid/framework/data_type_transform.h index e75da2588d07a754783f052173c3e0dce118f1b8..1c281b03ed61ac70e16a43d75a79854bdafd8836 100644 --- a/paddle/fluid/framework/data_type_transform.h +++ b/paddle/fluid/framework/data_type_transform.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" diff --git a/paddle/fluid/framework/data_type_transform_test.cc b/paddle/fluid/framework/data_type_transform_test.cc index 6b9a8f5e28b372c45abfaa2c20575a55d9a9dd03..bbebea9f13fd37469a0e9b7be9719aca128f5687 100644 --- a/paddle/fluid/framework/data_type_transform_test.cc +++ b/paddle/fluid/framework/data_type_transform_test.cc @@ -17,43 +17,58 @@ limitations under the License. 
*/ #include "gtest/gtest.h" TEST(DataTypeTransform, CPUTransform) { - using namespace paddle::framework; - using namespace paddle::platform; - - auto place = CPUPlace(); - - auto kernel_fp16 = OpKernelType(proto::VarType::FP16, place, - DataLayout::kAnyLayout, LibraryType::kPlain); - auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place, - DataLayout::kAnyLayout, LibraryType::kPlain); - auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place, - DataLayout::kAnyLayout, LibraryType::kPlain); - auto kernel_int32 = OpKernelType(proto::VarType::INT32, place, - DataLayout::kAnyLayout, LibraryType::kPlain); - auto kernel_int64 = OpKernelType(proto::VarType::INT64, place, - DataLayout::kAnyLayout, LibraryType::kPlain); - auto kernel_bool = OpKernelType(proto::VarType::BOOL, place, - DataLayout::kAnyLayout, LibraryType::kPlain); + auto place = paddle::platform::CPUPlace(); + + auto kernel_fp16 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::FP16, place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_fp32 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::FP32, place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_fp64 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::FP64, place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_int32 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::INT32, place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_int64 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::INT64, place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_bool = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::BOOL, place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); // data type transform from float32 { - Tensor in; - Tensor out; + paddle::framework::Tensor in; + paddle::framework::Tensor out; - float* ptr = in.mutable_data(make_ddim({2, 3}), place); + float* ptr = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); int data_number = 2 * 3; for (int i = 0; i < data_number; ++i) { ptr[i] = i / 3; } - TransDataType(kernel_fp32, kernel_fp64, in, &out); + paddle::framework::TransDataType(kernel_fp32, kernel_fp64, in, &out); double* out_data_double = out.data(); for (int i = 0; i < data_number; ++i) { EXPECT_EQ(out_data_double[i], static_cast(i / 3)); } - TransDataType(kernel_fp32, kernel_int32, in, &out); + paddle::framework::TransDataType(kernel_fp32, kernel_int32, in, &out); int* out_data_int = out.data(); for (int i = 0; i < data_number; ++i) { EXPECT_EQ(out_data_int[i], static_cast(i / 3)); @@ -62,10 +77,11 @@ TEST(DataTypeTransform, CPUTransform) { // data type transform from/to float16 { - Tensor in; - Tensor out; + paddle::framework::Tensor in; + paddle::framework::Tensor out; - float16* ptr = in.mutable_data(make_ddim({2, 3}), place); + paddle::platform::float16* ptr = in.mutable_data( + paddle::framework::make_ddim({2, 3}), place); int data_number = 2 * 3; for (int i = 0; i < data_number; ++i) { @@ -73,94 +89,104 @@ TEST(DataTypeTransform, CPUTransform) { } // transform from float16 to other data types - TransDataType(kernel_fp16, kernel_fp32, in, &out); + paddle::framework::TransDataType(kernel_fp16, kernel_fp32, in, 
&out); float* out_data_float = out.data(); for (int i = 0; i < data_number; ++i) { EXPECT_EQ(out_data_float[i], static_cast(ptr[i])); } - TransDataType(kernel_fp16, kernel_fp64, in, &out); + paddle::framework::TransDataType(kernel_fp16, kernel_fp64, in, &out); double* out_data_double = out.data(); for (int i = 0; i < data_number; ++i) { EXPECT_EQ(out_data_double[i], static_cast(ptr[i])); } - TransDataType(kernel_fp16, kernel_int32, in, &out); + paddle::framework::TransDataType(kernel_fp16, kernel_int32, in, &out); int* out_data_int = out.data(); for (int i = 0; i < data_number; ++i) { EXPECT_EQ(out_data_int[i], static_cast(ptr[i])); } - TransDataType(kernel_fp16, kernel_int64, in, &out); + paddle::framework::TransDataType(kernel_fp16, kernel_int64, in, &out); int64_t* out_data_int64 = out.data(); for (int i = 0; i < data_number; ++i) { EXPECT_EQ(out_data_int64[i], static_cast(ptr[i])); } - TransDataType(kernel_fp16, kernel_bool, in, &out); + paddle::framework::TransDataType(kernel_fp16, kernel_bool, in, &out); bool* out_data_bool = out.data(); for (int i = 0; i < data_number; ++i) { EXPECT_EQ(out_data_bool[i], static_cast(ptr[i])); } // transform float to float16 - float* in_data_float = in.mutable_data(make_ddim({2, 3}), place); + float* in_data_float = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); for (int i = 0; i < data_number; ++i) { in_data_float[i] = i; } - TransDataType(kernel_fp32, kernel_fp16, in, &out); - ptr = out.data(); + paddle::framework::TransDataType(kernel_fp32, kernel_fp16, in, &out); + ptr = out.data(); for (int i = 0; i < data_number; ++i) { - EXPECT_EQ(ptr[i].x, static_cast(in_data_float[i]).x); + EXPECT_EQ(ptr[i].x, + static_cast(in_data_float[i]).x); } // transform double to float16 - double* in_data_double = in.mutable_data(make_ddim({2, 3}), place); + double* in_data_double = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); for (int i = 0; i < data_number; ++i) { in_data_double[i] = i; } - TransDataType(kernel_fp64, kernel_fp16, in, &out); - ptr = out.data(); + paddle::framework::TransDataType(kernel_fp64, kernel_fp16, in, &out); + ptr = out.data(); for (int i = 0; i < data_number; ++i) { - EXPECT_EQ(ptr[i].x, static_cast(in_data_double[i]).x); + EXPECT_EQ(ptr[i].x, + static_cast(in_data_double[i]).x); } // transform int to float16 - int* in_data_int = in.mutable_data(make_ddim({2, 3}), place); + int* in_data_int = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); for (int i = 0; i < data_number; ++i) { in_data_int[i] = i; } - TransDataType(kernel_int32, kernel_fp16, in, &out); - ptr = out.data(); + paddle::framework::TransDataType(kernel_int32, kernel_fp16, in, &out); + ptr = out.data(); for (int i = 0; i < data_number; ++i) { - EXPECT_EQ(ptr[i].x, static_cast(in_data_int[i]).x); + EXPECT_EQ(ptr[i].x, + static_cast(in_data_int[i]).x); } // transform int64 to float16 - int64_t* in_data_int64 = in.mutable_data(make_ddim({2, 3}), place); + int64_t* in_data_int64 = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); for (int i = 0; i < data_number; ++i) { in_data_int64[i] = i; } - TransDataType(kernel_int64, kernel_fp16, in, &out); - ptr = out.data(); + paddle::framework::TransDataType(kernel_int64, kernel_fp16, in, &out); + ptr = out.data(); for (int i = 0; i < data_number; ++i) { - EXPECT_EQ(ptr[i].x, static_cast(in_data_int64[i]).x); + EXPECT_EQ(ptr[i].x, + static_cast(in_data_int64[i]).x); } // transform bool to float16 - bool* in_data_bool = in.mutable_data(make_ddim({2, 3}), place); + bool* 
in_data_bool = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); for (int i = 0; i < data_number; ++i) { in_data_bool[i] = i; } - TransDataType(kernel_bool, kernel_fp16, in, &out); - ptr = out.data(); + paddle::framework::TransDataType(kernel_bool, kernel_fp16, in, &out); + ptr = out.data(); for (int i = 0; i < data_number; ++i) { - EXPECT_EQ(ptr[i].x, static_cast(in_data_bool[i]).x); + EXPECT_EQ(ptr[i].x, + static_cast(in_data_bool[i]).x); } } } diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu index de389ddabcb86de0155757406a406e44086c5474..0874509a8797cd2ff1b1fcb347b4ef3b74a39047 100644 --- a/paddle/fluid/framework/data_type_transform_test.cu +++ b/paddle/fluid/framework/data_type_transform_test.cu @@ -18,42 +18,58 @@ limitations under the License. */ #include "gtest/gtest.h" TEST(DataTypeTransform, GPUTransform) { - using namespace paddle::framework; - using namespace paddle::platform; - - auto cpu_place = CPUPlace(); - auto gpu_place = CUDAPlace(0); - CUDADeviceContext context(gpu_place); - - auto kernel_fp16 = OpKernelType(proto::VarType::FP16, gpu_place, - DataLayout::kAnyLayout, LibraryType::kPlain); - auto kernel_fp32 = OpKernelType(proto::VarType::FP32, gpu_place, - DataLayout::kAnyLayout, LibraryType::kPlain); - auto kernel_fp64 = OpKernelType(proto::VarType::FP64, gpu_place, - DataLayout::kAnyLayout, LibraryType::kPlain); - auto kernel_int32 = OpKernelType(proto::VarType::INT32, gpu_place, - DataLayout::kAnyLayout, LibraryType::kPlain); - auto kernel_int64 = OpKernelType(proto::VarType::INT64, gpu_place, - DataLayout::kAnyLayout, LibraryType::kPlain); - auto kernel_bool = OpKernelType(proto::VarType::BOOL, gpu_place, - DataLayout::kAnyLayout, LibraryType::kPlain); + auto cpu_place = paddle::platform::CPUPlace(); + auto gpu_place = paddle::platform::CUDAPlace(0); + paddle::platform::CUDADeviceContext context(gpu_place); + + auto kernel_fp16 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::FP16, gpu_place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_fp32 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::FP32, gpu_place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_fp64 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::FP64, gpu_place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_int32 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::INT32, gpu_place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_int64 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::INT64, gpu_place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_bool = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::BOOL, gpu_place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); // data type transform from float32 { - Tensor in; - Tensor in_gpu; - Tensor out_gpu; - Tensor out; + paddle::framework::Tensor in; + paddle::framework::Tensor in_gpu; + paddle::framework::Tensor out_gpu; + paddle::framework::Tensor out; - float* in_ptr = in.mutable_data(make_ddim({2, 3}), cpu_place); + float* in_ptr = + in.mutable_data(paddle::framework::make_ddim({2, 3}), cpu_place); float arr[6] = {0, 1, 2, 3, 
4, 5}; int data_number = sizeof(arr) / sizeof(arr[0]); memcpy(in_ptr, arr, sizeof(arr)); - TensorCopy(in, gpu_place, context, &in_gpu); + paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu); context.Wait(); - TransDataType(kernel_fp32, kernel_fp64, in_gpu, &out_gpu); - TensorCopy(out_gpu, cpu_place, context, &out); + paddle::framework::TransDataType(kernel_fp32, kernel_fp64, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); context.Wait(); double* out_data_double = out.data(); @@ -61,8 +77,9 @@ TEST(DataTypeTransform, GPUTransform) { EXPECT_EQ(out_data_double[i], static_cast(arr[i])); } - TransDataType(kernel_fp32, kernel_int32, in_gpu, &out_gpu); - TensorCopy(out_gpu, cpu_place, context, &out); + paddle::framework::TransDataType(kernel_fp32, kernel_int32, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); context.Wait(); int* out_data_int = out.data(); @@ -73,22 +90,27 @@ TEST(DataTypeTransform, GPUTransform) { // data type transform from/to float16 { - Tensor in; - Tensor in_gpu; - Tensor out_gpu; - Tensor out; - - float16* ptr = in.mutable_data(make_ddim({2, 3}), cpu_place); - float16 arr[6] = {float16(0), float16(1), float16(2), - float16(3), float16(4), float16(5)}; + paddle::framework::Tensor in; + paddle::framework::Tensor in_gpu; + paddle::framework::Tensor out_gpu; + paddle::framework::Tensor out; + + paddle::platform::float16* ptr = in.mutable_data( + paddle::framework::make_ddim({2, 3}), cpu_place); + paddle::platform::float16 arr[6] = { + paddle::platform::float16(0), paddle::platform::float16(1), + paddle::platform::float16(2), paddle::platform::float16(3), + paddle::platform::float16(4), paddle::platform::float16(5)}; + int data_number = sizeof(arr) / sizeof(arr[0]); memcpy(ptr, arr, sizeof(arr)); - TensorCopy(in, gpu_place, context, &in_gpu); + paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu); context.Wait(); // transform from float16 to other data types - TransDataType(kernel_fp16, kernel_fp32, in_gpu, &out_gpu); - TensorCopy(out_gpu, cpu_place, context, &out); + paddle::framework::TransDataType(kernel_fp16, kernel_fp32, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); context.Wait(); float* out_data_float = out.data(); @@ -96,8 +118,9 @@ TEST(DataTypeTransform, GPUTransform) { EXPECT_EQ(out_data_float[i], static_cast(ptr[i])); } - TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu); - TensorCopy(out_gpu, cpu_place, context, &out); + paddle::framework::TransDataType(kernel_fp16, kernel_fp64, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); context.Wait(); double* out_data_double = out.data(); @@ -105,8 +128,9 @@ TEST(DataTypeTransform, GPUTransform) { EXPECT_EQ(out_data_double[i], static_cast(ptr[i])); } - TransDataType(kernel_fp16, kernel_int32, in_gpu, &out_gpu); - TensorCopy(out_gpu, cpu_place, context, &out); + paddle::framework::TransDataType(kernel_fp16, kernel_int32, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); context.Wait(); int* out_data_int = out.data(); @@ -114,8 +138,9 @@ TEST(DataTypeTransform, GPUTransform) { EXPECT_EQ(out_data_int[i], static_cast(ptr[i])); } - TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu); - TensorCopy(out_gpu, cpu_place, context, &out); + paddle::framework::TransDataType(kernel_fp16, kernel_int64, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); context.Wait(); 
int64_t* out_data_int64 = out.data(); @@ -123,8 +148,9 @@ TEST(DataTypeTransform, GPUTransform) { EXPECT_EQ(out_data_int64[i], static_cast(ptr[i])); } - TransDataType(kernel_fp16, kernel_bool, in_gpu, &out_gpu); - TensorCopy(out_gpu, cpu_place, context, &out); + paddle::framework::TransDataType(kernel_fp16, kernel_bool, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); context.Wait(); bool* out_data_bool = out.data(); @@ -133,90 +159,103 @@ TEST(DataTypeTransform, GPUTransform) { } // transform float to float16 - float* in_data_float = in.mutable_data(make_ddim({2, 3}), cpu_place); + float* in_data_float = + in.mutable_data(paddle::framework::make_ddim({2, 3}), cpu_place); for (int i = 0; i < data_number; ++i) { in_data_float[i] = i; } - TensorCopy(in, gpu_place, context, &in_gpu); + paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu); context.Wait(); - TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu); - TensorCopy(out_gpu, cpu_place, context, &out); + paddle::framework::TransDataType(kernel_fp32, kernel_fp16, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); context.Wait(); - ptr = out.data(); + ptr = out.data(); for (int i = 0; i < data_number; ++i) { - EXPECT_EQ(ptr[i].x, static_cast(in_data_float[i]).x); + EXPECT_EQ(ptr[i].x, + static_cast(in_data_float[i]).x); } // transform double to float16 - double* in_data_double = - in.mutable_data(make_ddim({2, 3}), cpu_place); + double* in_data_double = in.mutable_data( + paddle::framework::make_ddim({2, 3}), cpu_place); for (int i = 0; i < data_number; ++i) { in_data_double[i] = i; } - TensorCopy(in, gpu_place, context, &in_gpu); + paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu); context.Wait(); - TransDataType(kernel_fp64, kernel_fp16, in_gpu, &out_gpu); - TensorCopy(out_gpu, cpu_place, context, &out); + paddle::framework::TransDataType(kernel_fp64, kernel_fp16, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); context.Wait(); - ptr = out.data(); + ptr = out.data(); for (int i = 0; i < data_number; ++i) { - EXPECT_EQ(ptr[i].x, static_cast(in_data_double[i]).x); + EXPECT_EQ(ptr[i].x, + static_cast(in_data_double[i]).x); } // transform int to float16 - int* in_data_int = in.mutable_data(make_ddim({2, 3}), cpu_place); + int* in_data_int = + in.mutable_data(paddle::framework::make_ddim({2, 3}), cpu_place); for (int i = 0; i < data_number; ++i) { in_data_int[i] = i; } - TensorCopy(in, gpu_place, context, &in_gpu); + paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu); context.Wait(); - TransDataType(kernel_int32, kernel_fp16, in_gpu, &out_gpu); - TensorCopy(out_gpu, cpu_place, context, &out); + paddle::framework::TransDataType(kernel_int32, kernel_fp16, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); context.Wait(); - ptr = out.data(); + ptr = out.data(); for (int i = 0; i < data_number; ++i) { - EXPECT_EQ(ptr[i].x, static_cast(in_data_int[i]).x); + EXPECT_EQ(ptr[i].x, + static_cast(in_data_int[i]).x); } // transform int64 to float16 - int64_t* in_data_int64 = - in.mutable_data(make_ddim({2, 3}), cpu_place); + int64_t* in_data_int64 = in.mutable_data( + paddle::framework::make_ddim({2, 3}), cpu_place); for (int i = 0; i < data_number; ++i) { in_data_int64[i] = i; } - TensorCopy(in, gpu_place, context, &in_gpu); + paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu); context.Wait(); - TransDataType(kernel_int64, kernel_fp16, in_gpu, 
- TensorCopy(out_gpu, cpu_place, context, &out);
+ paddle::framework::TransDataType(kernel_int64, kernel_fp16, in_gpu,
+                                  &out_gpu);
+ paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
  context.Wait();
- ptr = out.data<float16>();
+ ptr = out.data<paddle::platform::float16>();
  for (int i = 0; i < data_number; ++i) {
-   EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
+   EXPECT_EQ(ptr[i].x,
+             static_cast<paddle::platform::float16>(in_data_int64[i]).x);
  }
  // transform bool to float16
- bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), cpu_place);
+ bool* in_data_bool =
+     in.mutable_data<bool>(paddle::framework::make_ddim({2, 3}), cpu_place);
  for (int i = 0; i < data_number; ++i) {
    in_data_bool[i] = i;
  }
- TensorCopy(in, gpu_place, context, &in_gpu);
+ paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
  context.Wait();
- TransDataType(kernel_bool, kernel_fp16, in_gpu, &out_gpu);
- TensorCopy(out_gpu, cpu_place, context, &out);
+ paddle::framework::TransDataType(kernel_bool, kernel_fp16, in_gpu,
+                                  &out_gpu);
+ paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
  context.Wait();
- ptr = out.data<float16>();
+ ptr = out.data<paddle::platform::float16>();
  for (int i = 0; i < data_number; ++i) {
-   EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
+   EXPECT_EQ(ptr[i].x,
+             static_cast<paddle::platform::float16>(in_data_bool[i]).x);
  }
  }
}
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 897e41f79f4e3bb9cecbe7b42fc6c4fd3401d839..96c181f983a33961e3d5fb8745740f2fdbb210de 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -2,29 +2,37 @@ cc_library(var_handle SRCS var_handle.cc DEPS place)
cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
-nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-    dynload_cuda)
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
+cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
+
if(WITH_GPU)
+ nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+     dynload_cuda)
  set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
+ nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
else()
  set(multi_devices_graph_builder_deps)
+ cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
endif()
+
+cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
+cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
+
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
- scale_loss_grad_op_handle send_op_handle
${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle)
+
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context)
-cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory)
-cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory)
-
cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context broadcast_op_handle)
cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context gather_op_handle)
+cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
+ device_context reduce_op_handle )
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index 7d29012380e1b1710704d71a28d21dcc3097eb51..33e02ab65a251a338225ee621ff14acbb0631992 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -13,95 +13,77 @@
// limitations under the License.
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
namespace paddle {
namespace framework {
namespace details {
-
-Tensor *GetTensorFromVar(Variable *in_var) {
- if (in_var->IsType<LoDTensor>()) {
-   return in_var->GetMutable<LoDTensor>();
- } else if (in_var->IsType<SelectedRows>()) {
-   return in_var->GetMutable<SelectedRows>()->mutable_value();
- } else {
-   PADDLE_THROW("Var should be LoDTensor or SelectedRows");
- }
- return nullptr;
-}
-
BroadcastOpHandle::BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
                                     const std::vector<platform::Place> &places)
    : local_scopes_(local_scopes), places_(places) {}
void BroadcastOpHandle::RunImpl() {
- // the input may have dummy var.
- std::vector<VarHandle *> in_var_handle;
- for (auto *in : inputs_) {
-   auto *out_handle = dynamic_cast<VarHandle *>(in);
-   if (out_handle) {
-     in_var_handle.push_back(out_handle);
-   }
- }
- PADDLE_ENFORCE_EQ(in_var_handle.size(), 1,
-                   "The number of input should be one.");
-
- // the output may have dummy var.
- std::vector<VarHandle *> out_var_handles;
- for (auto *out : outputs_) {
-   auto *out_handle = dynamic_cast<VarHandle *>(out);
-   if (out_handle) {
-     out_var_handles.push_back(out_handle);
-   }
+ // the input and output may have dummy var.
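+ // DynamicCast<VarHandle>() (defined in container_cast.h, added below)
+ // filters the mixed VarHandleBase* lists down to real VarHandle*s, dropping
+ // the DummyVarHandles that only encode dependencies.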
+ VarHandle *in_var_handle;
+
+ {
+   auto in_var_handles = DynamicCast<VarHandle>(inputs_);
+   PADDLE_ENFORCE_EQ(in_var_handles.size(), 1,
+                     "The number of input should be one.");
+   in_var_handle = in_var_handles[0];
}
+ auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+
PADDLE_ENFORCE_EQ(
    out_var_handles.size(), places_.size(),
    "The number of output should equal to the number of places.");
- // Wait input done, this Wait is asynchronous operation
- auto &in_place = in_var_handle[0]->place_;
- if (in_var_handle[0]->generated_op_) {
-   for (auto *out : out_var_handles) {
-     auto &out_p = out->place_;
-     in_var_handle[0]->generated_op_->Wait(dev_ctxes_[out_p]);
-   }
+ // Wait until the input variable is generated; this Wait is asynchronous.
+ WaitInputVarGenerated(*in_var_handle);
+
+ std::vector<const Scope *> var_scopes;
+ for (auto *s : local_scopes_) {
+   var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
}
- //
- auto in_scope_idx = in_var_handle[0]->scope_idx_;
- auto in_var =
-     local_scopes_.at(in_scope_idx)->FindVar(in_var_handle[0]->name_);
- Tensor *in_tensor = GetTensorFromVar(in_var);
+ auto *in_var =
+     var_scopes.at(in_var_handle->scope_idx_)->FindVar(in_var_handle->name_);
+ PADDLE_ENFORCE_NOT_NULL(in_var);
+
+ Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
for (auto *out : out_var_handles) {
-   auto &out_p = out->place_;
-   auto out_var = local_scopes_.at(out->scope_idx_)->FindVar(out->name_);
+   if (*out == *in_var_handle) {
+     continue;
+   }
-   PADDLE_ENFORCE_EQ(out_p.which(), in_place.which(),
+   auto &out_p = out->place_;
+   auto *out_var = var_scopes.at(out->scope_idx_)->FindVar(out->name_);
+   PADDLE_ENFORCE_NOT_NULL(out_var);
+   PADDLE_ENFORCE_EQ(out_p.which(), in_var_handle->place_.which(),
                      "Places must be all on CPU or all on CUDA.");
-   if (in_var->IsType<SelectedRows>()) {
-     auto &in_sr = in_var->Get<SelectedRows>();
-     auto out_sr = out_var->GetMutable<SelectedRows>();
-     if (&in_sr == out_sr) continue;
-     out_sr->set_height(in_sr.height());
-     out_sr->set_rows(in_sr.rows());
-     out_sr->mutable_value()->Resize(in_sr.value().dims());
-     out_sr->mutable_value()->mutable_data(out_p, in_sr.value().type());
-   } else if (in_var->IsType<LoDTensor>()) {
-     auto in_lod = in_var->Get<LoDTensor>();
-     auto out_lod = out_var->GetMutable<LoDTensor>();
-     if (&in_lod == out_lod) continue;
-     out_lod->set_lod(in_lod.lod());
-     out_lod->Resize(in_lod.dims());
-     out_lod->mutable_data(out_p, in_lod.type());
-   } else {
-     PADDLE_THROW("Var should be LoDTensor or SelectedRows.");
-   }
+   VariableVisitor::ShareDimsAndLoD(*in_var, out_var);
+   VariableVisitor::GetMutableTensor(out_var).mutable_data(out_p,
+                                                           in_tensor.type());
-   Tensor *out_tensor = GetTensorFromVar(out_var);
-   paddle::framework::TensorCopy(*in_tensor, out_p, *(dev_ctxes_[in_place]),
-                                 out_tensor);
+   auto dev_ctx = dev_ctxes_.at(out_p);
+   RunAndRecordEvent(out_p, [in_tensor, out_var, dev_ctx, out_p] {
+     paddle::framework::TensorCopy(
+         in_tensor, out_p, *(dev_ctx),
+         &VariableVisitor::GetMutableTensor(out_var));
+   });
+ }
+}
+
+void BroadcastOpHandle::WaitInputVarGenerated(const VarHandle &in_var) {
+ if (in_var.generated_op_) {
+   for (auto &pair : dev_ctxes_) {
+     in_var.generated_op_->Wait(pair.second);
+   }
  }
}
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
index b3292422522b64a38a50f39f04e6f0d2e15492dd..92420f10ac5972b7924d83b43bb28234079e5072 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -29,9 +29,7 @@ namespace framework {
namespace details {
struct
BroadcastOpHandle : public OpHandleBase {
- const std::vector<Scope *> &local_scopes_;
- const std::vector<platform::Place> &places_;
-
+ public:
  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places);
@@ -41,8 +39,12 @@ struct BroadcastOpHandle : public OpHandleBase {
protected:
  void RunImpl() override;
-};
+ void WaitInputVarGenerated(const VarHandle &in_var);
+ private:
+ const std::vector<Scope *> &local_scopes_;
+ const std::vector<platform::Place> &places_;
+};
} // namespace details
} // namespace framework
} // namespace paddle
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
index bcd61335be0f7fe64563ee65daaf9de0760c9b1a..3f2dcde3e9597287d72046dd4f8b07faab1ede25 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -30,6 +30,7 @@ const f::DDim kDims = {20, 20};
struct TestBroadcastOpHandle {
  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
  std::vector<Scope *> local_scopes_;
+ std::vector<Scope *> param_scopes_;
  Scope g_scope_;
  std::unique_ptr<OpHandleBase> op_handle_;
  std::vector<std::unique_ptr<VarHandleBase>> vars_;
@@ -72,11 +73,17 @@ struct TestBroadcastOpHandle {
  void InitBroadcastOp(size_t input_scope_idx) {
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      local_scopes_.push_back(&(g_scope_.NewScope()));
-     local_scopes_[j]->Var("out");
+     Scope& local_scope = local_scopes_.back()->NewScope();
+     *local_scopes_.back()
+          ->Var(details::kLocalExecScopeName)
+          ->GetMutable<Scope *>() = &local_scope;
+     local_scope.Var("out");
+     param_scopes_.emplace_back(&local_scope);
    }
-   local_scopes_[input_scope_idx]->Var("input");
+   param_scopes_[input_scope_idx]->Var("input");
    op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
+
    auto* in_var_handle = new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]);
    vars_.emplace_back(in_var_handle);
@@ -90,7 +97,7 @@ struct TestBroadcastOpHandle {
    op_handle_->AddInput(dummy_var_handle);
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-     op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
+     op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
      VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]);
      vars_.emplace_back(out_var_handle);
      op_handle_->AddOutput(out_var_handle);
@@ -105,7 +112,8 @@ struct TestBroadcastOpHandle {
  }
  void TestBroadcastLodTensor(size_t input_scope_idx) {
-   auto in_var = local_scopes_[input_scope_idx]->Var("input");
+   auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
+   PADDLE_ENFORCE_NOT_NULL(in_var);
    auto in_lod_tensor = in_var->GetMutable<LoDTensor>();
    in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
@@ -117,6 +125,7 @@ struct TestBroadcastOpHandle {
    paddle::framework::TensorFromVector<float>(
        send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
    in_lod_tensor->set_lod(lod);
+   in_lod_tensor->Resize(kDims);
    op_handle_->Run(false);
@@ -124,7 +133,8 @@ struct TestBroadcastOpHandle {
    p::CPUPlace cpu_place;
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-     auto out_var = local_scopes_[j]->Var("out");
+     auto out_var = param_scopes_[j]->FindVar("out");
+     PADDLE_ENFORCE_NOT_NULL(out_var);
      auto out_tensor = out_var->Get<LoDTensor>();
      PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
@@ -139,7 +149,8 @@ struct TestBroadcastOpHandle {
  }
  void TestBroadcastSelectedRows(size_t input_scope_idx) {
-   auto in_var = local_scopes_[input_scope_idx]->Var("input");
+   auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
+   PADDLE_ENFORCE_NOT_NULL(in_var);
    auto in_selected_rows = in_var->GetMutable<SelectedRows>();
    auto value =
in_selected_rows->mutable_value();
    value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
@@ -162,7 +173,8 @@ struct TestBroadcastOpHandle {
    p::CPUPlace cpu_place;
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-     auto out_var = local_scopes_[j]->Var("out");
+     auto out_var = param_scopes_[j]->FindVar("out");
+     PADDLE_ENFORCE_NOT_NULL(out_var);
      auto& out_select_rows = out_var->Get<SelectedRows>();
      auto rt = out_select_rows.value();
diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
index ff6d91c1dafb0ab4cabb1646cc333e19a89eb812..7ff0efe09387b7e5d7cfe0dfe5e129ca9914d90b 100644
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -28,8 +28,8 @@ ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
void ComputationOpHandle::RunImpl() {
  auto *cur_ctx = dev_ctxes_[place_];
  for (auto *in : inputs_) {
-   bool need_wait =
-       in->generated_op_ && in->generated_op_->dev_ctxes_[place_] != cur_ctx;
+   bool need_wait = in->generated_op_ &&
+                    in->generated_op_->DeviceContext(place_) != cur_ctx;
    if (need_wait) {
      in->generated_op_->Wait(cur_ctx);
    }
diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h
index d6d2d731ca80a0fbc0a2a34027b5b7c3c1977c07..c363b973d9abbae6bea76c2458fbe82a37a342ca 100644
--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -14,6 +14,9 @@
#pragma once
+#include
+#include
+
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
@@ -24,10 +27,7 @@ namespace paddle {
namespace framework {
namespace details {
struct ComputationOpHandle : public OpHandleBase {
- std::unique_ptr<OperatorBase> op_;
- Scope *scope_;
- platform::Place place_;
-
+ public:
  ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
                      platform::Place place);
@@ -35,6 +35,11 @@ struct ComputationOpHandle : public OpHandleBase {
protected:
  void RunImpl() override;
+
+ private:
+ std::unique_ptr<OperatorBase> op_;
+ Scope *scope_;
+ platform::Place place_;
};
} // namespace details
} // namespace framework
diff --git a/paddle/fluid/framework/details/container_cast.h b/paddle/fluid/framework/details/container_cast.h
new file mode 100644
index 0000000000000000000000000000000000000000..a42ae78dc45c2a885f98315a21f1d5558725bca3
--- /dev/null
+++ b/paddle/fluid/framework/details/container_cast.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
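+// Usage sketch (illustrative only, not part of the patch): given
+//   std::vector<VarHandleBase *> ins = {a_var_handle, a_dummy_var_handle};
+// DynamicCast<VarHandle>(ins) returns a one-element vector holding
+// a_var_handle; elements that are not of the requested derived type are
+// silently skipped rather than reported as errors.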
+
+#pragma once
+
+#include <type_traits>
+#include <vector>
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+template <typename ResultType, typename ElementType>
+std::vector<ResultType *> DynamicCast(const std::vector<ElementType *>& container) {
+ static_assert(std::is_base_of<ElementType, ResultType>::value,
+               "ElementType must be a base class of ResultType");
+ std::vector<ResultType *> res;
+ for (auto* ptr : container) {
+   auto* derived = dynamic_cast<ResultType *>(ptr);
+   if (derived) {
+     res.emplace_back(derived);
+   }
+ }
+ return res;
+}
+
+} // namespace details
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/fluid/framework/details/cow_ptr.h b/paddle/fluid/framework/details/cow_ptr.h
index 69bcea625288eba897e761a1d634f19c41dc0f79..21f75957be5f33f3dfc09c41fa9a1e1ca590f99e 100644
--- a/paddle/fluid/framework/details/cow_ptr.h
+++ b/paddle/fluid/framework/details/cow_ptr.h
@@ -14,7 +14,7 @@
#pragma once
#include
-#include
+#include // NOLINT
namespace paddle {
namespace framework {
@@ -23,7 +23,7 @@ namespace details {
// Change it to thread safe flags if needed.
class ThreadUnsafeOwnershipFlags {
public:
- ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
+ explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
  ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
  ThreadUnsafeOwnershipFlags& operator=(
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index e3e7c55d153aec8ce9c25c962821b266eaa84fe4..423449abff97dbf70d81314f852d9135e25f243f 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -51,23 +51,23 @@ void FetchOpHandle::RunImpl() {
    auto *var = static_cast<VarHandle *>(input);
    var->generated_op_->Wait(cpu_ctx);
  }
- tensors_.resize(inputs_.size());
- auto *var = static_cast<VarHandle *>(inputs_[0]);
- auto &var_name = var->name_;
+ auto *var_handle = static_cast<VarHandle *>(inputs_[0]);
+ auto &var_name = var_handle->name_;
  platform::CPUPlace cpu;
  auto &scopes = *local_scopes_;
  for (size_t i = 0; i < scopes.size(); ++i) {
    auto &scope = scopes[i];
-   auto &t = scope->FindVar(kLocalExecScopeName)
-                 ->Get<Scope *>()
-                 ->FindVar(var_name)
-                 ->Get<LoDTensor>();
-   if (platform::is_gpu_place(var->place_)) {
+   auto *var =
+       scope->FindVar(kLocalExecScopeName)->Get<Scope *>()->FindVar(var_name);
+   PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
+                           var_name);
+   auto &t = var->Get<LoDTensor>();
+   if (platform::is_gpu_place(t.place())) {
#ifdef PADDLE_WITH_CUDA
-     TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
-     dev_ctxes_[t.place()]->Wait();
+     TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i], true);
+     dev_ctxes_.at(t.place())->Wait();
#endif
    } else {
      tensors_[i].ShareDataWith(t);
diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h
index 904b2d669f8b156b99197afb0155380d1170a68b..b49f3df338dc11310a4a0c27c8aaae3602373fcc 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -14,6 +14,9 @@
#pragma once
+#include
+#include
+
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/scope.h"
@@ -24,11 +27,7 @@ namespace framework {
namespace details {
struct FetchOpHandle : public OpHandleBase {
- FeedFetchList *data_;
- size_t offset_;
- std::vector<Scope *> *local_scopes_;
- std::vector<LoDTensor> tensors_;
-
+ public:
  FetchOpHandle(FeedFetchList *data, size_t offset,
                std::vector<Scope *> *local_scopes);
@@ -42,6 +41,12 @@ struct FetchOpHandle : public OpHandleBase {
protected:
void RunImpl() override; + + private: + FeedFetchList *data_; + size_t offset_; + std::vector *local_scopes_; + std::vector tensors_; }; } // namespace details diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc index 8dd85be567d33991ac003707fec939a61a2d0962..3ed7723919fc3a547b15c28b846de758a8155e66 100644 --- a/paddle/fluid/framework/details/gather_op_handle.cc +++ b/paddle/fluid/framework/details/gather_op_handle.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "paddle/fluid/framework/details/gather_op_handle.h" +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/variable_visitor.h" namespace paddle { namespace framework { @@ -23,46 +25,40 @@ GatherOpHandle::GatherOpHandle(const std::vector &local_scopes, : local_scopes_(local_scopes), places_(places) {} void GatherOpHandle::RunImpl() { - // the input may have dummy var. - std::vector in_var_handles; - for (auto *in : inputs_) { - auto *in_handle = dynamic_cast(in); - if (in_handle) { - in_var_handles.push_back(in_handle); - } - } + // the input and output may have dummy var. + auto in_var_handles = DynamicCast(inputs_); + PADDLE_ENFORCE_EQ( in_var_handles.size(), places_.size(), "The number of output should equal to the number of places."); - // the output may have dummy var. - std::vector out_var_handles; - for (auto *out : outputs_) { - auto *out_handle = dynamic_cast(out); - if (out_handle) { - out_var_handles.push_back(out_handle); - } + VarHandle *out_var_handle; + { + auto out_var_handles = DynamicCast(outputs_); + + PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, + "The number of output should be one."); + out_var_handle = out_var_handles.front(); } - PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, - "The number of output should be one."); - auto in_0_handle = static_cast(in_var_handles[0]); - auto pre_in_var = - local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_); - auto pre_place = in_0_handle->place_; + std::vector var_scopes; + for (auto *s : local_scopes_) { + var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get()); + } + auto in_0_handle = in_var_handles[0]; + auto pre_in_var = + var_scopes.at(in_0_handle->scope_idx_)->FindVar(in_0_handle->name_); + PADDLE_ENFORCE_NOT_NULL(pre_in_var); PADDLE_ENFORCE(pre_in_var->IsType(), "Currently, gather_op only can gather SelectedRows."); - PADDLE_ENFORCE_EQ(out_var_handles[0]->place_.which(), pre_place.which(), + auto pre_place = in_0_handle->place_; + PADDLE_ENFORCE_EQ(out_var_handle->place_.which(), pre_place.which(), "The place of input and output should be the same."); // Wait input done, this Wait is asynchronous operation - for (auto *in : in_var_handles) { - if (in->generated_op_) { - in->generated_op_->Wait(dev_ctxes_[in->place_]); - } - } + WaitInputVarGenerated(in_var_handles); std::vector out_rows; std::vector in_tensors; @@ -70,34 +66,32 @@ void GatherOpHandle::RunImpl() { auto &pre_in = pre_in_var->Get(); // gather the inputs - for (auto *in : in_var_handles) { - auto in_handle = static_cast(in); + for (auto *in_handle : in_var_handles) { auto in_p = in_handle->place_; in_places.push_back(in_p); PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(), "Places must be all on CPU or all on CUDA."); - auto in_var = - local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_); + auto *in_var = + var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_); auto &in_sr = in_var->Get(); PADDLE_ENFORCE_EQ(in_sr.value().type(), 
pre_in.value().type(), "The type of input is not consistent."); PADDLE_ENFORCE_EQ(pre_in.height(), in_sr.height(), "The height of inputs is not consistent."); - PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(), , + PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(), "The dims of inputs is not consistent."); - auto in_sr_rows = in_sr.rows(); + auto &in_sr_rows = in_sr.rows(); out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end()); in_tensors.emplace_back(in_sr.value()); } // write the output - auto &out_place = out_var_handles[0]->place_; - auto out_scope_idx = out_var_handles[0]->scope_idx_; - auto out_var = - local_scopes_[out_scope_idx]->FindVar(out_var_handles[0]->name_); + auto &out_place = out_var_handle->place_; + auto out_scope_idx = out_var_handle->scope_idx_; + auto out_var = var_scopes.at(out_scope_idx)->FindVar(out_var_handle->name_); auto out = out_var->GetMutable(); out->set_height(pre_in.height()); @@ -110,13 +104,27 @@ void GatherOpHandle::RunImpl() { Tensor *out_tensor = out->mutable_value(); // copy - int s = 0, e = 0; - for (size_t j = 0; j < in_tensors.size(); ++j) { - e += in_tensors[j].dims()[0]; - auto sub_out = out_tensor->Slice(s, e); - paddle::framework::TensorCopy(in_tensors[j], out_place, - *(dev_ctxes_[in_places[j]]), &sub_out); - s = e; + auto dev_ctx = dev_ctxes_[out_place]; + RunAndRecordEvent(out_place, [in_tensors, out_tensor, dev_ctx, out_place] { + int s = 0, e = 0; + for (size_t j = 0; j < in_tensors.size(); ++j) { + e += in_tensors[j].dims()[0]; + auto sub_out = out_tensor->Slice(s, e); + paddle::framework::TensorCopy(in_tensors[j], out_place, *(dev_ctx), + &sub_out); + s = e; + } + }); +} + +void GatherOpHandle::WaitInputVarGenerated( + const std::vector &in_var_handles) { + for (auto *in : in_var_handles) { + if (in->generated_op_) { + for (auto pair : dev_ctxes_) { + in->generated_op_->Wait(pair.second); + } + } } } diff --git a/paddle/fluid/framework/details/gather_op_handle.h b/paddle/fluid/framework/details/gather_op_handle.h index 6c0231f642c05e6b558b7e2518a15e08c816fe4b..c394dd7a14b07cb956aa1aedfc0df4fa25744dd7 100644 --- a/paddle/fluid/framework/details/gather_op_handle.h +++ b/paddle/fluid/framework/details/gather_op_handle.h @@ -29,9 +29,7 @@ namespace framework { namespace details { struct GatherOpHandle : public OpHandleBase { - const std::vector &local_scopes_; - const std::vector &places_; - + public: GatherOpHandle(const std::vector &local_scopes, const std::vector &places); @@ -41,6 +39,11 @@ struct GatherOpHandle : public OpHandleBase { protected: void RunImpl() override; + void WaitInputVarGenerated(const std::vector &in_var_handles); + + private: + const std::vector &local_scopes_; + const std::vector &places_; }; } // namespace details diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 2da8c89d2df73215b748f102d9bbfc5b742cf97f..3cce2cc1640b3866130126424ff8fef18b8befc6 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -29,6 +29,7 @@ const f::DDim kDims = {20, 20}; struct TestGatherOpHandle { std::vector> ctxs_; std::vector local_scopes_; + std::vector param_scopes_; Scope g_scope_; std::unique_ptr op_handle_; std::vector> vars_; @@ -71,14 +72,19 @@ struct TestGatherOpHandle { void InitGatherOp(size_t input_scope_idx) { for (size_t j = 0; j < gpu_list_.size(); ++j) { local_scopes_.push_back(&(g_scope_.NewScope())); - 
local_scopes_[j]->Var("out"); + Scope& local_scope = local_scopes_.back()->NewScope(); + *local_scopes_.back() + ->Var(details::kLocalExecScopeName) + ->GetMutable() = &local_scope; + local_scope.Var("input"); + param_scopes_.emplace_back(&local_scope); } - local_scopes_[input_scope_idx]->Var("input"); + param_scopes_[input_scope_idx]->Var("out"); op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_)); // add input for (size_t j = 0; j < gpu_list_.size(); ++j) { - op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get(); + op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get()); auto* in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]); vars_.emplace_back(in_var_handle); op_handle_->AddInput(in_var_handle); @@ -115,7 +121,8 @@ struct TestGatherOpHandle { for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); ++input_scope_idx) { - auto in_var = local_scopes_[input_scope_idx]->Var("input"); + auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input"); + PADDLE_ENFORCE_NOT_NULL(in_var); auto in_selected_rows = in_var->GetMutable(); auto value = in_selected_rows->mutable_value(); value->mutable_data(kDims, gpu_list_[input_scope_idx]); @@ -128,10 +135,11 @@ struct TestGatherOpHandle { value->Resize(kDims); } - auto out_var = local_scopes_[output_scope_idx]->Var("out"); + auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out"); + PADDLE_ENFORCE_NOT_NULL(out_var); auto out_selected_rows = out_var->GetMutable(); - auto in_var = local_scopes_[output_scope_idx]->Var("input"); + auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input"); auto in_selected_rows = in_var->GetMutable(); out_selected_rows->mutable_value()->ShareDataWith( @@ -155,7 +163,8 @@ struct TestGatherOpHandle { f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor); float* ct = result_tensor.data(); - for (int64_t j = 0; j < f::product(kDims); ++j) { + for (int64_t j = 0; + j < f::product(kDims) * static_cast(gpu_list_.size()); ++j) { ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5); } } diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 4d76dbf7f6ffcf6c82ebf7defd9334bbe64a451c..3413467b149539bcff42d78a9a6fe315d6558bb4 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -34,7 +34,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶ms, - const std::vector &local_scopes, + const std::vector &local_scopes, bool skip_scale_loss, platform::NCCLContextMap *nccl_ctxs) : loss_var_name_(loss_var_name), places_(places), @@ -45,7 +45,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶ms, - const std::vector &local_scopes) + const std::vector &local_scopes, bool skip_scale_loss) : loss_var_name_(loss_var_name), places_(places), local_scopes_(local_scopes) { @@ -53,6 +53,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( for (auto &p : params) { grad_names_.insert(GradVarName(p)); } + skip_scale_loss_ = skip_scale_loss; } void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, @@ -60,7 +61,8 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, const platform::Place &p, const size_t &i) const { auto *op_handle = result->ops_.back().get(); - op_handle->dev_ctxes_[p] = 
platform::DeviceContextPool::Instance().Get(p); + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); auto var_names = op.InputArgumentNames(); @@ -76,6 +78,33 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, } } +bool MultiDevSSAGraphBuilder::IsDistTrainOp(const OpDesc &op, + OpDesc *send_op) const { + if (send_op == nullptr) { + return false; + } + + auto checker = [&](const std::vector opvars, + const std::vector sendvars) -> bool { + bool is_dist_train_op = false; + for (auto &var : opvars) { + if (var.find(".block") != std::string::npos && + std::find(sendvars.begin(), sendvars.end(), var) != sendvars.end()) { + is_dist_train_op = true; + break; + } + } + return is_dist_train_op; + }; + + if (op.Type() == "split") { + return checker(op.OutputArgumentNames(), send_op->InputArgumentNames()); + } else if (op.Type() == "concat") { + return checker(op.InputArgumentNames(), send_op->OutputArgumentNames()); + } + return false; +} + std::unique_ptr MultiDevSSAGraphBuilder::Build( const ProgramDesc &program) const { auto graph = new SSAGraph(); @@ -87,103 +116,40 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( std::unordered_map>>>( places_.size()); - bool is_forwarding = true; + // Find "send" op first for split is in front of send. + OpDesc *send_op = nullptr; for (auto *op : program.Block(0).AllOps()) { - bool change_forward = false; - if (!is_forwarding) { - // FIXME(yy): Do not hard code like this - if (op->OutputArgumentNames().size() == 1 && - op->OutputArgumentNames()[0] == GradVarName(loss_var_name_)) { - continue; // Drop fill 1. for backward coeff; - } - } - - // append send op if program is distributed trainer main program. - // always use the first device - if (!is_forwarding && op->Type() == "send") { - auto &p = places_[0]; - auto *s = local_scopes_[0]; - // FIXME(wuyi): send op always copy from GPU 0 - result.ops_.emplace_back(new SendOpHandle(*op, s, p)); - // Create inputs for output on original place and no ssa output - // is created for send op. - CreateOpHandleIOs(&result, *op, p, 0); - continue; + if (op->Type() == "send") { + send_op = op; + break; } + } - for (size_t i = 0; i < places_.size(); ++i) { - auto &p = places_[i]; - auto *s = local_scopes_[i]; - - result.ops_.emplace_back(new ComputationOpHandle(*op, s, p)); - auto *op_handle = result.ops_.back().get(); - CreateOpHandleIOs(&result, *op, p, i); - - auto var_names = op->OutputArgumentNames(); - - if (is_forwarding) { - if (var_names.size() == 1 && var_names[0] == loss_var_name_) { -// Insert ScaleCost OpHandle -#ifdef PADDLE_WITH_CUDA - auto *communication_dev_ctx = nccl_ctxs_->DevCtx(p); -#else - auto *communication_dev_ctx = - platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); -#endif - - op_handle = new ScaleLossGradOpHandle(local_scopes_.size(), s, p, - communication_dev_ctx); - result.ops_.emplace_back(op_handle); - - // FIXME: Currently ScaleLossGradOp only use device_count as scale - // factor. So it does not depend on any other operators. - // VarHandle *loss = GetVarHandle(loss_var_name, place); - // loss->pending_ops_.emplace_back(op_handle); - // op_handle->inputs_.emplace_back(loss); - - CreateOpOutput(&result, op_handle, GradVarName(loss_var_name_), p, i); - change_forward = true; - } + bool is_forwarding = true; + for (auto *op : program.Block(0).AllOps()) { + if (op->Type() == "send") { + // append send op if program is distributed trainer main program. 
+ // always use the first device + CreateSendOp(&result, *op); + } else if (IsDistTrainOp(*op, send_op)) { + CreateComputationalOps(&result, *op, 1); + } else if (IsScaleLossOp(*op)) { + if (!skip_scale_loss_) { + CreateScaleLossGradOp(&result); } - } - - if (change_forward) { is_forwarding = false; - } - - if (!is_forwarding) { - auto var_names = op->OutputArgumentNames(); - // Currently, we assume that once gradient is generated, it can be - // broadcast, and each gradient is only broadcast once. But there are no - // other cases, for example, we need to adjust the gradient according to - // the input when we get the gradient, which is not considered at present. - for (auto &og : var_names) { - if (grad_names_.count(og) != 0 && - og_has_been_broadcast.count(og) == 0) { // is param grad - // Insert NCCL AllReduce Op - og_has_been_broadcast.insert(og); -#ifdef PADDLE_WITH_CUDA - result.ops_.emplace_back( - new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_)); - auto *op_handle = result.ops_.back().get(); - - for (size_t i = 0; i < places_.size(); ++i) { - auto &p = places_[i]; - auto &vars = result.vars_[i][og]; - - if (vars.empty()) { // This device has no data. continue. - continue; - } - auto &prev_grad = vars[vars.size() - 1]; - op_handle->AddInput(prev_grad.get()); - - auto var = new VarHandle(vars.size() - 1, i, og, p); - vars.emplace_back(var); - op_handle->AddOutput(var); + } else { + CreateComputationalOps(&result, *op, places_.size()); + if (!is_forwarding) { + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. But there are no + // other cases, for example, we need to adjust the gradient according to + // the input when we get the gradient, which is not considered at + // present. 
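+ // e.g. a gradient registered in grad_names_ is all-reduced only the first
+ // time it is produced; IsParameterGradientOnce() records it in
+ // og_has_been_broadcast so later occurrences are skipped.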
+ for (auto &og : op->OutputArgumentNames()) { + if (IsParameterGradientOnce(og, &og_has_been_broadcast)) { + InsertNCCLAllReduceOp(&result, og); } -#else - PADDLE_ENFORCE("Not implemented"); -#endif } } } @@ -207,7 +173,96 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( } return std::unique_ptr(graph); -} // namespace details +} + +void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp( + SSAGraph *result, const std::string &og) const { +#ifdef PADDLE_WITH_CUDA + result->ops_.emplace_back( + new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_)); + auto *op_handle = result->ops_.back().get(); + + for (size_t i = 0; i < places_.size(); ++i) { + auto &p = places_[i]; + auto &vars = result->vars_[i][og]; + PADDLE_ENFORCE(!vars.empty()); + auto &prev_grad = vars.back(); + op_handle->AddInput(prev_grad.get()); + + auto var = new VarHandle(vars.size() - 1, i, og, p); + vars.emplace_back(var); + op_handle->AddOutput(var); + } +#else + PADDLE_ENFORCE("Not implemented"); +#endif +} + +bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( + const std::string &og, + std::unordered_set *og_has_been_broadcast) const { + bool is_pg_once = + grad_names_.count(og) != 0 && og_has_been_broadcast->count(og) == 0; + if (is_pg_once) { + // Insert NCCL AllReduce Op + og_has_been_broadcast->insert(og); + } + return is_pg_once; +} + +void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const { + for (size_t i = 0; i < places_.size(); ++i) { +// Insert ScaleCost OpHandle +#ifdef PADDLE_WITH_CUDA + auto *communication_dev_ctx = nccl_ctxs_->DevCtx(places_[i]); +#else + auto *communication_dev_ctx = + platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); +#endif + + auto *op_handle = + new ScaleLossGradOpHandle(local_scopes_.size(), local_scopes_[i], + places_[i], communication_dev_ctx); + result->ops_.emplace_back(op_handle); + + // FIXME: Currently ScaleLossGradOp only use device_count as scale + // factor. So it does not depend on any other operators. + // VarHandle *loss = GetVarHandle(loss_var_name, place); + // loss->pending_ops_.emplace_back(op_handle); + // op_handle->inputs_.emplace_back(loss); + + CreateOpOutput(result, op_handle, GradVarName(loss_var_name_), places_[i], + i); + } +} + +void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result, + const OpDesc &op, + size_t num_places) const { + for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { + auto p = places_[scope_idx]; + auto s = local_scopes_[scope_idx]; + result->ops_.emplace_back(new ComputationOpHandle(op, s, p)); + CreateOpHandleIOs(result, op, p, scope_idx); + } +} + +void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result, + const OpDesc &op) const { + auto &p = places_[0]; + auto *s = local_scopes_[0]; + // FIXME(wuyi): send op always copy from GPU 0 + result->ops_.emplace_back(new SendOpHandle(op, s, p)); + // Create inputs for output on original place and no ssa output + // is created for send op. 
+ CreateOpHandleIOs(result, op, p, 0); +} + +bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const { + // FIXME(yy): Do not hard code like this + return op.OutputArgumentNames().size() == 1 && + op.OutputArgumentNames()[0] == GradVarName(loss_var_name_); +} } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index f1518d75b421006db6311c3b0f602e47000ab381..dc3da70eda2abaa1a312c25aedf94fa7e427c78a 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -34,12 +34,14 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::string &loss_var_name, const std::unordered_set ¶ms, const std::vector &local_scopes, + bool skip_scale_loss, platform::NCCLContextMap *nccl_ctxs); #else MultiDevSSAGraphBuilder(const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶ms, - const std::vector &local_scopes); + const std::vector &local_scopes, + bool skip_scale_loss); #endif std::unique_ptr Build(const ProgramDesc &program) const override; @@ -57,6 +59,24 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { #ifdef PADDLE_WITH_CUDA platform::NCCLContextMap *nccl_ctxs_; #endif + bool skip_scale_loss_; + + bool IsScaleLossOp(const OpDesc &op) const; + + void CreateSendOp(SSAGraph *result, const OpDesc &op) const; + + bool IsDistTrainOp(const OpDesc &op, OpDesc *send_op) const; + + void CreateComputationalOps(SSAGraph *result, const OpDesc &op, + size_t num_places) const; + + void CreateScaleLossGradOp(SSAGraph *result) const; + + bool IsParameterGradientOnce( + const std::string &og, + std::unordered_set *og_has_been_broadcast) const; + + void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc index e587210b357ea6caa3272903d8aa6b3e4b2e8228..b055bb48f608c9fd9cc671d175cb463d25dc489b 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc @@ -13,8 +13,8 @@ // limitations under the License. 
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" - #include +#include "paddle/fluid/framework/details/reduce_and_gather.h" namespace paddle { namespace framework { @@ -29,32 +29,6 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle( } } -struct ReduceLoDTensor { - const std::vector &src_tensors_; - LoDTensor &dst_tensor_; - - ReduceLoDTensor(const std::vector &src, LoDTensor *dst) - : src_tensors_(src), dst_tensor_(*dst) {} - - template - void operator()() const { - PADDLE_ENFORCE(!src_tensors_.empty()); - auto &t0 = src_tensors_[0]; - PADDLE_ENFORCE_NE(t0.numel(), 0); - dst_tensor_.Resize(t0.dims()); - T *dst = dst_tensor_.mutable_data(platform::CPUPlace()); - std::copy(t0.data(), t0.data() + t0.numel(), dst); - - for (size_t i = 1; i < src_tensors_.size(); ++i) { - auto &t = src_tensors_[i]; - PADDLE_ENFORCE_EQ(t.dims(), t0.dims()); - PADDLE_ENFORCE_EQ(t.type(), t0.type()); - std::transform(t.data(), t.data() + t.numel(), dst, dst, - [](T a, T b) -> T { return a + b; }); - } - } -}; - void NCCLAllReduceOpHandle::RunImpl() { if (inputs_.size() == 1) { return; // No need to all reduce when GPU count = 1; @@ -69,21 +43,21 @@ void NCCLAllReduceOpHandle::RunImpl() { int dtype = -1; size_t numel = 0; - std::vector lod_tensors; + std::vector lod_tensors; for (size_t i = 0; i < local_scopes_.size(); ++i) { auto *s = local_scopes_[i]; auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get(); auto &lod_tensor = local_scope.FindVar(var_name)->Get(); - lod_tensors.emplace_back(lod_tensor); + lod_tensors.emplace_back(&lod_tensor); } - if (platform::is_gpu_place(lod_tensors[0].place())) { + if (platform::is_gpu_place(lod_tensors[0]->place())) { std::vector> all_reduce_calls; for (size_t i = 0; i < local_scopes_.size(); ++i) { auto &p = places_[i]; - auto &lod_tensor = lod_tensors[i]; + auto &lod_tensor = *lod_tensors[i]; void *buffer = const_cast(lod_tensor.data()); if (dtype == -1) { @@ -119,7 +93,7 @@ void NCCLAllReduceOpHandle::RunImpl() { // Reduce All Tensor to trg in CPU ReduceLoDTensor func(lod_tensors, &trg); - VisitDataType(ToDataType(lod_tensors[0].type()), func); + VisitDataType(ToDataType(lod_tensors[0]->type()), func); for (size_t i = 0; i < local_scopes_.size(); ++i) { auto &scope = diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h index ad14a3c5cb4625fa121cad2daed389c441e78771..a0c321843e3fc5abcbd1ef2ce2e153250269aa7d 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h @@ -27,10 +27,6 @@ namespace framework { namespace details { struct NCCLAllReduceOpHandle : public OpHandleBase { - const std::vector &local_scopes_; - const std::vector &places_; - const platform::NCCLContextMap &nccl_ctxs_; - NCCLAllReduceOpHandle(const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap &ctxs); @@ -43,6 +39,11 @@ struct NCCLAllReduceOpHandle : public OpHandleBase { protected: void RunImpl() override; + + private: + const std::vector &local_scopes_; + const std::vector &places_; + const platform::NCCLContextMap &nccl_ctxs_; }; } // namespace details diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index a9a6c8d39cf8741f7d9c91579a650ad742cec381..00f213f3ed294adcce7c540e3ff346de8e2be7fb 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -27,28 +27,15 @@ 
namespace details { constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@"; class OpHandleBase { - private: - DISABLE_COPY_AND_ASSIGN(OpHandleBase); - public: - std::vector inputs_; - std::vector outputs_; - std::unordered_map - dev_ctxes_; - -#ifdef PADDLE_WITH_CUDA - std::unordered_map events_; -#endif - OpHandleBase() {} + virtual ~OpHandleBase(); + std::string DebugString() const; virtual std::string Name() const = 0; - virtual ~OpHandleBase(); - void Run(bool use_event); virtual void Wait(platform::DeviceContext *waited_dev); @@ -61,6 +48,18 @@ class OpHandleBase { // will likely block other computations. virtual bool IsMultiDeviceTransfer() { return false; } + const platform::DeviceContext *DeviceContext(platform::Place place) { + return dev_ctxes_[place]; + } + + void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) { + dev_ctxes_[place] = ctx_; + } + + const std::vector &Inputs() const { return inputs_; } + + const std::vector &Outputs() const { return outputs_; } + protected: void RunAndRecordEvent(const std::function &callback); @@ -68,6 +67,18 @@ class OpHandleBase { const std::function &callback); virtual void RunImpl() = 0; + + std::vector inputs_; + std::vector outputs_; + std::unordered_map + dev_ctxes_; + +#ifdef PADDLE_WITH_CUDA + std::unordered_map events_; +#endif + + DISABLE_COPY_AND_ASSIGN(OpHandleBase); }; } // namespace details diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index d73604ad185a66ade0168f585d1951d0d7d4a5f9..06603db31e0092382c0cc05482a038473d647ef1 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -14,6 +14,9 @@ limitations under the License. */ #pragma once +#include +#include +#include #include "paddle/fluid/framework/grad_op_desc_maker.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_proto_maker.h" diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h new file mode 100644 index 0000000000000000000000000000000000000000..2b95a284990da8f9b7c16d6e4221eb1ed061f74b --- /dev/null +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -0,0 +1,94 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
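+// This header hosts the shared reduction helpers: ReduceLoDTensor sums a set
+// of equally-shaped LoDTensors into one CPU tensor, and GatherSelectedRows
+// concatenates the rows of several SelectedRows into a single output.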
+ +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/details/reduce_and_gather.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/selected_rows.h" +namespace paddle { +namespace framework { +namespace details { + +struct ReduceLoDTensor { + const std::vector &src_tensors_; + LoDTensor &dst_tensor_; + + ReduceLoDTensor(const std::vector &src, LoDTensor *dst) + : src_tensors_(src), dst_tensor_(*dst) {} + + template + void operator()() const { + PADDLE_ENFORCE(!src_tensors_.empty()); + auto &t0 = *src_tensors_[0]; + PADDLE_ENFORCE_NE(t0.numel(), 0); + dst_tensor_.Resize(t0.dims()); + T *dst = dst_tensor_.mutable_data(platform::CPUPlace()); + std::copy(t0.data(), t0.data() + t0.numel(), dst); + + for (size_t i = 1; i < src_tensors_.size(); ++i) { + auto &t = *src_tensors_[i]; + PADDLE_ENFORCE_EQ(t.dims(), t0.dims()); + PADDLE_ENFORCE_EQ(t.type(), t0.type()); + std::transform(t.data(), t.data() + t.numel(), dst, dst, + [](T a, T b) -> T { return a + b; }); + } + } +}; + +inline void GatherSelectedRows( + const std::vector &src_selecte_rows_, + const std::vector &in_places, + const std::unordered_map &dev_ctxes, + const platform::Place &out_place, SelectedRows *dst_selecte_rows) { + PADDLE_ENFORCE(!src_selecte_rows_.empty()); + + std::vector in_tensors; + std::vector out_rows; + + for (auto in_sr_ptr : src_selecte_rows_) { + auto &in_sr = *in_sr_ptr; + in_tensors.emplace_back(in_sr.value()); + out_rows.insert(out_rows.end(), in_sr.rows().begin(), in_sr.rows().end()); + } + + auto &pre_in = src_selecte_rows_[0]; + + auto &dst_tensor = *dst_selecte_rows; + dst_tensor.set_height(pre_in->height()); + dst_tensor.set_rows(out_rows); + size_t rows = out_rows.size(); + DDim out_dim = pre_in->GetCompleteDims(); + out_dim[0] = static_cast(rows); + dst_tensor.mutable_value()->Resize(out_dim); + dst_tensor.mutable_value()->mutable_data(out_place, pre_in->value().type()); + Tensor *out_tensor = dst_tensor.mutable_value(); + + // copy + int s = 0, e = 0; + for (size_t j = 0; j < in_tensors.size(); ++j) { + e += in_tensors[j].dims()[0]; + auto sub_out = out_tensor->Slice(s, e); + paddle::framework::TensorCopy(in_tensors[j], out_place, + *(dev_ctxes.at(in_places[j])), &sub_out); + s = e; + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc new file mode 100644 index 0000000000000000000000000000000000000000..409e8f72b841de03dcb50e62de447ae9895df2c0 --- /dev/null +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -0,0 +1,169 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
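+// ReduceOpHandle reduces one variable from every place into a single place:
+// SelectedRows inputs are gathered (rows concatenated) via GatherSelectedRows,
+// while LoDTensor inputs are summed, on the CPU through ReduceLoDTensor or on
+// the GPU through ncclReduce.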
+ +#include "paddle/fluid/framework/details/reduce_op_handle.h" +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/reduce_and_gather.h" +#include "paddle/fluid/framework/details/variable_visitor.h" + +namespace paddle { +namespace framework { +namespace details { + +void ReduceOpHandle::RunImpl() { + // the input and output may have dummy var. + auto in_var_handles = DynamicCast(inputs_); + + PADDLE_ENFORCE_EQ( + in_var_handles.size(), places_.size(), + "The number of output should equal to the number of places."); + + VarHandle *out_var_handle; + { + auto out_var_handles = DynamicCast(outputs_); + + PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, + "The number of output should be one."); + out_var_handle = out_var_handles.front(); + } + + auto in_0_handle = in_var_handles[0]; + + std::vector var_scopes; + for (auto *s : local_scopes_) { + var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get()); + } + + auto pre_in_var = + var_scopes.at(in_0_handle->scope_idx_)->FindVar(in_0_handle->name_); + PADDLE_ENFORCE_NOT_NULL(pre_in_var); + + // Wait input done, this Wait is asynchronous operation + WaitInputVarGenerated(in_var_handles); + auto pre_place = in_0_handle->place_; + std::vector in_places; + auto pre_in_tensor = VariableVisitor::GetMutableTensor(pre_in_var); + for (auto *in_handle : in_var_handles) { + auto in_p = in_handle->place_; + PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(), + "Places must be all on CPU or all on CUDA."); + in_places.emplace_back(in_p); + + auto in_var = + var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_); + PADDLE_ENFORCE_NOT_NULL(in_var); + + auto in_tensor = VariableVisitor::GetMutableTensor(in_var); + PADDLE_ENFORCE_EQ(in_tensor.type(), pre_in_tensor.type(), + "The type of input is not consistent."); + } + + auto out_var = + var_scopes.at(out_var_handle->scope_idx_)->FindVar(out_var_handle->name_); + PADDLE_ENFORCE_NOT_NULL(out_var); + + if (pre_in_var->IsType()) { + std::vector in_selected_rows = + GetInputValues(in_var_handles, var_scopes); + + GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, + out_var_handle->place_, + out_var->GetMutable()); + } else { + std::vector lod_tensors = + GetInputValues(in_var_handles, var_scopes); + + if (paddle::platform::is_cpu_place(pre_place)) { + ReduceLoDTensor func(lod_tensors, + out_var->GetMutable()); + VisitDataType(ToDataType(lod_tensors[0]->type()), func); + } else if (paddle::platform::is_gpu_place(pre_place)) { +#ifdef PADDLE_WITH_CUDA + auto pre_in = pre_in_var->Get(); + VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var); + VariableVisitor::GetMutableTensor(out_var).mutable_data( + out_var_handle->place_, pre_in.type()); + + auto out_p = out_var_handle->place_; + int root = boost::get(out_p).device; + std::vector> all_reduce_calls; + for (size_t i = 0; i < var_scopes.size(); ++i) { + auto &p = in_places[i]; + auto &lod_tensor = *lod_tensors[i]; + + int dev_id = boost::get(p).device; + auto &nccl_ctx = nccl_ctxs_->at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + + void *buffer = const_cast(lod_tensor.data()); + void *recvbuffer = nullptr; + if (root == dev_id) { + recvbuffer = + out_var->GetMutable()->mutable_data( + out_var_handle->place_); + } + + int type = platform::ToNCCLDataType(lod_tensor.type()); + all_reduce_calls.emplace_back([=] { + PADDLE_ENFORCE(platform::dynload::ncclReduce( + buffer, recvbuffer, static_cast(lod_tensor.numel()), + static_cast(type), ncclSum, root, comm, stream)); + }); 
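+ // ncclReduce is only enqueued above; the RunAndRecordEvent below issues
+ // all per-device calls inside one NCCLGroupGuard so they are launched as a
+ // single group.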
+ } + + this->RunAndRecordEvent([&] { + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); + } + }); +#else + PADDLE_THROW("CUDA is not support."); +#endif + } else { + PADDLE_THROW("Place should be CPUPlace or CUDAPlace."); + } + } +} + +template +std::vector ReduceOpHandle::GetInputValues( + const std::vector &in_var_handles, + const std::vector &var_scopes) const { + std::vector in_selected_rows; + for (auto *in_handle : in_var_handles) { + auto &in_sr = var_scopes.at(in_handle->scope_idx_) + ->FindVar(in_handle->name_) + ->Get(); + in_selected_rows.emplace_back(&in_sr); + } + return in_selected_rows; +} + +void ReduceOpHandle::WaitInputVarGenerated( + const std::vector &in_var_handles) { + for (auto *in : in_var_handles) { + if (in->generated_op_) { + for (auto pair : dev_ctxes_) { + in->generated_op_->Wait(pair.second); + } + } + } +} + +std::string ReduceOpHandle::Name() const { return "reduce"; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h new file mode 100644 index 0000000000000000000000000000000000000000..9746b3bdbde14d24a83a27a593c5f1ebfec201ff --- /dev/null +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -0,0 +1,73 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
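+// Declares ReduceOpHandle. On CUDA builds the handle also keeps the
+// NCCLContextMap so the GPU path can look up the per-device communicator and
+// stream.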
+ +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/device_context.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace framework { +namespace details { + +struct ReduceOpHandle : public OpHandleBase { + const std::vector &local_scopes_; + const std::vector &places_; + +#ifdef PADDLE_WITH_CUDA + const platform::NCCLContextMap *nccl_ctxs_; + ReduceOpHandle(const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap *nccl_ctxs) + : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) { + if (nccl_ctxs_) { + for (auto &p_ctx : nccl_ctxs_->contexts_) { + dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get(); + } + } + } +#else + ReduceOpHandle(const std::vector &local_scopes, + const std::vector &places) + : local_scopes_(local_scopes), places_(places) {} +#endif + + std::string Name() const override; + + bool IsMultiDeviceTransfer() override { return false; }; + + protected: + void RunImpl() override; + + void WaitInputVarGenerated(const std::vector &in_var_handles); + + template + std::vector GetInputValues( + const std::vector &in_var_handles, + const std::vector &var_scopes) const; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..c17aabee53680fba10eac289cf8f8bd5f7d419e8 --- /dev/null +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -0,0 +1,286 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
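+// Test sketch: one local scope per place, each holding an "input" variable;
+// after ReduceOpHandle::Run the single "out" variable is checked against the
+// element-wise sum (LoDTensor) or the gathered rows (SelectedRows).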
+ +#include "paddle/fluid/framework/details/reduce_op_handle.h" +#include "gtest/gtest.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace details { +namespace f = paddle::framework; +namespace p = paddle::platform; + +// test data amount +const f::DDim kDims = {20, 20}; + +struct TestReduceOpHandle { + bool use_gpu_; + Scope g_scope_; + std::vector local_scopes_; + std::vector param_scopes_; + std::unique_ptr op_handle_; + std::vector> vars_; + std::vector gpu_list_; + std::vector> ctxs_; + +#ifdef PADDLE_WITH_CUDA + std::unique_ptr nccl_ctxs_; +#endif + + void WaitAll() { + for (size_t j = 0; j < ctxs_.size(); ++j) { + ctxs_[j]->Wait(); + } +#ifdef PADDLE_WITH_CUDA + if (nccl_ctxs_) { + nccl_ctxs_->WaitAll(); + } +#endif + } + + void InitCtxOnGpu(bool use_gpu) { + use_gpu_ = use_gpu; + if (use_gpu) { +#ifdef PADDLE_WITH_CUDA + int count = p::GetCUDADeviceCount(); + if (count <= 1) { + LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " + "device count is " + << count; + exit(0); + } + for (int i = 0; i < count; ++i) { + auto p = p::CUDAPlace(i); + gpu_list_.push_back(p); + ctxs_.emplace_back(new p::CUDADeviceContext(p)); + } + nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_)); +#else + PADDLE_THROW("CUDA is not support."); +#endif + } else { + int count = 8; + for (int i = 0; i < count; ++i) { + auto p = p::CPUPlace(); + gpu_list_.push_back(p); + ctxs_.emplace_back(new p::CPUDeviceContext(p)); + } +#ifdef PADDLE_WITH_CUDA + nccl_ctxs_.reset(nullptr); +#endif + } + } + + void InitReduceOp(size_t out_scope_idx) { + // init scope + for (size_t j = 0; j < gpu_list_.size(); ++j) { + local_scopes_.push_back(&(g_scope_.NewScope())); + Scope &local_scope = local_scopes_.back()->NewScope(); + *local_scopes_.back() + ->Var(details::kLocalExecScopeName) + ->GetMutable() = &local_scope; + local_scope.Var("input"); + param_scopes_.emplace_back(&local_scope); + } + param_scopes_[out_scope_idx]->Var("out"); + + if (use_gpu_) { +#ifdef PADDLE_WITH_CUDA + op_handle_.reset( + new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get())); +#else + PADDLE_THROW("CUDA is not support."); +#endif + } else { +#ifdef PADDLE_WITH_CUDA + op_handle_.reset( + new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get())); +#else + op_handle_.reset(new ReduceOpHandle(local_scopes_, gpu_list_)); +#endif + } + + // init op handle + // add input + for (size_t j = 0; j < gpu_list_.size(); ++j) { + if (!use_gpu_) { + op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get()); + } + auto *in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]); + in_var_handle->generated_op_ = nullptr; + vars_.emplace_back(in_var_handle); + op_handle_->AddInput(in_var_handle); + } + + // add dummy var + vars_.emplace_back(new DummyVarHandle()); + DummyVarHandle *in_dummy_var_handle = + static_cast(vars_.back().get()); + in_dummy_var_handle->generated_op_ = nullptr; + op_handle_->AddInput(in_dummy_var_handle); + + // add output + auto *out_var_handle = + new VarHandle(2, out_scope_idx, "out", gpu_list_[out_scope_idx]); + vars_.emplace_back(out_var_handle); + op_handle_->AddOutput(out_var_handle); + + // add dummy var + vars_.emplace_back(new DummyVarHandle()); + DummyVarHandle *dummy_var_handle = + static_cast(vars_.back().get()); + op_handle_->AddOutput(dummy_var_handle); + } + + void TestReduceSelectedRows(size_t output_scope_idx) { + int height = kDims[0] * 2; + std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, + 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; + 
std::vector send_vector(f::product(kDims)); + for (size_t k = 0; k < send_vector.size(); ++k) { + send_vector[k] = k; + } + + for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); + ++input_scope_idx) { + auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); + PADDLE_ENFORCE_NOT_NULL(in_var); + auto in_selected_rows = in_var->GetMutable(); + auto value = in_selected_rows->mutable_value(); + value->mutable_data(kDims, gpu_list_[input_scope_idx]); + + in_selected_rows->set_height(height); + in_selected_rows->set_rows(rows); + + paddle::framework::TensorFromVector( + send_vector, *(ctxs_[input_scope_idx]), value); + value->Resize(kDims); + } + + auto out_var = param_scopes_[output_scope_idx]->FindVar("out"); + PADDLE_ENFORCE_NOT_NULL(out_var); + auto out_selected_rows = out_var->GetMutable(); + + auto in_var = param_scopes_[output_scope_idx]->FindVar("input"); + auto in_selected_rows = in_var->GetMutable(); + + out_selected_rows->mutable_value()->ShareDataWith( + in_selected_rows->value()); + + op_handle_->Run(false); + + WaitAll(); + + p::CPUPlace cpu_place; + + auto &out_select_rows = out_var->Get(); + auto rt = out_select_rows.value(); + + PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal."); + for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { + PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]); + } + + f::Tensor result_tensor; + f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor); + float *ct = result_tensor.data(); + + for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) { + ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5); + } + } + + void TestReduceLodTensors(size_t output_scope_idx) { + std::vector send_vector(static_cast(f::product(kDims))); + for (size_t k = 0; k < send_vector.size(); ++k) { + send_vector[k] = k; + } + f::LoD lod{{0, 10, 20}}; + + for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); + ++input_scope_idx) { + auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); + PADDLE_ENFORCE_NOT_NULL(in_var); + auto in_lod_tensor = in_var->GetMutable(); + in_lod_tensor->mutable_data(kDims, gpu_list_[input_scope_idx]); + in_lod_tensor->set_lod(lod); + + paddle::framework::TensorFromVector( + send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor); + } + + auto out_var = param_scopes_[output_scope_idx]->FindVar("out"); + PADDLE_ENFORCE_NOT_NULL(out_var); + auto out_lodtensor = out_var->GetMutable(); + + auto in_var = param_scopes_[output_scope_idx]->FindVar("input"); + auto in_lodtensor = in_var->Get(); + + out_lodtensor->ShareDataWith(in_lodtensor); + + op_handle_->Run(false); + + WaitAll(); + + p::CPUPlace cpu_place; + + auto &rt = out_var->Get(); + + f::Tensor result_tensor; + f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor); + float *ct = result_tensor.data(); + + for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) { + ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5); + } + } +}; + +TEST(ReduceTester, TestCPUReduceTestSelectedRows) { + TestReduceOpHandle test_op; + size_t out_scope_idx = 0; + test_op.InitCtxOnGpu(false); + test_op.InitReduceOp(out_scope_idx); + test_op.TestReduceSelectedRows(out_scope_idx); +} +TEST(ReduceTester, TestCPUReduceTestLodTensor) { + TestReduceOpHandle test_op; + size_t out_scope_idx = 0; + test_op.InitCtxOnGpu(false); + test_op.InitReduceOp(out_scope_idx); + test_op.TestReduceLodTensors(out_scope_idx); +} +#ifdef PADDLE_WITH_CUDA + +TEST(ReduceTester, 
TestGPUReduceTestSelectedRows) { + TestReduceOpHandle test_op; + size_t out_scope_idx = 0; + test_op.InitCtxOnGpu(true); + test_op.InitReduceOp(out_scope_idx); + test_op.TestReduceSelectedRows(out_scope_idx); +} + +TEST(ReduceTester, TestGPUReduceTestLodTensor) { + TestReduceOpHandle test_op; + size_t out_scope_idx = 0; + test_op.InitCtxOnGpu(true); + test_op.InitReduceOp(out_scope_idx); + test_op.TestReduceLodTensors(out_scope_idx); +} +#endif + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h index ab7353a4fc56bebfe04696efd838dc4559218058..d93d599d46f130cf98f39f15697ce994a31e20c3 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h @@ -14,6 +14,8 @@ #pragma once +#include + #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" @@ -23,10 +25,6 @@ namespace framework { namespace details { struct ScaleLossGradOpHandle : public OpHandleBase { - float coeff_; - Scope *scope_; - platform::Place place_; - ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place, platform::DeviceContext *context); @@ -36,6 +34,11 @@ struct ScaleLossGradOpHandle : public OpHandleBase { protected: void RunImpl() override; + + private: + float coeff_; + Scope *scope_; + platform::Place place_; }; } // namespace details diff --git a/paddle/fluid/framework/details/send_op_handle.cc b/paddle/fluid/framework/details/send_op_handle.cc index 549b9d9abbe5bfd17df3509e0442bfa19b7ecd61..0763f92171e7813ec0ee8ca4f3aa42b76205130a 100644 --- a/paddle/fluid/framework/details/send_op_handle.cc +++ b/paddle/fluid/framework/details/send_op_handle.cc @@ -34,7 +34,10 @@ void SendOpHandle::RunImpl() { } in->generated_op_->Wait(dev_ctxes_[p]); } - this->RunAndRecordEvent([&] { op_->Run(*local_scope_, place_); }); + auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get(); + // FIXME(wuyi): can not use RunAndRecordEvent here, for it will cause dead + // lock. 
+ op_->Run(*tmp_scope, place_); } std::string SendOpHandle::Name() const { return "send"; } diff --git a/paddle/fluid/framework/details/send_op_handle.h b/paddle/fluid/framework/details/send_op_handle.h index 173f9d726145aeb9e85cc0fb9056eb57bf484098..2f78811fad50642b5e45776c41910df6f4cc48f6 100644 --- a/paddle/fluid/framework/details/send_op_handle.h +++ b/paddle/fluid/framework/details/send_op_handle.h @@ -28,10 +28,6 @@ namespace framework { namespace details { struct SendOpHandle : public OpHandleBase { - std::unique_ptr op_; - const Scope* local_scope_; - const platform::Place& place_; - SendOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope, const platform::Place& place); @@ -43,6 +39,11 @@ struct SendOpHandle : public OpHandleBase { protected: void RunImpl() override; + + private: + std::unique_ptr op_; + const Scope* local_scope_; + const platform::Place& place_; }; } // namespace details diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index 25e8c77bb489546092b2a93e052da7dd0dd5edf4..6a567527550883add08031e50aa8de2b204cf13d 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -117,12 +117,12 @@ void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) { std::string op_name = "op_" + std::to_string(op_id++); sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]" << std::endl; - for (auto in : op->inputs_) { + for (auto in : op->Inputs()) { std::string var_name = "var_" + std::to_string(vars[in]); sout << var_name << " -> " << op_name << std::endl; } - for (auto out : op->outputs_) { + for (auto out : op->Outputs()) { std::string var_name = "var_" + std::to_string(vars[out]); sout << op_name << " -> " << var_name << std::endl; } @@ -133,7 +133,7 @@ void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) { void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) { for (auto &op : graph->ops_) { - if (!op->outputs_.empty()) { + if (!op->Outputs().empty()) { continue; } auto *dummy_leaf = new DummyVarHandle(); diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 3d2bd633afff1d453d00faeca3b3dcf77f8dd5d7..5e6ed5cb7cdc534332d402380458f39aecd841b8 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -53,7 +53,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( }; auto InsertPendingOp = [&pending_ops](OpHandleBase &op_instance) { - pending_ops.insert({&op_instance, op_instance.inputs_.size()}); + pending_ops.insert({&op_instance, op_instance.Inputs().size()}); }; // Transform SSAGraph to pending_ops & pending_vars @@ -69,7 +69,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( } for (auto &op : graph_->ops_) { - if (op->inputs_.empty()) { // Special case, Op has no input. + if (op->Inputs().empty()) { // Special case, Op has no input. ready_ops.insert(op.get()); } else { InsertPendingOp(*op); @@ -99,7 +99,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( fetch_ops.emplace_back(op); for (auto &p : places_) { - op->dev_ctxes_[p] = fetch_ctxs_.Get(p); + op->SetDeviceContext(p, fetch_ctxs_.Get(p)); } for (auto *var : vars) { @@ -128,7 +128,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // // NOTE: DelayedOps have a lower priority. It will be scheduled after all // ready_ops have been performed. 
- if (ready_ops.empty() && allow_op_delay_) { + if (ready_ops.empty() && allow_op_delay_ && running_ops_ == 0) { run_all_ops(delayed_ops); } else { run_all_ops(ready_ops); @@ -140,7 +140,9 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( if (timeout) { if (exception_) { - throw * exception_; + auto exp = *exception_; + exception_.reset(); + throw exp; } else { continue; } @@ -180,7 +182,7 @@ void ThreadedSSAGraphExecutor::RunOp( op->Run(use_event_); VLOG(10) << op << " " << op->Name() << " Done "; running_ops_--; - ready_var_q->Extend(op->outputs_); + ready_var_q->Extend(op->Outputs()); VLOG(10) << op << " " << op->Name() << "Signal posted"; } catch (platform::EnforceNotMet ex) { exception_.reset(new platform::EnforceNotMet(ex)); diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index d70bbd4ef0eb02d1b473bf88e526996819aec5f9..d089b79d91327e38408439a8019ec5189ff6d189 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -22,6 +22,7 @@ #include #include "ThreadPool.h" // ThreadPool in thrird party +#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" namespace paddle { @@ -30,46 +31,6 @@ class Scope; namespace details { -template -class BlockingQueue { - public: - void Push(const T &item) { - { - std::lock_guard g(mutex_); - q_.emplace_back(item); - } - cv_.notify_one(); - } - - template - void Extend(const U &items) { - { - std::lock_guard g(mutex_); - for (auto &item : items) { - q_.emplace_back(item); - } - } - cv_.notify_all(); - } - - std::deque PopAll(size_t ms, bool *timeout) { - auto time = - std::chrono::system_clock::now() + std::chrono::milliseconds(ms); - std::unique_lock lock(mutex_); - *timeout = !cv_.wait_until(lock, time, [this] { return !q_.empty(); }); - std::deque ret; - if (!*timeout) { - std::swap(ret, q_); - } - return ret; - } - - private: - std::mutex mutex_; - std::condition_variable cv_; - std::deque q_; -}; - class ThreadedSSAGraphExecutor : public SSAGraphExecutor { public: ThreadedSSAGraphExecutor(size_t num_threads, bool use_event, diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index 2b887c67e6fc6ea78e42fbb9fd170f740db27d97..9f7fd69e64fe9d7ef0bf3037bea7f686cb2eee0b 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -61,6 +61,11 @@ struct VarHandle : public VarHandleBase { size_t scope_idx_; std::string name_; platform::Place place_; + + bool operator==(const VarHandle& o) const { + return o.generated_op_ == generated_op_ && o.name_ == name_ && + o.scope_idx_ == scope_idx_; + } }; // Dummy Variable. It is used to represent dependencies between operators diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc new file mode 100644 index 0000000000000000000000000000000000000000..10bac0fae9504215fab11dd8cca7c278feaa4bda --- /dev/null +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/framework/selected_rows.h" +namespace paddle { +namespace framework { +namespace details { +template +static void VisitVariable(Variable* var, Func* func) { + if (var->IsType()) { + (*func)(var->GetMutable()); + } else if (var->IsType()) { + (*func)(var->GetMutable()); + } else { + PADDLE_THROW("Not supported type %s", var->Type().name()); + } +} + +template +static void VisitVariable(const Variable& var, Func* func) { + if (var.IsType()) { + (*func)(var.Get()); + } else if (var.IsType()) { + (*func)(var.Get()); + } else { + PADDLE_THROW("Not supported type %s", var.Type().name()); + } +} + +struct TensorVisitor { + Tensor* result_{nullptr}; + + void operator()(LoDTensor* tensor) { result_ = tensor; } + + void operator()(SelectedRows* selected_rows) { + result_ = selected_rows->mutable_value(); + } + + template + void operator()() { + PADDLE_THROW("Not Support to get LoDTensor from %s", typeid(T).name()); + } +}; + +Tensor& VariableVisitor::GetMutableTensor(Variable* var) { + TensorVisitor vistor; + VisitVariable(var, &vistor); + return *vistor.result_; +} + +struct ShareDimsAndLoDVisitor { + Variable* trg_; + void operator()(const LoDTensor& val) { + auto* tensor = trg_->GetMutable(); + tensor->set_layout(val.layout()); + tensor->set_lod(val.lod()); + tensor->Resize(val.dims()); + } + + void operator()(const SelectedRows& val) { + auto* selected_rows = trg_->GetMutable(); + selected_rows->set_rows(val.rows()); + selected_rows->set_height(val.height()); + selected_rows->mutable_value()->Resize(val.value().dims()); + } + + template + void operator()(const T&) { + PADDLE_ENFORCE("ShareDimsAndLoD is not supported by type %s", + typeid(T).name()); + } +}; + +void VariableVisitor::ShareDimsAndLoD(const Variable& src, Variable* trg) { + ShareDimsAndLoDVisitor visitor{trg}; + VisitVariable(src, &visitor); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/variable_visitor.h b/paddle/fluid/framework/details/variable_visitor.h new file mode 100644 index 0000000000000000000000000000000000000000..67baa1895e4513738fa73d49c46660da92279b9d --- /dev/null +++ b/paddle/fluid/framework/details/variable_visitor.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { +namespace details { + +class VariableVisitor { + public: + static Tensor &GetMutableTensor(Variable *var); + + static void ShareDimsAndLoD(const Variable &src, Variable *trg); +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index a8c3e227db3f8f3781d0acd5e233d7bea1123df1..8e1f93c5ebd448903d70f9668539e077875836e4 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_method.h" +#include +#include #include "glog/logging.h" #include "paddle/fluid/framework/variable.h" diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h index d6130f421e122047c2f4ed315e6f2fb7484cda1a..7f504bfd232862c014cb59b6e8301eec74e0351f 100644 --- a/paddle/fluid/framework/feed_fetch_method.h +++ b/paddle/fluid/framework/feed_fetch_method.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h index b0d1e9f0a7074da33af1cd279ab913ab604150b1..fae792ad9fa766f456ed706cc9adeb4e34d20123 100644 --- a/paddle/fluid/framework/feed_fetch_type.h +++ b/paddle/fluid/framework/feed_fetch_type.h @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include #include #include "paddle/fluid/framework/lod_tensor.h" @@ -22,7 +21,8 @@ namespace framework { using FeedFetchType = LoDTensor; using FeedFetchList = std::vector; -static const std::string kFeedOpType = "feed"; -static const std::string kFetchOpType = "fetch"; +static const char kFeedOpType[] = "feed"; +static const char kFetchOpType[] = "fetch"; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h index cf697187d6225f3a1d2506120eebe14d4a41dff9..b4d3fa25c35fbf25b3d2fdd9fa1045dda0f773ec 100644 --- a/paddle/fluid/framework/grad_op_desc_maker.h +++ b/paddle/fluid/framework/grad_op_desc_maker.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include #include #include @@ -69,8 +70,7 @@ class GradOpDescMakerBase { " for input argument with a list of variables, " " drop_empty_grad is not allowed because it makes" " the correspondence bewteen a variable and its gradient" - " ambiguous. Use REGISTER_OP_EX to register the op" - " or call InputGrad(?,false) in GradOpDescMaker." + " ambiguous." " Op type %s", fwd_op_.Type()); diff --git a/paddle/fluid/framework/init.cc b/paddle/fluid/framework/init.cc index 75c557fa4243f4bd984314fac298e9335108e7a9..b30f276b4b7c61fda1b40273ce6ccfa19738da41 100644 --- a/paddle/fluid/framework/init.cc +++ b/paddle/fluid/framework/init.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include #include +#include #include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/operator.h" @@ -28,7 +29,7 @@ namespace framework { std::once_flag gflags_init_flag; std::once_flag p2p_init_flag; -void InitGflags(std::vector &argv) { +void InitGflags(std::vector argv) { std::call_once(gflags_init_flag, [&]() { int argc = argv.size(); char **arr = new char *[argv.size()]; @@ -65,7 +66,7 @@ void InitP2P(int count) { } void InitDevices(bool init_p2p) { - /*Init all avaiable devices by default */ + /*Init all available devices by default */ std::vector places; places.emplace_back(platform::CPUPlace()); diff --git a/paddle/fluid/framework/init.h b/paddle/fluid/framework/init.h index fae98a60b5111465375404609905980177f613b1..1155ca36049dc66e7ee40e8eca87285d7a728299 100644 --- a/paddle/fluid/framework/init.h +++ b/paddle/fluid/framework/init.h @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include +#include // NOLINT +#include +#include #include "gflags/gflags.h" #include "glog/logging.h" @@ -20,7 +22,7 @@ limitations under the License. */ namespace paddle { namespace framework { -void InitGflags(std::vector &argv); +void InitGflags(std::vector argv); void InitGLOG(const std::string &prog_name); diff --git a/paddle/fluid/framework/library_type.h b/paddle/fluid/framework/library_type.h index ea538731b469901a3357d624c5bb0fddc4058488..904cc013012b9c3ea8054816446844f6d2cda26b 100644 --- a/paddle/fluid/framework/library_type.h +++ b/paddle/fluid/framework/library_type.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include +#include namespace paddle { namespace framework { @@ -67,5 +68,5 @@ inline std::ostream& operator<<(std::ostream& out, LibraryType l) { return out; } -} // namespace -} // framework +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/lod_rank_table.h b/paddle/fluid/framework/lod_rank_table.h index ef83e71160e0e52071b033ea8b86e6da91bbfad2..8c6e8b0c66ead96f0e53b56ee951887730b0d77f 100644 --- a/paddle/fluid/framework/lod_rank_table.h +++ b/paddle/fluid/framework/lod_rank_table.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include +#include #include "paddle/fluid/framework/lod_tensor.h" namespace paddle { diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index d99a15547b77a0e0d71b14bd1c798cd1485720b0..29b3396bc9854cd3d3ac8d4283f48019c9a9c55f 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index eabfdc11a8b314c4af9626ded3edd1bcba212de1..076c45713015797f86a3611dd333132bae40044d 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/op_desc.h" +#include #include -#include +#include // NOLINT +#include #include #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" @@ -203,8 +205,8 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) { need_update_ = true; } -void OpDesc::SetBlockAttr(const std::string &name, BlockDesc &block) { - this->attrs_[name] = █ +void OpDesc::SetBlockAttr(const std::string &name, BlockDesc *block) { + this->attrs_[name] = block; need_update_ = true; } diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 614dd8cd00eb866cb8cbc41c3e03c25f968a7d2b..3ee36a47c156da67a9ff70852665fbbd464bea17 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include #include "paddle/fluid/framework/attribute.h" @@ -73,7 +74,7 @@ class OpDesc { void SetAttr(const std::string &name, const Attribute &v); - void SetBlockAttr(const std::string &name, BlockDesc &block); + void SetBlockAttr(const std::string &name, BlockDesc *block); Attribute GetAttr(const std::string &name) const; @@ -119,7 +120,7 @@ class OpDesc { void InferVarType(BlockDesc *block) const; - void MarkAsTarget() { desc_.set_is_target(true); } + void SetIsTarget(bool is_target) { desc_.set_is_target(is_target); } void Flush(); diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index 3a1036742c206961fe52660106ae947153e9b244..fab20d75f5a45257f243333c1998d7b2549a25f9 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/library_type.h" diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index 3116b03d0433ddf98613796b272238e5fe72ce6a..c479d7617cfa34cd381d84d15d5e214d57af52d0 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -12,6 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_proto_maker.h" +#include namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index cf56b0fa1894374956b3011c88bc70acdba1e464..0beb57ce1609d2e90c05d3255647bd321bc1f6a9 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -13,6 +13,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index f1424f13b445155fe4f28732408a2445ab1aa9b7..748317438b44bc4af84f13b25f8e4f88386388fb 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -16,6 +16,8 @@ limitations under the License. */ #include #include +#include +#include #include #include #include @@ -141,36 +143,6 @@ class OpKernelRegistrar : public Registrar { return 0; \ } -/** - * Macro to register Operator. When the input is duplicable, you should - * use REGISTER_OP_EX with drop_empty_grad=false instead. 
- */ -#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \ - grad_op_class) \ - REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \ - grad_op_class, true) - -// When an argument is duplicable, we need to use this version. -// Perhaps we can omit DropEmptyIG template parameter and -// only have one version of REGISTER_OP. -#define REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \ - grad_op_class, drop_empty_grad) \ - REGISTER_OPERATOR(grad_op_type, grad_op_class); \ - class _GradOpDescMaker_##grad_op_type##_ \ - : public ::paddle::framework::DefaultGradOpDescMaker { \ - using ::paddle::framework::DefaultGradOpDescMaker< \ - drop_empty_grad>::DefaultGradOpDescMaker; \ - \ - protected: \ - virtual std::string GradOpType() const { return #grad_op_type; } \ - }; \ - REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \ - op_maker_class); - -#define REGISTER_OP_WITH_KERNEL(op_type, ...) \ - REGISTER_OPERATOR(op_type, ::paddle::framework::OperatorWithKernel, \ - ##__VA_ARGS__) - #define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \ REGISTER_OPERATOR(op_type, op_class, op_maker_class) diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc index 0d791c8583537d410b838c1662755938353052a9..6dc4cf261bad3c004aa53fba5502fe166e3a47f7 100644 --- a/paddle/fluid/framework/op_registry_test.cc +++ b/paddle/fluid/framework/op_registry_test.cc @@ -202,8 +202,9 @@ class CosineOpComplete : public paddle::framework::CosineOp { }; TEST(OperatorRegistrar, Test) { - using namespace paddle::framework; - OperatorRegistrar reg("cos"); + paddle::framework::OperatorRegistrar< + CosineOpComplete, paddle::framework::CosineOpProtoAndCheckerMaker> + reg("cos"); } namespace paddle { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f97bd0827428feeb590fcad16c48f3461517a646..32576423a62a1a12f085d565e7ff267145bf979c 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -171,17 +171,6 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { return ss.str(); } -void OperatorBase::Rename(const std::string& old_name, - const std::string& new_name) { - for (auto& input : inputs_) { - std::replace(input.second.begin(), input.second.end(), old_name, new_name); - } - for (auto& output : outputs_) { - std::replace(output.second.begin(), output.second.end(), old_name, - new_name); - } -} - OperatorBase::OperatorBase(const std::string& type, const VariableNameMap& inputs, const VariableNameMap& outputs, @@ -327,7 +316,6 @@ bool OpSupportGPU(const std::string& op_type) { auto it = all_kernels.find(op_type); if (it == all_kernels.end()) { // All control operator must support GPU - return true; } for (auto& kern_pair : it->second) { @@ -554,7 +542,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, std::shared_ptr out(new Tensor); DataTransform(expected_kernel_key, kernel_type_for_var, *tensor_in, out.get()); - CopyVariableWithTensor(*var, *(out.get()), *trans_var); + CopyVariableWithTensor(*var, *(out.get()), trans_var); } } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index b7a7c69b4c8493f945926c75797c49d327a3197e..826cc57b725ab4b52e5d67ab82e939cbd62a8460 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -79,31 +79,28 @@ class OperatorBase { virtual ~OperatorBase() {} - template - inline const T& Attr(const std::string& name) const { 
- PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", - name); - return boost::get(attrs_.at(name)); - } - - /// if scope is not null, also show dimensions of arguments - virtual std::string DebugStringEx(const Scope* scope) const; - - std::string DebugString() const { return DebugStringEx(nullptr); } - - /// Net will call this interface function to Run an op. + /// Executor will call this interface function to Run an op. // The implementation should be written at RunImpl void Run(const Scope& scope, const platform::Place& place); // FIXME(typhoonzero): this is only used for recv_op to stop event_loop. virtual void Stop() {} - virtual bool IsNetOp() const { return false; } + /// if scope is not null, also show dimensions of arguments + virtual std::string DebugStringEx(const Scope* scope) const; + std::string DebugString() const { return DebugStringEx(nullptr); } virtual bool SupportGPU() const { return false; } - /// rename inputs outputs name - void Rename(const std::string& old_name, const std::string& new_name); + const std::string& Type() const { return type_; } + + template + inline const T& Attr(const std::string& name) const { + PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", + name); + return boost::get(attrs_.at(name)); + } + const AttributeMap& Attrs() const { return attrs_; } const VariableNameMap& Inputs() const { return inputs_; } const VariableNameMap& Outputs() const { return outputs_; } @@ -112,7 +109,7 @@ class OperatorBase { std::string Input(const std::string& name) const; //! Get a input which has multiple variables. const std::vector& Inputs(const std::string& name) const; - + //! Get all inputs variable names std::vector InputVars() const; //! Get a output with argument's name described in `op_proto` @@ -120,13 +117,9 @@ class OperatorBase { //! Get an output which has multiple variables. //! TODO add a vector_view to prevent memory copy. const std::vector& Outputs(const std::string& name) const; - + //! Get all outputs variable names virtual std::vector OutputVars(bool has_intermediate) const; - const std::string& Type() const { return type_; } - void SetType(const std::string& type) { type_ = type; } - const AttributeMap& Attrs() const { return attrs_; } - // Return a new operator instance, which is as same as this. // Use unique_ptr to prevent caller forget to delete this pointer. 
virtual std::unique_ptr Clone() const = 0; @@ -278,20 +271,6 @@ class ExecutionContext { return res; } - void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, - size_t j = 0) const { - PADDLE_ENFORCE_LT(i, InputSize(in)); - PADDLE_ENFORCE_LT(j, OutputSize(out)); - auto* in_var = MultiInputVar(in)[i]; - auto* out_var = MultiOutputVar(out)[j]; - if (!in_var->IsType()) return; - PADDLE_ENFORCE(out_var->IsType(), - "The %d-th output of Output(%s) must be LoDTensor.", j, out); - auto in_tensor = in_var->Get(); - auto* out_tensor = out_var->GetMutable(); - out_tensor->set_lod(in_tensor.lod()); - } - platform::Place GetPlace() const { return device_context_.GetPlace(); } template diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index 25f622b725277ac9bcca4622902162f3edf147e8..1bf8c81469bb4afdd00921cfa0acf6089dedbbaa 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -226,10 +226,8 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel, // test with multi inputs TEST(OpKernel, multi_inputs) { - using namespace paddle::framework; - paddle::framework::InitDevices(true); - proto::OpDesc op_desc; + paddle::framework::proto::OpDesc op_desc; op_desc.set_type("op_multi_inputs_with_kernel"); BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs()); @@ -243,12 +241,12 @@ TEST(OpKernel, multi_inputs) { paddle::platform::CPUPlace cpu_place; paddle::framework::Scope scope; - scope.Var("x0")->GetMutable(); - scope.Var("x1")->GetMutable(); - scope.Var("x2")->GetMutable(); - scope.Var("k0")->GetMutable(); - scope.Var("y0")->GetMutable(); - scope.Var("y1")->GetMutable(); + scope.Var("x0")->GetMutable(); + scope.Var("x1")->GetMutable(); + scope.Var("x2")->GetMutable(); + scope.Var("k0")->GetMutable(); + scope.Var("y0")->GetMutable(); + scope.Var("y1")->GetMutable(); auto op = paddle::framework::OpRegistry::CreateOp(op_desc); op->Run(scope, cpu_place); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 106b5f866ed5225d67082310e308984d8b3f19ed..de644e851999920251c762a75c050e8182e950c6 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -44,6 +44,7 @@ class ParallelExecutorPrivate { #endif std::vector> var_types_; + bool own_local_scope; }; std::vector &ParallelExecutor::GetLocalScopes() { @@ -56,21 +57,24 @@ ParallelExecutor::ParallelExecutor( const std::unordered_set ¶ms, const std::unordered_set &bcast_vars, const ProgramDesc &main_program, const std::string &loss_var_name, - Scope *scope, const std::vector &local_scopes, bool allow_op_delay) + Scope *scope, const std::vector &local_scopes, bool allow_op_delay, + bool customize_scale_loss) : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; // Step 1. Bcast the params to devs. // Create local scopes if (local_scopes.empty()) { + member_->own_local_scope = true; member_->local_scopes_.emplace_back(member_->global_scope_); for (size_t i = 1; i < member_->places_.size(); ++i) { member_->local_scopes_.emplace_back(&scope->NewScope()); } } else { + member_->own_local_scope = false; PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size()); for (size_t i = 0; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(local_scopes[i]); + member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); } } @@ -87,12 +91,13 @@ ParallelExecutor::ParallelExecutor( // Step 2. 
Convert main_program to SSA form and dependency graph. Also, insert // ncclOp #ifdef PADDLE_WITH_CUDA - details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name, - params, member_->local_scopes_, - member_->nccl_ctxs_.get()); + details::MultiDevSSAGraphBuilder builder( + member_->places_, loss_var_name, params, member_->local_scopes_, + customize_scale_loss, member_->nccl_ctxs_.get()); #else details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name, - params, member_->local_scopes_); + params, member_->local_scopes_, + customize_scale_loss); #endif auto graph = builder.Build(main_program); @@ -231,5 +236,13 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( } } +ParallelExecutor::~ParallelExecutor() { + if (member_->own_local_scope) { + for (size_t i = 1; i < member_->local_scopes_.size(); ++i) { + member_->global_scope_->DeleteScope(member_->local_scopes_[i]); + } + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 303ac3bc55cfed57a03765b27d8aba581eabd1c8..49da123d98181c3d3abcdd64d14c5583142eba58 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -40,7 +40,9 @@ class ParallelExecutor { const ProgramDesc& main_program, const std::string& loss_var_name, Scope* scope, const std::vector& local_scopes, - bool allow_op_delay); + bool allow_op_delay, bool customize_scale_loss); + + ~ParallelExecutor(); std::vector& GetLocalScopes(); diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index 77d17fbbccca0292e21acd5e8fa90448527b95c0..64fb028f83a539d17885186d5d8ee6ef26f095e9 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -27,10 +27,14 @@ BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) { return blocks_.back().get(); } -proto::ProgramDesc *ProgramDesc::Proto() { +void ProgramDesc::Flush() { for (auto &block : blocks_) { block->Flush(); } +} + +proto::ProgramDesc *ProgramDesc::Proto() { + Flush(); return &desc_; } @@ -52,7 +56,7 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { for (const auto &attr : op->Proto()->attrs()) { if (attr.type() == proto::AttrType::BLOCK) { size_t blk_idx = attr.block_idx(); - op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx)); + op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx)); } } } @@ -69,7 +73,7 @@ ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { for (const auto &attr : op->Proto()->attrs()) { if (attr.type() == proto::AttrType::BLOCK) { size_t blk_idx = attr.block_idx(); - op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx)); + op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx)); } } } diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index 4288081be72c44c0fc3584b50c41a270eac9e204..65fa0a0cfd5ba6d9b8765cee1309e118cb74348a 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -51,6 +51,8 @@ class ProgramDesc { size_t Size() const { return blocks_.size(); } + void Flush(); + proto::ProgramDesc *Proto(); // The output variable of feed_op is referenced as feed_target. 
diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc index 107c5bf8ecbc3b46dd5fae87c73d0be4f74d1587..57c1b822d8d4f095f33cba2bfd5210f7ee19dd9f 100644 --- a/paddle/fluid/framework/prune.cc +++ b/paddle/fluid/framework/prune.cc @@ -14,19 +14,19 @@ limitations under the License. */ #include "paddle/fluid/framework/prune.h" +#include + #include #include #include #include #include -#include - namespace paddle { namespace framework { -const std::string kFeedOpType = "feed"; -const std::string kFetchOpType = "fetch"; +const char kFeedOpType[] = "feed"; +const char kFetchOpType[] = "fetch"; bool HasDependentVar(const proto::OpDesc& op_desc, const std::set& dependent_vars) { @@ -68,7 +68,7 @@ bool HasSubBlock(const proto::OpDesc& op_desc) { // the child block to help pruning void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, int block_id, int parent_block_id, - std::set& dependent_vars) { + std::set* dependent_vars) { auto& block = input.blocks(block_id); auto& ops = block.ops(); @@ -90,11 +90,11 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, std::vector should_run; for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) { auto& op_desc = *op_iter; - if (IsTarget(op_desc) || HasDependentVar(op_desc, dependent_vars)) { + if (IsTarget(op_desc) || HasDependentVar(op_desc, *dependent_vars)) { // insert its input to the dependency graph for (auto& var : op_desc.inputs()) { for (auto& argu : var.arguments()) { - dependent_vars.insert(argu); + dependent_vars->insert(argu); } } should_run.push_back(true); @@ -138,7 +138,7 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, // GetSubBlockIndex(*op) is the idx of the sub_block in the input desc // output_block_id is the idx of the current block in the output desc prune_impl(input, output, GetSubBlockIndex(*op), output_block_id, - sub_block_dependent_vars); + &sub_block_dependent_vars); } } } @@ -181,7 +181,7 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) { std::set dependent_vars; output->clear_blocks(); - prune_impl(input, output, 0, -1, dependent_vars); + prune_impl(input, output, 0, -1, &dependent_vars); } void inference_optimize_impl(proto::ProgramDesc* input, int block_id) { diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 194df3e4a8b50700e2be01ce5ebca83b92501fb8..9091713158c8071d5386f14250e3c546284e7fd0 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -26,6 +26,11 @@ DEFINE_bool(benchmark, false, "Default cuda is asynchronous device, set to True will" "force op run in synchronous mode."); +DEFINE_bool( + eager_delete_scope, true, + "Delete local scope eagerly. It will reduce GPU memory usage but " + "slow down the destruction of variables.(around 1% performance harm)"); + namespace paddle { namespace framework { @@ -91,13 +96,13 @@ std::vector Scope::LocalVarNames() const { return known_vars; } -void Scope::DeleteScope(Scope* scope) { +void Scope::DeleteScope(Scope* scope) const { std::unique_lock lock(mutex_); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); this->kids_.erase(it); // When making memory benchmark on Fluid, we have to delete scope sync. 
- if (FLAGS_benchmark) { + if (FLAGS_benchmark || FLAGS_eager_delete_scope) { delete scope; } else { Async([scope] { delete scope; }); diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index c8cb70549f1d131b66fa7c6eeb35f3b7151a9e7f..abc82e452d732638a2f7315022074850f299a7ea 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -63,7 +63,7 @@ class Scope { /// Find the scope or an ancestor scope that contains the given variable. const Scope* FindScope(const Variable* var) const; - void DeleteScope(Scope* scope); + void DeleteScope(Scope* scope) const; /// Drop all kids scopes belonged to this scope. void DropKids(); diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index d9d6b7dd67f1c6e4bbd6a4e1a8f0843d4cb93c05..794e7f743413b068119afd5df232bfc2bb91a8c7 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -17,6 +17,52 @@ limitations under the License. */ namespace paddle { namespace framework { +struct ReAllocateVisitor { + ReAllocateVisitor(framework::Tensor* tensor, const framework::DDim& dims) + : tensor_(tensor), dims_(dims) {} + + template + void operator()() const { + framework::Tensor cpu_tensor; + platform::CPUPlace cpu; + T* ptr = cpu_tensor.mutable_data(dims_, cpu); + const T* old_ptr = + tensor_->memory_size() == 0 ? nullptr : tensor_->data(); + if (old_ptr != nullptr) { + std::copy(old_ptr, old_ptr + tensor_->numel(), ptr); + } + tensor_->ShareDataWith(cpu_tensor); + } + + framework::Tensor* tensor_; + framework::DDim dims_; +}; + +struct TensorCopyVisitor { + TensorCopyVisitor(framework::Tensor* dst, int64_t dst_offset, + const framework::Tensor src, int64_t src_offset, + int64_t size) + : dst_(dst), + dst_offset_(dst_offset), + src_(src), + src_offset_(src_offset), + size_(size) {} + + template + void operator()() const { + // TODO(Yancey1989): support other place + platform::CPUPlace cpu; + memory::Copy(cpu, dst_->mutable_data(cpu) + dst_offset_, cpu, + src_.data() + src_offset_, size_ * sizeof(T)); + } + + framework::Tensor* dst_; + int64_t dst_offset_; + framework::Tensor src_; + int64_t src_offset_; + int64_t size_; +}; + void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, const platform::DeviceContext& dev_ctx) { { // the 1st field, uint32_t version @@ -69,5 +115,66 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, TensorFromStream(is, selected_rows->mutable_value(), dev_ctx); } +bool SelectedRows::HasKey(int64_t key) const { + return std::find(rows_.begin(), rows_.end(), key) == rows_.end() ? 
false + : true; +} + +std::vector SelectedRows::Get(std::vector keys, + framework::Tensor* value) const { + PADDLE_ENFORCE(value->IsInitialized(), + "The value tensor should be initialized."); + std::vector non_keys; + int64_t value_width = value_->numel() / value_->dims()[0]; + PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0], + "output tensor should have the same shape with table " + "execpt the dims[0]."); + + for (size_t i = 0; i < keys.size(); ++i) { + int64_t index = Index(keys[i]); + if (index == -1) { + non_keys.push_back(keys[i]); + } else { + framework::VisitDataType( + framework::ToDataType(value_->type()), + TensorCopyVisitor(value, i * value_width, *value_.get(), + index * value_width, value_width)); + } + } + return non_keys; +} + +bool SelectedRows::Set(int64_t key, const framework::Tensor& value) { + PADDLE_ENFORCE(value.IsInitialized(), "The value should be initialized."); + if (value_->IsInitialized()) { + PADDLE_ENFORCE_EQ( + value.type(), value_->type(), + "The type of the value should be same with the original value"); + } + PADDLE_ENFORCE_EQ(value.dims()[0], static_cast(1), + "The first dim of value should be 1."); + auto index = Index(key); + bool is_new_key = false; + if (index == -1) { + rows_.push_back(key); + index = rows_.size() - 1; + is_new_key = true; + // whether need to resize the table + if (static_cast(rows_.size()) > value_->dims()[0]) { + auto dims = value_->dims(); + dims[0] = (dims[0] + 1) << 1; + framework::VisitDataType(framework::ToDataType(value.type()), + ReAllocateVisitor(value_.get(), dims)); + } + } + + framework::VisitDataType( + framework::ToDataType(value.type()), + TensorCopyVisitor(value_.get(), + index * value_->numel() / value_->dims()[0], value, + static_cast(0), value.numel())); + return is_new_key; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 8e2d9470d3954e0f66c74828a8d8292c2875a8f4..d6c9507b1681855e759a4b1b9d3dddf6fcb2fc13 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -14,15 +14,33 @@ limitations under the License. */ #pragma once +#include #include #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/memory/memcpy.h" namespace paddle { namespace framework { class SelectedRows { + /* + * @brief We can use the SelectedRows structure to reproduce a sparse table. + * A sparse table is a key-value structure that the key is an `int64_t` + * number, + * and the value is a Tensor which the first dimension is 0. + * You can use the following interface to operate the sparse table, and you + * can find + * some detail information from the comments of each interface: + * + * HasKey(key), whether the sparse table has the specified key. + * Set(key, value), set a key-value pair into the sparse table. + * Get(keys, value*), get value by given key list and apply it to the given + * value pointer + * with the specified offset. + * + */ public: SelectedRows(const std::vector& rows, const int64_t& height) : rows_(rows), height_(height) { @@ -50,12 +68,45 @@ class SelectedRows { void set_rows(const Vector& rows) { rows_ = rows; } - /** - * get the index of id in rows + /* + * @brief wheter has the specified key in the table. + * + * @return true if the key is exists. 
+ */ + bool HasKey(int64_t key) const; + + /* + * @brief Get value by the key list, if the + * + * @return a list of keys which does not exists in table + */ + std::vector Get(std::vector keys, + framework::Tensor* tensor) const; + + /* + * @brief Set a key-value pair into the table. + * This function will double the value memory if it's not engouth. + * + * @note: + * 1. The first dim of the value should be 1 + * 2. The value should be initialized and the data type + * should be the same with the table. + * + * @return true if the key is a new one, otherwise false + * + */ + bool Set(int64_t key, const Tensor& value); + + /* + * @brief Get the index of key in rows + * + * @return -1 if the key does not exists. */ - int64_t index(int64_t id) const { - auto it = std::find(rows_.begin(), rows_.end(), id); - PADDLE_ENFORCE(it != rows_.end(), "id should be in rows"); + int64_t Index(int64_t key) const { + auto it = std::find(rows_.begin(), rows_.end(), key); + if (it == rows_.end()) { + return static_cast(-1); + } return static_cast(std::distance(rows_.begin(), it)); } diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc index 960d8d64f04a819217413ff881977ce5fb5a30f2..39fe6d92940606084c28eec1a4d6486cb58844ce 100644 --- a/paddle/fluid/framework/selected_rows_test.cc +++ b/paddle/fluid/framework/selected_rows_test.cc @@ -17,7 +17,7 @@ namespace framework { class SelectedRowsTester : public ::testing::Test { public: - virtual void SetUp() override { + void SetUp() override { std::vector rows{0, 4, 7}; int64_t height = 10; int64_t row_numel = 100; @@ -59,5 +59,40 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) { ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims()); } +TEST_F(SelectedRowsTester, Table) { + platform::CPUPlace cpu; + SelectedRows table; + // initialize a sparse table + table.mutable_value()->Resize(framework::make_ddim({1, 100})); + table.mutable_value()->mutable_data(cpu); + table.mutable_rows()->push_back(1); + + int64_t key = 10000; + int64_t non_key = 999; + framework::Tensor value; + value.Resize(framework::make_ddim({1, 100})); + auto ptr = value.mutable_data(cpu); + ptr[0] = static_cast(10); + + ASSERT_EQ(table.rows().size(), static_cast(1)); + ASSERT_EQ(table.HasKey(key), false); + + table.Set(key, value); + + ASSERT_EQ(table.rows().size(), static_cast(2)); + ASSERT_EQ(table.HasKey(key), true); + // check re-allocate + ASSERT_EQ(table.value().dims()[0], static_cast(4)); + + framework::Tensor get_value; + get_value.mutable_data(framework::make_ddim({2, 100}), cpu); + std::vector keys({non_key, key}); + auto non_keys = table.Get(keys, &get_value); + + ASSERT_EQ(get_value.data()[100], static_cast(10)); + ASSERT_EQ(non_keys.size(), static_cast(1)); + ASSERT_EQ(non_keys[0], non_key); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc index dc9a79020f103dadfd9837cffb18ad5946f95f31..ddff2c7c261746ac9986e79cff3da7e0a9654adc 100644 --- a/paddle/fluid/framework/shape_inference.cc +++ b/paddle/fluid/framework/shape_inference.cc @@ -11,8 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ + #include "paddle/fluid/framework/shape_inference.h" -#include "grad_op_desc_maker.h" +#include +#include +#include +#include "paddle/fluid/framework/grad_op_desc_maker.h" #include "paddle/fluid/framework/operator.h" namespace paddle { diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h index bc02d700da5186cea5f370b9676e408f62a66a68..46c8feec001584a872f7f62682080e0e72c06f50 100644 --- a/paddle/fluid/framework/shape_inference.h +++ b/paddle/fluid/framework/shape_inference.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index d1b01ae05b808b229309e9689165483a11530c84..d2e60ab1dd16758a91d22ef6872edc5053ef88b3 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -20,7 +20,7 @@ namespace paddle { namespace framework { void TensorCopy(const Tensor& src, const platform::Place& dst_place, - const platform::DeviceContext& ctx, Tensor* dst) { + const platform::DeviceContext& ctx, Tensor* dst, bool sync) { VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " << dst_place; src.check_memory_size(); @@ -47,9 +47,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); auto ctx_gpu_place = boost::get(ctx_place); PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); - memory::Copy( - dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, - reinterpret_cast(ctx).stream()); + auto stream = + sync ? nullptr + : reinterpret_cast(ctx) + .stream(); + memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else if (platform::is_cpu_place(src_place) && platform::is_gpu_place(dst_place)) { auto src_cpu_place = boost::get(src_place); @@ -58,18 +60,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); auto ctx_gpu_place = boost::get(ctx_place); PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); - memory::Copy( - dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, - reinterpret_cast(ctx).stream()); + auto stream = + sync ? nullptr + : reinterpret_cast(ctx) + .stream(); + memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); } else if (platform::is_gpu_place(src_place) && platform::is_gpu_place(dst_place)) { auto src_gpu_place = boost::get(src_place); auto dst_gpu_place = boost::get(dst_place); auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); - memory::Copy( - dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, - reinterpret_cast(ctx).stream()); + auto stream = + sync ? nullptr + : reinterpret_cast(ctx) + .stream(); + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } #endif } diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 38b6d1c5c46dcce718f91d574ceea5de2099b787..3af68402dc56230171e858bf8f8f8c89c2bfe760 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/framework.pb.h" @@ -23,7 +24,8 @@ namespace paddle { namespace framework { void TensorCopy(const Tensor& src, const platform::Place& dst_place, - const platform::DeviceContext& ctx, Tensor* dst); + const platform::DeviceContext& ctx, Tensor* dst, + bool sync = false); void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst); diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index 9687a86ca25be7886e67028a38e54b3065c8e4b5..6e10885890cd2d4a0d77834944b37e291197b637 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -105,16 +105,14 @@ TEST(TensorCopy, Tensor) { } TEST(TensorFromVector, Tensor) { - using namespace paddle::framework; - using namespace paddle::platform; { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - Tensor cpu_tensor; + paddle::framework::Tensor cpu_tensor; // Copy to CPU Tensor - cpu_tensor.Resize(make_ddim({3, 3})); + cpu_tensor.Resize(paddle::framework::make_ddim({3, 3})); auto cpu_place = new paddle::platform::CPUPlace(); - TensorFromVector(src_vec, &cpu_tensor); + paddle::framework::TensorFromVector(src_vec, &cpu_tensor); // Compare Tensors const int* cpu_ptr = cpu_tensor.data(); @@ -125,8 +123,8 @@ TEST(TensorFromVector, Tensor) { } src_vec.erase(src_vec.begin(), src_vec.begin() + 5); - cpu_tensor.Resize(make_ddim({2, 2})); - TensorFromVector(src_vec, &cpu_tensor); + cpu_tensor.Resize(paddle::framework::make_ddim({2, 2})); + paddle::framework::TensorFromVector(src_vec, &cpu_tensor); cpu_ptr = cpu_tensor.data(); src_ptr = src_vec.data(); ASSERT_NE(src_ptr, cpu_ptr); @@ -140,23 +138,23 @@ TEST(TensorFromVector, Tensor) { #ifdef PADDLE_WITH_CUDA { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - Tensor cpu_tensor; - Tensor gpu_tensor; - Tensor dst_tensor; + paddle::framework::Tensor cpu_tensor; + paddle::framework::Tensor gpu_tensor; + paddle::framework::Tensor dst_tensor; // Copy to CPU Tensor cpu_tensor.Resize(make_ddim({3, 3})); auto cpu_place = new paddle::platform::CPUPlace(); - CPUDeviceContext cpu_ctx(*cpu_place); - TensorFromVector(src_vec, cpu_ctx, &cpu_tensor); + paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place); + paddle::framework::TensorFromVector(src_vec, cpu_ctx, &cpu_tensor); // Copy to GPUTensor - gpu_tensor.Resize(make_ddim({3, 3})); + gpu_tensor.Resize(paddle::framework::make_ddim({3, 3})); auto gpu_place = new paddle::platform::CUDAPlace(); - CUDADeviceContext gpu_ctx(*gpu_place); - TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); + paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place); + paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); // Copy from GPU to CPU tensor for comparison - TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); // Sync before Compare Tensors gpu_ctx.Wait(); @@ -172,11 +170,11 @@ TEST(TensorFromVector, Tensor) { src_vec.erase(src_vec.begin(), src_vec.begin() + 5); - cpu_tensor.Resize(make_ddim({2, 2})); - TensorFromVector(src_vec, cpu_ctx, &cpu_tensor); - gpu_tensor.Resize(make_ddim({2, 2})); - TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); - TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + cpu_tensor.Resize(paddle::framework::make_ddim({2, 2})); + paddle::framework::TensorFromVector(src_vec, cpu_ctx, &cpu_tensor); + 
gpu_tensor.Resize(paddle::framework::make_ddim({2, 2})); + paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); + paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); // Sync before Compare Tensors gpu_ctx.Wait(); @@ -197,18 +195,16 @@ TEST(TensorFromVector, Tensor) { } TEST(TensorToVector, Tensor) { - using namespace paddle::framework; - using namespace paddle::platform; { - Tensor src; - int* src_ptr = src.mutable_data({3, 3}, CPUPlace()); + paddle::framework::Tensor src; + int* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); for (int i = 0; i < 3 * 3; ++i) { src_ptr[i] = i; } - CPUPlace place; + paddle::platform::CPUPlace place; std::vector dst; - TensorToVector(src, &dst); + paddle::framework::TensorToVector(src, &dst); for (int i = 0; i < 3 * 3; ++i) { EXPECT_EQ(src_ptr[i], dst[i]); @@ -217,13 +213,13 @@ TEST(TensorToVector, Tensor) { #ifdef PADDLE_WITH_CUDA { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - Tensor gpu_tensor; - CUDAPlace place; - CUDADeviceContext gpu_ctx(place); - TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); + paddle::framework::Tensor gpu_tensor; + paddle::platform::CUDAPlace place; + paddle::platform::CUDADeviceContext gpu_ctx(place); + paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); std::vector dst; - TensorToVector(gpu_tensor, gpu_ctx, &dst); + paddle::framework::TensorToVector(gpu_tensor, gpu_ctx, &dst); for (int i = 0; i < 3 * 3; ++i) { EXPECT_EQ(src_vec[i], dst[i]); @@ -233,54 +229,54 @@ TEST(TensorToVector, Tensor) { } TEST(TensorContainsNAN, CPU) { - using namespace paddle::framework; - using namespace paddle::platform; { - Tensor src; - float* buf = src.mutable_data({3}, CPUPlace()); + paddle::framework::Tensor src; + float* buf = src.mutable_data({3}, paddle::platform::CPUPlace()); buf[0] = 0.0; buf[1] = NAN; buf[2] = 0.0; - ASSERT_TRUE(TensorContainsNAN(src)); + ASSERT_TRUE(paddle::framework::TensorContainsNAN(src)); buf[1] = 0.0; - ASSERT_FALSE(TensorContainsNAN(src)); + ASSERT_FALSE(paddle::framework::TensorContainsNAN(src)); } { - Tensor src; - float16* buf = src.mutable_data({3}, CPUPlace()); + paddle::framework::Tensor src; + paddle::platform::float16* buf = + src.mutable_data( + {3}, paddle::platform::CPUPlace()); buf[0] = 0.0; buf[1].x = 0x7fff; buf[2] = 0.0; - ASSERT_TRUE(TensorContainsNAN(src)); + ASSERT_TRUE(paddle::framework::TensorContainsNAN(src)); buf[1] = 0.0; - ASSERT_FALSE(TensorContainsNAN(src)); + ASSERT_FALSE(paddle::framework::TensorContainsNAN(src)); } } TEST(TensorContainsInf, CPU) { - using namespace paddle::framework; - using namespace paddle::platform; { - Tensor src; - double* buf = src.mutable_data({3}, CPUPlace()); + paddle::framework::Tensor src; + double* buf = src.mutable_data({3}, paddle::platform::CPUPlace()); buf[0] = 1.0; buf[1] = INFINITY; buf[2] = 0.0; - ASSERT_TRUE(TensorContainsInf(src)); + ASSERT_TRUE(paddle::framework::TensorContainsInf(src)); buf[1] = 1.0; - ASSERT_FALSE(TensorContainsInf(src)); + ASSERT_FALSE(paddle::framework::TensorContainsInf(src)); } { - Tensor src; - float16* buf = src.mutable_data({3}, CPUPlace()); + paddle::framework::Tensor src; + paddle::platform::float16* buf = + src.mutable_data( + {3}, paddle::platform::CPUPlace()); buf[0] = 1.0; buf[1].x = 0x7c00; buf[2] = 0.0; - ASSERT_TRUE(TensorContainsInf(src)); + ASSERT_TRUE(paddle::framework::TensorContainsInf(src)); buf[1] = 1.0; - ASSERT_FALSE(TensorContainsInf(src)); + ASSERT_FALSE(paddle::framework::TensorContainsInf(src)); } } diff --git 
a/paddle/fluid/framework/tensor_util_test.cu b/paddle/fluid/framework/tensor_util_test.cu index 4766ec28aa3cff6be3259f258f1c9543ae471f5d..b4cff1e6c2293fa44f0fd0bb398a538c08dd4fb1 100644 --- a/paddle/fluid/framework/tensor_util_test.cu +++ b/paddle/fluid/framework/tensor_util_test.cu @@ -45,9 +45,8 @@ static __global__ void FillInf(platform::float16* buf) { } TEST(TensorContainsNAN, GPU) { - using namespace paddle::platform; - CUDAPlace gpu(0); - auto& pool = DeviceContextPool::Instance(); + paddle::platform::CUDAPlace gpu(0); + auto& pool = paddle::platform::DeviceContextPool::Instance(); auto* cuda_ctx = pool.GetByPlace(gpu); { Tensor tensor; @@ -58,7 +57,8 @@ TEST(TensorContainsNAN, GPU) { } { Tensor tensor; - float16* buf = tensor.mutable_data({3}, gpu); + paddle::platform::float16* buf = + tensor.mutable_data({3}, gpu); FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); cuda_ctx->Wait(); ASSERT_TRUE(TensorContainsNAN(tensor)); @@ -66,9 +66,8 @@ TEST(TensorContainsNAN, GPU) { } TEST(TensorContainsInf, GPU) { - using namespace paddle::platform; - CUDAPlace gpu(0); - auto& pool = DeviceContextPool::Instance(); + paddle::platform::CUDAPlace gpu(0); + auto& pool = paddle::platform::DeviceContextPool::Instance(); auto* cuda_ctx = pool.GetByPlace(gpu); { Tensor tensor; @@ -79,7 +78,8 @@ TEST(TensorContainsInf, GPU) { } { Tensor tensor; - float16* buf = tensor.mutable_data({3}, gpu); + paddle::platform::float16* buf = + tensor.mutable_data({3}, gpu); FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); cuda_ctx->Wait(); ASSERT_TRUE(TensorContainsInf(tensor)); diff --git a/paddle/fluid/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc index 4da83d630a5632233ddff6f08174dcabc1c696f8..27a4ffd4fcbf293a3dea1744b29384d0bee0c137 100644 --- a/paddle/fluid/framework/threadpool_test.cc +++ b/paddle/fluid/framework/threadpool_test.cc @@ -15,14 +15,14 @@ limitations under the License. */ #include #include -#include "threadpool.h" +#include "paddle/fluid/framework/threadpool.h" namespace framework = paddle::framework; -void do_sum(framework::ThreadPool* pool, std::atomic& sum, int cnt) { +void do_sum(framework::ThreadPool* pool, std::atomic* sum, int cnt) { std::vector> fs; for (int i = 0; i < cnt; ++i) { - fs.push_back(framework::Async([&sum]() { sum.fetch_add(1); })); + fs.push_back(framework::Async([sum]() { sum->fetch_add(1); })); } } @@ -46,7 +46,7 @@ TEST(ThreadPool, ConcurrentRun) { int n = 50; // sum = (n * (n + 1)) / 2 for (int i = 1; i <= n; ++i) { - std::thread t(do_sum, pool, std::ref(sum), i); + std::thread t(do_sum, pool, &sum, i); threads.push_back(std::move(t)); } for (auto& t : threads) { diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h index f62415fda67a506763494886eb499fbb09c5caa6..9f7a21ef42b8d3e74b6e211d6254294ba1fa2341 100644 --- a/paddle/fluid/framework/var_desc.h +++ b/paddle/fluid/framework/var_desc.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include +#include #include #include "glog/logging.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc index 1dced845ed7849d9f5a6de16dfe627d52fdb5488..9e33003b442762210c990b35f30bc3524963b8b4 100644 --- a/paddle/fluid/framework/var_type_inference_test.cc +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
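The threadpool test fix above swaps the `std::atomic&` parameter for a pointer, so the Async lambda captures a small copyable value instead of referring back to a by-value thread argument. A reduced sketch of the pattern, assuming the extraction-stripped template arguments were std::atomic<int> and std::future<void>:

#include <atomic>
#include <future>
#include <vector>
#include "paddle/fluid/framework/threadpool.h"

void CountInParallel(std::atomic<int>* sum, int cnt) {
  std::vector<std::future<void>> futures;
  for (int i = 0; i < cnt; ++i) {
    // The pointer is captured by value; every task bumps the shared counter.
    futures.push_back(
        paddle::framework::Async([sum] { sum->fetch_add(1); }));
  }
  for (auto& f : futures) f.wait();  // illustrative; the test syncs elsewhere
}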
*/ #include "paddle/fluid/framework/var_type_inference.h" +#include #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 87ddfe2ff9abfa3f4d99033686b197b10d8231fa..067e0c2b8389f88639fd9b95bd680702517efee1 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -14,6 +14,7 @@ #pragma once #include +#include #include #include @@ -67,7 +68,7 @@ class Variable { // parameter of Variable. template struct PlaceholderImpl : public Placeholder { - PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {} + explicit PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {} virtual const std::type_info& Type() const { return type_; } virtual void* Ptr() const { return static_cast(ptr_.get()); } diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index cc45bfe9b17d767be039cc0d8d83234b6994d6c1..50f635a41a99b2ae292d13afde5637a3bf4e6f8c 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -21,7 +21,8 @@ endif() if(WITH_TESTING) add_subdirectory(tests/book) - if (TENSORRT_FOUND) - add_subdirectory(tensorrt) - endif() +endif() + +if (TENSORRT_FOUND) + add_subdirectory(tensorrt) endif() diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 3b58019db6e55fa8198d2f77731095c6cf356266..78d2f16746cf478c4424df929bd1f62b08f8a67c 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/inference/io.h" +#include #include #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/feed_fetch_type.h" @@ -27,14 +28,14 @@ namespace inference { // linking the inference shared library. 
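Marking the one-argument PlaceholderImpl constructor `explicit` (and, in io.h below, turning mutable reference parameters into pointers) follows the same style rule: conversions and mutations should be visible at the call site. A minimal illustration of what `explicit` rules out; Holder and Consume are hypothetical names, not part of this patch:

#include <cassert>

struct Holder {                        // stand-in for PlaceholderImpl
  explicit Holder(int* p) : p_(p) {}   // single-argument ctor, now explicit
  int* p_;
};

void Consume(const Holder& h) { assert(h.p_ != nullptr); }

int main() {
  int x = 42;
  // Consume(&x);        // no longer compiles: int* -> Holder is explicit
  Consume(Holder(&x));   // the wrapping is spelled out by the caller
}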
void Init(bool init_p2p) { framework::InitDevices(init_p2p); } -void ReadBinaryFile(const std::string& filename, std::string& contents) { +void ReadBinaryFile(const std::string& filename, std::string* contents) { std::ifstream fin(filename, std::ios::in | std::ios::binary); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); fin.seekg(0, std::ios::end); - contents.clear(); - contents.resize(fin.tellg()); + contents->clear(); + contents->resize(fin.tellg()); fin.seekg(0, std::ios::beg); - fin.read(&contents[0], contents.size()); + fin.read(&(contents->at(0)), contents->size()); fin.close(); } @@ -47,7 +48,7 @@ bool IsPersistable(const framework::VarDesc* var) { return false; } -void LoadPersistables(framework::Executor& executor, framework::Scope& scope, +void LoadPersistables(framework::Executor* executor, framework::Scope* scope, const framework::ProgramDesc& main_program, const std::string& dirname, const std::string& param_filename) { @@ -92,18 +93,18 @@ void LoadPersistables(framework::Executor& executor, framework::Scope& scope, op->CheckAttrs(); } - executor.Run(*load_program, &scope, 0, true, true); + executor->Run(*load_program, scope, 0, true, true); delete load_program; } -std::unique_ptr Load(framework::Executor& executor, - framework::Scope& scope, +std::unique_ptr Load(framework::Executor* executor, + framework::Scope* scope, const std::string& dirname) { std::string model_filename = dirname + "/__model__"; std::string program_desc_str; VLOG(3) << "loading model from " << model_filename; - ReadBinaryFile(model_filename, program_desc_str); + ReadBinaryFile(model_filename, &program_desc_str); std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); @@ -113,11 +114,11 @@ std::unique_ptr Load(framework::Executor& executor, } std::unique_ptr Load( - framework::Executor& executor, framework::Scope& scope, + framework::Executor* executor, framework::Scope* scope, const std::string& prog_filename, const std::string& param_filename) { std::string model_filename = prog_filename; std::string program_desc_str; - ReadBinaryFile(model_filename, program_desc_str); + ReadBinaryFile(model_filename, &program_desc_str); std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h index 756c936b33ad55e2994542b171b945e248ba2e21..ba3e45099ae7c1626bf11d9527d4fa4c7f772fec 100644 --- a/paddle/fluid/inference/io.h +++ b/paddle/fluid/inference/io.h @@ -27,17 +27,17 @@ namespace inference { void Init(bool init_p2p); -void LoadPersistables(framework::Executor& executor, framework::Scope& scope, +void LoadPersistables(framework::Executor* executor, framework::Scope* scope, const framework::ProgramDesc& main_program, const std::string& dirname, const std::string& param_filename); -std::unique_ptr Load(framework::Executor& executor, - framework::Scope& scope, +std::unique_ptr Load(framework::Executor* executor, + framework::Scope* scope, const std::string& dirname); -std::unique_ptr Load(framework::Executor& executor, - framework::Scope& scope, +std::unique_ptr Load(framework::Executor* executor, + framework::Scope* scope, const std::string& prog_filename, const std::string& param_filename); diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index c3a8d0889c6a6dd9591837ccc523da56f8d13661..117472599f7c4874ab05e29c6ecb46fd61d0db9c 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ 
-133,12 +133,12 @@ void TestInference(const std::string& dirname, std::string prog_filename = "__model_combined__"; std::string param_filename = "__params_combined__"; inference_program = paddle::inference::Load( - executor, *scope, dirname + "/" + prog_filename, + &executor, scope, dirname + "/" + prog_filename, dirname + "/" + param_filename); } else { // Parameters are saved in separate files sited in the specified // `dirname`. - inference_program = paddle::inference::Load(executor, *scope, dirname); + inference_program = paddle::inference::Load(&executor, scope, dirname); } } // Disable the profiler and print the timing information diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index eddcaab8befda84dd14ed46c31ac025dfbcc7ca9..a177d4985fd0e2cca983b6873af89c60f526b811 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -32,7 +32,11 @@ void Copy( platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, const void* src, size_t num, cudaStream_t stream) { platform::SetDeviceId(src_place.device); - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); + if (stream) { + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); + } else { + platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); + } } template <> @@ -40,7 +44,11 @@ void Copy( platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, const void* src, size_t num, cudaStream_t stream) { platform::SetDeviceId(dst_place.device); - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); + if (stream) { + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); + } else { + platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); + } } template <> @@ -49,10 +57,19 @@ void Copy( const void* src, size_t num, cudaStream_t stream) { if (dst_place == src_place) { platform::SetDeviceId(src_place.device); - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); + if (stream) { + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); + } else { + platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice); + } } else { - platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num, - stream); + if (stream) { + platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, + num, stream); + } else { + platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, + num); + } } } @@ -83,7 +100,11 @@ void Copy( platform::CUDAPlace src_place, const void* src, size_t num, cudaStream_t stream) { platform::SetDeviceId(src_place.device); - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); + if (stream) { + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); + } else { + platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); + } } template <> @@ -92,7 +113,11 @@ void Copy( platform::CUDAPinnedPlace src_place, const void* src, size_t num, cudaStream_t stream) { platform::SetDeviceId(dst_place.device); - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); + if (stream) { + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); + } else { + platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); + } } #endif diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 7d6781c2c38822eaabb64eda9c76ff657bbdeeb8..256aded8ca234a24229e11f27b9e3e25728ad293 100644 --- 
a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -110,12 +110,12 @@ function(op_library TARGET) # Note that it's enough to just add one operator to pybind in a *_op.cc file. # And for detailed pybind information, please see the generated paddle/pybind/pybind.h. file(READ ${TARGET}.cc TARGET_CONTENT) - string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}") - string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}") + string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}") + string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}") if (one_register STREQUAL "") string(REPLACE "_op" "" TARGET "${TARGET}") else () - string(REPLACE "REGISTER_OP(" "" TARGET "${one_register}") + string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}") string(REPLACE "," "" TARGET "${TARGET}") endif() @@ -163,7 +163,12 @@ function(op_library TARGET) # pybind USE_OP if (${pybind_flag} EQUAL 0) + # NOTE(*): activation_op registers its kernels through macros, so set USE_OP manually. + if(${TARGET} STREQUAL "activation") + file(APPEND ${pybind_file} "USE_OP(relu);\n") + else() file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") + endif() endif() endfunction() diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index b261144f3d7836801e0b7a45a1478d3b801db86d..87ef55c50b0be46492a695928625d140345d415d 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -13,11 +13,48 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" +#include #include "paddle/fluid/operators/mkldnn_activation_op.h" namespace paddle { namespace operators { +#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ + class OP_NAME##OpMaker \ + : public ::paddle::framework::OpProtoAndCheckerMaker { \ + public: \ + OP_NAME##OpMaker(OpProto *proto, OpAttrChecker *op_checker) \ + : ::paddle::framework::OpProtoAndCheckerMaker(proto, op_checker) { \ + AddInput("X", "Input of " #OP_NAME " operator"); \ + AddOutput("Out", "Output of " #OP_NAME " operator"); \ + AddAttr<bool>("use_mkldnn", \ + "(bool, default false) Only used in mkldnn kernel") \ + .SetDefault(false); \ + AddComment(#OP_COMMENT); \ + } \ + } + +#define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE) \ + class OP_NAME##GradMaker \ + : public ::paddle::framework::SingleGradOpDescMaker { \ + public: \ + using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; \ + \ + protected: \ + std::unique_ptr<::paddle::framework::OpDesc> Apply() const override { \ + auto *op = new ::paddle::framework::OpDesc(); \ + op->SetType(#KERNEL_TYPE "_grad"); \ + op->SetInput("Out", Output("Out")); \ + op->SetInput(::paddle::framework::GradVarName("Out"), \ + OutputGrad("Out")); \ + \ + op->SetAttrMap(Attrs()); \ + \ + op->SetOutput(::paddle::framework::GradVarName("X"), InputGrad("X")); \ + return std::unique_ptr<::paddle::framework::OpDesc>(op); \ + } \ + } + class ActivationOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -37,346 +74,190 @@ class ActivationOpGrad : public framework::OperatorWithKernel { } }; -class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { - public: - SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto,
op_checker) { - AddInput("X", "Input of Sigmoid operator"); - AddOutput("Out", "Output of Sigmoid operator"); - AddComment(R"DOC( +__attribute__((unused)) constexpr char SigmoidDoc[] = R"DOC( Sigmoid Activation Operator $$out = \frac{1}{1 + e^{-x}}$$ -)DOC"); - } -}; +)DOC"; -class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { - public: - LogSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of LogSigmoid operator"); - AddOutput("Out", "Output of LogSigmoid operator"); - AddComment(R"DOC( +__attribute__((unused)) constexpr char LogSigmoidDoc[] = R"DOC( Logsigmoid Activation Operator $$out = \log \frac{1}{1 + e^{-x}}$$ -)DOC"); - } -}; +)DOC"; -class ExpOpMaker : public framework::OpProtoAndCheckerMaker { - public: - ExpOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of Exp operator"); - AddOutput("Out", "Output of Exp operator"); - AddComment(R"DOC( +__attribute__((unused)) constexpr char ExpDoc[] = R"DOC( Exp Activation Operator. $out = e^x$ -)DOC"); - } -}; +)DOC"; -class ReluOpMaker : public framework::OpProtoAndCheckerMaker { - public: - ReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of Relu operator"); - AddOutput("Out", "Output of Relu operator"); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false); - AddComment(R"DOC( +__attribute__((unused)) constexpr char ReluDoc[] = R"DOC( Relu Activation Operator. $out = \max(x, 0)$ -)DOC"); - } -}; - -class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker { - public: - LeakyReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of LeakyRelu operator"); - AddOutput("Out", "Output of LeakyRelu operator"); - AddAttr("alpha", "The small negative slope").SetDefault(0.02f); - AddComment(R"DOC( -LeakyRelu Activation Operator. - -$out = \max(x, \alpha * x)$ - -)DOC"); - } -}; - -class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker { - public: - SoftShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of Softshrink operator"); - AddOutput("Out", "Output of Softshrink operator"); - AddAttr("lambda", "non-negative offset").SetDefault(0.5f); - AddComment(R"DOC( -Softshrink Activation Operator. - -$$ -out = \begin{cases} - x - \lambda, \text{if } x > \lambda \\ - x + \lambda, \text{if } x < -\lambda \\ - 0, \text{otherwise} - \end{cases} -$$ +)DOC"; -)DOC"); - } -}; - -class TanhOpMaker : public framework::OpProtoAndCheckerMaker { - public: - TanhOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of Tanh operator"); - AddOutput("Out", "Output of Tanh operator"); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false); - AddComment(R"DOC( +__attribute__((unused)) constexpr char TanhDoc[] = R"DOC( Tanh Activation Operator. 
$$out = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ -)DOC"); - } -}; +)DOC"; -class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker { - public: - TanhShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of TanhShrink operator"); - AddOutput("Out", "Output of TanhShrink operator"); - AddComment(R"DOC( +__attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC( TanhShrink Activation Operator. $$out = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ -)DOC"); - } -}; - -class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker { - public: - HardShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of HardShrink operator"); - AddOutput("Out", "Output of HardShrink operator"); - AddAttr("threshold", "The value of threshold for HardShrink") - .SetDefault(0.5f); - AddComment(R"DOC( -HardShrink Activation Operator. +)DOC"; -$$ -out = \begin{cases} - x, \text{if } x > \lambda \\ - x, \text{if } x < -\lambda \\ - 0, \text{otherwise} - \end{cases} -$$ - -)DOC"); - } -}; - -class SqrtOpMaker : public framework::OpProtoAndCheckerMaker { - public: - SqrtOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of Sqrt operator"); - AddOutput("Out", "Output of Sqrt operator"); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false); - AddComment(R"DOC( +__attribute__((unused)) constexpr char SqrtDoc[] = R"DOC( Sqrt Activation Operator. $out = \sqrt{x}$ -)DOC"); - } -}; +)DOC"; -class AbsOpMaker : public framework::OpProtoAndCheckerMaker { - public: - AbsOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of Abs operator"); - AddOutput("Out", "Output of Abs operator"); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false); - AddComment(R"DOC( +__attribute__((unused)) constexpr char AbsDoc[] = R"DOC( Abs Activation Operator. $out = |x|$ -)DOC"); - } -}; +)DOC"; -class CeilOpMaker : public framework::OpProtoAndCheckerMaker { - public: - CeilOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of Ceil operator"); - AddOutput("Out", "Output of Ceil operator"); - AddComment(R"DOC( +__attribute__((unused)) constexpr char CeilDoc[] = R"DOC( Ceil Activation Operator. $out = ceil(x)$ -)DOC"); - } -}; +)DOC"; -class FloorOpMaker : public framework::OpProtoAndCheckerMaker { - public: - FloorOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of Floor operator"); - AddOutput("Out", "Output of Floor operator"); - AddComment(R"DOC( +__attribute__((unused)) constexpr char FloorDoc[] = R"DOC( Floor Activation Operator. $out = floor(x)$ -)DOC"); - } -}; +)DOC"; -class CosOpMaker : public framework::OpProtoAndCheckerMaker { - public: - CosOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of Cosine operator"); - AddOutput("Out", "Output of Cosine operator"); - AddComment(R"DOC( +__attribute__((unused)) constexpr char CosDoc[] = R"DOC( Cosine Activation Operator. 
$out = cos(x)$ -)DOC"); - } -}; +)DOC"; -class SinOpMaker : public framework::OpProtoAndCheckerMaker { - public: - SinOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of Sine operator"); - AddOutput("Out", "Output of Sine operator"); - AddComment(R"DOC( +__attribute__((unused)) constexpr char SinDoc[] = R"DOC( Sine Activation Operator. $out = sin(x)$ -)DOC"); - } -}; +)DOC"; -class RoundOpMaker : public framework::OpProtoAndCheckerMaker { - public: - RoundOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of Round operator"); - AddOutput("Out", "Output of Round operator"); - AddComment(R"DOC( +__attribute__((unused)) constexpr char RoundDoc[] = R"DOC( Round Activation Operator. $out = [x]$ -)DOC"); - } -}; +)DOC"; -class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker { - public: - ReciprocalOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of Reciprocal operator"); - AddOutput("Out", "Output of Reciprocal operator"); - AddComment(R"DOC( +__attribute__((unused)) constexpr char ReciprocalDoc[] = R"DOC( Reciprocal Activation Operator. $$out = \frac{1}{x}$$ -)DOC"); - } -}; +)DOC"; -class LogOpMaker : public framework::OpProtoAndCheckerMaker { - public: - LogOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of Log operator"); - AddOutput("Out", "Output of Log operator"); - AddComment(R"DOC( +__attribute__((unused)) constexpr char LogDoc[] = R"DOC( Log Activation Operator. $out = \ln(x)$ Natural logarithm of x. -)DOC"); - } -}; +)DOC"; + +__attribute__((unused)) constexpr char SquareDoc[] = R"DOC( +Square Activation Operator. + +$out = x^2$ -class SquareOpMaker : public framework::OpProtoAndCheckerMaker { +)DOC"; + +__attribute__((unused)) constexpr char SoftplusDoc[] = R"DOC( +Softplus Activation Operator. + +$out = \ln(1 + e^{x})$ + +)DOC"; + +__attribute__((unused)) constexpr char SoftsignDoc[] = R"DOC( +Softsign Activation Operator. + +$$out = \frac{x}{1 + |x|}$$ + +)DOC"; + +class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker { public: - SquareOpMaker(OpProto *proto, OpAttrChecker *op_checker) + LeakyReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of Square operator"); - AddOutput("Out", "Output of Square operator"); + AddInput("X", "Input of LeakyRelu operator"); + AddOutput("Out", "Output of LeakyRelu operator"); + AddAttr("alpha", "The small negative slope").SetDefault(0.02f); AddComment(R"DOC( -Square Activation Operator. +LeakyRelu Activation Operator. -$out = x^2$ +$out = \max(x, \alpha * x)$ )DOC"); } }; -class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker { +class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker { public: - SoftplusOpMaker(OpProto *proto, OpAttrChecker *op_checker) + SoftShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of Softplus operator"); - AddOutput("Out", "Output of Softplus operator"); + AddInput("X", "Input of Softshrink operator"); + AddOutput("Out", "Output of Softshrink operator"); + AddAttr("lambda", "non-negative offset").SetDefault(0.5f); AddComment(R"DOC( -Softplus Activation Operator. 
+Softshrink Activation Operator. -$out = \ln(1 + e^{x})$ +$$ +out = \begin{cases} + x - \lambda, \text{if } x > \lambda \\ + x + \lambda, \text{if } x < -\lambda \\ + 0, \text{otherwise} + \end{cases} +$$ )DOC"); } }; -class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker { +class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker { public: - SoftsignOpMaker(OpProto *proto, OpAttrChecker *op_checker) + HardShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of Softsign operator"); - AddOutput("Out", "Output of Softsign operator"); + AddInput("X", "Input of HardShrink operator"); + AddOutput("Out", "Output of HardShrink operator"); + AddAttr("threshold", "The value of threshold for HardShrink") + .SetDefault(0.5f); AddComment(R"DOC( -Softsign Activation Operator. +HardShrink Activation Operator. -$$out = \frac{x}{1 + |x|}$$ +$$ +out = \begin{cases} + x, \text{if } x > \lambda \\ + x, \text{if } x < -\lambda \\ + 0, \text{otherwise} + \end{cases} +$$ )DOC"); } @@ -553,100 +434,86 @@ $$out = \frac{x}{1 + e^{- \beta x}}$$ } }; +REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc); +REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc); +REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc); +REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc); +REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc); +REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc); +REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc); +REGISTER_ACTIVATION_OP_MAKER(Abs, AbsDoc); +REGISTER_ACTIVATION_OP_MAKER(Ceil, CeilDoc); +REGISTER_ACTIVATION_OP_MAKER(Floor, FloorDoc); +REGISTER_ACTIVATION_OP_MAKER(Cos, CosDoc); +REGISTER_ACTIVATION_OP_MAKER(Sin, SinDoc); +REGISTER_ACTIVATION_OP_MAKER(Round, RoundDoc); +REGISTER_ACTIVATION_OP_MAKER(Reciprocal, ReciprocalDoc); +REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc); +REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc); +REGISTER_ACTIVATION_OP_MAKER(Softplus, SoftplusDoc); +REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc); + +REGISTER_ACTIVATION_OP_GRAD_MAKER(Sigmoid, sigmoid); +REGISTER_ACTIVATION_OP_GRAD_MAKER(Relu, relu); +REGISTER_ACTIVATION_OP_GRAD_MAKER(Exp, exp); +REGISTER_ACTIVATION_OP_GRAD_MAKER(Tanh, tanh); +REGISTER_ACTIVATION_OP_GRAD_MAKER(Ceil, ceil); +REGISTER_ACTIVATION_OP_GRAD_MAKER(Floor, floor); +REGISTER_ACTIVATION_OP_GRAD_MAKER(Sqrt, sqrt); +REGISTER_ACTIVATION_OP_GRAD_MAKER(SoftRelu, soft_relu); +REGISTER_ACTIVATION_OP_GRAD_MAKER(Relu6, relu6); +REGISTER_ACTIVATION_OP_GRAD_MAKER(Reciprocal, reciprocal); +REGISTER_ACTIVATION_OP_GRAD_MAKER(HardSigmoid, hard_sigmoid); } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(sigmoid, ops::ActivationOp, ops::SigmoidOpMaker, sigmoid_grad, - ops::ActivationOpGrad); - -REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker, - logsigmoid_grad, ops::ActivationOpGrad); - -REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad, - ops::ActivationOpGrad); - -REGISTER_OP(relu, ops::ActivationWithMKLDNNOp, ops::ReluOpMaker, relu_grad, - ops::ActivationWithMKLDNNOpGrad); - -REGISTER_OP(tanh, ops::ActivationWithMKLDNNOp, ops::TanhOpMaker, tanh_grad, - ops::ActivationWithMKLDNNOpGrad); - -REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker, - tanh_shrink_grad, ops::ActivationOpGrad); - -REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker, - softshrink_grad, ops::ActivationOpGrad); - -REGISTER_OP(sqrt, ops::ActivationWithMKLDNNOp, ops::SqrtOpMaker, sqrt_grad, - 
ops::ActivationWithMKLDNNOpGrad); - -REGISTER_OP(abs, ops::ActivationWithMKLDNNOp, ops::AbsOpMaker, abs_grad, - ops::ActivationWithMKLDNNOpGrad); - -REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad, - ops::ActivationOpGrad); - -REGISTER_OP(floor, ops::ActivationOp, ops::FloorOpMaker, floor_grad, - ops::ActivationOpGrad); - -REGISTER_OP(cos, ops::ActivationOp, ops::CosOpMaker, cos_grad, - ops::ActivationOpGrad); - -REGISTER_OP(sin, ops::ActivationOp, ops::SinOpMaker, sin_grad, - ops::ActivationOpGrad); - -REGISTER_OP(round, ops::ActivationOp, ops::RoundOpMaker, round_grad, - ops::ActivationOpGrad); - -REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker, - reciprocal_grad, ops::ActivationOpGrad); - -REGISTER_OP(log, ops::ActivationOp, ops::LogOpMaker, log_grad, - ops::ActivationOpGrad); - -REGISTER_OP(square, ops::ActivationOp, ops::SquareOpMaker, square_grad, - ops::ActivationOpGrad); - -REGISTER_OP(softplus, ops::ActivationOp, ops::SoftplusOpMaker, softplus_grad, - ops::ActivationOpGrad); - -REGISTER_OP(softsign, ops::ActivationOp, ops::SoftsignOpMaker, softsign_grad, - ops::ActivationOpGrad); - -REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker, brelu_grad, - ops::ActivationOpGrad); - -REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker, - leaky_relu_grad, ops::ActivationOpGrad); - -REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker, soft_relu_grad, - ops::ActivationOpGrad); - -REGISTER_OP(elu, ops::ActivationOp, ops::ELUOpMaker, elu_grad, - ops::ActivationOpGrad); - -REGISTER_OP(relu6, ops::ActivationOp, ops::Relu6OpMaker, relu6_grad, - ops::ActivationOpGrad); - -REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker, pow_grad, - ops::ActivationOpGrad); - -REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker, stanh_grad, - ops::ActivationOpGrad); - -REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker, - hard_shrink_grad, ops::ActivationOpGrad); - -REGISTER_OP(thresholded_relu, ops::ActivationOp, ops::ThresholdedReluOpMaker, - thresholded_relu_grad, ops::ActivationOpGrad); - -REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker, - hard_sigmoid_grad, ops::ActivationOpGrad); - -REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad, - ops::ActivationOpGrad); +#define FOR_EACH_INPLACE_OP_FUNCTOR(__macro) \ + __macro(Sigmoid, sigmoid); \ + __macro(Relu, relu); \ + __macro(Exp, exp); \ + __macro(Tanh, tanh); \ + __macro(Ceil, ceil); \ + __macro(Floor, floor); \ + __macro(Sqrt, sqrt); \ + __macro(SoftRelu, soft_relu); \ + __macro(Relu6, relu6); \ + __macro(Reciprocal, reciprocal); \ + __macro(HardSigmoid, hard_sigmoid); + +#define FOR_EACH_OP_FUNCTOR(__macro) \ + __macro(LogSigmoid, logsigmoid); \ + __macro(SoftShrink, softshrink); \ + __macro(Abs, abs); \ + __macro(Cos, cos); \ + __macro(Sin, sin); \ + __macro(Round, round); \ + __macro(Log, log); \ + __macro(Square, square); \ + __macro(BRelu, brelu); \ + __macro(Pow, pow); \ + __macro(STanh, stanh); \ + __macro(Softplus, softplus); \ + __macro(Softsign, softsign); \ + __macro(LeakyRelu, leaky_relu); \ + __macro(TanhShrink, tanh_shrink); \ + __macro(ELU, elu); \ + __macro(HardShrink, hard_shrink); \ + __macro(Swish, swish); \ + __macro(ThresholdedRelu, thresholded_relu); + +#define REGISTER_INPLACE_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \ + REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \ + ::paddle::operators::OP_NAME##OpMaker, \ + ::paddle::operators::OP_NAME##GradMaker); \ + REGISTER_OPERATOR(KERNEL_TYPE##_grad, 
::paddle::operators::ActivationOpGrad) + +#define REGISTER_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \ + REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \ + ::paddle::operators::OP_NAME##OpMaker, \ + ::paddle::framework::DefaultGradOpDescMaker<true>); \ + REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad) #define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \ REGISTER_OP_CPU_KERNEL( \ @@ -661,4 +528,6 @@ REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad, ops::ActivationGradKernel>); +FOR_EACH_OP_FUNCTOR(REGISTER_ACTIVATION_OP); +FOR_EACH_INPLACE_OP_FUNCTOR(REGISTER_INPLACE_ACTIVATION_OP); FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL); diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 4f745553c14fc1391bc65d4f7e4f9bd3b5a881c2..27487b396ccf63d962defa6b270063ccb409164e 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -9,7 +9,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#define EIGEN_USE_GPU #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 43856780bf9357281ac4af2968950da15426e5c8..912415192659dc004f54a76e9cd1a20581d512a6 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -10,6 +10,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include #include #include @@ -25,6 +28,16 @@ limitations under the License. */ namespace paddle { namespace operators { +/* NOTE: an ugly global variable, kept because the Python layer side relies on it; + please refer to layer_helper.py for the details.
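To make the macro plumbing concrete, this is roughly what one entry of each list expands to, written out by hand for illustration (the <true> argument follows the DefaultGradOpDescMaker spelling restored above):

// FOR_EACH_OP_FUNCTOR entry: REGISTER_ACTIVATION_OP(LogSigmoid, logsigmoid)
REGISTER_OPERATOR(logsigmoid, ::paddle::operators::ActivationOp,
                  ::paddle::operators::LogSigmoidOpMaker,
                  ::paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(logsigmoid_grad, ::paddle::operators::ActivationOpGrad);
// A FOR_EACH_INPLACE_OP_FUNCTOR entry such as
// REGISTER_INPLACE_ACTIVATION_OP(Sigmoid, sigmoid) differs only in using the
// generated SigmoidGradMaker, whose grad op reads "Out" rather than "X".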
+ */ +static std::unordered_set InplaceOpSet = { + "sigmoid", "exp", "relu", "tanh", "sqrt", "ceil", + "floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid", +}; + +static bool IsInplace(std::string op) { return InplaceOpSet.count(op); } + template class ActivationKernel : public framework::OpKernel { @@ -60,7 +73,6 @@ class ActivationGradKernel public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); auto* Out = context.Input("Out"); auto* dOut = context.Input(framework::GradVarName("Out")); @@ -68,7 +80,6 @@ class ActivationGradKernel dX->mutable_data(context.GetPlace()); auto dout = framework::EigenVector::Flatten(*dOut); - auto x = framework::EigenVector::Flatten(*X); auto out = framework::EigenVector::Flatten(*Out); auto dx = framework::EigenVector::Flatten(*dX); auto* place = @@ -78,7 +89,16 @@ class ActivationGradKernel for (auto& attr : attrs) { *attr.second = context.Attr(attr.first); } - functor(*place, x, out, dout, dx); + bool inplace = functor.Inplace(); + if (!inplace) { + auto* X = context.Input("X"); + auto x = framework::EigenVector::Flatten(*X); + functor(*place, x, out, dout, dx); + } else { + VLOG(10) << " Inplace activation "; + auto x = framework::EigenVector::Flatten(*dX); + functor(*place, x, out, dout, dx); + } } }; @@ -89,6 +109,14 @@ struct BaseActivationFunctor { using AttrPair = std::vector>; AttrPair GetAttrs() { return AttrPair(); } + + /* NOTE(*): Output reuse X memory if X is not dependented by its Gradient. + For example, sigmoid op's gradient didn't involve x, so its output can + reuse + input memory. But abs op's gradient use x, it can not be inplaced. + gradient did use x. + */ + bool Inplace() const { return false; } }; // sigmoid(x) = 1 / (1 + exp(-x)) @@ -102,6 +130,7 @@ struct SigmoidFunctor : public BaseActivationFunctor { template struct SigmoidGradFunctor : public BaseActivationFunctor { + bool Inplace() const { return IsInplace("sigmoid"); } template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { @@ -156,6 +185,7 @@ struct ExpFunctor : public BaseActivationFunctor { template struct ExpGradFunctor : public BaseActivationFunctor { + bool Inplace() const { return IsInplace("exp"); } template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { @@ -174,10 +204,11 @@ struct ReluFunctor : public BaseActivationFunctor { template struct ReluGradFunctor : public BaseActivationFunctor { + bool Inplace() const { return IsInplace("relu"); } template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (x > static_cast(0)).template cast(); + dx.device(d) = dout * (out > static_cast(0)).template cast(); } }; @@ -192,6 +223,7 @@ struct TanhFunctor : public BaseActivationFunctor { template struct TanhGradFunctor : public BaseActivationFunctor { + bool Inplace() const { return IsInplace("tanh"); } template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { @@ -297,6 +329,7 @@ struct SqrtFunctor : public BaseActivationFunctor { template struct SqrtGradFunctor : public BaseActivationFunctor { + bool Inplace() const { return IsInplace("sqrt"); } template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { @@ -316,10 +349,11 @@ struct CeilFunctor : public BaseActivationFunctor { template struct ZeroGradFunctor : public BaseActivationFunctor { + bool Inplace() const { return IsInplace("ceil"); } template void operator()(Device d, X x, Out out, dOut dout, dX dx) const 
{ - dx.device(d) = static_cast<T>(0) / x; + dx.device(d) = static_cast<T>(0) / out; } }; @@ -432,6 +466,7 @@ struct ReciprocalFunctor : public BaseActivationFunctor<T> { template <typename T> struct ReciprocalGradFunctor : public BaseActivationFunctor<T> { + bool Inplace() const { return IsInplace("reciprocal"); } template <typename Device, typename X, typename Out, typename dOut, typename dX> void operator()(Device d, X x, Out out, dOut dout, dX dx) const { @@ -531,12 +566,14 @@ struct Relu6GradFunctor : public BaseActivationFunctor<T> { typename BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{"threshold", &threshold}}; } + bool Inplace() const { return IsInplace("relu6"); } template <typename Device, typename X, typename Out, typename dOut, typename dX> void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * - ((x > static_cast<T>(0)) * (x < static_cast<T>(threshold))) .template cast<T>(); + dx.device(d) = + dout * + ((out > static_cast<T>(0)) * (out < static_cast<T>(threshold))) + .template cast<T>(); } }; @@ -611,11 +648,12 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> { typename BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{"threshold", &threshold}}; } + bool Inplace() const { return IsInplace("soft_relu"); } template <typename Device, typename X, typename Out, typename dOut, typename dX> void operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto tmp = static_cast<T>(threshold); - auto temp = ((x > -tmp) * (x < tmp)).template cast<T>().eval(); + auto temp = ((out > -tmp) * (out < tmp)).template cast<T>().eval(); dx.device(d) = dout * (static_cast<T>(1) - (-out).exp()) * temp; } }; @@ -791,7 +829,7 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> { typename BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{"slope", &slope}, {"offset", &offset}}; } - + bool Inplace() const { return IsInplace("hard_sigmoid"); } template <typename Device, typename X, typename Out, typename dOut, typename dX> void operator()(Device d, X x, Out out, dOut dout, dX dx) const { diff --git a/paddle/fluid/operators/bilinear_interp_op.cc b/paddle/fluid/operators/bilinear_interp_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..69f79bf93be8ac7df9cab43b84cf755f2f3dfeaa --- /dev/null +++ b/paddle/fluid/operators/bilinear_interp_op.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
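Note the pattern in the gradient functors above: each one tagged Inplace() is rewritten to read only `out` and `dout`, never `x`. That is exactly what makes in-place execution legal, since Out may now share (and overwrite) X's buffer. A scalar sketch of the per-element identities behind the rewrites (illustrative, not from the patch):

// relu:        out = max(x, 0)  =>  (x > 0) == (out > 0), so
//              dx = dout * (out > 0)       // no reference to x needed
// reciprocal:  out = 1 / x      =>  dx = -dout * out * out
float ReluGrad(float out, float dout) { return out > 0.0f ? dout : 0.0f; }
float ReciprocalGrad(float out, float dout) { return -dout * out * out; }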
*/ + +#include "paddle/fluid/operators/bilinear_interp_op.h" +#include <vector> +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class BilinearInterpOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of BilinearInterpOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of BilinearInterpOp should not be null."); + + auto dim_x = ctx->GetInputDim("X"); // NCHW format + int out_h = ctx->Attrs().Get<int>("out_h"); + int out_w = ctx->Attrs().Get<int>("out_w"); + PADDLE_ENFORCE_EQ(dim_x.size(), 4, "X's dimension must be 4"); + + std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w}); + ctx->SetOutputDim("Out", framework::make_ddim(dim_out)); + } +}; + +class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BilinearInterpOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor) The input tensor of bilinear interpolation. " + "This is a 4-D tensor with shape of (N x C x h x w)"); + AddOutput("Out", + "(Tensor) The dimension of output is (N x C x out_h x out_w)"); + + AddAttr<int>("out_h", "(int) output height of bilinear interpolation op."); + AddAttr<int>("out_w", "(int) output width of bilinear interpolation op."); + AddComment(R"DOC( + Bilinear interpolation is an extension of linear interpolation for + interpolating functions of two variables (e.g. H-direction and + W-direction in this op) on a rectilinear 2D grid. + + The key idea is to perform linear interpolation first in one + direction, and then again in the other direction. + + For details, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Bilinear_interpolation + )DOC"); + } +}; + +class BilinearInterpOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto dim_x = ctx->GetInputDim("X"); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), dim_x); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(bilinear_interp, ops::BilinearInterpOp, + ops::BilinearInterpOpMaker, + paddle::framework::DefaultGradOpDescMaker<true>); +REGISTER_OPERATOR(bilinear_interp_grad, ops::BilinearInterpOpGrad); +REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel<float>); +REGISTER_OP_CPU_KERNEL(bilinear_interp_grad, + ops::BilinearInterpGradKernel<float>); diff --git a/paddle/fluid/operators/bilinear_interp_op.cu b/paddle/fluid/operators/bilinear_interp_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..82eb9e83bd84e6ec6881facbb2fac0aebce93d55 --- /dev/null +++ b/paddle/fluid/operators/bilinear_interp_op.cu @@ -0,0 +1,186 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License.
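The DOC block's prose maps onto a closed-form blend that both the CPU kernel (bilinear_interp_op.h) and the CUDA kernels compute. With the ratio convention used in the code, $ratio_h = (H_{in}-1)/(H_{out}-1)$ for $H_{out} > 1$ (zero otherwise) and likewise for $ratio_w$, output pixel $(i, j)$ of each channel is

$$y = i \cdot ratio_h, \quad x = j \cdot ratio_w, \quad \lambda_h = y - \lfloor y \rfloor, \quad \lambda_w = x - \lfloor x \rfloor$$

$$out(i, j) = (1-\lambda_h)\left[(1-\lambda_w) \, in(y_0, x_0) + \lambda_w \, in(y_0, x_1)\right] + \lambda_h\left[(1-\lambda_w) \, in(y_1, x_0) + \lambda_w \, in(y_1, x_1)\right]$$

where $y_0 = \lfloor y \rfloor$, $y_1 = \min(y_0 + 1, H_{in} - 1)$, and $x_0, x_1$ are defined the same way. Here $\lambda_h, \lambda_w$ correspond to h1lambda/w1lambda in the kernels and $(1-\lambda)$ to h2lambda/w2lambda; the backward kernel distributes $dout(i, j)$ to the same four taps with the same weights, via atomicAdd on the GPU.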
+ You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/bilinear_interp_op.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +__global__ void KeBilinearInterpFw( + const T* in, const size_t in_img_h, const size_t in_img_w, + const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, + const size_t out_img_w, const size_t output_h, const size_t output_w, + const size_t num_channels, const T ratio_h, const T ratioW) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < nthreads) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + int channel_id = out_id_w / out_img_size; + + int out_img_idy = (out_id_w % out_img_size) / out_img_w; + int in_img_idy = ratio_h * out_img_idy; + int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; + T h1lambda = ratio_h * out_img_idy - in_img_idy; + T h2lambda = 1.f - h1lambda; + + int out_img_idx = tid % out_img_w; + int in_img_idx = ratioW * out_img_idx; + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; + T w1lambda = ratioW * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + + // bilinear interpolation + out[out_id_h * output_w + out_id_w] = + h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) + + h1lambda * (w2lambda * in_pos[h_id * in_img_w] + + w1lambda * in_pos[h_id * in_img_w + w_id]); + } +} + +template +__global__ void KeBilinearInterpBw( + T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, + const size_t input_w, const T* out, const size_t out_img_h, + const size_t out_img_w, const size_t output_h, const size_t output_w, + const size_t num_channels, const T ratio_h, const T ratioW) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < nthreads) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + int channel_id = out_id_w / out_img_size; + + int out_img_idy = (out_id_w % out_img_size) / out_img_w; + int in_img_idy = ratio_h * out_img_idy; + int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; + T h1lambda = ratio_h * out_img_idy - in_img_idy; + T h2lambda = 1.f - h1lambda; + + int out_img_idx = tid % out_img_w; + int in_img_idx = ratioW * out_img_idx; + int w_id = (in_img_idx < in_img_w - 1) ? 
1 : 0; + T w1lambda = ratioW * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + const T* out_pos = &out[out_id_h * output_w + out_id_w]; + atomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); + atomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]); + atomicAdd(&in_pos[h_id * in_img_w], h1lambda * w2lambda * out_pos[0]); + atomicAdd(&in_pos[h_id * in_img_w + w_id], + h1lambda * w1lambda * out_pos[0]); + } +} + +template +class BilinearInterpOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + auto* input_t = ctx.Input("X"); // float tensor + auto* output_t = ctx.Output("Out"); // float tensor + auto* input = input_t->data(); + auto* output = output_t->mutable_data(ctx.GetPlace()); + + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + int batch_size = input_t->dims()[0]; + int channels = input_t->dims()[1]; + int in_h = input_t->dims()[2]; + int in_w = input_t->dims()[3]; + + int in_hw = in_h * in_w; + int out_hw = out_h * out_w; + int in_chw = channels * in_hw; + int out_chw = channels * out_hw; + + T ratio_h = (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; + T ratio_w = (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + + if (in_h == out_h && in_w == out_w) { + memcpy(output, input, input_t->numel() * sizeof(T)); + } else { + int threadNum = batch_size * out_chw; + int blocks = (threadNum + 1024 - 1) / 1024; + + KeBilinearInterpFw< + T><<>>( + input, in_h, in_w, batch_size, in_chw, output, out_h, out_w, + batch_size, out_chw, channels, ratio_h, ratio_w); + } + } +}; + +template +class BilinearInterpGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* d_input_t = ctx.Output(framework::GradVarName("X")); + auto* d_output_t = ctx.Input(framework::GradVarName("Out")); + auto* d_input = d_input_t->mutable_data(ctx.GetPlace()); + auto* d_output = d_output_t->data(); + + auto& device_ctx = + ctx.template device_context(); + math::SetConstant zero; + zero(device_ctx, d_input_t, static_cast(0.0)); + + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + int batch_size = d_input_t->dims()[0]; + int channels = d_input_t->dims()[1]; + int in_h = d_input_t->dims()[2]; + int in_w = d_input_t->dims()[3]; + + int in_hw = in_h * in_w; + int out_hw = out_h * out_w; + int in_chw = channels * in_hw; + int out_chw = channels * out_hw; + + T ratio_h = (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; + T ratio_w = (out_w > 1) ? 
+
+template <typename T>
+class BilinearInterpGradOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
+    auto* d_output = d_output_t->data<T>();
+
+    auto& device_ctx =
+        ctx.template device_context<platform::CUDADeviceContext>();
+    math::SetConstant<platform::CUDADeviceContext, T> zero;
+    zero(device_ctx, d_input_t, static_cast<T>(0.0));
+
+    int out_h = ctx.Attr<int>("out_h");
+    int out_w = ctx.Attr<int>("out_w");
+    int batch_size = d_input_t->dims()[0];
+    int channels = d_input_t->dims()[1];
+    int in_h = d_input_t->dims()[2];
+    int in_w = d_input_t->dims()[3];
+
+    int in_hw = in_h * in_w;
+    int out_hw = out_h * out_w;
+    int in_chw = channels * in_hw;
+    int out_chw = channels * out_hw;
+
+    T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f;
+    T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f;
+
+    if (in_h == out_h && in_w == out_w) {
+      memcpy(d_input, d_output, d_input_t->numel() * sizeof(T));
+    } else {
+      int threadNum = batch_size * out_chw;
+      int blocks = (threadNum + 1024 - 1) / 1024;
+
+      KeBilinearInterpBw<
+          T><<<blocks, 1024, 0, ctx.cuda_device_context().stream()>>>(
+          d_input, in_h, in_w, batch_size, in_chw, d_output, out_h, out_w,
+          batch_size, out_chw, channels, ratio_h, ratio_w);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(bilinear_interp,
+                        ops::BilinearInterpOpCUDAKernel<float>);
+REGISTER_OP_CUDA_KERNEL(bilinear_interp_grad,
+                        ops::BilinearInterpGradOpCUDAKernel<float>);
diff --git a/paddle/fluid/operators/bilinear_interp_op.h b/paddle/fluid/operators/bilinear_interp_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6cd77e4d49b53ecde6a84908cdffc7e1e02ac6a
--- /dev/null
+++ b/paddle/fluid/operators/bilinear_interp_op.h
@@ -0,0 +1,143 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+class BilinearInterpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input_t = ctx.Input<Tensor>("X");      // float tensor
+    auto* output_t = ctx.Output<Tensor>("Out");  // float tensor
+    auto* input = input_t->data<T>();
+    auto* output = output_t->mutable_data<T>(ctx.GetPlace());
+
+    int out_h = ctx.Attr<int>("out_h");
+    int out_w = ctx.Attr<int>("out_w");
+    int batch_size = input_t->dims()[0];
+    int channels = input_t->dims()[1];
+    int in_h = input_t->dims()[2];
+    int in_w = input_t->dims()[3];
+
+    int in_hw = in_h * in_w;
+    int out_hw = out_h * out_w;
+    int in_chw = channels * in_hw;
+    int out_chw = channels * out_hw;
+
+    T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f;
+    T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f;
+
+    if (in_h == out_h && in_w == out_w) {
+      memcpy(output, input, input_t->numel() * sizeof(T));
+    } else {
+      for (int k = 0; k < batch_size; ++k) {  // loop for batches
+        for (int i = 0; i < out_h; ++i) {     // loop for images
+          int h = ratio_h * i;
+          int hid = (h < in_h - 1) ? 1 : 0;
+          T h1lambda = ratio_h * i - h;
+          T h2lambda = 1 - h1lambda;
+
+          for (int j = 0; j < out_w; ++j) {
+            int w = ratio_w * j;
+            int wid = (w < in_w - 1) ? 1 : 0;
+            T w1lambda = ratio_w * j - w;
+            T w2lambda = 1 - w1lambda;
+            // calculate four position for bilinear interpolation
+            const T* in_pos = &input[k * in_chw + h * in_w + w];
+            T* out_pos = &output[k * out_chw + i * out_w + j];
+
+            for (int c = 0; c < channels; ++c) {  // loop for channels
+              // bilinear interpolation
+              out_pos[0] =
+                  h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) +
+                  h1lambda * (w2lambda * in_pos[hid * in_w] +
+                              w1lambda * in_pos[hid * in_w + wid]);
+              in_pos += in_hw;
+              out_pos += out_hw;
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+template <typename T>
+class BilinearInterpGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
+    auto* d_output = d_output_t->data<T>();
+
+    auto& device_ctx =
+        ctx.template device_context<platform::CPUDeviceContext>();
+    math::SetConstant<platform::CPUDeviceContext, T> zero;
+    zero(device_ctx, d_input_t, static_cast<T>(0.0));
+
+    int out_h = ctx.Attr<int>("out_h");
+    int out_w = ctx.Attr<int>("out_w");
+    int batch_size = d_input_t->dims()[0];
+    int channels = d_input_t->dims()[1];
+    int in_h = d_input_t->dims()[2];
+    int in_w = d_input_t->dims()[3];
+
+    int in_hw = in_h * in_w;
+    int out_hw = out_h * out_w;
+    int in_chw = channels * in_hw;
+    int out_chw = channels * out_hw;
+
+    T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f;
+    T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f;
+
+    if (in_h == out_h && in_w == out_w) {
+      memcpy(d_input, d_output, d_input_t->numel() * sizeof(T));
+    } else {
+      for (int k = 0; k < batch_size; ++k) {  // loop for batches
+        for (int i = 0; i < out_h; ++i) {     // loop for images
+          int h = ratio_h * i;
+          int hid = (h < in_h - 1) ? 1 : 0;
+          T h1lambda = ratio_h * i - h;
+          T h2lambda = 1 - h1lambda;
+
+          for (int j = 0; j < out_w; ++j) {
+            int w = ratio_w * j;
+            int wid = (w < in_w - 1) ? 1 : 0;
+            T w1lambda = ratio_w * j - w;
+            T w2lambda = 1 - w1lambda;
+            T* in_pos = &d_input[k * in_chw + h * in_w + w];
+            const T* out_pos = &d_output[k * out_chw + i * out_w + j];
+
+            for (int c = 0; c < channels; ++c) {  // loop for channels
+              in_pos[0] += h2lambda * w2lambda * out_pos[0];
+              in_pos[wid] += h2lambda * w1lambda * out_pos[0];
+              in_pos[hid * in_w] += h1lambda * w2lambda * out_pos[0];
+              in_pos[hid * in_w + wid] += h1lambda * w1lambda * out_pos[0];
+              in_pos += in_hw;
+              out_pos += out_hw;
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc
index 2ec984d8e0f07b741f5e36f281134c0469079afd..e910ad92d1051aa89fdb3290a977ff376378a227 100644
--- a/paddle/fluid/operators/bilinear_tensor_product_op.cc
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc
@@ -153,9 +153,11 @@ class BilinearTensorProductOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp,
-            ops::BilinearTensorProductOpMaker, bilinear_tensor_product_grad,
-            ops::BilinearTensorProductOpGrad);
+REGISTER_OPERATOR(bilinear_tensor_product, ops::BilinearTensorProductOp,
+                  ops::BilinearTensorProductOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(bilinear_tensor_product_grad,
+                  ops::BilinearTensorProductOpGrad);
 REGISTER_OP_CPU_KERNEL(
     bilinear_tensor_product,
     ops::BilinearTensorProductKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/channel_recv_op.cc b/paddle/fluid/operators/channel_recv_op.cc
index 844b3ae3b7bf87c9b253128165b3c938801d5d60..25c5c3c95ef6899589c98570df6ecbf9b3241d89 100644
--- a/paddle/fluid/operators/channel_recv_op.cc
+++ b/paddle/fluid/operators/channel_recv_op.cc
@@ -29,11 +29,11 @@ namespace paddle {
 namespace operators {
 
 void SetReceiveStatus(const platform::Place &dev_place,
-                      framework::Variable &status_var, bool status) {
+                      framework::Variable *status_var, bool status) {
   auto cpu = platform::CPUPlace();
   auto status_tensor =
-      status_var.GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
-                                                                        cpu);
+      status_var->GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
+                                                                         cpu);
   status_tensor[0] = status;
 }
 
@@ -66,7 +66,7 @@ class ChannelRecvOp : public framework::OperatorBase {
     bool ok = concurrency::ChannelReceive(ch, output_var);
 
     // Set the status output of the `ChannelReceive` call.
- SetReceiveStatus(dev_place, *scope.FindVar(Output(Status)), ok); + SetReceiveStatus(dev_place, scope.FindVar(Output(Status)), ok); } }; diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc index a3b67964c79268e6ce07018501c46163847897ad..c71139fc7c01a696299296e43d06cf195fb3d03f 100644 --- a/paddle/fluid/operators/clip_op.cc +++ b/paddle/fluid/operators/clip_op.cc @@ -81,8 +81,9 @@ class ClipOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker, clip_grad, - ops::ClipOpGrad); +REGISTER_OPERATOR(clip, ops::ClipOp, ops::ClipOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad); REGISTER_OP_CPU_KERNEL( clip, ops::ClipKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 4a36b03cb63ac3ea61be1bbc56b8dd0adbe7d334..3bb3bd4eb15881afb5ae42beb944b76b5e8207cb 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -103,10 +103,12 @@ class ConcatOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad, - ops::ConcatOpGrad, false) +REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker, + paddle::framework::DefaultGradOpDescMaker< + false> /* set false to disable empty grad */); +REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad); REGISTER_OP_CPU_KERNEL( - concat, ops::ConcatKernel) + concat, ops::ConcatKernel); REGISTER_OP_CPU_KERNEL( concat_grad, - ops::ConcatGradKernel) + ops::ConcatGradKernel); diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc index bff2c34ec893d0e6212426b108dd98b0d0d0fb48..27f74a789beef02d31ebceb9b909e97ebd68232a 100644 --- a/paddle/fluid/operators/conditional_block_op.cc +++ b/paddle/fluid/operators/conditional_block_op.cc @@ -47,7 +47,7 @@ class ConditionalOp : public framework::OperatorBase { if (!(ips.size() == 1UL && ips[0]->IsInitialized())) { PADDLE_THROW("should have one initialized input as condition"); } - if (!(ips[0]->type().hash_code() == typeid(bool).hash_code() && + if (!(ips[0]->type().hash_code() == typeid(bool).hash_code() && // NOLINT ips[0]->numel() == 1)) { PADDLE_THROW( "condition input's data type should be bool, " @@ -227,7 +227,7 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker { grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); grad_op->SetOutput(framework::GradVarName("Params"), InputGrad("Params", false)); - grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]); + grad_op->SetBlockAttr("sub_block", this->grad_block_[0]); grad_op->SetAttr("is_scalar_condition", GetAttr("is_scalar_condition")); return std::unique_ptr(grad_op); } diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 695db841a4ec666b2c8783dfc7df959711341d85..92748993c32ffb93ae25db8d9916798e657cc804 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -335,14 +335,17 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad, - ops::ConvOpGrad); +REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(conv2d_grad, 
ops::ConvOpGrad); // depthwise convolution op -REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, - depthwise_conv2d_grad, ops::ConvOpGrad); -REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, - ops::ConvOpGrad); +REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad); +REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad); // depthwise conv kernel // TODO(xingzhaolong): neon kernel for mobile diff --git a/paddle/fluid/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc index a1a0b00208fe77ad462062b5d0cb0c5f3065f584..82fdd308207adb159632dbb9decd67fd2d1c4646 100644 --- a/paddle/fluid/operators/conv_shift_op.cc +++ b/paddle/fluid/operators/conv_shift_op.cc @@ -193,8 +193,9 @@ class ConvShiftGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker, - conv_shift_grad, ops::ConvShiftGradOp); +REGISTER_OPERATOR(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(conv_shift_grad, ops::ConvShiftGradOp); REGISTER_OP_CPU_KERNEL(conv_shift, ops::ConvShiftKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 08f5939d42a41d235a94eff16cf2f558068d6aaa..d699dcafa4e2c7e0a3ffb62ec3985e4961fa2133 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -298,8 +298,10 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( namespace ops = paddle::operators; -REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, - conv2d_transpose_grad, ops::ConvTransposeOpGrad); +REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp, + ops::Conv2DTransposeOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(conv2d_transpose_grad, ops::ConvTransposeOpGrad); REGISTER_OP_CPU_KERNEL( conv2d_transpose, @@ -311,8 +313,10 @@ REGISTER_OP_CPU_KERNEL( ops::GemmConvTransposeGradKernel); -REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker, - conv3d_transpose_grad, ops::ConvTransposeOpGrad); +REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp, + ops::Conv3DTransposeOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad); REGISTER_OP_CPU_KERNEL( conv3d_transpose, diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc index 4c8af408f62453eaf22cc23d19844e8ca7625bfa..04ca878e687f9b8e5239d8c4aad7e5f262fda0fa 100644 --- a/paddle/fluid/operators/cos_sim_op.cc +++ b/paddle/fluid/operators/cos_sim_op.cc @@ -153,8 +153,9 @@ class CosSimOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad, - ops::CosSimOpGrad); +REGISTER_OPERATOR(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(cos_sim_grad, ops::CosSimOpGrad); REGISTER_OP_CPU_KERNEL( cos_sim, ops::CosSimKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index 
fd7ea70c64fafd0a7ea55ec1e3a29eb66d84a2c6..a8f1fbd529c71d1915c75fa90b7e4e8239d2fa3f 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -153,7 +153,9 @@ class CropOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(crop, ops::CropOp, ops::CropOpMaker, crop_grad, ops::CropOpGrad); +REGISTER_OPERATOR(crop, ops::CropOp, ops::CropOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(crop_grad, ops::CropOpGrad); REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel); REGISTER_OP_CPU_KERNEL( crop_grad, ops::CropGradKernel); diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index 55810371c8d354483138b0673721a1ea39fa6f35..0e0622e290f42811c83c354d749ef32a2d9dcadb 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -164,8 +164,9 @@ or not. But the output only shares the LoD information with input X. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker, - cross_entropy_grad, ops::CrossEntropyGradientOp); +REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp); REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel, ops::CrossEntropyOpKernel); REGISTER_OP_CPU_KERNEL(cross_entropy_grad, diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc index 0da6f188523a78693929307a08601e04002bc8ec..f7c516a0ba375a68e3adeb44c99f2808dc0418bb 100644 --- a/paddle/fluid/operators/cumsum_op.cc +++ b/paddle/fluid/operators/cumsum_op.cc @@ -79,4 +79,4 @@ using CPU = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker); REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel>, ops::CumKernel>, - ops::CumKernel>) + ops::CumKernel>); diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu index 70e2a1de5e24302646611cfea3b8dbe1562274e2..eb5fd99ccb844b1f1717b818e7807a384d6515eb 100644 --- a/paddle/fluid/operators/cumsum_op.cu +++ b/paddle/fluid/operators/cumsum_op.cu @@ -19,4 +19,4 @@ using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(cumsum, ops::CumKernel>, ops::CumKernel>, - ops::CumKernel>) + ops::CumKernel>); diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/detail/grpc_client.h index 4425b19328f503eb7f9022916ed6452cdfea4eeb..f6229b71bc01a6de51f50f5fe880ada6e15e74dd 100644 --- a/paddle/fluid/operators/detail/grpc_client.h +++ b/paddle/fluid/operators/detail/grpc_client.h @@ -29,12 +29,12 @@ limitations under the License. 
*/ #include "grpc++/support/byte_buffer.h" #include "grpc++/support/slice.h" #include "grpc/support/log.h" +#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/detail/sendrecvop_utils.h" -#include "paddle/fluid/operators/detail/simple_block_queue.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 119e146e078e476b2768a8495ea63e468f952fd2..8cee46cbb2d6a1002864916e250fb7ab30f91430 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -90,7 +90,7 @@ class RequestGet final : public RequestBase { ::grpc::ServerCompletionQueue* cq, framework::Scope* scope, const platform::DeviceContext* dev_ctx, - SimpleBlockQueue* queue) + framework::BlockingQueue* queue) : RequestBase(service, cq, dev_ctx), responder_(&ctx_), scope_(scope), @@ -128,7 +128,7 @@ class RequestGet final : public RequestBase { sendrecv::VariableMessage request_; ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; framework::Scope* scope_; - SimpleBlockQueue* queue_; + framework::BlockingQueue* queue_; }; class RequestPrefetch final : public RequestBase { diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index b6110f92ed4f38a156e0c99ecfb399f3f47a169e..a15c93b7830265a2bb22334b5bb5a0f8ee2f28f4 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include "grpc++/grpc++.h" +#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" @@ -29,7 +30,6 @@ limitations under the License. */ #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/detail/sendrecvop_utils.h" -#include "paddle/fluid/operators/detail/simple_block_queue.h" namespace paddle { namespace operators { @@ -37,7 +37,7 @@ namespace detail { typedef std::pair> ReceivedMessage; -typedef SimpleBlockQueue ReceivedQueue; +typedef framework::BlockingQueue ReceivedQueue; typedef std::pair MessageWithName; class RequestBase; @@ -59,15 +59,13 @@ class AsyncGRPCServer final { void SetProgram(framework::ProgramDesc *program) { program_ = program; } - void SetPrefetchBlkdId(int blkid) { prefetch_blk_id_ = blkid; } - void SetExecutor(framework::Executor *executor) { executor_ = executor; } void SetPrefetchPreparedCtx(framework::ExecutorPrepareContext *prepared) { prefetch_ctx_ = prepared; } - int GetSelectedPort() { return selected_port_; } + int GetSelectedPort() const { return selected_port_; } const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); } @@ -101,7 +99,7 @@ class AsyncGRPCServer final { const platform::DeviceContext *dev_ctx_; // received variable from RPC, operators fetch variable from this queue. - SimpleBlockQueue var_get_queue_; + framework::BlockingQueue var_get_queue_; // client send variable to this queue. 
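// ---------------------------------------------------------------------------
// For orientation: framework::BlockingQueue is used here with the same
// Push/Pop contract as the SimpleBlockQueue it replaces (whose deleted source
// appears later in this patch) -- Push enqueues under a mutex and notifies one
// waiter, Pop blocks on a condition variable until an element is available.
// A minimal equivalent, for illustration only:
//
//   template <typename T>
//   class MinimalBlockingQueue {
//    public:
//     void Push(const T& v) {
//       {
//         std::lock_guard<std::mutex> guard(mu_);
//         queue_.push_back(v);
//       }
//       cv_.notify_one();
//     }
//     T Pop() {
//       std::unique_lock<std::mutex> lock(mu_);
//       cv_.wait(lock, [this] { return !queue_.empty(); });
//       T v = std::move(queue_.front());
//       queue_.pop_front();
//       return v;
//     }
//    private:
//     std::mutex mu_;
//     std::condition_variable cv_;
//     std::deque<T> queue_;
//   };
//
// The gRPC handler threads Push received variables into these queues and the
// server-side op Pops them.
// ---------------------------------------------------------------------------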
ReceivedQueue var_recv_queue_; @@ -114,7 +112,6 @@ class AsyncGRPCServer final { std::unique_ptr t_get_; std::unique_ptr t_prefetch_; - int prefetch_blk_id_; framework::ExecutorPrepareContext *prefetch_ctx_; framework::ProgramDesc *program_; framework::Executor *executor_; diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc index 16c612c45a37dd2ffd17f8d5f5946df30e9b3fe6..766bcf1ac5e06628638fcc8a305c00ab2795bbf2 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.cc +++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc @@ -39,7 +39,9 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, // parallelism execution, need to know when to free the tensor. DestroyCallback destroy_callback = [](void* backing) {}; - void* buf = malloc(1024); + auto buffer = std::unique_ptr(new char[1024]); + void* buf = buffer.get(); + void* payload = nullptr; size_t payload_size; ProtoEncodeHelper e(static_cast(buf), 1024); @@ -82,7 +84,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, platform::CPUPlace cpu; auto& gpu_dev_ctx = static_cast(ctx); - auto copy_size = tensor.memory_size(); + auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type()); payload = memory::Alloc(cpu, copy_size); memory::Copy(cpu, payload, @@ -99,7 +101,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, } else { payload = tensor.data(); } - payload_size = tensor.memory_size(); + payload_size = tensor.numel() * framework::SizeOfType(tensor.type()); e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size); } break; case framework::proto::VarType_Type_SELECTED_ROWS: { @@ -118,7 +120,8 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, platform::CPUPlace cpu; auto& gpu_dev_ctx = static_cast(ctx); - auto copy_size = tensor->memory_size(); + auto copy_size = + tensor->numel() * framework::SizeOfType(tensor->type()); payload = memory::Alloc(cpu, copy_size); memory::Copy(cpu, payload, boost::get(tensor->place()), @@ -133,7 +136,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, } else { payload = slr->mutable_value()->data(); } - payload_size = tensor->memory_size(); + payload_size = tensor->numel() * framework::SizeOfType(tensor->type()); e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size); } break; default: diff --git a/paddle/fluid/operators/detail/serde_test.cc b/paddle/fluid/operators/detail/serde_test.cc index cb5f89583436b059ac4d6509dac9f2e3868561aa..221d2f4c5b30aef022a5d6b54cd657d1dec1f5a2 100644 --- a/paddle/fluid/operators/detail/serde_test.cc +++ b/paddle/fluid/operators/detail/serde_test.cc @@ -51,7 +51,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { ::grpc::ByteBuffer msg; operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); - EXPECT_GT(msg.Length(), 0); + EXPECT_GT(msg.Length(), static_cast(0)); // deserialize std::vector<::grpc::Slice> slices; @@ -129,7 +129,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { ::grpc::ByteBuffer msg; operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); - EXPECT_GT(msg.Length(), 0); + EXPECT_GT(msg.Length(), static_cast(0)); // deserialize std::vector<::grpc::Slice> slices; diff --git a/paddle/fluid/operators/detail/simple_block_queue.h b/paddle/fluid/operators/detail/simple_block_queue.h deleted file mode 100644 index 
69773e05df7ed76f31c26f4304693fec2e9aac9c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/simple_block_queue.h +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // NOLINT -#include -#include // NOLINT - -namespace paddle { -namespace operators { -namespace detail { - -template -class SimpleBlockQueue { - private: - std::mutex mutex_; - std::condition_variable condition_; - std::deque queue_; - - public: - void Push(T const& value) { - { - std::unique_lock lock(this->mutex_); - queue_.push_front(value); - } - this->condition_.notify_one(); - } - - T Pop() { - std::unique_lock lock(this->mutex_); - this->condition_.wait(lock, [=] { return !this->queue_.empty(); }); - T rc(std::move(this->queue_.back())); - this->queue_.pop_back(); - return rc; - } -}; - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc index c9d7fd6d1581f6f4182e9e3e0d633c13a3c336a5..fbef8d02a4d765052fccf3792ebe0373d46b1ef6 100644 --- a/paddle/fluid/operators/detail/variable_response.cc +++ b/paddle/fluid/operators/detail/variable_response.cc @@ -114,8 +114,7 @@ bool VariableResponse::CopyLodTensorData( ::google::protobuf::io::CodedInputStream* input, const platform::DeviceContext& ctx, const framework::DDim& dims, int length) { - auto var = scope_->FindVar(meta_.varname()); - auto* tensor = var->GetMutable(); + auto* tensor = GetVar()->GetMutable(); tensor->Resize(dims); framework::LoD lod; @@ -151,8 +150,7 @@ bool VariableResponse::CopySelectRowsTensorData( ::google::protobuf::io::CodedInputStream* input, const platform::DeviceContext& ctx, const framework::DDim& dims, int length) { - auto var = scope_->FindVar(meta_.varname()); - auto* slr = var->GetMutable(); + auto* slr = GetVar()->GetMutable(); slr->set_height(meta_.slr_height()); auto* tensor = slr->mutable_value(); tensor->Resize(dims); @@ -174,8 +172,7 @@ bool VariableResponse::CopySelectRowsTensorData( bool VariableResponse::CopySelectRowsData( ::google::protobuf::io::CodedInputStream* input, const platform::DeviceContext& ctx, int length) { - auto var = scope_->FindVar(meta_.varname()); - auto* slr = var->GetMutable(); + auto* slr = GetVar()->GetMutable(); slr->mutable_rows()->resize(length / framework::SizeOfType(typeid(int64_t))); // int64 int64_t* rows_data = slr->mutable_rows()->data(); diff --git a/paddle/fluid/operators/detail/variable_response.h b/paddle/fluid/operators/detail/variable_response.h index 93b0d3cfb4f7d7f336414361773f872d7b259482..3018a5c4af876828380ff4c1cbfdaafa8a2057e1 100644 --- a/paddle/fluid/operators/detail/variable_response.h +++ b/paddle/fluid/operators/detail/variable_response.h @@ -37,10 +37,17 @@ namespace detail { class VariableResponse { public: VariableResponse(const framework::Scope* scope, - const platform::DeviceContext* dev_ctx) - : scope_(scope), 
dev_ctx_(dev_ctx) {} - - virtual ~VariableResponse() {} + const platform::DeviceContext* dev_ctx, + bool create_scope = false) + : scope_(scope), dev_ctx_(dev_ctx), create_scope_(create_scope) { + if (create_scope) { + local_scope_ = &scope->NewScope(); + } + } + + virtual ~VariableResponse() { + if (create_scope_) scope_->DeleteScope(local_scope_); + } // return: // 0:ok. @@ -54,11 +61,18 @@ class VariableResponse { // other: number of error field. int Parse(const ::grpc::ByteBuffer& byte_buffer); + const framework::Scope& GetLocalScope() const { return *local_scope_; } + inline std::string Varname() { return meta_.varname(); } inline std::string OutVarname() { return meta_.out_varname(); } // should call parse first. - framework::Variable* GetVar() { return scope_->FindVar(meta_.varname()); } + framework::Variable* GetVar() { + if (create_scope_) { + return local_scope_->Var(meta_.varname()); + } + return scope_->FindVar(meta_.varname()); + } private: bool CopySelectRowsTensorData(::google::protobuf::io::CodedInputStream* input, @@ -75,6 +89,8 @@ class VariableResponse { private: const framework::Scope* scope_; const platform::DeviceContext* dev_ctx_; + bool create_scope_ = false; + framework::Scope* local_scope_ = nullptr; // only Skeleton sendrecv::VariableMessage meta_; }; diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index e4436549f6185ba04a5f270893596a6dcb11e89b..4ed1b548840fabd2383632beb5f35fa6aa096443 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -101,8 +101,9 @@ class DropoutOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad, - ops::DropoutOpGrad); +REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); REGISTER_OP_CPU_KERNEL( dropout, ops::CPUDropoutKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu index 184c095e487a302ebc4d251dd6f332333c415c6d..1dd66e0280c46c0624ff70e822cb6fa6f06b7aa9 100644 --- a/paddle/fluid/operators/dropout_op.cu +++ b/paddle/fluid/operators/dropout_op.cu @@ -32,14 +32,26 @@ __global__ void RandomGenerator(const size_t n, const int seed, thrust::uniform_real_distribution dist(0, 1); int idx = blockDim.x * blockIdx.x + threadIdx.x; + int step_size = 0; + + T mask; + T dest; for (; idx < n; idx += blockDim.x * gridDim.x) { - rng.discard(idx); + T s = src[idx]; + if (step_size == 0) { + rng.discard(idx); + step_size = blockDim.x * gridDim.x; + } else { + rng.discard(step_size); + } if (dist(rng) < dropout_prob) { - mask_data[idx] = static_cast(0); + mask = static_cast(0); } else { - mask_data[idx] = static_cast(1); + mask = static_cast(1); } - dst[idx] = mask_data[idx] * src[idx]; + dest = s * mask; + mask_data[idx] = mask; + dst[idx] = dest; } } diff --git a/paddle/fluid/operators/elementwise_div_op.cc b/paddle/fluid/operators/elementwise_div_op.cc index 6f9a090c8ea660d023acece096b48d29aa2f35f7..c7ddafcad1d1f6c14791fde665f43881d6b49836 100644 --- a/paddle/fluid/operators/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise_div_op.cc @@ -30,8 +30,10 @@ class ElementwiseDivOpMaker : public ElementwiseOpMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker, - 
elementwise_div_grad, ops::ElementwiseOpGrad); +REGISTER_OPERATOR(elementwise_div, ops::ElementwiseOp, + ops::ElementwiseDivOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(elementwise_div_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_div, ops::ElementwiseDivKernel, diff --git a/paddle/fluid/operators/elementwise_max_op.cc b/paddle/fluid/operators/elementwise_max_op.cc index 61da7c59441df22d71316b13f131399d3cd55f3a..a4fe386bb1907bf7c0099d2b1109077b21146948 100644 --- a/paddle/fluid/operators/elementwise_max_op.cc +++ b/paddle/fluid/operators/elementwise_max_op.cc @@ -29,8 +29,10 @@ class ElementwiseMaxOpMaker : public ElementwiseOpMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(elementwise_max, ops::ElementwiseOp, ops::ElementwiseMaxOpMaker, - elementwise_max_grad, ops::ElementwiseOpGrad); +REGISTER_OPERATOR(elementwise_max, ops::ElementwiseOp, + ops::ElementwiseMaxOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(elementwise_max_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_max, ops::ElementwiseMaxKernel, diff --git a/paddle/fluid/operators/elementwise_min_op.cc b/paddle/fluid/operators/elementwise_min_op.cc index c74ff36db17579182e3c7e93a5adc5fe79fbcadd..68cd6ddb4a938b2b1c33e3f89c6d1151acb27f48 100644 --- a/paddle/fluid/operators/elementwise_min_op.cc +++ b/paddle/fluid/operators/elementwise_min_op.cc @@ -29,8 +29,10 @@ class ElementwiseMinOpMaker : public ElementwiseOpMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(elementwise_min, ops::ElementwiseOp, ops::ElementwiseMinOpMaker, - elementwise_min_grad, ops::ElementwiseOpGrad); +REGISTER_OPERATOR(elementwise_min, ops::ElementwiseOp, + ops::ElementwiseMinOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(elementwise_min_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_min, ops::ElementwiseMinKernel, diff --git a/paddle/fluid/operators/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise_mul_op.cc index 5d7f2cdffd11dfef8df22175dd0570b277c0e13a..2dec27136ad57ea032d5abb51799bd04ccc0b2e3 100644 --- a/paddle/fluid/operators/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise_mul_op.cc @@ -31,8 +31,10 @@ class ElementwiseMulOpMaker : public ElementwiseOpMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker, - elementwise_mul_grad, ops::ElementwiseOpGrad); +REGISTER_OPERATOR(elementwise_mul, ops::ElementwiseOp, + ops::ElementwiseMulOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(elementwise_mul_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_mul, ops::ElementwiseMulKernel, diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index 415182201a7a9e11d8ea8c62b92849b5ea3bac3e..f0362ec606c994d69f31c7a2e1e9ad0d0108b621 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -356,8 +356,8 @@ __device__ T reduceSum(T val, int tid, int len) { // I use Warp-Level Parallelism and assume the Warp size // is 32 which may be different for different GPU, // but most card's warp size is 32. 
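// ---------------------------------------------------------------------------
// Sketch of the warp-level step this function builds on (illustrative; the
// intrinsic is the CUDA 9 form and the surrounding CREATE_SHFL_MASK macro
// abstracts the older variants):
//
//   __device__ float WarpReduceSum(float val, unsigned mask) {
//     for (int offset = 16; offset > 0; offset >>= 1)
//       val += __shfl_down_sync(mask, val, offset);
//     return val;  // lane 0 ends up holding the warp's sum
//   }
//
// Each warp reduces its 32 lanes this way, lane 0 of every warp writes its
// partial sum into shm[], and the first warp then reduces those partials.
// The __syncthreads() added below is what makes the shm[] writes of all warps
// visible before the first warp reads them.
// ---------------------------------------------------------------------------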
-  __shared__ T shm[32];
   const int warpSize = 32;
+  __shared__ T shm[warpSize];
   unsigned mask = 0u;
   CREATE_SHFL_MASK(mask, tid < len);
 
@@ -371,6 +371,7 @@ __device__ T reduceSum(T val, int tid, int len) {
   if (tid % warpSize == 0) {
     shm[tid / warpSize] = val;
   }
+  __syncthreads();
 
   CREATE_SHFL_MASK(mask, tid < warpSize);
 
diff --git a/paddle/fluid/operators/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise_sub_op.cc
index 6f770820c80310a183018b586cb7545ca1e9de51..9d0598fc39a3922fa830f18729d90a7dac6a890b 100644
--- a/paddle/fluid/operators/elementwise_sub_op.cc
+++ b/paddle/fluid/operators/elementwise_sub_op.cc
@@ -29,8 +29,10 @@ class ElementwiseSubOpMaker : public ElementwiseOpMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker,
-            elementwise_sub_grad, ops::ElementwiseOpGrad);
+REGISTER_OPERATOR(elementwise_sub, ops::ElementwiseOp,
+                  ops::ElementwiseSubOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(elementwise_sub_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_sub,
     ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc
index 51a66bd832fbdface953d9b7b509b32ce26d33ca..4ae91d074d3df8b910a7f5d816a22b6f1d51dff6 100644
--- a/paddle/fluid/operators/expand_op.cc
+++ b/paddle/fluid/operators/expand_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/expand_op.h"
+#include <vector>
 
 namespace paddle {
 namespace operators {
@@ -128,8 +129,9 @@ class ExpandGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad,
-            ops::ExpandGradOp);
+REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp);
 REGISTER_OP_CPU_KERNEL(
     expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h
index 2c2d5c7c42c0cc918199eff054d1656f01a281e8..75dbf1d8bf5cb692dcf7b88e9f4c486ab3839701 100644
--- a/paddle/fluid/operators/expand_op.h
+++ b/paddle/fluid/operators/expand_op.h
@@ -14,13 +14,14 @@ limitations under the License.
*/ #pragma once +#include + #include #include #include #include #include #include -#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 381771f157d78fb04e54f0a07c40e4df2c91441a..45e4d5b2b863a55ae0aa0414ff8697141fd2aa6f 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -98,5 +98,6 @@ FCOpMaker::FCOpMaker(OpProto* proto, OpAttrChecker* op_checker) } // namespace operators } // namespace paddle -REGISTER_OP(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker, fc_grad, - paddle::operators::FCOpGrad); +REGISTER_OPERATOR(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(fc_grad, paddle::operators::FCOpGrad); diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 6be06b8816ce65641b49d7b7b3861cdd8460feaa..4c82f5c429038504d9876ee240a705911feb0b7a 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -100,7 +100,8 @@ Out = [[3, 4], } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad, - ops::GatherGradOp); +REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(gather_grad, ops::GatherGradOp); REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel); REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel); diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index 3819549c7112c5e4a6de1a9aee54e469dd5a4618..7e014dd1cb47ee0575308dc13ba7bc7617baebff 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "gather.cu.h" #include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather_op.h" -#include "scatter.cu.h" +#include "paddle/fluid/operators/scatter.cu.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h index 5a8b1ebbe3fe5f242a4d6395c921c75247587c6a..2dd726bebb1bc2e4d83844c0b98df01c390e622f 100644 --- a/paddle/fluid/operators/gather_op.h +++ b/paddle/fluid/operators/gather_op.h @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "gather.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "scatter.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/scatter.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc index 7625bd45d968720099a973a6988484ec8332d1c1..9c0561b016fdbfa8e48535eaa673a3f85bc936e5 100644 --- a/paddle/fluid/operators/gather_test.cc +++ b/paddle/fluid/operators/gather_test.cc @@ -12,38 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/place.h" - #include #include #include -TEST(Gather, GatherData) { - using namespace paddle::framework; - using namespace paddle::platform; - using namespace paddle::operators; +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/platform/place.h" - Tensor* src = new Tensor(); - Tensor* index = new Tensor(); - Tensor* output = new Tensor(); +TEST(Gather, GatherData) { + paddle::framework::Tensor* src = new paddle::framework::Tensor(); + paddle::framework::Tensor* index = new paddle::framework::Tensor(); + paddle::framework::Tensor* output = new paddle::framework::Tensor(); int* p_src = nullptr; int* p_index = nullptr; - p_src = src->mutable_data(make_ddim({3, 4}), CPUPlace()); - p_index = index->mutable_data(make_ddim({2}), CPUPlace()); + p_src = src->mutable_data(paddle::framework::make_ddim({3, 4}), + paddle::platform::CPUPlace()); + p_index = index->mutable_data(paddle::framework::make_ddim({2}), + paddle::platform::CPUPlace()); for (int i = 0; i < 12; ++i) p_src[i] = i; p_index[0] = 1; p_index[1] = 0; - int* p_output = output->mutable_data(make_ddim({2, 4}), CPUPlace()); + int* p_output = output->mutable_data( + paddle::framework::make_ddim({2, 4}), paddle::platform::CPUPlace()); auto* cpu_place = new paddle::platform::CPUPlace(); paddle::platform::CPUDeviceContext ctx(*cpu_place); - CPUGather(ctx, *src, *index, output); + paddle::operators::CPUGather(ctx, *src, *index, output); for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4); for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4); diff --git a/paddle/fluid/operators/get_places_op.cc b/paddle/fluid/operators/get_places_op.cc index 9002ce4717c6e75e7204ef62094e4680bba3f88b..0d7219ac5c624236b85916d5faf6810dbed2198a 100644 --- a/paddle/fluid/operators/get_places_op.cc +++ b/paddle/fluid/operators/get_places_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#include // NOLINT #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/go_op.cc b/paddle/fluid/operators/go_op.cc index 58fe32446217e07235b40b9b78190094e57e4951..b8e1556c23a3b7357ed56d1b83c09622559040a4 100644 --- a/paddle/fluid/operators/go_op.cc +++ b/paddle/fluid/operators/go_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include +#include // NOLINT #include #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index 2490b83b8c50ce4a68095be10d78a380174c1a3f..0a524c914d305661745c5d85cbbee2edb57c97ba 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -216,7 +216,9 @@ class GRUGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp); +REGISTER_OPERATOR(gru, ops::GRUOp, ops::GRUOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(gru_grad, ops::GRUGradOp); REGISTER_OP_CPU_KERNEL( gru, ops::GRUKernel, ops::GRUKernel); diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h index 1d5c291495c0f0c0d8da9ff6949888b4cbb6036d..53f844a6607bd2e98c53b53c23422f6b48e2ced6 100644 --- a/paddle/fluid/operators/gru_op.h +++ b/paddle/fluid/operators/gru_op.h @@ -56,8 +56,6 @@ class GRUKernel : public framework::OpKernel { auto* hidden = context.Output("Hidden"); hidden->mutable_data(context.GetPlace()); - context.ShareLoD("Input", "Hidden"); - auto hidden_dims = hidden->dims(); bool is_reverse = context.Attr("is_reverse"); diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc index f4c766db0a12b9d2167b0ee3b1d7666c4f1813f1..f8d1d44b5423dd09fe5aad11434911af6f14fe77 100644 --- a/paddle/fluid/operators/gru_unit_op.cc +++ b/paddle/fluid/operators/gru_unit_op.cc @@ -124,7 +124,7 @@ $$ which is same as one time step of GRU Operator. -@note To implement the complete GRU unit, fully-connected operator must be +@note To implement the complete GRU unit, fully-connected operator must be used before to feed xu, xr and xc as the Input of GRUUnit operator. 
)DOC"); @@ -194,12 +194,45 @@ class GRUUnitGradOp : public framework::OperatorWithKernel { } }; +class GRUUnitGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("gru_unit_grad"); + + op->SetInput("Input", Input("Input")); + op->SetInput("HiddenPrev", Input("HiddenPrev")); + op->SetInput("Weight", Input("Weight")); + op->SetInput("Bias", Input("Bias")); + + op->SetInput("Hidden", Output("Hidden")); + op->SetInput("Gate", Output("Gate")); + op->SetInput("ResetHiddenPrev", Output("ResetHiddenPrev")); + op->SetInput(framework::GradVarName("Hidden"), OutputGrad("Hidden")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + op->SetOutput(framework::GradVarName("HiddenPrev"), + InputGrad("HiddenPrev")); + op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + return std::unique_ptr(op); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad, - ops::GRUUnitGradOp); + +REGISTER_OPERATOR(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, + ops::GRUUnitGradOpMaker); +REGISTER_OPERATOR(gru_unit_grad, ops::GRUUnitGradOp); + REGISTER_OP_CPU_KERNEL( gru_unit, ops::GRUUnitKernel, ops::GRUUnitKernel); diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index efe84f14098028675cb332efd9545c9709528cb3..086b5a97dec9a3d5b8f91b802b92d64ca73bf57c 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -103,8 +103,9 @@ class HingeLossGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker, - hinge_loss_grad, ops::HingeLossGradOp); +REGISTER_OPERATOR(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(hinge_loss_grad, ops::HingeLossGradOp); REGISTER_OP_CPU_KERNEL( hinge_loss, ops::HingeLossKernel); diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc index 134b23b4612b478f9aeb06454c9fd9a6c25fffb4..74d8e0e2b76adc7a3e69649f277a8c0df6f38056 100644 --- a/paddle/fluid/operators/huber_loss_op.cc +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -121,8 +121,9 @@ class HuberLossGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, - huber_loss_grad, ops::HuberLossGradOp); +REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp); REGISTER_OP_CPU_KERNEL( huber_loss, ops::HuberLossKernel); diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index 5b387d8d344dfc3475a537827acd9e125fe6693c..8c120eec86601146500721bbb4249bc458190093 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -148,8 +148,9 @@ class Im2SequenceGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(im2sequence, ops::Im2SequenceOp, 
ops::Im2SequenceOpMaker, - im2sequence_grad, ops::Im2SequenceGradOp); +REGISTER_OPERATOR(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(im2sequence_grad, ops::Im2SequenceGradOp); REGISTER_OP_CPU_KERNEL( im2sequence, ops::Im2SequenceKernel); diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index ec2e641679fedec776d48716f13445f44375ce3d..d8c97b27b328b1470bece4a6c1872b5ccc75115e 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/increment_op.h" +#include namespace paddle { namespace operators { @@ -89,4 +90,4 @@ REGISTER_OP_CPU_KERNEL( increment, ops::IncrementKernel, ops::IncrementKernel, ops::IncrementKernel, - ops::IncrementKernel) + ops::IncrementKernel); diff --git a/paddle/fluid/operators/increment_op.cu b/paddle/fluid/operators/increment_op.cu index 7fb6425fe994751c4d7a025bb62e43a84c8d95c2..228063bf3d4b24bbd03649189f6ddba9a5f0ca30 100644 --- a/paddle/fluid/operators/increment_op.cu +++ b/paddle/fluid/operators/increment_op.cu @@ -19,4 +19,4 @@ REGISTER_OP_CUDA_KERNEL( increment, ops::IncrementKernel, ops::IncrementKernel, ops::IncrementKernel, - ops::IncrementKernel) + ops::IncrementKernel); diff --git a/paddle/fluid/operators/iou_similarity_op.cc b/paddle/fluid/operators/iou_similarity_op.cc old mode 100755 new mode 100644 diff --git a/paddle/fluid/operators/iou_similarity_op.cu b/paddle/fluid/operators/iou_similarity_op.cu old mode 100755 new mode 100644 diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index 963b0587c386c72c05f8cc5d0b63074e9e726579..0c143b7c8aed13a202e2597632d17d8bccc8b66d 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -67,8 +67,9 @@ $$Out = \sum{|X|}$$ } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, l1_norm_grad, - ops::L1NormGradOp); +REGISTER_OPERATOR(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(l1_norm_grad, ops::L1NormGradOp); REGISTER_OP_CPU_KERNEL( l1_norm, ops::L1NormKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc index c2a8c7f867a4483a7fda2f4336a64ab109ce86e8..a73c626032f3bf6e97ac5974424e76bacb9a0799 100644 --- a/paddle/fluid/operators/label_smooth_op.cc +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -117,8 +117,9 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker, - label_smooth_grad, ops::LabelSmoothGradOp); +REGISTER_OPERATOR(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(label_smooth_grad, ops::LabelSmoothGradOp); REGISTER_OP_CPU_KERNEL( label_smooth, ops::LabelSmoothKernel, diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index 88b3b08af57eaf2d1086d778e3313c3dea6300fb..de1056aef7bfa2f53f8a92b262e7d15aa7c2b75c 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -162,8 +162,9 @@ class LayerNormGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; 
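// ---------------------------------------------------------------------------
// Pattern used throughout this patch: REGISTER_OP registered the forward and
// backward op in a single macro and implied the default grad-op-desc maker;
// it is now split into two REGISTER_OPERATOR calls with the maker named
// explicitly. For a hypothetical my_op the shape is:
//
//   REGISTER_OPERATOR(my_op, ops::MyOp, ops::MyOpMaker,
//                     paddle::framework::DefaultGradOpDescMaker<true>);
//   REGISTER_OPERATOR(my_op_grad, ops::MyOpGrad);
//
// The boolean template argument controls how empty input gradients are
// treated (the concat registration earlier passes false, with a comment to
// that effect), and ops that need custom gradient wiring supply their own
// maker instead, as gru_unit does earlier with GRUUnitGradOpMaker.
// ---------------------------------------------------------------------------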
-REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, - layer_norm_grad, ops::LayerNormGradOp); +REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp); REGISTER_OP_CPU_KERNEL( layer_norm, ops::LayerNormKernel, ops::LayerNormKernel); diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index ef568a578b0b97ea402a2a521f0fe1431013d1b7..2f29e377fdada918f2c9dca8c2d94eb06278320d 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -256,8 +256,10 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(linear_chain_crf, ops::LinearChainCRFOp, ops::LinearChainCRFOpMaker, - linear_chain_crf_grad, ops::LinearChainCRFGradOp); +REGISTER_OPERATOR(linear_chain_crf, ops::LinearChainCRFOp, + ops::LinearChainCRFOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp); REGISTER_OP_CPU_KERNEL( linear_chain_crf, ops::LinearChainCRFOpKernel, diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index a4c925b538ef916e88ec06cea6de57f31eaf069b..af235fb6a029a71ee275bebfbbd75aaa0b7d546d 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -27,20 +27,6 @@ void RunServer(std::shared_ptr service) { VLOG(4) << "RunServer thread end"; } -static void CreateTensorFromMessageType(framework::Variable *var, - sendrecv::VarType var_type) { - if (var_type == sendrecv::VarType::LOD_TENSOR) { - var->GetMutable(); - } else if (var_type == sendrecv::VarType::SELECTED_ROWS) { - var->GetMutable(); - } else { - PADDLE_THROW( - "VariableMessage type %d is not in " - "[LoDTensor, SelectedRows]", - var_type); - } -} - static void ParallelExecuteBlocks( const std::vector ¶llel_blkids, framework::Executor *executor, const std::vector> @@ -62,6 +48,13 @@ static void ParallelExecuteBlocks( for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); } +static void SavePort(std::shared_ptr rpc_service) { + std::ofstream port_file; + port_file.open("/tmp/paddle.selected_port"); + port_file << rpc_service->GetSelectedPort(); + port_file.close(); +} + ListenAndServOp::ListenAndServOp(const std::string &type, const framework::VariableNameMap &inputs, const framework::VariableNameMap &outputs, @@ -77,59 +70,26 @@ void ListenAndServOp::Stop() { server_thread_->join(); } -void ListenAndServOp::RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - framework::Scope &recv_scope = scope.NewScope(); - - if (!rpc_service_) { - std::string endpoint = Attr("endpoint"); - rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); - } - - auto ins = Inputs("X"); +void ListenAndServOp::RunSyncLoop(framework::Executor *executor, + framework::ProgramDesc *program, + framework::Scope *recv_scope, + framework::BlockDesc *prefetch_block) const { auto fan_in = Attr("Fanin"); - auto *optimize_block = Attr(kOptimizeBlock); - auto *prefetch_block = Attr(kPrefetchBlock); - auto *program = optimize_block->Program(); + size_t num_blocks = program->Size(); PADDLE_ENFORCE_GE(num_blocks, 2, "server program should have at least 2 
blocks"); - framework::Executor executor(dev_place); std::vector block_list; for (size_t blkid = 1; blkid < num_blocks; ++blkid) { - if (blkid != static_cast(prefetch_block->ID())) { - block_list.push_back(blkid); - } + block_list.push_back(blkid); } - auto optimize_prepared = executor.Prepare(*program, block_list); + auto optimize_prepared = executor->Prepare(*program, block_list); // Insert placeholder for block0 which holds current op itself. optimize_prepared.insert( optimize_prepared.begin(), std::shared_ptr(nullptr)); - rpc_service_->SetScope(&recv_scope); - rpc_service_->SetDevCtx(&dev_ctx); - // TODO(qiao) set proper fields for table lookup and update - rpc_service_->SetExecutor(&executor); - VLOG(3) << "prefetch block id is " << prefetch_block->ID(); - auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID()); - rpc_service_->SetPrefetchBlkdId(prefetch_block->ID()); - rpc_service_->SetPrefetchPreparedCtx(prefetch_prepared.get()); - prefetch_prepared.release(); - rpc_service_->SetProgram(program); - // start the server listening after all member initialized. - server_thread_.reset(new std::thread(RunServer, rpc_service_)); - VLOG(3) << "wait server thread to become ready..."; - sleep(5); - // Write to a file of server selected port for python use. - std::ofstream port_file; - port_file.open("/tmp/paddle.selected_port"); - port_file << rpc_service_->GetSelectedPort(); - port_file.close(); - bool exit_flag = false; // Record received sparse variables, so that // we could reset those after execute optimize program @@ -170,7 +130,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, break; } - // NOTE: if is_gpu_place, CUDA kernels are laugched by multiple threads + // NOTE: if is_gpu_place, CUDA kernels are launched by multiple threads // and this will still work. 
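// ---------------------------------------------------------------------------
// Outline of the dispatch referred to here, consistent with the
// ParallelExecuteBlocks helper shown earlier (a sketch; it assumes
// framework::Async returns a std::future, which the fs[i].wait() loop above
// implies):
//
//   std::vector<std::future<void>> fs;
//   for (size_t blkid : parallel_blkids)
//     fs.push_back(framework::Async([&, blkid]() {
//       // run the prepared context for this block against recv_scope
//     }));
//   for (auto& f : fs) f.wait();
//
// Blocks that share a parent ID are batched into one such round, so their
// kernels may indeed be launched from several host threads at once.
// ---------------------------------------------------------------------------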
// The optimize blocks which have the same parent ID would run parallel @@ -182,16 +142,16 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, for (size_t blkid = 2; blkid < num_blocks; ++blkid) { if (blkid != static_cast(prefetch_block->ID())) { if (program->Block(blkid).Parent() != last_parent_blkid) { - ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared, - program, &recv_scope); + ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, + program, recv_scope); parallel_blkids.clear(); last_parent_blkid = program->Block(blkid).Parent(); } parallel_blkids.push_back(blkid); } } - ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared, - program, &recv_scope); + ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program, + recv_scope); VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)"; // Reset the received sparse variables, the sum operator would not @@ -209,6 +169,42 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, } // while(true) } +void ListenAndServOp::RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + framework::Scope &recv_scope = scope.NewScope(); + + PADDLE_ENFORCE(!rpc_service_); + std::string endpoint = Attr("endpoint"); + rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); + + auto *optimize_block = Attr(kOptimizeBlock); + auto *prefetch_block = Attr(kPrefetchBlock); + auto *program = optimize_block->Program(); + framework::Executor executor(dev_place); + + // prepare rpc_service + rpc_service_->SetScope(&recv_scope); + rpc_service_->SetDevCtx(&dev_ctx); + rpc_service_->SetProgram(program); + rpc_service_->SetExecutor(&executor); + + // prepare for prefetch + VLOG(3) << "prefetch block id is " << prefetch_block->ID(); + auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID()); + rpc_service_->SetPrefetchPreparedCtx(prefetch_prepared.get()); + prefetch_prepared.release(); + + // start the server listening after all member initialized. + server_thread_.reset(new std::thread(RunServer, rpc_service_)); + VLOG(3) << "wait server thread to become ready..."; + sleep(5); + // Write to a file of server selected port for python use. 
+ SavePort(rpc_service_); + RunSyncLoop(&executor, program, &recv_scope, prefetch_block); +} + class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { public: ListenAndServOpMaker(OpProto *proto, OpAttrChecker *op_checker) diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 9744921cef7c0f13c94b7fe729561de8e181650c..dfb7c77c8e36d9af79d8b1713d0c0c59c81b1ca6 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -34,17 +34,22 @@ void RunServer(std::shared_ptr service); class ListenAndServOp : public framework::OperatorBase { public: - ListenAndServOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs); + ListenAndServOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs); int GetSelectedPort() const; + void RunSyncLoop(framework::Executor* executor, + framework::ProgramDesc* program, + framework::Scope* recv_scope, + framework::BlockDesc* prefetch_block) const; + void Stop() override; - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override; + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override; protected: mutable std::shared_ptr rpc_service_; diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc index 7d5687f2d0666d393d7bb1c1a2fdde6c95e6d615..92ebfc274b84f738f5bd688a9a6d9f437b6318aa 100644 --- a/paddle/fluid/operators/lod_reset_op.cc +++ b/paddle/fluid/operators/lod_reset_op.cc @@ -155,8 +155,9 @@ class LoDResetGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, lod_reset_grad, - ops::LoDResetGradOp); +REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp); REGISTER_OP_CPU_KERNEL( lod_reset, ops::LoDResetKernel, ops::LoDResetKernel, diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index f44996d8ac746a33750a979eff2cbbc84e10214b..a8258a1afd70574c174abe8d5630ade5d4ac3de6 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -106,8 +106,9 @@ class LogLossGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker, log_loss_grad, - ops::LogLossGradOp); +REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp); REGISTER_OP_CPU_KERNEL( log_loss, ops::LogLossKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index cb088c267bcc028ff11583cd73de5ca1722a9b69..d482506bf0361c11a019e32efbf348a64aaf5164 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -103,7 +103,8 @@ class LookupTableKernel : public framework::OpKernel { memset(output + i * row_width, 0, row_width * sizeof(T)); } else { PADDLE_ENFORCE_GE(ids[i], 0); - auto id_index = table_t.index(ids[i]); + auto id_index = table_t.Index(ids[i]); + 
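The renamed Index accessor is what the enforce that follows relies on: it maps a sparse key to the row offset inside the SelectedRows value tensor. Its implementation is not part of this patch; a hypothetical sketch of the contract assumed here (a scan over the stored keys that returns -1 when the key is absent, hence the id_index >= 0 check) would be:

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical sketch of the lookup contract assumed by the kernel code:
// Index(key) returns the position of `key` in rows_, or -1 when absent.
struct SelectedRowsSketch {
  std::vector<int64_t> rows_;  // sparse keys, one per stored row
  int64_t Index(int64_t key) const {
    for (size_t i = 0; i < rows_.size(); ++i) {
      if (rows_[i] == key) return static_cast<int64_t>(i);
    }
    return -1;  // callers must check for this before indexing row data
  }
};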
PADDLE_ENFORCE_GE(id_index, 0, "the input key should exist."); memcpy(output + i * row_width, table + id_index * row_width, row_width * sizeof(T)); } } diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 553a06c3dcdbb9de43afcace75ebec7c5e819d4a..f5c0e47fda913b4635833c31496644b60a0a8504 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -276,7 +276,9 @@ class LRNOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(lrn, ops::LRNOp, ops::LRNOpMaker, lrn_grad, ops::LRNOpGrad); +REGISTER_OPERATOR(lrn, ops::LRNOp, ops::LRNOpMaker, + paddle::framework::DefaultGradOpDescMaker<true>); +REGISTER_OPERATOR(lrn_grad, ops::LRNOpGrad); REGISTER_OP_CPU_KERNEL( lrn, ops::LRNKernel<paddle::platform::CPUDeviceContext, float>); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc index e062d62c66c25e386c7643e310034bc1481ec43d..084ee1cfe602af3622ef2a3f35f2892d5540cec7 100644 --- a/paddle/fluid/operators/lstm_op.cc +++ b/paddle/fluid/operators/lstm_op.cc @@ -273,7 +273,9 @@ class LSTMGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp); +REGISTER_OPERATOR(lstm, ops::LSTMOp, ops::LSTMOpMaker, + paddle::framework::DefaultGradOpDescMaker<true>); +REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp); REGISTER_OP_CPU_KERNEL( lstm, ops::LSTMKernel<paddle::platform::CPUDeviceContext, float>, ops::LSTMKernel<paddle::platform::CPUDeviceContext, double>); diff --git a/paddle/fluid/operators/lstm_unit_op.cc b/paddle/fluid/operators/lstm_unit_op.cc index b3c9d7c34d1ac54fb3e15a60bcc470f392bf5027..e1157ef6c640be17e7f48abe1ab972cf88504526 100644 --- a/paddle/fluid/operators/lstm_unit_op.cc +++ b/paddle/fluid/operators/lstm_unit_op.cc @@ -97,8 +97,9 @@ class LstmUnitGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker, lstm_unit_grad, - ops::LstmUnitGradOp); +REGISTER_OPERATOR(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker, + paddle::framework::DefaultGradOpDescMaker<true>); +REGISTER_OPERATOR(lstm_unit_grad, ops::LstmUnitGradOp); REGISTER_OP_CPU_KERNEL(lstm_unit, ops::LstmUnitKernel<paddle::platform::CPUDeviceContext, float>, ops::LstmUnitKernel<paddle::platform::CPUDeviceContext, double>); diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc index 82541517e122d5da2674b55561ba72af970a2567..f9261323f0f50c78b3b4b66a9fa8abcdf5ba27e9 100644 --- a/paddle/fluid/operators/lstmp_op.cc +++ b/paddle/fluid/operators/lstmp_op.cc @@ -322,8 +322,9 @@ class LSTMPGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, lstmp_grad, - ops::LSTMPGradOp); +REGISTER_OPERATOR(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, + paddle::framework::DefaultGradOpDescMaker<true>); +REGISTER_OPERATOR(lstmp_grad, ops::LSTMPGradOp); REGISTER_OP_CPU_KERNEL( lstmp, ops::LSTMPKernel<paddle::platform::CPUDeviceContext, float>, ops::LSTMPKernel<paddle::platform::CPUDeviceContext, double>); diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc index b146b5088321efcee5a4511b3fedd047a0d54f00..0b41a3e1ffdb32d248bb55651aba242336307e74 100644 --- a/paddle/fluid/operators/margin_rank_loss_op.cc +++ b/paddle/fluid/operators/margin_rank_loss_op.cc @@ -111,9 +111,10 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(margin_rank_loss, ops::MarginRankLossOp, - ops::MarginRankLossOpMaker,
margin_rank_loss_grad, - ops::MarginRankLossGradOp); +REGISTER_OPERATOR(margin_rank_loss, ops::MarginRankLossOp, + ops::MarginRankLossOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(margin_rank_loss_grad, ops::MarginRankLossGradOp); REGISTER_OP_CPU_KERNEL( margin_rank_loss, ops::MarginRankLossKernel); diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 1f5255887391218b766aa23842e443c8b2ad080f..e5d33fbc36438f97ff5b604e4efdbfbfa91fcee4 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -237,8 +237,9 @@ class MatMulOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(matmul, ops::MatMulOp, ops::MatMulOpMaker, matmul_grad, - ops::MatMulOpGrad); +REGISTER_OPERATOR(matmul, ops::MatMulOp, ops::MatMulOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(matmul_grad, ops::MatMulOpGrad); REGISTER_OP_CPU_KERNEL( matmul, ops::MatMulKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc index 4e28d98834d27351be99106d6760eae46baf8938..e2bcba5a5e15d4d5f10ae4ae64b5262f750137ab 100644 --- a/paddle/fluid/operators/maxout_op.cc +++ b/paddle/fluid/operators/maxout_op.cc @@ -101,8 +101,9 @@ class MaxOutOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad, - ops::MaxOutOpGrad); +REGISTER_OPERATOR(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(maxout_grad, ops::MaxOutOpGrad); REGISTER_OP_CPU_KERNEL( maxout, ops::MaxOutKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/mkldnn_activation_op.h b/paddle/fluid/operators/mkldnn_activation_op.h index 083d03ebe610521c5a4beb7b977a8179700bcf40..f26a165b5a59f01f864d62bbf798f4cbffa65371 100644 --- a/paddle/fluid/operators/mkldnn_activation_op.h +++ b/paddle/fluid/operators/mkldnn_activation_op.h @@ -60,7 +60,7 @@ class MKLDNNActivationGradKernel } }; -namespace { +namespace { // NOLINT framework::OpKernelType GetKernelType( const framework::ExecutionContext& ctx, const framework::OperatorWithKernel& oper) { diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc index a8fbd48c4da5b2d0585688e3100f9fe62ac5aa1f..3a0fc74584391d0441105a8ac7d7ac292e10fb8d 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cc +++ b/paddle/fluid/operators/modified_huber_loss_op.cc @@ -108,9 +108,10 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(modified_huber_loss, ops::ModifiedHuberLossOp, - ops::ModifiedHuberLossOpMaker, modified_huber_loss_grad, - ops::ModifiedHuberLossGradOp); +REGISTER_OPERATOR(modified_huber_loss, ops::ModifiedHuberLossOp, + ops::ModifiedHuberLossOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(modified_huber_loss_grad, ops::ModifiedHuberLossGradOp); REGISTER_OP_CPU_KERNEL( modified_huber_loss, diff --git a/paddle/fluid/operators/mul_mkldnn_op.cc b/paddle/fluid/operators/mul_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a5f3a98f678a870d30eebfc4cf329de7c93266ee --- /dev/null +++ b/paddle/fluid/operators/mul_mkldnn_op.cc @@ -0,0 +1,197 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "mkldnn.hpp" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/mul_op.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; +using paddle::platform::MKLDNNDeviceContext; + +template <typename Format> +mkldnn::memory::desc type(const std::vector<int>& dims, Format&& f) { + return platform::MKLDNNMemDesc(dims, mkldnn::memory::data_type::f32, f); +} + +template <typename T> +class MulMKLDNNOpKernel : public paddle::framework::OpKernel<T> { + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>(); + auto mkldnn_engine = dev_ctx.GetEngine(); + + auto input = ctx.Input<Tensor>("X"); + auto weight = ctx.Input<Tensor>("Y"); + + PADDLE_ENFORCE(input->dims().size() == 2 || input->dims().size() == 4, + "Input must be with 2 or 4 dimensions, i.e. NC or NCHW"); + PADDLE_ENFORCE(weight->dims().size() == 2 || weight->dims().size() == 4, + "Weights must be with 2 or 4 dimensions, i.e. OI or OIHW"); + + std::vector<int> w_tz = paddle::framework::vectorize2int(weight->dims()); + std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims()); + + auto src_md = + src_tz.size() != 2 + ? type(src_tz, mkldnn::memory::format::nchw) + : type({src_tz[0], src_tz[1]}, mkldnn::memory::format::nc); + + auto dst_md = type({src_tz[0], w_tz[1]}, mkldnn::memory::format::nc); + + auto weights_md = + src_tz.size() != 2 + ? 
type({w_tz[1], src_tz[1], src_tz[2], src_tz[3]}, + mkldnn::memory::format::oihw) + : type({w_tz[1], src_tz[1]}, mkldnn::memory::format::oi); + + auto output = ctx.Output("Out"); + T* output_data = output->mutable_data(ctx.GetPlace()); + + const std::string key = ctx.op().Output("Out"); + const std::string key_fc_pd = key + "@mul_pd"; + + const T* input_data = input->data(); + const T* w_data = weight->data(); + + auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data); + + auto src_memory = mkldnn::memory({src_md, mkldnn_engine}, + platform::to_void_cast(input_data)); + + auto weights_memory = mkldnn::memory({weights_md, mkldnn_engine}, + platform::to_void_cast(w_data)); + + auto pd = platform::MKLDNNFwdPrimitiveDesc( + mkldnn_engine, src_md, weights_md, dst_md); + + dev_ctx.SetBlob(key_fc_pd, pd); + + auto forward = mkldnn::inner_product_forward(*pd, src_memory, + weights_memory, dst_memory); + + std::vector pipeline = {forward}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } +}; + +template +class MulMKLDNNGradOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto& dev_ctx = ctx.template device_context(); + auto mkldnn_engine = dev_ctx.GetEngine(); + + const Tensor* input = ctx.Input("X"); + const Tensor* w = ctx.Input("Y"); + + const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); + Tensor* input_grad = ctx.Output(framework::GradVarName("X")); + Tensor* w_grad = ctx.Output(framework::GradVarName("Y")); + + const std::string key = ctx.op().Input("Out"); + const std::string key_fc_pd = key + "@mul_pd"; + + const T* input_data = input->data(); + const T* w_data = w->data(); + const T* out_grad_data = out_grad->data(); + T* input_grad_data = nullptr; + T* w_grad_data = nullptr; + + if (input_grad) { + input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + } + if (w_grad) { + w_grad_data = w_grad->mutable_data(ctx.GetPlace()); + } + + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector w_tz = paddle::framework::vectorize2int(w->dims()); + + auto src_md = + src_tz.size() != 2 + ? type(src_tz, mkldnn::memory::format::nchw) + : type({src_tz[0], src_tz[1]}, mkldnn::memory::format::nc); + + auto dst_md = type({src_tz[0], w_tz[1]}, mkldnn::memory::format::nc); + + auto weights_md = + src_tz.size() != 2 + ? 
type({w_tz[1], src_tz[1], src_tz[2], src_tz[3]}, + mkldnn::memory::format::oihw) + : type({w_tz[1], src_tz[1]}, mkldnn::memory::format::oi); + + auto src_memory = mkldnn::memory({src_md, mkldnn_engine}, + platform::to_void_cast(input_data)); + + auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, + platform::to_void_cast(out_grad_data)); + + auto weight_memory = mkldnn::memory({weights_md, mkldnn_engine}, + platform::to_void_cast(w_data)); + + auto pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_fc_pd)); + + PADDLE_ENFORCE(pd != nullptr, "Fail to find pd in device context"); + + if (w_grad) { + auto weights_grad_memory = mkldnn::memory( + {weights_md, mkldnn_engine}, platform::to_void_cast(w_grad_data)); + + auto bwd_weight_pd = platform::MKLDNNBwdPrimitiveDesc< + mkldnn::inner_product_backward_weights>(mkldnn_engine, *pd, src_md, + weights_md, dst_md); + + auto bwd_weights_prim = mkldnn::inner_product_backward_weights( + bwd_weight_pd, src_memory, dst_memory, weights_grad_memory); + + std::vector pipeline{bwd_weights_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } + + if (input_grad) { + auto src_grad_memory = mkldnn::memory( + {src_md, mkldnn_engine}, platform::to_void_cast(input_grad_data)); + + auto bwd_data_pd = + platform::MKLDNNBwdPrimitiveDesc( + mkldnn_engine, *pd, src_md, weights_md, dst_md); + + auto bwd_data_prim = mkldnn::inner_product_backward_data( + bwd_data_pd, dst_memory, weight_memory, src_grad_memory); + + std::vector pipeline{bwd_data_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } + } +}; +} // namespace operators +} // namespace paddle + +REGISTER_OP_KERNEL(mul, MKLDNN, ::paddle::platform::CPUPlace, + paddle::operators::MulMKLDNNOpKernel); + +REGISTER_OP_KERNEL(mul_grad, MKLDNN, ::paddle::platform::CPUPlace, + paddle::operators::MulMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 5038287527c70d376d8c8a1cc8e4cca0b563126a..c9fabc8d485b3bba2c8ae14b3616d0bdcae058a7 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -13,8 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/mul_op.h" +#include #include +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace operators { @@ -71,6 +76,22 @@ class MulOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); ctx->ShareLoD("X", /*->*/ "Out"); } + + private: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + } +#endif + framework::DataLayout layout{framework::DataLayout::kAnyLayout}; + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout, library); + } }; class MulOpMaker : public framework::OpProtoAndCheckerMaker { @@ -100,6 +121,9 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker { )DOC") .SetDefault(1) .EqualGreaterThan(1); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddAttr( "y_num_col_dims", R"DOC((int, default 1), The mul_op can take tensors with more than two, @@ -154,13 +178,31 @@ class MulGradOp : public framework::OperatorWithKernel { ctx->SetOutputDim(y_grad_name, y_dims); } } + + private: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + } +#endif + framework::DataLayout layout{framework::DataLayout::kAnyLayout}; + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout, library); + } }; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulGradOp); +REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(mul_grad, ops::MulGradOp); REGISTER_OP_CPU_KERNEL( mul, ops::MulKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/multiclass_nms_op.cc b/paddle/fluid/operators/multiclass_nms_op.cc index 0f80f752c95e97ed4d6d299788734de9d29713db..a12b975326519c776c9f4a1d9f2894b4028c2440 100644 --- a/paddle/fluid/operators/multiclass_nms_op.cc +++ b/paddle/fluid/operators/multiclass_nms_op.cc @@ -173,8 +173,8 @@ class MultiClassNMSKernel : public framework::OpKernel { void MultiClassNMS(const framework::ExecutionContext& ctx, const Tensor& scores, const Tensor& bboxes, - std::map>& indices, - int& num_nmsed_out) const { + std::map>* indices, + int* num_nmsed_out) const { int64_t background_label = ctx.Attr("background_label"); int64_t nms_top_k = ctx.Attr("nms_top_k"); int64_t keep_top_k = ctx.Attr("keep_top_k"); @@ -189,15 +189,15 @@ class MultiClassNMSKernel : public framework::OpKernel { if (c == background_label) continue; Tensor score = scores.Slice(c, c + 1); NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k, - &(indices[c])); - num_det += indices[c].size(); + &((*indices)[c])); + num_det += (*indices)[c].size(); } - num_nmsed_out = num_det; + *num_nmsed_out = num_det; const T* scores_data = scores.data(); if (keep_top_k > -1 && num_det > keep_top_k) { 
std::vector>> score_index_pairs; - for (const auto& it : indices) { + for (const auto& it : *indices) { int label = it.first; const T* sdata = scores_data + label * predict_dim; const std::vector& label_indices = it.second; @@ -220,13 +220,13 @@ class MultiClassNMSKernel : public framework::OpKernel { int idx = score_index_pairs[j].second.second; new_indices[label].push_back(idx); } - new_indices.swap(indices); - num_nmsed_out = keep_top_k; + new_indices.swap(*indices); + *num_nmsed_out = keep_top_k; } } void MultiClassOutput(const Tensor& scores, const Tensor& bboxes, - std::map>& selected_indices, + const std::map>& selected_indices, Tensor* outs) const { int predict_dim = scores.dims()[1]; auto* scores_data = scores.data(); @@ -273,7 +273,7 @@ class MultiClassNMSKernel : public framework::OpKernel { std::map> indices; int num_nmsed_out = 0; - MultiClassNMS(ctx, ins_score, ins_boxes, indices, num_nmsed_out); + MultiClassNMS(ctx, ins_score, ins_boxes, &indices, &num_nmsed_out); all_indices.push_back(indices); batch_starts.push_back(batch_starts.back() + num_nmsed_out); } diff --git a/paddle/fluid/operators/nccl_op.cu.cc b/paddle/fluid/operators/nccl_op.cu.cc index ad623e1fe0f8941615b671a0c20bd3637ae6d407..8de974bc2b333fb6ccc5b5f0bb1af86533139925 100644 --- a/paddle/fluid/operators/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl_op.cu.cc @@ -135,8 +135,9 @@ class NCCLBcastKernel : public framework::OpKernel { auto* x = ctx.Input("X"); VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel(); PADDLE_ENFORCE(platform::dynload::ncclBcast( - (void*)x->data(), x->numel(), NCCLTypeWrapper::type, root, - comm->comms().at(idx), ctx.cuda_device_context().stream())); + reinterpret_cast(const_cast(x->data())), x->numel(), + NCCLTypeWrapper::type, root, comm->comms().at(idx), + ctx.cuda_device_context().stream())); VLOG(3) << "gpu : " << gpu_id << " finished Bcast."; } else { auto* out = ctx.Output("Out"); diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 99f38529bbb5a36cd944a01940b5579195f2d601..192bdf8ea553f3a82066f8562458d286ee15a6ee 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/nce_op.h" +#include + namespace paddle { namespace operators { @@ -179,7 +181,9 @@ class NCEOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(nce, ops::NCEOp, ops::NCEOpMaker, nce_grad, ops::NCEOpGrad); +REGISTER_OPERATOR(nce, ops::NCEOp, ops::NCEOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(nce_grad, ops::NCEOpGrad); REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel, ops::NCEKernel); REGISTER_OP_CPU_KERNEL(nce_grad, diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 94207638473374ddf7e23d211d6cde93f112f492..2c4c97f28bc0b511d6eaa8f79a3a4efc9be8a5da 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -108,7 +109,7 @@ class NCEKernel : public framework::OpKernel { auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); for (int64_t i = 0; i < sample_labels->numel(); ++i) { Eigen::Tensor result = - (input_mat.chip((int)(i / sample_labels->dims()[1]), 0) * + (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * weight_mat.chip(sample_labels_data[i], 0)) .sum(); sample_out_data[i] += result(0); @@ -190,7 +191,7 @@ class NCEGradKernel : public framework::OpKernel { auto x_matrix = EigenMatrix::From(*(context.Input("Input"))); for (int64_t i = 0; i < sample_labels->numel(); ++i) { d_w_matrix.chip(sample_labels_data[i], 0) += - x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) * + x_matrix.chip(static_cast(i / sample_labels->dims()[1]), 0) * sample_grad_data[i]; } } @@ -202,7 +203,7 @@ class NCEGradKernel : public framework::OpKernel { auto d_x_matrix = EigenMatrix::From(*d_x); auto w_matrix = EigenMatrix::From(*(context.Input("Weight"))); for (int64_t i = 0; i < sample_labels->numel(); ++i) { - d_x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) += + d_x_matrix.chip(static_cast(i / sample_labels->dims()[1]), 0) += w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i]; } } diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index 5345c5bdb0f1e2d96233595f89028993606d2399..30a991224fa184257a8e59af5e6a27a0b0a4da86 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -85,8 +85,9 @@ class NormOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(norm, ops::NormOp, ops::NormOpMaker, norm_grad, - ops::NormOpGrad); +REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(norm_grad, ops::NormOpGrad); REGISTER_OP_CPU_KERNEL( norm, ops::NormKernel, ops::NormKernel); diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index b28c16b13fce30c6e9be9953009b53e722cf4885..ae34fe2184b43cc104c14672dec30efd3b0e9f3b 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -364,7 +364,7 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { } } grad->SetAttrMap(this->Attrs()); - grad->SetBlockAttr(kParallelBlock, *grad_block_[0]); + grad->SetBlockAttr(kParallelBlock, grad_block_[0]); return std::unique_ptr(grad); } diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index b144ec5f7d315cb340dcd94b4a519bfcfd2a0e66..f2de075e0d82fc5bd0ac41b481ac80314f3857a3 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -333,18 +333,20 @@ Example: namespace ops = paddle::operators; -REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad, - ops::PoolOpGrad); +REGISTER_OPERATOR(pool2d, ops::PoolOp, ops::Pool2dOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad); REGISTER_OP_CPU_KERNEL( pool2d, ops::PoolKernel, ops::PoolKernel); REGISTER_OP_CPU_KERNEL( pool2d_grad, ops::PoolGradKernel, - ops::PoolGradKernel) + ops::PoolGradKernel); -REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad, - ops::PoolOpGrad); +REGISTER_OPERATOR(pool3d, ops::PoolOp, ops::Pool3dOpMaker, + 
paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad); REGISTER_OP_CPU_KERNEL( pool3d, ops::PoolKernel, diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index 4df0a14577ca13ddd79424fc324eb689913b20a0..848cd61b23c2389d3fe11f585b256d55c1ff177f 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -258,9 +258,10 @@ Example: namespace ops = paddle::operators; -REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp, - ops::MaxPool2dWithIndexOpMaker, max_pool2d_with_index_grad, - ops::MaxPoolWithIndexOpGrad); +REGISTER_OPERATOR(max_pool2d_with_index, ops::MaxPoolWithIndexOp, + ops::MaxPool2dWithIndexOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(max_pool2d_with_index_grad, ops::MaxPoolWithIndexOpGrad); REGISTER_OP_CPU_KERNEL( max_pool2d_with_index, @@ -272,11 +273,12 @@ REGISTER_OP_CPU_KERNEL( ops::MaxPoolWithIndexGradKernel, ops::MaxPoolWithIndexGradKernel) + int>); -REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp, - ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad, - ops::MaxPoolWithIndexOpGrad); +REGISTER_OPERATOR(max_pool3d_with_index, ops::MaxPoolWithIndexOp, + ops::MaxPool3dWithIndexOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(max_pool3d_with_index_grad, ops::MaxPoolWithIndexOpGrad); REGISTER_OP_CPU_KERNEL( max_pool3d_with_index, @@ -288,4 +290,4 @@ REGISTER_OP_CPU_KERNEL( ops::MaxPoolWithIndexGradKernel, ops::MaxPoolWithIndexGradKernel) + int>); diff --git a/paddle/fluid/operators/pool_with_index_op.cu.cc b/paddle/fluid/operators/pool_with_index_op.cu.cc index 5fc418b6fdd19eddfd27b4a1b3e2554d7b2f37e6..5497dcbd9ce255f833df24989d7a76c40bcbca06 100644 --- a/paddle/fluid/operators/pool_with_index_op.cu.cc +++ b/paddle/fluid/operators/pool_with_index_op.cu.cc @@ -27,7 +27,7 @@ REGISTER_OP_CUDA_KERNEL( ops::MaxPoolWithIndexGradKernel, ops::MaxPoolWithIndexGradKernel) + int>); REGISTER_OP_CUDA_KERNEL( max_pool3d_with_index, @@ -40,4 +40,4 @@ REGISTER_OP_CUDA_KERNEL( ops::MaxPoolWithIndexGradKernel, ops::MaxPoolWithIndexGradKernel) + int>); diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 8eaa12a4a6cfc09fd4e2c3642bc8825fe2af6d6b..a066b3e06e5eca2661827425b5b2d0059d5bcc3c 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -83,8 +83,9 @@ class PReluGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; -REGISTER_OP(prelu, ops::PReluOp, ops::PReluOpMaker, prelu_grad, - ops::PReluGradOp); +REGISTER_OPERATOR(prelu, ops::PReluOp, ops::PReluOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(prelu_grad, ops::PReluGradOp); REGISTER_OP_CPU_KERNEL( prelu, ops::PReluKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc index fc09b4aa1da87e56678790785467e9f4080a20ea..fafc7e54d7a44d6bb2dadf67135537dc16430e76 100644 --- a/paddle/fluid/operators/print_op.cc +++ b/paddle/fluid/operators/print_op.cc @@ -23,15 +23,15 @@ namespace operators { #define CLOG std::cout -const std::string kForward = "FORWARD"; -const std::string kBackward = "BACKWARD"; -const std::string kBoth = "BOTH"; +const char kForward[] = "FORWARD"; +const char kBackward[] = "BACKWARD"; +const char kBoth[] = "BOTH"; struct Formater { std::string message; std::string name; std::vector dims; - std::type_index 
dtype{typeid(char)}; + std::type_index dtype{typeid(const char)}; framework::LoD lod; int summarize; void* data{nullptr}; @@ -62,7 +62,7 @@ struct Formater { } } void PrintDtype() { - if (dtype.hash_code() != typeid(char).hash_code()) { + if (dtype.hash_code() != typeid(const char).hash_code()) { CLOG << "\tdtype: " << dtype.name() << std::endl; } } @@ -83,15 +83,15 @@ struct Formater { void PrintData(size_t size) { PADDLE_ENFORCE_NOT_NULL(data); // print float - if (dtype.hash_code() == typeid(float).hash_code()) { + if (dtype.hash_code() == typeid(const float).hash_code()) { Display(size); - } else if (dtype.hash_code() == typeid(double).hash_code()) { + } else if (dtype.hash_code() == typeid(const double).hash_code()) { Display(size); - } else if (dtype.hash_code() == typeid(int).hash_code()) { + } else if (dtype.hash_code() == typeid(const int).hash_code()) { Display(size); - } else if (dtype.hash_code() == typeid(int64_t).hash_code()) { + } else if (dtype.hash_code() == typeid(const int64_t).hash_code()) { Display(size); - } else if (dtype.hash_code() == typeid(bool).hash_code()) { + } else if (dtype.hash_code() == typeid(const bool).hash_code()) { Display(size); } else { CLOG << "\tdata: unprintable type: " << dtype.name() << std::endl; @@ -100,7 +100,7 @@ struct Formater { template void Display(size_t size) { - auto* d = (T*)data; + auto* d = reinterpret_cast(data); CLOG << "\tdata: "; if (summarize != -1) { summarize = std::min(size, (size_t)summarize); @@ -135,7 +135,7 @@ class TensorPrintOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { const framework::Variable* in_var_ptr = nullptr; - std::string phase = kForward; + std::string phase(kForward); std::string printed_var_name = ""; auto& inputs = Inputs(); @@ -146,7 +146,7 @@ class TensorPrintOp : public framework::OperatorBase { !Inputs("In@GRAD").empty()) { in_var_ptr = scope.FindVar(Input("In@GRAD")); printed_var_name = Inputs("In@GRAD").front(); - phase = kBackward; + phase = std::string(kBackward); } else { PADDLE_THROW("Unknown phase, should be forward or backward."); } @@ -163,7 +163,7 @@ class TensorPrintOp : public framework::OperatorBase { out_tensor.set_lod(in_tensor.lod()); std::string print_phase = Attr("print_phase"); - if (print_phase != phase && print_phase != kBoth) { + if (print_phase != phase && print_phase != std::string(kBoth)) { return; } @@ -199,7 +199,7 @@ class TensorPrintOp : public framework::OperatorBase { formater.lod = printed_tensor.lod(); } formater.summarize = Attr("summarize"); - formater.data = (void*)printed_tensor.data(); + formater.data = reinterpret_cast(printed_tensor.data()); formater(printed_tensor.numel()); } @@ -223,8 +223,9 @@ class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker { "print_phase", "(string, default 'BOTH') Which phase to display including 'FORWARD' " "'BACKWARD' and 'BOTH'.") - .SetDefault(kBoth) - .InEnum({kForward, kBackward, kBoth}); + .SetDefault(std::string(kBoth)) + .InEnum({std::string(kForward), std::string(kBackward), + std::string(kBoth)}); AddOutput("Out", "Output tensor with same data as input tensor."); AddComment(R"DOC( Creates a print op that will print when a tensor is accessed. 
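One detail of the Formater changes above is worth noting: typeid ignores top-level cv-qualifiers, so typeid(const float) and typeid(float) compare equal, which makes the const spellings introduced here equivalent rather than behavior-changing. The dispatch pattern itself, reduced to a standalone sketch with hypothetical names:

#include <cstddef>
#include <iostream>
#include <typeindex>
#include <typeinfo>

// Sketch of the typeid-based dispatch used by Formater::PrintData above:
// pick the concrete element type at runtime, then print through it.
void DisplayByType(std::type_index dtype, const void* data, size_t n) {
  if (dtype == std::type_index(typeid(float))) {
    const float* d = static_cast<const float*>(data);
    for (size_t i = 0; i < n; ++i) std::cout << d[i] << ' ';
  } else if (dtype == std::type_index(typeid(int))) {
    const int* d = static_cast<const int*>(data);
    for (size_t i = 0; i < n; ++i) std::cout << d[i] << ' ';
  } else {
    std::cout << "unprintable type: " << dtype.name();
  }
  std::cout << '\n';
}

Comparing std::type_index values directly, as in this sketch, matches the hash_code() comparisons in the patch but does not depend on hashes being collision-free.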
diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index a1127f11a75e54168ca9682a0189255d37ee8571..eb9ff8de3e4b37ef0bbf7477c1bb62856bdb6310 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -121,8 +121,9 @@ class RankLossGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(rank_loss, ops::RankLossOp, ops::RankLossOpMaker, rank_loss_grad, - ops::RankLossGradOp); +REGISTER_OPERATOR(rank_loss, ops::RankLossOp, ops::RankLossOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(rank_loss_grad, ops::RankLossGradOp); REGISTER_OP_CPU_KERNEL( rank_loss, ops::RankLossKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 0b7c1d6af714558d35918dac62d92d9e0f86c970..4372f23fc1dbd85e43b04a9d644977392316c2e9 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -180,7 +180,8 @@ void DoubleBufferReader::PrefetchThreadFunc() { auto* gpu_ctx = ctxs_[cached_tensor_id].get(); gpu_batch.resize(cpu_batch.size()); for (size_t i = 0; i < cpu_batch.size(); ++i) { - framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i]); + framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i], + true); gpu_batch[i].set_lod(cpu_batch[i].lod()); } } diff --git a/paddle/fluid/operators/reader/create_threaded_reader_op.cc b/paddle/fluid/operators/reader/create_threaded_reader_op.cc index cbf709d5e734c0f2adf3735dc28043c1340349da..1cb9bd36455a2287b8ba4fb4ca14a4c5338da098 100644 --- a/paddle/fluid/operators/reader/create_threaded_reader_op.cc +++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc @@ -21,26 +21,16 @@ namespace reader { class ThreadedReader : public framework::DecoratedReader { public: - ThreadedReader(ReaderBase* reader, bool safe_mode) - : DecoratedReader(reader), safe_mode_(safe_mode) {} + explicit ThreadedReader(ReaderBase* reader) : DecoratedReader(reader) {} void ReadNext(std::vector* out) override { std::lock_guard lock(mutex_); reader_->ReadNext(out); } - void ReInit() override { - if (safe_mode_) { - PADDLE_THROW( - "ThreadedReader::ReInit() is disabled when 'safe_mode' is true."); - } - VLOG(5) << "ThreadedReader::ReInit() is invoked! 
It might be buggy in " - "multi-thread environment."; - reader_->ReInit(); - } + void ReInit() override { reader_->ReInit(); } private: - bool safe_mode_; std::mutex mutex_; }; @@ -58,8 +48,7 @@ class CreateThreadedReaderOp : public framework::OperatorBase { } const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) ->Get(); - bool safe_mode = Attr("safe_mode"); - out->Reset(new ThreadedReader(underlying_reader.Get(), safe_mode)); + out->Reset(new ThreadedReader(underlying_reader.Get())); } }; @@ -67,10 +56,6 @@ class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase { public: CreateThreadedReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) : DecoratedReaderMakerBase(op_proto, op_checker) { - AddAttr("safe_mode", - "When 'safe_mode' is true, 'ReInit()' is disabled to avoid " - "unexpected bugs in multi-thread environment.") - .SetDefault(true); AddComment(R"DOC( CreateThreadedReader Operator diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 00241e768217db0a611c00bbc72e2fb83ade73b4..72c2905872c528a7ed05820744f4031799ad9e46 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -596,7 +596,7 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker { } } grad->SetAttrMap(this->Attrs()); - grad->SetBlockAttr(kStepBlock, *grad_block_[0]); + grad->SetBlockAttr(kStepBlock, grad_block_[0]); return std::unique_ptr(grad); } diff --git a/paddle/fluid/operators/reduce_op.cc b/paddle/fluid/operators/reduce_op.cc index 7879367830216cdd875f9f95f95e2a88f282ac64..093db966472cf100b2f1e4159ce20399cee1f481 100644 --- a/paddle/fluid/operators/reduce_op.cc +++ b/paddle/fluid/operators/reduce_op.cc @@ -14,6 +14,9 @@ limitations under the License. */ #include "paddle/fluid/operators/reduce_op.h" +#include +#include + namespace paddle { namespace operators { @@ -122,18 +125,18 @@ If reduce_all is true, just reduce along all dimensions and output a scalar. 
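The maker subclasses registered below stamp that documentation out per operator by substituting the {ReduceOp} and {reduce} placeholders in comment_. A self-contained equivalent of the pointer-based Replace helper that follows, with example usage:

#include <cstddef>
#include <string>

// Equivalent sketch of ReduceOpMaker::Replace below: substitute every
// occurrence of `from` in *src, resuming the scan after each insertion.
static void ReplaceAll(std::string* src, const std::string& from,
                       const std::string& to) {
  for (std::size_t pos = src->find(from); pos != std::string::npos;
       pos = src->find(from, pos + to.size())) {
    src->replace(pos, from.size(), to);
  }
}

// Usage, mirroring SetComment("ReduceSum", "sum"):
//   std::string comment = "{ReduceOp} Operator computes the {reduce} of input.";
//   ReplaceAll(&comment, "{ReduceOp}", "ReduceSum");
//   ReplaceAll(&comment, "{reduce}", "sum");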
protected: std::string comment_; - void Replace(std::string &src, std::string from, std::string to) { + void Replace(std::string *src, std::string from, std::string to) { std::size_t len_from = std::strlen(from.c_str()); std::size_t len_to = std::strlen(to.c_str()); - for (std::size_t pos = src.find(from); pos != std::string::npos; - pos = src.find(from, pos + len_to)) { - src.replace(pos, len_from, to); + for (std::size_t pos = src->find(from); pos != std::string::npos; + pos = src->find(from, pos + len_to)) { + src->replace(pos, len_from, to); } } void SetComment(std::string name, std::string op) { - Replace(comment_, "{ReduceOp}", name); - Replace(comment_, "{reduce}", op); + Replace(&comment_, "{ReduceOp}", name); + Replace(&comment_, "{reduce}", op); } }; @@ -187,20 +190,25 @@ class ReduceProdOpMaker : public ReduceOpMaker { namespace ops = paddle::operators; -REGISTER_OP(reduce_sum, ops::ReduceOp, ops::ReduceSumOpMaker, reduce_sum_grad, - ops::ReduceGradOp); +REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ops::ReduceSumOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp); -REGISTER_OP(reduce_mean, ops::ReduceOp, ops::ReduceMeanOpMaker, - reduce_mean_grad, ops::ReduceGradOp); +REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, ops::ReduceMeanOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp); -REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad, - ops::ReduceGradOp); +REGISTER_OPERATOR(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp); -REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad, - ops::ReduceGradOp); +REGISTER_OPERATOR(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(reduce_min_grad, ops::ReduceGradOp); -REGISTER_OP(reduce_prod, ops::ReduceOp, ops::ReduceProdOpMaker, - reduce_prod_grad, ops::ReduceGradOp); +REGISTER_OPERATOR(reduce_prod, ops::ReduceOp, ops::ReduceProdOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(reduce_prod_grad, ops::ReduceGradOp); #define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ REGISTER_OP_CPU_KERNEL(reduce_type, \ diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h index b28dd7f20968d762ffd669557500f788bda0d7bc..e42b4bfe42df05346020d4f48519fecf39aa37d2 100644 --- a/paddle/fluid/operators/reduce_op.h +++ b/paddle/fluid/operators/reduce_op.h @@ -35,77 +35,77 @@ using EigenVector = framework::EigenVector; struct SumFunctor { template - void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { - y.device(place) = x.sum(dim); + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->sum(dim); } }; struct SumGradFunctor { template - void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, const Dim& dim, int size) { - dx.device(place) = dy.broadcast(dim); + dx->device(place) = dy->broadcast(dim); } }; struct MeanFunctor { template - void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { - y.device(place) = x.mean(dim); + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->mean(dim); } }; struct MeanGradFunctor { template - void operator()(const 
DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, const Dim& dim, int size) { - dx.device(place) = dy.broadcast(dim) / dx.constant(size); + dx->device(place) = dy->broadcast(dim) / dx->constant(size); } }; struct MaxFunctor { template - void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { - y.device(place) = x.maximum(dim); + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->maximum(dim); } }; struct MinFunctor { template - void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { - y.device(place) = x.minimum(dim); + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->minimum(dim); } }; struct MaxOrMinGradFunctor { template - void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, const Dim& dim, int size) { - auto equals = x == y.broadcast(dim); - auto ones = dx.constant(1); - auto zeros = dx.constant(0); + auto equals = (*x) == y->broadcast(dim); + auto ones = dx->constant(1); + auto zeros = dx->constant(0); // If there are multiple minimum or maximum elements, the subgradient of // each is the set [0, 1], and we pass gradient to all of them here. - dx.device(place) = dy.broadcast(dim) * equals.select(ones, zeros); + dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros); } }; struct ProdFunctor { template - void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { - y.device(place) = x.prod(dim); + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->prod(dim); } }; struct ProdGradFunctor { template - void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, const Dim& dim, int size) { - dx.device(place) = dy.broadcast(dim) * y.broadcast(dim) * x.inverse(); + dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse(); } }; @@ -125,7 +125,7 @@ class ReduceKernel : public framework::OpKernel { *context.template device_context().eigen_device(); auto reduce_dim = Eigen::array({{0}}); Functor functor; - functor(place, x, out, reduce_dim); + functor(place, &x, &out, reduce_dim); } else { int rank = context.Input("X")->dims().size(); switch (rank) { @@ -178,10 +178,10 @@ class ReduceKernel : public framework::OpKernel { if (D == 1) { auto out = EigenScalar::From(*output); - functor(place, x, out, reduce_dim); + functor(place, &x, &out, reduce_dim); } else { auto out = EigenTensor::From(*output, dims); - functor(place, x, out, reduce_dim); + functor(place, &x, &out, reduce_dim); } } }; @@ -206,7 +206,7 @@ class ReduceGradKernel : public framework::OpKernel { auto broadcast_dim = Eigen::array({{static_cast(input0->numel())}}); Functor functor; - functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim, + functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, broadcast_dim[0]); } else { int rank = context.Input("X")->dims().size(); @@ -258,7 +258,7 @@ class ReduceGradKernel : public framework::OpKernel { auto& place = *context.template device_context().eigen_device(); Functor functor; - functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim, + functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, broadcast_dim[dim]); } }; diff --git a/paddle/fluid/operators/reshape_op.cc 
b/paddle/fluid/operators/reshape_op.cc index 93f9c74b809770136d3d3300e0e0700b1bc0459e..5e5ccc3ded95d57dfed37c1ac9c7eae61d36b8c0 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -113,8 +113,9 @@ class ReshapeGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; -REGISTER_OP(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, reshape_grad, - ops::ReshapeGradOp); +REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp); REGISTER_OP_CPU_KERNEL(reshape, ops::ReshapeKernel, ops::ReshapeKernel, ops::ReshapeKernel, diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 6d4861f0428834b1893c3a10a83920f0a62b5455..224ec93d28ec75c52848d7c8400e684df0d69209 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -153,8 +153,9 @@ https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad, - ops::ROIPoolGradOp); +REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp); REGISTER_OP_CPU_KERNEL( roi_pool, ops::CPUROIPoolOpKernel, diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc index d34beeb6508084f4d680fad9bac99ea474d274d3..23f720da0b68cd2fd4c9b51182bf82f72078a906 100644 --- a/paddle/fluid/operators/row_conv_op.cc +++ b/paddle/fluid/operators/row_conv_op.cc @@ -250,8 +250,9 @@ class RowConvGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(row_conv, ops::RowConvOp, ops::RowConvOpMaker, row_conv_grad, - ops::RowConvGradOp); +REGISTER_OPERATOR(row_conv, ops::RowConvOp, ops::RowConvOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(row_conv_grad, ops::RowConvGradOp); REGISTER_OP_CPU_KERNEL( row_conv, ops::RowConvKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/save_load_combine_op_test.cc b/paddle/fluid/operators/save_load_combine_op_test.cc index 286f75df4ca2daff24b696c6bcb0c3df32875875..2773c32a0a10269e28c24e12527711e3c5b8f869 100644 --- a/paddle/fluid/operators/save_load_combine_op_test.cc +++ b/paddle/fluid/operators/save_load_combine_op_test.cc @@ -23,17 +23,17 @@ USE_NO_KERNEL_OP(load_combine); int* CreateForSaveCombineOp(int x, int y, const std::vector& lod_info, std::string var_name, - paddle::platform::CPUPlace& place, - paddle::framework::Scope& scope, - paddle::framework::LoD& expect_lod) { - auto var = scope.Var(var_name); + const paddle::platform::CPUPlace& place, + paddle::framework::Scope* scope, + paddle::framework::LoD* expect_lod) { + auto var = scope->Var(var_name); auto tensor = var->GetMutable(); tensor->Resize({x, y}); - expect_lod.resize(1); + expect_lod->resize(1); for (size_t i = 0; i < lod_info.size(); i++) { - expect_lod[0].push_back(lod_info[i]); + (*expect_lod)[0].push_back(lod_info[i]); } - tensor->set_lod(expect_lod); + tensor->set_lod(*expect_lod); int* expect = tensor->mutable_data(place); for (int64_t i = 0; i < tensor->numel(); ++i) { expect[i] = static_cast(i); @@ -42,17 +42,17 @@ int* CreateForSaveCombineOp(int x, int y, const std::vector& lod_info, } paddle::framework::LoDTensor* 
GeneratePlaceholderBeforeLoad( - const std::string out_var_name, paddle::framework::Scope& scope) { - auto load_var = scope.Var(out_var_name); + const std::string out_var_name, paddle::framework::Scope* scope) { + auto load_var = scope->Var(out_var_name); auto target = load_var->GetMutable(); return target; } int* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target, - paddle::framework::Scope& scope, - paddle::framework::LoD& actual_lod) { + const paddle::framework::Scope& scope, + paddle::framework::LoD* actual_lod) { int* actual = target->data(); - actual_lod = target->lod(); + *actual_lod = target->lod(); return actual; } @@ -78,26 +78,26 @@ TEST(SaveLoadCombineOp, CPU) { std::vector lod1 = {0, 1, 2, 3, 10}; int numel1 = 100; paddle::framework::LoD expect_lod1; - int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place, scope, - expect_lod1); + int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place, + &scope, &expect_lod1); std::vector lod2 = {0, 2, 5, 10}; int numel2 = 200; paddle::framework::LoD expect_lod2; - int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place, scope, - expect_lod2); + int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place, + &scope, &expect_lod2); std::vector lod3 = {0, 2, 3, 20}; int numel3 = 4000; paddle::framework::LoD expect_lod3; int* expect3 = CreateForSaveCombineOp(20, 200, lod3, "test_var3", place, - scope, expect_lod3); + &scope, &expect_lod3); std::vector lod4 = {0, 1, 20}; int numel4 = 1000; paddle::framework::LoD expect_lod4; - int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place, scope, - expect_lod4); + int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place, + &scope, &expect_lod4); // Set attributes std::string filename = "check_tensor.ls"; @@ -111,10 +111,10 @@ TEST(SaveLoadCombineOp, CPU) { save_combine_op->Run(scope, place); // Set up output vars - auto target1 = GeneratePlaceholderBeforeLoad("out_var1", scope); - auto target2 = GeneratePlaceholderBeforeLoad("out_var2", scope); - auto target3 = GeneratePlaceholderBeforeLoad("out_var3", scope); - auto target4 = GeneratePlaceholderBeforeLoad("out_var4", scope); + auto target1 = GeneratePlaceholderBeforeLoad("out_var1", &scope); + auto target2 = GeneratePlaceholderBeforeLoad("out_var2", &scope); + auto target3 = GeneratePlaceholderBeforeLoad("out_var3", &scope); + auto target4 = GeneratePlaceholderBeforeLoad("out_var4", &scope); // Run the load_combine_op auto load_combine_op = paddle::framework::OpRegistry::CreateOp( @@ -123,10 +123,10 @@ TEST(SaveLoadCombineOp, CPU) { load_combine_op->Run(scope, place); paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4; - int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, actual_lod1); - int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, actual_lod2); - int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, actual_lod3); - int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, actual_lod4); + int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, &actual_lod1); + int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, &actual_lod2); + int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, &actual_lod3); + int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, &actual_lod4); CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1); CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2); diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index 
d6fd6214711f4ee66b1daffa4db2e84aa7201e79..95b12455ea4996f00bab8a353ccd425b2c37aed1 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -102,7 +102,8 @@ $$ } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad, - ops::ScatterGradOp); +REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp); REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel); REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel); diff --git a/paddle/fluid/operators/select_op.cc b/paddle/fluid/operators/select_op.cc index c0bf0ff927481bc4da9cd6c4bb9b0c4a6841c891..876d8acf0d880a7ef806514014d297f98e04c53d 100644 --- a/paddle/fluid/operators/select_op.cc +++ b/paddle/fluid/operators/select_op.cc @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include -#include +#include // NOLINT #include #include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/executor.h" @@ -22,6 +21,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/concurrency/channel_util.h" +#include + namespace paddle { namespace operators { @@ -254,8 +255,8 @@ class SelectOp : public framework::OperatorBase { auto selectCond = std::make_shared(); std::recursive_mutex callbackMutex; - pushThreadOnChannelQueues(scope, cases, selectCond, caseToExecute, - completed, callbackMutex); + pushThreadOnChannelQueues(scope, cases, selectCond, &caseToExecute, + &completed, &callbackMutex); // TODO(thuan): Atomically unlock all channels and sleep current thread unlockChannels(channels); @@ -302,8 +303,8 @@ class SelectOp : public framework::OperatorBase { const framework::Scope *scope, std::vector> *cases, std::shared_ptr rCond, - std::atomic &caseToExecute, std::atomic &completed, - std::recursive_mutex &callbackMutex) const { + std::atomic *caseToExecute, std::atomic *completed, + std::recursive_mutex *callbackMutex) const { std::vector>::iterator it = cases->begin(); while (it != cases->end()) { std::shared_ptr c = *it; @@ -315,17 +316,17 @@ class SelectOp : public framework::OperatorBase { std::function cb = [&caseToExecute, &completed, &callbackMutex, c](framework::ChannelAction channelAction) { - std::lock_guard lock{callbackMutex}; + std::lock_guard lock{*callbackMutex}; bool canProcess = false; - if (!completed) { + if (!(*completed)) { // If the channel wasn't closed, we set the caseToExecute index // as this current case if (channelAction != framework::ChannelAction::CLOSE) { - caseToExecute = c->caseIndex; + *caseToExecute = c->caseIndex; } // This will allow our conditional variable to break out of wait - completed = true; + *completed = true; canProcess = true; } diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc index a342874f97460cf624ff0047915d33ba4161f19b..81350fee38df058d1b63eb5a8cd0b770e0626ae4 100644 --- a/paddle/fluid/operators/send_recv_op_test.cc +++ b/paddle/fluid/operators/send_recv_op_test.cc @@ -127,7 +127,7 @@ void StartServerNet(bool is_sparse) { const auto &root_block = program.Block(0); auto *optimize_block = program.AppendBlock(root_block); auto *prefetch_block = program.AppendBlock(root_block); - // X for server side 
tensors, RX for received tensers, must be of same shape. + // X for server side tensors, RX for received tensors, must be of same shape. AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block); f::AttributeMap attrs; diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc index 126753edd09e8bd0f9d5a08936afbc6326b29ace..3c21903e3a08dcfb55c6c07370a117d0ad633e69 100644 --- a/paddle/fluid/operators/sequence_concat_op.cc +++ b/paddle/fluid/operators/sequence_concat_op.cc @@ -124,9 +124,11 @@ class SequenceConcatGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_EX(sequence_concat, ops::SequenceConcatOp, - ops::SequenceConcatOpMaker, sequence_concat_grad, - ops::SequenceConcatGradOp, false); +REGISTER_OPERATOR(sequence_concat, ops::SequenceConcatOp, + ops::SequenceConcatOpMaker, + paddle::framework::DefaultGradOpDescMaker< + false> /* set false to disable empty grad */); +REGISTER_OPERATOR(sequence_concat_grad, ops::SequenceConcatGradOp); REGISTER_OP_CPU_KERNEL( sequence_concat, ops::SequenceConcatOpKernel); diff --git a/paddle/fluid/operators/sequence_conv_op.cc b/paddle/fluid/operators/sequence_conv_op.cc index ec1f3a5da8c1fc8933b3720802ea901695195dec..94f4b49b0018fdbff6e67c3c081aa5706ccb2e66 100644 --- a/paddle/fluid/operators/sequence_conv_op.cc +++ b/paddle/fluid/operators/sequence_conv_op.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/sequence_conv_op.h" +#include + namespace paddle { namespace operators { @@ -174,8 +176,9 @@ context_length, context_stride and context_start. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker, - sequence_conv_grad, ops::SequenceConvGradOp); +REGISTER_OPERATOR(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(sequence_conv_grad, ops::SequenceConvGradOp); REGISTER_OP_CPU_KERNEL( sequence_conv, diff --git a/paddle/fluid/operators/sequence_conv_op.h b/paddle/fluid/operators/sequence_conv_op.h index b59504bb9893b720247841bdad5aa577992b7fb6..3916cdbb6a69c5a18f7a21ec60bad2732b4c3e58 100644 --- a/paddle/fluid/operators/sequence_conv_op.h +++ b/paddle/fluid/operators/sequence_conv_op.h @@ -33,7 +33,6 @@ class SequenceConvKernel : public framework::OpKernel { auto filter = *context.Input("Filter"); out->mutable_data(context.GetPlace()); - context.ShareLoD("X", "Out"); int context_start = context.Attr("contextStart"); int context_length = context.Attr("contextLength"); diff --git a/paddle/fluid/operators/sequence_expand_op.cc b/paddle/fluid/operators/sequence_expand_op.cc index ae52849162ae4d78cc69ddbb98f58059f55683cb..84a35d7172a567a3f6505559fa45a32290288533 100644 --- a/paddle/fluid/operators/sequence_expand_op.cc +++ b/paddle/fluid/operators/sequence_expand_op.cc @@ -200,8 +200,10 @@ class SequenceExpandOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker, - sequence_expand_grad, ops::SequenceExpandOpGrad); +REGISTER_OPERATOR(sequence_expand, ops::SequenceExpandOp, + ops::SequenceExpandOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(sequence_expand_grad, ops::SequenceExpandOpGrad); REGISTER_OP_CPU_KERNEL( sequence_expand, ops::SequenceExpandKernel, diff --git 
a/paddle/fluid/operators/sequence_slice_op.cc b/paddle/fluid/operators/sequence_slice_op.cc index d09e5bca56b226100d2d0cf3a030c77703bfa76e..7cd620af07fa9b5f8fcee3c0f88207ef2800c4a1 100644 --- a/paddle/fluid/operators/sequence_slice_op.cc +++ b/paddle/fluid/operators/sequence_slice_op.cc @@ -120,8 +120,10 @@ NOTE: The first dimension size of input, the size of offset and Length, should b } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(sequence_slice, ops::SequenceSliceOp, ops::SequenceSliceOpMaker, - sequence_slice_grad, ops::SequenceSliceGradOp); +REGISTER_OPERATOR(sequence_slice, ops::SequenceSliceOp, + ops::SequenceSliceOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(sequence_slice_grad, ops::SequenceSliceGradOp); REGISTER_OP_CPU_KERNEL( sequence_slice, ops::SequenceSliceOpKernel); diff --git a/paddle/fluid/operators/sequence_slice_op.cu b/paddle/fluid/operators/sequence_slice_op.cu old mode 100755 new mode 100644 diff --git a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc index 5661f4b42f37fed7f589c515e25fd66cfcede2c7..0ddacb57106c090e8f4f9350a65a30ca102f8e0a 100644 --- a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc @@ -99,7 +99,7 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(sequence_softmax, CUDNN, ::paddle::platform::CUDAPlace, ops::SequenceSoftmaxCUDNNKernel, - ops::SequenceSoftmaxCUDNNKernel) + ops::SequenceSoftmaxCUDNNKernel); REGISTER_OP_KERNEL(sequence_softmax_grad, CUDNN, ::paddle::platform::CUDAPlace, ops::SequenceSoftmaxGradCUDNNKernel, - ops::SequenceSoftmaxGradCUDNNKernel) + ops::SequenceSoftmaxGradCUDNNKernel); diff --git a/paddle/fluid/operators/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_softmax_op.cc index d2c1317bef95deca36f7f4198407f5350a1be035..a0d47c12ba606eb62bbbea4d5ea793ce915e8100 100644 --- a/paddle/fluid/operators/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_softmax_op.cc @@ -155,9 +155,10 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp, - ops::SequenceSoftmaxOpMaker, sequence_softmax_grad, - ops::SequenceSoftmaxGradOp); +REGISTER_OPERATOR(sequence_softmax, ops::SequenceSoftmaxOp, + ops::SequenceSoftmaxOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(sequence_softmax_grad, ops::SequenceSoftmaxGradOp); REGISTER_OP_CPU_KERNEL( sequence_softmax, ops::SequenceSoftmaxKernel, diff --git a/paddle/fluid/operators/sequence_softmax_op.cu.cc b/paddle/fluid/operators/sequence_softmax_op.cu.cc index 57adea3a1b9dbcbb5787d005e4d3ec595f61d4b2..397df75415691e4f53bc399cd1868c3e37bc9110 100644 --- a/paddle/fluid/operators/sequence_softmax_op.cu.cc +++ b/paddle/fluid/operators/sequence_softmax_op.cu.cc @@ -18,7 +18,7 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( sequence_softmax, ops::SequenceSoftmaxKernel, - ops::SequenceSoftmaxKernel) + ops::SequenceSoftmaxKernel); REGISTER_OP_CUDA_KERNEL( sequence_softmax_grad, ops::SequenceSoftmaxGradKernel, diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h index cfc8793e1e05a7d4fa9207ae77a664b391b9a986..f3e88b0a0b05ef792b2cc8e880bdfddb6e6124d1 100644 --- a/paddle/fluid/operators/sgd_op.h +++ b/paddle/fluid/operators/sgd_op.h @@ -107,7 +107,9 @@ class 
SGDOpKernel : public framework::OpKernel { for (size_t i = 0; i < grad.rows().size(); i++) { PADDLE_ENFORCE(grad.rows()[i] < grad.height(), "Input rows index should less than height"); - int64_t id_index = param.index(grad.rows()[i]); + int64_t id_index = param.Index(grad.rows()[i]); + PADDLE_ENFORCE_GE(id_index, static_cast(0), + "id should be in the table"); for (size_t j = 0; j < grad_row_width; j++) { out_data[id_index * grad_row_width + j] -= lr[0] * grad_data[i * grad_row_width + j]; diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 7b93f19bb2f7102824852aa181e3728f79025121..5db77d0493fc0abaa0a696cb559c3ca0534d4101 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -135,11 +135,12 @@ However the output only shares the LoD with input `X`. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsOp, - ops::SigmoidCrossEntropyWithLogitsOpMaker, - sigmoid_cross_entropy_with_logits_grad, - ops::SigmoidCrossEntropyWithLogitsGradOp); +REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsOp, + ops::SigmoidCrossEntropyWithLogitsOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, + ops::SigmoidCrossEntropyWithLogitsGradOp); REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsKernel< paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc index 658eb0195212cc3038fce6aab0ec3804efc59edf..322581fdef27b12a06704abc9c3b8772adf002f2 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op.cc +++ b/paddle/fluid/operators/smooth_l1_loss_op.cc @@ -132,8 +132,9 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker, - smooth_l1_loss_grad, ops::SmoothL1LossGradOp); +REGISTER_OPERATOR(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(smooth_l1_loss_grad, ops::SmoothL1LossGradOp); REGISTER_OP_CPU_KERNEL( smooth_l1_loss, ops::SmoothL1LossKernel); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index e1f286f9ba42ff22fffbfc012832dd751a37c1d0..2741ba95bcfc1db3d74e0fb8c3f6fddf7d5a2caa 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -160,8 +160,9 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; -REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, softmax_grad, - ops::SoftmaxOpGrad); +REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); REGISTER_OP_CPU_KERNEL( softmax, ops::SoftmaxKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/split_byref_op.cc b/paddle/fluid/operators/split_byref_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..7413ce3e9ce60ed733bb4d27e9ec205e5f0a7e1b --- /dev/null +++ b/paddle/fluid/operators/split_byref_op.cc @@ -0,0 +1,101 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/split_byref_op.h" +#include "paddle/fluid/operators/split_op.h" + +namespace paddle { +namespace operators { +using framework::Tensor; + +class SplitByrefOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SplitOp should not be null."); + PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL, + "Outputs(Out) of SplitOp should not be empty."); + auto in_dims = ctx->GetInputDim("X"); + auto outs_names = ctx->Outputs("Out"); + size_t num = static_cast<size_t>(ctx->Attrs().Get<int>("num")); + std::vector<int> sections = static_cast<std::vector<int>>( + ctx->Attrs().Get<std::vector<int>>("sections")); + const size_t outs_number = outs_names.size(); + std::vector<framework::DDim> outs_dims; + outs_dims.reserve(outs_number); + + if (num > 0) { + int64_t in_axis_dim = in_dims[0]; + PADDLE_ENFORCE_EQ(in_axis_dim % num, 0, + "tensor split does not result" + " in an equal division"); + size_t out_axis_dim = in_axis_dim / num; + for (size_t i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[0] = out_axis_dim; + outs_dims.push_back(dim); + } + } else if (sections.size() > 0) { + PADDLE_ENFORCE_EQ(sections.size(), outs_number, + "tensor split sections size " + "should be equal to output size."); + for (size_t i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[0] = sections[i]; + outs_dims.push_back(dim); + } + } + ctx->SetOutputsDim("Out", outs_dims); + } +}; + +class SplitByrefOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SplitByrefOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input tensor of the split operator."); + AddOutput("Out", "(Tensor) Output tensors of the split operator.") + .AsDuplicable(); + AddComment(R"DOC( +SplitByref operator + +Split the source tensor into several tensors along axis 0. No copy is +performed by this operator; the output tensors share the same blocks of +memory as the input. +)DOC"); + AddAttr<std::vector<int>>("sections", + "(vector<int>) " + "the length of each output along the " + "specified axis.") + .SetDefault(std::vector<int>{}); + AddAttr<int>("num", + "(int, default 0) " + "Number of sub-tensors. This must evenly divide " + "Input.dims()[axis]") + .SetDefault(0); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +// NOTE: concat op default axis must be 0!
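+// The gradient of split_byref is built by ops::SplitGradMaker (see
+// split_op.h below), which emits a concat of the output gradients along
+// axis 0, so the CPU-only concat registration is pulled in here to make
+// that gradient op available.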
+USE_CPU_ONLY_OP(concat); + +REGISTER_OPERATOR(split_byref, ops::SplitByrefOp, ops::SplitByrefOpMaker, + ops::SplitGradMaker); +REGISTER_OP_CPU_KERNEL( + split_byref, + ops::SplitByrefOpKernel<paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/split_byref_op.cu.cc b/paddle/fluid/operators/split_byref_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..5ee6186f3541b7dcb845ce0c6d28081685925da0 --- /dev/null +++ b/paddle/fluid/operators/split_byref_op.cu.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/split_byref_op.h" +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + split_byref, + ops::SplitByrefOpKernel<paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/split_byref_op.h b/paddle/fluid/operators/split_byref_op.h new file mode 100644 index 0000000000000000000000000000000000000000..fedd7218dd6cc9481e94a92a3820cafbe4157bd0 --- /dev/null +++ b/paddle/fluid/operators/split_byref_op.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include <vector> +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template <typename DeviceContext, typename T> +class SplitByrefOpKernel : public framework::OpKernel<T> { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input<framework::Tensor>("X"); + auto outs = ctx.MultiOutput<framework::Tensor>("Out"); + auto place = ctx.GetPlace(); + + size_t row_offset = 0; + for (size_t i = 0; i < outs.size(); ++i) { + // NOTE: no need to call mutable_data here to allocate memory.
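+ // Slice() returns a view that shares the input tensor's underlying
+ // allocation, so each output below aliases rows [row_offset,
+ // row_offset + out->dims()[0]) of X instead of owning a copy.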
+ auto* out = outs[i]; + VLOG(3) << "splitting by ref: " << row_offset << " " << out->dims()[0]; + *out = in->Slice(row_offset, row_offset + out->dims()[0]); + row_offset += out->dims()[0]; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index e745509ec8c1f2ec305d7d4aabfdd43d847124b5..a4398df36bcc2d3b8bbe8949f27f5d6508861d95 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -108,21 +108,6 @@ Example: } }; -class SplitGradMaker : public framework::SingleGradOpDescMaker { - public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - std::unique_ptr<framework::OpDesc> Apply() const override { - auto op = new framework::OpDesc(); - op->SetType("concat"); - op->SetInput("X", OutputGrad("Out")); - op->SetOutput("Out", InputGrad("X")); - op->SetAttrMap(Attrs()); - return std::unique_ptr<framework::OpDesc>(op); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h index e2c41f44ab3ea3c42837974dae749278c9356ba5..f0c417c70521b1bb3816f884d6ab7393473999e4 100644 --- a/paddle/fluid/operators/split_op.h +++ b/paddle/fluid/operators/split_op.h @@ -44,5 +44,20 @@ class SplitOpKernel : public framework::OpKernel<T> { } }; +class SplitGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<framework::OpDesc> Apply() const override { + auto op = new framework::OpDesc(); + op->SetType("concat"); + op->SetInput("X", OutputGrad("Out")); + op->SetOutput("Out", InputGrad("X")); + op->SetAttrMap(Attrs()); + return std::unique_ptr<framework::OpDesc>(op); + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc index 8c55b4ebbc88f696e99b1194055bed3b0d0b3f0b..1cada95501a76da27081d533b451ce7f6a384a49 100644 --- a/paddle/fluid/operators/spp_op.cc +++ b/paddle/fluid/operators/spp_op.cc @@ -92,7 +92,9 @@ class SppOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(spp, ops::SppOp, ops::SppOpMaker, spp_grad, ops::SppOpGrad); +REGISTER_OPERATOR(spp, ops::SppOp, ops::SppOpMaker, + paddle::framework::DefaultGradOpDescMaker<true>); +REGISTER_OPERATOR(spp_grad, ops::SppOpGrad); REGISTER_OP_CPU_KERNEL( spp, ops::SppKernel<paddle::platform::CPUDeviceContext, float>, ops::SppKernel<paddle::platform::CPUDeviceContext, double>); diff --git a/paddle/fluid/operators/squared_l2_distance_op.cc b/paddle/fluid/operators/squared_l2_distance_op.cc index 1c5e87040a8dd74b98d8e31bfe351ea256e01f15..c32f575b541d6a6441cc1b6e999496eacef421a5 100644 --- a/paddle/fluid/operators/squared_l2_distance_op.cc +++ b/paddle/fluid/operators/squared_l2_distance_op.cc @@ -109,9 +109,10 @@ class SquaredL2DistanceGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(squared_l2_distance, ops::SquaredL2DistanceOp, - ops::SquaredL2DistanceOpMaker, squared_l2_distance_grad, - ops::SquaredL2DistanceGradOp); +REGISTER_OPERATOR(squared_l2_distance, ops::SquaredL2DistanceOp, + ops::SquaredL2DistanceOpMaker, + paddle::framework::DefaultGradOpDescMaker<true>); +REGISTER_OPERATOR(squared_l2_distance_grad, ops::SquaredL2DistanceGradOp); REGISTER_OP_CPU_KERNEL( squared_l2_distance, ops::SquaredL2DistanceKernel<paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/squared_l2_norm_op.cc b/paddle/fluid/operators/squared_l2_norm_op.cc index
b64df2a218860be3adb3954e07b036c05bf05c8e..4ce51259da3530367d91b5da34f06fbe5d969fce 100644 --- a/paddle/fluid/operators/squared_l2_norm_op.cc +++ b/paddle/fluid/operators/squared_l2_norm_op.cc @@ -67,8 +67,10 @@ $$Out = \sum_{i} X_{i}^2$$ } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(squared_l2_norm, ops::SquaredL2NormOp, ops::SquaredL2NormOpMaker, - squared_l2_norm_grad, ops::SquaredL2NormGradOp); +REGISTER_OPERATOR(squared_l2_norm, ops::SquaredL2NormOp, + ops::SquaredL2NormOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(squared_l2_norm_grad, ops::SquaredL2NormGradOp); REGISTER_OP_CPU_KERNEL( squared_l2_norm, ops::SquaredL2NormKernel); diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 4aea9cd65bed615c84c95d891a0a4092678e1444..3555cb68cab97c0cf983f1173c3b4ca9307e4f7d 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -118,8 +118,9 @@ class TransposeOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(transpose, ops::TransposeOp, ops::TransposeOpMaker, transpose_grad, - ops::TransposeOpGrad); +REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad); REGISTER_OP_CPU_KERNEL( transpose, ops::TransposeKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc index 31859fd1d70dc6e6387258cd5f7412e78a302567..b3cd87efa21115565b32659cb35fee4b5bed2d4f 100644 --- a/paddle/fluid/operators/unpool_op.cc +++ b/paddle/fluid/operators/unpool_op.cc @@ -132,8 +132,9 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, - ops::UnpoolOpGrad); +REGISTER_OPERATOR(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(unpool_grad, ops::UnpoolOpGrad); REGISTER_OP_CPU_KERNEL( unpool, ops::UnpoolKernel, ops::UnpoolKernel); diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc index 940bf4fe7baa6a01a2143374b502c61d0b55fd77..6835a5dd6286ece20c4ce6f3e951ed4b0057012c 100644 --- a/paddle/fluid/operators/warpctc_op.cc +++ b/paddle/fluid/operators/warpctc_op.cc @@ -132,8 +132,9 @@ class WarpCTCGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker, warpctc_grad, - ops::WarpCTCGradOp); +REGISTER_OPERATOR(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(warpctc_grad, ops::WarpCTCGradOp); REGISTER_OP_CPU_KERNEL( warpctc, ops::WarpCTCKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc index 8b62b242cf8745378eb216db10605388b294ca75..710cc9fc2e716da2e4fd067562a34d312e48b1a1 100644 --- a/paddle/fluid/operators/while_op.cc +++ b/paddle/fluid/operators/while_op.cc @@ -288,7 +288,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { while_grad->SetInput(framework::GradVarName(kOutputs), output_grads_list); while_grad->SetAttrMap(this->Attrs()); - while_grad->SetBlockAttr(kStepBlock, *grad_block); + while_grad->SetBlockAttr(kStepBlock, grad_block); // record the original 
output gradient names, since the gradient name of // while operator could be renamed. while_grad->SetAttr("original_output_grad", output_grads_list); diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 917bdc64abf608b8ade70c47f76a8adffb32046a..598fd4d419078a973647f2f8f20e8a12c8115a8b 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -12,7 +12,7 @@ add_custom_command(TARGET profiler_py_proto POST_BUILD WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if(WITH_GPU) - cc_library(enforce SRCS enforce.cc DEPS) + nv_library(enforce SRCS enforce.cc) else() cc_library(enforce SRCS enforce.cc) endif() diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index aaebeb1353a13ab16fcf98f10da59d41fd2f5b48..4cee93f3a4224cb97327254cd1679021d197a1b1 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -127,11 +127,24 @@ void GpuMemcpyAsync(void *dst, const void *src, size_t count, "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync"); } -void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, - size_t count, cudaStream_t stream) { +void GpuMemcpySync(void *dst, const void *src, size_t count, + enum cudaMemcpyKind kind) { + PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind), + "cudaMemcpy failed in paddle::platform::GpuMemcpySync"); +} + +void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, + int src_device, size_t count, cudaStream_t stream) { PADDLE_ENFORCE( cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream), - "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer"); + "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync"); +} + +void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, + int src_device, size_t count) { + PADDLE_ENFORCE( + cudaMemcpyPeer(dst, dst_device, src, src_device, count), + "cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync"); } void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) { diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index 36345e17406e22970806fa274d5a73a703517c43..f4640d3eaa2165c35e8e14690d83e9e7e7168c0b 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -57,9 +57,17 @@ size_t GpuMaxChunkSize(); void GpuMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream); -//! Copy memory from one device to another device. -void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, - size_t count, cudaStream_t stream); +//! Copy memory from address src to dst synchronously. +void GpuMemcpySync(void *dst, const void *src, size_t count, + enum cudaMemcpyKind kind); + +//! Copy memory from one device to another device asynchronously. +void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, + int src_device, size_t count, cudaStream_t stream); + +//! Copy memory from one device to another device synchronously. +void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, + int src_device, size_t count); //! 
Set memory dst with value count size asynchronously void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream); diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index de8056237fb022f62488e0fedf9a4f67e4601072..23f1d615daab91f0e4b353bc7d9a3ca7f5cec5ae 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -13,9 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include - -#include "mkldnn/include/mkldnn.hpp" #include "paddle/fluid/framework/operator.h" namespace paddle { @@ -34,6 +33,32 @@ typedef std::unique_ptr MKLDNNMemoryPtr; typedef std::unique_ptr MKLDNNPrimitivePtr; typedef std::unique_ptr MKLDNNPrimitiveDescPtr; +template +void* to_void_cast(const Type* t) { + return static_cast(const_cast(t)); +} + +template +using tf_desc = typename Type::desc; + +template +using tf_pd = typename Type::primitive_desc; + +template +std::shared_ptr> MKLDNNFwdPrimitiveDesc(const Engine& e, + Args&&... args) { + auto desc = tf_desc(mkldnn::prop_kind::forward, (args)...); + auto pd = new tf_pd(desc, e); + return std::shared_ptr>(pd); +} + +template +tf_pd MKLDNNBwdPrimitiveDesc(const Engine& e, const Primitive& p, + Args&&... args) { + auto desc = tf_desc(args...); + return tf_pd(desc, e, p); +} + inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector& dims, mkldnn::memory::data_type data_type, mkldnn::memory::format format) { diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 93533e5c9d88a9113d4d3eacb01901a8c14b6324..6471eb3ab7bf05365c0bb2bf68bb74ef9044c527 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -127,6 +127,9 @@ void BindProgramDesc(pybind11::module *m) { .def("block", &pd::ProgramDesc::MutableBlock, pybind11::return_value_policy::reference) .def("num_blocks", &pd::ProgramDesc::Size) + .def("flush", &pd::ProgramDesc::Flush) + .def("get_feed_target_names", &pd::ProgramDesc::GetFeedTargetNames) + .def("get_fetch_target_names", &pd::ProgramDesc::GetFetchTargetNames) .def("serialize_to_string", SerializeMessage) .def("parse_from_string", [](pd::ProgramDesc &program_desc, const std::string &data) { @@ -299,6 +302,7 @@ void BindOpDesc(pybind11::module *m) { .def("check_attrs", &pd::OpDesc::CheckAttrs) .def("infer_shape", &pd::OpDesc::InferShape) .def("infer_var_type", &pd::OpDesc::InferVarType) + .def("set_is_target", &pd::OpDesc::SetIsTarget) .def("serialize_to_string", SerializeMessage) .def("block", &pd::OpDesc::Block, pybind11::return_value_policy::reference); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 19bd30d9665dc1e8f9d475868cabbf14c8847352..b20b514fcdd0b41fefa0933bc2d22645e7d4b6d6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -33,6 +33,7 @@ limitations under the License. */ #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" @@ -294,7 +295,7 @@ All parameter, weight, gradient are variables in Paddle. 
const std::vector<std::array<size_t, 2>> &targets) { ProgramDesc prog_with_targets(origin); for (const auto &t : targets) { - prog_with_targets.MutableBlock(t[0])->Op(t[1])->MarkAsTarget(); + prog_with_targets.MutableBlock(t[0])->Op(t[1])->SetIsTarget(true); } proto::ProgramDesc pruned_desc; Prune(*prog_with_targets.Proto(), &pruned_desc); @@ -461,6 +462,9 @@ All parameter, weight, gradient are variables in Paddle. self.back().set_lod(t.lod()); }); + m.def("IsInplace", + [](std::string op) -> bool { return operators::IsInplace(op); }); + m.def("op_support_gpu", OpSupportGPU); #ifdef PADDLE_WITH_CUDA m.def("get_cuda_device_count", platform::GetCUDADeviceCount); @@ -498,11 +502,11 @@ All parameter, weight, gradient are variables in Paddle. const std::unordered_set<std::string> &bcast_vars, const ProgramDesc &main_program, const std::string &loss_var_name, Scope *scope, std::vector<Scope *> &local_scopes, - bool allow_op_delay) { - new (&self) - ParallelExecutor(num_threads, use_event, places, params, - bcast_vars, main_program, loss_var_name, - scope, local_scopes, allow_op_delay); + bool allow_op_delay, bool customize_loss_grad) { - new (&self) ParallelExecutor(num_threads, use_event, places, + params, bcast_vars, main_program, + loss_var_name, scope, local_scopes, + allow_op_delay, customize_loss_grad); }) .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs) // NOTE: even we return a vec* to Python use reference policy. diff --git a/paddle/fluid/recordio/README.md b/paddle/fluid/recordio/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ef99c0cf0fa71d807a95898454d8fabb287324e9 --- /dev/null +++ b/paddle/fluid/recordio/README.md @@ -0,0 +1,13 @@ +## Background + +The RecordIO file format is a container for records. This package is a C++ implementation of https://github.com/paddlepaddle/recordio, which originates from https://github.com/wangkuiyi/recordio. + +## Fault-tolerant Writing + +RecordIO was initially designed within Google for logging, so it groups records into *chunks*, whose header contains an MD5 hash of the chunk. A process that writes logs is supposed to call the Writer interface to add records. Once the writer accumulates a handful of them, it groups them into a chunk, puts the MD5 into the chunk header, and appends the chunk to the file. In the event the process crashes unexpectedly, the last chunk in the RecordIO file could be incomplete/corrupt. The RecordIO reader is able to recover from these errors when the process restarts by identifying incomplete chunks and skipping over them. + +## Reading Ranges + +A side effect of chunks is that they make it easy to index records while reading, which allows us to read a range of successive records. This is good for distributed log processing, where each MapReduce task handles only part of the records in a big RecordIO file. + +The procedure that creates the index starts from reading the header of the first chunk. It indexes the offset (0) and the size of the chunk, and skips to the header of the next chunk by calling the `fseek` API. Please be aware that most distributed filesystems and all POSIX-compatible local filesystems provide `fseek` and make sure that `fseek` runs much faster than `fread`. This procedure generates a map from chunks to their offsets, which allows the readers to locate and read a range of records.
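To make the index-building procedure concrete, here is a minimal Python sketch of the chunk scan. It is illustrative only: it assumes a hypothetical fixed-size chunk header whose last four bytes hold the chunk body size as a little-endian 32-bit integer, which is not the exact header layout used by the recordio package.

```python
import struct

HEADER_SIZE = 20  # assumed fixed-size chunk header; the real layout differs


def build_chunk_index(path):
    """Map every chunk in a RecordIO-style file to its (offset, size)."""
    index = []
    with open(path, "rb") as f:
        offset = 0
        while True:
            header = f.read(HEADER_SIZE)
            if len(header) < HEADER_SIZE:
                break  # EOF, or a trailing incomplete chunk to skip over
            # Assumption: the last 4 header bytes hold the chunk body size.
            (chunk_size,) = struct.unpack("<I", header[-4:])
            index.append((offset, chunk_size))
            f.seek(chunk_size, 1)  # the fseek step: skip the body unread
            offset = f.tell()
    return index
```

Because only headers are read and chunk bodies are skipped with `seek`, the scan costs one small read per chunk, which is why `fseek` being much faster than `fread` matters here.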
diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp index e3e4457f9b72c5edb8082fdf378ae662b4aee42f..b4215bb307cc31ce64bb724986b88fdc20bbbf45 100644 --- a/paddle/gserver/dataproviders/PyDataProvider2.cpp +++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp @@ -390,9 +390,7 @@ private: if (this->loadThread_) { // wait poolActualSize < poolSize; std::unique_lock l(mtx_); - pushCV_.wait(l, [this, additionalBatchSize] { - return this->poolActualSize_ < poolSize_; - }); + pushCV_.wait(l, [this] { return this->poolActualSize_ < poolSize_; }); } { diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp index 3f46cc98cdef17d14c253c732814bcba005fd667..b8d4d28f0f309a5f7348605e8d35e160e7fd5552 100644 --- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp @@ -52,7 +52,7 @@ MultiGradientMachine::MultiGradientMachine(const ModelConfig& config, } else { numDevices_ = 0; } - ParamInitCallback mainParamInitCb = [this](int paramId, Parameter* para) { + ParamInitCallback mainParamInitCb = [](int paramId, Parameter* para) { // only create buf for CPU parameters // GPU parameters will be created in each thread if (para->useGpu()) return; diff --git a/paddle/gserver/layers/RecurrentLayerGroup.cpp b/paddle/gserver/layers/RecurrentLayerGroup.cpp index 27e8b5868e6d85cf004945d7cb086d6d57487f9f..44b57185c5a5fa7703ca477b990a73cdad2c2aa1 100644 --- a/paddle/gserver/layers/RecurrentLayerGroup.cpp +++ b/paddle/gserver/layers/RecurrentLayerGroup.cpp @@ -72,7 +72,7 @@ void RecurrentLayerGroup::initSubNetwork( setNeedGradient(true); network_.reset(new RecurrentGradientMachine(config_.name(), rootNetwork)); - ParamInitCallback cb = [this, rootNetwork](int paramId, Parameter* para) { + ParamInitCallback cb = [rootNetwork](int paramId, Parameter* para) { para->enableSharedType( PARAMETER_VALUE, rootNetwork->getParameters()[paramId]->getBuf(PARAMETER_VALUE), diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index cfdaf8998b04e0307bc442dec0df734452634c67..94522f718a0c19bfc704ca92eddef5c5a9cb6919 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -325,12 +325,12 @@ void Argument::concat(const std::vector& args, ->copyFrom(*src->subVec(srcStartRow, size), stream); }; - auto copyStrs = [batchSize, stream](SVectorPtr& dst, - const SVectorPtr& src, - int desStartRow, - int srcStartRow, - int size, - bool useGpu) { + auto copyStrs = [batchSize](SVectorPtr& dst, + const SVectorPtr& src, + int desStartRow, + int srcStartRow, + int size, + bool useGpu) { if (!src) { dst.reset(); return; @@ -413,7 +413,7 @@ void Argument::concat(const std::vector& args, dst->subVec(startRow, src->getSize())->copyFrom(*src, stream); }; - auto copyStrs = [batchSize, stream]( + auto copyStrs = [batchSize]( SVectorPtr& dst, const SVectorPtr& src, int startRow, bool useGpu) { if (!src) { dst.reset(); diff --git a/paddle/parameter/AverageOptimizer.cpp b/paddle/parameter/AverageOptimizer.cpp index 75998d81dd9c8be35fe45e903dc1cd69068f83c6..82a7fed6c6451b8908851f2d039f17b9dc513818 100644 --- a/paddle/parameter/AverageOptimizer.cpp +++ b/paddle/parameter/AverageOptimizer.cpp @@ -81,9 +81,9 @@ ParameterOptimizer::TraverseCallback AverageOptimizer::needSpecialTraversal( if (numUpdates_ % kMaxNumAccumulates == 0) { // Move the sum to a different buffer to avoid loss of precision // due to too many sums. 
- callbacks.emplace_back([this](const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) { + callbacks.emplace_back([](const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) { vecs[PARAMETER_SUM2]->add(*vecs[PARAMETER_SUM1]); vecs[PARAMETER_SUM1]->zeroMem(); }); @@ -94,9 +94,9 @@ ParameterOptimizer::TraverseCallback AverageOptimizer::needSpecialTraversal( if (auto callback = this->startCatchUpWith()) { callbacks.emplace_back(callback); } - callbacks.emplace_back([this](const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) { + callbacks.emplace_back([](const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) { vecs[PARAMETER_SUM3]->add(*vecs[PARAMETER_SUM1], *vecs[PARAMETER_SUM2]); vecs[PARAMETER_SUM1]->zeroMem(); vecs[PARAMETER_SUM2]->zeroMem(); diff --git a/paddle/parameter/FirstOrderOptimizer.cpp b/paddle/parameter/FirstOrderOptimizer.cpp index 5e280bcac3389179181d2eda58c08e579e867ecc..182e833405e8f8bc3a4c9ffddbf628040f9cceaa 100644 --- a/paddle/parameter/FirstOrderOptimizer.cpp +++ b/paddle/parameter/FirstOrderOptimizer.cpp @@ -145,9 +145,9 @@ AdagradParameterOptimizer::needSpecialTraversal( if (numUpdates_ % kMaxNumAccumulates == 0) { // Move the sum to a different buffer to avoid loss of precision // due to too many sums. - return [this](const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) { + return [](const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) { vecs[PARAMETER_GRADIENT_SQURESUM]->add( *vecs[PARAMETER_GRADIENT_SQURESUM1]); vecs[PARAMETER_GRADIENT_SQURESUM1]->zeroMem(); diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 2b2a904974f3756576fb47851400e344c9357c57..94628270228b9e7fd32405bdcb5e11c163ba4791 100755 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -155,7 +155,7 @@ EOF function gen_dockerfile() { # Set BASE_IMAGE according to env variables if [[ ${WITH_GPU} == "ON" ]]; then - BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04" + BASE_IMAGE="nvidia/cuda:8.0-cudnn7-runtime-ubuntu16.04" else BASE_IMAGE="ubuntu:16.04" fi @@ -164,7 +164,7 @@ function gen_dockerfile() { DOCKERFILE_CUDNN_DSO="" if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}" - DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so" + DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/x86_64-linux-gnu/libcudnn.so" fi cat <> /paddle/build/Dockerfile <> /paddle/build/Dockerfile < ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so +set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so) +add_custom_command(OUTPUT ${FLUID_CORE} + COMMAND cmake -E copy $ ${FLUID_CORE} DEPENDS paddle_pybind) -add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so) +add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE}) add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp @@ -61,7 +61,7 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python - DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto profiler_py_proto ${PY_FILES} 
${external_project_dependencies} ${COPY_PADDLE_MASTER}) + DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS}) if(NOT WITH_FLUID_ONLY) diff --git a/python/paddle/dataset/uci_housing.py b/python/paddle/dataset/uci_housing.py index 6a56e9d5563c76ab6f524ccea9191693dc227010..fbfa477d055eb5f484989eacce38cee8d617d729 100644 --- a/python/paddle/dataset/uci_housing.py +++ b/python/paddle/dataset/uci_housing.py @@ -19,7 +19,11 @@ https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and parse training set and test set into paddle reader creators. """ +import os + import numpy as np +import tempfile +import tarfile import os import paddle.dataset.common @@ -34,8 +38,9 @@ feature_names = [ UCI_TRAIN_DATA = None UCI_TEST_DATA = None -URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar' -MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b' + +FLUID_URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fluid/fit_a_line.fluid.tar' +FLUID_MD5_MODEL = '6e6dd637ccd5993961f68bfbde46090b' def feature_range(maximums, minimums): @@ -113,6 +118,29 @@ def test(): return reader +def fluid_model(): + parameter_tar = paddle.dataset.common.download( + FLUID_URL_MODEL, 'uci_housing', FLUID_MD5_MODEL, 'fit_a_line.fluid.tar') + + tar = tarfile.TarFile(parameter_tar, mode='r') + dirpath = tempfile.mkdtemp() + tar.extractall(path=dirpath) + + return dirpath + + +def predict_reader(): + """ + It returns just one tuple data to do inference. + + :return: one tuple data + :rtype: tuple + """ + global UCI_TEST_DATA + load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5)) + return (UCI_TEST_DATA[0][:-1], ) + + def fetch(): paddle.dataset.common.download(URL, 'uci_housing', MD5) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index e9ca0d45f98bd27692a15060310d4e8cd1e8b181..e2502990d5b78eb0db7bdfd0c8ef9fb6688016df 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -107,7 +107,8 @@ def __bootstrap__(): os.environ['OMP_NUM_THREADS'] = str(num_threads) read_env_flags = [ - 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir' + 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', + 'eager_delete_scope' ] if core.is_compiled_with_cuda(): read_env_flags += ['fraction_of_gpu_memory_to_use'] diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index aa15392d7e4901e8ee23ad5b4370542232adc2a5..d07e0f696e79cfb98efc09a9f40d7961678b6af4 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -18,7 +18,7 @@ import math import distributed_splitter as splitter import framework -from framework import Program, default_main_program, Variable +from framework import Program, default_main_program, Variable, Parameter from . import core LOOKUP_TABLE_TYPE = "lookup_table" @@ -222,8 +222,14 @@ class DistributeTranspiler: # step1: For large parameters and gradients, split them into smaller # blocks. 
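+ # Parameters explicitly marked trainable=False carry no gradient to
+ # aggregate, so the loop below filters them out before splitting; only
+ # trainable parameter/gradient pairs are distributed to the pservers.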
- param_list = [pg[0] for pg in params_grads] - grad_list = [pg[1] for pg in params_grads] + param_list = [] + grad_list = [] + for p, g in params_grads: + # skip parameter marked not trainable + if type(p) == Parameter and p.trainable == False: + continue + param_list.append(p) + grad_list.append(g) if self.has_distributed_lookup_table: param_list = [ @@ -362,21 +368,19 @@ class DistributeTranspiler: else: recv_inputs.append(single_trainer_var) - # step3 - optimize_block = pserver_program.create_block(0) - # step 4 + # step 3 # Create a union-find data structure from optimize ops, # If two ops are connected, we could add these two ops # into one set. ufind = self._create_ufind(self.optimize_ops) - # step 4.2 + # step 3.2 # Iterate through the ops and append optimize op which # located on current pserver opt_op_on_pserver = [] for _, op in enumerate(self.optimize_ops): if self._is_opt_op(op) and self._is_opt_op_on_pserver(endpoint, op): opt_op_on_pserver.append(op) - # step 4.3 + # step 3.3 # Iterate through the ops, and if an op and the optimize ops # which located on current pserver are in one set, then # append it into the sub program. @@ -409,28 +413,30 @@ class DistributeTranspiler: else: self._append_pserver_non_opt_ops(block, op) - append_block = optimize_block # append lr decay ops to the child block if exists lr_ops = self._get_lr_ops() if len(lr_ops) > 0: + lr_decay_block = pserver_program.create_block( + pserver_program.num_blocks - 1) for _, op in enumerate(lr_ops): - self._append_pserver_non_opt_ops(append_block, op) - - append_block = pserver_program.create_block(append_block.idx) + self._append_pserver_non_opt_ops(lr_decay_block, op) # append op to the current block - per_opt_block = append_block - for _, opt_op in enumerate(opt_op_on_pserver): + pre_block_idx = pserver_program.num_blocks - 1 + for idx, opt_op in enumerate(opt_op_on_pserver): + per_opt_block = pserver_program.create_block(pre_block_idx) for _, op in enumerate(self.optimize_ops): # optimizer is connected to itself - if ufind.is_connected(op, opt_op) and \ - op not in global_ops: + if ufind.is_connected(op, opt_op) and op not in global_ops: __append_optimize_op__(op, per_opt_block) - per_opt_block = pserver_program.create_block(append_block.idx) # append global ops - for glb_op in global_ops: - __append_optimize_op__(glb_op, per_opt_block) + opt_state_block = None + if global_ops: + opt_state_block = pserver_program.create_block( + pserver_program.num_blocks - 1) + for glb_op in global_ops: + __append_optimize_op__(glb_op, opt_state_block) # NOT USED: single block version: # @@ -444,10 +450,10 @@ class DistributeTranspiler: prefetch_block = None if self.has_distributed_lookup_table: pserver_index = self.pserver_endpoints.index(endpoint) - self._create_table_optimize_block(pserver_index, pserver_program, - append_block) + table_opt_block = self._create_table_optimize_block( + pserver_index, pserver_program, pre_block_idx) prefetch_block = self._create_prefetch_block( - pserver_index, pserver_program, optimize_block) + pserver_index, pserver_program, table_opt_block) # NOTE: if has_distributed_lookup_table is False, then prefetch_block will # not be executed, so it's safe to use optimize_block to hold the place @@ -463,7 +469,7 @@ class DistributeTranspiler: inputs={'X': recv_inputs}, outputs={}, attrs={ - "OptimizeBlock": optimize_block, + "OptimizeBlock": pserver_program.block(1), "endpoint": endpoint, "Fanin": self.trainer_num, "PrefetchBlock": prefetch_block @@ -656,7 +662,7 @@ class DistributeTranspiler: 
return prefetch_block def _create_table_optimize_block(self, pserver_index, pserver_program, - append_block): + pre_block_idx): def _clone_var(block, var, persistable=True): assert isinstance(var, Variable) return block.create_var( @@ -693,7 +699,7 @@ class DistributeTranspiler: op for op in self.optimize_ops if op.input("Param")[0] == self.table_name ][0] - table_opt_block = pserver_program.create_block(append_block.idx) + table_opt_block = pserver_program.create_block(pre_block_idx) # only support sgd now assert table_opt_op.type == "sgd" @@ -717,6 +723,8 @@ class DistributeTranspiler: outputs=outputs, attrs=table_opt_op.attrs) + return table_opt_block + # ====================== private transpiler functions ===================== def _create_vars_from_blocklist(self, program, @@ -824,7 +832,7 @@ class DistributeTranspiler: for v in splited_vars: sections.append(v.shape[0]) program.global_block().append_op( - type="split", + type="split_byref", inputs={"X": orig_var}, outputs={"Out": splited_vars}, attrs={"sections": sections} # assume split evenly diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py index 13475025b5c2a759779066f9d511ed8a786118d5..1ee1d3727174c079d2c217dede27ff1a0316c01c 100644 --- a/python/paddle/fluid/evaluator.py +++ b/python/paddle/fluid/evaluator.py @@ -22,7 +22,6 @@ from layer_helper import LayerHelper from initializer import Constant __all__ = [ - 'Accuracy', 'ChunkEvaluator', 'EditDistance', 'DetectionMAP', @@ -273,7 +272,7 @@ class DetectionMAP(Evaluator): input (Variable): The detection results, which is a LoDTensor with shape [M, 6]. The layout is [label, confidence, xmin, ymin, xmax, ymax]. gt_label (Variable): The ground truth label index, which is a LoDTensor - with shape [N, 1]. + with shape [N, 1]. gt_difficult (Variable): Whether this ground truth is a difficult bounding box (bbox), which is a LoDTensor [N, 1]. gt_box (Variable): The ground truth bounding box (bbox), which is a diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 54d0a12bcdbb1b6c13e584dd1a3a5d73cddd4af7..7ad028714d3b47d93328dbf7c3297d55a2db1bd0 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -151,7 +151,7 @@ def fetch_var(name, scope=None, return_numpy=True): scope = global_scope() assert isinstance(scope, core.Scope) - var = global_scope().find_var(name) + var = scope.find_var(name) assert var is not None, ( "Cannot find " + name + " in scope. 
Perhaps you need to make the" " variable persistable by using var.persistable = True in your" diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 4b841ef31dcb67ab660475cf6e231fd8a4ae83d6..340882ea9e7b0e2a0c52749c771308c6b860ed07 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1070,6 +1070,12 @@ class Program(object): for t in targets: if not isinstance(t, Operator): if isinstance(t, Variable): + if t.op is None: + global_block = self.global_block() + for op in global_block.ops: + if t.name in op.output_arg_names: + t.op = op + break t = t.op else: raise ValueError(("All targets of prune() can only be " @@ -1101,6 +1107,10 @@ class Program(object): def random_seed(self): return self._seed + @property + def num_blocks(self): + return self.desc.num_blocks() + @random_seed.setter def random_seed(self, seed): if not isinstance(seed, int): diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 1c0f1f6eb415b1c05c1052c1f52743a19c49f017..f7f1ca2598a3e679b24fa8d62c52e4f4de788fe2 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -336,11 +336,20 @@ def save_inference_model(dirname, if main_program is None: main_program = default_main_program() + copy_program = main_program if not os.path.isdir(dirname): os.makedirs(dirname) - pruned_program = main_program.prune(targets=target_vars) + # Clear the is_target information and remove the existed feed and fetch op + global_block = copy_program.global_block() + for i, op in enumerate(global_block.ops): + op.desc.set_is_target(False) + if op.type == "feed" or op.type == "fetch": + global_block.remove_op(i) + copy_program.desc.flush() + + pruned_program = copy_program.prune(targets=target_vars) inference_program = pruned_program.inference_optimize() fetch_var_names = [v.name for v in target_vars] @@ -362,24 +371,6 @@ def save_inference_model(dirname, save_persistables(executor, dirname, inference_program, params_filename) -def get_feed_targets_names(program): - feed_targets_names = [] - global_block = program.global_block() - for op in global_block.ops: - if op.desc.type() == 'feed': - feed_targets_names.insert(0, op.desc.output('Out')[0]) - return feed_targets_names - - -def get_fetch_targets_names(program): - fetch_targets_names = [] - global_block = program.global_block() - for op in global_block.ops: - if op.desc.type() == 'fetch': - fetch_targets_names.append(op.desc.input('X')[0]) - return fetch_targets_names - - def load_inference_model(dirname, executor, model_filename=None, @@ -418,8 +409,8 @@ def load_inference_model(dirname, program = Program.parse_from_string(program_desc_str) load_persistables(executor, dirname, program, params_filename) - feed_target_names = get_feed_targets_names(program) - fetch_target_names = get_fetch_targets_names(program) + feed_target_names = program.desc.get_feed_target_names() + fetch_target_names = program.desc.get_fetch_target_names() fetch_targets = [ program.global_block().var(name) for name in fetch_target_names ] diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index d771837fc545167f7c32fcf914dd1c3c3ae64fb3..62933b512529bd04fab8c9ded12e636ecfae685c 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -19,6 +19,7 @@ from framework import Variable, Parameter, default_main_program, default_startup import unique_name from paddle.fluid.initializer import Constant, Xavier from param_attr import ParamAttr, WeightNormParamAttr 
+import core class LayerHelper(object): @@ -398,13 +399,16 @@ class LayerHelper(object): return input_var if isinstance(act, basestring): act = {'type': act} - tmp = self.create_tmp_variable(dtype=input_var.dtype) if 'use_mkldnn' in self.kwargs: act['use_mkldnn'] = self.kwargs.get('use_mkldnn') act_type = act.pop('type') if 'use_mkldnn' in self.kwargs: act['use_mkldnn'] = self.kwargs.get('use_mkldnn') + tmp = input_var + # NOTE(dzhwinter): some activation support inplace compution. + if not core.IsInplace(act_type): + tmp = self.create_tmp_variable(dtype=input_var.dtype) self.append_op( type=act_type, inputs={"X": [input_var]}, diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index ead57ac370d1bec13c1b21e83dd4be1a7331f87e..cc71c2136a6756ff094f6e06b8e200c6a68db06a 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -21,7 +21,7 @@ from ..executor import global_scope __all__ = [ 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file', - 'open_files', 'read_file', 'shuffle', 'double_buffer' + 'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer' ] @@ -290,7 +290,7 @@ def open_recordio_file(filename, lod_levels, dtypes, pass_num=1, - for_parallel=False): + for_parallel=True): """ Open a RecordIO file @@ -364,7 +364,7 @@ def open_files(filenames, thread_num, buffer_size=None, pass_num=1, - for_parallel=False): + for_parallel=True): """ Open files @@ -457,8 +457,8 @@ def __create_shared_decorated_reader__(op_type, reader, attrs): return monkey_patch_reader_methods(main_prog_var) -def __create_unshared_decorated_reader__(op_type, reader, attrs): - new_reader_name = unique_name(op_type) +def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None): + new_reader_name = name if name is not None else unique_name(op_type) main_blk = default_main_program().current_block() new_reader = main_blk.create_var(name=new_reader_name) main_blk.append_op( @@ -476,12 +476,17 @@ def shuffle(reader, buffer_size): 'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)}) -def double_buffer(reader, place=None): +def batch(reader, batch_size): + return __create_unshared_decorated_reader__( + 'create_batch_reader', reader, {'batch_size': int(batch_size)}) + + +def double_buffer(reader, place=None, name=None): attrs = dict() if place is not None: attrs['place'] = str(place).upper() - return __create_unshared_decorated_reader__('create_double_buffer_reader', - reader, attrs) + return __create_unshared_decorated_reader__( + 'create_double_buffer_reader', reader, attrs, name=name) def multi_pass(reader, pass_num): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 752f4689befd791da5c5c9626ffec3331f448f41..9a0c328033cdfdae39da050fc482abba17032dd9 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -159,67 +159,37 @@ def fc(input, dtype = helper.input_dtype() mul_results = [] - if use_mkldnn: - tmp = helper.create_tmp_variable(dtype) - input_shape = input.shape + for input_var, param_attr in helper.iter_inputs_and_params(): + input_shape = input_var.shape param_shape = [ reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1) ] + [size] w = helper.create_parameter( - attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - is_bias=False) - if bias_attr is None or bias_attr is False: - bias_attr = False - else: - bias_attr = True + attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False) + tmp = 
helper.create_tmp_variable(dtype) helper.append_op( - type="fc", - inputs={"Input": input, - "W": w}, + type="mul", + inputs={"X": input_var, + "Y": w}, outputs={"Out": tmp}, attrs={ - "use_mkldnn": use_mkldnn, - "is_test": is_test, - "bias_attr": bias_attr + "x_num_col_dims": num_flatten_dims, + "y_num_col_dims": 1, + "use_mkldnn": use_mkldnn }) - return helper.append_activation(tmp) + mul_results.append(tmp) + + if len(mul_results) == 1: + pre_bias = mul_results[0] else: - for input_var, param_attr in helper.iter_inputs_and_params(): - input_shape = input_var.shape - param_shape = [ - reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1) - ] + [size] - - w = helper.create_parameter( - attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False) - tmp = helper.create_tmp_variable(dtype) - helper.append_op( - type="mul", - inputs={"X": input_var, - "Y": w}, - outputs={"Out": tmp}, - attrs={ - "x_num_col_dims": num_flatten_dims, - "y_num_col_dims": 1, - }) - mul_results.append(tmp) - - if len(mul_results) == 1: - pre_bias = mul_results[0] - else: - pre_bias = helper.create_tmp_variable(dtype) - helper.append_op( - type="sum", - inputs={"X": mul_results}, - outputs={"Out": pre_bias}) - # add bias - pre_activation = helper.append_bias_op( - pre_bias, dim_start=num_flatten_dims) - # add activation - return helper.append_activation(pre_activation) + pre_bias = helper.create_tmp_variable(dtype) + helper.append_op( + type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias}) + # add bias + pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims) + # add activation + return helper.append_activation(pre_activation) def embedding(input, @@ -2688,7 +2658,7 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None, helper.append_op( type="sequence_erase", inputs={"X": [label]}, - outputs={"Out": [erase_label]}, + outputs={"Out": [erased_label]}, attrs={"tokens": ignored_tokens}) label = erased_label @@ -3733,8 +3703,8 @@ def label_smooth(label, name=None): """ Label smoothing is a mechanism to regularize the classifier layer and is - called label-smoothing regularization (LSR). - + called label-smoothing regularization (LSR). + Label smoothing is proposed to encourage the model to be less confident, since optimizing the log-likelihood of the correct label directly may cause overfitting and reduce the ability of the model to adapt. Label @@ -3758,10 +3728,10 @@ def label_smooth(label, prior_dist(Variable): The prior distribution to be used to smooth labels. If not provided, an uniform distribution is used. The shape of :attr:`prior_dist` should - be :math:`(1, class\_num)`. + be :math:`(1, class\_num)`. epsilon(float): The weight used to mix up the original ground-truth distribution and the fixed distribution. - dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, + dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_64, int etc. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. 
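With the fc rewrite above, every code path now lowers to the same primitive ops: one mul per input/weight pair, a sum over the partial products when there are several inputs, and then bias and activation. A rough NumPy sketch of what the emitted ops compute, using made-up shapes:

```python
import numpy as np

# Hypothetical fc layer of size 4 fed by two inputs (batch size 3).
x1, x2 = np.random.rand(3, 8), np.random.rand(3, 5)
w1, w2 = np.random.rand(8, 4), np.random.rand(5, 4)
b = np.random.rand(4)

mul_results = [x1.dot(w1), x2.dot(w2)]  # one "mul" op per input
pre_bias = np.add.reduce(mul_results)   # the "sum" op over partial products
pre_activation = pre_bias + b           # append_bias_op
out = np.maximum(pre_activation, 0.0)   # append_activation, e.g. relu
```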
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 36503cac6d5391821b977d90e6b77c4df7e3b564..9ae43b3e93e4b7d337097a25379720c18dfd331c 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -27,7 +27,8 @@ from contextlib import contextmanager
 
 __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad',
-    'Adadelta', 'ModelAverage'
+    'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
+    'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'Adadelta', 'ModelAverage'
 ]
 
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index fbdd6fd449625a21f91758dc12490b02070aea1a..4adbb2ea99b58c78c5c08c7ac8a556ca1de1615e 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -29,7 +29,8 @@ class ParallelExecutor(object):
                  main_program=None,
                  num_threads=None,
                  allow_op_delay=False,
-                 share_vars_from=None):
+                 share_vars_from=None,
+                 customize_loss_grad=False):
         """
         ParallelExecutor can run program in parallel.
 
@@ -78,7 +79,7 @@ class ParallelExecutor(object):
         else:
             for i in xrange(multiprocessing.cpu_count()):
                 p = core.Place()
-                self._act_places.append(core.CPUPlace(i))
+                self._act_places.append(core.CPUPlace())
                 p.set_place(self._act_places[-1])
                 self._places.append(p)
         assert self._places, "no place for execution"
@@ -122,7 +123,8 @@ class ParallelExecutor(object):
             loss_name if loss_name else '',
             scope,
             local_scopes,
-            allow_op_delay)
+            allow_op_delay,
+            customize_loss_grad)
         self.scope = scope
 
     def run(self, fetch_list, feed=None, feed_dict=None):
diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py
index 604c6f9ab36c2332223d1ba943d67113922615b3..c006bd9a66ddb422b7d80d2ca87aa7f56a6485db 100644
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
@@ -16,9 +16,8 @@ import framework
 from . import core
 
 __all__ = [
-    'append_regularization_ops',
-    'L1Decay',
-    'L2Decay',
+    'append_regularization_ops', 'WeightDecayRegularizer', 'L1Decay', 'L2Decay',
+    'L1DecayRegularizer', 'L2DecayRegularizer'
 ]
 
diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py
index 0027b651e88b68950e77e03399b3987aa0120192..db96c82ce2d8376b029e9dcc54ffab669f1def9a 100644
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -244,10 +244,14 @@ def infer(use_cuda, save_dirname=None):
         assert len(results[0]) == len(transpiler_results[0])
         for i in range(len(results[0])):
             np.testing.assert_almost_equal(
-                results[0][i], transpiler_results[0][i], decimal=6)
+                results[0][i], transpiler_results[0][i], decimal=5)
 
         print("infer results: ", results[0])
 
+        fluid.io.save_inference_model(save_dirname, feed_target_names,
+                                      fetch_targets, exe,
+                                      inference_transpiler_program)
+
 
 def main(net_type, use_cuda, is_local=True):
     if use_cuda and not fluid.core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/demo/text_classification/.gitignore b/python/paddle/fluid/tests/demo/text_classification/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..780d05b94667d3ea726e37bf9cf1b5b2baeff354
--- /dev/null
+++ b/python/paddle/fluid/tests/demo/text_classification/.gitignore
@@ -0,0 +1 @@
+*.recordio
diff --git a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py b/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py
new file mode 100644
index 0000000000000000000000000000000000000000..9425d472a48056e71da5da364f659971ef6c2520
--- /dev/null
+++ b/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+
+
+def load_vocab(filename):
+    """
+    load vocabulary
+    """
+    vocab = {}
+    with open(filename) as f:
+        wid = 0
+        for line in f:
+            vocab[line.strip()] = wid
+            wid += 1
+    return vocab
+
+
+# load word dict with paddle inner function
+word_dict = load_vocab(sys.argv[1])
+word_dict["<unk>"] = len(word_dict)
+print "Dict dim = ", len(word_dict)
+
+# input text data
+data = fluid.layers.data(name="words", shape=[1], dtype="int64", lod_level=1)
+
+# label data
+label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+# like placeholder
+feeder = fluid.DataFeeder(feed_list=[data, label], place=fluid.CPUPlace())
+
+# train data set
+BATCH_SIZE = 128
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.imdb.train(word_dict), buf_size=10000),
+    batch_size=BATCH_SIZE)
+
+test_reader = paddle.batch(
+    paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
+
+fluid.recordio_writer.convert_reader_to_recordio_file(
+    "train.recordio", feeder=feeder, reader_creator=train_reader)
+fluid.recordio_writer.convert_reader_to_recordio_file(
+    "test.recordio", feeder=feeder, reader_creator=test_reader)
diff --git a/python/paddle/fluid/tests/demo/text_classification/train.py b/python/paddle/fluid/tests/demo/text_classification/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..e408684c6e0941a1b317ffeac66f071c1382836d
--- /dev/null
+++ b/python/paddle/fluid/tests/demo/text_classification/train.py
@@ -0,0 +1,148 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import numpy
+import sys
+
+TRAIN_FILES = ['train.recordio']
+TEST_FILES = ['test.recordio']
+
+DICT_DIM = 89528
+
+# embedding dim
+emb_dim = 128
+
+# hidden dim
+hid_dim = 128
+
+# hidden dim2
+hid_dim2 = 96
+
+# class num
+class_dim = 2
+
+
+def network_cfg(is_train, pass_num=100):
+    with fluid.unique_name.guard():
+        train_file_obj = fluid.layers.open_files(
+            filenames=TRAIN_FILES,
+            pass_num=pass_num,
+            shapes=[[-1, 1], [-1, 1]],
+            lod_levels=[1, 0],
+            dtypes=['int64', 'int64'],
+            thread_num=1)
+
+        test_file_obj = fluid.layers.open_files(
+            filenames=TEST_FILES,
+            pass_num=1,
+            shapes=[[-1, 1], [-1, 1]],
+            lod_levels=[1, 0],
+            dtypes=['int64', 'int64'],
+            thread_num=1)
+
+        if is_train:
+            file_obj = fluid.layers.shuffle(train_file_obj, buffer_size=1000)
+        else:
+            file_obj = test_file_obj
+
+        file_obj = fluid.layers.double_buffer(
+            file_obj,
+            name="train_double_buffer" if is_train else 'test_double_buffer')
+
+        data, label = fluid.layers.read_file(file_obj)
+
+        emb = fluid.layers.embedding(input=data, size=[DICT_DIM, emb_dim])
+
+        # sequence conv with window size = 3
+        win_size = 3
+        conv_3 = fluid.nets.sequence_conv_pool(
+            input=emb,
+            num_filters=hid_dim,
+            filter_size=win_size,
+            act="tanh",
+            pool_type="max")
+
+        # fc layer after conv
+        fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2)
+
+        # probability of each class
+        prediction = fluid.layers.fc(input=[fc_1],
+                                     size=class_dim,
+                                     act="softmax")
+        # cross entropy loss
+        cost = fluid.layers.cross_entropy(input=prediction, label=label)
+
+        # mean loss
+        avg_cost = fluid.layers.mean(x=cost)
+        acc = fluid.layers.accuracy(input=prediction, label=label)
+
+        if is_train:
+            # Adagrad optimizer
+            sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.01)
+            sgd_optimizer.minimize(avg_cost)
+
+        return {
+            'loss': avg_cost,
+            'log': [avg_cost, acc],
+            'file': train_file_obj if is_train else test_file_obj
+        }
+
+
+def main():
+    train = fluid.Program()
+    startup = fluid.Program()
+
+    with fluid.program_guard(train, startup):
+        train_args = network_cfg(is_train=True)
+
+    test = fluid.Program()
+
+    with fluid.program_guard(test, fluid.Program()):
+        test_args = network_cfg(is_train=False)
+
+    # startup
+    place = fluid.CUDAPlace(0)
+    exe = fluid.Executor(place=place)
+    exe.run(startup)
+
+    train_exe = fluid.ParallelExecutor(
+        use_cuda=True, loss_name=train_args['loss'].name, main_program=train)
+
+    fetch_var_list = [var.name for var in train_args['log']]
+    for i in xrange(sys.maxint):
+        result = map(numpy.array,
+                     train_exe.run(fetch_list=fetch_var_list
+                                   if i % 1000 == 0 else []))
+        if len(result) != 0:
+            print 'Train: ', result
+
+        if i % 1000 == 0:
+            test_exe = fluid.ParallelExecutor(
+                use_cuda=True, main_program=test, share_vars_from=train_exe)
+            loss = []
+            acc = []
+            try:
+                while True:
+                    loss_np, acc_np = map(
+                        numpy.array, test_exe.run(fetch_list=fetch_var_list))
+                    loss.append(loss_np[0])
+                    acc.append(acc_np[0])
+            except:
+                test_args['file'].reset()
+                print 'TEST: ', numpy.mean(loss), numpy.mean(acc)
+
+
+if __name__ == '__main__':
+    main()
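A plausible way to run the demo end to end (the vocabulary path is an assumption; any newline-separated word list works, and `train.py` needs a CUDA build since it pins `fluid.CUDAPlace(0)`):

```bash
# 1. write train.recordio / test.recordio next to the scripts
python convert_data_to_recordio.py aclImdb/imdb.vocab
# 2. train, evaluating on the test set every 1000 iterations
python train.py
```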
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index c9069777faf9d141db93184e8b1e6dc2a7034980..5ed387fb1247f1a91147cb6981f1adc7c2eeb8a2 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -361,10 +361,7 @@ class TestCeil(OpTest):
     def test_check_output(self):
         self.check_output()
 
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+    # The same reason as TestFloor.
 
     def init_dtype(self):
         pass
@@ -396,10 +393,8 @@ class TestFloor(OpTest):
     def test_check_output(self):
         self.check_output()
 
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+    # The gradient of floor, ceil and round is undefined;
+    # we return zero as the gradient, while numpy returns nan.
 
     def init_dtype(self):
         pass
@@ -501,11 +496,6 @@ class TestRound(OpTest):
     def test_check_output(self):
         self.check_output()
 
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
     def init_dtype(self):
         pass
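The deleted gradient checks follow from the comment above: floor, ceil and round are piecewise constant, so Paddle defines their gradient to be zero, while a finite-difference (numpy-style) gradient would be nan at the jump points. A minimal numpy sketch of that convention (the helper name is illustrative, not a Paddle API):

```python
import numpy as np

def floor_grad(x, dout):
    # Paddle's convention: d(floor(x))/dx == 0 everywhere, so the input
    # gradient is all zeros regardless of the upstream gradient dout.
    return np.zeros_like(x)

x = np.array([0.4, 1.0, 2.6], dtype=np.float32)
print(floor_grad(x, np.ones_like(x)))  # [0. 0. 0.]
```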
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
index 7ecf9a1459ffc9740ae8c12df3902163ee689f59..6afb6fa6e753d3d6478313c840b158c3895b3efb 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -100,6 +100,9 @@ def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format):
     #          (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
 
     # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
+    if data_format != "NCHW" and data_format != "NHWC":
+        raise ValueError("Unknown data order.")
+
     if data_format == "NCHW":
         x = np.transpose(x, (0, 2, 3, 1))
         y_grad = np.transpose(y_grad, (0, 2, 3, 1))
@@ -304,7 +307,7 @@ class TestBatchNormOpTraining(unittest.TestCase):
             # run backward
             y_grad = np.random.random_sample(shape).astype(np.float32)
             x_grad, scale_grad, bias_grad = _reference_grad(
-                x, y_grad, scale, saved_mean, var_ref, epsilon, data_format)
+                x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout)
 
             var_dict = locals()
             var_dict['y@GRAD'] = y_grad
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..bffb4f3b666a7ddcc133b7c30fab132b49aa1d0e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def bilinear_interp_np(input, out_h, out_w):
+    batch_size, channel, in_h, in_w = input.shape
+    if out_h > 1:
+        ratio_h = (in_h - 1.0) / (out_h - 1.0)
+    else:
+        ratio_h = 0.0
+    if out_w > 1:
+        ratio_w = (in_w - 1.0) / (out_w - 1.0)
+    else:
+        ratio_w = 0.0
+
+    out = np.zeros((batch_size, channel, out_h, out_w))
+    for i in range(out_h):
+        h = int(ratio_h * i)
+        hid = 1 if h < in_h - 1 else 0
+        h1lambda = ratio_h * i - h
+        h2lambda = 1.0 - h1lambda
+        for j in range(out_w):
+            w = int(ratio_w * j)
+            wid = 1 if w < in_w - 1 else 0
+            w1lambda = ratio_w * j - w
+            w2lambda = 1.0 - w1lambda
+
+            out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] +
+                                        w1lambda*input[:, :, h, w+wid]) + \
+                h1lambda*(w2lambda*input[:, :, h+hid, w] +
+                          w1lambda*input[:, :, h+hid, w+wid])
+    return out.astype("float32")
+
+
+class TestBilinearInterpOp(OpTest):
+    def setUp(self):
+        self.init_test_case()
+        self.op_type = "bilinear_interp"
+        input_np = np.random.random(self.input_shape).astype("float32")
+        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w)
+
+        self.inputs = {'X': input_np}
+        self.attrs = {'out_h': self.out_h, 'out_w': self.out_w}
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):
+        self.input_shape = [2, 3, 4, 4]
+        self.out_h = 2
+        self.out_w = 2
+
+
+class TestCase1(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+
+
+class TestCase2(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+
+
+class TestCase3(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.input_shape = [1, 1, 128, 64]
+        self.out_h = 64
+        self.out_w = 128
+
+
+if __name__ == "__main__":
+    unittest.main()
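For intuition about `bilinear_interp_np` above, here is a hand-checked 2x2 to 3x3 upsample using the same corner-aligned ratios, `ratio = (in - 1) / (out - 1) = 0.5`, computed with plain numpy as a cross-check (illustrative values only):

```python
import numpy as np

# interpolate rows, then columns, at sample positions 0, 0.5, 1
x = np.array([[1., 2.],
              [3., 4.]])
rows = np.array([np.interp([0, 0.5, 1], [0, 1], r) for r in x])
out = np.array([np.interp([0, 0.5, 1], [0, 1], c) for c in rows.T]).T
print(out)
# [[1.  1.5 2. ]
#  [2.  2.5 3. ]
#  [3.  3.5 4. ]]
```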
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
index d5dd63e8737cbdd9b91d083fbd0b38f8baf570b3..7703dfe0135b402f830bcdeaf47c26e5e3f8ca58 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
@@ -97,15 +97,18 @@ class TestConv3dOp(OpTest):
         }
         self.outputs = {'Output': output}
 
+    def testcudnn(self):
+        return core.is_compiled_with_cuda() and self.use_cudnn
+
     def test_check_output(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_output_with_place(place, atol=1e-5)
         else:
             self.check_output()
 
     def test_check_grad(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place,
@@ -117,7 +120,7 @@ class TestConv3dOp(OpTest):
                 set(['Input', 'Filter']), 'Output', max_relative_error=0.03)
 
     def test_check_grad_no_filter(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place, ['Input'],
@@ -132,7 +135,7 @@ class TestConv3dOp(OpTest):
                 no_grad_set=set(['Filter']))
 
     def test_check_grad_no_input(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place, ['Filter'],
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6f45381af8ac64d117eb27325f25763fbf6cae7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+
+
+class TestElementWiseAddOp(unittest.TestCase):
+    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
+
+    def check_forward_backward(self):
+        def test_with_place(place):
+            out_grad = np.random.random_sample(self.x.shape).astype(np.float32)
+            x_grad = out_grad
+            sum_axis = range(0, len(self.x.shape))
+            del sum_axis[self.axis]
+            y_grad = np.sum(out_grad, axis=tuple(sum_axis))
+
+            var_dict = locals()
+            var_dict['y'] = self.y
+            var_dict['x'] = self.x
+            var_dict['out'] = self.out
+            var_dict['y@GRAD'] = y_grad
+            var_dict['x@GRAD'] = x_grad
+            var_dict['out@GRAD'] = out_grad
+
+            var_names = ['x', 'y', 'out', 'y@GRAD', 'x@GRAD', 'out@GRAD']
+            ground_truth = {name: var_dict[name] for name in var_names}
+
+            program = fluid.Program()
+            with fluid.program_guard(program):
+                block = program.global_block()
+                for name in ground_truth:
+                    block.create_var(
+                        name=name,
+                        dtype='float32',
+                        shape=ground_truth[name].shape)
+                elementwise_add_op = block.append_op(
+                    type="elementwise_add",
+                    inputs={
+                        "X": block.var('x'),
+                        "Y": block.var('y'),
+                    },
+                    outputs={"Out": block.var('out'), },
+                    attrs={"axis": self.axis, })
+
+                # generate backward op_desc
+                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+                    elementwise_add_op.desc, set(), [])
+                grad_op_desc = grad_op_desc_list[0]
+                new_op_desc = block.desc.append_op()
+                new_op_desc.copy_from(grad_op_desc)
+                for var_name in grad_op_desc.output_arg_names():
+                    block.desc.var(var_name.encode("ascii"))
+                grad_op_desc.infer_var_type(block.desc)
+                grad_op_desc.infer_shape(block.desc)
+                for arg in grad_op_desc.output_arg_names():
+                    grad_var = block.desc.find_var(arg.encode("ascii"))
+                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
+
+                exe = fluid.Executor(place)
+                out = exe.run(program,
+                              feed={
+                                  name: var_dict[name]
+                                  for name in ['x', 'y', 'out@GRAD']
+                              },
+                              fetch_list=['x@GRAD', 'y@GRAD'])
+                self.__assert_close(x_grad, out[0], "x@GRAD")
+                self.__assert_close(y_grad, out[1], "y@GRAD", atol=1.4)
+
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu(
+                "elementwise_add"):
+            places.append(core.CUDAPlace(0))
+
+        for place in places:
+            test_with_place(place)
+
+    def test_check_forward_backward_with_scale_and_bias(self):
+        np.random.seed(123)
+        self.x = np.random.random((4, 32, 220, 220)).astype(np.float32)
+        self.y = np.random.random((32)).astype(np.float32)
+        self.out = self.x + self.y.reshape(1, 32, 1, 1)
+        self.axis = 1
+        self.check_forward_backward()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_mul_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..42d68ef376dc4a664a96ff5a24545c1997ee924a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_mul_mkldnn_op.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from test_mul_op import TestMulOp, TestMulOp2, TestFP16MulOp1, TestFP16MulOp2
+
+
+class TestMKLDNNMulOp(TestMulOp):
+    def init_op_test(self):
+        super(TestMKLDNNMulOp, self).setUp()
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNMulOp2(TestMulOp2):
+    def init_op_test(self):
+        super(TestMKLDNNMulOp2, self).setUp()
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNFP16MulOp1(TestFP16MulOp1):
+    def init_op_test(self):
+        super(TestMKLDNNFP16MulOp1, self).setUp()
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNFP16MulOp2(TestFP16MulOp2):
+    def init_op_test(self):
+        super(TestMKLDNNFP16MulOp2, self).setUp()
+        self.attrs = {"use_mkldnn": True}
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py
index 40440bea1267112b84b66002a0bf921be3029265..d984393c89f44f5b9679a22bf7bb6182599233e3 100644
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
@@ -21,10 +21,12 @@ from op_test import OpTest
 class TestMulOp(OpTest):
     def setUp(self):
         self.op_type = "mul"
+        self.use_mkldnn = False
         self.inputs = {
             'X': np.random.random((32, 84)).astype("float32"),
             'Y': np.random.random((84, 100)).astype("float32")
         }
+        self.attrs = {'use_mkldnn': self.use_mkldnn}
         self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
 
     def test_check_output(self):
@@ -45,11 +47,16 @@ class TestMulOp2(OpTest):
     def setUp(self):
         self.op_type = "mul"
+        self.use_mkldnn = False
         self.inputs = {
             'X': np.random.random((15, 4, 12, 10)).astype("float32"),
             'Y': np.random.random((4, 30, 8, 2, 9)).astype("float32")
         }
-        self.attrs = {'x_num_col_dims': 2, 'y_num_col_dims': 2}
+        self.attrs = {
+            'x_num_col_dims': 2,
+            'y_num_col_dims': 2,
+            'use_mkldnn': self.use_mkldnn
+        }
         result = np.dot(self.inputs['X'].reshape(15 * 4, 12 * 10),
                         self.inputs['Y'].reshape(4 * 30, 8 * 2 * 9))
         result = result.reshape(15, 4, 8, 2, 9)
@@ -73,9 +80,11 @@ class TestMulOp2(OpTest):
 class TestFP16MulOp1(OpTest):
     def setUp(self):
         self.op_type = "mul"
+        self.use_mkldnn = False
         x = np.random.random((32, 84)).astype("float16")
         y = np.random.random((84, 100)).astype("float16")
         self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
+        self.attrs = {'use_mkldnn': self.use_mkldnn}
         self.outputs = {'Out': np.dot(x, y)}
 
     def test_check_output(self):
@@ -88,12 +97,14 @@ class TestFP16MulOp2(OpTest):
     def setUp(self):
         self.op_type = "mul"
+        self.use_mkldnn = False
         x = np.random.random((15, 4, 12, 10)).astype("float16")
         y = np.random.random((4, 30, 8, 2, 9)).astype("float16")
         self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
         self.attrs = {
             'x_num_col_dims': 2,
             'y_num_col_dims': 2,
+            'use_mkldnn': self.use_mkldnn
         }
         result = np.dot(
             x.reshape(15 * 4, 12 * 10), y.reshape(4 * 30, 8 * 2 * 9))
diff --git a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
index 5dc41e54d6158787eb966333c894e378b5c706d0..3f940203b9393d266d75b50c9cbf62e89c36cbdf 100644
--- a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
@@ -69,7 +69,6 @@ class TestMultipleReader(unittest.TestCase):
                 break
             batch_count += 1
             self.assertLessEqual(img_val.shape[0], self.batch_size)
-        data_files.reset()
         self.assertEqual(batch_count, self.num_batch * 3)
 
     def test_main(self):
diff --git a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
index 1471843ded7a42432a84a9fad76bb97dcf7fb9c2..52e7cc1ffbba40a63ce3cec645c7c0a7a499c1bf 100644
--- a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
@@ -43,9 +43,8 @@ class TestMultipleReader(unittest.TestCase):
             filename='./mnist.recordio',
             shapes=[(-1, 784), (-1, 1)],
             lod_levels=[0, 0],
-            dtypes=['float32', 'int64'])
-        data_file = fluid.layers.io.multi_pass(
-            reader=data_file, pass_num=self.pass_num)
+            dtypes=['float32', 'int64'],
+            pass_num=self.pass_num)
         img, label = fluid.layers.read_file(data_file)
 
         if fluid.core.is_compiled_with_cuda():
@@ -65,5 +64,4 @@ class TestMultipleReader(unittest.TestCase):
                 break
             batch_count += 1
             self.assertLessEqual(img_val.shape[0], self.batch_size)
-        data_file.reset()
         self.assertEqual(batch_count, self.num_batch * self.pass_num)
diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py
index 649fabe4a0cdef4c665f8a6d3ebee1bb8232185f..779ae388f04496a7be9a6d5aa4e39b8245022925 100644
--- a/python/paddle/fluid/tests/unittests/test_operator_desc.py
+++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py
@@ -62,7 +62,8 @@ class TestOperator(unittest.TestCase):
         self.assertEqual(mul_op.output_names, ["Out"])
         self.assertEqual(mul_op.output("Out"), ["mul.out"])
         self.assertEqual(
-            set(mul_op.attr_names), set(["x_num_col_dims", "y_num_col_dims"]))
+            set(mul_op.attr_names),
+            set(["x_num_col_dims", "y_num_col_dims", "use_mkldnn"]))
         self.assertEqual(mul_op.has_attr("x_num_col_dims"), True)
         self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT)
         self.assertEqual(mul_op.attr("x_num_col_dims"), 1)
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
index 328a9ffd25b9fce3fd45bbe847e365f090acd17c..f7e1e8573290766cde0c35816d687e7ba6fa4220 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -109,8 +109,11 @@ class TestPool2d_Op(OpTest):
 
         self.outputs = {'Out': output}
 
+    def testcudnn(self):
+        return core.is_compiled_with_cuda() and self.use_cudnn
+
     def test_check_output(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_output_with_place(place, atol=1e-5)
         else:
@@ -119,7 +122,7 @@ class TestPool2d_Op(OpTest):
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        if self.use_cudnn and self.pool_type != "max":
+        if self.testcudnn() and self.pool_type != "max":
             place = core.CUDAPlace(0)
            self.check_grad_with_place(
                 place, set(['X']), 'Out', max_relative_error=0.07)
diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
index 15a8ac5e2029eec204d061d1832df3df90339697..aaa94842513691c836e04353aa4bc5ce5e66c5c3 100644
--- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
@@ -118,15 +118,18 @@ class TestPool3d_Op(OpTest):
 
         self.outputs = {'Out': output.astype('float32')}
 
+    def testcudnn(self):
+        return core.is_compiled_with_cuda() and self.use_cudnn
+
     def test_check_output(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_output_with_place(place, atol=1e-5)
         else:
             self.check_output()
 
     def test_check_grad(self):
-        if self.use_cudnn and self.pool_type != "max":
+        if self.testcudnn() and self.pool_type != "max":
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place, set(['X']), 'Out', max_relative_error=0.07)
diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
index 7c8e7f634fdd3ee3f056a95df774402a7c29e906..f32050014d7ace5aee4aca75a47bfc6a75ff91c2 100644
--- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
@@ -74,13 +74,13 @@ class TestRecordIO(unittest.TestCase):
                 avg_loss_np.append(tmp)
                 batch_id += 1
-        data_file.reset()
         self.assertEqual(batch_id, self.num_batches)
         self.assertLess(avg_loss_np[-1], avg_loss_np[0])
 
     def test_shuffle_reader(self):
-        self.test_main(decorator_callback=lambda reader: fluid.layers.io.shuffle(reader, buffer_size=200))
+        self.test_main(decorator_callback=lambda reader: fluid.layers.io.shuffle(
+            reader, buffer_size=200))
 
     def test_double_buffer_reader(self):
         self.test_main(decorator_callback=lambda reader: fluid.layers.io.double_buffer(reader,
-            place='cuda:0' if fluid.core.is_compiled_with_cuda() else 'cpu'))
+                place='cuda:0' if fluid.core.is_compiled_with_cuda() else 'cpu'))
diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py
index 887bdfe8b3608878bace5b857a71ada123b74b2f..eb49a53e54f4bdb6bcd6cb1991423970f29997bb 100644
--- a/python/paddle/fluid/tests/unittests/test_split_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_op.py
@@ -19,7 +19,7 @@ from op_test import OpTest
 
 class TestSplitOp(OpTest):
     def setUp(self):
-        self.op_type = "split"
+        self._set_op_type()
         axis = 1
         x = np.random.random((4, 5, 6)).astype('float32')
         out = np.split(x, [2, 3], axis)
@@ -28,6 +28,9 @@ class TestSplitOp(OpTest):
         self.outputs = {'Out': [('out%d' % i, out[i]) \
             for i in xrange(len(out))]}
 
+    def _set_op_type(self):
+        self.op_type = "split"
+
     def test_check_output(self):
         self.check_output()
 
@@ -35,5 +38,10 @@ class TestSplitOp(OpTest):
         self.check_grad(['X'], ['out0', 'out1', 'out2'])
 
 
+class TestSplitByrefOp(OpTest):
+    def _set_op_type(self):
+        self.op_type = "split_byref"
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index 37c4296f9bcea7e16daa46f778934331513c30c4..00c2a3b9928d1ca5f3e8cd5e87ba7ad4108e9dad 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -124,7 +124,7 @@ def test(word_idx):
         re.compile("aclImdb/test/neg/.*\.txt$"), word_idx)
 
 
-def word_dict():
+def word_dict(cutoff=150):
     """
     Build a word dictionary from the corpus.
@@ -132,7 +132,7 @@ def word_dict():
     :rtype: dict
     """
     return build_dict(
-        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
+        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), cutoff)
 
 
 def fetch():
diff --git a/tools/aws_benchmarking/README.md b/tools/aws_benchmarking/README.md
index 837fcbb8512bce027ecd09a7f39b806151e9154b..4fdd4b0de44e779378091566d9d6056a6f9ee4b6 100644
--- a/tools/aws_benchmarking/README.md
+++ b/tools/aws_benchmarking/README.md
@@ -77,14 +77,15 @@ Training nodes will run your `ENTRYPOINT` script with the following environment
 Now let's start the training process:
 
 ```bash
-docker run -i -v $HOME/.aws:/root/.aws -v <your pem file path>:/root/.pem \
+docker run -i -v $HOME/.aws:/root/.aws -v <your pem file path>:/root/.pem \
 putcn/paddle_aws_client \
 --action create \
---key_name <your key pair name> \
+--key_name <your key pair name> \
 --security_group_id <your security group id> \
 --docker_image myreponame/paddle_benchmark \
 --pserver_count 2 \
---trainer_count 2
+--trainer_count 2 \
+--trainer_command batch_size:20,local:no,device:CPU
 ```
 
 Now just wait until you see this:
@@ -153,8 +154,31 @@ Master exposes 4 major services:
 
 ### Parameters
 
-TBD, please refer to client/cluster_launcher.py for now
+ - key_name: required, the AWS key pair name.
+ - security_group_id: required, the security group id associated with your VPC.
+ - vpc_id: the VPC in which to run the test; if not provided, this tool uses your default VPC.
+ - subnet_id: the subnet in which to run the test; if not provided, this tool creates a new subnet for the test.
+ - pserver_instance_type: your pserver instance type, c5.2xlarge by default, which is a memory-optimized machine.
+ - trainer_instance_type: your trainer instance type, p2.8xlarge by default, which is a GPU machine with 8 cards.
+ - task_name: the name used to identify your job; if not provided, this tool generates one for you.
+ - pserver_image_id: AMI id for the system image. Note that although the default one has nvidia-docker installed, the pserver is always launched with `docker` instead of `nvidia-docker`, so please DO NOT init your training program with a GPU place.
+ - pserver_command: pserver start command, format example: python,vgg.py,batch_size:128,is_local:no, which is translated to `python vgg.py --batch_size 128 --is_local no` when starting training on the pserver. "--device CPU" is passed by default.
+ - trainer_image_id: AMI id for the system image; the default one has nvidia-docker ready.
+ - trainer_command: trainer start command. The format is the same as the pserver's; "--device GPU" is passed by default.
+ - availability_zone: the AWS zone id in which to place the EC2 instances, us-east-2a by default.
+ - trainer_count: trainer count, 1 by default.
+ - pserver_count: pserver count, 1 by default.
+ - action: create|cleanup|status, "create" by default.
+ - pserver_port: the port on which the pserver opens its service, 5436 by default.
+ - docker_image: the training docker image id.
+ - master_service_port: the port on which the master opens its service, 5436 by default.
+ - master_server_public_ip: the master service ip; required when the action is not "create".
+ - master_docker_image: the master's docker image id, "putcn/paddle_aws_master:latest" by default.
+ - no_clean_up: no instance termination when training finishes or fails, if this value is set to "yes". This is for debugging, so that you can inspect the instances after the process ends.
+
 
 ### Troubleshooting
 
-TBD
+ 1. How to check logs
+
+    The master log is served at `http://<master ip>:<master port>/status`. You can list all log files at `http://<master ip>:<master port>/logs`, and access any one of them at `http://<master ip>:<master port>/log/<log file name>`.
diff --git a/tools/aws_benchmarking/client/cluster_launcher.py b/tools/aws_benchmarking/client/cluster_launcher.py
index 594378ff8fc0744a4b11b1c11e2e3b270be7aed0..12333202b9f003ae5109c7e9b825035ba8eb7d99 100644
--- a/tools/aws_benchmarking/client/cluster_launcher.py
+++ b/tools/aws_benchmarking/client/cluster_launcher.py
@@ -80,7 +80,11 @@ parser.add_argument(
     use ami-1ae93962 for us-east-2")
 
 parser.add_argument(
-    '--pserver_command', type=str, default="", help="pserver start command")
+    '--pserver_command',
+    type=str,
+    default="",
+    help="pserver start command, format example: python,vgg.py,batch_size:128,is_local:yes"
+)
 
 parser.add_argument(
     '--trainer_image_id',
@@ -90,7 +94,11 @@ parser.add_argument(
     use ami-1ae93962 for us-west-2")
 
 parser.add_argument(
-    '--trainer_command', type=str, default="", help="trainer start command")
+    '--trainer_command',
+    type=str,
+    default="",
+    help="trainer start command, format example: python,vgg.py,batch_size:128,is_local:yes"
+)
 
 parser.add_argument(
     '--availability_zone',
diff --git a/tools/aws_benchmarking/server/cluster_master.py b/tools/aws_benchmarking/server/cluster_master.py
index 21f85a5fc43e951897eb6b785367630abda722c0..7952e61159ec31a4be5394b50f30cbc20f9b414e 100644
--- a/tools/aws_benchmarking/server/cluster_master.py
+++ b/tools/aws_benchmarking/server/cluster_master.py
@@ -19,6 +19,7 @@ import math
 import time
 import threading
 import logging
+import copy
 
 import netaddr
 import boto3
@@ -257,6 +258,8 @@ def script_to_str(file_path):
 
 
 def run_instances(image_id, instance_type, count, role, cmd=""):
+    if count == 0:
+        return []
     response = ec2client.run_instances(
         ImageId=image_id,
         InstanceType=instance_type,
@@ -334,6 +337,22 @@ def log_to_file(source, filename):
             log_file.write(line)
 
 
+def parse_command(command_raw, defaults={}):
+    if not command_raw:
+        command_raw = ""
+    commands_processed = []
+    parameter_map = copy.copy(defaults)
+    for seg in command_raw.split(","):
+        if ":" in seg:
+            parameters = seg.split(":")
+            parameter_map[parameters[0]] = parameters[1]
+        else:
+            commands_processed.append(seg)
+    for key, val in parameter_map.iteritems():
+        commands_processed.append("--" + key + " " + str(val))
+    return " ".join(commands_processed)
+
+
 def create_trainers(kickoff_cmd, pserver_endpoints_str):
     def create_and_start_trainer(trainer_index):
         logging.info("trainer " + str(trainer_index) + " is starting")
@@ -361,7 +380,7 @@ def create_trainers(kickoff_cmd, pserver_endpoints_str):
                 TRAINER_INDEX=str(trainer_index),
                 TASK_NAME=args.task_name,
                 TRAINER_COUNT=args.trainer_count,
-                COMMAND=args.trainer_command,
+                COMMAND=parse_command(args.trainer_command, {"device": "GPU"}),
                 MASTER_ENDPOINT=args.master_server_ip + ":" +
                 str(args.master_server_port))
             logging.info(cmd)
@@ -476,7 +495,7 @@ def kickoff_pserver(host, pserver_endpoints_str):
             DOCKER_IMAGE=args.docker_image,
             PSERVER_PORT=args.pserver_port,
             TASK_NAME=args.task_name,
-            COMMAND=args.pserver_command,
+            COMMAND=parse_command(args.pserver_command, {"device": "CPU"}),
             TRAINER_COUNT=args.trainer_count,
             TRAINER_INDEX=0,
             # there is no way to use 0.0.0.0:port to start pserver
diff --git a/tools/aws_benchmarking/server/pserver.sh.template b/tools/aws_benchmarking/server/pserver.sh.template
index 2612856d1e6273fe2642f82e8c616eb9ff24f8a4..8d7f9e84c768b096537c92a448a117d91903f25b 100644
--- a/tools/aws_benchmarking/server/pserver.sh.template
+++ b/tools/aws_benchmarking/server/pserver.sh.template
@@ -1,2 +1,2 @@
 #!/bin/bash
-docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device CPU
\ No newline at end of file
+docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND}
\ No newline at end of file
diff --git a/tools/aws_benchmarking/server/trainer.sh.template b/tools/aws_benchmarking/server/trainer.sh.template
index a4b2876b08cdf05e90e50589f897d74ca5f90443..9b0aae9f7a7a879f164b380f719065302e0eb7e2 100644
--- a/tools/aws_benchmarking/server/trainer.sh.template
+++ b/tools/aws_benchmarking/server/trainer.sh.template
@@ -1,2 +1,2 @@
 #!/bin/bash
-nvidia-docker run --network="host" -i -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "PADDLE_INIT_TRAINER_ID={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device GPU
\ No newline at end of file
+nvidia-docker run --network="host" -i -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "PADDLE_INIT_TRAINER_ID={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND}
\ No newline at end of file
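The template change above works because `parse_command` in cluster_master.py now appends the `--device` flag itself. A condensed, Python-3-friendly sketch of the same logic (the original uses `iteritems()`; `split(":", 1)` is a small robustness tweak, not from the PR):

```python
import copy

def parse_command(command_raw, defaults={}):
    # segments containing ':' become --key value flags; the rest is the command
    commands_processed = []
    parameter_map = copy.copy(defaults)
    for seg in (command_raw or "").split(","):
        if ":" in seg:
            key, val = seg.split(":", 1)
            parameter_map[key] = val
        else:
            commands_processed.append(seg)
    for key, val in parameter_map.items():
        commands_processed.append("--" + key + " " + str(val))
    return " ".join(commands_processed)

print(parse_command("python,vgg.py,batch_size:128,is_local:no",
                    {"device": "GPU"}))
# -> "python vgg.py --device GPU --batch_size 128 --is_local no"
#    (flag order follows dict iteration order)
```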