diff --git a/CMakeLists.txt b/CMakeLists.txt
index de47086dbd6a440cd413c7843c83b1c69d9841b2..23bbe829ac16180088bfa37df66e23f19b021ea3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,7 +39,6 @@ option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_F
 option(WITH_AMD_GPU     "Compile PaddlePaddle with AMD GPU"             OFF)
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
 option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
-option(WITH_TENSORRT    "Compile PaddlePaddle with TensorRT support."   OFF)
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
@@ -180,13 +179,9 @@ set(EXTERNAL_LIBS
 
 if(WITH_GPU)
     include(cuda)
+    include(tensorrt)
 endif(WITH_GPU)
 
-# TensorRT depends on GPU.
-if (NOT WITH_GPU)
-  set(WITH_TENSORRT OFF)
-endif()
-
 if(WITH_AMD_GPU)
     find_package(HIP)
     include(hip)
diff --git a/Dockerfile b/Dockerfile
index 9097bb657d2366997112ec7662762a93358aa647..9ac58f37f2893613ca9f82be08136d9da674737e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,8 @@
 # A image for building paddle binaries
 # Use cuda devel base image for both cpu and gpu environment
+
+# When you modify it, please be aware of cudnn-runtime version 
+# and libcudnn.so.x in paddle/scripts/docker/build.sh
 FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 
@@ -46,7 +49,7 @@ ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
 RUN curl -s -q https://glide.sh/get | sh
 
 # Install TensorRT
-# The unnecessary files has been removed to make the library small.
+# The unnecessary files has been removed to make the library small. It only contains include and lib now.
 RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
     tar -xz -C /usr/local && \
     cp -rf /usr/local/TensorRT/include /usr && \
diff --git a/Dockerfile.android b/Dockerfile.android
index cc022d596b4b74dd1e4f4d0901dd81c91a7decd1..848a7eba6f1421432addae8acff407b611adb4ae 100644
--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -27,7 +27,7 @@ RUN git config --global credential.helper store
 # Fix locales to en_US.UTF-8
 RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 
-RUN pip install --upgrade pip && \
+RUN pip install --upgrade pip==9.0.3 && \
     pip install -U 'protobuf==3.1.0' && \
     pip install -U wheel sphinx && \
     pip install pre-commit
diff --git a/paddle/scripts/check_env.sh b/benchmark/paddle/image/check_env.sh
similarity index 100%
rename from paddle/scripts/check_env.sh
rename to benchmark/paddle/image/check_env.sh
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index f726405c4773994f6ca6509e5218750805b03995..e490397cc0624c310949a4b571bd00cac6e8953b 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -80,6 +80,16 @@ if(WITH_GPU)
     # Include cuda and cudnn
     include_directories(${CUDNN_INCLUDE_DIR})
     include_directories(${CUDA_TOOLKIT_INCLUDE})
+
+    if(TENSORRT_FOUND)
+        if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
+            message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
+        endif()
+        if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+            message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
+        endif()
+        include_directories(${TENSORRT_INCLUDE_DIR})
+    endif()
 elseif(WITH_AMD_GPU)
     add_definitions(-DPADDLE_WITH_HIP)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..0c07d36bed65400164853b99f18ec0335341cd94
--- /dev/null
+++ b/cmake/tensorrt.cmake
@@ -0,0 +1,33 @@
+if(NOT WITH_GPU)
+    return()
+endif()
+
+set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
+find_path(TENSORRT_INCLUDE_DIR NvInfer.h
+    PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include
+    $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include
+    NO_DEFAULT_PATH
+)
+
+find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
+    PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib
+    $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib
+    NO_DEFAULT_PATH
+    DOC "Path to TensorRT library.")
+
+if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
+    set(TENSORRT_FOUND ON)
+else()
+    set(TENSORRT_FOUND OFF)
+endif()
+
+if(TENSORRT_FOUND)
+    file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS)
+    string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION
+        "${TENSORRT_VERSION_FILE_CONTENTS}")
+    string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1"
+        TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}")
+
+    message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
+        "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
+endif()
diff --git a/doc/fluid/api/evaluator.rst b/doc/fluid/api/evaluator.rst
index ae9daeb7918d773d7330f419de96c6972a836710..f80b87c7d2704a144c02028c4925530a67d11289 100644
--- a/doc/fluid/api/evaluator.rst
+++ b/doc/fluid/api/evaluator.rst
@@ -5,17 +5,24 @@
 evaluator
 =========
 
-Accuracy
---------
+ChunkEvaluator
+--------------
 
-..  autoclass:: paddle.fluid.evaluator.Accuracy
+..  autoclass:: paddle.fluid.evaluator.ChunkEvaluator
     :members:
     :noindex:
 
-ChunkEvaluator
+EditDistance
 --------------
 
-..  autoclass:: paddle.fluid.evaluator.ChunkEvaluator
+..  autoclass:: paddle.fluid.evaluator.EditDistance
     :members:
     :noindex:
 
+DetectionMAP
+--------------
+
+..  autoclass:: paddle.fluid.evaluator.DetectionMAP
+    :members:
+    :noindex:
+  
diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst
index ee69925fda6b3fc850cfb632e8edd359e7fcff9c..2f02c5de097945a45a3e053427104bd17bea1279 100644
--- a/doc/fluid/api/initializer.rst
+++ b/doc/fluid/api/initializer.rst
@@ -33,3 +33,44 @@ Xavier
     :members:
     :noindex:
 
+MSRA
+------
+
+..  autoclass:: paddle.fluid.initializer.MSRA
+    :members:
+    :noindex:
+
+ConstantInitializer
+-------------------
+
+..  autoclass:: paddle.fluid.initializer.ConstantInitializer
+    :members:
+    :noindex:
+
+UniformInitializer
+------------------
+
+..  autoclass:: paddle.fluid.initializer.UniformInitializer
+    :members:
+    :noindex:
+
+NormalInitializer
+-----------------
+
+..  autoclass:: paddle.fluid.initializer.NormalInitializer
+    :members:
+    :noindex:
+
+XavierInitializer
+-----------------
+
+..  autoclass:: paddle.fluid.initializer.XavierInitializer
+    :members:
+    :noindex:
+
+
+MSRAInitializer
+-----------------
+..  autoclass:: paddle.fluid.initializer.MSRAInitializer
+    :members:
+    :noindex:
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index 5c02886efd7d11e9520910526fb90ec01e123bae..3790f09c84563fe541bd8d0bc08e23b19d4287ca 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -815,3 +815,8 @@ zeros
 ..  autofunction:: paddle.fluid.layers.zeros
     :noindex:
 
+topk
+----
+
+..  autofunction:: paddle.fluid.layers.topk
+    :noindex:
diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst
index 2f820595c35c2bccd6a5c8a20c60d796c04c8e97..7a92caf9b7139cf091eff834dbed3586b23ac3af 100644
--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
@@ -47,10 +47,51 @@ DecayedAdagrad
     :members:
     :noindex:
 
+SGDOptimizer
+------------
+
+..  autoclass:: paddle.fluid.optimizer.SGDOptimizer
+    :members:
+    :noindex:
+
+MomentumOptimizer
+-----------------
+
+..  autoclass:: paddle.fluid.optimizer.MomentumOptimizer
+    :members:
+    :noindex:
+
+AdagradOptimizer
+----------------
+
+..  autoclass:: paddle.fluid.optimizer.AdagradOptimizer
+    :members:
+    :noindex:
+
+AdamOptimizer
+-------------
+
+..  autoclass:: paddle.fluid.optimizer.AdamOptimizer
+    :members:
+    :noindex:
+
+AdamaxOptimizer
+---------------
+
+..  autoclass:: paddle.fluid.optimizer.AdamaxOptimizer
+    :members:
+    :noindex:
+
+DecayedAdagradOptimizer
+-----------------------
+
+..  autoclass:: paddle.fluid.optimizer.DecayedAdagradOptimizer
+    :members:
+    :noindex:
+
 Adadelta
 --------------
 
 ..  autoclass:: paddle.fluid.optimizer.AdadeltaOptimizer
     :members:
     :noindex:
-
diff --git a/doc/fluid/api/regularizer.rst b/doc/fluid/api/regularizer.rst
index dc9740c46392567d314121ac401540b0e7382703..837c67111c6e98e6a3859be802addc20a1c64f2b 100644
--- a/doc/fluid/api/regularizer.rst
+++ b/doc/fluid/api/regularizer.rst
@@ -25,3 +25,16 @@ L2Decay
     :members:
     :noindex:
 
+L1DecayRegularizer
+---------------------
+
+..  autoclass:: paddle.fluid.regularizer.L1DecayRegularizer
+    :members:
+    :noindex:
+
+L2DecayRegularizer
+---------------------
+
+..  autoclass:: paddle.fluid.regularizer.L2DecayRegularizer
+    :members:
+    :noindex:
diff --git a/doc/fluid/design/algorithm/parameter_average.md b/doc/fluid/design/algorithm/parameter_average.md
index 940d37fb31dcd0c50ea6c4c42b052d7cb23a9c47..340bc302d57429a9bf10a9d23ed9b0cdc7a2a568 100644
--- a/doc/fluid/design/algorithm/parameter_average.md
+++ b/doc/fluid/design/algorithm/parameter_average.md
@@ -49,9 +49,9 @@ In the new design, we propose to create a new operation for averaging parameter
 - the optimizer
 - the window_size to keep the updates
 
-The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU.
+The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU.
 
-The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API.
+The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API.
 
 ### Python API implementation for ParameterAverageOptimizer
 
@@ -59,8 +59,8 @@ Based on Polyak and Juditsky (1992), we can generalize the averaging of updates
 - Any optimizer (RMSProp , AdaGrad etc.)
 - A window size. The op keeps accumulating updated parameter values over a window of N batches and takes an average. Move the averaged value to a buffer when window is full to avoid loss of precision.
 
-Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions.
-We will have a wrapper written in Python that will support the functionality and implement the actual core computation in C++ core as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.cc)
+Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions.
+We will have a wrapper written in Python that will support the functionality and implement the actual core computation in C++ core as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.cc)
 
 #### Creation of the ParameterAverageOptimizer operator
 There are two ways for creating the ParameterAverageOptimizer op:
@@ -71,4 +71,4 @@ The proposal is to add the op immediately while building the computation graph.
 
 #### High-level API
 
-In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions.
+In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions.
diff --git a/doc/fluid/design/concepts/block.md b/doc/fluid/design/concepts/block.md
index 3b626bd89cd83a9428997abccfeeebbbbdbb3d38..3757cd055c818be1e63ee8c0f000f4dd299b59f4 100644
--- a/doc/fluid/design/concepts/block.md
+++ b/doc/fluid/design/concepts/block.md
@@ -113,7 +113,7 @@ if (cond) {
 
 ```
 
-An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](./if_else_op.md) is as follows:
+An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](../execution/if_else_op.md) is as follows:
 
 ```python
 import paddle as pd
@@ -140,7 +140,7 @@ The difference is that variables in the C++ program contain scalar values, where
 
 ### Blocks with `for` and `RNNOp`
 
-The following RNN model in PaddlePaddle from the [RNN design doc](./rnn.md) :
+The following RNN model in PaddlePaddle from the [RNN design doc](../dynamic_rnn/rnn.md) :
 
 ```python
 x = sequence([10, 20, 30]) # shape=[None, 1]
diff --git a/doc/fluid/design/concepts/executor.md b/doc/fluid/design/concepts/executor.md
index 2d4b371cc56db82ce5747da6db07f05aa7f7e6c1..3fcddf4dd90f826ee1a16713f4371fb010f8eac5 100644
--- a/doc/fluid/design/concepts/executor.md
+++ b/doc/fluid/design/concepts/executor.md
@@ -1,7 +1,7 @@
 # Executor Design Doc
 
 ## Motivation
-In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
+In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
 [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
 
 The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains the intrinsics (operators in this case) and variables which will be used, executor explicitly executes the stored precompiled code.
diff --git a/doc/fluid/design/concepts/program.md b/doc/fluid/design/concepts/program.md
index bd2456787c4e336d357a65255a8274a7c9e465cc..cfcd21ecdb9d2844bf93ed98a56db09651077c40 100644
--- a/doc/fluid/design/concepts/program.md
+++ b/doc/fluid/design/concepts/program.md
@@ -4,7 +4,7 @@
 
 A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`.
 
-A simple example PaddlePaddle program can be found in [graph.md](./graph.md):
+A simple example PaddlePaddle program can be found in [graph.md](../others/graph.md):
 
 ```python
 x = layer.data("images")
diff --git a/doc/fluid/design/concurrent/concurrent_programming.md b/doc/fluid/design/concurrent/concurrent_programming.md
index 1859f983e9133674e69ecd506d7683ea926b2b8f..0428e74f9e00a87f6b0972057f48479b8ae56ad6 100644
--- a/doc/fluid/design/concurrent/concurrent_programming.md
+++ b/doc/fluid/design/concurrent/concurrent_programming.md
@@ -1,6 +1,6 @@
 # Design Doc: Concurrent Programming with Fluid
 
-With PaddlePaddle Fluid, users describe a program other than a model.  The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, but not the program that trains/uses the model.   
+With PaddlePaddle Fluid, users describe a program other than a model.  The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, but not the program that trains/uses the model.   
 
 Many know that when we program TensorFlow, we can specify the device on which each operator runs.  This allows us to create a concurrent/parallel AI application.   An interesting questions is **how does a `ProgramDesc` represents a concurrent program?**  
 
@@ -28,19 +28,19 @@ The following table compares concepts in Fluid and Go
 <tr>
 <td>control-flow and built-in functions </td>
 <td>
-<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators">intrinsics/operators</a></td>
+<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators">intrinsics/operators</a></td>
 <td></td>
 </tr>
 <tr>
 <td>goroutines, channels </td>
 <td>
-<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h">class ThreadPool</a></td>
+<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework/thread_pool.h">class ThreadPool</a></td>
 <td></td>
 </tr>
 <tr>
 <td>runtime </td>
 <td>
-<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h">class Executor</a></td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h">class Executor</a></td>
 <td></td>
 </tr>
 </tbody>
@@ -78,7 +78,7 @@ message ProgramDesc {
 }
 ```
 
-Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in above `ProgramDesc` message.
+Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in above `ProgramDesc` message.
 
 The default `main` function is defined as follows:
 
@@ -146,7 +146,7 @@ An explanation of the above program:
 
 - `fluid.k8s` is a package that provides access to Kubernetes API.  
 - `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod).  
-- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h).  `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
+- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/lod_tensor_array.h).  `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
 
   1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
   2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread  
@@ -175,7 +175,7 @@ where
   1. listens on the current pod's IP address, as returned by `fliud.k8s.self_addr()`,
   2. once a connection is established,
      1. creates a scope of two parameters, "input" and "output",
-     2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h) and saves it into "input",
+     2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h) and saves it into "input",
      3. creates an Executor instance and calls `Executor.Run(block)`, where the block is generated by running the lambda specified as the second parameter of `fluid.listen_and_do`.
 
 ## Summarization
diff --git a/doc/fluid/design/dist_train/distributed_architecture.md b/doc/fluid/design/dist_train/distributed_architecture.md
index 229cb47c17d633be6848bb35e58d33ec9b47ec3b..371bbeebf7559eccc77ba0eea4f6f87a1bc5b54a 100644
--- a/doc/fluid/design/dist_train/distributed_architecture.md
+++ b/doc/fluid/design/dist_train/distributed_architecture.md
@@ -177,7 +177,7 @@ The local training architecture will be the same as the distributed training arc
 ### Training Data
 
 In PaddlePaddle v0.10.0, training data is typically read
-with [data reader](../reader/README.md) from Python. This approach is
+with [data reader](./README.md) from Python. This approach is
 no longer efficient when training distributedly since the Python
 process no longer runs on the same node with the trainer processes,
 the Python reader will need to read from the distributed filesystem
diff --git a/doc/fluid/design/dist_train/mpi_enabled_design.md b/doc/fluid/design/dist_train/mpi_enabled_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..4ad3afc7b7522c60460c6f1f387f9415d3738778
--- /dev/null
+++ b/doc/fluid/design/dist_train/mpi_enabled_design.md
@@ -0,0 +1,46 @@
+# MPI-enabled PaddlePaddle Design doc
+
+# Background
+When we do distribute multi GPU training, the communication overhead between servers become the major bottleneck, because of the following reasons:
+1. Must copy at least once from GPU to CPU memory so that the data can be ready to transfer. And for the pserver side, copy data from CPU to GPU introduce more overhead.
+2. GPU->CPU data transfer is 10 times slower than data transfer between GPUs or between PCIe devices.
+3. TCP connections can not make full use of RDMA 100Gb devices.
+
+We will use OpenMPI API to PaddlePaddle, which can bring two benefits to PaddlePaddle:
+1. Enable RDMA with PaddlePaddle, which bring high-performance low latency networks.
+2. Enable GPUDriect with PaddlePaddle, which bring the highest throughput and lowest latency GPU read and write.
+
+# Change list
+* Compile args: Need add compile args to enable MPI support.
+* Execute args:  Need add execute args to assign when and how to use MPI operations.
+* New ops:  Need new op  ```mpi_send_op``` and ```mpi_listenandserve_op``` to support MPI send and receive.
+* Transpiler optimized: Which can add   ```mpi_send_op``` and ```mpi_listenandserve_op```  to the running graph.
+* MPI utils package: Need MPI utils package as the low-level API supported.
+
+## Compile args
+Because MPI or CUDA need hardware supported, so we will add compile args to enable MPI support and control compiling.Add ```WITH_MPI```  compile args to control MPI to use or not. If the  ```WITH_MPI``` is ```ON```, compile system will find openMPI codes in configuration. We should prepare openMPI environment before compiling.
+
+## Execute args
+Launch the script using the ```mpirun``` launcher, For example: ```mpirun -np 3 -hosts node1,node2,node3 python train.py```. By doing this, We can number the actors (trainer/pserver/master) with o .. (n-1). The node's number is the Rank of the calling process in a group of comm (integer),  The MPI processes identify each other using a Rank ID. We have to create a mapping between PaddlePaddle's nodes and their Rank ID so that we can communicate with the correct destinations when using MPI operations.
+
+## New ops
+We won't replace all the gRPC requests to MPI requests,  the standard gRPC library is used for all administrative operations and the MPI API will be used to transfer tensor or selectRows to Pservers. The base of this idea, we create two new operators to handle requests and receives,  the two operators are ```mpi_send_op``` and ```mpi_listenandserve_op```. They are a little similar to [send_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/send_op.cc) and [listen_and_serv_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/listen_and_serv_op.cc), also, We will build a new module to package MPI send and receive process.
+
+### mpi_send_op
+Very similar with ```send_op```, we will replace gRPC code which used to send gradient with ```mpi_module```, at the same time, we will wrap it with ```framework::Async```.
+
+### mpi_listenandserve_op
+Very similar with ```listen_and_serv_op```, we will replace gRPC code which used to receive gradient with ```mpi_module```, at the same time, we will wrap it with ```framework::Async```.
+
+## Transpiler optimized
+**We can get env ```OMPI_COMM_WORLD_SIZE``` and ```OMPI_COMM_WORLD_RANK``` to distinguish use MPI or not, If we use openMPI, the variable in env must exist.**
+ if  confirm to use MPI, we will modify  ```send_op``` to ```mpi_send_op``` in distribute_transpiler, and modify ```listenandserve_op``` to ```mpi_listenandserve_op``` also.
+
+## MPI utils package
+In this package, We will write openMPI low-level API to use MPI.
+The API included in this package are:
+* MPI send and receive module, We will build a new module to package MPI send and receive process. MPI send and receive are different to gRPC, the MPI [recvice](https://www.open-mpi.org/doc/v1.8/man3/MPI_Irecv.3.php) must know receive buffer size and receive buffer element. For this reason, We have to make communications twice, the first one is to send metadata about gradient through gRPC, the second one is the real communication through MPI which send gradient data to mpi_listenandserve_op.
+The detailed flow is below:
+![](https://github.com/seiriosPlus/Paddle/blob/mpi_enabled/doc/fluid/design/dist_train/src/mpi_module.png)
+* MPI global configurations, which store the Rank ID and the mapping in global variables, for example:
+gRPC client : MPI nodes :``` 127.0.0.1:32004 : 3 ```
diff --git a/doc/fluid/design/dist_train/parameter_server.md b/doc/fluid/design/dist_train/parameter_server.md
index 73c85da5e89eee0ac7857a0b808bc64ae673fdad..563b70bc0e852bec953eb40dda3c46b3d45d7e68 100644
--- a/doc/fluid/design/dist_train/parameter_server.md
+++ b/doc/fluid/design/dist_train/parameter_server.md
@@ -65,7 +65,7 @@ For embedding layers, the gradient may have many rows containing only 0 when tra
 if the gradient uses a dense tensor to do parameter optimization,
 it could spend unnecessary memory, slow down the calculations and waste
 the bandwidth while doing distributed training.
-In Fluid, we introduce [SelectedRows](../selected_rows.md) to represent a list of rows containing
+In Fluid, we introduce [SelectedRows](../modules/selected_rows.md) to represent a list of rows containing
 non-zero gradient data. So when we do parameter optimization both locally and remotely,
 we only need to send those non-zero rows to the optimizer operators:
 
diff --git a/doc/fluid/design/dist_train/src/mpi_module.png b/doc/fluid/design/dist_train/src/mpi_module.png
new file mode 100644
index 0000000000000000000000000000000000000000..e6b6a3e5d6f68baeeb67d7f71154bd8d85f32b6f
Binary files /dev/null and b/doc/fluid/design/dist_train/src/mpi_module.png differ
diff --git a/doc/fluid/design/dynamic_rnn/rnn.md b/doc/fluid/design/dynamic_rnn/rnn.md
index 7b61b050f640814d6949cf6847b431da53d59581..b39ae0675c45e56852293d97f45e91861cf31667 100644
--- a/doc/fluid/design/dynamic_rnn/rnn.md
+++ b/doc/fluid/design/dynamic_rnn/rnn.md
@@ -22,7 +22,7 @@ There are several important concepts here:
 There could be local variables defined in each step-net.  PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
 
 <p align="center">
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/rnn.png"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/rnn.png"/><br/>
 Figure 2 illustrates the RNN's data flow
 </p>
 
@@ -93,7 +93,7 @@ For example, we could have a 2-level RNN, where the top level corresponds to par
 The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text.
 
 <p align="center">
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/2_level_rnn.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/rnn.png"/>
 </p>
 
 ```python
@@ -149,5 +149,5 @@ If the `output_all_steps` is set to False, it will only output the final time st
 
 
 <p align="center">
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/rnn_2level_data.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/rnn_2level_data.png"/>
 </p>
diff --git a/doc/fluid/design/index_cn.rst b/doc/fluid/design/index_cn.rst
index e9f55214f411abb11bef180d7af4716ad85a0b09..31b62a5eb3cd9b5b68d51abcd001fd5b8c39a914 100644
--- a/doc/fluid/design/index_cn.rst
+++ b/doc/fluid/design/index_cn.rst
@@ -9,7 +9,7 @@
   concepts/index_cn.rst
   data_type/index_cn.rst
   memory/index_cn.rst
-  muti_devices/index_cn.rst
+  multi_devices/index_cn.rst
   dynamic_rnn/index_cn.rst
   concurrent/index_cn.rst
   algorithm/index_cn.rst
diff --git a/doc/fluid/design/index_en.rst b/doc/fluid/design/index_en.rst
index 2802dc3a31d540c5a19bf9042053496aad152f98..2bfee02ad4626633b08ddff747e2886faf9ba99f 100644
--- a/doc/fluid/design/index_en.rst
+++ b/doc/fluid/design/index_en.rst
@@ -9,7 +9,7 @@ Design
   concepts/index_en.rst
   data_type/index_en.rst
   memory/index_en.rst
-  muti_devices/index_en.rst
+  multi_devices/index_en.rst
   dynamic_rnn/index_en.rst
   concurrent/index_en.rst
   algorithm/index_en.rst
diff --git a/doc/fluid/design/modules/python_api.md b/doc/fluid/design/modules/python_api.md
index f83ad3b6a4e8b4d82d8fe8d4154a2739a9b9628b..265732a348ea77d21005e335390d99abcdfbd045 100644
--- a/doc/fluid/design/modules/python_api.md
+++ b/doc/fluid/design/modules/python_api.md
@@ -36,7 +36,7 @@ Please be aware that these Python classes need to maintain some construction-tim
 
 ### Program
 
-A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s.  The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` onlys stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array.  For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks.
+A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), which is composed of an array of `BlockDesc`s.  The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` onlys stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array.  For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks.
 
 Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`.
 
@@ -70,7 +70,7 @@ class Program(objects):
 
 ### Block
 
-A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) includes
+A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md) includes
 
 1. a map from variable names to an instance of the Python `Variable` class, and
 1. a list of `Operator` instances.
diff --git a/doc/fluid/design/modules/regularization.md b/doc/fluid/design/modules/regularization.md
index 8cd5ff71d193f03e1ac923724b52f28c6057d25d..519a9143033386678351ff78a465e5ba6e220c52 100644
--- a/doc/fluid/design/modules/regularization.md
+++ b/doc/fluid/design/modules/regularization.md
@@ -32,9 +32,9 @@ In the new design, we propose to create new operations for regularization. For n
 - L2_regularization_op
 - L1_regularization_op
 
-These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties.
+These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties.
 
-The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API.
+The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API.
 
 ### Computation Graph
 
@@ -48,7 +48,7 @@ The Python API will modify this computation graph to add regularization operator
    
 ### Python API implementation for Regularization
 
-Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions.
+Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions.
 
 #### Creation of Regularization ops
 There are two possibilities for creating the regularization ops:
@@ -63,4 +63,4 @@ Since we want to create the regularization ops in a lazy manner, the regularizat
 
 #### High-level API
 
-In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
+In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
diff --git a/doc/fluid/design/motivation/fluid_compiler.md b/doc/fluid/design/motivation/fluid_compiler.md
index 2a6beafc52e815fa067b273bb5887ddcf6ab15ae..6dd3840a0734e8593890dcf8044746197350c6f5 100644
--- a/doc/fluid/design/motivation/fluid_compiler.md
+++ b/doc/fluid/design/motivation/fluid_compiler.md
@@ -23,7 +23,7 @@ func paddlepaddle() {
 }
 ```
 
-This program consists of a [block](block.md) of three operators --
+This program consists of a [block](../concepts/block.md) of three operators --
 `read`, `assign`, and `mult`.  Its `ProgramDesc` message looks like
 the following
 
@@ -39,7 +39,7 @@ message ProgramDesc {
   }
 }
 ```
- 
+
 ## Transpilers
 
 We can write a transpiler program that takes a `ProgramDesc`, e.g.,
@@ -93,7 +93,7 @@ specific hardware platform, for example, the `mult` operator, the
 generated code should call its CUDA kernel:
 
 ```c++
-paddle::Tensor fluid_cuda_mult(const paddle::Tensor& a, 
+paddle::Tensor fluid_cuda_mult(const paddle::Tensor& a,
                                const paddle::Tensor& b) {
   paddle::Tensor t;
   paddle::operator::Mult m(a, b, ...);
@@ -107,4 +107,4 @@ where `cuda_context` could be a global variable of type
 ## Multi-Block Code Generation
 
 Most Fluid application programs may have more than one blocks.  To
-execute them, we need to trace [scopes](scope.md).
+execute them, we need to trace [scopes](../concepts/scope.md).
diff --git a/doc/fluid/design/motivation/refactorization.md b/doc/fluid/design/motivation/refactorization.md
index f199cc892f5e84f0a12abe3b8e5cace9849e7fa8..4e1d660cef6369f04db8e1e83360f6af25259f96 100644
--- a/doc/fluid/design/motivation/refactorization.md
+++ b/doc/fluid/design/motivation/refactorization.md
@@ -11,7 +11,7 @@ The goals of refactoring include:
 
 1. PaddlePaddle represents the computation, training and inference of Deep Learning models, by computation graphs.
 
-  1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a concrete example.
+  1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/others/graph.md) for a concrete example.
 
 1. Users write Python programs to describe the graphs and run them (locally or remotely).
 
@@ -28,7 +28,7 @@ The goals of refactoring include:
       1. the C++ library `libpaddle.so` for local execution,
       1. the master process of a distributed training job for training, or
       1. the server process of a Kubernetes serving job for distributed serving.
-   1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message.
+   1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L70), according to the protobuf message.
 
 ## Description and Realization of Computation Graph
 
@@ -48,16 +48,16 @@ At runtime, the C++ program realizes the graph and runs it.
 <tr>
 <td>Data</td>
 <td>
-<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107">VarDesc</a></td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L107">VarDesc</a></td>
 <td>
-<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24">Variable</a></td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24">Variable</a></td>
 </tr>
 <tr>
 <td>Operation </td>
 <td>
-<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35">OpDesc</a></td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L35">OpDesc</a></td>
 <td>
-<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64">Operator</a></td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L64">Operator</a></td>
 </tr>
 <tr>
 <td>Block </td>
@@ -85,7 +85,7 @@ The word *graph* is interchangeable with *block* in this document.  A graph cons
 
 1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following:
 
-   1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block,
+   1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/scope.md) for each run of a block,
       1. realize local variables defined in the BlockDesc message in the new scope,
       1. a scope is similar to the stack frame in programming languages,
 
@@ -195,7 +195,7 @@ Maintaining a map, whose key is the type name and the value is the corresponding
 ## Related Concepts
 
 ### Op_Maker
-It's constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
+It's constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L37))
 
 ### Register Macros
 ```cpp
@@ -236,7 +236,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
 * `Tensor` is an n-dimension array with type.
 	* Only dims and data pointers are stored in `Tensor`.
 	* All operations on `Tensor` are written in `Operator` or global functions.
-	* Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
+	* Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md)
 * `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`.
 	* `step_scopes` in RNN is a variable and not a tensor.
 * `Scope` is where variables are stored.
diff --git a/doc/fluid/design/muti_devices/index_cn.rst b/doc/fluid/design/multi_devices/index_cn.rst
similarity index 100%
rename from doc/fluid/design/muti_devices/index_cn.rst
rename to doc/fluid/design/multi_devices/index_cn.rst
diff --git a/doc/fluid/design/muti_devices/index_en.rst b/doc/fluid/design/multi_devices/index_en.rst
similarity index 100%
rename from doc/fluid/design/muti_devices/index_en.rst
rename to doc/fluid/design/multi_devices/index_en.rst
diff --git a/doc/fluid/design/muti_devices/kernel_hint_design.md b/doc/fluid/design/multi_devices/kernel_hint_design.md
similarity index 80%
rename from doc/fluid/design/muti_devices/kernel_hint_design.md
rename to doc/fluid/design/multi_devices/kernel_hint_design.md
index 58e44b64169d8c942174de86986403570b271641..6edc14ca73b1abf824981b59511a9aca4e0f3b47 100644
--- a/doc/fluid/design/muti_devices/kernel_hint_design.md
+++ b/doc/fluid/design/multi_devices/kernel_hint_design.md
@@ -1,7 +1,7 @@
 # Kernel Hint Design
 
 ## Problem
-In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this.
+In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this.
 
 In the current design, we use KernelType to describe one kernel.
 
@@ -14,7 +14,7 @@ struct KernelType {
 ```
  `place_` `data_type_` and `layout_` can be got from the input tensors of the operator, `GetActualKernelType(inputs)` use inputs to infer the proper kernel key that fit the incoming data, but users can not directly configure it.
 
-The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) also provides a virtual method `GetExpectedKernelType` that user can overload and use to choose the KernelType they want to use.
+The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md) also provides a virtual method `GetExpectedKernelType` that user can overload and use to choose the KernelType they want to use.
 
 So we should send the information user defined in proto to `GetExpectedKernelType` for choosing a kernel.
 
diff --git a/doc/fluid/design/muti_devices/kernel_selection.md b/doc/fluid/design/multi_devices/kernel_selection.md
similarity index 100%
rename from doc/fluid/design/muti_devices/kernel_selection.md
rename to doc/fluid/design/multi_devices/kernel_selection.md
diff --git a/doc/fluid/design/muti_devices/operator_kernel_type.md b/doc/fluid/design/multi_devices/operator_kernel_type.md
similarity index 97%
rename from doc/fluid/design/muti_devices/operator_kernel_type.md
rename to doc/fluid/design/multi_devices/operator_kernel_type.md
index f86e6b7a564ed23f2bddbec25da1c110014f941d..8c1bc8f76a337006497e5ab5e5a710f9f49261b8 100644
--- a/doc/fluid/design/muti_devices/operator_kernel_type.md
+++ b/doc/fluid/design/multi_devices/operator_kernel_type.md
@@ -8,7 +8,7 @@ struct OpKernelType {
   proto::DataType data_type_;
 };
 ```
-For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) in github.
+For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L348-L374) in github.
 
 It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys do not provide enough information. We need a more complete representation of `OpKernelType`.
 
diff --git a/doc/fluid/design/network/sequence_decoder.md b/doc/fluid/design/network/sequence_decoder.md
index f13d30ca9fe09c9525c711436f605bb280e11000..b95773c50ca0dcbd1b93529332e035d4de90faa8 100644
--- a/doc/fluid/design/network/sequence_decoder.md
+++ b/doc/fluid/design/network/sequence_decoder.md
@@ -11,7 +11,7 @@ In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` imp
 
 There are a lot of heuristic tricks in the sequence generation tasks, so the flexibility of sequence decoder is very important to users.
 
-During the refactoring of PaddlePaddle, some new concepts are proposed such as:  [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** .
+During the refactoring of PaddlePaddle, some new concepts are proposed such as:  [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** .
 
 For example, the RNN states, candidates IDs and probabilities of beam search can be represented all as `LoDTensors`;
 the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated.
diff --git a/doc/fluid/dev/contribute_to_paddle_cn.md b/doc/fluid/dev/contribute_to_paddle_cn.md
new file mode 120000
index 0000000000000000000000000000000000000000..955216ca62e71b4d3666e1662aa86c9495d2e7d6
--- /dev/null
+++ b/doc/fluid/dev/contribute_to_paddle_cn.md
@@ -0,0 +1 @@
+../../v2/dev/contribute_to_paddle_cn.md
\ No newline at end of file
diff --git a/doc/fluid/dev/contribute_to_paddle_en.md b/doc/fluid/dev/contribute_to_paddle_en.md
new file mode 120000
index 0000000000000000000000000000000000000000..f9fc68c37e17a8a365b0d7fae86c16b0d094631f
--- /dev/null
+++ b/doc/fluid/dev/contribute_to_paddle_en.md
@@ -0,0 +1 @@
+../../v2/dev/contribute_to_paddle_en.md
\ No newline at end of file
diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
index ad798003f560e7fb0e6db6083fdd152fd3417584..37e608160db0ad5a92297987937bbbfa8f842ea8 100644
--- a/doc/fluid/dev/index_cn.rst
+++ b/doc/fluid/dev/index_cn.rst
@@ -4,6 +4,8 @@
 .. toctree::
   :maxdepth: 1
 
+  contribute_to_paddle_cn.md
+  write_docs_cn.md
   api_doc_std_cn.md
   new_op_cn.md
   new_op_kernel.md
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
index 80c899a82fa452c5cd8f38dad89c15d3041b09e3..d7f83035010f13c30514673ecbee301f194dc175 100644
--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
@@ -4,6 +4,8 @@ Development
 .. toctree::
   :maxdepth: 1
 
+  contribute_to_paddle_en.md
+  write_docs_en.md
   api_doc_std_en.md
   new_op_en.md
   new_op_kernel.md
diff --git a/doc/fluid/dev/name_convention.md b/doc/fluid/dev/name_convention.md
index 75830ef28c67dc4694d899efe503084b7b5852e1..6b4244d0f506c8cd6c08739141eabad27c581ca7 100644
--- a/doc/fluid/dev/name_convention.md
+++ b/doc/fluid/dev/name_convention.md
@@ -4,7 +4,7 @@ To make the operator document itself more clear, we recommend operator names obe
 
 ## OpProtoMaker names
 
-When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L170) (TODO: OpProtoMaker Doc)need to be defined. All the Input/Output and Attributes will write into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L61) , and will be used in client language to create operator.
+When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L170) (TODO: OpProtoMaker Doc)need to be defined. All the Input/Output and Attributes will write into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L61) , and will be used in client language to create operator.
 
 - Input/Output.
   - Input/Output names follow the **CamelCase**. e.g. `X`, `Y`, `Matrix`, `LastAxisInMatrix`. Input/Output much more like Variables, we prefer to meaningful English words.
diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md
index 0c3f88d9c31e05bec399c64bf6ade56e62e01f68..587d819f79fcf82549826359fbf04ad3af404446 100644
--- a/doc/fluid/dev/new_op_cn.md
+++ b/doc/fluid/dev/new_op_cn.md
@@ -54,10 +54,10 @@
 </table>
 
 
-实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
+实现新的op都添加至目录[paddle/fluid/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
 
 
-下面以矩阵乘操作，即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
+下面以矩阵乘操作，即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
 
 
 ## 实现C++类
@@ -85,17 +85,17 @@ The equation is: Out = X * Y
 };
 ```
 
-[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`，构造函数含有2个参数：
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127)继承自`framework::OpProtoAndCheckerMaker`，构造函数含有2个参数：
 
    - `framework::OpProto` ： 前者存储Op的输入输出和参数属性，将用于Python API接口的生成。
    - `framework::OpAttrChecker` ：后者用于检查参数属性的合法性。
 
 构造函数里通过`AddInput`添加输入参数，通过`AddOutput`添加输出参数，通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`中。
 
-上面的代码在`MulOp`中添加两个输入`X`和`Y`，添加了一个输出`Out`，并解释了各自含义，命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md)。
+上面的代码在`MulOp`中添加两个输入`X`和`Y`，添加了一个输出`Out`，并解释了各自含义，命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md)。
 
 
-再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)为例：
+再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55)为例：
 
 ```cpp
 template <typename AttrType>
@@ -103,21 +103,21 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input tensor of scale operator.").NotInGradient();
-    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
-    AddComment(R"DOC(Scale operator
-The equation is: Out = scale*X
+    AddInput("X", "(Tensor) Input tensor of scale operator.");
+    AddOutput("Out", "(Tensor) Output tensor of scale operator.");
+    AddComment(R"DOC(
+Scale operator
+$$Out = scale*X$$
 )DOC");
-    AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
+    AddAttr<AttrType>("scale",
+                      "(float, default 1.0)"
+                      "The scaling factor of the scale operator.")
+        .SetDefault(1.0);
   }
 };
 ```
 
-这个例子有两处不同：
-
-- `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中，如果Op的某个输入不参与反向梯度的计算，请显示地调用`.NotInGradient()`进行设置。
-
-- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。
+这个例子有`AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。
 
 
 ### 定义Operator类
@@ -147,7 +147,7 @@ class MulOp : public framework::OperatorWithKernel {
 };
 ```
 
-[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员：
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员：
 
 ```cpp
 using framework::OperatorWithKernel::OperatorWithKernel;
@@ -173,7 +173,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
 `MulKernel`继承自`framework::OpKernel`，带有下面两个模板参数:
 
-- `typename DeviceContext`: 表示设备类型，不同设备(CPU、CUDA)共享同一个Kernel时，需加该模板参数，不共享则不加，一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
+- `typename DeviceContext`: 表示设备类型，不同设备(CPU、CUDA)共享同一个Kernel时，需加该模板参数，不共享则不加，一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。
 
 - `typename T` : 表示数据类型，如`float`, `double`等。
 
@@ -201,10 +201,9 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
 需要注意：**不同设备(CPU、CUDA)共享一个Op定义，是否则共享同一个`OpKernel`，取决于`Compute`调用的函数是否支持不同设备。**
 
-`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考：[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
-
-为了使`OpKernel`的计算过程书写更加简单，并且CPU、CUDA的代码可以复用，我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。
+`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考：[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。
 
+为了使`OpKernel`的计算过程书写更加简单，并且CPU、CUDA的代码可以复用，我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_cn.md)。
 
 到此，前向Op实现完成。接下来，需要在`.cc`文件中注册该op和kernel。
 反向Op类的定义，反向OpKernel的定义与前向Op类似，这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。
@@ -215,7 +214,9 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
     ```cpp
     namespace ops = paddle::operators;
-    REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+    REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>)
+    REGISTER_OPERATOR(mul_grad, ops::MulGradOp)
     REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
     REGISTER_OP_CPU_KERNEL(mul_grad,
                   ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
@@ -223,8 +224,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
    在上面的代码中：
 
-    - `REGISTER_OP` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulOpGrad`，类型名为`mul_grad`。
-    - `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op。
+    - `REGISTER_OPERATOR` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulOpGrad`，类型名为`mul_grad`。
     - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulGradKernel`类。
 
 
@@ -255,7 +255,7 @@ make mul_op
 
 ## 实现单元测试
 
-单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。
+单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py)。
 
 ### 前向Operator单测
 
@@ -315,7 +315,7 @@ Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp
 
 ### 编译和执行
 
-`python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。
+`python/paddle/fluid/tests/unittests/` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。
 
 请注意，**不同于Op的编译测试，运行单元测试测时需要编译整个工程**，并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON`。编译成功后，执行下面的命令来运行单元测试：
 
@@ -331,7 +331,6 @@ ctest -R test_mul_op
 
 ## 注意事项
 
-- 为每个Op创建单独的`*_op.h`（如有）、`*_op.cc`和`*_op.cu`（如有）。不允许一个文件中包含多个Op，这将会导致编译出错。
-- 注册Op时的类型名，需要和该Op的名字一样。即不允许在`A_op.cc`里面，注册`REGISTER_OP(B, ...)`等，这将会导致单元测试出错。
+- 注册Op时的类型名，需要和该Op的名字一样。即不允许在`A_op.cc`里面，注册`REGISTER_OPERATOR(B, ...)`等，这将会导致单元测试出错。
 - 如果Op没有实现CUDA Kernel，请不要创建空的`*_op.cu`，这将会导致单元测试出错。
 - 如果多个Op依赖一些共用的函数，可以创建非`*_op.*`格式的文件来存放，如`gather.h`文件。
diff --git a/doc/fluid/dev/new_op_en.md b/doc/fluid/dev/new_op_en.md
index a566a09131f86251b70d5435d0a483aa2a705b35..f8de271ed4e5e0fb4018478bffd4b525d4319738 100644
--- a/doc/fluid/dev/new_op_en.md
+++ b/doc/fluid/dev/new_op_en.md
@@ -26,13 +26,6 @@ Here are the base types needed. For details, please refer to the design docs.
 Operators can be categorized into two groups: operator with kernel(s) and operator without kernel(s). An operator with kernel(s) inherits from `OperatorWithKernel` while the one without kernel(s) inherits from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information:
 
 
- Information           | Where is it defined
---------------  | :----------------------
-OpProtoMake definition  | `.cc`files, Backward Op does not need an OpProtoMake interface.
-Op definition           | `.cc` files
-Kernel implementation       | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.
-Registering the Op           | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
-
 <table>
 <thead>
 <tr>
@@ -61,10 +54,10 @@ Registering the Op           | Ops are registered in `.cc` files; For Kernel reg
 </table>
 
 
-New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
+New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
 
 
-Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel.
+Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel.
 
 
 ## Implementing C++ Types
@@ -92,17 +85,17 @@ The equation is: Out = X * Y
 };
 ```
 
-[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)is inherited from`framework::OpProtoAndCheckerMaker`, consisting of 2 variables in the constructor：
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127)is inherited from`framework::OpProtoAndCheckerMaker`, consisting of 2 variables in the constructor：
 
    - `framework::OpProto` stores Operator input and variable attribute, used for generating Python API interfaces.
    - `framework::OpAttrChecker` is used to validate variable attributes.
 
 The constructor utilizes `AddInput`, `AddOutput`, and `AddComment`, so that the corresponding information will be added to `OpProto`.
 
-The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance to Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md).
+The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance to Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md).
 
 
-An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37) is implemented as follows:
+An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55) is implemented as follows:
 
 ```cpp
 template <typename AttrType>
@@ -120,11 +113,7 @@ The equation is: Out = scale*X
 };
 ```
 
-There are two changes in this example:
-
-- `AddInput("X","...").NotInGradient()` expresses that input `X` is not involved in `ScaleOp`'s corresponding computation. If an input to an operator is not participating in back-propagation, please explicitly set `.NotInGradient()`.
-
-- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);`  adds `scale`constant as an attribute, and sets the default value to 1.0.
+Note `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` adds `scale`constant as an attribute, and sets the default value to 1.0.
 
 
 ### Defining Operator
@@ -154,7 +143,7 @@ class MulOp : public framework::OperatorWithKernel {
 };
 ```
 
-[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22) is inherited from `OperatorWithKernel`. Its `public` member
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L24) is inherited from `OperatorWithKernel`. Its `public` member
 
 ```cpp
 using framework::OperatorWithKernel::OperatorWithKernel;
@@ -180,7 +169,7 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
 
 `MulKernel` inherits `framework::OpKernel`, which includes the following templates:
 
-- `typename  DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+- `typename  DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43).
 
 - `typename T` denotes data type, such as `float` or `double`.
 
@@ -209,9 +198,9 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
 
 Note that **different devices (CPU, CUDA)share one Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions can support both devices.**
 
-`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.cc).
 
-To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md).
+To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_en.md).
 
 
 This concludes the forward implementation of an operator. Next its operation and kernel need to be registered in a `.cc` file.
@@ -224,7 +213,9 @@ The definition of its corresponding backward operator, if applicable, is similar
 
     ```cpp
     namespace ops = paddle::operators;
-    REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+    REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>)
+    REGISTER_OPERATOR(mul_grad, ops::MulGradOp)
 
     REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
     REGISTER_OP_CPU_KERNEL(mul_grad,
@@ -233,9 +224,8 @@ The definition of its corresponding backward operator, if applicable, is similar
 
    In that code block,
 
-    - `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
+    - `REGISTER_OPERATOR` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
     - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
-
     - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
 
 
@@ -275,7 +265,7 @@ Unit tests for an operator include
 
 3. a scaling test for the backward operator.
 
-Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py).
+Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py).
 
 ### Testing Forward Operators
 
@@ -339,7 +329,7 @@ Some key points in checking gradient above include:
 ### Compiling and Running
 
 
-Any new unit testing file of the format `test_*.py`  added to the director `python/paddle/v2/framework/tests` is automatically added to the project to compile.
+Any new unit testing file of the format `test_*.py`  added to the director `python/paddle/fluid/tests/unittests/` is automatically added to the project to compile.
 
 Note that **unlike the compile test for Ops, running unit tests requires compiling the entire project** and requires compiling with flag `WITH_TESTING` on i.e. `cmake paddle_dir -DWITH_TESTING=ON`.
 
@@ -357,7 +347,6 @@ ctest -R test_mul_op
 
 ## Remarks
 
-- Every `*_op.h` (if applicable), `*_op.cc`, and `*_op.cu` (if applicable) must be created for a unique Op. Compiling will fail if multiple operators are included per file.
-- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OP(B, ...)` in `A_op.cc` will cause unit testing failures.
+- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OPERATOR(B, ...)` in `A_op.cc` will cause unit testing failures.
 - If the operator does not implement a CUDA kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail.
 - If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`.
diff --git a/doc/fluid/dev/new_op_kernel.md b/doc/fluid/dev/new_op_kernel.md
index 55dea8d0a39232ede59d4663d6e1a47fbfc60853..87e617d44041bde9c9051151878ffb4304689b3c 100644
--- a/doc/fluid/dev/new_op_kernel.md
+++ b/doc/fluid/dev/new_op_kernel.md
@@ -4,13 +4,13 @@
 
 PaddlePaddle Fluid have hundreds of operators.  Each operator could have one or more kernels.  A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU.
 
-[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) explains how to add an operator, and its kernels.  The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md).  An operator chooses the right kernel at runtime.  This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md).
+[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md) explains how to add an operator, and its kernels.  The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/multi_devices/operator_kernel_type.md).  An operator chooses the right kernel at runtime.  This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md).
 
 ## Write Kernels for A New Device
 
 ### Add A New Device
 
-  For some historical reaons, we misuse the word *library* for *device*.  For example, we call the deivce type by *library type*.  An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/library_type.h#L24).  We will correct this ASAP.
+  For some historical reaons, we misuse the word *library* for *device*.  For example, we call the deivce type by *library type*.  An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/library_type.h#L24).  We will correct this ASAP.
 
 To register a new device, we need to add an enum value to `LibraryType`:
 
@@ -23,9 +23,9 @@ enum class LibraryType {
 ```
 
 
-### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53)
+### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L53)
 
-If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53). For example `CUDAPlace`:
+If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L53). For example `CUDAPlace`:
 
 ```cpp
 struct CUDAPlace {
@@ -45,8 +45,8 @@ struct CUDAPlace {
 typedef boost::variant<CUDAPlace, CPUPlace> Place;
 ```
 
-### Add [device context]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37))
-After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37) for it.
+### Add [device context]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h#L37))
+After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h#L37) for it.
 
 ```cpp
 class DeviceContext {
@@ -58,9 +58,9 @@ class DeviceContext {
 };
 ```
 
-### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device.
+### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L351) for your Device.
 
-A detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md)
+A detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md)
 
 ```cpp
 class OpKernelBase {
@@ -101,7 +101,7 @@ REGISTER_OP_KERNEL(
 
 kernel0, kernel1 are kernels that have the same `op_type`, `library_type`, `place_type` but different `data_types`.
 
-take [`conv2d`]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/conv_cudnn_op.cu.cc#L318)) as an example:
+take [`conv2d`]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/conv_cudnn_op.cu.cc#L318)) as an example:
 
 	```cpp
 	REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace,
diff --git a/doc/fluid/dev/support_new_device.md b/doc/fluid/dev/support_new_device.md
index 8983df900460127fc130043c52373dab505363ba..051a463cfcf97df2e2d5b6a880923ca70fefbd6e 100644
--- a/doc/fluid/dev/support_new_device.md
+++ b/doc/fluid/dev/support_new_device.md
@@ -13,7 +13,7 @@ So, how to support a new Device/Library in Fluid becomes a challenge.
 
 ## Basic: Integrate A New Device/Library
 
-For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/read_source.md).
+For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/read_source.md).
 
 There are mainly three parts that we have to consider while integrating a new device/library:
 
@@ -28,7 +28,7 @@ There are mainly three parts that we have to consider while integrating a new de
 Please note that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.
 
 #### Place
-Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.
 
 ```
         |   CPUPlace
@@ -44,7 +44,7 @@ typedef boost::variant<CUDAPlace, CPUPlace, FPGAPlace> Place;
 
 #### DeviceContext
 
-Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different libraries, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
+Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/fluid/paddle/platform/device_context.h#L30) to manage the resources in different libraries, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
 
 
 ```
@@ -73,7 +73,7 @@ class CUDADeviceContext : public DeviceContext {
   Place GetPlace() const override { return place_; }
 private:
   CUDAPlace place_;
-  cudaStream_t stream_; 
+  cudaStream_t stream_;
   cublasHandle_t cublas_handle_;
   std::unique_ptr<Eigen::GpuDevice> eigen_device_;  // binds with stream_
 };
@@ -84,7 +84,7 @@ private:
 
 #### memory module
 
-Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/memory/memory.h#L36):
+Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/memory/memory.h#L36):
 
 ```
 template <typename Place>
@@ -102,7 +102,7 @@ To implement these interfaces, we have to implement MemoryAllocator for differen
 
 #### Tensor
 
-[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h#L36) holds data with some shape in a specific Place.
+[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/tensor.h#L36) holds data with some shape in a specific Place.
 
 ```cpp
 class Tensor {
@@ -161,7 +161,7 @@ t.mutable_data(place);
 
 Fluid implements computing units based on different DeviceContexts. Some computing units are shared between operators. This common part will be put in operators/math directory as basic Functors.
 
-Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example:
+Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/math/maxouting.h#L27) as an example:
 
 The interface is defined in the header file.
 
@@ -210,7 +210,7 @@ The implementation of `OpKernel` is similar to math functors, the extra thing we
 Fluid provides different register interfaces in op_registry.h
 
 
-Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/crop_op.cc#L134) operator as an example:
+Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/crop_op.cc#L134) operator as an example:
 
 In .cc file:
 
@@ -236,5 +236,5 @@ Generally, we will implement OpKernel for all Device/Library of an Operator. We
 
 For more details, please refer to following docs:
 
-- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md)
-- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md)
+- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/multi_devices/operator_kernel_type.md)
+- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md)
diff --git a/doc/fluid/dev/write_docs_cn.rst b/doc/fluid/dev/write_docs_cn.rst
new file mode 120000
index 0000000000000000000000000000000000000000..2c281eaaf43bbfad84c3be9ed1d1bd0dbc77fa9b
--- /dev/null
+++ b/doc/fluid/dev/write_docs_cn.rst
@@ -0,0 +1 @@
+../../v2/dev/write_docs_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/dev/write_docs_en.rst b/doc/fluid/dev/write_docs_en.rst
new file mode 120000
index 0000000000000000000000000000000000000000..cb2b9b0ff1f1d9e0e5201d160f6b7d9d451374e2
--- /dev/null
+++ b/doc/fluid/dev/write_docs_en.rst
@@ -0,0 +1 @@
+../../v2/dev/write_docs_en.rst
\ No newline at end of file
diff --git a/doc/v2/api/data/data_reader.rst b/doc/v2/api/data/data_reader.rst
index 2ccfec9c284877a7576e9751526b169a4ac78d8e..d7c896a6270b488ca4449e5211d0d0879eda6ac5 100644
--- a/doc/v2/api/data/data_reader.rst
+++ b/doc/v2/api/data/data_reader.rst
@@ -6,7 +6,43 @@ Data Reader Interface
 DataTypes
 =========
 
-..  automodule:: paddle.v2.data_type
+..  autofunction:: paddle.v2.data_type.dense_array
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.integer_value
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.integer_value_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.integer_value_sub_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_binary_vector
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_float_vector
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_float_vector_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_non_value_slot
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_value_slot
+    :noindex:
+
+..  autoclass:: paddle.v2.data_type.InputType
     :members:
     :noindex:
 
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md b/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
deleted file mode 120000
index c44cd9a731bed7067cdf19aa2f714abdce6c736a..0000000000000000000000000000000000000000
--- a/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
+++ /dev/null
@@ -1 +0,0 @@
-k8s_aws_en.md
\ No newline at end of file
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md b/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..afc753aa42f19631c49a451a797f28365e65ed1d
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
@@ -0,0 +1,672 @@
+# Kubernetes on AWS
+
+我们将向你展示怎么样在AWS的Kubernetes集群上运行分布式PaddlePaddle训练，让我们从核心概念开始
+
+## PaddlePaddle分布式训练的核心概念
+
+### 分布式训练任务
+
+一个分布式训练任务可以看做是一个Kubernetes任务
+每一个Kubernetes任务都有相应的配置文件，此配置文件指定了像任务的pod个数之类的环境变量信息
+
+在分布式训练任务中，我们可以如下操作：
+
+1. 在分布式文件系统中，准备分块数据和配置文件（在此次教学中，我们会用到亚马逊分布式存储服务（EFS））
+2. 创建和提交一个kubernetes任务配置到集群中开始训练
+
+### Parameter Server和Trainer
+
+在paddlepaddle集群中有两个角色：参数服务器（pserver）者和trainer， 每一个参数服务器过程都会保存一部分模型的参数。每一个trainer都保存一份完整的模型参数，并可以利用本地数据更新模型。在这个训练过程中，trainer发送模型更新到参数服务器中，参数服务器职责就是聚合这些更新，以便于trainer可以把全局模型同步到本地。
+
+为了能够和pserver通信，trainer需要每一个pserver的IP地址。在Kubernetes中利用服务发现机制（比如：DNS、hostname）要比静态的IP地址要好一些，因为任何一个pod都会被杀掉然后新的pod被重启到另一个不同IP地址的node上。现在我们可以先用静态的IP地址方式，这种方式是可以更改的。
+
+参数服务器和trainer一块被打包成一个docker镜像，这个镜像会运行在被Kubernetes集群调度的pod中。
+
+### 训练者ID
+
+每一个训练过程都需要一个训练ID，以0作为基础值，作为命令行参数传递。训练过程因此用这个ID去读取数据分片。
+
+### 训练
+
+PaddlePaddle容器的入口是一个shell脚本，这个脚本可以读取Kubernetes内预置的环境变量。这里可以定义任务identity，在任务中identity可以用来远程访问包含所有pod的Kubernetes apiserver服务。
+
+每一个pod通过ip来排序。每一个pod的序列作为“pod id”。因为我们会在每一个pod中运行训练和参数服务，可以用“pod id”作为训练ID。入口脚本详细工作流程如下：
+
+1. 查找apiserver得到pod信息，通过ip排序来分配一个trainer_id。
+2. 从EFS持久化卷中复制训练数据到容器中。
+3. 从环境变量中解析paddle pserver和 paddle trainer的启动参数，然后开始启动流程。
+4. 以trainer_id来训练将自动把结果写入到EFS卷中。
+
+
+## AWS的Kubernetes中的PaddlePaddle
+
+### 选择AWS服务区域
+这个教程需要多个AWS服务工作在一个区域中。在AWS创建任何东西之前，请检查链接https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/ 选择一个可以提供如下服务的区域：EC2, EFS, VPS, CloudFormation, KMS, VPC, S3。在教程中我们使用“Oregon(us-west-2)”作为例子。
+
+### 创建aws账户和IAM账户
+
+在每一个aws账户下可以创建多个IAM用户。允许为每一个IAM用户赋予权限，作为IAM用户可以创建/操作aws集群
+
+注册aws账户，请遵循用户指南。在AWS账户下创建IAM用户和用户组，请遵循用户指南
+
+请注意此教程需要如下的IAM用户权限：
+
+- AmazonEC2FullAccess
+- AmazonS3FullAccess
+- AmazonRoute53FullAccess
+- AmazonRoute53DomainsFullAccess
+- AmazonElasticFileSystemFullAccess
+- AmazonVPCFullAccess
+- IAMUserSSHKeys
+- IAMFullAccess
+- NetworkAdministrator
+- AWSKeyManagementServicePowerUser
+
+
+### 下载kube-aws and kubectl
+
+#### kube-aws
+
+在AWS中[kube-aws](https://github.com/coreos/kube-aws)是一个自动部署集群的CLI工具
+
+##### kube-aws完整性验证
+提示：如果你用的是非官方版本（e.g RC release）的kube-aws，可以跳过这一步骤。引入coreos的应用程序签名公钥:
+
+```
+gpg2 --keyserver pgp.mit.edu --recv-key FC8A365E
+```
+
+指纹验证：
+
+```
+gpg2 --fingerprint FC8A365E
+```
+正确的指纹是： `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E`
+
+我们可以从发布页面中下载kube-aws，教程使用0.9.1版本 [release page](https://github.com/coreos/kube-aws/releases).
+
+验证tar包的GPG签名：
+
+```
+PLATFORM=linux-amd64
+ # Or
+PLATFORM=darwin-amd64
+
+gpg2 --verify kube-aws-${PLATFORM}.tar.gz.sig kube-aws-${PLATFORM}.tar.gz
+```
+##### 安装kube-aws
+解压:
+
+```
+tar zxvf kube-aws-${PLATFORM}.tar.gz
+```
+
+添加到环境变量:
+
+```
+mv ${PLATFORM}/kube-aws /usr/local/bin
+```
+
+
+#### kubectl
+
+[kubectl](https://Kubernetes.io/docs/user-guide/kubectl-overview/) 是一个操作Kubernetes集群的命令行接口
+
+利用`curl`工具从Kubernetes发布页面中下载`kubectl`
+
+```
+# OS X
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/darwin/amd64/kubectl
+
+# Linux
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/linux/amd64/kubectl
+```
+
+为了能是kubectl运行必须将之添加到环境变量中 (e.g. `/usr/local/bin`):
+
+```
+chmod +x ./kubectl
+sudo mv ./kubectl /usr/local/bin/kubectl
+```
+
+### 配置AWS证书
+
+首先检查这里 [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) 安装AWS命令行工具
+
+然后配置aws账户信息:
+
+```
+aws configure
+```
+
+
+添加如下信息:
+
+
+```
+AWS Access Key ID: YOUR_ACCESS_KEY_ID
+AWS Secrete Access Key: YOUR_SECRETE_ACCESS_KEY
+Default region name: us-west-2
+Default output format: json
+```
+
+`YOUR_ACCESS_KEY_ID`, and `YOUR_SECRETE_ACCESS_KEY` 是创建aws账户和IAM账户的IAM的key和密码 [Create AWS Account and IAM Account](#create-aws-account-and-iam-account)
+
+描述任何运行在你账户中的实例来验证凭据是否工作:
+
+```
+aws ec2 describe-instances
+```
+
+### 定义集群参数
+
+#### EC2秘钥对
+
+秘钥对将认证ssh访问你的EC2实例。秘钥对的公钥部分将配置到每一个COREOS节点中。
+
+遵循 [EC2 Keypair User Guide](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) Keypair用户指南来创建EC2秘钥对
+
+你可以使用创建好的秘钥对名称来配置集群.
+
+在同一工作区中秘钥对为EC2实例唯一码。在教程中使用 us-west-2 ，所以请确认在这个区域（Oregon）中创建秘钥对。
+
+在浏览器中下载一个`key-name.pem`文件用来访问EC2实例，我们待会会用到.
+
+
+#### KMS秘钥
+
+亚马逊的KMS秘钥在TLS秘钥管理服务中用来加密和解密集群。如果你已经有可用的KMS秘钥，你可以跳过创建新秘钥这一步，提供现存秘钥的ARN字符串。
+
+利用aws命令行创建kms秘钥:
+
+```
+aws kms --region=us-west-2 create-key --description="kube-aws assets"
+{
+    "KeyMetadata": {
+        "CreationDate": 1458235139.724,
+        "KeyState": "Enabled",
+        "Arn": "arn:aws:kms:us-west-2:aaaaaaaaaaaaa:key/xxxxxxxxxxxxxxxxxxx",
+        "AWSAccountId": "xxxxxxxxxxxxx",
+        "Enabled": true,
+        "KeyUsage": "ENCRYPT_DECRYPT",
+        "KeyId": "xxxxxxxxx",
+        "Description": "kube-aws assets"
+    }
+}
+```
+
+我们稍后用到`Arn` 的值.
+
+在IAM用户许可中添加多个内联策略.
+
+进入[IAM Console](https://console.aws.amazon.com/iam/home?region=us-west-2#/home)。点击`Users`按钮，点击刚才创建的用户，然后点击`Add inline policy`按钮，选择`Custom Policy`
+
+粘贴内联策略:
+
+```
+ (Caution: node_0, node_1, node_2 directories represents PaddlePaddle node and train_id, not the Kubernetes node){
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "Stmt1482205552000",
+            "Effect": "Allow",
+            "Action": [
+                "kms:Decrypt",
+                "kms:Encrypt"
+            ],
+            "Resource": [
+                "arn:aws:kms:*:AWS_ACCOUNT_ID:key/*"
+            ]
+        },
+		{
+            "Sid": "Stmt1482205746000",
+            "Effect": "Allow",
+            "Action": [
+                "cloudformation:CreateStack",
+                "cloudformation:UpdateStack",
+                "cloudformation:DeleteStack",
+                "cloudformation:DescribeStacks",
+                "cloudformation:DescribeStackResource",
+                "cloudformation:GetTemplate",
+                "cloudformation:DescribeStackEvents"
+            ],
+            "Resource": [
+                "arn:aws:cloudformation:us-west-2:AWS_ACCOUNT_ID:stack/MY_CLUSTER_NAME/*"
+            ]
+        }
+    ]
+}
+```
+`Version` : 值必须是"2012-10-17".
+`AWS_ACCOUNT_ID`: 你可以从命令行中获取:
+
+```
+aws sts get-caller-identity --output text --query Account
+```
+
+`MY_CLUSTER_NAME`: 选择一个你喜欢的MY_CLUSTER_NAME，稍后会用到。
+请注意，堆栈名称必须是正则表达式：[a-zA-Z][-a-zA-Z0-9*]*， 在名称中不能有"_"或者"-"，否则kube-aws在下面步骤中会抛出异常
+
+#### 外部DNS名称
+
+当集群被创建后，基于DNS名称控制器将会暴露安全的TLS API.
+
+DNS名称含有CNAME指向到集群DNS名称或者记录指向集群的IP地址。
+
+我们稍后会用到DNS名称，如果没有DNS名称的话，你可以选择一个（比如：`paddle`）还可以修改`/etc/hosts`用本机的DNS名称和集群IP关联。还可以在AWS上增加一个名称服务来关联paddle集群IP，稍后步骤中会查找集群IP.
+
+#### S3 bucket
+
+在启动Kubernetes集群前需要创建一个S3 bucket
+
+在AWS上创建s3 bucket会有许多的bugs，所以使用[s3 console](https://console.aws.amazon.com/s3/home?region=us-west-2)。
+
+链接到 `Create Bucket`，确保在us-west-2 (Oregon)上创建一个唯一的BUCKET_NAME。
+
+#### 初始化assets
+
+在本机创建一个目录用来存放产生的assets:
+
+```
+$ mkdir my-cluster
+$ cd my-cluster
+```
+
+利用KMS Arn、秘钥对名称和前一步产生的DNS名称来初始化集群的CloudFormation栈:
+
+```
+kube-aws init \
+--cluster-name=MY_CLUSTER_NAME \
+--external-dns-name=MY_EXTERNAL_DNS_NAME \
+--region=us-west-2 \
+--availability-zone=us-west-2a \
+--key-name=KEY_PAIR_NAME \
+--kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx"
+```
+
+`MY_CLUSTER_NAME`: the one you picked in [KMS key](#kms-key)
+
+`MY_EXTERNAL_DNS_NAME`: see [External DNS name](#external-dns-name)
+
+`KEY_PAIR_NAME`: see [EC2 key pair](#ec2-key-pair)
+
+`--kms-key-arn`: the "Arn" in [KMS key](#kms-key)
+
+这里的`us-west-2a`用于参数`--availability-zone`，但必须在AWS账户的有效可用区中
+
+如果不能切换到其他的有效可用区（e.g., `us-west-2a`, or `us-west-2b`），请检查`us-west-2a`是支持`aws ec2 --region us-west-2 describe-availability-zones`。
+
+现在在asset目录中就有了集群的主配置文件cluster.yaml。
+
+默认情况下kube-aws会创建一个工作节点，修改`cluster.yaml`让`workerCount`从1个节点变成3个节点.
+
+#### 呈现asset目录内容
+
+在这个简单的例子中，你可以使用kuber-aws生成TLS身份和证书
+
+```
+kube-aws render credentials --generate-ca
+```
+
+下一步在asset目录中生成一组集群assets.
+
+```
+kube-aws render stack
+```
+asserts(模板和凭证)用于创建、更新和当前目录被创建的Kubernetes集群相关联
+
+### 启动Kubernetes集群
+
+#### 创建一个在CloudFormation模板上定义好的实例
+
+现在让我们创建集群（在命令行中选择任意的 `PREFIX`）
+
+```
+kube-aws up --s3-uri s3://BUCKET_NAME/PREFIX
+```
+
+`BUCKET_NAME`: t在[S3 bucket](#s3-bucket)上使用的bucket名称
+
+
+#### 配置DNS
+
+你可以执行命令 `kube-aws status`来查看创建后集群的API.
+
+```
+$ kube-aws status
+Cluster Name:		paddle-cluster
+Controller DNS Name:	paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com
+```
+如果你用DNS名称，在ip上设置任何记录或是安装CNAME点到`Controller DNS Name` (`paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com`)
+
+##### 查询IP地址
+
+用命令`dig`去检查负载均衡器的域名来获取ip地址.
+
+```
+$ dig paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com
+
+;; QUESTION SECTION:
+;paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. IN A
+
+;; ANSWER SECTION:
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.241.164.52
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.67.102.112
+```
+
+在上面的例子中，`54.241.164.52`, `54.67.102.112`这两个ip都将是工作状态
+
+*如果你有DNS名称*，设置记录到ip上，然后你可以跳过“Access the cluster”这一步
+
+*如果没有自己的DNS名称*
+
+编辑/etc/hosts文件用DNS关联IP
+
+##### 更新本地的DNS关联
+编辑`/etc/hosts`文件用DNS关联IP
+##### 在VPC上添加route53私有名称服务
+ - 打开[Route53 Console](https://console.aws.amazon.com/route53/home)
+ - 根据配置创建域名zone
+   - domain名称为: "paddle"
+   - Type: "Private hosted zone for amazon VPC"
+   - VPC ID: `<Your VPC ID>`
+
+   ![route53 zone setting](src/route53_create_zone.png)
+ - 添加记录
+    - 点击zone中刚创建的“paddle”
+    - 点击按钮“Create record set”
+        - Name : leave blank
+        - type: "A"
+        - Value: `<kube-controller ec2 private ip>`
+
+        ![route53 create recordset](src/route53_create_recordset.png)
+ - 检查名称服务
+    - 连接通过kube-aws via ssh创建的任何实例
+    - 运行命令"host paddle"，看看是否ip为返回的kube-controller的私有IP
+
+#### 进入集群
+
+集群运行后如下命令会看到:
+
+```
+$ kubectl --kubeconfig=kubeconfig get nodes
+NAME                                       STATUS    AGE
+ip-10-0-0-134.us-west-2.compute.internal   Ready     6m
+ip-10-0-0-238.us-west-2.compute.internal   Ready     6m
+ip-10-0-0-50.us-west-2.compute.internal    Ready     6m
+ip-10-0-0-55.us-west-2.compute.internal    Ready     6m
+```
+
+
+### 集群安装弹性文件系统
+
+训练数据存放在AWS上的EFS分布式文件系统中.
+
+1. 在[security group console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId)为EFS创建一个安全组
+  1. 可以看到`paddle-cluster-sg-worker` (在sg-055ee37d镜像中)安全组id
+  <center>![](src/worker_security_group.png)</center>
+
+  2. 增加安全组`paddle-efs` ，以`paddle-cluster-sg-worker`的group id作为用户源和`ALL TCP`入栈规则。增加vpc `paddle-cluster-vpc`, 确保可用区是在[Initialize Assets](#initialize-assets)的时候用到的那一个.
+  <center>![](src/add_security_group.png)</center>
+
+2. 利用`paddle-cluster-vpc`私有网络在[EFS console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2#/wizard/1) 中创建弹性文件系统, 确定子网为`paddle-cluster-Subnet0`和安全区为`paddle-efs`.
+<center>![](src/create_efs.png)</center>
+
+
+### 开始在AWS上进行paddlepaddle的训练
+
+#### 配置Kubernetes卷指向EFS
+
+首先需要创建一个持久卷[PersistentVolume](https://kubernetes.io/docs/user-guide/persistent-volumes/) 到EFS上
+
+用 `pv.yaml`形式来保存
+```
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: efsvol
+spec:
+  capacity:
+    storage: 100Gi
+  accessModes:
+    - ReadWriteMany
+  nfs:
+    server: EFS_DNS_NAME
+    path: "/"
+```
+
+`EFS_DNS_NAME`: DNS名称最好能描述我们创建的`paddle-efs`，看起来像`fs-2cbf7385.efs.us-west-2.amazonaws.com`
+
+运行下面的命令来创建持久卷:
+```
+kubectl --kubeconfig=kubeconfig create -f pv.yaml
+```
+下一步创建 [PersistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/)来声明持久卷
+
+用`pvc.yaml`来保存.
+```
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+  name: efsvol
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 50Gi
+```
+
+行下面命令来创建持久卷声明:
+```
+kubectl --kubeconfig=kubeconfig create -f pvc.yaml
+```
+
+#### 准备训练数据
+
+启动Kubernetes job在我们创建的持久层上进行下载、保存并均匀拆分训练数据为3份.
+
+用`paddle-data-job.yaml`保存
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-data
+spec:
+  template:
+    metadata:
+      name: pi
+    spec:
+      containers:
+      - name: paddle-data
+        image: paddlepaddle/paddle-tutorial:k8s_data
+        imagePullPolicy: Always
+        volumeMounts:
+        - mountPath: "/efs"
+          name: efs
+        env:
+        - name: OUT_DIR
+          value: /efs/paddle-cluster-job
+        - name: SPLIT_COUNT
+          value: "3"
+      volumes:
+        - name: efs
+          persistentVolumeClaim:
+            claimName: efsvol
+      restartPolicy: Never
+```
+
+运行下面的命令来启动任务:
+```
+kubectl --kubeconfig=kubeconfig create -f paddle-data-job.yaml
+```
+任务运行大概需要7分钟，可以使用下面命令查看任务状态，直到`paddle-data`任务的`SUCCESSFUL`状态为`1`时成功，这里here有怎样创建镜像的源码
+```
+$ kubectl --kubeconfig=kubeconfig get jobs
+NAME          DESIRED   SUCCESSFUL   AGE
+paddle-data   1         1            6m
+```
+数据准备完成后的结果是以镜像`paddlepaddle/paddle-tutorial:k8s_data`存放，可以点击这里[here](src/k8s_data/README.md)查看如何创建docker镜像源码
+
+#### 开始训练
+
+现在可以开始运行paddle的训练任务，用`paddle-cluster-job.yaml`进行保存
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-cluster-job
+spec:
+  parallelism: 3
+  completions: 3
+  template:
+    metadata:
+      name: paddle-cluster-job
+    spec:
+      volumes:
+      - name: efs
+        persistentVolumeClaim:
+          claimName: efsvol
+      containers:
+      - name: trainer
+        image: paddlepaddle/paddle-tutorial:k8s_train
+        command: ["bin/bash",  "-c", "/root/start.sh"]
+        env:
+        - name: JOB_NAME
+          value: paddle-cluster-job
+        - name: JOB_PATH
+          value: /home/jobpath
+        - name: JOB_NAMESPACE
+          value: default
+        - name: TRAIN_CONFIG_DIR
+          value: quick_start
+        - name: CONF_PADDLE_NIC
+          value: eth0
+        - name: CONF_PADDLE_PORT
+          value: "7164"
+        - name: CONF_PADDLE_PORTS_NUM
+          value: "2"
+        - name: CONF_PADDLE_PORTS_NUM_SPARSE
+          value: "2"
+        - name: CONF_PADDLE_GRADIENT_NUM
+          value: "3"
+        - name: TRAINER_COUNT
+          value: "3"
+        volumeMounts:
+        - mountPath: "/home/jobpath"
+          name: efs
+        ports:
+        - name: jobport0
+          hostPort: 7164
+          containerPort: 7164
+        - name: jobport1
+          hostPort: 7165
+          containerPort: 7165
+        - name: jobport2
+          hostPort: 7166
+          containerPort: 7166
+        - name: jobport3
+          hostPort: 7167
+          containerPort: 7167
+      restartPolicy: Never
+```
+
+`parallelism: 3, completions: 3` 意思是这个任务会同时开启3个paddlepaddle的pod，当pod启动后3个任务将被完成。
+
+`env` 参数代表容器的环境变量，在这里指定paddlepaddle的参数.
+
+`ports` 指定TCP端口7164 - 7167和`pserver`进行连接，port从`CONF_PADDLE_PORT`(7164)到`CONF_PADDLE_PORT + CONF_PADDLE_PORTS_NUM + CONF_PADDLE_PORTS_NUM_SPARSE - 1`(7167)。我们使用多个端口密集和稀疏参数的更新来提高延迟
+
+运行下面命令来启动任务.
+```
+kubectl --kubeconfig=kubeconfig create -f paddle-claster-job.yaml
+```
+
+检查pods信息
+
+```
+$ kubectl --kubeconfig=kubeconfig get pods
+NAME                       READY     STATUS    RESTARTS   AGE
+paddle-cluster-job-cm469   1/1       Running   0          9m
+paddle-cluster-job-fnt03   1/1       Running   0          9m
+paddle-cluster-job-jx4xr   1/1       Running   0          9m
+```
+
+检查指定pod的控制台输出
+```
+kubectl --kubeconfig=kubeconfig log -f POD_NAME
+```
+
+`POD_NAME`: 任何一个pod的名称 (e.g., `paddle-cluster-job-cm469`).
+
+运行`kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job`来检查训练任务的状态，将会在大约20分钟完成
+
+`pserver`和`trainer`的细节都隐藏在docker镜像`paddlepaddle/paddle-tutorial:k8s_train`中，这里[here](src/k8s_train/README.md) 有创建docker镜像的源码.
+
+#### 检查训练输出
+
+训练输出（模型快照和日志）将被保存在EFS上。我们可以用ssh登录到EC2的工作节点上，查看mount过的EFS和训练输出.
+
+1. ssh登录EC2工作节点
+```
+chmod 400 key-name.pem
+ssh -i key-name.pem core@INSTANCE_IP
+```
+
+`INSTANCE_IP`: EC2上Kubernetes工作节点的公共IP地址，进入[EC2 console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#Instances:sort=instanceId) 中检查任何`paddle-cluster-kube-aws-worker`实例的 `public IP`
+
+2. 挂载EFS
+```
+mkdir efs
+sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 EFS_DNS_NAME:/ efs
+```
+
+`EFS_DNS_NAME`: DNS名称最好能描述我们创建的`paddle-efs`，看起来像`fs-2cbf7385.efs.us-west-2.amazonaws.com`.
+
+文件夹`efs`上有这结构相似的node信息:
+```
+-- paddle-cluster-job
+    |-- ...
+    |-- output
+    |   |-- node_0
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- node_1
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- node_2
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- pass-00000
+    |   |   |-- ___fc_layer_0__.w0
+    |   |   |-- ___fc_layer_0__.wbias
+    |   |   |-- done
+    |   |   |-- path.txt
+    |   |   `-- trainer_config.lr.py
+	|   |-- pass-00001...
+```
+`server.log` 是`pserver`的log日志，`train.log`是`trainer`的log日志，模型快照和描述存放在`pass-0000*`.
+
+### Kubernetes集群卸载或删除
+
+#### 删除EFS
+
+到[EFS Console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2) 中删除创建的EFS卷
+
+#### 删除安全组
+
+去[Security Group Console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId) 删除安全组`paddle-efs`.
+
+#### 删除S3 bucket
+
+进入 [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2#)删除S3 bucket
+
+#### 销毁集群
+
+```
+kube-aws destroy
+```
+
+命令会立刻返回，但需要大约5分钟来销毁集群
+
+可以进入 [CludFormation Console](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks?filter=active)检查销毁的过程。
diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
index b05b66415fbb829f471b1491b9881f65137bfe17..67c7b774e9c476a3035037a421c84ebf17a31b09 100644
--- a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
+++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
@@ -134,7 +134,7 @@
 
 **输入不等长** 是指recurrent_group的多个输入序列，在每个时间步的子序列长度可以不相等。但序列输出时，需要指定与某一个输入的序列信息是一致的。使用\ :red:`targetInlink`\ 可以指定哪一个输入和输出序列信息一致，默认指定第一个输入。 
 
-示例3的配置分别为\ `单层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.conf>`_\ 和\ `双层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.conf>`_\ 。
+示例3的配置分别为\ `单层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py>`_\ 和\ `双层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py>`_\ 。
 
 示例3对于单层RNN和双层RNN数据完全相同。
 
diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst
index e5aa05c117393e81c557ba67609f787b38587efd..ae997f0805db5b01a34867c9e8b188c931721920 100644
--- a/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst
+++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst
@@ -1,4 +1,226 @@
+..  _algo_hrnn_rnn_api_compare:
+
+#####################
 API comparision between RNN and hierarchical RNN
-================================================
+#####################
+
+This article takes PaddlePaddle's hierarchical RNN unit test as an example. We will use several examples to illestrate the usage of single-layer and hierarchical RNNs. Each example has two model configurations, one for single-layer, and the other for hierarchical RNN. Although the implementations are different, both the two model configurations' effects are the same. All of the examples in this article only describe the API interface of the hierarchical RNN, while we do not use this hierarchical RNN to solve practical problems. If you want to understand the use of hierarchical RNN in specific issues, please refer to \ :ref:`algo_hrnn_demo`\ 。The unit test file used in this article's example is \ `test_RecurrentGradientMachine.cpp <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/test_RecurrentGradientMachine.cpp>`_\ 。
+
+Example 1：Hierarchical RNN without Memory between subsequences
+================================
+
+The classical case in the hierarchical RNN is to perform sequence operations on each time series data in the inner layers seperately. And the sequence operations in the inner layers is independent, that is, it does not need to use Memory. 
+
+In this example, the network configuration of single-layer RNNs and hierarchical RNNs are all to use LSTM as en encoder to compress a word-segmented sentence into a vector. The difference is that, RNN uses a hierarchical RNN model, treating multiple sentences as a whole to use encoder to compress simultaneously. They are completely consistent in their semantic meanings. This pair of semantically identical example configurations is as follows：
+
+* RNN\: `sequence_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_layer_group.conf>`_
+* Hierarchical RNN\: `sequence_nest_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_layer_group.conf>`_
+
+
+Reading hierarchical sequence data
+----------------
+
+Firstly, the original data in this example is as follows \:
+
+- The original data in this example has 10 samples. Each of the sample includes two components: a lable(all 2 here), and a word-segmented sentence. This data is used by single RNN as well. 
+
+..  literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg
+    :language: text
+
+
+- The data for hierarchical RNN has 4 samples. Every sample is seperated by a blank line, while the content of the data is the same as the original data. But as for hierarchical LSTM, the first sample will encode two sentences into two vectors simultaneously. The sentence count dealed simultaneously by this 4 samples are \ :code:`[2, 3, 2, 3]`\ .
+
+..  literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg.nest
+    :language: text
+
+Secondly, as for these two types of different input data formats, the contrast of different DataProviders are as follows (`sequenceGen.py <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequenceGen.py>`_)\：
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py
+    :language: python
+    :lines: 21-39
+    :linenos:
+
+- This is the DataProvider code for an ordinary single-layer time series. Its description is as follows: 
+  
+  * DataProvider returns two parts, that are "words" and "label"，as line 19 in the above code. 
+
+    - "words" is a list of word table indices corresponding to each word in the sentence in the original data. Its data type is integer_value_sequence, that is integer list. So, "words" is a singler-layer time series in the data. 
+    - "label" is the categorical label of each sentence, whose data type is integer_value. 
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py
+    :language: python
+    :lines: 42-71
+    :linenos:
+
+- As for the same data, the DataProvider code for hierarchical time series. Its description is as follows: 
+
+  - DataProvider returns two lists of data, that are "sentences" and "labels", corresponding to the sentences and labels in each group in the original data of hierarchical time series. 
+  - "sentences" comes from the hierarchical time series original data. As it contains every sentences in each group internally, and each sentences are represented by a list of word table indices, so its data type is integer_value_sub_sequence, which is hierarchical time series. 
+  - "labels" is the categorical lable of each sentence, so it is a sigle-layer time series. 
+
+
+Model configuration
+------------------------------------------
+
+Firstly, let's look at the configuration of single-layer RNN. The hightlighted part of line 9 to line 15 is the usage of single-layer RNN. Here we use the pre-defined RNN process function in PaddlePaddle. In this function, for each time step, RNN passes through an LSTM network. 
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_layer_group.conf
+    :language: python
+    :lines: 38-63
+    :linenos:
+    :emphasize-lines:  9-15
+
+
+Secondly, let's look at the model configuration of hierarchical RNN which has the same semantic meaning. \:
+
+* Most layers in PaddlePaddle do not care about whether the input is time series or not, e.g. \ :code:`embedding_layer`\ . In these layers, every operation is processed on each time step. 
+
+* In the hightlighted part of line 7 to line 26 of this configuration, we transform the hierarchical time series data into single-layer time series data, then process each single-layer time series. 
+
+  * Use the function \ :code:`recurrent_group`\ to transform. Input sequences need to be passed in when transforming. As we want to transform hierarchical time series into single-layer sequences, we need to lable the input data as \ :code:`SubsequenceInput`\ .
+  
+  * In this example, we disassemble every group of the original data into sentences using \ :code:`recurrent_group`\ . Each of the disassembled sentences passes through an LSTM network. This is equivalent to single-layer RNN configuration. 
+
+* Similar to single-layer RNN configuration, we only use the last vector after the encode of LSTM. So we use the operation of \ :code:`last_seq`\ to \ :code:`recurrent_group`\ . But unlike single-layer RNN, we use the last element of every subsequence, so we need to set \ :code:`agg_level=AggregateLevel.TO_SEQUENCE`\ . 
+
+* Till now, \ :code:`lstm_last`\ has the same result as \ :code:`lstm_last`\ in single-layer RNN configuration. 
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_layer_group.conf
+    :language: python
+    :lines: 38-64
+    :linenos:
+    :emphasize-lines: 7-26
+
+Example 2：Hierarchical RNN with Memory between subsequences
+================================
+
+This example is intended to implement two fully-equivalent fully-connected RNNs using single-layer RNN and hierarchical RNN. 
+
+* As for single-layer RNN, input is a full time series, e.g. \ :code:`[4, 5, 2, 0, 9, 8, 1, 4]`\ .
+
+* As for hierarchical RNN, input is a hierarchical time series which elements are arbitrarily combination of data in single-layer RNN, e.g. \ :code:`[ [4, 5, 2], [0, 9], [8, 1, 4]]`. 
+
+model configuration
+------------------
+
+We select the different parts between single-layer RNN and hierarchical RNN configurations, to compare and analyze the reason why they have same semantic meanings. 
+
+- single-layer RNN：passes through a simple recurrent_group. For each time step, the current input y and the last time step's output rnn_state pass through a fully-connected layer. 
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn.conf
+    :language: python
+    :lines: 36-48
+
+- hierarchical RNN, the outer layer's memory is an element. 
+
+  - The recurrent_group of inner layer's inner_step is nearly the same as single-layer sequence, except for the case of boot_layer=outer_mem, which means using the outer layer's outer_mem as the initial state for the inner layer's memory. In the outer layer's out_step, outer_mem is the last vector of a subsequence, that is, the whole hierarchical group uses the last vector of the previous subsequence as the initial state for the next subsequence's memory. 
+  - From the aspect of the input data, sentences from single-layer and hierarchical RNN are the same. The only difference is that, hierarchical RNN disassembes the sequence into subsequences. So in the hierarchical RNN configuration, we must use the last element of the previous subsequence as a boot_layer for the memory of the next subsequence, so that it makes no difference with "every time step uses the output of last time step" in the sigle-layer RNN configuration. 
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn.conf
+    :language: python
+    :lines: 39-66
+
+..  warning::
+    Currently PaddlePaddle only supports the case that the lengths of the time series of Memory in each time step are the same. 
+
+Example 3：hierarchical RNN with unequal length inputs
+==========================
+
+.. role:: red
+
+.. raw:: html
+
+    <style> .red {color:red} </style>
+
+**unequal length inputs** means in the multiple input sequences of recurrent_group, the lengths of subsequences can be unequal. But the output of the sequence, needs to be consistent with one of the input sequences. Using \ :red:`targetInlink`\ can help you specify which of the input sequences and the output sequence can be consistent, by default is the first input. 
+
+The configurations of Example 3 are \ `sequence_rnn_multi_unequalength_inputs <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py>`_ \ and \ `sequence_nest_rnn_multi_unequalength_inputs <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py>`_\ . 
+
+The data for the configurations of Example 3's single-layer RNN and hierarchical RNN are exactly the same. 
+
+* For the single-layer RNN, the data has two samples, which are \ :code:`[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]`\ and \ :code:`[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]`\ . Each of the data for the single-layer RNN has two group of features. 
+
+* On the basis of the single-layer's data, hierarchical RNN's data randomly adds some partitions. For example, the first sample is transformed to \ :code:`[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]`\ . 
+
+* You need to pay attention that, PaddlePaddle only supports multiple input hierarchical RNNs that have same amount of subsequences currently. In this example, the two features both have 3 subsequences. Although the length of each subsequence can be different, the amount of subsequences should be the same. 
+
+
+model configuration
+--------
+
+Similar to Example 2's configuration, Example 3's configuration uses single-layer and hierarchical RNN to implement 2 fully-equivalent fully-connected RNNs. 
+
+* single-layer RNN\:
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
+    :language: python
+    :lines: 42-59
+    :linenos:
+
+* hierarchical RNN\ \:
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
+    :language: python
+    :lines: 41-80
+    :linenos:
+
+In the above code, the usage of single-layer and hierarchical RNNs are similar to Example 2, which difference is that it processes 2 inputs simultaneously. As for the hierarchical RNN, the lengths of the 2 input's subsequences are not equal. But we use the parameter \ :code:`targetInlink` \ to set the outper layer's \ :code:`recurrent_group` \ 's output format, so the shape of outer layer's output is the same as the shape of \ :code:`emb2`\ . 
+
+
+Glossary
+======
+
+..  _glossary_memory:
+
+Memory
+------
+
+Memory is a concept when PaddlePaddle is implementing RNN. RNN, recurrent neural network, usually requires some dependency between time steps, that is, the neural network in current time step depends on one of the neurons in the neural network in previous time steps, as the following figure shows: 
+
+..  graphviz:: src/glossary_rnn.dot
+
+The dotted connections in the figure, is the network connections across time steps. When PaddlePaddle is implementing RNN, this connection accross time steps is implemented using a special neural network unit, called Memory. Memory can cache the output of one of the neurons in previous time step, then can be passed to another neuron in next time step. The implementation of an RNN using Memory is as follows: 
+
+..  graphviz:: src/glossary_rnn_with_memory.dot
+
+With this method, PaddlePaddle can easily determine which outputs should cross time steps, and which should not. 
+
+..  _glossary_timestep:
+
+time step
+------
+
+refers to time series
+
+
+..  _glossary_sequence:
+
+time series
+--------
+
+Time series is a series of featured data. The order among these featured data is meaningful. So it is a list of features, not a set of features. As for each element of this list, or the featured data in each series, is called a time step. It must be noted that, the concepts of time series and time steps, are not necessarrily related to "time". As long as the "order" in a series of featured data is meaningful, it can be the input of time series. 
+
+For example, in text classification task, we regard a sentence as a time series. So, each word in the sentence can become the index of the word in the word table. So this sentence can be represented as a list of these indices, e.g.:code:`[9, 2, 3, 5, 3]` . 
+
+For a more detailed and accurate definition of the time series, please refer to `Wikipedia of Time series <https://en.wikipedia.org/wiki/Time_series>`_  or `Chinese Wikipedia of time series <https://zh.wikipedia.org/wiki/%E6%99%82%E9%96%93%E5%BA%8F%E5%88%97>`_  . 
+
+In additioin, Paddle always calls time series as :code:`Sequence` . They are a same concept in Paddle's documentations and APIs. 
+
+..  _glossary_RNN:
+
+RNN
+---
+
+In PaddlePaddle's documentations, RNN is usually represented as :code:`Recurrent neural network` . For more information, please refer to `Wikipedia Recurrent neural network <https://en.wikipedia.org/wiki/Recurrent_neural_network>`_ or `Chinese Wikipedia <https://zh.wikipedia.org/wiki/%E9%80%92%E5%BD%92%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C>`_ . 
+
+In PaddlePaddle, RNN usually means, for the input data of a time series, the neural network between each time steps has a certain relevance. For example, the input of a certain neuron is the output of a certain neuron in the neural network of the last time step. Or, as for each time step, the network structure of the neural network has a directed ring structure. 
+
+..  _glossary_hierarchical_RNN:
+
+hierarchical RNN
+-------
+
+Hierarchical RNN, as the name suggests, means there is a nested relationship in RNNs. The input data is a time series, but for each of the inner featured data, it is also a time series, namely 2-dimentional array, or, array of array. Hierarchical RNN is a neural network that can process this type of input data. 
+
+For example, the task of text classification of a paragragh, meaning to classify a paragraph of sentences. We can treat a paragraph as an array of sentences, and each sentence is an array of words. This is a type of the input data for the hierarchical RNN. We encode each sentence of this paragraph into a vector using LSTM, then encode each of the encoded vectors into a vector of this paragraph using LSTM. Finally we use this paragraph vector perform classification, which is the neural network structure of this hierarchical RNN. 
 
-TBD
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 1f3ca24df16cf080d325fbdc0d613a828e384b2a..340b891e41671df7e61a4a66ec538d4603bb9842 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -102,7 +102,7 @@ cc_test(init_test SRCS init_test.cc DEPS init)
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
       
-cc_test(channel_test SRCS channel_test.cc)
+# cc_test(channel_test SRCS channel_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )
 cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
         channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc
index b8847e4b909cbab67b2ddb6885b45b73d402de19..9f753478d8ecf12441d4b1745a9f6750a1038e31 100644
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -146,6 +146,7 @@ void BlockDesc::RemoveOp(size_t s, size_t e) {
   if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
     return;
   }
+  need_update_ = true;
   ops_.erase(ops_.begin() + s, ops_.begin() + e);
 }
 
diff --git a/paddle/fluid/framework/concurrency_test.cc b/paddle/fluid/framework/concurrency_test.cc
index e98e9d94bf71fe9ac226ab3ad7f587b37a5c6e33..bbf67f5ba92150f70cf45d49e3f4ca0a16393541 100644
--- a/paddle/fluid/framework/concurrency_test.cc
+++ b/paddle/fluid/framework/concurrency_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <thread>
+#include <thread>  // NOLINT
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/block_desc.h"
@@ -40,10 +40,10 @@ namespace paddle {
 namespace framework {
 
 template <typename T>
-LoDTensor *CreateVariable(Scope &scope, p::CPUPlace &place, std::string name,
-                          T value) {
+LoDTensor *CreateVariable(Scope *scope, const p::CPUPlace &place,
+                          std::string name, T value) {
   // Create LoDTensor<int> of dim [1]
-  auto var = scope.Var(name);
+  auto var = scope->Var(name);
   auto tensor = var->GetMutable<LoDTensor>();
   tensor->Resize({1});
   T *expect = tensor->mutable_data<T>(place);
@@ -77,9 +77,9 @@ void AddCase(ProgramDesc *program, Scope *scope, p::CPUPlace *place,
   BlockDesc *caseBlock = program->AppendBlock(*casesBlock);
   func(caseBlock, scope);
 
-  CreateVariable(*scope, *place, caseCondName, false);
-  CreateVariable(*scope, *place, caseCondXVarName, caseId);
-  CreateVariable(*scope, *place, caseVarName, caseId);
+  CreateVariable(scope, *place, caseCondName, false);
+  CreateVariable(scope, *place, caseCondXVarName, caseId);
+  CreateVariable(scope, *place, caseVarName, caseId);
 
   scope->Var("step_scope");
 
@@ -96,21 +96,21 @@ void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program,
                         std::string quitChanName) {
   BlockDesc *whileBlock = program->AppendBlock(*parentBlock);
 
-  CreateVariable(*scope, *place, "whileExitCond", true);
-  CreateVariable(*scope, *place, "caseToExecute", -1);
-  CreateVariable(*scope, *place, "case1var", 0);
+  CreateVariable(scope, *place, "whileExitCond", true);
+  CreateVariable(scope, *place, "caseToExecute", -1);
+  CreateVariable(scope, *place, "case1var", 0);
 
-  CreateVariable(*scope, *place, "xtemp", 0);
+  CreateVariable(scope, *place, "xtemp", 0);
 
   // TODO(thuan): Need to create fibXToSend, since channel send moves the actual
   // data,
   // which causes the data to be no longer accessible to do the fib calculation
   // TODO(abhinav): Change channel send to do a copy instead of a move!
-  CreateVariable(*scope, *place, "fibXToSend", 0);
+  CreateVariable(scope, *place, "fibXToSend", 0);
 
-  CreateVariable(*scope, *place, "fibX", 0);
-  CreateVariable(*scope, *place, "fibY", 1);
-  CreateVariable(*scope, *place, "quitVar", 0);
+  CreateVariable(scope, *place, "fibX", 0);
+  CreateVariable(scope, *place, "fibY", 1);
+  CreateVariable(scope, *place, "quitVar", 0);
 
   BlockDesc *casesBlock = program->AppendBlock(*whileBlock);
   std::function<void(BlockDesc * caseBlock)> f = [](BlockDesc *caseBlock) {};
@@ -138,7 +138,7 @@ void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program,
     // Exit the while loop after we receive from quit channel.
     // We assign a false to "whileExitCond" variable, which will
     // break out of while_op loop
-    CreateVariable(*scope, *place, "whileFalse", false);
+    CreateVariable(scope, *place, "whileFalse", false);
     AddOp("assign", {{"X", {"whileFalse"}}}, {{"Out", {"whileExitCond"}}}, {},
           caseBlock);
   };
@@ -174,9 +174,9 @@ TEST(Concurrency, Go_Op) {
 
   // Create Variables, x0 will be put into channel,
   // result will be pulled from channel
-  CreateVariable(scope, place, "Status", false);
-  CreateVariable(scope, place, "x0", 99);
-  CreateVariable(scope, place, "result", 0);
+  CreateVariable(&scope, place, "Status", false);
+  CreateVariable(&scope, place, "x0", 99);
+  CreateVariable(&scope, place, "result", 0);
 
   framework::Executor executor(place);
   ProgramDesc program;
@@ -226,9 +226,9 @@ TEST(Concurrency, Select) {
   // Initialize scope variables
   p::CPUDeviceContext ctx(place);
 
-  CreateVariable(scope, place, "Status", false);
-  CreateVariable(scope, place, "result", 0);
-  CreateVariable(scope, place, "currentXFib", 0);
+  CreateVariable(&scope, place, "Status", false);
+  CreateVariable(&scope, place, "result", 0);
+  CreateVariable(&scope, place, "currentXFib", 0);
 
   framework::Executor executor(place);
   ProgramDesc program;
@@ -246,7 +246,7 @@ TEST(Concurrency, Select) {
         {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block);
 
   // Create Go Op routine, which loops 10 times over fibonacci sequence
-  CreateVariable(scope, place, "xReceiveVar", 0);
+  CreateVariable(&scope, place, "xReceiveVar", 0);
 
   BlockDesc *goOpBlock = program.AppendBlock(program.Block(0));
   for (int i = 0; i < 10; ++i) {
@@ -264,7 +264,7 @@ TEST(Concurrency, Select) {
           goOpBlock);
   }
 
-  CreateVariable(scope, place, "quitSignal", 0);
+  CreateVariable(&scope, place, "quitSignal", 0);
   AddOp("channel_send", {{"Channel", {quitChanName}}, {"X", {"quitSignal"}}},
         {{"Status", {"Status"}}}, {}, goOpBlock);
 
diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu
index a66525303da58601f85c40c41854edaf22c3d4ea..df4caa45eba2470f7528d2fbd99cca39cae0b596 100644
--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@@ -103,9 +103,7 @@ static void BuildVar(const std::string& param_name,
 }
 
 TEST(Operator, CPUtoGPU) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  InitDevices(true);
+  paddle::framework::InitDevices(true);
 
   paddle::framework::Scope scope;
   paddle::platform::CPUPlace cpu_place;
@@ -118,8 +116,9 @@ TEST(Operator, CPUtoGPU) {
 
   auto cpu_op = paddle::framework::OpRegistry::CreateOp(cpu_op_desc);
   // prepare input
-  auto* in_t = scope.Var("IN1")->GetMutable<LoDTensor>();
-  auto* src_ptr = in_t->mutable_data<float>({2, 3}, CPUPlace());
+  auto* in_t = scope.Var("IN1")->GetMutable<paddle::framework::LoDTensor>();
+  auto* src_ptr =
+      in_t->mutable_data<float>({2, 3}, paddle::platform::CPUPlace());
   for (int i = 0; i < 2 * 3; ++i) {
     src_ptr[i] = static_cast<float>(i);
   }
@@ -128,7 +127,7 @@ TEST(Operator, CPUtoGPU) {
   auto* output = scope.Var("OUT1");
   cpu_op->Run(scope, cpu_place);
 
-  auto* output_ptr = output->Get<LoDTensor>().data<float>();
+  auto* output_ptr = output->Get<paddle::framework::LoDTensor>().data<float>();
   for (int i = 0; i < 2 * 3; ++i) {
     ASSERT_EQ(output_ptr[i], static_cast<float>(i) * 2);
   }
@@ -153,12 +152,14 @@ TEST(Operator, CPUtoGPU) {
   VLOG(3) << "after gpu_op run";
 
   // auto* output2_ptr = output2->Get<LoDTensor>().data<float>();
-  DeviceContextPool& pool = DeviceContextPool::Instance();
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
   auto dev_ctx = pool.Get(cuda_place);
 
   paddle::framework::Tensor output_tensor;
-  TensorCopy(output2->Get<LoDTensor>(), paddle::platform::CPUPlace(), *dev_ctx,
-             &output_tensor);
+  paddle::framework::TensorCopy(output2->Get<paddle::framework::LoDTensor>(),
+                                paddle::platform::CPUPlace(), *dev_ctx,
+                                &output_tensor);
 
   dev_ctx->Wait();
   float* output2_ptr = output_tensor.data<float>();
diff --git a/paddle/fluid/framework/data_layout.h b/paddle/fluid/framework/data_layout.h
index 39222fc4ed6656dac4773c0c8829608bb954b4c6..9c5e2cf7ccdcea2822da42210ff1fdb915a9a4ec 100644
--- a/paddle/fluid/framework/data_layout.h
+++ b/paddle/fluid/framework/data_layout.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <cctype>
 #include <ostream>
+#include <string>
 
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index 4ca447d50a7262f44e5feb3739dce653604a6ed8..60ec60a427ba9046ce690eb75c27cd322fdd726d 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/data_layout_transform.h"
+#include <vector>
 
 #include "paddle/fluid/operators/math/math_function.h"
 
diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h
index ba15be9fc77b8405cb4bbca3f62a8be44a3f604e..06b638663dd334837a3bcb7737e507fcbc871c7a 100644
--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <vector>
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/variable.h"
diff --git a/paddle/fluid/framework/data_layout_transform_test.cc b/paddle/fluid/framework/data_layout_transform_test.cc
index dd17cac0e10db0d058d399cc725e18dcb14be507..a0d08826b854fea9256382f0e065fd59dda8c8b3 100644
--- a/paddle/fluid/framework/data_layout_transform_test.cc
+++ b/paddle/fluid/framework/data_layout_transform_test.cc
@@ -18,27 +18,28 @@
 #include "paddle/fluid/platform/device_context.h"
 
 TEST(DataTransform, DataLayoutFunction) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  auto place = CPUPlace();
-  Tensor in = Tensor();
-  Tensor out = Tensor();
-  in.mutable_data<double>(make_ddim({2, 3, 1, 2}), place);
-  in.set_layout(DataLayout::kNHWC);
-
-  auto kernel_nhwc = OpKernelType(proto::VarType::FP32, place,
-                                  DataLayout::kNHWC, LibraryType::kPlain);
-  auto kernel_ncwh = OpKernelType(proto::VarType::FP32, place,
-                                  DataLayout::kNCHW, LibraryType::kPlain);
-
-  TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out);
-
-  EXPECT_TRUE(out.layout() == DataLayout::kNCHW);
-  EXPECT_TRUE(out.dims() == make_ddim({2, 2, 3, 1}));
+  auto place = paddle::platform::CPUPlace();
+  paddle::framework::Tensor in = paddle::framework::Tensor();
+  paddle::framework::Tensor out = paddle::framework::Tensor();
+  in.mutable_data<double>(paddle::framework::make_ddim({2, 3, 1, 2}), place);
+  in.set_layout(paddle::framework::DataLayout::kNHWC);
+
+  auto kernel_nhwc = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::FP32, place,
+      paddle::framework::DataLayout::kNHWC,
+      paddle::framework::LibraryType::kPlain);
+  auto kernel_ncwh = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::FP32, place,
+      paddle::framework::DataLayout::kNCHW,
+      paddle::framework::LibraryType::kPlain);
+
+  paddle::framework::TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out);
+
+  EXPECT_TRUE(out.layout() == paddle::framework::DataLayout::kNCHW);
+  EXPECT_TRUE(out.dims() == paddle::framework::make_ddim({2, 2, 3, 1}));
 
   TransDataLayout(kernel_ncwh, kernel_nhwc, in, &out);
 
-  EXPECT_TRUE(in.layout() == DataLayout::kNHWC);
-  EXPECT_TRUE(in.dims() == make_ddim({2, 3, 1, 2}));
+  EXPECT_TRUE(in.layout() == paddle::framework::DataLayout::kNHWC);
+  EXPECT_TRUE(in.dims() == paddle::framework::make_ddim({2, 3, 1, 2}));
 }
diff --git a/paddle/fluid/framework/data_type_transform.h b/paddle/fluid/framework/data_type_transform.h
index e75da2588d07a754783f052173c3e0dce118f1b8..1c281b03ed61ac70e16a43d75a79854bdafd8836 100644
--- a/paddle/fluid/framework/data_type_transform.h
+++ b/paddle/fluid/framework/data_type_transform.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <utility>
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/variable.h"
diff --git a/paddle/fluid/framework/data_type_transform_test.cc b/paddle/fluid/framework/data_type_transform_test.cc
index 6b9a8f5e28b372c45abfaa2c20575a55d9a9dd03..bbebea9f13fd37469a0e9b7be9719aca128f5687 100644
--- a/paddle/fluid/framework/data_type_transform_test.cc
+++ b/paddle/fluid/framework/data_type_transform_test.cc
@@ -17,43 +17,58 @@ limitations under the License. */
 #include "gtest/gtest.h"
 
 TEST(DataTypeTransform, CPUTransform) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  auto place = CPUPlace();
-
-  auto kernel_fp16 = OpKernelType(proto::VarType::FP16, place,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_int32 = OpKernelType(proto::VarType::INT32, place,
-                                   DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_int64 = OpKernelType(proto::VarType::INT64, place,
-                                   DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_bool = OpKernelType(proto::VarType::BOOL, place,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto place = paddle::platform::CPUPlace();
+
+  auto kernel_fp16 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::FP16, place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_fp32 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::FP32, place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_fp64 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::FP64, place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_int32 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::INT32, place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_int64 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::INT64, place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_bool = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::BOOL, place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
 
   // data type transform from float32
   {
-    Tensor in;
-    Tensor out;
+    paddle::framework::Tensor in;
+    paddle::framework::Tensor out;
 
-    float* ptr = in.mutable_data<float>(make_ddim({2, 3}), place);
+    float* ptr =
+        in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), place);
     int data_number = 2 * 3;
 
     for (int i = 0; i < data_number; ++i) {
       ptr[i] = i / 3;
     }
 
-    TransDataType(kernel_fp32, kernel_fp64, in, &out);
+    paddle::framework::TransDataType(kernel_fp32, kernel_fp64, in, &out);
     double* out_data_double = out.data<double>();
     for (int i = 0; i < data_number; ++i) {
       EXPECT_EQ(out_data_double[i], static_cast<double>(i / 3));
     }
 
-    TransDataType(kernel_fp32, kernel_int32, in, &out);
+    paddle::framework::TransDataType(kernel_fp32, kernel_int32, in, &out);
     int* out_data_int = out.data<int>();
     for (int i = 0; i < data_number; ++i) {
       EXPECT_EQ(out_data_int[i], static_cast<int>(i / 3));
@@ -62,10 +77,11 @@ TEST(DataTypeTransform, CPUTransform) {
 
   // data type transform from/to float16
   {
-    Tensor in;
-    Tensor out;
+    paddle::framework::Tensor in;
+    paddle::framework::Tensor out;
 
-    float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), place);
+    paddle::platform::float16* ptr = in.mutable_data<paddle::platform::float16>(
+        paddle::framework::make_ddim({2, 3}), place);
     int data_number = 2 * 3;
 
     for (int i = 0; i < data_number; ++i) {
@@ -73,94 +89,104 @@ TEST(DataTypeTransform, CPUTransform) {
     }
 
     // transform from float16 to other data types
-    TransDataType(kernel_fp16, kernel_fp32, in, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_fp32, in, &out);
     float* out_data_float = out.data<float>();
     for (int i = 0; i < data_number; ++i) {
       EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
     }
 
-    TransDataType(kernel_fp16, kernel_fp64, in, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_fp64, in, &out);
     double* out_data_double = out.data<double>();
     for (int i = 0; i < data_number; ++i) {
       EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
     }
 
-    TransDataType(kernel_fp16, kernel_int32, in, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_int32, in, &out);
     int* out_data_int = out.data<int>();
     for (int i = 0; i < data_number; ++i) {
       EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
     }
 
-    TransDataType(kernel_fp16, kernel_int64, in, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_int64, in, &out);
     int64_t* out_data_int64 = out.data<int64_t>();
     for (int i = 0; i < data_number; ++i) {
       EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
     }
 
-    TransDataType(kernel_fp16, kernel_bool, in, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_bool, in, &out);
     bool* out_data_bool = out.data<bool>();
     for (int i = 0; i < data_number; ++i) {
       EXPECT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
     }
 
     // transform float to float16
-    float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), place);
+    float* in_data_float =
+        in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), place);
     for (int i = 0; i < data_number; ++i) {
       in_data_float[i] = i;
     }
 
-    TransDataType(kernel_fp32, kernel_fp16, in, &out);
-    ptr = out.data<float16>();
+    paddle::framework::TransDataType(kernel_fp32, kernel_fp16, in, &out);
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_float[i]).x);
     }
 
     // transform double to float16
-    double* in_data_double = in.mutable_data<double>(make_ddim({2, 3}), place);
+    double* in_data_double =
+        in.mutable_data<double>(paddle::framework::make_ddim({2, 3}), place);
     for (int i = 0; i < data_number; ++i) {
       in_data_double[i] = i;
     }
 
-    TransDataType(kernel_fp64, kernel_fp16, in, &out);
-    ptr = out.data<float16>();
+    paddle::framework::TransDataType(kernel_fp64, kernel_fp16, in, &out);
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_double[i]).x);
     }
 
     // transform int to float16
-    int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), place);
+    int* in_data_int =
+        in.mutable_data<int>(paddle::framework::make_ddim({2, 3}), place);
     for (int i = 0; i < data_number; ++i) {
       in_data_int[i] = i;
     }
 
-    TransDataType(kernel_int32, kernel_fp16, in, &out);
-    ptr = out.data<float16>();
+    paddle::framework::TransDataType(kernel_int32, kernel_fp16, in, &out);
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_int[i]).x);
     }
 
     // transform int64 to float16
-    int64_t* in_data_int64 = in.mutable_data<int64_t>(make_ddim({2, 3}), place);
+    int64_t* in_data_int64 =
+        in.mutable_data<int64_t>(paddle::framework::make_ddim({2, 3}), place);
     for (int i = 0; i < data_number; ++i) {
       in_data_int64[i] = i;
     }
 
-    TransDataType(kernel_int64, kernel_fp16, in, &out);
-    ptr = out.data<float16>();
+    paddle::framework::TransDataType(kernel_int64, kernel_fp16, in, &out);
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_int64[i]).x);
     }
 
     // transform bool to float16
-    bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), place);
+    bool* in_data_bool =
+        in.mutable_data<bool>(paddle::framework::make_ddim({2, 3}), place);
     for (int i = 0; i < data_number; ++i) {
       in_data_bool[i] = i;
     }
 
-    TransDataType(kernel_bool, kernel_fp16, in, &out);
-    ptr = out.data<float16>();
+    paddle::framework::TransDataType(kernel_bool, kernel_fp16, in, &out);
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_bool[i]).x);
     }
   }
 }
diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu
index de389ddabcb86de0155757406a406e44086c5474..0874509a8797cd2ff1b1fcb347b4ef3b74a39047 100644
--- a/paddle/fluid/framework/data_type_transform_test.cu
+++ b/paddle/fluid/framework/data_type_transform_test.cu
@@ -18,42 +18,58 @@ limitations under the License. */
 #include "gtest/gtest.h"
 
 TEST(DataTypeTransform, GPUTransform) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  auto cpu_place = CPUPlace();
-  auto gpu_place = CUDAPlace(0);
-  CUDADeviceContext context(gpu_place);
-
-  auto kernel_fp16 = OpKernelType(proto::VarType::FP16, gpu_place,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_fp32 = OpKernelType(proto::VarType::FP32, gpu_place,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_fp64 = OpKernelType(proto::VarType::FP64, gpu_place,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_int32 = OpKernelType(proto::VarType::INT32, gpu_place,
-                                   DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_int64 = OpKernelType(proto::VarType::INT64, gpu_place,
-                                   DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_bool = OpKernelType(proto::VarType::BOOL, gpu_place,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto cpu_place = paddle::platform::CPUPlace();
+  auto gpu_place = paddle::platform::CUDAPlace(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
+
+  auto kernel_fp16 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::FP16, gpu_place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_fp32 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::FP32, gpu_place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_fp64 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::FP64, gpu_place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_int32 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::INT32, gpu_place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_int64 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::INT64, gpu_place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_bool = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::BOOL, gpu_place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
 
   // data type transform from float32
   {
-    Tensor in;
-    Tensor in_gpu;
-    Tensor out_gpu;
-    Tensor out;
+    paddle::framework::Tensor in;
+    paddle::framework::Tensor in_gpu;
+    paddle::framework::Tensor out_gpu;
+    paddle::framework::Tensor out;
 
-    float* in_ptr = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
+    float* in_ptr =
+        in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), cpu_place);
     float arr[6] = {0, 1, 2, 3, 4, 5};
     int data_number = sizeof(arr) / sizeof(arr[0]);
     memcpy(in_ptr, arr, sizeof(arr));
 
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
     context.Wait();
-    TransDataType(kernel_fp32, kernel_fp64, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_fp32, kernel_fp64, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
     double* out_data_double = out.data<double>();
@@ -61,8 +77,9 @@ TEST(DataTypeTransform, GPUTransform) {
       EXPECT_EQ(out_data_double[i], static_cast<double>(arr[i]));
     }
 
-    TransDataType(kernel_fp32, kernel_int32, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_fp32, kernel_int32, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
     int* out_data_int = out.data<int>();
@@ -73,22 +90,27 @@ TEST(DataTypeTransform, GPUTransform) {
 
   // data type transform from/to float16
   {
-    Tensor in;
-    Tensor in_gpu;
-    Tensor out_gpu;
-    Tensor out;
-
-    float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), cpu_place);
-    float16 arr[6] = {float16(0), float16(1), float16(2),
-                      float16(3), float16(4), float16(5)};
+    paddle::framework::Tensor in;
+    paddle::framework::Tensor in_gpu;
+    paddle::framework::Tensor out_gpu;
+    paddle::framework::Tensor out;
+
+    paddle::platform::float16* ptr = in.mutable_data<paddle::platform::float16>(
+        paddle::framework::make_ddim({2, 3}), cpu_place);
+    paddle::platform::float16 arr[6] = {
+        paddle::platform::float16(0), paddle::platform::float16(1),
+        paddle::platform::float16(2), paddle::platform::float16(3),
+        paddle::platform::float16(4), paddle::platform::float16(5)};
+
     int data_number = sizeof(arr) / sizeof(arr[0]);
     memcpy(ptr, arr, sizeof(arr));
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
     context.Wait();
 
     // transform from float16 to other data types
-    TransDataType(kernel_fp16, kernel_fp32, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_fp32, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
     float* out_data_float = out.data<float>();
@@ -96,8 +118,9 @@ TEST(DataTypeTransform, GPUTransform) {
       EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
     }
 
-    TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_fp64, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
     double* out_data_double = out.data<double>();
@@ -105,8 +128,9 @@ TEST(DataTypeTransform, GPUTransform) {
       EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
     }
 
-    TransDataType(kernel_fp16, kernel_int32, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_int32, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
     int* out_data_int = out.data<int>();
@@ -114,8 +138,9 @@ TEST(DataTypeTransform, GPUTransform) {
       EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
     }
 
-    TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_int64, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
     int64_t* out_data_int64 = out.data<int64_t>();
@@ -123,8 +148,9 @@ TEST(DataTypeTransform, GPUTransform) {
       EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
     }
 
-    TransDataType(kernel_fp16, kernel_bool, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_bool, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
     bool* out_data_bool = out.data<bool>();
@@ -133,90 +159,103 @@ TEST(DataTypeTransform, GPUTransform) {
     }
 
     // transform float to float16
-    float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
+    float* in_data_float =
+        in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), cpu_place);
     for (int i = 0; i < data_number; ++i) {
       in_data_float[i] = i;
     }
 
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
     context.Wait();
-    TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_fp32, kernel_fp16, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_float[i]).x);
     }
 
     // transform double to float16
-    double* in_data_double =
-        in.mutable_data<double>(make_ddim({2, 3}), cpu_place);
+    double* in_data_double = in.mutable_data<double>(
+        paddle::framework::make_ddim({2, 3}), cpu_place);
     for (int i = 0; i < data_number; ++i) {
       in_data_double[i] = i;
     }
 
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
     context.Wait();
-    TransDataType(kernel_fp64, kernel_fp16, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_fp64, kernel_fp16, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_double[i]).x);
     }
 
     // transform int to float16
-    int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), cpu_place);
+    int* in_data_int =
+        in.mutable_data<int>(paddle::framework::make_ddim({2, 3}), cpu_place);
     for (int i = 0; i < data_number; ++i) {
       in_data_int[i] = i;
     }
 
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
     context.Wait();
-    TransDataType(kernel_int32, kernel_fp16, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_int32, kernel_fp16, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_int[i]).x);
     }
 
     // transform int64 to float16
-    int64_t* in_data_int64 =
-        in.mutable_data<int64_t>(make_ddim({2, 3}), cpu_place);
+    int64_t* in_data_int64 = in.mutable_data<int64_t>(
+        paddle::framework::make_ddim({2, 3}), cpu_place);
     for (int i = 0; i < data_number; ++i) {
       in_data_int64[i] = i;
     }
 
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
     context.Wait();
-    TransDataType(kernel_int64, kernel_fp16, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_int64, kernel_fp16, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_int64[i]).x);
     }
 
     // transform bool to float16
-    bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), cpu_place);
+    bool* in_data_bool =
+        in.mutable_data<bool>(paddle::framework::make_ddim({2, 3}), cpu_place);
     for (int i = 0; i < data_number; ++i) {
       in_data_bool[i] = i;
     }
 
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
     context.Wait();
-    TransDataType(kernel_bool, kernel_fp16, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_bool, kernel_fp16, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_bool[i]).x);
     }
   }
 }
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 897e41f79f4e3bb9cecbe7b42fc6c4fd3401d839..96c181f983a33961e3d5fb8745740f2fdbb210de 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -2,29 +2,37 @@ cc_library(var_handle SRCS var_handle.cc DEPS place)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
-nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-        dynload_cuda)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope place operator op_registry)
 
 cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
 
+cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
+
 if(WITH_GPU)
+    nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+            dynload_cuda)
     set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
+    nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
 else()
     set(multi_devices_graph_builder_deps)
+    cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
 endif()
+
+cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
+cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
+
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
-            scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
+        scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle)
+
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
         simple_threadpool device_context)
 
-cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory)
-cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory)
-
 cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
         device_context broadcast_op_handle)
 cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
         device_context gather_op_handle)
+cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
+        device_context reduce_op_handle )
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index 7d29012380e1b1710704d71a28d21dcc3097eb51..33e02ab65a251a338225ee621ff14acbb0631992 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -13,95 +13,77 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
-
-Tensor *GetTensorFromVar(Variable *in_var) {
-  if (in_var->IsType<LoDTensor>()) {
-    return in_var->GetMutable<LoDTensor>();
-  } else if (in_var->IsType<SelectedRows>()) {
-    return in_var->GetMutable<SelectedRows>()->mutable_value();
-  } else {
-    PADDLE_THROW("Var should be LoDTensor or SelectedRows");
-  }
-  return nullptr;
-}
-
 BroadcastOpHandle::BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
                                      const std::vector<platform::Place> &places)
     : local_scopes_(local_scopes), places_(places) {}
 
 void BroadcastOpHandle::RunImpl() {
-  // the input may have dummy var.
-  std::vector<VarHandle *> in_var_handle;
-  for (auto *in : inputs_) {
-    auto *out_handle = dynamic_cast<VarHandle *>(in);
-    if (out_handle) {
-      in_var_handle.push_back(out_handle);
-    }
-  }
-  PADDLE_ENFORCE_EQ(in_var_handle.size(), 1,
-                    "The number of input should be one.");
-
-  // the output may have dummy var.
-  std::vector<VarHandle *> out_var_handles;
-  for (auto *out : outputs_) {
-    auto *out_handle = dynamic_cast<VarHandle *>(out);
-    if (out_handle) {
-      out_var_handles.push_back(out_handle);
-    }
+  // the input and output may have dummy var.
+  VarHandle *in_var_handle;
+
+  {
+    auto in_var_handles = DynamicCast<VarHandle>(inputs_);
+    PADDLE_ENFORCE_EQ(in_var_handles.size(), 1,
+                      "The number of input should be one.");
+    in_var_handle = in_var_handles[0];
   }
 
+  auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+
   PADDLE_ENFORCE_EQ(
       out_var_handles.size(), places_.size(),
       "The number of output should equal to the number of places.");
 
-  // Wait input done, this Wait is asynchronous operation
-  auto &in_place = in_var_handle[0]->place_;
-  if (in_var_handle[0]->generated_op_) {
-    for (auto *out : out_var_handles) {
-      auto &out_p = out->place_;
-      in_var_handle[0]->generated_op_->Wait(dev_ctxes_[out_p]);
-    }
+  // Wait input done, this Wait is asynchronous operation platform::Place
+  // &in_place;
+  WaitInputVarGenerated(*in_var_handle);
+
+  std::vector<const Scope *> var_scopes;
+  for (auto *s : local_scopes_) {
+    var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
   }
 
-  //
-  auto in_scope_idx = in_var_handle[0]->scope_idx_;
-  auto in_var =
-      local_scopes_.at(in_scope_idx)->FindVar(in_var_handle[0]->name_);
-  Tensor *in_tensor = GetTensorFromVar(in_var);
+  auto *in_var =
+      var_scopes.at(in_var_handle->scope_idx_)->FindVar(in_var_handle->name_);
+  PADDLE_ENFORCE_NOT_NULL(in_var);
+
+  Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
 
   for (auto *out : out_var_handles) {
-    auto &out_p = out->place_;
-    auto out_var = local_scopes_.at(out->scope_idx_)->FindVar(out->name_);
+    if (*out == *in_var_handle) {
+      continue;
+    }
 
-    PADDLE_ENFORCE_EQ(out_p.which(), in_place.which(),
+    auto &out_p = out->place_;
+    auto *out_var = var_scopes.at(out->scope_idx_)->FindVar(out->name_);
+    PADDLE_ENFORCE_NOT_NULL(out_var);
+    PADDLE_ENFORCE_EQ(out_p.which(), in_var_handle->place_.which(),
                       "Places must be all on CPU or all on CUDA.");
 
-    if (in_var->IsType<framework::SelectedRows>()) {
-      auto &in_sr = in_var->Get<framework::SelectedRows>();
-      auto out_sr = out_var->GetMutable<framework::SelectedRows>();
-      if (&in_sr == out_sr) continue;
-      out_sr->set_height(in_sr.height());
-      out_sr->set_rows(in_sr.rows());
-      out_sr->mutable_value()->Resize(in_sr.value().dims());
-      out_sr->mutable_value()->mutable_data(out_p, in_sr.value().type());
-    } else if (in_var->IsType<framework::LoDTensor>()) {
-      auto in_lod = in_var->Get<framework::LoDTensor>();
-      auto out_lod = out_var->GetMutable<framework::LoDTensor>();
-      if (&in_lod == out_lod) continue;
-      out_lod->set_lod(in_lod.lod());
-      out_lod->Resize(in_lod.dims());
-      out_lod->mutable_data(out_p, in_lod.type());
-    } else {
-      PADDLE_THROW("Var should be LoDTensor or SelectedRows.");
-    }
+    VariableVisitor::ShareDimsAndLoD(*in_var, out_var);
+    VariableVisitor::GetMutableTensor(out_var).mutable_data(out_p,
+                                                            in_tensor.type());
 
-    Tensor *out_tensor = GetTensorFromVar(out_var);
-    paddle::framework::TensorCopy(*in_tensor, out_p, *(dev_ctxes_[in_place]),
-                                  out_tensor);
+    auto dev_ctx = dev_ctxes_.at(out_p);
+    RunAndRecordEvent(out_p, [in_tensor, out_var, dev_ctx, out_p] {
+      paddle::framework::TensorCopy(
+          in_tensor, out_p, *(dev_ctx),
+          &VariableVisitor::GetMutableTensor(out_var));
+    });
+  }
+}
+
+void BroadcastOpHandle::WaitInputVarGenerated(const VarHandle &in_var) {
+  if (in_var.generated_op_) {
+    for (auto &pair : dev_ctxes_) {
+      in_var.generated_op_->Wait(pair.second);
+    }
   }
 }
 
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
index b3292422522b64a38a50f39f04e6f0d2e15492dd..92420f10ac5972b7924d83b43bb28234079e5072 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -29,9 +29,7 @@ namespace framework {
 namespace details {
 
 struct BroadcastOpHandle : public OpHandleBase {
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
-
+ public:
   BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places);
 
@@ -41,8 +39,12 @@ struct BroadcastOpHandle : public OpHandleBase {
 
  protected:
   void RunImpl() override;
-};
+  void WaitInputVarGenerated(const VarHandle &in_var);
 
+ private:
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+};
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
index dfc52b012f8b6bf5cf1a3feab90dc1ec7842ad6c..3f2dcde3e9597287d72046dd4f8b07faab1ede25 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -30,6 +30,7 @@ const f::DDim kDims = {20, 20};
 struct TestBroadcastOpHandle {
   std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
   std::vector<Scope*> local_scopes_;
+  std::vector<Scope*> param_scopes_;
   Scope g_scope_;
   std::unique_ptr<OpHandleBase> op_handle_;
   std::vector<std::unique_ptr<VarHandleBase>> vars_;
@@ -72,19 +73,20 @@ struct TestBroadcastOpHandle {
   void InitBroadcastOp(size_t input_scope_idx) {
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
       local_scopes_.push_back(&(g_scope_.NewScope()));
-      local_scopes_[j]->Var("out");
+      Scope& local_scope = local_scopes_.back()->NewScope();
+      *local_scopes_.back()
+           ->Var(details::kLocalExecScopeName)
+           ->GetMutable<Scope*>() = &local_scope;
+      local_scope.Var("out");
+      param_scopes_.emplace_back(&local_scope);
     }
-    local_scopes_[input_scope_idx]->Var("input");
+    param_scopes_[input_scope_idx]->Var("input");
 
     op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
 
-    vars_.emplace_back(new VarHandle());
-    VarHandle* in_var_handle = static_cast<VarHandle*>(vars_.back().get());
-    in_var_handle->place_ = gpu_list_[input_scope_idx];
-    in_var_handle->name_ = "input";
-    in_var_handle->version_ = 1;
-    in_var_handle->scope_idx_ = input_scope_idx;
-    in_var_handle->generated_op_ = nullptr;
+    auto* in_var_handle =
+        new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]);
+    vars_.emplace_back(in_var_handle);
     op_handle_->AddInput(in_var_handle);
 
     // add dummy var
@@ -95,13 +97,9 @@ struct TestBroadcastOpHandle {
     op_handle_->AddInput(dummy_var_handle);
 
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
-      vars_.emplace_back(new VarHandle());
-      VarHandle* out_var_handle = static_cast<VarHandle*>(vars_.back().get());
-      out_var_handle->place_ = gpu_list_[j];
-      out_var_handle->name_ = "out";
-      out_var_handle->version_ = 2;
-      out_var_handle->scope_idx_ = j;
+      op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
+      VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]);
+      vars_.emplace_back(out_var_handle);
       op_handle_->AddOutput(out_var_handle);
     }
 
@@ -114,7 +112,8 @@ struct TestBroadcastOpHandle {
   }
 
   void TestBroadcastLodTensor(size_t input_scope_idx) {
-    auto in_var = local_scopes_[input_scope_idx]->Var("input");
+    auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
+    PADDLE_ENFORCE_NOT_NULL(in_var);
     auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
     in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
 
@@ -126,6 +125,7 @@ struct TestBroadcastOpHandle {
     paddle::framework::TensorFromVector<float>(
         send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
     in_lod_tensor->set_lod(lod);
+    in_lod_tensor->Resize(kDims);
 
     op_handle_->Run(false);
 
@@ -133,7 +133,8 @@ struct TestBroadcastOpHandle {
 
     p::CPUPlace cpu_place;
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      auto out_var = local_scopes_[j]->Var("out");
+      auto out_var = param_scopes_[j]->FindVar("out");
+      PADDLE_ENFORCE_NOT_NULL(out_var);
       auto out_tensor = out_var->Get<f::LoDTensor>();
       PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
 
@@ -148,7 +149,8 @@ struct TestBroadcastOpHandle {
   }
 
   void TestBroadcastSelectedRows(size_t input_scope_idx) {
-    auto in_var = local_scopes_[input_scope_idx]->Var("input");
+    auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
+    PADDLE_ENFORCE_NOT_NULL(in_var);
     auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
     auto value = in_selected_rows->mutable_value();
     value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
@@ -171,7 +173,8 @@ struct TestBroadcastOpHandle {
 
     p::CPUPlace cpu_place;
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      auto out_var = local_scopes_[j]->Var("out");
+      auto out_var = param_scopes_[j]->FindVar("out");
+      PADDLE_ENFORCE_NOT_NULL(out_var);
       auto& out_select_rows = out_var->Get<f::SelectedRows>();
       auto rt = out_select_rows.value();
 
diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
index ff6d91c1dafb0ab4cabb1646cc333e19a89eb812..7ff0efe09387b7e5d7cfe0dfe5e129ca9914d90b 100644
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -28,8 +28,8 @@ ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
 void ComputationOpHandle::RunImpl() {
   auto *cur_ctx = dev_ctxes_[place_];
   for (auto *in : inputs_) {
-    bool need_wait =
-        in->generated_op_ && in->generated_op_->dev_ctxes_[place_] != cur_ctx;
+    bool need_wait = in->generated_op_ &&
+                     in->generated_op_->DeviceContext(place_) != cur_ctx;
     if (need_wait) {
       in->generated_op_->Wait(cur_ctx);
     }
diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h
index d6d2d731ca80a0fbc0a2a34027b5b7c3c1977c07..c363b973d9abbae6bea76c2458fbe82a37a342ca 100644
--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -14,6 +14,9 @@
 
 #pragma once
 
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -24,10 +27,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 struct ComputationOpHandle : public OpHandleBase {
-  std::unique_ptr<OperatorBase> op_;
-  Scope *scope_;
-  platform::Place place_;
-
+ public:
   ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
                       platform::Place place);
 
@@ -35,6 +35,11 @@ struct ComputationOpHandle : public OpHandleBase {
 
  protected:
   void RunImpl() override;
+
+ private:
+  std::unique_ptr<OperatorBase> op_;
+  Scope *scope_;
+  platform::Place place_;
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/container_cast.h b/paddle/fluid/framework/details/container_cast.h
new file mode 100644
index 0000000000000000000000000000000000000000..a42ae78dc45c2a885f98315a21f1d5558725bca3
--- /dev/null
+++ b/paddle/fluid/framework/details/container_cast.h
@@ -0,0 +1,40 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <type_traits>
+#include <vector>
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+template <typename ResultType, typename ElemType>
+std::vector<ResultType*> DynamicCast(const std::vector<ElemType*>& container) {
+  static_assert(std::is_base_of<ElemType, ResultType>::value,
+                "ElementType must be a base class of ResultType");
+  std::vector<ResultType*> res;
+  for (auto* ptr : container) {
+    auto* derived = dynamic_cast<ResultType*>(ptr);
+    if (derived) {
+      res.emplace_back(derived);
+    }
+  }
+  return res;
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/cow_ptr.h b/paddle/fluid/framework/details/cow_ptr.h
index 69bcea625288eba897e761a1d634f19c41dc0f79..21f75957be5f33f3dfc09c41fa9a1e1ca590f99e 100644
--- a/paddle/fluid/framework/details/cow_ptr.h
+++ b/paddle/fluid/framework/details/cow_ptr.h
@@ -14,7 +14,7 @@
 
 #pragma once
 #include <memory>
-#include <thread>
+#include <thread>  // NOLINT
 
 namespace paddle {
 namespace framework {
@@ -23,7 +23,7 @@ namespace details {
 // Change it to thread safe flags if needed.
 class ThreadUnsafeOwnershipFlags {
  public:
-  ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
+  explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
 
   ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
   ThreadUnsafeOwnershipFlags& operator=(
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index e3e7c55d153aec8ce9c25c962821b266eaa84fe4..946ee91a667496e2427304df4228334bb1061890 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -51,23 +51,23 @@ void FetchOpHandle::RunImpl() {
     auto *var = static_cast<VarHandle *>(input);
     var->generated_op_->Wait(cpu_ctx);
   }
-
   tensors_.resize(inputs_.size());
-  auto *var = static_cast<VarHandle *>(inputs_[0]);
-  auto &var_name = var->name_;
+  auto *var_handle = static_cast<VarHandle *>(inputs_[0]);
+  auto &var_name = var_handle->name_;
   platform::CPUPlace cpu;
   auto &scopes = *local_scopes_;
 
   for (size_t i = 0; i < scopes.size(); ++i) {
     auto &scope = scopes[i];
-    auto &t = scope->FindVar(kLocalExecScopeName)
-                  ->Get<Scope *>()
-                  ->FindVar(var_name)
-                  ->Get<framework::LoDTensor>();
-    if (platform::is_gpu_place(var->place_)) {
+    auto *var =
+        scope->FindVar(kLocalExecScopeName)->Get<Scope *>()->FindVar(var_name);
+    PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
+                            var_name);
+    auto &t = var->Get<framework::LoDTensor>();
+    if (platform::is_gpu_place(t.place())) {
 #ifdef PADDLE_WITH_CUDA
       TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
-      dev_ctxes_[t.place()]->Wait();
+      dev_ctxes_.at(t.place())->Wait();
 #endif
     } else {
       tensors_[i].ShareDataWith(t);
diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h
index 904b2d669f8b156b99197afb0155380d1170a68b..b49f3df338dc11310a4a0c27c8aaae3602373fcc 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -14,6 +14,9 @@
 
 #pragma once
 
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/scope.h"
@@ -24,11 +27,7 @@ namespace framework {
 namespace details {
 
 struct FetchOpHandle : public OpHandleBase {
-  FeedFetchList *data_;
-  size_t offset_;
-  std::vector<Scope *> *local_scopes_;
-  std::vector<LoDTensor> tensors_;
-
+ public:
   FetchOpHandle(FeedFetchList *data, size_t offset,
                 std::vector<Scope *> *local_scopes);
 
@@ -42,6 +41,12 @@ struct FetchOpHandle : public OpHandleBase {
 
  protected:
   void RunImpl() override;
+
+ private:
+  FeedFetchList *data_;
+  size_t offset_;
+  std::vector<Scope *> *local_scopes_;
+  std::vector<LoDTensor> tensors_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc
index 8dd85be567d33991ac003707fec939a61a2d0962..3ed7723919fc3a547b15c28b846de758a8155e66 100644
--- a/paddle/fluid/framework/details/gather_op_handle.cc
+++ b/paddle/fluid/framework/details/gather_op_handle.cc
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/gather_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
 
 namespace paddle {
 namespace framework {
@@ -23,46 +25,40 @@ GatherOpHandle::GatherOpHandle(const std::vector<Scope *> &local_scopes,
     : local_scopes_(local_scopes), places_(places) {}
 
 void GatherOpHandle::RunImpl() {
-  // the input may have dummy var.
-  std::vector<VarHandle *> in_var_handles;
-  for (auto *in : inputs_) {
-    auto *in_handle = dynamic_cast<VarHandle *>(in);
-    if (in_handle) {
-      in_var_handles.push_back(in_handle);
-    }
-  }
+  // the input and output may have dummy var.
+  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
+
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), places_.size(),
       "The number of output should equal to the number of places.");
 
-  // the output may have dummy var.
-  std::vector<VarHandle *> out_var_handles;
-  for (auto *out : outputs_) {
-    auto *out_handle = dynamic_cast<VarHandle *>(out);
-    if (out_handle) {
-      out_var_handles.push_back(out_handle);
-    }
+  VarHandle *out_var_handle;
+  {
+    auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+
+    PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
+                      "The number of output should be one.");
+    out_var_handle = out_var_handles.front();
   }
-  PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
-                    "The number of output should be one.");
 
-  auto in_0_handle = static_cast<VarHandle *>(in_var_handles[0]);
-  auto pre_in_var =
-      local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_);
-  auto pre_place = in_0_handle->place_;
+  std::vector<const Scope *> var_scopes;
+  for (auto *s : local_scopes_) {
+    var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
+  }
 
+  auto in_0_handle = in_var_handles[0];
+  auto pre_in_var =
+      var_scopes.at(in_0_handle->scope_idx_)->FindVar(in_0_handle->name_);
+  PADDLE_ENFORCE_NOT_NULL(pre_in_var);
   PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
                  "Currently, gather_op only can gather SelectedRows.");
 
-  PADDLE_ENFORCE_EQ(out_var_handles[0]->place_.which(), pre_place.which(),
+  auto pre_place = in_0_handle->place_;
+  PADDLE_ENFORCE_EQ(out_var_handle->place_.which(), pre_place.which(),
                     "The place of input and output should be the same.");
 
   // Wait input done, this Wait is asynchronous operation
-  for (auto *in : in_var_handles) {
-    if (in->generated_op_) {
-      in->generated_op_->Wait(dev_ctxes_[in->place_]);
-    }
-  }
+  WaitInputVarGenerated(in_var_handles);
 
   std::vector<int64_t> out_rows;
   std::vector<Tensor> in_tensors;
@@ -70,34 +66,32 @@ void GatherOpHandle::RunImpl() {
 
   auto &pre_in = pre_in_var->Get<framework::SelectedRows>();
   // gather the inputs
-  for (auto *in : in_var_handles) {
-    auto in_handle = static_cast<VarHandle *>(in);
+  for (auto *in_handle : in_var_handles) {
     auto in_p = in_handle->place_;
     in_places.push_back(in_p);
     PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(),
                       "Places must be all on CPU or all on CUDA.");
-    auto in_var =
-        local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
+    auto *in_var =
+        var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
     auto &in_sr = in_var->Get<framework::SelectedRows>();
 
     PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(),
                       "The type of input is not consistent.");
     PADDLE_ENFORCE_EQ(pre_in.height(), in_sr.height(),
                       "The height of inputs is not consistent.");
-    PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(), ,
+    PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(),
                       "The dims of inputs is not consistent.");
 
-    auto in_sr_rows = in_sr.rows();
+    auto &in_sr_rows = in_sr.rows();
     out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end());
 
     in_tensors.emplace_back(in_sr.value());
   }
 
   // write the output
-  auto &out_place = out_var_handles[0]->place_;
-  auto out_scope_idx = out_var_handles[0]->scope_idx_;
-  auto out_var =
-      local_scopes_[out_scope_idx]->FindVar(out_var_handles[0]->name_);
+  auto &out_place = out_var_handle->place_;
+  auto out_scope_idx = out_var_handle->scope_idx_;
+  auto out_var = var_scopes.at(out_scope_idx)->FindVar(out_var_handle->name_);
 
   auto out = out_var->GetMutable<framework::SelectedRows>();
   out->set_height(pre_in.height());
@@ -110,13 +104,27 @@ void GatherOpHandle::RunImpl() {
   Tensor *out_tensor = out->mutable_value();
 
   // copy
-  int s = 0, e = 0;
-  for (size_t j = 0; j < in_tensors.size(); ++j) {
-    e += in_tensors[j].dims()[0];
-    auto sub_out = out_tensor->Slice(s, e);
-    paddle::framework::TensorCopy(in_tensors[j], out_place,
-                                  *(dev_ctxes_[in_places[j]]), &sub_out);
-    s = e;
+  auto dev_ctx = dev_ctxes_[out_place];
+  RunAndRecordEvent(out_place, [in_tensors, out_tensor, dev_ctx, out_place] {
+    int s = 0, e = 0;
+    for (size_t j = 0; j < in_tensors.size(); ++j) {
+      e += in_tensors[j].dims()[0];
+      auto sub_out = out_tensor->Slice(s, e);
+      paddle::framework::TensorCopy(in_tensors[j], out_place, *(dev_ctx),
+                                    &sub_out);
+      s = e;
+    }
+  });
+}
+
+void GatherOpHandle::WaitInputVarGenerated(
+    const std::vector<VarHandle *> &in_var_handles) {
+  for (auto *in : in_var_handles) {
+    if (in->generated_op_) {
+      for (auto pair : dev_ctxes_) {
+        in->generated_op_->Wait(pair.second);
+      }
+    }
   }
 }
 
diff --git a/paddle/fluid/framework/details/gather_op_handle.h b/paddle/fluid/framework/details/gather_op_handle.h
index 6c0231f642c05e6b558b7e2518a15e08c816fe4b..c394dd7a14b07cb956aa1aedfc0df4fa25744dd7 100644
--- a/paddle/fluid/framework/details/gather_op_handle.h
+++ b/paddle/fluid/framework/details/gather_op_handle.h
@@ -29,9 +29,7 @@ namespace framework {
 namespace details {
 
 struct GatherOpHandle : public OpHandleBase {
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
-
+ public:
   GatherOpHandle(const std::vector<Scope *> &local_scopes,
                  const std::vector<platform::Place> &places);
 
@@ -41,6 +39,11 @@ struct GatherOpHandle : public OpHandleBase {
 
  protected:
   void RunImpl() override;
+  void WaitInputVarGenerated(const std::vector<VarHandle *> &in_var_handles);
+
+ private:
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc
index 10839f239d59e97946575297a6d125968a1458f4..3cce2cc1640b3866130126424ff8fef18b8befc6 100644
--- a/paddle/fluid/framework/details/gather_op_handle_test.cc
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -29,6 +29,7 @@ const f::DDim kDims = {20, 20};
 struct TestGatherOpHandle {
   std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
   std::vector<Scope*> local_scopes_;
+  std::vector<Scope*> param_scopes_;
   Scope g_scope_;
   std::unique_ptr<OpHandleBase> op_handle_;
   std::vector<std::unique_ptr<VarHandleBase>> vars_;
@@ -71,21 +72,21 @@ struct TestGatherOpHandle {
   void InitGatherOp(size_t input_scope_idx) {
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
       local_scopes_.push_back(&(g_scope_.NewScope()));
-      local_scopes_[j]->Var("out");
+      Scope& local_scope = local_scopes_.back()->NewScope();
+      *local_scopes_.back()
+           ->Var(details::kLocalExecScopeName)
+           ->GetMutable<Scope*>() = &local_scope;
+      local_scope.Var("input");
+      param_scopes_.emplace_back(&local_scope);
     }
-    local_scopes_[input_scope_idx]->Var("input");
+    param_scopes_[input_scope_idx]->Var("out");
 
     op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_));
     // add input
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
-      vars_.emplace_back(new VarHandle());
-      VarHandle* in_var_handle = static_cast<VarHandle*>(vars_.back().get());
-      in_var_handle->place_ = gpu_list_[j];
-      in_var_handle->name_ = "input";
-      in_var_handle->version_ = 1;
-      in_var_handle->scope_idx_ = j;
-      in_var_handle->generated_op_ = nullptr;
+      op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
+      auto* in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
+      vars_.emplace_back(in_var_handle);
       op_handle_->AddInput(in_var_handle);
     }
 
@@ -97,12 +98,9 @@ struct TestGatherOpHandle {
     op_handle_->AddInput(in_dummy_var_handle);
 
     // add output
-    vars_.emplace_back(new VarHandle());
-    VarHandle* out_var_handle = static_cast<VarHandle*>(vars_.back().get());
-    out_var_handle->place_ = gpu_list_[input_scope_idx];
-    out_var_handle->name_ = "out";
-    out_var_handle->version_ = 2;
-    out_var_handle->scope_idx_ = input_scope_idx;
+    auto* out_var_handle =
+        new VarHandle(2, input_scope_idx, "out", gpu_list_[input_scope_idx]);
+    vars_.emplace_back(out_var_handle);
     op_handle_->AddOutput(out_var_handle);
 
     // add dummy var
@@ -123,7 +121,8 @@ struct TestGatherOpHandle {
 
     for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
          ++input_scope_idx) {
-      auto in_var = local_scopes_[input_scope_idx]->Var("input");
+      auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input");
+      PADDLE_ENFORCE_NOT_NULL(in_var);
       auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
       auto value = in_selected_rows->mutable_value();
       value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
@@ -136,10 +135,11 @@ struct TestGatherOpHandle {
       value->Resize(kDims);
     }
 
-    auto out_var = local_scopes_[output_scope_idx]->Var("out");
+    auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out");
+    PADDLE_ENFORCE_NOT_NULL(out_var);
     auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
 
-    auto in_var = local_scopes_[output_scope_idx]->Var("input");
+    auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input");
     auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
 
     out_selected_rows->mutable_value()->ShareDataWith(
@@ -163,7 +163,8 @@ struct TestGatherOpHandle {
     f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
     float* ct = result_tensor.data<float>();
 
-    for (int64_t j = 0; j < f::product(kDims); ++j) {
+    for (int64_t j = 0;
+         j < f::product(kDims) * static_cast<int64_t>(gpu_list_.size()); ++j) {
       ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
     }
   }
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index 5a95cbc53625888bac539f91af391ff0babec17b..002952436e58eecfcecf5c9fa40c01b795170681 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -60,7 +60,8 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
                                                 const platform::Place &p,
                                                 const size_t &i) const {
   auto *op_handle = result->ops_.back().get();
-  op_handle->dev_ctxes_[p] = platform::DeviceContextPool::Instance().Get(p);
+  op_handle->SetDeviceContext(p,
+                              platform::DeviceContextPool::Instance().Get(p));
 
   auto var_names = op.InputArgumentNames();
 
@@ -89,105 +90,25 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
 
   bool is_forwarding = true;
   for (auto *op : program.Block(0).AllOps()) {
-    bool change_forward = false;
-    if (!is_forwarding) {
-      // FIXME(yy): Do not hard code like this
-      if (op->OutputArgumentNames().size() == 1 &&
-          op->OutputArgumentNames()[0] == GradVarName(loss_var_name_)) {
-        continue;  // Drop fill 1. for backward coeff;
-      }
-    }
-
-    // append send op if program is distributed trainer main program.
-    // always use the first device
-    if (!is_forwarding && op->Type() == "send") {
-      auto &p = places_[0];
-      auto *s = local_scopes_[0];
-      // FIXME(wuyi): send op always copy from GPU 0
-      result.ops_.emplace_back(new SendOpHandle(*op, s, p));
-      // Create inputs for output on original place and no ssa output
-      // is created for send op.
-      CreateOpHandleIOs(&result, *op, p, 0);
-      continue;
-    }
-
-    for (size_t i = 0; i < places_.size(); ++i) {
-      auto &p = places_[i];
-      auto *s = local_scopes_[i];
-
-      result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
-      auto *op_handle = result.ops_.back().get();
-      CreateOpHandleIOs(&result, *op, p, i);
-
-      auto var_names = op->OutputArgumentNames();
-
-      if (is_forwarding) {
-        if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
-// Insert ScaleCost OpHandle
-#ifdef PADDLE_WITH_CUDA
-          auto *communication_dev_ctx = nccl_ctxs_->DevCtx(p);
-#else
-          auto *communication_dev_ctx =
-              platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
-#endif
-
-          op_handle = new ScaleLossGradOpHandle(local_scopes_.size(), s, p,
-                                                communication_dev_ctx);
-          result.ops_.emplace_back(op_handle);
-
-          // FIXME: Currently ScaleLossGradOp only use device_count as scale
-          // factor. So it does not depend on any other operators.
-          // VarHandle *loss = GetVarHandle(loss_var_name, place);
-          // loss->pending_ops_.emplace_back(op_handle);
-          // op_handle->inputs_.emplace_back(loss);
-
-          CreateOpOutput(&result, op_handle, GradVarName(loss_var_name_), p, i);
-          change_forward = true;
-        }
-      }
-    }
-
-    if (change_forward) {
+    if (op->Type() == "send") {
+      // append send op if program is distributed trainer main program.
+      // always use the first device
+      CreateSendOp(&result, *op);
+    } else if (IsScaleLossOp(*op)) {
+      CreateScaleLossGradOp(&result);
       is_forwarding = false;
-    }
-
-    if (!is_forwarding) {
-      auto var_names = op->OutputArgumentNames();
-      // Currently, we assume that once gradient is generated, it can be
-      // broadcast, and each gradient is only broadcast once. But there are no
-      // other cases, for example, we need to adjust the gradient according to
-      // the input when we get the gradient, which is not considered at present.
-      for (auto &og : var_names) {
-        if (grad_names_.count(og) != 0 &&
-            og_has_been_broadcast.count(og) == 0) {  // is param grad
-                                                     // Insert NCCL AllReduce Op
-          og_has_been_broadcast.insert(og);
-#ifdef PADDLE_WITH_CUDA
-          result.ops_.emplace_back(
-              new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
-          auto *op_handle = result.ops_.back().get();
-
-          for (size_t i = 0; i < places_.size(); ++i) {
-            auto &p = places_[i];
-            auto &vars = result.vars_[i][og];
-
-            if (vars.empty()) {  // This device has no data. continue.
-              continue;
-            }
-            auto &prev_grad = vars[vars.size() - 1];
-            op_handle->AddInput(prev_grad.get());
-
-            vars.emplace_back(new VarHandle);
-            auto &var = vars.back();
-            var->place_ = p;
-            var->name_ = og;
-            var->version_ = vars.size() - 1;
-
-            op_handle->AddOutput(var.get());
+    } else {
+      CreateComputationalOps(&result, *op);
+      if (!is_forwarding) {
+        // Currently, we assume that once gradient is generated, it can be
+        // broadcast, and each gradient is only broadcast once. But there are no
+        // other cases, for example, we need to adjust the gradient according to
+        // the input when we get the gradient, which is not considered at
+        // present.
+        for (auto &og : op->OutputArgumentNames()) {
+          if (IsParameterGradientOnce(og, &og_has_been_broadcast)) {
+            InsertNCCLAllReduceOp(&result, og);
           }
-#else
-          PADDLE_ENFORCE("Not implemented");
-#endif
         }
       }
     }
@@ -211,7 +132,95 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
   }
 
   return std::unique_ptr<SSAGraph>(graph);
-}  // namespace details
+}
+
+void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
+    SSAGraph *result, const std::string &og) const {
+#ifdef PADDLE_WITH_CUDA
+  result->ops_.emplace_back(
+      new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
+  auto *op_handle = result->ops_.back().get();
+
+  for (size_t i = 0; i < places_.size(); ++i) {
+    auto &p = places_[i];
+    auto &vars = result->vars_[i][og];
+    PADDLE_ENFORCE(!vars.empty());
+    auto &prev_grad = vars.back();
+    op_handle->AddInput(prev_grad.get());
+
+    auto var = new VarHandle(vars.size() - 1, i, og, p);
+    vars.emplace_back(var);
+    op_handle->AddOutput(var);
+  }
+#else
+  PADDLE_ENFORCE("Not implemented");
+#endif
+}
+
+bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
+    const std::string &og,
+    std::unordered_set<std::string> *og_has_been_broadcast) const {
+  bool is_pg_once =
+      grad_names_.count(og) != 0 && og_has_been_broadcast->count(og) == 0;
+  if (is_pg_once) {
+    // Insert NCCL AllReduce Op
+    og_has_been_broadcast->insert(og);
+  }
+  return is_pg_once;
+}
+
+void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
+  for (size_t i = 0; i < places_.size(); ++i) {
+// Insert ScaleCost OpHandle
+#ifdef PADDLE_WITH_CUDA
+    auto *communication_dev_ctx = nccl_ctxs_->DevCtx(places_[i]);
+#else
+    auto *communication_dev_ctx =
+        platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
+#endif
+
+    auto *op_handle =
+        new ScaleLossGradOpHandle(local_scopes_.size(), local_scopes_[i],
+                                  places_[i], communication_dev_ctx);
+    result->ops_.emplace_back(op_handle);
+
+    // FIXME: Currently ScaleLossGradOp only use device_count as scale
+    // factor. So it does not depend on any other operators.
+    // VarHandle *loss = GetVarHandle(loss_var_name, place);
+    // loss->pending_ops_.emplace_back(op_handle);
+    // op_handle->inputs_.emplace_back(loss);
+
+    CreateOpOutput(result, op_handle, GradVarName(loss_var_name_), places_[i],
+                   i);
+  }
+}
+
+void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result,
+                                                     const OpDesc &op) const {
+  for (size_t scope_idx = 0; scope_idx < places_.size(); ++scope_idx) {
+    auto p = places_[scope_idx];
+    auto s = local_scopes_[scope_idx];
+    result->ops_.emplace_back(new ComputationOpHandle(op, s, p));
+    CreateOpHandleIOs(result, op, p, scope_idx);
+  }
+}
+
+void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result,
+                                           const OpDesc &op) const {
+  auto &p = places_[0];
+  auto *s = local_scopes_[0];
+  // FIXME(wuyi): send op always copy from GPU 0
+  result->ops_.emplace_back(new SendOpHandle(op, s, p));
+  // Create inputs for output on original place and no ssa output
+  // is created for send op.
+  CreateOpHandleIOs(result, op, p, 0);
+}
+
+bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
+  // FIXME(yy): Do not hard code like this
+  return op.OutputArgumentNames().size() == 1 &&
+         op.OutputArgumentNames()[0] == GradVarName(loss_var_name_);
+}
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index f1518d75b421006db6311c3b0f602e47000ab381..b5ba2dbd3c00f23fabd993d7908664db38a31941 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -57,6 +57,20 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 #ifdef PADDLE_WITH_CUDA
   platform::NCCLContextMap *nccl_ctxs_;
 #endif
+
+  bool IsScaleLossOp(const OpDesc &op) const;
+
+  void CreateSendOp(SSAGraph *result, const OpDesc &op) const;
+
+  void CreateComputationalOps(SSAGraph *result, const OpDesc &op) const;
+
+  void CreateScaleLossGradOp(SSAGraph *result) const;
+
+  bool IsParameterGradientOnce(
+      const std::string &og,
+      std::unordered_set<std::string> *og_has_been_broadcast) const;
+
+  void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const;
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
index 1e48f75958a3ada4d1cd5c8d0f920da4fed2157e..b055bb48f608c9fd9cc671d175cb463d25dc489b 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -13,8 +13,8 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
-
 #include <algorithm>
+#include "paddle/fluid/framework/details/reduce_and_gather.h"
 
 namespace paddle {
 namespace framework {
@@ -29,32 +29,6 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
   }
 }
 
-struct ReduceLoDTensor {
-  const std::vector<LoDTensor> &src_tensors_;
-  LoDTensor &dst_tensor_;
-
-  ReduceLoDTensor(const std::vector<LoDTensor> &src, LoDTensor *dst)
-      : src_tensors_(src), dst_tensor_(*dst) {}
-
-  template <typename T>
-  void operator()() const {
-    PADDLE_ENFORCE(!src_tensors_.empty());
-    auto &t0 = src_tensors_[0];
-    PADDLE_ENFORCE_NE(t0.numel(), 0);
-    dst_tensor_.Resize(t0.dims());
-    T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
-    std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
-
-    for (size_t i = 1; i < src_tensors_.size(); ++i) {
-      auto &t = src_tensors_[i];
-      PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
-      PADDLE_ENFORCE_EQ(t.type(), t0.type());
-      std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
-                     [](T a, T b) -> T { return a + b; });
-    }
-  }
-};
-
 void NCCLAllReduceOpHandle::RunImpl() {
   if (inputs_.size() == 1) {
     return;  // No need to all reduce when GPU count = 1;
@@ -69,20 +43,21 @@ void NCCLAllReduceOpHandle::RunImpl() {
     int dtype = -1;
     size_t numel = 0;
 
-    std::vector<LoDTensor> lod_tensors;
+    std::vector<const LoDTensor *> lod_tensors;
 
     for (size_t i = 0; i < local_scopes_.size(); ++i) {
       auto *s = local_scopes_[i];
+      auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
 
-      auto &lod_tensor = s->FindVar(var_name)->Get<LoDTensor>();
-      lod_tensors.emplace_back(lod_tensor);
+      auto &lod_tensor = local_scope.FindVar(var_name)->Get<LoDTensor>();
+      lod_tensors.emplace_back(&lod_tensor);
     }
 
-    if (platform::is_gpu_place(lod_tensors[0].place())) {
+    if (platform::is_gpu_place(lod_tensors[0]->place())) {
       std::vector<std::function<void()>> all_reduce_calls;
       for (size_t i = 0; i < local_scopes_.size(); ++i) {
         auto &p = places_[i];
-        auto &lod_tensor = lod_tensors[i];
+        auto &lod_tensor = *lod_tensors[i];
         void *buffer = const_cast<void *>(lod_tensor.data<void>());
 
         if (dtype == -1) {
@@ -110,17 +85,21 @@ void NCCLAllReduceOpHandle::RunImpl() {
         }
       });
     } else {  // Special handle CPU only Operator's gradient. Like CRF
-      auto &trg =
-          *this->local_scopes_[0]->Var()->GetMutable<framework::LoDTensor>();
+      auto &trg = *this->local_scopes_[0]
+                       ->FindVar(kLocalExecScopeName)
+                       ->Get<Scope *>()
+                       ->Var()
+                       ->GetMutable<framework::LoDTensor>();
 
       // Reduce All Tensor to trg in CPU
       ReduceLoDTensor func(lod_tensors, &trg);
-      VisitDataType(ToDataType(lod_tensors[0].type()), func);
+      VisitDataType(ToDataType(lod_tensors[0]->type()), func);
 
       for (size_t i = 0; i < local_scopes_.size(); ++i) {
-        auto &scope = local_scopes_[i];
+        auto &scope =
+            *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
         auto &p = places_[i];
-        auto *var = scope->FindVar(var_name);
+        auto *var = scope.FindVar(var_name);
         auto *dev_ctx = dev_ctxes_[p];
 
         RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
index ad14a3c5cb4625fa121cad2daed389c441e78771..a0c321843e3fc5abcbd1ef2ce2e153250269aa7d 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -27,10 +27,6 @@ namespace framework {
 namespace details {
 
 struct NCCLAllReduceOpHandle : public OpHandleBase {
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
-  const platform::NCCLContextMap &nccl_ctxs_;
-
   NCCLAllReduceOpHandle(const std::vector<Scope *> &local_scopes,
                         const std::vector<platform::Place> &places,
                         const platform::NCCLContextMap &ctxs);
@@ -43,6 +39,11 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
 
  protected:
   void RunImpl() override;
+
+ private:
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+  const platform::NCCLContextMap &nccl_ctxs_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index a9a6c8d39cf8741f7d9c91579a650ad742cec381..00f213f3ed294adcce7c540e3ff346de8e2be7fb 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -27,28 +27,15 @@ namespace details {
 constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
 
 class OpHandleBase {
- private:
-  DISABLE_COPY_AND_ASSIGN(OpHandleBase);
-
  public:
-  std::vector<VarHandleBase *> inputs_;
-  std::vector<VarHandleBase *> outputs_;
-  std::unordered_map<platform::Place, platform::DeviceContext *,
-                     platform::PlaceHash>
-      dev_ctxes_;
-
-#ifdef PADDLE_WITH_CUDA
-  std::unordered_map<int, cudaEvent_t> events_;
-#endif
-
   OpHandleBase() {}
 
+  virtual ~OpHandleBase();
+
   std::string DebugString() const;
 
   virtual std::string Name() const = 0;
 
-  virtual ~OpHandleBase();
-
   void Run(bool use_event);
 
   virtual void Wait(platform::DeviceContext *waited_dev);
@@ -61,6 +48,18 @@ class OpHandleBase {
   // will likely block other computations.
   virtual bool IsMultiDeviceTransfer() { return false; }
 
+  const platform::DeviceContext *DeviceContext(platform::Place place) {
+    return dev_ctxes_[place];
+  }
+
+  void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {
+    dev_ctxes_[place] = ctx_;
+  }
+
+  const std::vector<VarHandleBase *> &Inputs() const { return inputs_; }
+
+  const std::vector<VarHandleBase *> &Outputs() const { return outputs_; }
+
  protected:
   void RunAndRecordEvent(const std::function<void()> &callback);
 
@@ -68,6 +67,18 @@ class OpHandleBase {
                          const std::function<void()> &callback);
 
   virtual void RunImpl() = 0;
+
+  std::vector<VarHandleBase *> inputs_;
+  std::vector<VarHandleBase *> outputs_;
+  std::unordered_map<platform::Place, platform::DeviceContext *,
+                     platform::PlaceHash>
+      dev_ctxes_;
+
+#ifdef PADDLE_WITH_CUDA
+  std::unordered_map<int, cudaEvent_t> events_;
+#endif
+
+  DISABLE_COPY_AND_ASSIGN(OpHandleBase);
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h
index d73604ad185a66ade0168f585d1951d0d7d4a5f9..06603db31e0092382c0cc05482a038473d647ef1 100644
--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@@ -14,6 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
+#include <tuple>
+#include <vector>
 #include "paddle/fluid/framework/grad_op_desc_maker.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b95a284990da8f9b7c16d6e4221eb1ed061f74b
--- /dev/null
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -0,0 +1,94 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include <map>
+#include <vector>
+#include "paddle/fluid/framework/details/reduce_and_gather.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/selected_rows.h"
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct ReduceLoDTensor {
+  const std::vector<const LoDTensor *> &src_tensors_;
+  LoDTensor &dst_tensor_;
+
+  ReduceLoDTensor(const std::vector<const LoDTensor *> &src, LoDTensor *dst)
+      : src_tensors_(src), dst_tensor_(*dst) {}
+
+  template <typename T>
+  void operator()() const {
+    PADDLE_ENFORCE(!src_tensors_.empty());
+    auto &t0 = *src_tensors_[0];
+    PADDLE_ENFORCE_NE(t0.numel(), 0);
+    dst_tensor_.Resize(t0.dims());
+    T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
+    std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
+
+    for (size_t i = 1; i < src_tensors_.size(); ++i) {
+      auto &t = *src_tensors_[i];
+      PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
+      PADDLE_ENFORCE_EQ(t.type(), t0.type());
+      std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
+                     [](T a, T b) -> T { return a + b; });
+    }
+  }
+};
+
+inline void GatherSelectedRows(
+    const std::vector<const SelectedRows *> &src_selecte_rows_,
+    const std::vector<platform::Place> &in_places,
+    const std::unordered_map<platform::Place, platform::DeviceContext *,
+                             platform::PlaceHash> &dev_ctxes,
+    const platform::Place &out_place, SelectedRows *dst_selecte_rows) {
+  PADDLE_ENFORCE(!src_selecte_rows_.empty());
+
+  std::vector<Tensor> in_tensors;
+  std::vector<int64_t> out_rows;
+
+  for (auto in_sr_ptr : src_selecte_rows_) {
+    auto &in_sr = *in_sr_ptr;
+    in_tensors.emplace_back(in_sr.value());
+    out_rows.insert(out_rows.end(), in_sr.rows().begin(), in_sr.rows().end());
+  }
+
+  auto &pre_in = src_selecte_rows_[0];
+
+  auto &dst_tensor = *dst_selecte_rows;
+  dst_tensor.set_height(pre_in->height());
+  dst_tensor.set_rows(out_rows);
+  size_t rows = out_rows.size();
+  DDim out_dim = pre_in->GetCompleteDims();
+  out_dim[0] = static_cast<int64_t>(rows);
+  dst_tensor.mutable_value()->Resize(out_dim);
+  dst_tensor.mutable_value()->mutable_data(out_place, pre_in->value().type());
+  Tensor *out_tensor = dst_tensor.mutable_value();
+
+  // copy
+  int s = 0, e = 0;
+  for (size_t j = 0; j < in_tensors.size(); ++j) {
+    e += in_tensors[j].dims()[0];
+    auto sub_out = out_tensor->Slice(s, e);
+    paddle::framework::TensorCopy(in_tensors[j], out_place,
+                                  *(dev_ctxes.at(in_places[j])), &sub_out);
+    s = e;
+  }
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..409e8f72b841de03dcb50e62de447ae9895df2c0
--- /dev/null
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -0,0 +1,169 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/reduce_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/reduce_and_gather.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+void ReduceOpHandle::RunImpl() {
+  // the input and output may have dummy var.
+  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
+
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size(), places_.size(),
+      "The number of output should equal to the number of places.");
+
+  VarHandle *out_var_handle;
+  {
+    auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+
+    PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
+                      "The number of output should be one.");
+    out_var_handle = out_var_handles.front();
+  }
+
+  auto in_0_handle = in_var_handles[0];
+
+  std::vector<const Scope *> var_scopes;
+  for (auto *s : local_scopes_) {
+    var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
+  }
+
+  auto pre_in_var =
+      var_scopes.at(in_0_handle->scope_idx_)->FindVar(in_0_handle->name_);
+  PADDLE_ENFORCE_NOT_NULL(pre_in_var);
+
+  // Wait input done, this Wait is asynchronous operation
+  WaitInputVarGenerated(in_var_handles);
+  auto pre_place = in_0_handle->place_;
+  std::vector<platform::Place> in_places;
+  auto pre_in_tensor = VariableVisitor::GetMutableTensor(pre_in_var);
+  for (auto *in_handle : in_var_handles) {
+    auto in_p = in_handle->place_;
+    PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(),
+                      "Places must be all on CPU or all on CUDA.");
+    in_places.emplace_back(in_p);
+
+    auto in_var =
+        var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
+    PADDLE_ENFORCE_NOT_NULL(in_var);
+
+    auto in_tensor = VariableVisitor::GetMutableTensor(in_var);
+    PADDLE_ENFORCE_EQ(in_tensor.type(), pre_in_tensor.type(),
+                      "The type of input is not consistent.");
+  }
+
+  auto out_var =
+      var_scopes.at(out_var_handle->scope_idx_)->FindVar(out_var_handle->name_);
+  PADDLE_ENFORCE_NOT_NULL(out_var);
+
+  if (pre_in_var->IsType<framework::SelectedRows>()) {
+    std::vector<const SelectedRows *> in_selected_rows =
+        GetInputValues<SelectedRows>(in_var_handles, var_scopes);
+
+    GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_,
+                       out_var_handle->place_,
+                       out_var->GetMutable<framework::SelectedRows>());
+  } else {
+    std::vector<const LoDTensor *> lod_tensors =
+        GetInputValues<LoDTensor>(in_var_handles, var_scopes);
+
+    if (paddle::platform::is_cpu_place(pre_place)) {
+      ReduceLoDTensor func(lod_tensors,
+                           out_var->GetMutable<framework::LoDTensor>());
+      VisitDataType(ToDataType(lod_tensors[0]->type()), func);
+    } else if (paddle::platform::is_gpu_place(pre_place)) {
+#ifdef PADDLE_WITH_CUDA
+      auto pre_in = pre_in_var->Get<framework::LoDTensor>();
+      VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
+      VariableVisitor::GetMutableTensor(out_var).mutable_data(
+          out_var_handle->place_, pre_in.type());
+
+      auto out_p = out_var_handle->place_;
+      int root = boost::get<platform::CUDAPlace>(out_p).device;
+      std::vector<std::function<void()>> all_reduce_calls;
+      for (size_t i = 0; i < var_scopes.size(); ++i) {
+        auto &p = in_places[i];
+        auto &lod_tensor = *lod_tensors[i];
+
+        int dev_id = boost::get<platform::CUDAPlace>(p).device;
+        auto &nccl_ctx = nccl_ctxs_->at(dev_id);
+        auto stream = nccl_ctx.stream();
+        auto comm = nccl_ctx.comm_;
+
+        void *buffer = const_cast<void *>(lod_tensor.data<void>());
+        void *recvbuffer = nullptr;
+        if (root == dev_id) {
+          recvbuffer =
+              out_var->GetMutable<framework::LoDTensor>()->mutable_data(
+                  out_var_handle->place_);
+        }
+
+        int type = platform::ToNCCLDataType(lod_tensor.type());
+        all_reduce_calls.emplace_back([=] {
+          PADDLE_ENFORCE(platform::dynload::ncclReduce(
+              buffer, recvbuffer, static_cast<size_t>(lod_tensor.numel()),
+              static_cast<ncclDataType_t>(type), ncclSum, root, comm, stream));
+        });
+      }
+
+      this->RunAndRecordEvent([&] {
+        platform::NCCLGroupGuard guard;
+        for (auto &call : all_reduce_calls) {
+          call();
+        }
+      });
+#else
+      PADDLE_THROW("CUDA is not support.");
+#endif
+    } else {
+      PADDLE_THROW("Place should be CPUPlace or CUDAPlace.");
+    }
+  }
+}
+
+template <typename T>
+std::vector<const T *> ReduceOpHandle::GetInputValues(
+    const std::vector<VarHandle *> &in_var_handles,
+    const std::vector<const Scope *> &var_scopes) const {
+  std::vector<const T *> in_selected_rows;
+  for (auto *in_handle : in_var_handles) {
+    auto &in_sr = var_scopes.at(in_handle->scope_idx_)
+                      ->FindVar(in_handle->name_)
+                      ->Get<T>();
+    in_selected_rows.emplace_back(&in_sr);
+  }
+  return in_selected_rows;
+}
+
+void ReduceOpHandle::WaitInputVarGenerated(
+    const std::vector<VarHandle *> &in_var_handles) {
+  for (auto *in : in_var_handles) {
+    if (in->generated_op_) {
+      for (auto pair : dev_ctxes_) {
+        in->generated_op_->Wait(pair.second);
+      }
+    }
+  }
+}
+
+std::string ReduceOpHandle::Name() const { return "reduce"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..9746b3bdbde14d24a83a27a593c5f1ebfec201ff
--- /dev/null
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -0,0 +1,73 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/device_context.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct ReduceOpHandle : public OpHandleBase {
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+
+#ifdef PADDLE_WITH_CUDA
+  const platform::NCCLContextMap *nccl_ctxs_;
+  ReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                 const std::vector<platform::Place> &places,
+                 const platform::NCCLContextMap *nccl_ctxs)
+      : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) {
+    if (nccl_ctxs_) {
+      for (auto &p_ctx : nccl_ctxs_->contexts_) {
+        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
+      }
+    }
+  }
+#else
+  ReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                 const std::vector<platform::Place> &places)
+      : local_scopes_(local_scopes), places_(places) {}
+#endif
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override { return false; };
+
+ protected:
+  void RunImpl() override;
+
+  void WaitInputVarGenerated(const std::vector<VarHandle *> &in_var_handles);
+
+  template <typename T>
+  std::vector<const T *> GetInputValues(
+      const std::vector<VarHandle *> &in_var_handles,
+      const std::vector<const Scope *> &var_scopes) const;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c17aabee53680fba10eac289cf8f8bd5f7d419e8
--- /dev/null
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
@@ -0,0 +1,286 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/reduce_op_handle.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+// test data amount
+const f::DDim kDims = {20, 20};
+
+struct TestReduceOpHandle {
+  bool use_gpu_;
+  Scope g_scope_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<Scope *> param_scopes_;
+  std::unique_ptr<OpHandleBase> op_handle_;
+  std::vector<std::unique_ptr<VarHandleBase>> vars_;
+  std::vector<p::Place> gpu_list_;
+  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
+
+#ifdef PADDLE_WITH_CUDA
+  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
+#endif
+
+  void WaitAll() {
+    for (size_t j = 0; j < ctxs_.size(); ++j) {
+      ctxs_[j]->Wait();
+    }
+#ifdef PADDLE_WITH_CUDA
+    if (nccl_ctxs_) {
+      nccl_ctxs_->WaitAll();
+    }
+#endif
+  }
+
+  void InitCtxOnGpu(bool use_gpu) {
+    use_gpu_ = use_gpu;
+    if (use_gpu) {
+#ifdef PADDLE_WITH_CUDA
+      int count = p::GetCUDADeviceCount();
+      if (count <= 1) {
+        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
+                        "device count is "
+                     << count;
+        exit(0);
+      }
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CUDAPlace(i);
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      }
+      nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
+#else
+      PADDLE_THROW("CUDA is not support.");
+#endif
+    } else {
+      int count = 8;
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CPUPlace();
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CPUDeviceContext(p));
+      }
+#ifdef PADDLE_WITH_CUDA
+      nccl_ctxs_.reset(nullptr);
+#endif
+    }
+  }
+
+  void InitReduceOp(size_t out_scope_idx) {
+    // init scope
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      local_scopes_.push_back(&(g_scope_.NewScope()));
+      Scope &local_scope = local_scopes_.back()->NewScope();
+      *local_scopes_.back()
+           ->Var(details::kLocalExecScopeName)
+           ->GetMutable<Scope *>() = &local_scope;
+      local_scope.Var("input");
+      param_scopes_.emplace_back(&local_scope);
+    }
+    param_scopes_[out_scope_idx]->Var("out");
+
+    if (use_gpu_) {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(
+          new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+#else
+      PADDLE_THROW("CUDA is not support.");
+#endif
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(
+          new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+#else
+      op_handle_.reset(new ReduceOpHandle(local_scopes_, gpu_list_));
+#endif
+    }
+
+    // init op handle
+    // add input
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      if (!use_gpu_) {
+        op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
+      }
+      auto *in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
+      in_var_handle->generated_op_ = nullptr;
+      vars_.emplace_back(in_var_handle);
+      op_handle_->AddInput(in_var_handle);
+    }
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle *in_dummy_var_handle =
+        static_cast<DummyVarHandle *>(vars_.back().get());
+    in_dummy_var_handle->generated_op_ = nullptr;
+    op_handle_->AddInput(in_dummy_var_handle);
+
+    // add output
+    auto *out_var_handle =
+        new VarHandle(2, out_scope_idx, "out", gpu_list_[out_scope_idx]);
+    vars_.emplace_back(out_var_handle);
+    op_handle_->AddOutput(out_var_handle);
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle *dummy_var_handle =
+        static_cast<DummyVarHandle *>(vars_.back().get());
+    op_handle_->AddOutput(dummy_var_handle);
+  }
+
+  void TestReduceSelectedRows(size_t output_scope_idx) {
+    int height = kDims[0] * 2;
+    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
+                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
+    std::vector<float> send_vector(f::product(kDims));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
+    }
+
+    for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
+         ++input_scope_idx) {
+      auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
+      PADDLE_ENFORCE_NOT_NULL(in_var);
+      auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+      auto value = in_selected_rows->mutable_value();
+      value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+
+      in_selected_rows->set_height(height);
+      in_selected_rows->set_rows(rows);
+
+      paddle::framework::TensorFromVector<float>(
+          send_vector, *(ctxs_[input_scope_idx]), value);
+      value->Resize(kDims);
+    }
+
+    auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
+    PADDLE_ENFORCE_NOT_NULL(out_var);
+    auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
+
+    auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
+    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+
+    out_selected_rows->mutable_value()->ShareDataWith(
+        in_selected_rows->value());
+
+    op_handle_->Run(false);
+
+    WaitAll();
+
+    p::CPUPlace cpu_place;
+
+    auto &out_select_rows = out_var->Get<f::SelectedRows>();
+    auto rt = out_select_rows.value();
+
+    PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
+    for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
+      PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
+    }
+
+    f::Tensor result_tensor;
+    f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
+    float *ct = result_tensor.data<float>();
+
+    for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
+      ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
+    }
+  }
+
+  void TestReduceLodTensors(size_t output_scope_idx) {
+    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
+    }
+    f::LoD lod{{0, 10, 20}};
+
+    for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
+         ++input_scope_idx) {
+      auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
+      PADDLE_ENFORCE_NOT_NULL(in_var);
+      auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
+      in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+      in_lod_tensor->set_lod(lod);
+
+      paddle::framework::TensorFromVector<float>(
+          send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
+    }
+
+    auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
+    PADDLE_ENFORCE_NOT_NULL(out_var);
+    auto out_lodtensor = out_var->GetMutable<f::LoDTensor>();
+
+    auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
+    auto in_lodtensor = in_var->Get<f::LoDTensor>();
+
+    out_lodtensor->ShareDataWith(in_lodtensor);
+
+    op_handle_->Run(false);
+
+    WaitAll();
+
+    p::CPUPlace cpu_place;
+
+    auto &rt = out_var->Get<f::LoDTensor>();
+
+    f::Tensor result_tensor;
+    f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
+    float *ct = result_tensor.data<float>();
+
+    for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
+      ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5);
+    }
+  }
+};
+
+TEST(ReduceTester, TestCPUReduceTestSelectedRows) {
+  TestReduceOpHandle test_op;
+  size_t out_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitReduceOp(out_scope_idx);
+  test_op.TestReduceSelectedRows(out_scope_idx);
+}
+TEST(ReduceTester, TestCPUReduceTestLodTensor) {
+  TestReduceOpHandle test_op;
+  size_t out_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitReduceOp(out_scope_idx);
+  test_op.TestReduceLodTensors(out_scope_idx);
+}
+#ifdef PADDLE_WITH_CUDA
+
+TEST(ReduceTester, TestGPUReduceTestSelectedRows) {
+  TestReduceOpHandle test_op;
+  size_t out_scope_idx = 0;
+  test_op.InitCtxOnGpu(true);
+  test_op.InitReduceOp(out_scope_idx);
+  test_op.TestReduceSelectedRows(out_scope_idx);
+}
+
+TEST(ReduceTester, TestGPUReduceTestLodTensor) {
+  TestReduceOpHandle test_op;
+  size_t out_scope_idx = 0;
+  test_op.InitCtxOnGpu(true);
+  test_op.InitReduceOp(out_scope_idx);
+  test_op.TestReduceLodTensors(out_scope_idx);
+}
+#endif
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index 7fb9f99a8a1bc044e2f25f373265a5ec9f7d76d5..7a65ee62c9bfc0dad2ebee3be21de825fa405d73 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -30,10 +30,11 @@ ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
 
 void ScaleLossGradOpHandle::RunImpl() {
   std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
+  auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
 
-  float *tmp =
-      scope_->FindVar(var_name)->GetMutable<LoDTensor>()->mutable_data<float>(
-          make_ddim({1}), place_);
+  float *tmp = local_scope.FindVar(var_name)
+                   ->GetMutable<LoDTensor>()
+                   ->mutable_data<float>(make_ddim({1}), place_);
 
   if (platform::is_cpu_place(place_)) {
     *tmp = coeff_;
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
index ab7353a4fc56bebfe04696efd838dc4559218058..d93d599d46f130cf98f39f15697ce994a31e20c3 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <string>
+
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
@@ -23,10 +25,6 @@ namespace framework {
 namespace details {
 
 struct ScaleLossGradOpHandle : public OpHandleBase {
-  float coeff_;
-  Scope *scope_;
-  platform::Place place_;
-
   ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place,
                         platform::DeviceContext *context);
 
@@ -36,6 +34,11 @@ struct ScaleLossGradOpHandle : public OpHandleBase {
 
  protected:
   void RunImpl() override;
+
+ private:
+  float coeff_;
+  Scope *scope_;
+  platform::Place place_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/send_op_handle.h b/paddle/fluid/framework/details/send_op_handle.h
index 173f9d726145aeb9e85cc0fb9056eb57bf484098..2f78811fad50642b5e45776c41910df6f4cc48f6 100644
--- a/paddle/fluid/framework/details/send_op_handle.h
+++ b/paddle/fluid/framework/details/send_op_handle.h
@@ -28,10 +28,6 @@ namespace framework {
 namespace details {
 
 struct SendOpHandle : public OpHandleBase {
-  std::unique_ptr<OperatorBase> op_;
-  const Scope* local_scope_;
-  const platform::Place& place_;
-
   SendOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
                const platform::Place& place);
 
@@ -43,6 +39,11 @@ struct SendOpHandle : public OpHandleBase {
 
  protected:
   void RunImpl() override;
+
+ private:
+  std::unique_ptr<OperatorBase> op_;
+  const Scope* local_scope_;
+  const platform::Place& place_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
index be5fb7577581fd99b1b7b80ccdd2acb8d3a91f01..6a567527550883add08031e50aa8de2b204cf13d 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -54,13 +54,8 @@ VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
   auto &var_holder = var_holders[each_var_name];
   VarHandle *var = nullptr;
   if (var_holder.empty()) {
-    var_holder.emplace_back(new VarHandle);
-    auto &init_var = var_holder[0];
-    init_var->place_ = place;
-    init_var->name_ = each_var_name;
-    init_var->generated_op_ = nullptr;
-    init_var->version_ = 0;
-    var = init_var.get();
+    var = new VarHandle(0, place_offset, each_var_name, place);
+    var_holder.emplace_back(var);
   } else {
     var = var_holder.rbegin()->get();
   }
@@ -73,12 +68,9 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
                                      size_t place_offset) {
   auto &vars = graph->vars_[place_offset][each_var_name];
   size_t version = vars.size();
-  vars.emplace_back(new VarHandle());
-  auto &var = vars.back();
-  var->version_ = version;
-  var->name_ = each_var_name;
-  var->place_ = place;
-  op_handle->AddOutput(var.get());
+  auto var = new VarHandle(version, place_offset, each_var_name, place);
+  vars.emplace_back(var);
+  op_handle->AddOutput(var);
 }
 
 template <typename Callback>
@@ -125,12 +117,12 @@ void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) {
     std::string op_name = "op_" + std::to_string(op_id++);
     sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
          << std::endl;
-    for (auto in : op->inputs_) {
+    for (auto in : op->Inputs()) {
       std::string var_name = "var_" + std::to_string(vars[in]);
       sout << var_name << " -> " << op_name << std::endl;
     }
 
-    for (auto out : op->outputs_) {
+    for (auto out : op->Outputs()) {
       std::string var_name = "var_" + std::to_string(vars[out]);
       sout << op_name << " -> " << var_name << std::endl;
     }
@@ -141,7 +133,7 @@ void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) {
 
 void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
   for (auto &op : graph->ops_) {
-    if (!op->outputs_.empty()) {
+    if (!op->Outputs().empty()) {
       continue;
     }
     auto *dummy_leaf = new DummyVarHandle();
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index a371ee10fe03cda86c316f3503f9cadb8c716ae5..3b7d61607301e685e67b5f4bc97fc837471e5722 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -33,13 +33,6 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
       running_ops_(0),
       allow_op_delay_(allow_op_delay) {}
 
-void ThreadedSSAGraphExecutor::RunDelayedOps(
-    const std::unordered_set<OpHandleBase *> &delayed_ops) {
-  for (auto op : delayed_ops) {
-    op->Run(use_event_);
-  }
-}
-
 FeedFetchList ThreadedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
   std::unordered_map<OpHandleBase *, size_t> pending_ops;
@@ -51,8 +44,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   // together since we currently cannot overlap computation and memcpy streams.
   // Should revisit it if overlapping is available.
   std::unordered_set<OpHandleBase *> delayed_ops;
-  std::unordered_set<OpHandleBase *> blocked_by_delayed_ops;
-  std::unordered_set<VarHandleBase *> delayed_vars;
 
   auto InsertPendingVar = [&pending_vars, &ready_vars](VarHandleBase &var) {
     pending_vars.insert(&var);
@@ -62,7 +53,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   };
 
   auto InsertPendingOp = [&pending_ops](OpHandleBase &op_instance) {
-    pending_ops.insert({&op_instance, op_instance.inputs_.size()});
+    pending_ops.insert({&op_instance, op_instance.Inputs().size()});
   };
 
   // Transform SSAGraph to pending_ops & pending_vars
@@ -78,7 +69,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   }
 
   for (auto &op : graph_->ops_) {
-    if (op->inputs_.empty()) {  // Special case, Op has no input.
+    if (op->Inputs().empty()) {  // Special case, Op has no input.
       ready_ops.insert(op.get());
     } else {
       InsertPendingOp(*op);
@@ -108,7 +99,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     fetch_ops.emplace_back(op);
 
     for (auto &p : places_) {
-      op->dev_ctxes_[p] = fetch_ctxs_.Get(p);
+      op->SetDeviceContext(p, fetch_ctxs_.Get(p));
     }
 
     for (auto *var : vars) {
@@ -122,24 +113,26 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     InsertPendingOp(*op);
   }
 
-  auto run_all_ready_ops = [&] {
-    for (auto *op : ready_ops) {
-      if (op->IsMultiDeviceTransfer() && allow_op_delay_) {
-        delayed_ops.insert(op);
-        delayed_vars.insert(op->outputs_.begin(), op->outputs_.end());
-        ready_vars.Extend(op->outputs_);
-        continue;
-      }
+  auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
+    for (auto *op : set) {
       running_ops_++;
       RunOp(&ready_vars, op);
     }
-    ready_ops.clear();
+    set.clear();
   };
 
   // Step 3. Execution
-  while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) {
+  while (!pending_vars.empty()) {
     // 1. Run All Ready ops
-    run_all_ready_ops();
+    // Keep loop until all vars are ready.
+    //
+    // NOTE: DelayedOps have a lower priority. It will be scheduled after all
+    // ready_ops have been performed.
+    if (ready_ops.empty() && allow_op_delay_ && running_ops_ == 0) {
+      run_all_ops(delayed_ops);
+    } else {
+      run_all_ops(ready_ops);
+    }
 
     // 2. Find ready variable
     bool timeout;
@@ -160,29 +153,16 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
         auto &deps = pending_ops[op];
         --deps;
         if (deps == 0) {
-          if (delayed_vars.find(ready_var) != delayed_vars.end()) {
-            blocked_by_delayed_ops.insert(op);
+          if (op->IsMultiDeviceTransfer() && allow_op_delay_) {
+            delayed_ops.insert(op);
           } else {
             ready_ops.insert(op);
           }
         }
       }
     }
-    // When there are no other ops to schedule, schedule buffered delayed
-    // ops and unblock other ops.
-    if (ready_ops.empty() && !delayed_ops.empty() && running_ops_ == 0) {
-      RunDelayedOps(delayed_ops);
-      delayed_ops.clear();
-      for (auto *op : blocked_by_delayed_ops) {
-        ready_ops.insert(op);
-      }
-      blocked_by_delayed_ops.clear();
-    }
-    // Keep loop until all vars are ready.
   }
   PADDLE_ENFORCE(ready_ops.empty());
-  PADDLE_ENFORCE(delayed_ops.empty());
-  PADDLE_ENFORCE(blocked_by_delayed_ops.empty());
 
   // Wait FetchOps.
   if (!fetch_ops.empty()) {
@@ -200,7 +180,7 @@ void ThreadedSSAGraphExecutor::RunOp(
       op->Run(use_event_);
       VLOG(10) << op << " " << op->Name() << " Done ";
       running_ops_--;
-      ready_var_q->Extend(op->outputs_);
+      ready_var_q->Extend(op->Outputs());
       VLOG(10) << op << " " << op->Name() << "Signal posted";
     } catch (platform::EnforceNotMet ex) {
       exception_.reset(new platform::EnforceNotMet(ex));
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index bb5e837b135c35b5aea403496b45aab1ccc288ff..d70bbd4ef0eb02d1b473bf88e526996819aec5f9 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -88,8 +88,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   void RunOp(BlockingQueue<VarHandleBase *> *ready_var_q,
              details::OpHandleBase *op);
 
-  void RunDelayedOps(const std::unordered_set<OpHandleBase *> &delayed_ops);
-
  private:
   std::unique_ptr<::ThreadPool> pool_;
   std::vector<Scope *> local_scopes_;
diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
index 871e41343f53b801a22d3a450f0906f37fb372d1..9f7fd69e64fe9d7ef0bf3037bea7f686cb2eee0b 100644
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -16,6 +16,7 @@
 #include <sstream>
 #include <string>
 #include <unordered_set>
+#include <utility>
 
 #include "paddle/fluid/platform/place.h"
 
@@ -33,10 +34,10 @@ struct VarHandleBase {
 
   // The operator who generate this variable. nullptr if the variable
   // is a root node.
-  OpHandleBase *generated_op_;
+  OpHandleBase* generated_op_{nullptr};
 
   // Operators which depend on this variable ready.
-  std::unordered_set<OpHandleBase *> pending_ops_;
+  std::unordered_set<OpHandleBase*> pending_ops_;
 };
 
 // VarHandle is actually a single version of Runtime Variable.
@@ -47,12 +48,24 @@ struct VarHandleBase {
 struct VarHandle : public VarHandleBase {
   std::string DebugString() const override;
 
+  VarHandle(size_t version, size_t scope_index, std::string name,
+            platform::Place place)
+      : version_(version),
+        scope_idx_(scope_index),
+        name_(std::move(name)),
+        place_(std::move(place)) {}
+
   // version field currently is not used, however, just store the version to
   // debug easily.
   size_t version_;
   size_t scope_idx_;
   std::string name_;
   platform::Place place_;
+
+  bool operator==(const VarHandle& o) const {
+    return o.generated_op_ == generated_op_ && o.name_ == name_ &&
+           o.scope_idx_ == scope_idx_;
+  }
 };
 
 // Dummy Variable. It is used to represent dependencies between operators
diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..10bac0fae9504215fab11dd8cca7c278feaa4bda
--- /dev/null
+++ b/paddle/fluid/framework/details/variable_visitor.cc
@@ -0,0 +1,93 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/framework/selected_rows.h"
+namespace paddle {
+namespace framework {
+namespace details {
+template <typename Func>
+static void VisitVariable(Variable* var, Func* func) {
+  if (var->IsType<LoDTensor>()) {
+    (*func)(var->GetMutable<LoDTensor>());
+  } else if (var->IsType<SelectedRows>()) {
+    (*func)(var->GetMutable<SelectedRows>());
+  } else {
+    PADDLE_THROW("Not supported type %s", var->Type().name());
+  }
+}
+
+template <typename Func>
+static void VisitVariable(const Variable& var, Func* func) {
+  if (var.IsType<LoDTensor>()) {
+    (*func)(var.Get<LoDTensor>());
+  } else if (var.IsType<SelectedRows>()) {
+    (*func)(var.Get<SelectedRows>());
+  } else {
+    PADDLE_THROW("Not supported type %s", var.Type().name());
+  }
+}
+
+struct TensorVisitor {
+  Tensor* result_{nullptr};
+
+  void operator()(LoDTensor* tensor) { result_ = tensor; }
+
+  void operator()(SelectedRows* selected_rows) {
+    result_ = selected_rows->mutable_value();
+  }
+
+  template <typename T>
+  void operator()() {
+    PADDLE_THROW("Not Support to get LoDTensor from %s", typeid(T).name());
+  }
+};
+
+Tensor& VariableVisitor::GetMutableTensor(Variable* var) {
+  TensorVisitor vistor;
+  VisitVariable(var, &vistor);
+  return *vistor.result_;
+}
+
+struct ShareDimsAndLoDVisitor {
+  Variable* trg_;
+  void operator()(const LoDTensor& val) {
+    auto* tensor = trg_->GetMutable<LoDTensor>();
+    tensor->set_layout(val.layout());
+    tensor->set_lod(val.lod());
+    tensor->Resize(val.dims());
+  }
+
+  void operator()(const SelectedRows& val) {
+    auto* selected_rows = trg_->GetMutable<SelectedRows>();
+    selected_rows->set_rows(val.rows());
+    selected_rows->set_height(val.height());
+    selected_rows->mutable_value()->Resize(val.value().dims());
+  }
+
+  template <typename T>
+  void operator()(const T&) {
+    PADDLE_ENFORCE("ShareDimsAndLoD is not supported by type %s",
+                   typeid(T).name());
+  }
+};
+
+void VariableVisitor::ShareDimsAndLoD(const Variable& src, Variable* trg) {
+  ShareDimsAndLoDVisitor visitor{trg};
+  VisitVariable(src, &visitor);
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/variable_visitor.h b/paddle/fluid/framework/details/variable_visitor.h
new file mode 100644
index 0000000000000000000000000000000000000000..67baa1895e4513738fa73d49c46660da92279b9d
--- /dev/null
+++ b/paddle/fluid/framework/details/variable_visitor.h
@@ -0,0 +1,33 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/variable.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class VariableVisitor {
+ public:
+  static Tensor &GetMutableTensor(Variable *var);
+
+  static void ShareDimsAndLoD(const Variable &src, Variable *trg);
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
index a8c3e227db3f8f3781d0acd5e233d7bea1123df1..8e1f93c5ebd448903d70f9668539e077875836e4 100644
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/feed_fetch_method.h"
+#include <string>
+#include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/variable.h"
 
diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h
index d6130f421e122047c2f4ed315e6f2fb7484cda1a..7f504bfd232862c014cb59b6e8301eec74e0351f 100644
--- a/paddle/fluid/framework/feed_fetch_method.h
+++ b/paddle/fluid/framework/feed_fetch_method.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/scope.h"
 
diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h
index cf697187d6225f3a1d2506120eebe14d4a41dff9..b4d3fa25c35fbf25b3d2fdd9fa1045dda0f773ec 100644
--- a/paddle/fluid/framework/grad_op_desc_maker.h
+++ b/paddle/fluid/framework/grad_op_desc_maker.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <algorithm>
 #include <string>
 #include <unordered_set>
 #include <vector>
@@ -69,8 +70,7 @@ class GradOpDescMakerBase {
                       " for input argument with a list of variables, "
                       " drop_empty_grad is not allowed because it makes"
                       " the correspondence bewteen a variable and its gradient"
-                      " ambiguous. Use REGISTER_OP_EX to register the op"
-                      " or call InputGrad(?,false) in GradOpDescMaker."
+                      " ambiguous."
                       " Op type %s",
                       fwd_op_.Type());
 
diff --git a/paddle/fluid/framework/lod_rank_table.h b/paddle/fluid/framework/lod_rank_table.h
index ef83e71160e0e52071b033ea8b86e6da91bbfad2..8c6e8b0c66ead96f0e53b56ee951887730b0d77f 100644
--- a/paddle/fluid/framework/lod_rank_table.h
+++ b/paddle/fluid/framework/lod_rank_table.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <iosfwd>
+#include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index d99a15547b77a0e0d71b14bd1c798cd1485720b0..29b3396bc9854cd3d3ac8d4283f48019c9a9c55f 100644
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <algorithm>
 #include <initializer_list>
 #include <vector>
 
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index eabfdc11a8b314c4af9626ded3edd1bcba212de1..46c834b38b758a2e050d990a464600154cbe51e5 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_desc.h"
+#include <algorithm>
 #include <functional>
-#include <mutex>
+#include <mutex>  // NOLINT
+#include <string>
 #include <unordered_map>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index 614dd8cd00eb866cb8cbc41c3e03c25f968a7d2b..cd6777e60a8e354ac634ba1c1fe5db63539f6e93 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -119,7 +119,7 @@ class OpDesc {
 
   void InferVarType(BlockDesc *block) const;
 
-  void MarkAsTarget() { desc_.set_is_target(true); }
+  void SetIsTarget(bool is_target) { desc_.set_is_target(is_target); }
 
   void Flush();
 
diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h
index 3a1036742c206961fe52660106ae947153e9b244..fab20d75f5a45257f243333c1998d7b2549a25f9 100644
--- a/paddle/fluid/framework/op_kernel_type.h
+++ b/paddle/fluid/framework/op_kernel_type.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/library_type.h"
diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
index 3116b03d0433ddf98613796b272238e5fe72ce6a..c479d7617cfa34cd381d84d15d5e214d57af52d0 100644
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -12,6 +12,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_proto_maker.h"
+#include <string>
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index cf56b0fa1894374956b3011c88bc70acdba1e464..0beb57ce1609d2e90c05d3255647bd321bc1f6a9 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -13,6 +13,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/framework.pb.h"
 
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index f1424f13b445155fe4f28732408a2445ab1aa9b7..748317438b44bc4af84f13b25f8e4f88386388fb 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -16,6 +16,8 @@ limitations under the License. */
 
 #include <algorithm>
 #include <atomic>
+#include <string>
+#include <tuple>
 #include <type_traits>
 #include <typeinfo>
 #include <unordered_map>
@@ -141,36 +143,6 @@ class OpKernelRegistrar : public Registrar {
     return 0;                                                          \
   }
 
-/**
- * Macro to register Operator. When the input is duplicable, you should
- * use REGISTER_OP_EX with drop_empty_grad=false instead.
- */
-#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
-                    grad_op_class)                                   \
-  REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type,    \
-                 grad_op_class, true)
-
-// When an argument is duplicable, we need to use this version.
-// Perhaps we can omit DropEmptyIG template parameter and
-// only have one version of REGISTER_OP.
-#define REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type,       \
-                       grad_op_class, drop_empty_grad)                        \
-  REGISTER_OPERATOR(grad_op_type, grad_op_class);                             \
-  class _GradOpDescMaker_##grad_op_type##_                                    \
-      : public ::paddle::framework::DefaultGradOpDescMaker<drop_empty_grad> { \
-    using ::paddle::framework::DefaultGradOpDescMaker<                        \
-        drop_empty_grad>::DefaultGradOpDescMaker;                             \
-                                                                              \
-   protected:                                                                 \
-    virtual std::string GradOpType() const { return #grad_op_type; }          \
-  };                                                                          \
-  REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_,    \
-                    op_maker_class);
-
-#define REGISTER_OP_WITH_KERNEL(op_type, ...)                         \
-  REGISTER_OPERATOR(op_type, ::paddle::framework::OperatorWithKernel, \
-                    ##__VA_ARGS__)
-
 #define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
   REGISTER_OPERATOR(op_type, op_class, op_maker_class)
 
diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc
index 0d791c8583537d410b838c1662755938353052a9..6dc4cf261bad3c004aa53fba5502fe166e3a47f7 100644
--- a/paddle/fluid/framework/op_registry_test.cc
+++ b/paddle/fluid/framework/op_registry_test.cc
@@ -202,8 +202,9 @@ class CosineOpComplete : public paddle::framework::CosineOp {
 };
 
 TEST(OperatorRegistrar, Test) {
-  using namespace paddle::framework;
-  OperatorRegistrar<CosineOpComplete, CosineOpProtoAndCheckerMaker> reg("cos");
+  paddle::framework::OperatorRegistrar<
+      CosineOpComplete, paddle::framework::CosineOpProtoAndCheckerMaker>
+      reg("cos");
 }
 
 namespace paddle {
diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc
index 25f622b725277ac9bcca4622902162f3edf147e8..1bf8c81469bb4afdd00921cfa0acf6089dedbbaa 100644
--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
@@ -226,10 +226,8 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
 
 // test with multi inputs
 TEST(OpKernel, multi_inputs) {
-  using namespace paddle::framework;
-
   paddle::framework::InitDevices(true);
-  proto::OpDesc op_desc;
+  paddle::framework::proto::OpDesc op_desc;
 
   op_desc.set_type("op_multi_inputs_with_kernel");
   BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs());
@@ -243,12 +241,12 @@ TEST(OpKernel, multi_inputs) {
 
   paddle::platform::CPUPlace cpu_place;
   paddle::framework::Scope scope;
-  scope.Var("x0")->GetMutable<LoDTensor>();
-  scope.Var("x1")->GetMutable<LoDTensor>();
-  scope.Var("x2")->GetMutable<LoDTensor>();
-  scope.Var("k0")->GetMutable<LoDTensor>();
-  scope.Var("y0")->GetMutable<LoDTensor>();
-  scope.Var("y1")->GetMutable<LoDTensor>();
+  scope.Var("x0")->GetMutable<paddle::framework::LoDTensor>();
+  scope.Var("x1")->GetMutable<paddle::framework::LoDTensor>();
+  scope.Var("x2")->GetMutable<paddle::framework::LoDTensor>();
+  scope.Var("k0")->GetMutable<paddle::framework::LoDTensor>();
+  scope.Var("y0")->GetMutable<paddle::framework::LoDTensor>();
+  scope.Var("y1")->GetMutable<paddle::framework::LoDTensor>();
 
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
   op->Run(scope, cpu_place);
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index c1486b527d2e06d2b3f7e0f89458bf9a22564586..67e02e2f119707bba376056510a8ca1034590b55 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -44,6 +44,7 @@ class ParallelExecutorPrivate {
 #endif
 
   std::vector<std::tuple<std::string, proto::VarType::Type, bool>> var_types_;
+  bool own_local_scope;
 };
 
 std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
@@ -63,13 +64,16 @@ ParallelExecutor::ParallelExecutor(
   // Step 1. Bcast the params to devs.
   // Create local scopes
   if (local_scopes.empty()) {
-    for (size_t i = 0; i < member_->places_.size(); ++i) {
-      member_->local_scopes_.push_back(&scope->NewScope());
+    member_->own_local_scope = true;
+    member_->local_scopes_.emplace_back(member_->global_scope_);
+    for (size_t i = 1; i < member_->places_.size(); ++i) {
+      member_->local_scopes_.emplace_back(&scope->NewScope());
     }
   } else {
+    member_->own_local_scope = false;
     PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
     for (size_t i = 0; i < member_->places_.size(); ++i) {
-      member_->local_scopes_.push_back(local_scopes[i]);
+      member_->local_scopes_.emplace_back(local_scopes[i]);
     }
   }
 
@@ -155,15 +159,13 @@ void ParallelExecutor::BCastParamsToGPUs(
 #endif
 }
 
-void ParallelExecutor::Run(
-    const std::vector<std::string> &fetch_tensors,
-    const std::string &fetched_var_name,
-    const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
+void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
+                           const std::string &fetched_var_name) {
   platform::RecordBlock b(0);
-  SplitTensorToPlaces(feed_tensors);
-
   // Create local scopes.
-  for (auto &scope : member_->local_scopes_) {
+  for (auto it = member_->local_scopes_.rbegin();
+       it != member_->local_scopes_.rend(); ++it) {
+    auto &scope = *it;
     Scope &local_scope = scope->NewScope();
     *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
         &local_scope;
@@ -177,7 +179,7 @@ void ParallelExecutor::Run(
         InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
                            std::get<1>(name_type_pair));
       } else {
-        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
+        InitializeVariable(local_scope.Var(std::get<0>(name_type_pair)),
                            std::get<1>(name_type_pair));
       }
     }
@@ -195,14 +197,28 @@ void ParallelExecutor::Run(
     auto &local_scope =
         *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
     scope->DeleteScope(local_scope);
-    local_scope = nullptr;
   }
 }
 
-void ParallelExecutor::SplitTensorToPlaces(
-    const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
-  for (auto it : feed_tensors) {
-    auto lod_tensors = it.second.SplitLoDTensor(member_->places_);
+void ParallelExecutor::FeedTensorsIntoLocalScopes(
+    const std::vector<std::unordered_map<std::string, LoDTensor>> &tensors) {
+  PADDLE_ENFORCE_EQ(member_->local_scopes_.size(), tensors.size());
+
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    auto &map = tensors[i];
+    auto *scope = member_->local_scopes_[i];
+    for (auto &pair : map) {
+      auto *trg = scope->Var(pair.first)->GetMutable<LoDTensor>();
+      trg->ShareDataWith(pair.second);
+      trg->set_lod(pair.second.lod());
+    }
+  }
+}
+
+void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
+    const std::unordered_map<std::string, LoDTensor> &tensors) {
+  for (auto pair : tensors) {
+    auto lod_tensors = pair.second.SplitLoDTensor(member_->places_);
     PADDLE_ENFORCE_EQ(
         member_->places_.size(), lod_tensors.size(),
         "The number of samples of current batch is less than the count of "
@@ -211,12 +227,20 @@ void ParallelExecutor::SplitTensorToPlaces(
     for (size_t j = 0; j < member_->places_.size(); ++j) {
       // TODO(panxy0718): Do I need to delete this var?
       auto t =
-          member_->local_scopes_[j]->Var(it.first)->GetMutable<LoDTensor>();
+          member_->local_scopes_[j]->Var(pair.first)->GetMutable<LoDTensor>();
       t->ShareDataWith(lod_tensors[j]);
       t->set_lod(lod_tensors[j].lod());
     }
   }
 }
 
+ParallelExecutor::~ParallelExecutor() {
+  if (member_->own_local_scope) {
+    for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
+      member_->global_scope_->DeleteScope(member_->local_scopes_[i]);
+    }
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index b4f16dba858fb279ec23a8a04257dda6651148cc..f4f283bb4b5eafc33619c98b5f30e1e8f453ece3 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -42,18 +42,26 @@ class ParallelExecutor {
                             const std::vector<Scope*>& local_scopes,
                             bool allow_op_delay);
 
+  ~ParallelExecutor();
+
   std::vector<Scope*>& GetLocalScopes();
 
+  /**
+   * Feed tensors to local scopes. The size of tensors should be equal to the
+   * size of local scopes.
+   */
+  void FeedTensorsIntoLocalScopes(
+      const std::vector<std::unordered_map<std::string, LoDTensor>>& tensors);
+
+  void FeedAndSplitTensorIntoLocalScopes(
+      const std::unordered_map<std::string, LoDTensor>& tensors);
+
   void Run(const std::vector<std::string>& fetch_tensors,
-           const std::string& fetched_var_name,
-           const std::unordered_map<std::string, LoDTensor>& feed_tensors);
+           const std::string& fetched_var_name);
 
   void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
 
  private:
-  void SplitTensorToPlaces(
-      const std::unordered_map<std::string, LoDTensor>& feed_tensors);
-
   ParallelExecutorPrivate* member_;
 };
 
diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc
index 77d17fbbccca0292e21acd5e8fa90448527b95c0..16694bcf76486a9603c41dc19a58dd0a7cb2b719 100644
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -27,10 +27,14 @@ BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) {
   return blocks_.back().get();
 }
 
-proto::ProgramDesc *ProgramDesc::Proto() {
+void ProgramDesc::Flush() {
   for (auto &block : blocks_) {
     block->Flush();
   }
+}
+
+proto::ProgramDesc *ProgramDesc::Proto() {
+  Flush();
   return &desc_;
 }
 
diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h
index 4288081be72c44c0fc3584b50c41a270eac9e204..65fa0a0cfd5ba6d9b8765cee1309e118cb74348a 100644
--- a/paddle/fluid/framework/program_desc.h
+++ b/paddle/fluid/framework/program_desc.h
@@ -51,6 +51,8 @@ class ProgramDesc {
 
   size_t Size() const { return blocks_.size(); }
 
+  void Flush();
+
   proto::ProgramDesc *Proto();
 
   // The output variable of feed_op is referenced as feed_target.
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 194df3e4a8b50700e2be01ce5ebca83b92501fb8..9091713158c8071d5386f14250e3c546284e7fd0 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -26,6 +26,11 @@ DEFINE_bool(benchmark, false,
             "Default cuda is asynchronous device, set to True will"
             "force op run in synchronous mode.");
 
+DEFINE_bool(
+    eager_delete_scope, true,
+    "Delete local scope eagerly. It will reduce GPU memory usage but "
+    "slow down the destruction of variables.(around 1% performance harm)");
+
 namespace paddle {
 namespace framework {
 
@@ -91,13 +96,13 @@ std::vector<std::string> Scope::LocalVarNames() const {
   return known_vars;
 }
 
-void Scope::DeleteScope(Scope* scope) {
+void Scope::DeleteScope(Scope* scope) const {
   std::unique_lock<std::mutex> lock(mutex_);
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
   this->kids_.erase(it);
   // When making memory benchmark on Fluid, we have to delete scope sync.
-  if (FLAGS_benchmark) {
+  if (FLAGS_benchmark || FLAGS_eager_delete_scope) {
     delete scope;
   } else {
     Async([scope] { delete scope; });
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index c8cb70549f1d131b66fa7c6eeb35f3b7151a9e7f..abc82e452d732638a2f7315022074850f299a7ea 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -63,7 +63,7 @@ class Scope {
   /// Find the scope or an ancestor scope that contains the given variable.
   const Scope* FindScope(const Variable* var) const;
 
-  void DeleteScope(Scope* scope);
+  void DeleteScope(Scope* scope) const;
 
   /// Drop all kids scopes belonged to this scope.
   void DropKids();
diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc
index d9d6b7dd67f1c6e4bbd6a4e1a8f0843d4cb93c05..794e7f743413b068119afd5df232bfc2bb91a8c7 100644
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
@@ -17,6 +17,52 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+struct ReAllocateVisitor {
+  ReAllocateVisitor(framework::Tensor* tensor, const framework::DDim& dims)
+      : tensor_(tensor), dims_(dims) {}
+
+  template <typename T>
+  void operator()() const {
+    framework::Tensor cpu_tensor;
+    platform::CPUPlace cpu;
+    T* ptr = cpu_tensor.mutable_data<T>(dims_, cpu);
+    const T* old_ptr =
+        tensor_->memory_size() == 0 ? nullptr : tensor_->data<T>();
+    if (old_ptr != nullptr) {
+      std::copy(old_ptr, old_ptr + tensor_->numel(), ptr);
+    }
+    tensor_->ShareDataWith(cpu_tensor);
+  }
+
+  framework::Tensor* tensor_;
+  framework::DDim dims_;
+};
+
+struct TensorCopyVisitor {
+  TensorCopyVisitor(framework::Tensor* dst, int64_t dst_offset,
+                    const framework::Tensor src, int64_t src_offset,
+                    int64_t size)
+      : dst_(dst),
+        dst_offset_(dst_offset),
+        src_(src),
+        src_offset_(src_offset),
+        size_(size) {}
+
+  template <typename T>
+  void operator()() const {
+    // TODO(Yancey1989): support other place
+    platform::CPUPlace cpu;
+    memory::Copy(cpu, dst_->mutable_data<T>(cpu) + dst_offset_, cpu,
+                 src_.data<T>() + src_offset_, size_ * sizeof(T));
+  }
+
+  framework::Tensor* dst_;
+  int64_t dst_offset_;
+  framework::Tensor src_;
+  int64_t src_offset_;
+  int64_t size_;
+};
+
 void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
                        const platform::DeviceContext& dev_ctx) {
   {  // the 1st field, uint32_t version
@@ -69,5 +115,66 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows,
   TensorFromStream(is, selected_rows->mutable_value(), dev_ctx);
 }
 
+bool SelectedRows::HasKey(int64_t key) const {
+  return std::find(rows_.begin(), rows_.end(), key) == rows_.end() ? false
+                                                                   : true;
+}
+
+std::vector<int64_t> SelectedRows::Get(std::vector<int64_t> keys,
+                                       framework::Tensor* value) const {
+  PADDLE_ENFORCE(value->IsInitialized(),
+                 "The value tensor should be initialized.");
+  std::vector<int64_t> non_keys;
+  int64_t value_width = value_->numel() / value_->dims()[0];
+  PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
+                    "output tensor should have the same shape with table "
+                    "execpt the dims[0].");
+
+  for (size_t i = 0; i < keys.size(); ++i) {
+    int64_t index = Index(keys[i]);
+    if (index == -1) {
+      non_keys.push_back(keys[i]);
+    } else {
+      framework::VisitDataType(
+          framework::ToDataType(value_->type()),
+          TensorCopyVisitor(value, i * value_width, *value_.get(),
+                            index * value_width, value_width));
+    }
+  }
+  return non_keys;
+}
+
+bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {
+  PADDLE_ENFORCE(value.IsInitialized(), "The value should be initialized.");
+  if (value_->IsInitialized()) {
+    PADDLE_ENFORCE_EQ(
+        value.type(), value_->type(),
+        "The type of the value should be same with the original value");
+  }
+  PADDLE_ENFORCE_EQ(value.dims()[0], static_cast<size_t>(1),
+                    "The first dim of value should be 1.");
+  auto index = Index(key);
+  bool is_new_key = false;
+  if (index == -1) {
+    rows_.push_back(key);
+    index = rows_.size() - 1;
+    is_new_key = true;
+    // whether need to resize the table
+    if (static_cast<int64_t>(rows_.size()) > value_->dims()[0]) {
+      auto dims = value_->dims();
+      dims[0] = (dims[0] + 1) << 1;
+      framework::VisitDataType(framework::ToDataType(value.type()),
+                               ReAllocateVisitor(value_.get(), dims));
+    }
+  }
+
+  framework::VisitDataType(
+      framework::ToDataType(value.type()),
+      TensorCopyVisitor(value_.get(),
+                        index * value_->numel() / value_->dims()[0], value,
+                        static_cast<int64_t>(0), value.numel()));
+  return is_new_key;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h
index 8e2d9470d3954e0f66c74828a8d8292c2875a8f4..d6c9507b1681855e759a4b1b9d3dddf6fcb2fc13 100644
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
@@ -14,15 +14,33 @@ limitations under the License. */
 
 #pragma once
 
+#include <algorithm>
 #include <vector>
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/memory/memcpy.h"
 
 namespace paddle {
 namespace framework {
 
 class SelectedRows {
+  /*
+   * @brief We can use the SelectedRows structure to reproduce a sparse table.
+   *  A sparse table is a key-value structure that the key is an `int64_t`
+   * number,
+   *  and the value is a Tensor which the first dimension is 0.
+   *  You can use the following interface to operate the sparse table, and you
+   * can find
+   *  some detail information from the comments of each interface:
+   *
+   *  HasKey(key), whether the sparse table has the specified key.
+   *  Set(key, value), set a key-value pair into the sparse table.
+   *  Get(keys, value*), get value by given key list and apply it to the given
+   * value pointer
+   *    with the specified offset.
+   *
+   */
  public:
   SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
       : rows_(rows), height_(height) {
@@ -50,12 +68,45 @@ class SelectedRows {
 
   void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
 
-  /**
-   * get the index of id in rows
+  /*
+   * @brief wheter has the specified key in the table.
+   *
+   * @return true if the key is exists.
+   */
+  bool HasKey(int64_t key) const;
+
+  /*
+   * @brief Get value by the key list, if the
+   *
+   * @return a list of keys which does not exists in table
+   */
+  std::vector<int64_t> Get(std::vector<int64_t> keys,
+                           framework::Tensor* tensor) const;
+
+  /*
+   * @brief Set a key-value pair into the table.
+   *  This function will double the value memory if it's not engouth.
+   *
+   * @note:
+   *    1. The first dim of the value should be 1
+   *    2. The value should be initialized and the data type
+   *       should be the same with the table.
+   *
+   * @return true if the key is a new one, otherwise false
+   *
+   */
+  bool Set(int64_t key, const Tensor& value);
+
+  /*
+   * @brief Get the index of key in rows
+   *
+   * @return -1 if the key does not exists.
    */
-  int64_t index(int64_t id) const {
-    auto it = std::find(rows_.begin(), rows_.end(), id);
-    PADDLE_ENFORCE(it != rows_.end(), "id should be in rows");
+  int64_t Index(int64_t key) const {
+    auto it = std::find(rows_.begin(), rows_.end(), key);
+    if (it == rows_.end()) {
+      return static_cast<int64_t>(-1);
+    }
     return static_cast<int64_t>(std::distance(rows_.begin(), it));
   }
 
diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc
index 960d8d64f04a819217413ff881977ce5fb5a30f2..39fe6d92940606084c28eec1a4d6486cb58844ce 100644
--- a/paddle/fluid/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_test.cc
@@ -17,7 +17,7 @@ namespace framework {
 
 class SelectedRowsTester : public ::testing::Test {
  public:
-  virtual void SetUp() override {
+  void SetUp() override {
     std::vector<int64_t> rows{0, 4, 7};
     int64_t height = 10;
     int64_t row_numel = 100;
@@ -59,5 +59,40 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
   ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
 }
 
+TEST_F(SelectedRowsTester, Table) {
+  platform::CPUPlace cpu;
+  SelectedRows table;
+  // initialize a sparse table
+  table.mutable_value()->Resize(framework::make_ddim({1, 100}));
+  table.mutable_value()->mutable_data<float>(cpu);
+  table.mutable_rows()->push_back(1);
+
+  int64_t key = 10000;
+  int64_t non_key = 999;
+  framework::Tensor value;
+  value.Resize(framework::make_ddim({1, 100}));
+  auto ptr = value.mutable_data<float>(cpu);
+  ptr[0] = static_cast<float>(10);
+
+  ASSERT_EQ(table.rows().size(), static_cast<size_t>(1));
+  ASSERT_EQ(table.HasKey(key), false);
+
+  table.Set(key, value);
+
+  ASSERT_EQ(table.rows().size(), static_cast<size_t>(2));
+  ASSERT_EQ(table.HasKey(key), true);
+  // check re-allocate
+  ASSERT_EQ(table.value().dims()[0], static_cast<int64_t>(4));
+
+  framework::Tensor get_value;
+  get_value.mutable_data<float>(framework::make_ddim({2, 100}), cpu);
+  std::vector<int64_t> keys({non_key, key});
+  auto non_keys = table.Get(keys, &get_value);
+
+  ASSERT_EQ(get_value.data<float>()[100], static_cast<float>(10));
+  ASSERT_EQ(non_keys.size(), static_cast<size_t>(1));
+  ASSERT_EQ(non_keys[0], non_key);
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc
index dc9a79020f103dadfd9837cffb18ad5946f95f31..ddff2c7c261746ac9986e79cff3da7e0a9654adc 100644
--- a/paddle/fluid/framework/shape_inference.cc
+++ b/paddle/fluid/framework/shape_inference.cc
@@ -11,8 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
 #include "paddle/fluid/framework/shape_inference.h"
-#include "grad_op_desc_maker.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/grad_op_desc_maker.h"
 #include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h
index bc02d700da5186cea5f370b9676e408f62a66a68..46c8feec001584a872f7f62682080e0e72c06f50 100644
--- a/paddle/fluid/framework/shape_inference.h
+++ b/paddle/fluid/framework/shape_inference.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/framework.pb.h"
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index 38b6d1c5c46dcce718f91d574ceea5de2099b787..78b165ebed13cbae791b922e8820cd9551dfd198 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/framework.pb.h"
diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc
index 9687a86ca25be7886e67028a38e54b3065c8e4b5..6e10885890cd2d4a0d77834944b37e291197b637 100644
--- a/paddle/fluid/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
@@ -105,16 +105,14 @@ TEST(TensorCopy, Tensor) {
 }
 
 TEST(TensorFromVector, Tensor) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
   {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    Tensor cpu_tensor;
+    paddle::framework::Tensor cpu_tensor;
 
     // Copy to CPU Tensor
-    cpu_tensor.Resize(make_ddim({3, 3}));
+    cpu_tensor.Resize(paddle::framework::make_ddim({3, 3}));
     auto cpu_place = new paddle::platform::CPUPlace();
-    TensorFromVector<int>(src_vec, &cpu_tensor);
+    paddle::framework::TensorFromVector<int>(src_vec, &cpu_tensor);
 
     // Compare Tensors
     const int* cpu_ptr = cpu_tensor.data<int>();
@@ -125,8 +123,8 @@ TEST(TensorFromVector, Tensor) {
     }
 
     src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
-    cpu_tensor.Resize(make_ddim({2, 2}));
-    TensorFromVector<int>(src_vec, &cpu_tensor);
+    cpu_tensor.Resize(paddle::framework::make_ddim({2, 2}));
+    paddle::framework::TensorFromVector<int>(src_vec, &cpu_tensor);
     cpu_ptr = cpu_tensor.data<int>();
     src_ptr = src_vec.data();
     ASSERT_NE(src_ptr, cpu_ptr);
@@ -140,23 +138,23 @@ TEST(TensorFromVector, Tensor) {
 #ifdef PADDLE_WITH_CUDA
   {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    Tensor cpu_tensor;
-    Tensor gpu_tensor;
-    Tensor dst_tensor;
+    paddle::framework::Tensor cpu_tensor;
+    paddle::framework::Tensor gpu_tensor;
+    paddle::framework::Tensor dst_tensor;
 
     // Copy to CPU Tensor
     cpu_tensor.Resize(make_ddim({3, 3}));
     auto cpu_place = new paddle::platform::CPUPlace();
-    CPUDeviceContext cpu_ctx(*cpu_place);
-    TensorFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place);
+    paddle::framework::TensorFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
 
     // Copy to GPUTensor
-    gpu_tensor.Resize(make_ddim({3, 3}));
+    gpu_tensor.Resize(paddle::framework::make_ddim({3, 3}));
     auto gpu_place = new paddle::platform::CUDAPlace();
-    CUDADeviceContext gpu_ctx(*gpu_place);
-    TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place);
+    paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
     // Copy from GPU to CPU tensor for comparison
-    TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+    paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
 
     // Sync before Compare Tensors
     gpu_ctx.Wait();
@@ -172,11 +170,11 @@ TEST(TensorFromVector, Tensor) {
 
     src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
 
-    cpu_tensor.Resize(make_ddim({2, 2}));
-    TensorFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
-    gpu_tensor.Resize(make_ddim({2, 2}));
-    TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
-    TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+    cpu_tensor.Resize(paddle::framework::make_ddim({2, 2}));
+    paddle::framework::TensorFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    gpu_tensor.Resize(paddle::framework::make_ddim({2, 2}));
+    paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
 
     // Sync before Compare Tensors
     gpu_ctx.Wait();
@@ -197,18 +195,16 @@ TEST(TensorFromVector, Tensor) {
 }
 
 TEST(TensorToVector, Tensor) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
   {
-    Tensor src;
-    int* src_ptr = src.mutable_data<int>({3, 3}, CPUPlace());
+    paddle::framework::Tensor src;
+    int* src_ptr = src.mutable_data<int>({3, 3}, paddle::platform::CPUPlace());
     for (int i = 0; i < 3 * 3; ++i) {
       src_ptr[i] = i;
     }
 
-    CPUPlace place;
+    paddle::platform::CPUPlace place;
     std::vector<int> dst;
-    TensorToVector<int>(src, &dst);
+    paddle::framework::TensorToVector<int>(src, &dst);
 
     for (int i = 0; i < 3 * 3; ++i) {
       EXPECT_EQ(src_ptr[i], dst[i]);
@@ -217,13 +213,13 @@ TEST(TensorToVector, Tensor) {
 #ifdef PADDLE_WITH_CUDA
   {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    Tensor gpu_tensor;
-    CUDAPlace place;
-    CUDADeviceContext gpu_ctx(place);
-    TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    paddle::framework::Tensor gpu_tensor;
+    paddle::platform::CUDAPlace place;
+    paddle::platform::CUDADeviceContext gpu_ctx(place);
+    paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
 
     std::vector<int> dst;
-    TensorToVector<int>(gpu_tensor, gpu_ctx, &dst);
+    paddle::framework::TensorToVector<int>(gpu_tensor, gpu_ctx, &dst);
 
     for (int i = 0; i < 3 * 3; ++i) {
       EXPECT_EQ(src_vec[i], dst[i]);
@@ -233,54 +229,54 @@ TEST(TensorToVector, Tensor) {
 }
 
 TEST(TensorContainsNAN, CPU) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
   {
-    Tensor src;
-    float* buf = src.mutable_data<float>({3}, CPUPlace());
+    paddle::framework::Tensor src;
+    float* buf = src.mutable_data<float>({3}, paddle::platform::CPUPlace());
     buf[0] = 0.0;
     buf[1] = NAN;
     buf[2] = 0.0;
-    ASSERT_TRUE(TensorContainsNAN(src));
+    ASSERT_TRUE(paddle::framework::TensorContainsNAN(src));
     buf[1] = 0.0;
-    ASSERT_FALSE(TensorContainsNAN(src));
+    ASSERT_FALSE(paddle::framework::TensorContainsNAN(src));
   }
 
   {
-    Tensor src;
-    float16* buf = src.mutable_data<float16>({3}, CPUPlace());
+    paddle::framework::Tensor src;
+    paddle::platform::float16* buf =
+        src.mutable_data<paddle::platform::float16>(
+            {3}, paddle::platform::CPUPlace());
     buf[0] = 0.0;
     buf[1].x = 0x7fff;
     buf[2] = 0.0;
-    ASSERT_TRUE(TensorContainsNAN(src));
+    ASSERT_TRUE(paddle::framework::TensorContainsNAN(src));
     buf[1] = 0.0;
-    ASSERT_FALSE(TensorContainsNAN(src));
+    ASSERT_FALSE(paddle::framework::TensorContainsNAN(src));
   }
 }
 
 TEST(TensorContainsInf, CPU) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
   {
-    Tensor src;
-    double* buf = src.mutable_data<double>({3}, CPUPlace());
+    paddle::framework::Tensor src;
+    double* buf = src.mutable_data<double>({3}, paddle::platform::CPUPlace());
     buf[0] = 1.0;
     buf[1] = INFINITY;
     buf[2] = 0.0;
-    ASSERT_TRUE(TensorContainsInf(src));
+    ASSERT_TRUE(paddle::framework::TensorContainsInf(src));
     buf[1] = 1.0;
-    ASSERT_FALSE(TensorContainsInf(src));
+    ASSERT_FALSE(paddle::framework::TensorContainsInf(src));
   }
 
   {
-    Tensor src;
-    float16* buf = src.mutable_data<float16>({3}, CPUPlace());
+    paddle::framework::Tensor src;
+    paddle::platform::float16* buf =
+        src.mutable_data<paddle::platform::float16>(
+            {3}, paddle::platform::CPUPlace());
     buf[0] = 1.0;
     buf[1].x = 0x7c00;
     buf[2] = 0.0;
-    ASSERT_TRUE(TensorContainsInf(src));
+    ASSERT_TRUE(paddle::framework::TensorContainsInf(src));
     buf[1] = 1.0;
-    ASSERT_FALSE(TensorContainsInf(src));
+    ASSERT_FALSE(paddle::framework::TensorContainsInf(src));
   }
 }
 
diff --git a/paddle/fluid/framework/tensor_util_test.cu b/paddle/fluid/framework/tensor_util_test.cu
index 4766ec28aa3cff6be3259f258f1c9543ae471f5d..b4cff1e6c2293fa44f0fd0bb398a538c08dd4fb1 100644
--- a/paddle/fluid/framework/tensor_util_test.cu
+++ b/paddle/fluid/framework/tensor_util_test.cu
@@ -45,9 +45,8 @@ static __global__ void FillInf(platform::float16* buf) {
 }
 
 TEST(TensorContainsNAN, GPU) {
-  using namespace paddle::platform;
-  CUDAPlace gpu(0);
-  auto& pool = DeviceContextPool::Instance();
+  paddle::platform::CUDAPlace gpu(0);
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
   auto* cuda_ctx = pool.GetByPlace(gpu);
   {
     Tensor tensor;
@@ -58,7 +57,8 @@ TEST(TensorContainsNAN, GPU) {
   }
   {
     Tensor tensor;
-    float16* buf = tensor.mutable_data<float16>({3}, gpu);
+    paddle::platform::float16* buf =
+        tensor.mutable_data<paddle::platform::float16>({3}, gpu);
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsNAN(tensor));
@@ -66,9 +66,8 @@ TEST(TensorContainsNAN, GPU) {
 }
 
 TEST(TensorContainsInf, GPU) {
-  using namespace paddle::platform;
-  CUDAPlace gpu(0);
-  auto& pool = DeviceContextPool::Instance();
+  paddle::platform::CUDAPlace gpu(0);
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
   auto* cuda_ctx = pool.GetByPlace(gpu);
   {
     Tensor tensor;
@@ -79,7 +78,8 @@ TEST(TensorContainsInf, GPU) {
   }
   {
     Tensor tensor;
-    float16* buf = tensor.mutable_data<float16>({3}, gpu);
+    paddle::platform::float16* buf =
+        tensor.mutable_data<paddle::platform::float16>({3}, gpu);
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsInf(tensor));
diff --git a/paddle/fluid/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc
index 4da83d630a5632233ddff6f08174dcabc1c696f8..27a4ffd4fcbf293a3dea1744b29384d0bee0c137 100644
--- a/paddle/fluid/framework/threadpool_test.cc
+++ b/paddle/fluid/framework/threadpool_test.cc
@@ -15,14 +15,14 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <atomic>
 
-#include "threadpool.h"
+#include "paddle/fluid/framework/threadpool.h"
 
 namespace framework = paddle::framework;
 
-void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
+void do_sum(framework::ThreadPool* pool, std::atomic<int>* sum, int cnt) {
   std::vector<std::future<void>> fs;
   for (int i = 0; i < cnt; ++i) {
-    fs.push_back(framework::Async([&sum]() { sum.fetch_add(1); }));
+    fs.push_back(framework::Async([sum]() { sum->fetch_add(1); }));
   }
 }
 
@@ -46,7 +46,7 @@ TEST(ThreadPool, ConcurrentRun) {
   int n = 50;
   // sum = (n * (n + 1)) / 2
   for (int i = 1; i <= n; ++i) {
-    std::thread t(do_sum, pool, std::ref(sum), i);
+    std::thread t(do_sum, pool, &sum, i);
     threads.push_back(std::move(t));
   }
   for (auto& t : threads) {
diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h
index f62415fda67a506763494886eb499fbb09c5caa6..9f7a21ef42b8d3e74b6e211d6254294ba1fa2341 100644
--- a/paddle/fluid/framework/var_desc.h
+++ b/paddle/fluid/framework/var_desc.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <algorithm>
+#include <string>
 #include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/framework.pb.h"
diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc
index 1dced845ed7849d9f5a6de16dfe627d52fdb5488..9e33003b442762210c990b35f30bc3524963b8b4 100644
--- a/paddle/fluid/framework/var_type_inference_test.cc
+++ b/paddle/fluid/framework/var_type_inference_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/var_type_inference.h"
+#include <string>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h
index 87ddfe2ff9abfa3f4d99033686b197b10d8231fa..067e0c2b8389f88639fd9b95bd680702517efee1 100644
--- a/paddle/fluid/framework/variable.h
+++ b/paddle/fluid/framework/variable.h
@@ -14,6 +14,7 @@
 #pragma once
 
 #include <memory>
+#include <string>
 #include <typeindex>
 #include <typeinfo>
 
@@ -67,7 +68,7 @@ class Variable {
   // parameter of Variable.
   template <typename T>
   struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {}
+    explicit PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {}
 
     virtual const std::type_info& Type() const { return type_; }
     virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); }
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 8494edee6c2c714c285c45bbb4fe1d8cb1a524aa..cc45bfe9b17d767be039cc0d8d83234b6994d6c1 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -21,7 +21,7 @@ endif()
 
 if(WITH_TESTING)
   add_subdirectory(tests/book)
-  if (WITH_TENSORRT)
+  if (TENSORRT_FOUND)
     add_subdirectory(tensorrt)
   endif()
 endif()
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index 3b58019db6e55fa8198d2f77731095c6cf356266..78d2f16746cf478c4424df929bd1f62b08f8a67c 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/io.h"
 
+#include <algorithm>
 #include <fstream>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
@@ -27,14 +28,14 @@ namespace inference {
 // linking the inference shared library.
 void Init(bool init_p2p) { framework::InitDevices(init_p2p); }
 
-void ReadBinaryFile(const std::string& filename, std::string& contents) {
+void ReadBinaryFile(const std::string& filename, std::string* contents) {
   std::ifstream fin(filename, std::ios::in | std::ios::binary);
   PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
   fin.seekg(0, std::ios::end);
-  contents.clear();
-  contents.resize(fin.tellg());
+  contents->clear();
+  contents->resize(fin.tellg());
   fin.seekg(0, std::ios::beg);
-  fin.read(&contents[0], contents.size());
+  fin.read(&(contents->at(0)), contents->size());
   fin.close();
 }
 
@@ -47,7 +48,7 @@ bool IsPersistable(const framework::VarDesc* var) {
   return false;
 }
 
-void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
+void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
                       const framework::ProgramDesc& main_program,
                       const std::string& dirname,
                       const std::string& param_filename) {
@@ -92,18 +93,18 @@ void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
     op->CheckAttrs();
   }
 
-  executor.Run(*load_program, &scope, 0, true, true);
+  executor->Run(*load_program, scope, 0, true, true);
 
   delete load_program;
 }
 
-std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
-                                             framework::Scope& scope,
+std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
+                                             framework::Scope* scope,
                                              const std::string& dirname) {
   std::string model_filename = dirname + "/__model__";
   std::string program_desc_str;
   VLOG(3) << "loading model from " << model_filename;
-  ReadBinaryFile(model_filename, program_desc_str);
+  ReadBinaryFile(model_filename, &program_desc_str);
 
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
@@ -113,11 +114,11 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
 }
 
 std::unique_ptr<framework::ProgramDesc> Load(
-    framework::Executor& executor, framework::Scope& scope,
+    framework::Executor* executor, framework::Scope* scope,
     const std::string& prog_filename, const std::string& param_filename) {
   std::string model_filename = prog_filename;
   std::string program_desc_str;
-  ReadBinaryFile(model_filename, program_desc_str);
+  ReadBinaryFile(model_filename, &program_desc_str);
 
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h
index 756c936b33ad55e2994542b171b945e248ba2e21..ba3e45099ae7c1626bf11d9527d4fa4c7f772fec 100644
--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
@@ -27,17 +27,17 @@ namespace inference {
 
 void Init(bool init_p2p);
 
-void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
+void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
                       const framework::ProgramDesc& main_program,
                       const std::string& dirname,
                       const std::string& param_filename);
 
-std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
-                                             framework::Scope& scope,
+std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
+                                             framework::Scope* scope,
                                              const std::string& dirname);
 
-std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
-                                             framework::Scope& scope,
+std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
+                                             framework::Scope* scope,
                                              const std::string& prog_filename,
                                              const std::string& param_filename);
 
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index c3a8d0889c6a6dd9591837ccc523da56f8d13661..117472599f7c4874ab05e29c6ecb46fd61d0db9c 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -133,12 +133,12 @@ void TestInference(const std::string& dirname,
       std::string prog_filename = "__model_combined__";
       std::string param_filename = "__params_combined__";
       inference_program = paddle::inference::Load(
-          executor, *scope, dirname + "/" + prog_filename,
+          &executor, scope, dirname + "/" + prog_filename,
           dirname + "/" + param_filename);
     } else {
       // Parameters are saved in separate files sited in the specified
       // `dirname`.
-      inference_program = paddle::inference::Load(executor, *scope, dirname);
+      inference_program = paddle::inference::Load(&executor, scope, dirname);
     }
   }
   // Disable the profiler and print the timing information
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 7d6781c2c38822eaabb64eda9c76ff657bbdeeb8..256aded8ca234a24229e11f27b9e3e25728ad293 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -110,12 +110,12 @@ function(op_library TARGET)
     # Note that it's enough to just adding one operator to pybind in a *_op.cc file.
     # And for detail pybind information, please see generated paddle/pybind/pybind.h.
     file(READ ${TARGET}.cc TARGET_CONTENT)
-    string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}")
-    string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}")
+    string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
+    string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}")
     if (one_register STREQUAL "")
         string(REPLACE "_op" "" TARGET "${TARGET}")
     else ()
-        string(REPLACE "REGISTER_OP(" "" TARGET "${one_register}")
+        string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}")
         string(REPLACE "," "" TARGET "${TARGET}")
     endif()
 
@@ -163,7 +163,12 @@ function(op_library TARGET)
 
     # pybind USE_OP
     if (${pybind_flag} EQUAL 0)
+      # NOTE(*): activation use macro to regist the kernels, set use_op manually.
+      if(${TARGET} STREQUAL "activation")
+        file(APPEND ${pybind_file} "USE_OP(relu);\n")
+      else()
         file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
+      endif()
     endif()
 endfunction()
 
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index b261144f3d7836801e0b7a45a1478d3b801db86d..87ef55c50b0be46492a695928625d140345d415d 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -13,11 +13,48 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/activation_op.h"
+#include <string>
 #include "paddle/fluid/operators/mkldnn_activation_op.h"
 
 namespace paddle {
 namespace operators {
 
+#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)                  \
+  class OP_NAME##OpMaker                                                   \
+      : public ::paddle::framework::OpProtoAndCheckerMaker {               \
+   public:                                                                 \
+    OP_NAME##OpMaker(OpProto *proto, OpAttrChecker *op_checker)            \
+        : ::paddle::framework::OpProtoAndCheckerMaker(proto, op_checker) { \
+      AddInput("X", "Input of " #OP_NAME "operator");                      \
+      AddOutput("Out", "Output of" #OP_NAME "operator");                   \
+      AddAttr<bool>("use_mkldnn",                                          \
+                    "(bool, default false) Only used in mkldnn kernel")    \
+          .SetDefault(false);                                              \
+      AddComment(#OP_COMMENT);                                             \
+    }                                                                      \
+  }
+
+#define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE)              \
+  class OP_NAME##GradMaker                                                   \
+      : public ::paddle::framework::SingleGradOpDescMaker {                  \
+   public:                                                                   \
+    using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; \
+                                                                             \
+   protected:                                                                \
+    std::unique_ptr<::paddle::framework::OpDesc> Apply() const override {    \
+      auto *op = new ::paddle::framework::OpDesc();                          \
+      op->SetType(#KERNEL_TYPE "_grad");                                     \
+      op->SetInput("Out", Output("Out"));                                    \
+      op->SetInput(::paddle::framework::GradVarName("Out"),                  \
+                   OutputGrad("Out"));                                       \
+                                                                             \
+      op->SetAttrMap(Attrs());                                               \
+                                                                             \
+      op->SetOutput(::paddle::framework::GradVarName("X"), InputGrad("X"));  \
+      return std::unique_ptr<::paddle::framework::OpDesc>(op);               \
+    }                                                                        \
+  }
+
 class ActivationOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -37,346 +74,190 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
   }
 };
 
-class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Sigmoid operator");
-    AddOutput("Out", "Output of Sigmoid operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char SigmoidDoc[] = R"DOC(
 Sigmoid Activation Operator
 
 $$out = \frac{1}{1 + e^{-x}}$$
 
-)DOC");
-  }
-};
+)DOC";
 
-class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  LogSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of LogSigmoid operator");
-    AddOutput("Out", "Output of LogSigmoid operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char LogSigmoidDoc[] = R"DOC(
 Logsigmoid Activation Operator
 
 $$out = \log \frac{1}{1 + e^{-x}}$$
 
-)DOC");
-  }
-};
+)DOC";
 
-class ExpOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  ExpOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Exp operator");
-    AddOutput("Out", "Output of Exp operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char ExpDoc[] = R"DOC(
 Exp Activation Operator.
 
 $out = e^x$
 
-)DOC");
-  }
-};
+)DOC";
 
-class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  ReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Relu operator");
-    AddOutput("Out", "Output of Relu operator");
-    AddAttr<bool>("use_mkldnn",
-                  "(bool, default false) Only used in mkldnn kernel")
-        .SetDefault(false);
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char ReluDoc[] = R"DOC(
 Relu Activation Operator.
 
 $out = \max(x, 0)$
 
-)DOC");
-  }
-};
-
-class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  LeakyReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of LeakyRelu operator");
-    AddOutput("Out", "Output of LeakyRelu operator");
-    AddAttr<float>("alpha", "The small negative slope").SetDefault(0.02f);
-    AddComment(R"DOC(
-LeakyRelu Activation Operator.
-
-$out = \max(x, \alpha * x)$
-
-)DOC");
-  }
-};
-
-class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  SoftShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Softshrink operator");
-    AddOutput("Out", "Output of Softshrink operator");
-    AddAttr<float>("lambda", "non-negative offset").SetDefault(0.5f);
-    AddComment(R"DOC(
-Softshrink Activation Operator.
-
-$$
-out = \begin{cases} 
-    x - \lambda, \text{if } x > \lambda \\
-    x + \lambda, \text{if } x < -\lambda \\
-    0,  \text{otherwise}
-    \end{cases}
-$$
+)DOC";
 
-)DOC");
-  }
-};
-
-class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  TanhOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Tanh operator");
-    AddOutput("Out", "Output of Tanh operator");
-    AddAttr<bool>("use_mkldnn",
-                  "(bool, default false) Only used in mkldnn kernel")
-        .SetDefault(false);
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char TanhDoc[] = R"DOC(
 Tanh Activation Operator.
 
 $$out = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 
-)DOC");
-  }
-};
+)DOC";
 
-class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  TanhShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of TanhShrink operator");
-    AddOutput("Out", "Output of TanhShrink operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC(
 TanhShrink Activation Operator.
 
 $$out = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 
-)DOC");
-  }
-};
-
-class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  HardShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of HardShrink operator");
-    AddOutput("Out", "Output of HardShrink operator");
-    AddAttr<float>("threshold", "The value of threshold for HardShrink")
-        .SetDefault(0.5f);
-    AddComment(R"DOC(
-HardShrink Activation Operator.
+)DOC";
 
-$$
-out = \begin{cases} 
-    x, \text{if } x > \lambda \\
-    x, \text{if } x < -\lambda \\
-    0,  \text{otherwise}
-    \end{cases}
-$$
-
-)DOC");
-  }
-};
-
-class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  SqrtOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Sqrt operator");
-    AddOutput("Out", "Output of Sqrt operator");
-    AddAttr<bool>("use_mkldnn",
-                  "(bool, default false) Only used in mkldnn kernel")
-        .SetDefault(false);
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char SqrtDoc[] = R"DOC(
 Sqrt Activation Operator.
 
 $out = \sqrt{x}$
 
-)DOC");
-  }
-};
+)DOC";
 
-class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  AbsOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Abs operator");
-    AddOutput("Out", "Output of Abs operator");
-    AddAttr<bool>("use_mkldnn",
-                  "(bool, default false) Only used in mkldnn kernel")
-        .SetDefault(false);
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char AbsDoc[] = R"DOC(
 Abs Activation Operator.
 
 $out = |x|$
 
-)DOC");
-  }
-};
+)DOC";
 
-class CeilOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  CeilOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Ceil operator");
-    AddOutput("Out", "Output of Ceil operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char CeilDoc[] = R"DOC(
 Ceil Activation Operator.
 
 $out = ceil(x)$
 
-)DOC");
-  }
-};
+)DOC";
 
-class FloorOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  FloorOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Floor operator");
-    AddOutput("Out", "Output of Floor operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char FloorDoc[] = R"DOC(
 Floor Activation Operator.
 
 $out = floor(x)$
 
-)DOC");
-  }
-};
+)DOC";
 
-class CosOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  CosOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Cosine operator");
-    AddOutput("Out", "Output of Cosine operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char CosDoc[] = R"DOC(
 Cosine Activation Operator.
 
 $out = cos(x)$
 
-)DOC");
-  }
-};
+)DOC";
 
-class SinOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  SinOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Sine operator");
-    AddOutput("Out", "Output of Sine operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char SinDoc[] = R"DOC(
 Sine Activation Operator.
 
 $out = sin(x)$
 
-)DOC");
-  }
-};
+)DOC";
 
-class RoundOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  RoundOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Round operator");
-    AddOutput("Out", "Output of Round operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char RoundDoc[] = R"DOC(
 Round Activation Operator.
 
 $out = [x]$
 
-)DOC");
-  }
-};
+)DOC";
 
-class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  ReciprocalOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Reciprocal operator");
-    AddOutput("Out", "Output of Reciprocal operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char ReciprocalDoc[] = R"DOC(
 Reciprocal Activation Operator.
 
 $$out = \frac{1}{x}$$
 
-)DOC");
-  }
-};
+)DOC";
 
-class LogOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  LogOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Log operator");
-    AddOutput("Out", "Output of Log operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char LogDoc[] = R"DOC(
 Log Activation Operator.
 
 $out = \ln(x)$
 
 Natural logarithm of x.
 
-)DOC");
-  }
-};
+)DOC";
+
+__attribute__((unused)) constexpr char SquareDoc[] = R"DOC(
+Square Activation Operator.
+
+$out = x^2$
 
-class SquareOpMaker : public framework::OpProtoAndCheckerMaker {
+)DOC";
+
+__attribute__((unused)) constexpr char SoftplusDoc[] = R"DOC(
+Softplus Activation Operator.
+
+$out = \ln(1 + e^{x})$
+
+)DOC";
+
+__attribute__((unused)) constexpr char SoftsignDoc[] = R"DOC(
+Softsign Activation Operator.
+
+$$out = \frac{x}{1 + |x|}$$
+
+)DOC";
+
+class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SquareOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  LeakyReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Square operator");
-    AddOutput("Out", "Output of Square operator");
+    AddInput("X", "Input of LeakyRelu operator");
+    AddOutput("Out", "Output of LeakyRelu operator");
+    AddAttr<float>("alpha", "The small negative slope").SetDefault(0.02f);
     AddComment(R"DOC(
-Square Activation Operator.
+LeakyRelu Activation Operator.
 
-$out = x^2$
+$out = \max(x, \alpha * x)$
 
 )DOC");
   }
 };
 
-class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
+class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SoftplusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  SoftShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Softplus operator");
-    AddOutput("Out", "Output of Softplus operator");
+    AddInput("X", "Input of Softshrink operator");
+    AddOutput("Out", "Output of Softshrink operator");
+    AddAttr<float>("lambda", "non-negative offset").SetDefault(0.5f);
     AddComment(R"DOC(
-Softplus Activation Operator.
+Softshrink Activation Operator.
 
-$out = \ln(1 + e^{x})$
+$$
+out = \begin{cases} 
+    x - \lambda, \text{if } x > \lambda \\
+    x + \lambda, \text{if } x < -\lambda \\
+    0,  \text{otherwise}
+    \end{cases}
+$$
 
 )DOC");
   }
 };
 
-class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker {
+class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SoftsignOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  HardShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Softsign operator");
-    AddOutput("Out", "Output of Softsign operator");
+    AddInput("X", "Input of HardShrink operator");
+    AddOutput("Out", "Output of HardShrink operator");
+    AddAttr<float>("threshold", "The value of threshold for HardShrink")
+        .SetDefault(0.5f);
     AddComment(R"DOC(
-Softsign Activation Operator.
+HardShrink Activation Operator.
 
-$$out = \frac{x}{1 + |x|}$$
+$$
+out = \begin{cases} 
+    x, \text{if } x > \lambda \\
+    x, \text{if } x < -\lambda \\
+    0,  \text{otherwise}
+    \end{cases}
+$$
 
 )DOC");
   }
@@ -553,100 +434,86 @@ $$out = \frac{x}{1 + e^{- \beta x}}$$
   }
 };
 
+REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc);
+REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc);
+REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc);
+REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc);
+REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc);
+REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc);
+REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc);
+REGISTER_ACTIVATION_OP_MAKER(Abs, AbsDoc);
+REGISTER_ACTIVATION_OP_MAKER(Ceil, CeilDoc);
+REGISTER_ACTIVATION_OP_MAKER(Floor, FloorDoc);
+REGISTER_ACTIVATION_OP_MAKER(Cos, CosDoc);
+REGISTER_ACTIVATION_OP_MAKER(Sin, SinDoc);
+REGISTER_ACTIVATION_OP_MAKER(Round, RoundDoc);
+REGISTER_ACTIVATION_OP_MAKER(Reciprocal, ReciprocalDoc);
+REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc);
+REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc);
+REGISTER_ACTIVATION_OP_MAKER(Softplus, SoftplusDoc);
+REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc);
+
+REGISTER_ACTIVATION_OP_GRAD_MAKER(Sigmoid, sigmoid);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(Relu, relu);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(Exp, exp);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(Tanh, tanh);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(Ceil, ceil);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(Floor, floor);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(Sqrt, sqrt);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(SoftRelu, soft_relu);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(Relu6, relu6);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(Reciprocal, reciprocal);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(HardSigmoid, hard_sigmoid);
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(sigmoid, ops::ActivationOp, ops::SigmoidOpMaker, sigmoid_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker,
-            logsigmoid_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(relu, ops::ActivationWithMKLDNNOp, ops::ReluOpMaker, relu_grad,
-            ops::ActivationWithMKLDNNOpGrad);
-
-REGISTER_OP(tanh, ops::ActivationWithMKLDNNOp, ops::TanhOpMaker, tanh_grad,
-            ops::ActivationWithMKLDNNOpGrad);
-
-REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
-            tanh_shrink_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker,
-            softshrink_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(sqrt, ops::ActivationWithMKLDNNOp, ops::SqrtOpMaker, sqrt_grad,
-            ops::ActivationWithMKLDNNOpGrad);
-
-REGISTER_OP(abs, ops::ActivationWithMKLDNNOp, ops::AbsOpMaker, abs_grad,
-            ops::ActivationWithMKLDNNOpGrad);
-
-REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(floor, ops::ActivationOp, ops::FloorOpMaker, floor_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(cos, ops::ActivationOp, ops::CosOpMaker, cos_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(sin, ops::ActivationOp, ops::SinOpMaker, sin_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(round, ops::ActivationOp, ops::RoundOpMaker, round_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker,
-            reciprocal_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(log, ops::ActivationOp, ops::LogOpMaker, log_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(square, ops::ActivationOp, ops::SquareOpMaker, square_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(softplus, ops::ActivationOp, ops::SoftplusOpMaker, softplus_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(softsign, ops::ActivationOp, ops::SoftsignOpMaker, softsign_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker, brelu_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker,
-            leaky_relu_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker, soft_relu_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(elu, ops::ActivationOp, ops::ELUOpMaker, elu_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(relu6, ops::ActivationOp, ops::Relu6OpMaker, relu6_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker, pow_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker, stanh_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker,
-            hard_shrink_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(thresholded_relu, ops::ActivationOp, ops::ThresholdedReluOpMaker,
-            thresholded_relu_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker,
-            hard_sigmoid_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
-            ops::ActivationOpGrad);
+#define FOR_EACH_INPLACE_OP_FUNCTOR(__macro) \
+  __macro(Sigmoid, sigmoid);                 \
+  __macro(Relu, relu);                       \
+  __macro(Exp, exp);                         \
+  __macro(Tanh, tanh);                       \
+  __macro(Ceil, ceil);                       \
+  __macro(Floor, floor);                     \
+  __macro(Sqrt, sqrt);                       \
+  __macro(SoftRelu, soft_relu);              \
+  __macro(Relu6, relu6);                     \
+  __macro(Reciprocal, reciprocal);           \
+  __macro(HardSigmoid, hard_sigmoid);
+
+#define FOR_EACH_OP_FUNCTOR(__macro) \
+  __macro(LogSigmoid, logsigmoid);   \
+  __macro(SoftShrink, softshrink);   \
+  __macro(Abs, abs);                 \
+  __macro(Cos, cos);                 \
+  __macro(Sin, sin);                 \
+  __macro(Round, round);             \
+  __macro(Log, log);                 \
+  __macro(Square, square);           \
+  __macro(BRelu, brelu);             \
+  __macro(Pow, pow);                 \
+  __macro(STanh, stanh);             \
+  __macro(Softplus, softplus);       \
+  __macro(Softsign, softsign);       \
+  __macro(LeakyRelu, leaky_relu);    \
+  __macro(TanhShrink, tanh_shrink);  \
+  __macro(ELU, elu);                 \
+  __macro(HardShrink, hard_shrink);  \
+  __macro(Swish, swish);             \
+  __macro(ThresholdedRelu, thresholded_relu);
+
+#define REGISTER_INPLACE_ACTIVATION_OP(OP_NAME, KERNEL_TYPE)        \
+  REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \
+                    ::paddle::operators::OP_NAME##OpMaker,          \
+                    ::paddle::operators::OP_NAME##GradMaker);       \
+  REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad)
+
+#define REGISTER_ACTIVATION_OP(OP_NAME, KERNEL_TYPE)                    \
+  REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp,     \
+                    ::paddle::operators::OP_NAME##OpMaker,              \
+                    ::paddle::framework::DefaultGradOpDescMaker<true>); \
+  REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad)
 
 #define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)   \
   REGISTER_OP_CPU_KERNEL(                                                 \
@@ -661,4 +528,6 @@ REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
       ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,       \
                                 ops::grad_functor<double>>);
 
+FOR_EACH_OP_FUNCTOR(REGISTER_ACTIVATION_OP);
+FOR_EACH_INPLACE_OP_FUNCTOR(REGISTER_INPLACE_ACTIVATION_OP);
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 4f745553c14fc1391bc65d4f7e4f9bd3b5a881c2..27487b396ccf63d962defa6b270063ccb409164e 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -9,7 +9,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/platform/float16.h"
 
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 43856780bf9357281ac4af2968950da15426e5c8..912415192659dc004f54a76e9cd1a20581d512a6 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -10,6 +10,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <glog/logging.h>
+#include <string>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
@@ -25,6 +28,16 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+/* Use ugly global variable, for the using in python layer side
+   Please refer to the layer_helper.py and get the details.
+ */
+static std::unordered_set<std::string> InplaceOpSet = {
+    "sigmoid", "exp",        "relu",  "tanh",      "sqrt",         "ceil",
+    "floor",   "reciprocal", "relu6", "soft_relu", "hard_sigmoid",
+};
+
+static bool IsInplace(std::string op) { return InplaceOpSet.count(op); }
+
 template <typename DeviceContext, typename Functor>
 class ActivationKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
@@ -60,7 +73,6 @@ class ActivationGradKernel
  public:
   using T = typename Functor::ELEMENT_TYPE;
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
     auto* Out = context.Input<framework::Tensor>("Out");
     auto* dOut =
         context.Input<framework::Tensor>(framework::GradVarName("Out"));
@@ -68,7 +80,6 @@ class ActivationGradKernel
     dX->mutable_data<T>(context.GetPlace());
 
     auto dout = framework::EigenVector<T>::Flatten(*dOut);
-    auto x = framework::EigenVector<T>::Flatten(*X);
     auto out = framework::EigenVector<T>::Flatten(*Out);
     auto dx = framework::EigenVector<T>::Flatten(*dX);
     auto* place =
@@ -78,7 +89,16 @@ class ActivationGradKernel
     for (auto& attr : attrs) {
       *attr.second = context.Attr<float>(attr.first);
     }
-    functor(*place, x, out, dout, dx);
+    bool inplace = functor.Inplace();
+    if (!inplace) {
+      auto* X = context.Input<framework::Tensor>("X");
+      auto x = framework::EigenVector<T>::Flatten(*X);
+      functor(*place, x, out, dout, dx);
+    } else {
+      VLOG(10) << " Inplace activation ";
+      auto x = framework::EigenVector<T>::Flatten(*dX);
+      functor(*place, x, out, dout, dx);
+    }
   }
 };
 
@@ -89,6 +109,14 @@ struct BaseActivationFunctor {
   using AttrPair = std::vector<std::pair<const char*, float*>>;
 
   AttrPair GetAttrs() { return AttrPair(); }
+
+  /* NOTE(*): Output reuse X memory if X is not dependented by its Gradient.
+     For example, sigmoid op's gradient didn't involve x, so its output can
+     reuse
+     input memory. But abs op's gradient use x, it can not be inplaced.
+     gradient did use x.
+   */
+  bool Inplace() const { return false; }
 };
 
 // sigmoid(x) = 1 / (1 + exp(-x))
@@ -102,6 +130,7 @@ struct SigmoidFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("sigmoid"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -156,6 +185,7 @@ struct ExpFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct ExpGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("exp"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -174,10 +204,11 @@ struct ReluFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct ReluGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("relu"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * (x > static_cast<T>(0)).template cast<T>();
+    dx.device(d) = dout * (out > static_cast<T>(0)).template cast<T>();
   }
 };
 
@@ -192,6 +223,7 @@ struct TanhFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct TanhGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("tanh"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -297,6 +329,7 @@ struct SqrtFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct SqrtGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("sqrt"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -316,10 +349,11 @@ struct CeilFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct ZeroGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("ceil"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = static_cast<T>(0) / x;
+    dx.device(d) = static_cast<T>(0) / out;
   }
 };
 
@@ -432,6 +466,7 @@ struct ReciprocalFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("reciprocal"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -531,12 +566,14 @@ struct Relu6GradFunctor : public BaseActivationFunctor<T> {
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"threshold", &threshold}};
   }
+  bool Inplace() const { return IsInplace("relu6"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout *
-                   ((x > static_cast<T>(0)) * (x < static_cast<T>(threshold)))
-                       .template cast<T>();
+    dx.device(d) =
+        dout *
+        ((out > static_cast<T>(0)) * (out < static_cast<T>(threshold)))
+            .template cast<T>();
   }
 };
 
@@ -611,11 +648,12 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"threshold", &threshold}};
   }
+  bool Inplace() const { return IsInplace("soft_relu"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     auto tmp = static_cast<T>(threshold);
-    auto temp = ((x > -tmp) * (x < tmp)).template cast<T>().eval();
+    auto temp = ((out > -tmp) * (out < tmp)).template cast<T>().eval();
     dx.device(d) = dout * (static_cast<T>(1) - (-out).exp()) * temp;
   }
 };
@@ -791,7 +829,7 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"slope", &slope}, {"offset", &offset}};
   }
-
+  bool Inplace() { return IsInplace("hard_sigmoid"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index 718f469d38c3c6b7272c1531fae0a1e9ad2e8e3e..4a8dfd4b54227070c2143b180f8ab92753885550 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/beam_search_decode_op.h"
+#include <string>
 #include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h
index 3cc6ed310575473fae8e91a8507fb9146107e841..4cb0457d9285e20d4b6a2f9987b7fdb1c6ac157f 100644
--- a/paddle/fluid/operators/beam_search_decode_op.h
+++ b/paddle/fluid/operators/beam_search_decode_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -87,7 +88,7 @@ struct BeamSearchDecoder {
    */
   std::vector<BeamNodeVector<T>> PackTwoSteps(
       const LoDTensor& cur_ids, const LoDTensor& cur_scores,
-      std::vector<BeamNodeVector<T>>& prefixes_list,
+      std::vector<BeamNodeVector<T>>* prefixes_list,
       std::vector<SentenceVector<T>>* sentence_vector_list) const;
 
   /**
@@ -140,7 +141,7 @@ Sentence<T> BeamSearchDecoder<T>::MakeSentence(const BeamNode<T>* node) const {
 template <typename T>
 std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps(
     const LoDTensor& cur_ids, const LoDTensor& cur_scores,
-    std::vector<BeamNodeVector<T>>& prefixes_list,
+    std::vector<BeamNodeVector<T>>* prefixes_list,
     std::vector<SentenceVector<T>>* sentence_vector_list) const {
   std::vector<BeamNodeVector<T>> result;
 
@@ -153,7 +154,7 @@ std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps(
 
     // if prefixes size is 0, it means this is the first step. In this step,
     // all candidate id is the start of candidate sentences.
-    if (prefixes_list.empty()) {
+    if (prefixes_list->empty()) {
       PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(),
                         cur_ids.lod().at(kSentenceLevel).back(),
                         "in the first step");
@@ -162,7 +163,7 @@ std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps(
             cur_ids.data<int64_t>()[id_idx], cur_scores.data<T>()[id_idx])));
       }
     } else {
-      BeamNodeVector<T>& prefixes = prefixes_list[src_idx];
+      BeamNodeVector<T>& prefixes = prefixes_list->at(src_idx);
       SentenceVector<T>& sentence_vector = (*sentence_vector_list)[src_idx];
 
       PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(),
@@ -262,7 +263,7 @@ void BeamSearchDecoder<T>::PackAllSteps(const LoDTensorArray& step_ids,
   for (size_t step_id = 0; step_id < step_num; ++step_id) {
     beamnode_vector_list =
         PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id),
-                     beamnode_vector_list, &sentence_vector_list);
+                     &beamnode_vector_list, &sentence_vector_list);
   }
   // append last beam_node to result
   for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
diff --git a/paddle/fluid/operators/beam_search_decode_op_test.cc b/paddle/fluid/operators/beam_search_decode_op_test.cc
index c3faf46e09bb40d01049fd9cfd79836c1d2bd5bb..36f9594969c416c694928811012baf94332bbd91 100644
--- a/paddle/fluid/operators/beam_search_decode_op_test.cc
+++ b/paddle/fluid/operators/beam_search_decode_op_test.cc
@@ -125,7 +125,7 @@ TEST(BeamSearchDecodeOp, PackTwoStepsFistStep) {
 
   BeamSearchDecoder<float> helper;
   beamnode_vector_list = helper.PackTwoSteps(
-      ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
+      ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list);
   ASSERT_EQ(beamnode_vector_list.size(), 2UL);
   ASSERT_EQ(beamnode_vector_list[0].size(), 2UL);
   ASSERT_EQ(beamnode_vector_list[1].size(), 4UL);
@@ -167,7 +167,7 @@ TEST(BeamSearchDecodeOp, PackTwoSteps) {
 
   BeamSearchDecoder<float> helper1;
   beamnode_vector_list = helper1.PackTwoSteps(
-      ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
+      ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list);
 
   ASSERT_EQ(sentence_vector_list[0].size(), 1UL);
   ASSERT_EQ(sentence_vector_list[1].size(), 0UL);
diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc
index e848b1f12cb9f1ce1d37e0e0233bfc361dc35a33..fdab4e92f47c7c8f241d93268a73dcb8c2eb2dc6 100644
--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
@@ -14,7 +14,10 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/beam_search_op.h"
 
+#include <algorithm>
 #include <map>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 
diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h
index b333ef4e6c73be15dfea2cadb153d2484b3daaf7..0a481a85ce6fbb582b8c0e12710455aaaac72aa1 100644
--- a/paddle/fluid/operators/beam_search_op.h
+++ b/paddle/fluid/operators/beam_search_op.h
@@ -18,6 +18,8 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #endif
 
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/operator.h"
 
diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc
index 2ec984d8e0f07b741f5e36f281134c0469079afd..e910ad92d1051aa89fdb3290a977ff376378a227 100644
--- a/paddle/fluid/operators/bilinear_tensor_product_op.cc
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc
@@ -153,9 +153,11 @@ class BilinearTensorProductOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp,
-            ops::BilinearTensorProductOpMaker, bilinear_tensor_product_grad,
-            ops::BilinearTensorProductOpGrad);
+REGISTER_OPERATOR(bilinear_tensor_product, ops::BilinearTensorProductOp,
+                  ops::BilinearTensorProductOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(bilinear_tensor_product_grad,
+                  ops::BilinearTensorProductOpGrad);
 REGISTER_OP_CPU_KERNEL(
     bilinear_tensor_product,
     ops::BilinearTensorProductKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/channel_recv_op.cc b/paddle/fluid/operators/channel_recv_op.cc
index 844b3ae3b7bf87c9b253128165b3c938801d5d60..25c5c3c95ef6899589c98570df6ecbf9b3241d89 100644
--- a/paddle/fluid/operators/channel_recv_op.cc
+++ b/paddle/fluid/operators/channel_recv_op.cc
@@ -29,11 +29,11 @@ namespace paddle {
 namespace operators {
 
 void SetReceiveStatus(const platform::Place &dev_place,
-                      framework::Variable &status_var, bool status) {
+                      framework::Variable *status_var, bool status) {
   auto cpu = platform::CPUPlace();
   auto status_tensor =
-      status_var.GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
-                                                                        cpu);
+      status_var->GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
+                                                                         cpu);
   status_tensor[0] = status;
 }
 
@@ -66,7 +66,7 @@ class ChannelRecvOp : public framework::OperatorBase {
     bool ok = concurrency::ChannelReceive(ch, output_var);
 
     // Set the status output of the `ChannelReceive` call.
-    SetReceiveStatus(dev_place, *scope.FindVar(Output(Status)), ok);
+    SetReceiveStatus(dev_place, scope.FindVar(Output(Status)), ok);
   }
 };
 
diff --git a/paddle/fluid/operators/chunk_eval_op.cc b/paddle/fluid/operators/chunk_eval_op.cc
index 77d3cffe7c19affe66223363eba26e2d77cdcd43..95440ff89e883e754795c67cd58a08f1131df368 100644
--- a/paddle/fluid/operators/chunk_eval_op.cc
+++ b/paddle/fluid/operators/chunk_eval_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/chunk_eval_op.h"
+#include <string>
+#include <vector>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/chunk_eval_op.h b/paddle/fluid/operators/chunk_eval_op.h
index 9e97f7c7762ed6bded94be35ae8a094466e0aec0..8631415062db839476e2536a9836e4b9f069a3e2 100644
--- a/paddle/fluid/operators/chunk_eval_op.h
+++ b/paddle/fluid/operators/chunk_eval_op.h
@@ -14,6 +14,9 @@ limitations under the License. */
 
 #pragma once
 #include <set>
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -36,11 +39,11 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
   };
 
   void GetSegments(const int64_t* label, int length,
-                   std::vector<Segment>& segments, int num_chunk_types,
+                   std::vector<Segment>* segments, int num_chunk_types,
                    int num_tag_types, int other_chunk_type, int tag_begin,
                    int tag_inside, int tag_end, int tag_single) const {
-    segments.clear();
-    segments.reserve(length);
+    segments->clear();
+    segments->reserve(length);
     int chunk_start = 0;
     bool in_chunk = false;
     int tag = -1;
@@ -58,7 +61,7 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
             i - 1,        // end
             prev_type,
         };
-        segments.push_back(segment);
+        segments->push_back(segment);
         in_chunk = false;
       }
       if (ChunkBegin(prev_tag, prev_type, tag, type, other_chunk_type,
@@ -73,7 +76,7 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
           length - 1,   // end
           type,
       };
-      segments.push_back(segment);
+      segments->push_back(segment);
     }
   }
 
@@ -177,8 +180,8 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
     for (int i = 0; i < num_sequences; ++i) {
       int seq_length = lod[0][i + 1] - lod[0][i];
       EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], seq_length,
-                 output_segments, label_segments, *num_infer_chunks_data,
-                 *num_label_chunks_data, *num_correct_chunks_data,
+                 &output_segments, &label_segments, num_infer_chunks_data,
+                 num_label_chunks_data, num_correct_chunks_data,
                  num_chunk_types, num_tag_types, other_chunk_type, tag_begin,
                  tag_inside, tag_end, tag_single, excluded_chunk_types);
     }
@@ -197,10 +200,10 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
   }
 
   void EvalOneSeq(const int64_t* output, const int64_t* label, int length,
-                  std::vector<Segment>& output_segments,
-                  std::vector<Segment>& label_segments,
-                  int64_t& num_output_segments, int64_t& num_label_segments,
-                  int64_t& num_correct, int num_chunk_types, int num_tag_types,
+                  std::vector<Segment>* output_segments,
+                  std::vector<Segment>* label_segments,
+                  int64_t* num_output_segments, int64_t* num_label_segments,
+                  int64_t* num_correct, int num_chunk_types, int num_tag_types,
                   int other_chunk_type, int tag_begin, int tag_inside,
                   int tag_end, int tag_single,
                   const std::set<int>& excluded_chunk_types) const {
@@ -209,25 +212,29 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
     GetSegments(label, length, label_segments, num_chunk_types, num_tag_types,
                 other_chunk_type, tag_begin, tag_inside, tag_end, tag_single);
     size_t i = 0, j = 0;
-    while (i < output_segments.size() && j < label_segments.size()) {
-      if (output_segments[i] == label_segments[j] &&
-          excluded_chunk_types.count(output_segments[i].type) != 1) {
-        ++num_correct;
+    while (i < output_segments->size() && j < label_segments->size()) {
+      if (output_segments->at(i) == label_segments->at(j) &&
+          excluded_chunk_types.count(output_segments->at(i).type) != 1) {
+        ++(*num_correct);
       }
-      if (output_segments[i].end < label_segments[j].end) {
+      if (output_segments->at(i).end < label_segments->at(j).end) {
         ++i;
-      } else if (output_segments[i].end > label_segments[j].end) {
+      } else if (output_segments->at(i).end > label_segments->at(j).end) {
         ++j;
       } else {
         ++i;
         ++j;
       }
     }
-    for (auto& segment : label_segments) {
-      if (excluded_chunk_types.count(segment.type) != 1) ++num_label_segments;
+    for (auto& segment : (*label_segments)) {
+      if (excluded_chunk_types.count(segment.type) != 1) {
+        ++(*num_label_segments);
+      }
     }
-    for (auto& segment : output_segments) {
-      if (excluded_chunk_types.count(segment.type) != 1) ++num_output_segments;
+    for (auto& segment : (*output_segments)) {
+      if (excluded_chunk_types.count(segment.type) != 1) {
+        ++(*num_output_segments);
+      }
     }
   }
 };
diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc
index a3b67964c79268e6ce07018501c46163847897ad..c71139fc7c01a696299296e43d06cf195fb3d03f 100644
--- a/paddle/fluid/operators/clip_op.cc
+++ b/paddle/fluid/operators/clip_op.cc
@@ -81,8 +81,9 @@ class ClipOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
-            ops::ClipOpGrad);
+REGISTER_OPERATOR(clip, ops::ClipOp, ops::ClipOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad);
 REGISTER_OP_CPU_KERNEL(
     clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index 4a36b03cb63ac3ea61be1bbc56b8dd0adbe7d334..3bb3bd4eb15881afb5ae42beb944b76b5e8207cb 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -103,10 +103,12 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad,
-               ops::ConcatOpGrad, false)
+REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<
+                      false> /* set false to disable empty grad */);
+REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>)
+    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     concat_grad,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>)
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc
index bff2c34ec893d0e6212426b108dd98b0d0d0fb48..137fee99e82e5c7fad58a36ef49adb323f13f3a4 100644
--- a/paddle/fluid/operators/conditional_block_op.cc
+++ b/paddle/fluid/operators/conditional_block_op.cc
@@ -47,7 +47,7 @@ class ConditionalOp : public framework::OperatorBase {
     if (!(ips.size() == 1UL && ips[0]->IsInitialized())) {
       PADDLE_THROW("should have one initialized input as condition");
     }
-    if (!(ips[0]->type().hash_code() == typeid(bool).hash_code() &&
+    if (!(ips[0]->type().hash_code() == typeid(bool).hash_code() &&  // NOLINT
           ips[0]->numel() == 1)) {
       PADDLE_THROW(
           "condition input's data type should be bool, "
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 0a8a5d4c71c4510f04eea2f7ef12f836d1fd9c9b..63d371310d2a26a1460e527fc51923dfd6e0b8bc 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -73,9 +73,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
 
     auto src_memory =
-        mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
+        mkldnn::memory({src_md, mkldnn_engine},
+                       reinterpret_cast<void*>(const_cast<T*>(input_data)));
     auto weights_memory =
-        mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data);
+        mkldnn::memory({weights_md, mkldnn_engine},
+                       reinterpret_cast<void*>(const_cast<T*>(filter_data)));
     auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data);
 
     std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
@@ -180,8 +182,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
 
     // create memory
-    auto diff_dst_memory = mkldnn::memory({diff_weights_md, mkldnn_engine},
-                                          (void*)output_grad_data);
+    auto diff_dst_memory = mkldnn::memory(
+        {diff_weights_md, mkldnn_engine},
+        reinterpret_cast<void*>(const_cast<T*>(output_grad_data)));
     // Retrieve conv_pd from device context
     auto conv_pd =
         std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(
@@ -198,10 +201,12 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
                                       mkldnn_engine);
 
       // create memory
-      auto diff_weights_memory = mkldnn::memory(
-          {diff_weights_md, mkldnn_engine}, (void*)filter_grad_data);
+      auto diff_weights_memory =
+          mkldnn::memory({diff_weights_md, mkldnn_engine},
+                         reinterpret_cast<void*>(filter_grad_data));
       auto src_memory =
-          mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
+          mkldnn::memory({src_md, mkldnn_engine},
+                         reinterpret_cast<void*>(const_cast<T*>(input_data)));
 
       // create backward conv primitive for weights
       auto conv_bwd_weights_prim = mkldnn::convolution_backward_weights(
@@ -220,10 +225,12 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
                                    strides, paddings, *conv_pd, mkldnn_engine);
 
       // create memory
-      auto diff_src_memory =
-          mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)input_grad_data);
+      auto diff_src_memory = mkldnn::memory(
+          {diff_src_md, mkldnn_engine},
+          reinterpret_cast<void*>(const_cast<T*>(input_grad_data)));
       auto weights_memory =
-          mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data);
+          mkldnn::memory({weights_md, mkldnn_engine},
+                         reinterpret_cast<void*>(const_cast<T*>(filter_data)));
 
       // create backward conv primitive for data
       auto conv_bwd_data_prim = mkldnn::convolution_backward_data(
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 695db841a4ec666b2c8783dfc7df959711341d85..92748993c32ffb93ae25db8d9916798e657cc804 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -335,14 +335,17 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
-            ops::ConvOpGrad);
+REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad);
 
 // depthwise convolution op
-REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
-            depthwise_conv2d_grad, ops::ConvOpGrad);
-REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
-            ops::ConvOpGrad);
+REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad);
+REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad);
 
 // depthwise conv kernel
 // TODO(xingzhaolong): neon kernel for mobile
diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h
index 12b45f1d65019f623268cb9da9004bac5e1f72a3..d6f86a5c88e37970379da0afe2a1d46e18b653f4 100644
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/depthwise_conv.h"
@@ -41,9 +42,10 @@ inline int ConvOutputSize(int input_size, int filter_size, int dilation,
 
   return output_size;
 }
-inline bool IsExpand(std::vector<int64_t>& filter_dim,
-                     std::vector<int>& strides, std::vector<int>& paddings,
-                     std::vector<int>& dilations) {
+inline bool IsExpand(const std::vector<int64_t>& filter_dim,
+                     const std::vector<int>& strides,
+                     const std::vector<int>& paddings,
+                     const std::vector<int>& dilations) {
   bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
   for (size_t j = 0; j < strides.size(); ++j) {
     filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
diff --git a/paddle/fluid/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc
index a1a0b00208fe77ad462062b5d0cb0c5f3065f584..82fdd308207adb159632dbb9decd67fd2d1c4646 100644
--- a/paddle/fluid/operators/conv_shift_op.cc
+++ b/paddle/fluid/operators/conv_shift_op.cc
@@ -193,8 +193,9 @@ class ConvShiftGradKernel<platform::CPUPlace, T>
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker,
-            conv_shift_grad, ops::ConvShiftGradOp);
+REGISTER_OPERATOR(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(conv_shift_grad, ops::ConvShiftGradOp);
 REGISTER_OP_CPU_KERNEL(conv_shift,
                        ops::ConvShiftKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
index 08f5939d42a41d235a94eff16cf2f558068d6aaa..d699dcafa4e2c7e0a3ffb62ec3985e4961fa2133 100644
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -298,8 +298,10 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker,
-            conv2d_transpose_grad, ops::ConvTransposeOpGrad);
+REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp,
+                  ops::Conv2DTransposeOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(conv2d_transpose_grad, ops::ConvTransposeOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     conv2d_transpose,
@@ -311,8 +313,10 @@ REGISTER_OP_CPU_KERNEL(
     ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
                                      double>);
 
-REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker,
-            conv3d_transpose_grad, ops::ConvTransposeOpGrad);
+REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp,
+                  ops::Conv3DTransposeOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     conv3d_transpose,
diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc
index 4c8af408f62453eaf22cc23d19844e8ca7625bfa..04ca878e687f9b8e5239d8c4aad7e5f262fda0fa 100644
--- a/paddle/fluid/operators/cos_sim_op.cc
+++ b/paddle/fluid/operators/cos_sim_op.cc
@@ -153,8 +153,9 @@ class CosSimOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad,
-            ops::CosSimOpGrad);
+REGISTER_OPERATOR(cos_sim, ops::CosSimOp, ops::CosSimOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(cos_sim_grad, ops::CosSimOpGrad);
 REGISTER_OP_CPU_KERNEL(
     cos_sim, ops::CosSimKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc
index fd7ea70c64fafd0a7ea55ec1e3a29eb66d84a2c6..a8f1fbd529c71d1915c75fa90b7e4e8239d2fa3f 100644
--- a/paddle/fluid/operators/crop_op.cc
+++ b/paddle/fluid/operators/crop_op.cc
@@ -153,7 +153,9 @@ class CropOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(crop, ops::CropOp, ops::CropOpMaker, crop_grad, ops::CropOpGrad);
+REGISTER_OPERATOR(crop, ops::CropOp, ops::CropOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(crop_grad, ops::CropOpGrad);
 REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
 REGISTER_OP_CPU_KERNEL(
     crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
index 55810371c8d354483138b0673721a1ea39fa6f35..0e0622e290f42811c83c354d749ef32a2d9dcadb 100644
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -164,8 +164,9 @@ or not. But the output only shares the LoD information with input X.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
-            cross_entropy_grad, ops::CrossEntropyGradientOp);
+REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp);
 REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<float>,
                        ops::CrossEntropyOpKernel<double>);
 REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc
index 0da6f188523a78693929307a08601e04002bc8ec..f7c516a0ba375a68e3adeb44c99f2808dc0418bb 100644
--- a/paddle/fluid/operators/cumsum_op.cc
+++ b/paddle/fluid/operators/cumsum_op.cc
@@ -79,4 +79,4 @@ using CPU = paddle::platform::CPUDeviceContext;
 REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker);
 REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel<CPU, ops::CumsumFunctor<float>>,
                        ops::CumKernel<CPU, ops::CumsumFunctor<double>>,
-                       ops::CumKernel<CPU, ops::CumsumFunctor<int>>)
+                       ops::CumKernel<CPU, ops::CumsumFunctor<int>>);
diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu
index 70e2a1de5e24302646611cfea3b8dbe1562274e2..eb5fd99ccb844b1f1717b818e7807a384d6515eb 100644
--- a/paddle/fluid/operators/cumsum_op.cu
+++ b/paddle/fluid/operators/cumsum_op.cu
@@ -19,4 +19,4 @@ using CUDA = paddle::platform::CUDADeviceContext;
 
 REGISTER_OP_CUDA_KERNEL(cumsum, ops::CumKernel<CUDA, ops::CumsumFunctor<float>>,
                         ops::CumKernel<CUDA, ops::CumsumFunctor<double>>,
-                        ops::CumKernel<CUDA, ops::CumsumFunctor<int>>)
+                        ops::CumKernel<CUDA, ops::CumsumFunctor<int>>);
diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h
index b6110f92ed4f38a156e0c99ecfb399f3f47a169e..452ff5e967c086340e065a1b6a4b8672c75a4a3d 100644
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
@@ -59,15 +59,13 @@ class AsyncGRPCServer final {
 
   void SetProgram(framework::ProgramDesc *program) { program_ = program; }
 
-  void SetPrefetchBlkdId(int blkid) { prefetch_blk_id_ = blkid; }
-
   void SetExecutor(framework::Executor *executor) { executor_ = executor; }
 
   void SetPrefetchPreparedCtx(framework::ExecutorPrepareContext *prepared) {
     prefetch_ctx_ = prepared;
   }
 
-  int GetSelectedPort() { return selected_port_; }
+  int GetSelectedPort() const { return selected_port_; }
 
   const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); }
 
@@ -114,7 +112,6 @@ class AsyncGRPCServer final {
   std::unique_ptr<std::thread> t_get_;
   std::unique_ptr<std::thread> t_prefetch_;
 
-  int prefetch_blk_id_;
   framework::ExecutorPrepareContext *prefetch_ctx_;
   framework::ProgramDesc *program_;
   framework::Executor *executor_;
diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc
index 16c612c45a37dd2ffd17f8d5f5946df30e9b3fe6..69fcffe9bc34006aef2e5a39227cf6d947e4615f 100644
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
@@ -82,7 +82,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
         platform::CPUPlace cpu;
         auto& gpu_dev_ctx =
             static_cast<const platform::CUDADeviceContext&>(ctx);
-        auto copy_size = tensor.memory_size();
+        auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
         payload = memory::Alloc(cpu, copy_size);
 
         memory::Copy(cpu, payload,
@@ -99,7 +99,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
       } else {
         payload = tensor.data<void>();
       }
-      payload_size = tensor.memory_size();
+      payload_size = tensor.numel() * framework::SizeOfType(tensor.type());
       e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
     } break;
     case framework::proto::VarType_Type_SELECTED_ROWS: {
@@ -118,7 +118,8 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
         platform::CPUPlace cpu;
         auto& gpu_dev_ctx =
             static_cast<const platform::CUDADeviceContext&>(ctx);
-        auto copy_size = tensor->memory_size();
+        auto copy_size =
+            tensor->numel() * framework::SizeOfType(tensor->type());
         payload = memory::Alloc(cpu, copy_size);
         memory::Copy(cpu, payload,
                      boost::get<platform::CUDAPlace>(tensor->place()),
@@ -133,7 +134,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
       } else {
         payload = slr->mutable_value()->data<void>();
       }
-      payload_size = tensor->memory_size();
+      payload_size = tensor->numel() * framework::SizeOfType(tensor->type());
       e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
     } break;
     default:
diff --git a/paddle/fluid/operators/detail/serde_test.cc b/paddle/fluid/operators/detail/serde_test.cc
index cb5f89583436b059ac4d6509dac9f2e3868561aa..221d2f4c5b30aef022a5d6b54cd657d1dec1f5a2 100644
--- a/paddle/fluid/operators/detail/serde_test.cc
+++ b/paddle/fluid/operators/detail/serde_test.cc
@@ -51,7 +51,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
 
   ::grpc::ByteBuffer msg;
   operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
-  EXPECT_GT(msg.Length(), 0);
+  EXPECT_GT(msg.Length(), static_cast<size_t>(0));
 
   // deserialize
   std::vector<::grpc::Slice> slices;
@@ -129,7 +129,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
 
   ::grpc::ByteBuffer msg;
   operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
-  EXPECT_GT(msg.Length(), 0);
+  EXPECT_GT(msg.Length(), static_cast<size_t>(0));
 
   // deserialize
   std::vector<::grpc::Slice> slices;
diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc
index c9d7fd6d1581f6f4182e9e3e0d633c13a3c336a5..fbef8d02a4d765052fccf3792ebe0373d46b1ef6 100644
--- a/paddle/fluid/operators/detail/variable_response.cc
+++ b/paddle/fluid/operators/detail/variable_response.cc
@@ -114,8 +114,7 @@ bool VariableResponse::CopyLodTensorData(
     ::google::protobuf::io::CodedInputStream* input,
     const platform::DeviceContext& ctx, const framework::DDim& dims,
     int length) {
-  auto var = scope_->FindVar(meta_.varname());
-  auto* tensor = var->GetMutable<framework::LoDTensor>();
+  auto* tensor = GetVar()->GetMutable<framework::LoDTensor>();
   tensor->Resize(dims);
 
   framework::LoD lod;
@@ -151,8 +150,7 @@ bool VariableResponse::CopySelectRowsTensorData(
     ::google::protobuf::io::CodedInputStream* input,
     const platform::DeviceContext& ctx, const framework::DDim& dims,
     int length) {
-  auto var = scope_->FindVar(meta_.varname());
-  auto* slr = var->GetMutable<framework::SelectedRows>();
+  auto* slr = GetVar()->GetMutable<framework::SelectedRows>();
   slr->set_height(meta_.slr_height());
   auto* tensor = slr->mutable_value();
   tensor->Resize(dims);
@@ -174,8 +172,7 @@ bool VariableResponse::CopySelectRowsTensorData(
 bool VariableResponse::CopySelectRowsData(
     ::google::protobuf::io::CodedInputStream* input,
     const platform::DeviceContext& ctx, int length) {
-  auto var = scope_->FindVar(meta_.varname());
-  auto* slr = var->GetMutable<framework::SelectedRows>();
+  auto* slr = GetVar()->GetMutable<framework::SelectedRows>();
   slr->mutable_rows()->resize(length /
                               framework::SizeOfType(typeid(int64_t)));  // int64
   int64_t* rows_data = slr->mutable_rows()->data();
diff --git a/paddle/fluid/operators/detail/variable_response.h b/paddle/fluid/operators/detail/variable_response.h
index 93b0d3cfb4f7d7f336414361773f872d7b259482..3018a5c4af876828380ff4c1cbfdaafa8a2057e1 100644
--- a/paddle/fluid/operators/detail/variable_response.h
+++ b/paddle/fluid/operators/detail/variable_response.h
@@ -37,10 +37,17 @@ namespace detail {
 class VariableResponse {
  public:
   VariableResponse(const framework::Scope* scope,
-                   const platform::DeviceContext* dev_ctx)
-      : scope_(scope), dev_ctx_(dev_ctx) {}
-
-  virtual ~VariableResponse() {}
+                   const platform::DeviceContext* dev_ctx,
+                   bool create_scope = false)
+      : scope_(scope), dev_ctx_(dev_ctx), create_scope_(create_scope) {
+    if (create_scope) {
+      local_scope_ = &scope->NewScope();
+    }
+  }
+
+  virtual ~VariableResponse() {
+    if (create_scope_) scope_->DeleteScope(local_scope_);
+  }
 
   // return:
   // 0:ok.
@@ -54,11 +61,18 @@ class VariableResponse {
   // other: number of error field.
   int Parse(const ::grpc::ByteBuffer& byte_buffer);
 
+  const framework::Scope& GetLocalScope() const { return *local_scope_; }
+
   inline std::string Varname() { return meta_.varname(); }
   inline std::string OutVarname() { return meta_.out_varname(); }
 
   // should call parse first.
-  framework::Variable* GetVar() { return scope_->FindVar(meta_.varname()); }
+  framework::Variable* GetVar() {
+    if (create_scope_) {
+      return local_scope_->Var(meta_.varname());
+    }
+    return scope_->FindVar(meta_.varname());
+  }
 
  private:
   bool CopySelectRowsTensorData(::google::protobuf::io::CodedInputStream* input,
@@ -75,6 +89,8 @@ class VariableResponse {
  private:
   const framework::Scope* scope_;
   const platform::DeviceContext* dev_ctx_;
+  bool create_scope_ = false;
+  framework::Scope* local_scope_ = nullptr;
   // only Skeleton
   sendrecv::VariableMessage meta_;
 };
diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc
index 93ef15b9332168a9c62abfd4d0827207173ece45..38f43b6d031372948bd82c686a2d9ce5f8ecd07c 100644
--- a/paddle/fluid/operators/detection_map_op.cc
+++ b/paddle/fluid/operators/detection_map_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/detection_map_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h
index 8c15bfa36bfe72586cfcbdbd8efc4542253adaca..431812e2bfcf926cadf8d7be6a7d1a79e78c7762 100644
--- a/paddle/fluid/operators/detection_map_op.h
+++ b/paddle/fluid/operators/detection_map_op.h
@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <algorithm>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -82,7 +87,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     std::vector<std::map<int, std::vector<Box>>> gt_boxes;
     std::vector<std::map<int, std::vector<std::pair<T, Box>>>> detect_boxes;
 
-    GetBoxes(*in_label, *in_detect, gt_boxes, detect_boxes);
+    GetBoxes(*in_label, *in_detect, &gt_boxes, detect_boxes);
 
     std::map<int, int> label_pos_count;
     std::map<int, std::vector<std::pair<T, int>>> true_pos;
@@ -95,20 +100,20 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     }
 
     if (in_pos_count != nullptr && state) {
-      GetInputPos(*in_pos_count, *in_true_pos, *in_false_pos, label_pos_count,
-                  true_pos, false_pos, class_num);
+      GetInputPos(*in_pos_count, *in_true_pos, *in_false_pos, &label_pos_count,
+                  &true_pos, &false_pos, class_num);
     }
 
     CalcTrueAndFalsePositive(gt_boxes, detect_boxes, evaluate_difficult,
-                             overlap_threshold, label_pos_count, true_pos,
-                             false_pos);
+                             overlap_threshold, &label_pos_count, &true_pos,
+                             &false_pos);
 
     int background_label = ctx.Attr<int>("background_label");
     T map = CalcMAP(ap_type, label_pos_count, true_pos, false_pos,
                     background_label);
 
-    GetOutputPos(ctx, label_pos_count, true_pos, false_pos, *out_pos_count,
-                 *out_true_pos, *out_false_pos, class_num);
+    GetOutputPos(ctx, label_pos_count, true_pos, false_pos, out_pos_count,
+                 out_true_pos, out_false_pos, class_num);
 
     T* map_data = out_map->mutable_data<T>(ctx.GetPlace());
     map_data[0] = map;
@@ -155,7 +160,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
 
   void GetBoxes(const framework::LoDTensor& input_label,
                 const framework::LoDTensor& input_detect,
-                std::vector<std::map<int, std::vector<Box>>>& gt_boxes,
+                std::vector<std::map<int, std::vector<Box>>>* gt_boxes,
                 std::vector<std::map<int, std::vector<std::pair<T, Box>>>>&
                     detect_boxes) const {
     auto labels = framework::EigenTensor<T, 2>::From(input_label);
@@ -179,7 +184,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
           box.is_difficult = true;
         boxes[label].push_back(box);
       }
-      gt_boxes.push_back(boxes);
+      gt_boxes->push_back(boxes);
     }
 
     auto detect_index = detect_lod[0];
@@ -200,9 +205,9 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
       const std::map<int, int>& label_pos_count,
       const std::map<int, std::vector<std::pair<T, int>>>& true_pos,
       const std::map<int, std::vector<std::pair<T, int>>>& false_pos,
-      framework::Tensor& output_pos_count,
-      framework::LoDTensor& output_true_pos,
-      framework::LoDTensor& output_false_pos, const int class_num) const {
+      framework::Tensor* output_pos_count,
+      framework::LoDTensor* output_true_pos,
+      framework::LoDTensor* output_false_pos, const int class_num) const {
     int true_pos_count = 0;
     int false_pos_count = 0;
     for (auto it = true_pos.begin(); it != true_pos.end(); ++it) {
@@ -214,12 +219,12 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
       false_pos_count += fp.size();
     }
 
-    int* pos_count_data = output_pos_count.mutable_data<int>(
+    int* pos_count_data = output_pos_count->mutable_data<int>(
         framework::make_ddim({class_num, 1}), ctx.GetPlace());
 
-    T* true_pos_data = output_true_pos.mutable_data<T>(
+    T* true_pos_data = output_true_pos->mutable_data<T>(
         framework::make_ddim({true_pos_count, 2}), ctx.GetPlace());
-    T* false_pos_data = output_false_pos.mutable_data<T>(
+    T* false_pos_data = output_false_pos->mutable_data<T>(
         framework::make_ddim({false_pos_count, 2}), ctx.GetPlace());
     true_pos_count = 0;
     false_pos_count = 0;
@@ -261,21 +266,21 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     framework::LoD false_pos_lod;
     false_pos_lod.emplace_back(false_pos_starts);
 
-    output_true_pos.set_lod(true_pos_lod);
-    output_false_pos.set_lod(false_pos_lod);
+    output_true_pos->set_lod(true_pos_lod);
+    output_false_pos->set_lod(false_pos_lod);
     return;
   }
 
   void GetInputPos(const framework::Tensor& input_pos_count,
                    const framework::LoDTensor& input_true_pos,
                    const framework::LoDTensor& input_false_pos,
-                   std::map<int, int>& label_pos_count,
-                   std::map<int, std::vector<std::pair<T, int>>>& true_pos,
-                   std::map<int, std::vector<std::pair<T, int>>>& false_pos,
+                   std::map<int, int>* label_pos_count,
+                   std::map<int, std::vector<std::pair<T, int>>>* true_pos,
+                   std::map<int, std::vector<std::pair<T, int>>>* false_pos,
                    const int class_num) const {
     const int* pos_count_data = input_pos_count.data<int>();
     for (int i = 0; i < class_num; ++i) {
-      label_pos_count[i] = pos_count_data[i];
+      (*label_pos_count)[i] = pos_count_data[i];
     }
 
     auto SetData = [](const framework::LoDTensor& pos_tensor,
@@ -291,8 +296,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
       }
     };
 
-    SetData(input_true_pos, true_pos);
-    SetData(input_false_pos, false_pos);
+    SetData(input_true_pos, *true_pos);
+    SetData(input_false_pos, *false_pos);
     return;
   }
 
@@ -301,9 +306,9 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
       const std::vector<std::map<int, std::vector<std::pair<T, Box>>>>&
           detect_boxes,
       bool evaluate_difficult, float overlap_threshold,
-      std::map<int, int>& label_pos_count,
-      std::map<int, std::vector<std::pair<T, int>>>& true_pos,
-      std::map<int, std::vector<std::pair<T, int>>>& false_pos) const {
+      std::map<int, int>* label_pos_count,
+      std::map<int, std::vector<std::pair<T, int>>>* true_pos,
+      std::map<int, std::vector<std::pair<T, int>>>* false_pos) const {
     int batch_size = gt_boxes.size();
     for (int n = 0; n < batch_size; ++n) {
       auto image_gt_boxes = gt_boxes[n];
@@ -320,10 +325,10 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
           continue;
         }
         int label = it->first;
-        if (label_pos_count.find(label) == label_pos_count.end()) {
-          label_pos_count[label] = count;
+        if (label_pos_count->find(label) == label_pos_count->end()) {
+          (*label_pos_count)[label] = count;
         } else {
-          label_pos_count[label] += count;
+          (*label_pos_count)[label] += count;
         }
       }
     }
@@ -338,8 +343,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
           int label = it->first;
           for (size_t i = 0; i < pred_boxes.size(); ++i) {
             auto score = pred_boxes[i].first;
-            true_pos[label].push_back(std::make_pair(score, 0));
-            false_pos[label].push_back(std::make_pair(score, 1));
+            (*true_pos)[label].push_back(std::make_pair(score, 0));
+            (*false_pos)[label].push_back(std::make_pair(score, 1));
           }
         }
         continue;
@@ -351,8 +356,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
         if (image_gt_boxes.find(label) == image_gt_boxes.end()) {
           for (size_t i = 0; i < pred_boxes.size(); ++i) {
             auto score = pred_boxes[i].first;
-            true_pos[label].push_back(std::make_pair(score, 0));
-            false_pos[label].push_back(std::make_pair(score, 1));
+            (*true_pos)[label].push_back(std::make_pair(score, 0));
+            (*false_pos)[label].push_back(std::make_pair(score, 1));
           }
           continue;
         }
@@ -381,17 +386,17 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
                 (!evaluate_difficult && !matched_bboxes[max_idx].is_difficult);
             if (match_evaluate_difficult) {
               if (!visited[max_idx]) {
-                true_pos[label].push_back(std::make_pair(score, 1));
-                false_pos[label].push_back(std::make_pair(score, 0));
+                (*true_pos)[label].push_back(std::make_pair(score, 1));
+                (*false_pos)[label].push_back(std::make_pair(score, 0));
                 visited[max_idx] = true;
               } else {
-                true_pos[label].push_back(std::make_pair(score, 0));
-                false_pos[label].push_back(std::make_pair(score, 1));
+                (*true_pos)[label].push_back(std::make_pair(score, 0));
+                (*false_pos)[label].push_back(std::make_pair(score, 1));
               }
             }
           } else {
-            true_pos[label].push_back(std::make_pair(score, 0));
-            false_pos[label].push_back(std::make_pair(score, 1));
+            (*true_pos)[label].push_back(std::make_pair(score, 0));
+            (*false_pos)[label].push_back(std::make_pair(score, 1));
           }
         }
       }
diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc
index e4436549f6185ba04a5f270893596a6dcb11e89b..4ed1b548840fabd2383632beb5f35fa6aa096443 100644
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -101,8 +101,9 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad,
-            ops::DropoutOpGrad);
+REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad);
 REGISTER_OP_CPU_KERNEL(
     dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu
index 184c095e487a302ebc4d251dd6f332333c415c6d..1dd66e0280c46c0624ff70e822cb6fa6f06b7aa9 100644
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -32,14 +32,26 @@ __global__ void RandomGenerator(const size_t n, const int seed,
   thrust::uniform_real_distribution<float> dist(0, 1);
 
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  int step_size = 0;
+
+  T mask;
+  T dest;
   for (; idx < n; idx += blockDim.x * gridDim.x) {
-    rng.discard(idx);
+    T s = src[idx];
+    if (step_size == 0) {
+      rng.discard(idx);
+      step_size = blockDim.x * gridDim.x;
+    } else {
+      rng.discard(step_size);
+    }
     if (dist(rng) < dropout_prob) {
-      mask_data[idx] = static_cast<T>(0);
+      mask = static_cast<T>(0);
     } else {
-      mask_data[idx] = static_cast<T>(1);
+      mask = static_cast<T>(1);
     }
-    dst[idx] = mask_data[idx] * src[idx];
+    dest = s * mask;
+    mask_data[idx] = mask;
+    dst[idx] = dest;
   }
 }
 
diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu
index 3b89ad5d49c339cf05abc0f8577e895f30dddfd4..913a9145420dae7c4f6a4df10c0330636b5796b0 100644
--- a/paddle/fluid/operators/edit_distance_op.cu
+++ b/paddle/fluid/operators/edit_distance_op.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <algorithm>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/edit_distance_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/cuda_helper.h"
 #include "paddle/fluid/platform/gpu_info.h"
diff --git a/paddle/fluid/operators/elementwise_div_op.cc b/paddle/fluid/operators/elementwise_div_op.cc
index 6f9a090c8ea660d023acece096b48d29aa2f35f7..c7ddafcad1d1f6c14791fde665f43881d6b49836 100644
--- a/paddle/fluid/operators/elementwise_div_op.cc
+++ b/paddle/fluid/operators/elementwise_div_op.cc
@@ -30,8 +30,10 @@ class ElementwiseDivOpMaker : public ElementwiseOpMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker,
-            elementwise_div_grad, ops::ElementwiseOpGrad);
+REGISTER_OPERATOR(elementwise_div, ops::ElementwiseOp,
+                  ops::ElementwiseDivOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(elementwise_div_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_div,
     ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_max_op.cc b/paddle/fluid/operators/elementwise_max_op.cc
index 61da7c59441df22d71316b13f131399d3cd55f3a..a4fe386bb1907bf7c0099d2b1109077b21146948 100644
--- a/paddle/fluid/operators/elementwise_max_op.cc
+++ b/paddle/fluid/operators/elementwise_max_op.cc
@@ -29,8 +29,10 @@ class ElementwiseMaxOpMaker : public ElementwiseOpMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(elementwise_max, ops::ElementwiseOp, ops::ElementwiseMaxOpMaker,
-            elementwise_max_grad, ops::ElementwiseOpGrad);
+REGISTER_OPERATOR(elementwise_max, ops::ElementwiseOp,
+                  ops::ElementwiseMaxOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(elementwise_max_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_max,
     ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_min_op.cc b/paddle/fluid/operators/elementwise_min_op.cc
index c74ff36db17579182e3c7e93a5adc5fe79fbcadd..68cd6ddb4a938b2b1c33e3f89c6d1151acb27f48 100644
--- a/paddle/fluid/operators/elementwise_min_op.cc
+++ b/paddle/fluid/operators/elementwise_min_op.cc
@@ -29,8 +29,10 @@ class ElementwiseMinOpMaker : public ElementwiseOpMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(elementwise_min, ops::ElementwiseOp, ops::ElementwiseMinOpMaker,
-            elementwise_min_grad, ops::ElementwiseOpGrad);
+REGISTER_OPERATOR(elementwise_min, ops::ElementwiseOp,
+                  ops::ElementwiseMinOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(elementwise_min_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_min,
     ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise_mul_op.cc
index 5d7f2cdffd11dfef8df22175dd0570b277c0e13a..2dec27136ad57ea032d5abb51799bd04ccc0b2e3 100644
--- a/paddle/fluid/operators/elementwise_mul_op.cc
+++ b/paddle/fluid/operators/elementwise_mul_op.cc
@@ -31,8 +31,10 @@ class ElementwiseMulOpMaker : public ElementwiseOpMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker,
-            elementwise_mul_grad, ops::ElementwiseOpGrad);
+REGISTER_OPERATOR(elementwise_mul, ops::ElementwiseOp,
+                  ops::ElementwiseMulOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(elementwise_mul_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul,
     ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise_sub_op.cc
index 6f770820c80310a183018b586cb7545ca1e9de51..9d0598fc39a3922fa830f18729d90a7dac6a890b 100644
--- a/paddle/fluid/operators/elementwise_sub_op.cc
+++ b/paddle/fluid/operators/elementwise_sub_op.cc
@@ -29,8 +29,10 @@ class ElementwiseSubOpMaker : public ElementwiseOpMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker,
-            elementwise_sub_grad, ops::ElementwiseOpGrad);
+REGISTER_OPERATOR(elementwise_sub, ops::ElementwiseOp,
+                  ops::ElementwiseSubOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(elementwise_sub_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_sub,
     ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc
index 51a66bd832fbdface953d9b7b509b32ce26d33ca..4ae91d074d3df8b910a7f5d816a22b6f1d51dff6 100644
--- a/paddle/fluid/operators/expand_op.cc
+++ b/paddle/fluid/operators/expand_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/expand_op.h"
+#include <vector>
 
 namespace paddle {
 namespace operators {
@@ -128,8 +129,9 @@ class ExpandGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad,
-            ops::ExpandGradOp);
+REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp);
 REGISTER_OP_CPU_KERNEL(
     expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h
index 2c2d5c7c42c0cc918199eff054d1656f01a281e8..75dbf1d8bf5cb692dcf7b88e9f4c486ab3839701 100644
--- a/paddle/fluid/operators/expand_op.h
+++ b/paddle/fluid/operators/expand_op.h
@@ -14,13 +14,14 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
+
 #include <boost/preprocessor/arithmetic/div.hpp>
 #include <boost/preprocessor/arithmetic/mod.hpp>
 #include <boost/preprocessor/comparison/greater.hpp>
 #include <boost/preprocessor/comparison/greater_equal.hpp>
 #include <boost/preprocessor/control/if.hpp>
 #include <boost/preprocessor/repetition/repeat.hpp>
-#include <iostream>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index 381771f157d78fb04e54f0a07c40e4df2c91441a..45e4d5b2b863a55ae0aa0414ff8697141fd2aa6f 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -98,5 +98,6 @@ FCOpMaker::FCOpMaker(OpProto* proto, OpAttrChecker* op_checker)
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker, fc_grad,
-            paddle::operators::FCOpGrad);
+REGISTER_OPERATOR(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(fc_grad, paddle::operators::FCOpGrad);
diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc
index 6be06b8816ce65641b49d7b7b3861cdd8460feaa..4c82f5c429038504d9876ee240a705911feb0b7a 100644
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -100,7 +100,8 @@ Out = [[3, 4],
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad,
-            ops::GatherGradOp);
+REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(gather_grad, ops::GatherGradOp);
 REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>);
 REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>);
diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu
index 3819549c7112c5e4a6de1a9aee54e469dd5a4618..7e014dd1cb47ee0575308dc13ba7bc7617baebff 100644
--- a/paddle/fluid/operators/gather_op.cu
+++ b/paddle/fluid/operators/gather_op.cu
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "gather.cu.h"
 #include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/gather_op.h"
-#include "scatter.cu.h"
+#include "paddle/fluid/operators/scatter.cu.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h
index 5a8b1ebbe3fe5f242a4d6395c921c75247587c6a..2dd726bebb1bc2e4d83844c0b98df01c390e622f 100644
--- a/paddle/fluid/operators/gather_op.h
+++ b/paddle/fluid/operators/gather_op.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "gather.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "scatter.h"
+#include "paddle/fluid/operators/gather.h"
+#include "paddle/fluid/operators/scatter.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc
index 7625bd45d968720099a973a6988484ec8332d1c1..9c0561b016fdbfa8e48535eaa673a3f85bc936e5 100644
--- a/paddle/fluid/operators/gather_test.cc
+++ b/paddle/fluid/operators/gather_test.cc
@@ -12,38 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/place.h"
-
 #include <gtest/gtest.h>
 #include <iostream>
 #include <string>
 
-TEST(Gather, GatherData) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::operators;
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/gather.h"
+#include "paddle/fluid/platform/place.h"
 
-  Tensor* src = new Tensor();
-  Tensor* index = new Tensor();
-  Tensor* output = new Tensor();
+TEST(Gather, GatherData) {
+  paddle::framework::Tensor* src = new paddle::framework::Tensor();
+  paddle::framework::Tensor* index = new paddle::framework::Tensor();
+  paddle::framework::Tensor* output = new paddle::framework::Tensor();
 
   int* p_src = nullptr;
   int* p_index = nullptr;
-  p_src = src->mutable_data<int>(make_ddim({3, 4}), CPUPlace());
-  p_index = index->mutable_data<int>(make_ddim({2}), CPUPlace());
+  p_src = src->mutable_data<int>(paddle::framework::make_ddim({3, 4}),
+                                 paddle::platform::CPUPlace());
+  p_index = index->mutable_data<int>(paddle::framework::make_ddim({2}),
+                                     paddle::platform::CPUPlace());
 
   for (int i = 0; i < 12; ++i) p_src[i] = i;
   p_index[0] = 1;
   p_index[1] = 0;
 
-  int* p_output = output->mutable_data<int>(make_ddim({2, 4}), CPUPlace());
+  int* p_output = output->mutable_data<int>(
+      paddle::framework::make_ddim({2, 4}), paddle::platform::CPUPlace());
 
   auto* cpu_place = new paddle::platform::CPUPlace();
   paddle::platform::CPUDeviceContext ctx(*cpu_place);
-  CPUGather<int>(ctx, *src, *index, output);
+  paddle::operators::CPUGather<int>(ctx, *src, *index, output);
 
   for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
   for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
diff --git a/paddle/fluid/operators/get_places_op.cc b/paddle/fluid/operators/get_places_op.cc
index 9002ce4717c6e75e7204ef62094e4680bba3f88b..0d7219ac5c624236b85916d5faf6810dbed2198a 100644
--- a/paddle/fluid/operators/get_places_op.cc
+++ b/paddle/fluid/operators/get_places_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <thread>
+#include <thread>  // NOLINT
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/operators/go_op.cc b/paddle/fluid/operators/go_op.cc
index 58fe32446217e07235b40b9b78190094e57e4951..b8e1556c23a3b7357ed56d1b83c09622559040a4 100644
--- a/paddle/fluid/operators/go_op.cc
+++ b/paddle/fluid/operators/go_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <thread>
+#include <thread>  // NOLINT
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc
index 2490b83b8c50ce4a68095be10d78a380174c1a3f..0a524c914d305661745c5d85cbbee2edb57c97ba 100644
--- a/paddle/fluid/operators/gru_op.cc
+++ b/paddle/fluid/operators/gru_op.cc
@@ -216,7 +216,9 @@ class GRUGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp);
+REGISTER_OPERATOR(gru, ops::GRUOp, ops::GRUOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(gru_grad, ops::GRUGradOp);
 REGISTER_OP_CPU_KERNEL(
     gru, ops::GRUKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GRUKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc
index f4c766db0a12b9d2167b0ee3b1d7666c4f1813f1..f8d1d44b5423dd09fe5aad11434911af6f14fe77 100644
--- a/paddle/fluid/operators/gru_unit_op.cc
+++ b/paddle/fluid/operators/gru_unit_op.cc
@@ -124,7 +124,7 @@ $$
 
 which is same as one time step of GRU Operator.
 
-@note To implement the complete GRU unit, fully-connected operator must be 
+@note To implement the complete GRU unit, fully-connected operator must be
 used before to feed xu, xr and xc as the Input of GRUUnit operator.
 
 )DOC");
@@ -194,12 +194,45 @@ class GRUUnitGradOp : public framework::OperatorWithKernel {
   }
 };
 
+class GRUUnitGradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType("gru_unit_grad");
+
+    op->SetInput("Input", Input("Input"));
+    op->SetInput("HiddenPrev", Input("HiddenPrev"));
+    op->SetInput("Weight", Input("Weight"));
+    op->SetInput("Bias", Input("Bias"));
+
+    op->SetInput("Hidden", Output("Hidden"));
+    op->SetInput("Gate", Output("Gate"));
+    op->SetInput("ResetHiddenPrev", Output("ResetHiddenPrev"));
+    op->SetInput(framework::GradVarName("Hidden"), OutputGrad("Hidden"));
+
+    op->SetAttrMap(Attrs());
+
+    op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
+    op->SetOutput(framework::GradVarName("HiddenPrev"),
+                  InputGrad("HiddenPrev"));
+    op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight"));
+    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad,
-            ops::GRUUnitGradOp);
+
+REGISTER_OPERATOR(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker,
+                  ops::GRUUnitGradOpMaker);
+REGISTER_OPERATOR(gru_unit_grad, ops::GRUUnitGradOp);
+
 REGISTER_OP_CPU_KERNEL(
     gru_unit, ops::GRUUnitKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GRUUnitKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc
index efe84f14098028675cb332efd9545c9709528cb3..086b5a97dec9a3d5b8f91b802b92d64ca73bf57c 100644
--- a/paddle/fluid/operators/hinge_loss_op.cc
+++ b/paddle/fluid/operators/hinge_loss_op.cc
@@ -103,8 +103,9 @@ class HingeLossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker<float>,
-            hinge_loss_grad, ops::HingeLossGradOp);
+REGISTER_OPERATOR(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(hinge_loss_grad, ops::HingeLossGradOp);
 REGISTER_OP_CPU_KERNEL(
     hinge_loss,
     ops::HingeLossKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc
index 134b23b4612b478f9aeb06454c9fd9a6c25fffb4..74d8e0e2b76adc7a3e69649f277a8c0df6f38056 100644
--- a/paddle/fluid/operators/huber_loss_op.cc
+++ b/paddle/fluid/operators/huber_loss_op.cc
@@ -121,8 +121,9 @@ class HuberLossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
-            huber_loss_grad, ops::HuberLossGradOp);
+REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp);
 REGISTER_OP_CPU_KERNEL(
     huber_loss,
     ops::HuberLossKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc
index 5b387d8d344dfc3475a537827acd9e125fe6693c..8c120eec86601146500721bbb4249bc458190093 100644
--- a/paddle/fluid/operators/im2sequence_op.cc
+++ b/paddle/fluid/operators/im2sequence_op.cc
@@ -148,8 +148,9 @@ class Im2SequenceGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker,
-            im2sequence_grad, ops::Im2SequenceGradOp);
+REGISTER_OPERATOR(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(im2sequence_grad, ops::Im2SequenceGradOp);
 REGISTER_OP_CPU_KERNEL(
     im2sequence,
     ops::Im2SequenceKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc
index ec2e641679fedec776d48716f13445f44375ce3d..d8c97b27b328b1470bece4a6c1872b5ccc75115e 100644
--- a/paddle/fluid/operators/increment_op.cc
+++ b/paddle/fluid/operators/increment_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/increment_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
@@ -89,4 +90,4 @@ REGISTER_OP_CPU_KERNEL(
     increment, ops::IncrementKernel<paddle::platform::CPUDeviceContext, float>,
     ops::IncrementKernel<paddle::platform::CPUDeviceContext, double>,
     ops::IncrementKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::IncrementKernel<paddle::platform::CPUDeviceContext, int64_t>)
+    ops::IncrementKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/increment_op.cu b/paddle/fluid/operators/increment_op.cu
index 7fb6425fe994751c4d7a025bb62e43a84c8d95c2..228063bf3d4b24bbd03649189f6ddba9a5f0ca30 100644
--- a/paddle/fluid/operators/increment_op.cu
+++ b/paddle/fluid/operators/increment_op.cu
@@ -19,4 +19,4 @@ REGISTER_OP_CUDA_KERNEL(
     increment, ops::IncrementKernel<paddle::platform::CUDADeviceContext, float>,
     ops::IncrementKernel<paddle::platform::CUDADeviceContext, double>,
     ops::IncrementKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::IncrementKernel<paddle::platform::CUDADeviceContext, int64_t>)
+    ops::IncrementKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/iou_similarity_op.cc b/paddle/fluid/operators/iou_similarity_op.cc
old mode 100755
new mode 100644
diff --git a/paddle/fluid/operators/iou_similarity_op.cu b/paddle/fluid/operators/iou_similarity_op.cu
old mode 100755
new mode 100644
diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc
index 963b0587c386c72c05f8cc5d0b63074e9e726579..0c143b7c8aed13a202e2597632d17d8bccc8b66d 100644
--- a/paddle/fluid/operators/l1_norm_op.cc
+++ b/paddle/fluid/operators/l1_norm_op.cc
@@ -67,8 +67,9 @@ $$Out = \sum{|X|}$$
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, l1_norm_grad,
-            ops::L1NormGradOp);
+REGISTER_OPERATOR(l1_norm, ops::L1NormOp, ops::L1NormOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(l1_norm_grad, ops::L1NormGradOp);
 REGISTER_OP_CPU_KERNEL(
     l1_norm, ops::L1NormKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc
index c2a8c7f867a4483a7fda2f4336a64ab109ce86e8..a73c626032f3bf6e97ac5974424e76bacb9a0799 100644
--- a/paddle/fluid/operators/label_smooth_op.cc
+++ b/paddle/fluid/operators/label_smooth_op.cc
@@ -117,8 +117,9 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 namespace ops = paddle::operators;
 
-REGISTER_OP(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
-            label_smooth_grad, ops::LabelSmoothGradOp);
+REGISTER_OPERATOR(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(label_smooth_grad, ops::LabelSmoothGradOp);
 REGISTER_OP_CPU_KERNEL(
     label_smooth,
     ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc
index 88b3b08af57eaf2d1086d778e3313c3dea6300fb..de1056aef7bfa2f53f8a92b262e7d15aa7c2b75c 100644
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
@@ -162,8 +162,9 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
-            layer_norm_grad, ops::LayerNormGradOp);
+REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp);
 REGISTER_OP_CPU_KERNEL(
     layer_norm, ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>,
     ops::LayerNormKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc
index ef568a578b0b97ea402a2a521f0fe1431013d1b7..2f29e377fdada918f2c9dca8c2d94eb06278320d 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
@@ -256,8 +256,10 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(linear_chain_crf, ops::LinearChainCRFOp, ops::LinearChainCRFOpMaker,
-            linear_chain_crf_grad, ops::LinearChainCRFGradOp);
+REGISTER_OPERATOR(linear_chain_crf, ops::LinearChainCRFOp,
+                  ops::LinearChainCRFOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp);
 REGISTER_OP_CPU_KERNEL(
     linear_chain_crf,
     ops::LinearChainCRFOpKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 5d293665f0bcc098126ad3ec6c9bf34ff54c3b6f..af235fb6a029a71ee275bebfbbd75aaa0b7d546d 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <fstream>
 #include <ostream>
 #include <thread>  // NOLINT
 #include <vector>
@@ -26,20 +27,6 @@ void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
   VLOG(4) << "RunServer thread end";
 }
 
-static void CreateTensorFromMessageType(framework::Variable *var,
-                                        sendrecv::VarType var_type) {
-  if (var_type == sendrecv::VarType::LOD_TENSOR) {
-    var->GetMutable<framework::LoDTensor>();
-  } else if (var_type == sendrecv::VarType::SELECTED_ROWS) {
-    var->GetMutable<framework::SelectedRows>();
-  } else {
-    PADDLE_THROW(
-        "VariableMessage type %d is not in "
-        "[LoDTensor, SelectedRows]",
-        var_type);
-  }
-}
-
 static void ParallelExecuteBlocks(
     const std::vector<size_t> &parallel_blkids, framework::Executor *executor,
     const std::vector<std::shared_ptr<framework::ExecutorPrepareContext>>
@@ -61,13 +48,20 @@ static void ParallelExecuteBlocks(
   for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
 }
 
+static void SavePort(std::shared_ptr<detail::AsyncGRPCServer> rpc_service) {
+  std::ofstream port_file;
+  port_file.open("/tmp/paddle.selected_port");
+  port_file << rpc_service->GetSelectedPort();
+  port_file.close();
+}
+
 ListenAndServOp::ListenAndServOp(const std::string &type,
                                  const framework::VariableNameMap &inputs,
                                  const framework::VariableNameMap &outputs,
                                  const framework::AttributeMap &attrs)
     : OperatorBase(type, inputs, outputs, attrs) {}
 
-int ListenAndServOp::GetSelectedPort() {
+int ListenAndServOp::GetSelectedPort() const {
   return rpc_service_->GetSelectedPort();
 }
 
@@ -76,55 +70,26 @@ void ListenAndServOp::Stop() {
   server_thread_->join();
 }
 
-void ListenAndServOp::RunImpl(const framework::Scope &scope,
-                              const platform::Place &dev_place) const {
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto &dev_ctx = *pool.Get(dev_place);
-  framework::Scope &recv_scope = scope.NewScope();
-
-  if (!rpc_service_) {
-    std::string endpoint = Attr<std::string>("endpoint");
-    rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
-  }
-
-  auto ins = Inputs("X");
+void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
+                                  framework::ProgramDesc *program,
+                                  framework::Scope *recv_scope,
+                                  framework::BlockDesc *prefetch_block) const {
   auto fan_in = Attr<int>("Fanin");
-  auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock);
-  auto *prefetch_block = Attr<framework::BlockDesc *>(kPrefetchBlock);
-  auto *program = optimize_block->Program();
+
   size_t num_blocks = program->Size();
   PADDLE_ENFORCE_GE(num_blocks, 2,
                     "server program should have at least 2 blocks");
 
-  framework::Executor executor(dev_place);
   std::vector<int> block_list;
   for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
-    if (blkid != prefetch_block->ID()) {
-      block_list.push_back(blkid);
-    }
+    block_list.push_back(blkid);
   }
-  auto optimize_prepared = executor.Prepare(*program, block_list);
+  auto optimize_prepared = executor->Prepare(*program, block_list);
   // Insert placeholder for block0 which holds current op itself.
   optimize_prepared.insert(
       optimize_prepared.begin(),
       std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
 
-  rpc_service_->SetScope(&recv_scope);
-  rpc_service_->SetDevCtx(&dev_ctx);
-  // TODO(qiao) set proper fields for table lookup and update
-  rpc_service_->SetExecutor(&executor);
-  VLOG(3) << "prefetch block id is " << prefetch_block->ID();
-  auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID());
-  rpc_service_->SetPrefetchBlkdId(prefetch_block->ID());
-  rpc_service_->SetPrefetchPreparedCtx(prefetch_prepared.get());
-  prefetch_prepared.release();
-  rpc_service_->SetProgram(program);
-  // start the server listening after all member initialized.
-  server_thread_.reset(new std::thread(RunServer, rpc_service_));
-  // FIXME(typhoonzero): do we need to wait until the server port is ready?
-  sleep(5);
-
-  // TODO(typhoonzero): change this to a while_op for every cluster-batch.
   bool exit_flag = false;
   // Record received sparse variables, so that
   // we could reset those after execute optimize program
@@ -165,7 +130,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
       break;
     }
 
-    // NOTE: if is_gpu_place, CUDA kernels are laugched by multiple threads
+    // NOTE: if is_gpu_place, CUDA kernels are launched by multiple threads
     // and this will still work.
 
     // The optimize blocks which have the same parent ID would run parallel
@@ -175,18 +140,18 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
     parallel_blkids.push_back(1);
     double ts = detail::GetTimestamp();
     for (size_t blkid = 2; blkid < num_blocks; ++blkid) {
-      if (blkid != prefetch_block->ID()) {
+      if (blkid != static_cast<size_t>(prefetch_block->ID())) {
         if (program->Block(blkid).Parent() != last_parent_blkid) {
-          ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared,
-                                program, &recv_scope);
+          ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
+                                program, recv_scope);
           parallel_blkids.clear();
           last_parent_blkid = program->Block(blkid).Parent();
         }
         parallel_blkids.push_back(blkid);
       }
     }
-    ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared,
-                          program, &recv_scope);
+    ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program,
+                          recv_scope);
     VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)";
 
     // Reset the received sparse variables, the sum operator would not
@@ -204,6 +169,42 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   }  // while(true)
 }
 
+void ListenAndServOp::RunImpl(const framework::Scope &scope,
+                              const platform::Place &dev_place) const {
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto &dev_ctx = *pool.Get(dev_place);
+  framework::Scope &recv_scope = scope.NewScope();
+
+  PADDLE_ENFORCE(!rpc_service_);
+  std::string endpoint = Attr<std::string>("endpoint");
+  rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
+
+  auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock);
+  auto *prefetch_block = Attr<framework::BlockDesc *>(kPrefetchBlock);
+  auto *program = optimize_block->Program();
+  framework::Executor executor(dev_place);
+
+  // prepare rpc_service
+  rpc_service_->SetScope(&recv_scope);
+  rpc_service_->SetDevCtx(&dev_ctx);
+  rpc_service_->SetProgram(program);
+  rpc_service_->SetExecutor(&executor);
+
+  // prepare for prefetch
+  VLOG(3) << "prefetch block id is " << prefetch_block->ID();
+  auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID());
+  rpc_service_->SetPrefetchPreparedCtx(prefetch_prepared.get());
+  prefetch_prepared.release();
+
+  // start the server listening after all member initialized.
+  server_thread_.reset(new std::thread(RunServer, rpc_service_));
+  VLOG(3) << "wait server thread to become ready...";
+  sleep(5);
+  // Write to a file of server selected port for python use.
+  SavePort(rpc_service_);
+  RunSyncLoop(&executor, program, &recv_scope, prefetch_block);
+}
+
 class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ListenAndServOpMaker(OpProto *proto, OpAttrChecker *op_checker)
diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h
index 759b2a462ba5b938991aa86be9b9dc3e59fe3f7e..dfb7c77c8e36d9af79d8b1713d0c0c59c81b1ca6 100644
--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
@@ -34,17 +34,22 @@ void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service);
 
 class ListenAndServOp : public framework::OperatorBase {
  public:
-  ListenAndServOp(const std::string &type,
-                  const framework::VariableNameMap &inputs,
-                  const framework::VariableNameMap &outputs,
-                  const framework::AttributeMap &attrs);
+  ListenAndServOp(const std::string& type,
+                  const framework::VariableNameMap& inputs,
+                  const framework::VariableNameMap& outputs,
+                  const framework::AttributeMap& attrs);
 
-  int GetSelectedPort();
+  int GetSelectedPort() const;
+
+  void RunSyncLoop(framework::Executor* executor,
+                   framework::ProgramDesc* program,
+                   framework::Scope* recv_scope,
+                   framework::BlockDesc* prefetch_block) const;
 
   void Stop() override;
 
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override;
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override;
 
  protected:
   mutable std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc
index 7d5687f2d0666d393d7bb1c1a2fdde6c95e6d615..92ebfc274b84f738f5bd688a9a6d9f437b6318aa 100644
--- a/paddle/fluid/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
@@ -155,8 +155,9 @@ class LoDResetGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, lod_reset_grad,
-            ops::LoDResetGradOp);
+REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp);
 REGISTER_OP_CPU_KERNEL(
     lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
     ops::LoDResetKernel<paddle::platform::CPUPlace, double>,
diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc
index f44996d8ac746a33750a979eff2cbbc84e10214b..a8258a1afd70574c174abe8d5630ade5d4ac3de6 100644
--- a/paddle/fluid/operators/log_loss_op.cc
+++ b/paddle/fluid/operators/log_loss_op.cc
@@ -106,8 +106,9 @@ class LogLossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>, log_loss_grad,
-            ops::LogLossGradOp);
+REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp);
 REGISTER_OP_CPU_KERNEL(
     log_loss, ops::LogLossKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index cb088c267bcc028ff11583cd73de5ca1722a9b69..d482506bf0361c11a019e32efbf348a64aaf5164 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -103,7 +103,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
           memset(output + i * row_width, 0, row_width * sizeof(T));
         } else {
           PADDLE_ENFORCE_GE(ids[i], 0);
-          auto id_index = table_t.index(ids[i]);
+          auto id_index = table_t.Index(ids[i]);
+          PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists.");
           memcpy(output + i * row_width, table + id_index * row_width,
                  row_width * sizeof(T));
         }
diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc
index 553a06c3dcdbb9de43afcace75ebec7c5e819d4a..f5c0e47fda913b4635833c31496644b60a0a8504 100644
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
@@ -276,7 +276,9 @@ class LRNOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(lrn, ops::LRNOp, ops::LRNOpMaker<float>, lrn_grad, ops::LRNOpGrad);
+REGISTER_OPERATOR(lrn, ops::LRNOp, ops::LRNOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(lrn_grad, ops::LRNOpGrad);
 REGISTER_OP_CPU_KERNEL(
     lrn, ops::LRNKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc
index e062d62c66c25e386c7643e310034bc1481ec43d..084ee1cfe602af3622ef2a3f35f2892d5540cec7 100644
--- a/paddle/fluid/operators/lstm_op.cc
+++ b/paddle/fluid/operators/lstm_op.cc
@@ -273,7 +273,9 @@ class LSTMGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp);
+REGISTER_OPERATOR(lstm, ops::LSTMOp, ops::LSTMOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp);
 REGISTER_OP_CPU_KERNEL(
     lstm, ops::LSTMKernel<paddle::platform::CPUDeviceContext, float>,
     ops::LSTMKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/lstm_unit_op.cc b/paddle/fluid/operators/lstm_unit_op.cc
index b3c9d7c34d1ac54fb3e15a60bcc470f392bf5027..e1157ef6c640be17e7f48abe1ab972cf88504526 100644
--- a/paddle/fluid/operators/lstm_unit_op.cc
+++ b/paddle/fluid/operators/lstm_unit_op.cc
@@ -97,8 +97,9 @@ class LstmUnitGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker, lstm_unit_grad,
-            ops::LstmUnitGradOp);
+REGISTER_OPERATOR(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(lstm_unit_grad, ops::LstmUnitGradOp);
 REGISTER_OP_CPU_KERNEL(lstm_unit,
                        ops::LstmUnitKernel<paddle::platform::CPUPlace, float>,
                        ops::LstmUnitKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc
index 82541517e122d5da2674b55561ba72af970a2567..f9261323f0f50c78b3b4b66a9fa8abcdf5ba27e9 100644
--- a/paddle/fluid/operators/lstmp_op.cc
+++ b/paddle/fluid/operators/lstmp_op.cc
@@ -322,8 +322,9 @@ class LSTMPGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, lstmp_grad,
-            ops::LSTMPGradOp);
+REGISTER_OPERATOR(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(lstmp_grad, ops::LSTMPGradOp);
 REGISTER_OP_CPU_KERNEL(
     lstmp, ops::LSTMPKernel<paddle::platform::CPUDeviceContext, float>,
     ops::LSTMPKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc
index b146b5088321efcee5a4511b3fedd047a0d54f00..0b41a3e1ffdb32d248bb55651aba242336307e74 100644
--- a/paddle/fluid/operators/margin_rank_loss_op.cc
+++ b/paddle/fluid/operators/margin_rank_loss_op.cc
@@ -111,9 +111,10 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 namespace ops = paddle::operators;
 
-REGISTER_OP(margin_rank_loss, ops::MarginRankLossOp,
-            ops::MarginRankLossOpMaker<float>, margin_rank_loss_grad,
-            ops::MarginRankLossGradOp);
+REGISTER_OPERATOR(margin_rank_loss, ops::MarginRankLossOp,
+                  ops::MarginRankLossOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(margin_rank_loss_grad, ops::MarginRankLossGradOp);
 REGISTER_OP_CPU_KERNEL(
     margin_rank_loss,
     ops::MarginRankLossKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
index 1f5255887391218b766aa23842e443c8b2ad080f..e5d33fbc36438f97ff5b604e4efdbfbfa91fcee4 100644
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -237,8 +237,9 @@ class MatMulOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(matmul, ops::MatMulOp, ops::MatMulOpMaker, matmul_grad,
-            ops::MatMulOpGrad);
+REGISTER_OPERATOR(matmul, ops::MatMulOp, ops::MatMulOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(matmul_grad, ops::MatMulOpGrad);
 REGISTER_OP_CPU_KERNEL(
     matmul, ops::MatMulKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc
index 4e28d98834d27351be99106d6760eae46baf8938..e2bcba5a5e15d4d5f10ae4ae64b5262f750137ab 100644
--- a/paddle/fluid/operators/maxout_op.cc
+++ b/paddle/fluid/operators/maxout_op.cc
@@ -101,8 +101,9 @@ class MaxOutOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad,
-            ops::MaxOutOpGrad);
+REGISTER_OPERATOR(maxout, ops::MaxOutOp, ops::MaxOutOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(maxout_grad, ops::MaxOutOpGrad);
 REGISTER_OP_CPU_KERNEL(
     maxout, ops::MaxOutKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/mkldnn_activation_op.h b/paddle/fluid/operators/mkldnn_activation_op.h
index 083d03ebe610521c5a4beb7b977a8179700bcf40..f26a165b5a59f01f864d62bbf798f4cbffa65371 100644
--- a/paddle/fluid/operators/mkldnn_activation_op.h
+++ b/paddle/fluid/operators/mkldnn_activation_op.h
@@ -60,7 +60,7 @@ class MKLDNNActivationGradKernel
   }
 };
 
-namespace {
+namespace {  // NOLINT
 framework::OpKernelType GetKernelType(
     const framework::ExecutionContext& ctx,
     const framework::OperatorWithKernel& oper) {
diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc
index a8fbd48c4da5b2d0585688e3100f9fe62ac5aa1f..3a0fc74584391d0441105a8ac7d7ac292e10fb8d 100644
--- a/paddle/fluid/operators/modified_huber_loss_op.cc
+++ b/paddle/fluid/operators/modified_huber_loss_op.cc
@@ -108,9 +108,10 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(modified_huber_loss, ops::ModifiedHuberLossOp,
-            ops::ModifiedHuberLossOpMaker, modified_huber_loss_grad,
-            ops::ModifiedHuberLossGradOp);
+REGISTER_OPERATOR(modified_huber_loss, ops::ModifiedHuberLossOp,
+                  ops::ModifiedHuberLossOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(modified_huber_loss_grad, ops::ModifiedHuberLossGradOp);
 
 REGISTER_OP_CPU_KERNEL(
     modified_huber_loss,
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index 5038287527c70d376d8c8a1cc8e4cca0b563126a..bfb20fefba2b8d6e95750c6dc2bc44d606d2ddd1 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -160,7 +160,9 @@ class MulGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulGradOp);
+REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(mul_grad, ops::MulGradOp);
 REGISTER_OP_CPU_KERNEL(
     mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/multiclass_nms_op.cc b/paddle/fluid/operators/multiclass_nms_op.cc
index 0f80f752c95e97ed4d6d299788734de9d29713db..a12b975326519c776c9f4a1d9f2894b4028c2440 100644
--- a/paddle/fluid/operators/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/multiclass_nms_op.cc
@@ -173,8 +173,8 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
 
   void MultiClassNMS(const framework::ExecutionContext& ctx,
                      const Tensor& scores, const Tensor& bboxes,
-                     std::map<int, std::vector<int>>& indices,
-                     int& num_nmsed_out) const {
+                     std::map<int, std::vector<int>>* indices,
+                     int* num_nmsed_out) const {
     int64_t background_label = ctx.Attr<int>("background_label");
     int64_t nms_top_k = ctx.Attr<int>("nms_top_k");
     int64_t keep_top_k = ctx.Attr<int>("keep_top_k");
@@ -189,15 +189,15 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
       if (c == background_label) continue;
       Tensor score = scores.Slice(c, c + 1);
       NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k,
-              &(indices[c]));
-      num_det += indices[c].size();
+              &((*indices)[c]));
+      num_det += (*indices)[c].size();
     }
 
-    num_nmsed_out = num_det;
+    *num_nmsed_out = num_det;
     const T* scores_data = scores.data<T>();
     if (keep_top_k > -1 && num_det > keep_top_k) {
       std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
-      for (const auto& it : indices) {
+      for (const auto& it : *indices) {
         int label = it.first;
         const T* sdata = scores_data + label * predict_dim;
         const std::vector<int>& label_indices = it.second;
@@ -220,13 +220,13 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
         int idx = score_index_pairs[j].second.second;
         new_indices[label].push_back(idx);
       }
-      new_indices.swap(indices);
-      num_nmsed_out = keep_top_k;
+      new_indices.swap(*indices);
+      *num_nmsed_out = keep_top_k;
     }
   }
 
   void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
-                        std::map<int, std::vector<int>>& selected_indices,
+                        const std::map<int, std::vector<int>>& selected_indices,
                         Tensor* outs) const {
     int predict_dim = scores.dims()[1];
     auto* scores_data = scores.data<T>();
@@ -273,7 +273,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
 
       std::map<int, std::vector<int>> indices;
       int num_nmsed_out = 0;
-      MultiClassNMS(ctx, ins_score, ins_boxes, indices, num_nmsed_out);
+      MultiClassNMS(ctx, ins_score, ins_boxes, &indices, &num_nmsed_out);
       all_indices.push_back(indices);
       batch_starts.push_back(batch_starts.back() + num_nmsed_out);
     }
diff --git a/paddle/fluid/operators/nccl_op.cu.cc b/paddle/fluid/operators/nccl_op.cu.cc
index ad623e1fe0f8941615b671a0c20bd3637ae6d407..8de974bc2b333fb6ccc5b5f0bb1af86533139925 100644
--- a/paddle/fluid/operators/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl_op.cu.cc
@@ -135,8 +135,9 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
       auto* x = ctx.Input<LoDTensor>("X");
       VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel();
       PADDLE_ENFORCE(platform::dynload::ncclBcast(
-          (void*)x->data<T>(), x->numel(), NCCLTypeWrapper<T>::type, root,
-          comm->comms().at(idx), ctx.cuda_device_context().stream()));
+          reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), x->numel(),
+          NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
+          ctx.cuda_device_context().stream()));
       VLOG(3) << "gpu : " << gpu_id << " finished Bcast.";
     } else {
       auto* out = ctx.Output<LoDTensor>("Out");
diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc
index 99f38529bbb5a36cd944a01940b5579195f2d601..192bdf8ea553f3a82066f8562458d286ee15a6ee 100644
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/nce_op.h"
 
+#include <vector>
+
 namespace paddle {
 namespace operators {
 
@@ -179,7 +181,9 @@ class NCEOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(nce, ops::NCEOp, ops::NCEOpMaker, nce_grad, ops::NCEOpGrad);
+REGISTER_OPERATOR(nce, ops::NCEOp, ops::NCEOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(nce_grad, ops::NCEOpGrad);
 REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel<paddle::platform::CPUPlace, float>,
                        ops::NCEKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(nce_grad,
diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h
index 94207638473374ddf7e23d211d6cde93f112f492..2c4c97f28bc0b511d6eaa8f79a3a4efc9be8a5da 100644
--- a/paddle/fluid/operators/nce_op.h
+++ b/paddle/fluid/operators/nce_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <math.h>
 #include <random>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "unsupported/Eigen/CXX11/Tensor"
@@ -108,7 +109,7 @@ class NCEKernel : public framework::OpKernel<T> {
     auto weight_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
     for (int64_t i = 0; i < sample_labels->numel(); ++i) {
       Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result =
-          (input_mat.chip((int)(i / sample_labels->dims()[1]), 0) *
+          (input_mat.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) *
            weight_mat.chip(sample_labels_data[i], 0))
               .sum();
       sample_out_data[i] += result(0);
@@ -190,7 +191,7 @@ class NCEGradKernel : public framework::OpKernel<T> {
       auto x_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
       for (int64_t i = 0; i < sample_labels->numel(); ++i) {
         d_w_matrix.chip(sample_labels_data[i], 0) +=
-            x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) *
+            x_matrix.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) *
             sample_grad_data[i];
       }
     }
@@ -202,7 +203,7 @@ class NCEGradKernel : public framework::OpKernel<T> {
       auto d_x_matrix = EigenMatrix<T>::From(*d_x);
       auto w_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
       for (int64_t i = 0; i < sample_labels->numel(); ++i) {
-        d_x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) +=
+        d_x_matrix.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) +=
             w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i];
       }
     }
diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc
index 5345c5bdb0f1e2d96233595f89028993606d2399..30a991224fa184257a8e59af5e6a27a0b0a4da86 100644
--- a/paddle/fluid/operators/norm_op.cc
+++ b/paddle/fluid/operators/norm_op.cc
@@ -85,8 +85,9 @@ class NormOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(norm, ops::NormOp, ops::NormOpMaker<float>, norm_grad,
-            ops::NormOpGrad);
+REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(norm_grad, ops::NormOpGrad);
 REGISTER_OP_CPU_KERNEL(
     norm, ops::NormKernel<paddle::platform::CPUDeviceContext, float>,
     ops::NormKernel<paddle::platform::CPUDeviceContext, double, float>);
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index b144ec5f7d315cb340dcd94b4a519bfcfd2a0e66..f2de075e0d82fc5bd0ac41b481ac80314f3857a3 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -333,18 +333,20 @@ Example:
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad,
-            ops::PoolOpGrad);
+REGISTER_OPERATOR(pool2d, ops::PoolOp, ops::Pool2dOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     pool2d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
     ops::PoolKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     pool2d_grad, ops::PoolGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>)
+    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>);
 
-REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad,
-            ops::PoolOpGrad);
+REGISTER_OPERATOR(pool3d, ops::PoolOp, ops::Pool3dOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     pool3d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc
index 4df0a14577ca13ddd79424fc324eb689913b20a0..848cd61b23c2389d3fe11f585b256d55c1ff177f 100644
--- a/paddle/fluid/operators/pool_with_index_op.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cc
@@ -258,9 +258,10 @@ Example:
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp,
-            ops::MaxPool2dWithIndexOpMaker, max_pool2d_with_index_grad,
-            ops::MaxPoolWithIndexOpGrad);
+REGISTER_OPERATOR(max_pool2d_with_index, ops::MaxPoolWithIndexOp,
+                  ops::MaxPool2dWithIndexOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(max_pool2d_with_index_grad, ops::MaxPoolWithIndexOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     max_pool2d_with_index,
@@ -272,11 +273,12 @@ REGISTER_OP_CPU_KERNEL(
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, float,
                                     int>,
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, double,
-                                    int>)
+                                    int>);
 
-REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
-            ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad,
-            ops::MaxPoolWithIndexOpGrad);
+REGISTER_OPERATOR(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
+                  ops::MaxPool3dWithIndexOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(max_pool3d_with_index_grad, ops::MaxPoolWithIndexOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     max_pool3d_with_index,
@@ -288,4 +290,4 @@ REGISTER_OP_CPU_KERNEL(
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, float,
                                     int>,
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, double,
-                                    int>)
+                                    int>);
diff --git a/paddle/fluid/operators/pool_with_index_op.cu.cc b/paddle/fluid/operators/pool_with_index_op.cu.cc
index 5fc418b6fdd19eddfd27b4a1b3e2554d7b2f37e6..5497dcbd9ce255f833df24989d7a76c40bcbca06 100644
--- a/paddle/fluid/operators/pool_with_index_op.cu.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cu.cc
@@ -27,7 +27,7 @@ REGISTER_OP_CUDA_KERNEL(
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, float,
                                     int>,
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, double,
-                                    int>)
+                                    int>);
 
 REGISTER_OP_CUDA_KERNEL(
     max_pool3d_with_index,
@@ -40,4 +40,4 @@ REGISTER_OP_CUDA_KERNEL(
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, float,
                                     int>,
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, double,
-                                    int>)
+                                    int>);
diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc
index 8eaa12a4a6cfc09fd4e2c3642bc8825fe2af6d6b..a066b3e06e5eca2661827425b5b2d0059d5bcc3c 100644
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
@@ -83,8 +83,9 @@ class PReluGradOp : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(prelu, ops::PReluOp, ops::PReluOpMaker, prelu_grad,
-            ops::PReluGradOp);
+REGISTER_OPERATOR(prelu, ops::PReluOp, ops::PReluOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(prelu_grad, ops::PReluGradOp);
 REGISTER_OP_CPU_KERNEL(
     prelu, ops::PReluKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc
index fc09b4aa1da87e56678790785467e9f4080a20ea..fafc7e54d7a44d6bb2dadf67135537dc16430e76 100644
--- a/paddle/fluid/operators/print_op.cc
+++ b/paddle/fluid/operators/print_op.cc
@@ -23,15 +23,15 @@ namespace operators {
 
 #define CLOG std::cout
 
-const std::string kForward = "FORWARD";
-const std::string kBackward = "BACKWARD";
-const std::string kBoth = "BOTH";
+const char kForward[] = "FORWARD";
+const char kBackward[] = "BACKWARD";
+const char kBoth[] = "BOTH";
 
 struct Formater {
   std::string message;
   std::string name;
   std::vector<int> dims;
-  std::type_index dtype{typeid(char)};
+  std::type_index dtype{typeid(const char)};
   framework::LoD lod;
   int summarize;
   void* data{nullptr};
@@ -62,7 +62,7 @@ struct Formater {
     }
   }
   void PrintDtype() {
-    if (dtype.hash_code() != typeid(char).hash_code()) {
+    if (dtype.hash_code() != typeid(const char).hash_code()) {
       CLOG << "\tdtype: " << dtype.name() << std::endl;
     }
   }
@@ -83,15 +83,15 @@ struct Formater {
   void PrintData(size_t size) {
     PADDLE_ENFORCE_NOT_NULL(data);
     // print float
-    if (dtype.hash_code() == typeid(float).hash_code()) {
+    if (dtype.hash_code() == typeid(const float).hash_code()) {
       Display<float>(size);
-    } else if (dtype.hash_code() == typeid(double).hash_code()) {
+    } else if (dtype.hash_code() == typeid(const double).hash_code()) {
       Display<double>(size);
-    } else if (dtype.hash_code() == typeid(int).hash_code()) {
+    } else if (dtype.hash_code() == typeid(const int).hash_code()) {
       Display<int>(size);
-    } else if (dtype.hash_code() == typeid(int64_t).hash_code()) {
+    } else if (dtype.hash_code() == typeid(const int64_t).hash_code()) {
       Display<int64_t>(size);
-    } else if (dtype.hash_code() == typeid(bool).hash_code()) {
+    } else if (dtype.hash_code() == typeid(const bool).hash_code()) {
       Display<bool>(size);
     } else {
       CLOG << "\tdata: unprintable type: " << dtype.name() << std::endl;
@@ -100,7 +100,7 @@ struct Formater {
 
   template <typename T>
   void Display(size_t size) {
-    auto* d = (T*)data;
+    auto* d = reinterpret_cast<T*>(data);
     CLOG << "\tdata: ";
     if (summarize != -1) {
       summarize = std::min(size, (size_t)summarize);
@@ -135,7 +135,7 @@ class TensorPrintOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope& scope,
                const platform::Place& place) const override {
     const framework::Variable* in_var_ptr = nullptr;
-    std::string phase = kForward;
+    std::string phase(kForward);
     std::string printed_var_name = "";
 
     auto& inputs = Inputs();
@@ -146,7 +146,7 @@ class TensorPrintOp : public framework::OperatorBase {
                !Inputs("In@GRAD").empty()) {
       in_var_ptr = scope.FindVar(Input("In@GRAD"));
       printed_var_name = Inputs("In@GRAD").front();
-      phase = kBackward;
+      phase = std::string(kBackward);
     } else {
       PADDLE_THROW("Unknown phase, should be forward or backward.");
     }
@@ -163,7 +163,7 @@ class TensorPrintOp : public framework::OperatorBase {
     out_tensor.set_lod(in_tensor.lod());
 
     std::string print_phase = Attr<std::string>("print_phase");
-    if (print_phase != phase && print_phase != kBoth) {
+    if (print_phase != phase && print_phase != std::string(kBoth)) {
       return;
     }
 
@@ -199,7 +199,7 @@ class TensorPrintOp : public framework::OperatorBase {
       formater.lod = printed_tensor.lod();
     }
     formater.summarize = Attr<int>("summarize");
-    formater.data = (void*)printed_tensor.data<void>();
+    formater.data = reinterpret_cast<void*>(printed_tensor.data<void>());
     formater(printed_tensor.numel());
   }
 
@@ -223,8 +223,9 @@ class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker {
         "print_phase",
         "(string, default 'BOTH') Which phase to display including 'FORWARD' "
         "'BACKWARD' and 'BOTH'.")
-        .SetDefault(kBoth)
-        .InEnum({kForward, kBackward, kBoth});
+        .SetDefault(std::string(kBoth))
+        .InEnum({std::string(kForward), std::string(kBackward),
+                 std::string(kBoth)});
     AddOutput("Out", "Output tensor with same data as input tensor.");
     AddComment(R"DOC(
 Creates a print op that will print when a tensor is accessed.
diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc
index a1127f11a75e54168ca9682a0189255d37ee8571..eb9ff8de3e4b37ef0bbf7477c1bb62856bdb6310 100644
--- a/paddle/fluid/operators/rank_loss_op.cc
+++ b/paddle/fluid/operators/rank_loss_op.cc
@@ -121,8 +121,9 @@ class RankLossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 namespace ops = paddle::operators;
 
-REGISTER_OP(rank_loss, ops::RankLossOp, ops::RankLossOpMaker, rank_loss_grad,
-            ops::RankLossGradOp);
+REGISTER_OPERATOR(rank_loss, ops::RankLossOp, ops::RankLossOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(rank_loss_grad, ops::RankLossGradOp);
 REGISTER_OP_CPU_KERNEL(
     rank_loss, ops::RankLossKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/reduce_op.cc b/paddle/fluid/operators/reduce_op.cc
index 7879367830216cdd875f9f95f95e2a88f282ac64..093db966472cf100b2f1e4159ce20399cee1f481 100644
--- a/paddle/fluid/operators/reduce_op.cc
+++ b/paddle/fluid/operators/reduce_op.cc
@@ -14,6 +14,9 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/reduce_op.h"
 
+#include <string>
+#include <vector>
+
 namespace paddle {
 namespace operators {
 
@@ -122,18 +125,18 @@ If reduce_all is true, just reduce along all dimensions and output a scalar.
  protected:
   std::string comment_;
 
-  void Replace(std::string &src, std::string from, std::string to) {
+  void Replace(std::string *src, std::string from, std::string to) {
     std::size_t len_from = std::strlen(from.c_str());
     std::size_t len_to = std::strlen(to.c_str());
-    for (std::size_t pos = src.find(from); pos != std::string::npos;
-         pos = src.find(from, pos + len_to)) {
-      src.replace(pos, len_from, to);
+    for (std::size_t pos = src->find(from); pos != std::string::npos;
+         pos = src->find(from, pos + len_to)) {
+      src->replace(pos, len_from, to);
     }
   }
 
   void SetComment(std::string name, std::string op) {
-    Replace(comment_, "{ReduceOp}", name);
-    Replace(comment_, "{reduce}", op);
+    Replace(&comment_, "{ReduceOp}", name);
+    Replace(&comment_, "{reduce}", op);
   }
 };
 
@@ -187,20 +190,25 @@ class ReduceProdOpMaker : public ReduceOpMaker {
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(reduce_sum, ops::ReduceOp, ops::ReduceSumOpMaker, reduce_sum_grad,
-            ops::ReduceGradOp);
+REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ops::ReduceSumOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp);
 
-REGISTER_OP(reduce_mean, ops::ReduceOp, ops::ReduceMeanOpMaker,
-            reduce_mean_grad, ops::ReduceGradOp);
+REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, ops::ReduceMeanOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp);
 
-REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad,
-            ops::ReduceGradOp);
+REGISTER_OPERATOR(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp);
 
-REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad,
-            ops::ReduceGradOp);
+REGISTER_OPERATOR(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(reduce_min_grad, ops::ReduceGradOp);
 
-REGISTER_OP(reduce_prod, ops::ReduceOp, ops::ReduceProdOpMaker,
-            reduce_prod_grad, ops::ReduceGradOp);
+REGISTER_OPERATOR(reduce_prod, ops::ReduceOp, ops::ReduceProdOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(reduce_prod_grad, ops::ReduceGradOp);
 
 #define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor)         \
   REGISTER_OP_CPU_KERNEL(reduce_type,                                          \
diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h
index b28dd7f20968d762ffd669557500f788bda0d7bc..e42b4bfe42df05346020d4f48519fecf39aa37d2 100644
--- a/paddle/fluid/operators/reduce_op.h
+++ b/paddle/fluid/operators/reduce_op.h
@@ -35,77 +35,77 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 struct SumFunctor {
   template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
-    y.device(place) = x.sum(dim);
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->sum(dim);
   }
 };
 
 struct SumGradFunctor {
   template <typename DeviceContext, typename X, typename Y, typename DX,
             typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy,
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                   const Dim& dim, int size) {
-    dx.device(place) = dy.broadcast(dim);
+    dx->device(place) = dy->broadcast(dim);
   }
 };
 
 struct MeanFunctor {
   template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
-    y.device(place) = x.mean(dim);
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->mean(dim);
   }
 };
 
 struct MeanGradFunctor {
   template <typename DeviceContext, typename X, typename Y, typename DX,
             typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy,
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                   const Dim& dim, int size) {
-    dx.device(place) = dy.broadcast(dim) / dx.constant(size);
+    dx->device(place) = dy->broadcast(dim) / dx->constant(size);
   }
 };
 
 struct MaxFunctor {
   template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
-    y.device(place) = x.maximum(dim);
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->maximum(dim);
   }
 };
 
 struct MinFunctor {
   template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
-    y.device(place) = x.minimum(dim);
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->minimum(dim);
   }
 };
 
 struct MaxOrMinGradFunctor {
   template <typename DeviceContext, typename X, typename Y, typename DX,
             typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy,
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                   const Dim& dim, int size) {
-    auto equals = x == y.broadcast(dim);
-    auto ones = dx.constant(1);
-    auto zeros = dx.constant(0);
+    auto equals = (*x) == y->broadcast(dim);
+    auto ones = dx->constant(1);
+    auto zeros = dx->constant(0);
     // If there are multiple minimum or maximum elements, the subgradient of
     // each is the set [0, 1], and we pass gradient to all of them here.
-    dx.device(place) = dy.broadcast(dim) * equals.select(ones, zeros);
+    dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros);
   }
 };
 
 struct ProdFunctor {
   template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
-    y.device(place) = x.prod(dim);
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->prod(dim);
   }
 };
 
 struct ProdGradFunctor {
   template <typename DeviceContext, typename X, typename Y, typename DX,
             typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy,
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                   const Dim& dim, int size) {
-    dx.device(place) = dy.broadcast(dim) * y.broadcast(dim) * x.inverse();
+    dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
   }
 };
 
@@ -125,7 +125,7 @@ class ReduceKernel : public framework::OpKernel<T> {
           *context.template device_context<DeviceContext>().eigen_device();
       auto reduce_dim = Eigen::array<int, 1>({{0}});
       Functor functor;
-      functor(place, x, out, reduce_dim);
+      functor(place, &x, &out, reduce_dim);
     } else {
       int rank = context.Input<Tensor>("X")->dims().size();
       switch (rank) {
@@ -178,10 +178,10 @@ class ReduceKernel : public framework::OpKernel<T> {
 
     if (D == 1) {
       auto out = EigenScalar<T>::From(*output);
-      functor(place, x, out, reduce_dim);
+      functor(place, &x, &out, reduce_dim);
     } else {
       auto out = EigenTensor<T, (D - 1)>::From(*output, dims);
-      functor(place, x, out, reduce_dim);
+      functor(place, &x, &out, reduce_dim);
     }
   }
 };
@@ -206,7 +206,7 @@ class ReduceGradKernel : public framework::OpKernel<T> {
       auto broadcast_dim =
           Eigen::array<int, 1>({{static_cast<int>(input0->numel())}});
       Functor functor;
-      functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim,
+      functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
               broadcast_dim[0]);
     } else {
       int rank = context.Input<Tensor>("X")->dims().size();
@@ -258,7 +258,7 @@ class ReduceGradKernel : public framework::OpKernel<T> {
     auto& place =
         *context.template device_context<DeviceContext>().eigen_device();
     Functor functor;
-    functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim,
+    functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
             broadcast_dim[dim]);
   }
 };
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 93f9c74b809770136d3d3300e0e0700b1bc0459e..5e5ccc3ded95d57dfed37c1ac9c7eae61d36b8c0 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -113,8 +113,9 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 using CPU = paddle::platform::CPUDeviceContext;
 
-REGISTER_OP(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, reshape_grad,
-            ops::ReshapeGradOp);
+REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp);
 REGISTER_OP_CPU_KERNEL(reshape, ops::ReshapeKernel<CPU, float>,
                        ops::ReshapeKernel<CPU, double>,
                        ops::ReshapeKernel<CPU, int>,
diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc
index 6d4861f0428834b1893c3a10a83920f0a62b5455..224ec93d28ec75c52848d7c8400e684df0d69209 100644
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
@@ -153,8 +153,9 @@ https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad,
-            ops::ROIPoolGradOp);
+REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp);
 REGISTER_OP_CPU_KERNEL(
     roi_pool,
     ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc
index d34beeb6508084f4d680fad9bac99ea474d274d3..23f720da0b68cd2fd4c9b51182bf82f72078a906 100644
--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
@@ -250,8 +250,9 @@ class RowConvGradKernel<platform::CPUDeviceContext, T>
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(row_conv, ops::RowConvOp, ops::RowConvOpMaker, row_conv_grad,
-            ops::RowConvGradOp);
+REGISTER_OPERATOR(row_conv, ops::RowConvOp, ops::RowConvOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(row_conv_grad, ops::RowConvGradOp);
 REGISTER_OP_CPU_KERNEL(
     row_conv, ops::RowConvKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/save_load_combine_op_test.cc b/paddle/fluid/operators/save_load_combine_op_test.cc
index 286f75df4ca2daff24b696c6bcb0c3df32875875..2773c32a0a10269e28c24e12527711e3c5b8f869 100644
--- a/paddle/fluid/operators/save_load_combine_op_test.cc
+++ b/paddle/fluid/operators/save_load_combine_op_test.cc
@@ -23,17 +23,17 @@ USE_NO_KERNEL_OP(load_combine);
 
 int* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
                             std::string var_name,
-                            paddle::platform::CPUPlace& place,
-                            paddle::framework::Scope& scope,
-                            paddle::framework::LoD& expect_lod) {
-  auto var = scope.Var(var_name);
+                            const paddle::platform::CPUPlace& place,
+                            paddle::framework::Scope* scope,
+                            paddle::framework::LoD* expect_lod) {
+  auto var = scope->Var(var_name);
   auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
   tensor->Resize({x, y});
-  expect_lod.resize(1);
+  expect_lod->resize(1);
   for (size_t i = 0; i < lod_info.size(); i++) {
-    expect_lod[0].push_back(lod_info[i]);
+    (*expect_lod)[0].push_back(lod_info[i]);
   }
-  tensor->set_lod(expect_lod);
+  tensor->set_lod(*expect_lod);
   int* expect = tensor->mutable_data<int>(place);
   for (int64_t i = 0; i < tensor->numel(); ++i) {
     expect[i] = static_cast<int>(i);
@@ -42,17 +42,17 @@ int* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
 }
 
 paddle::framework::LoDTensor* GeneratePlaceholderBeforeLoad(
-    const std::string out_var_name, paddle::framework::Scope& scope) {
-  auto load_var = scope.Var(out_var_name);
+    const std::string out_var_name, paddle::framework::Scope* scope) {
+  auto load_var = scope->Var(out_var_name);
   auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
   return target;
 }
 
 int* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target,
-                                 paddle::framework::Scope& scope,
-                                 paddle::framework::LoD& actual_lod) {
+                                 const paddle::framework::Scope& scope,
+                                 paddle::framework::LoD* actual_lod) {
   int* actual = target->data<int>();
-  actual_lod = target->lod();
+  *actual_lod = target->lod();
   return actual;
 }
 
@@ -78,26 +78,26 @@ TEST(SaveLoadCombineOp, CPU) {
   std::vector<int> lod1 = {0, 1, 2, 3, 10};
   int numel1 = 100;
   paddle::framework::LoD expect_lod1;
-  int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place, scope,
-                                        expect_lod1);
+  int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place,
+                                        &scope, &expect_lod1);
 
   std::vector<int> lod2 = {0, 2, 5, 10};
   int numel2 = 200;
   paddle::framework::LoD expect_lod2;
-  int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place, scope,
-                                        expect_lod2);
+  int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place,
+                                        &scope, &expect_lod2);
 
   std::vector<int> lod3 = {0, 2, 3, 20};
   int numel3 = 4000;
   paddle::framework::LoD expect_lod3;
   int* expect3 = CreateForSaveCombineOp(20, 200, lod3, "test_var3", place,
-                                        scope, expect_lod3);
+                                        &scope, &expect_lod3);
 
   std::vector<int> lod4 = {0, 1, 20};
   int numel4 = 1000;
   paddle::framework::LoD expect_lod4;
-  int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place, scope,
-                                        expect_lod4);
+  int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place,
+                                        &scope, &expect_lod4);
 
   // Set attributes
   std::string filename = "check_tensor.ls";
@@ -111,10 +111,10 @@ TEST(SaveLoadCombineOp, CPU) {
   save_combine_op->Run(scope, place);
 
   // Set up output vars
-  auto target1 = GeneratePlaceholderBeforeLoad("out_var1", scope);
-  auto target2 = GeneratePlaceholderBeforeLoad("out_var2", scope);
-  auto target3 = GeneratePlaceholderBeforeLoad("out_var3", scope);
-  auto target4 = GeneratePlaceholderBeforeLoad("out_var4", scope);
+  auto target1 = GeneratePlaceholderBeforeLoad("out_var1", &scope);
+  auto target2 = GeneratePlaceholderBeforeLoad("out_var2", &scope);
+  auto target3 = GeneratePlaceholderBeforeLoad("out_var3", &scope);
+  auto target4 = GeneratePlaceholderBeforeLoad("out_var4", &scope);
 
   // Run the load_combine_op
   auto load_combine_op = paddle::framework::OpRegistry::CreateOp(
@@ -123,10 +123,10 @@ TEST(SaveLoadCombineOp, CPU) {
   load_combine_op->Run(scope, place);
 
   paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
-  int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, actual_lod1);
-  int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, actual_lod2);
-  int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, actual_lod3);
-  int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, actual_lod4);
+  int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, &actual_lod1);
+  int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, &actual_lod2);
+  int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, &actual_lod3);
+  int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, &actual_lod4);
 
   CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1);
   CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2);
diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc
index d6fd6214711f4ee66b1daffa4db2e84aa7201e79..95b12455ea4996f00bab8a353ccd425b2c37aed1 100644
--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
@@ -102,7 +102,8 @@ $$
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad,
-            ops::ScatterGradOp);
+REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp);
 REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel<float>);
 REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel<float>);
diff --git a/paddle/fluid/operators/select_op.cc b/paddle/fluid/operators/select_op.cc
index c0bf0ff927481bc4da9cd6c4bb9b0c4a6841c891..876d8acf0d880a7ef806514014d297f98e04c53d 100644
--- a/paddle/fluid/operators/select_op.cc
+++ b/paddle/fluid/operators/select_op.cc
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <boost/tokenizer.hpp>
 #include <memory>
-#include <thread>
+#include <thread>  // NOLINT
 #include <vector>
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/executor.h"
@@ -22,6 +21,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/concurrency/channel_util.h"
 
+#include <boost/tokenizer.hpp>
+
 namespace paddle {
 namespace operators {
 
@@ -254,8 +255,8 @@ class SelectOp : public framework::OperatorBase {
       auto selectCond = std::make_shared<std::condition_variable_any>();
 
       std::recursive_mutex callbackMutex;
-      pushThreadOnChannelQueues(scope, cases, selectCond, caseToExecute,
-                                completed, callbackMutex);
+      pushThreadOnChannelQueues(scope, cases, selectCond, &caseToExecute,
+                                &completed, &callbackMutex);
 
       // TODO(thuan): Atomically unlock all channels and sleep current thread
       unlockChannels(channels);
@@ -302,8 +303,8 @@ class SelectOp : public framework::OperatorBase {
       const framework::Scope *scope,
       std::vector<std::shared_ptr<SelectOpCase>> *cases,
       std::shared_ptr<std::condition_variable_any> rCond,
-      std::atomic<int> &caseToExecute, std::atomic<bool> &completed,
-      std::recursive_mutex &callbackMutex) const {
+      std::atomic<int> *caseToExecute, std::atomic<bool> *completed,
+      std::recursive_mutex *callbackMutex) const {
     std::vector<std::shared_ptr<SelectOpCase>>::iterator it = cases->begin();
     while (it != cases->end()) {
       std::shared_ptr<SelectOpCase> c = *it;
@@ -315,17 +316,17 @@ class SelectOp : public framework::OperatorBase {
       std::function<bool(framework::ChannelAction channelAction)> cb =
           [&caseToExecute, &completed, &callbackMutex,
            c](framework::ChannelAction channelAction) {
-            std::lock_guard<std::recursive_mutex> lock{callbackMutex};
+            std::lock_guard<std::recursive_mutex> lock{*callbackMutex};
 
             bool canProcess = false;
-            if (!completed) {
+            if (!(*completed)) {
               // If the channel wasn't closed, we set the caseToExecute index
               // as this current case
               if (channelAction != framework::ChannelAction::CLOSE) {
-                caseToExecute = c->caseIndex;
+                *caseToExecute = c->caseIndex;
               }
               // This will allow our conditional variable to break out of wait
-              completed = true;
+              *completed = true;
               canProcess = true;
             }
 
diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc
index 3bf5d57809019d3ae469471c2ee2e7aac70b9faf..81350fee38df058d1b63eb5a8cd0b770e0626ae4 100644
--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
@@ -127,7 +127,7 @@ void StartServerNet(bool is_sparse) {
   const auto &root_block = program.Block(0);
   auto *optimize_block = program.AppendBlock(root_block);
   auto *prefetch_block = program.AppendBlock(root_block);
-  // X for server side tensors, RX for received tensers, must be of same shape.
+  // X for server side tensors, RX for received tensors, must be of same shape.
   AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block);
 
   f::AttributeMap attrs;
@@ -139,7 +139,6 @@ void StartServerNet(bool is_sparse) {
   attrs.insert({"PrefetchBlock", prefetch_block});
   listen_and_serv_op =
       f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
-  LOG(INFO) << "selected port before run " << selected_port;
   listen_and_serv_op->Run(scope, place);
   LOG(INFO) << "server exit";
 }
@@ -158,16 +157,13 @@ TEST(SendRecvOp, CPUDense) {
   selected_port = static_cast<paddle::operators::ListenAndServOp *>(
                       listen_and_serv_op.get())
                       ->GetSelectedPort();
-  LOG(INFO) << "selected port " << selected_port;
   std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
   attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
   attrs.insert({"epmap", std::vector<std::string>({endpoint})});
   auto send_op = f::OpRegistry::CreateOp(
       "send", {{"X", {"x1"}}},
       {{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs);
-  LOG(INFO) << "before run " << endpoint;
   send_op->Run(scope, place);
-  LOG(INFO) << "end run";
 
   auto in_var = scope.Var("x1");
   auto tensor = in_var->GetMutable<f::LoDTensor>();
@@ -180,7 +176,6 @@ TEST(SendRecvOp, CPUDense) {
   for (int64_t i = 0; i < target->numel(); ++i) {
     EXPECT_EQ(expected[i] * 2, actual[i]);
   }
-  LOG(INFO) << "before stop";
   listen_and_serv_op->Stop();
   server_thread.join();
   listen_and_serv_op.reset(nullptr);
@@ -199,7 +194,6 @@ TEST(SendRecvOp, CPUSparse) {
   selected_port = static_cast<paddle::operators::ListenAndServOp *>(
                       listen_and_serv_op.get())
                       ->GetSelectedPort();
-  LOG(INFO) << "selected port " << selected_port;
   std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
   attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
   attrs.insert({"epmap", std::vector<std::string>({endpoint})});
diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc
index 126753edd09e8bd0f9d5a08936afbc6326b29ace..3c21903e3a08dcfb55c6c07370a117d0ad633e69 100644
--- a/paddle/fluid/operators/sequence_concat_op.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cc
@@ -124,9 +124,11 @@ class SequenceConcatGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_EX(sequence_concat, ops::SequenceConcatOp,
-               ops::SequenceConcatOpMaker, sequence_concat_grad,
-               ops::SequenceConcatGradOp, false);
+REGISTER_OPERATOR(sequence_concat, ops::SequenceConcatOp,
+                  ops::SequenceConcatOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<
+                      false> /* set false to disable empty grad */);
+REGISTER_OPERATOR(sequence_concat_grad, ops::SequenceConcatGradOp);
 REGISTER_OP_CPU_KERNEL(
     sequence_concat,
     ops::SequenceConcatOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/sequence_conv_op.cc b/paddle/fluid/operators/sequence_conv_op.cc
index ec1f3a5da8c1fc8933b3720802ea901695195dec..94f4b49b0018fdbff6e67c3c081aa5706ccb2e66 100644
--- a/paddle/fluid/operators/sequence_conv_op.cc
+++ b/paddle/fluid/operators/sequence_conv_op.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/sequence_conv_op.h"
 
+#include <algorithm>
+
 namespace paddle {
 namespace operators {
 
@@ -174,8 +176,9 @@ context_length, context_stride and context_start.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker,
-            sequence_conv_grad, ops::SequenceConvGradOp);
+REGISTER_OPERATOR(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sequence_conv_grad, ops::SequenceConvGradOp);
 
 REGISTER_OP_CPU_KERNEL(
     sequence_conv,
diff --git a/paddle/fluid/operators/sequence_expand_op.cc b/paddle/fluid/operators/sequence_expand_op.cc
index ae52849162ae4d78cc69ddbb98f58059f55683cb..84a35d7172a567a3f6505559fa45a32290288533 100644
--- a/paddle/fluid/operators/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_expand_op.cc
@@ -200,8 +200,10 @@ class SequenceExpandOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker,
-            sequence_expand_grad, ops::SequenceExpandOpGrad);
+REGISTER_OPERATOR(sequence_expand, ops::SequenceExpandOp,
+                  ops::SequenceExpandOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sequence_expand_grad, ops::SequenceExpandOpGrad);
 REGISTER_OP_CPU_KERNEL(
     sequence_expand,
     ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/sequence_slice_op.cc b/paddle/fluid/operators/sequence_slice_op.cc
index d09e5bca56b226100d2d0cf3a030c77703bfa76e..7cd620af07fa9b5f8fcee3c0f88207ef2800c4a1 100644
--- a/paddle/fluid/operators/sequence_slice_op.cc
+++ b/paddle/fluid/operators/sequence_slice_op.cc
@@ -120,8 +120,10 @@ NOTE: The first dimension size of input, the size of offset and Length, should b
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sequence_slice, ops::SequenceSliceOp, ops::SequenceSliceOpMaker,
-            sequence_slice_grad, ops::SequenceSliceGradOp);
+REGISTER_OPERATOR(sequence_slice, ops::SequenceSliceOp,
+                  ops::SequenceSliceOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sequence_slice_grad, ops::SequenceSliceGradOp);
 REGISTER_OP_CPU_KERNEL(
     sequence_slice,
     ops::SequenceSliceOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/sequence_slice_op.cu b/paddle/fluid/operators/sequence_slice_op.cu
old mode 100755
new mode 100644
diff --git a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
index 5661f4b42f37fed7f589c515e25fd66cfcede2c7..0ddacb57106c090e8f4f9350a65a30ca102f8e0a 100644
--- a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
@@ -99,7 +99,7 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 REGISTER_OP_KERNEL(sequence_softmax, CUDNN, ::paddle::platform::CUDAPlace,
                    ops::SequenceSoftmaxCUDNNKernel<float>,
-                   ops::SequenceSoftmaxCUDNNKernel<double>)
+                   ops::SequenceSoftmaxCUDNNKernel<double>);
 REGISTER_OP_KERNEL(sequence_softmax_grad, CUDNN, ::paddle::platform::CUDAPlace,
                    ops::SequenceSoftmaxGradCUDNNKernel<float>,
-                   ops::SequenceSoftmaxGradCUDNNKernel<double>)
+                   ops::SequenceSoftmaxGradCUDNNKernel<double>);
diff --git a/paddle/fluid/operators/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_softmax_op.cc
index d2c1317bef95deca36f7f4198407f5350a1be035..a0d47c12ba606eb62bbbea4d5ea793ce915e8100 100644
--- a/paddle/fluid/operators/sequence_softmax_op.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cc
@@ -155,9 +155,10 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp,
-            ops::SequenceSoftmaxOpMaker, sequence_softmax_grad,
-            ops::SequenceSoftmaxGradOp);
+REGISTER_OPERATOR(sequence_softmax, ops::SequenceSoftmaxOp,
+                  ops::SequenceSoftmaxOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sequence_softmax_grad, ops::SequenceSoftmaxGradOp);
 REGISTER_OP_CPU_KERNEL(
     sequence_softmax,
     ops::SequenceSoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/sequence_softmax_op.cu.cc b/paddle/fluid/operators/sequence_softmax_op.cu.cc
index 57adea3a1b9dbcbb5787d005e4d3ec595f61d4b2..397df75415691e4f53bc399cd1868c3e37bc9110 100644
--- a/paddle/fluid/operators/sequence_softmax_op.cu.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cu.cc
@@ -18,7 +18,7 @@ namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     sequence_softmax,
     ops::SequenceSoftmaxKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SequenceSoftmaxKernel<paddle::platform::CUDADeviceContext, double>)
+    ops::SequenceSoftmaxKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     sequence_softmax_grad,
     ops::SequenceSoftmaxGradKernel<paddle::platform::CUDADeviceContext, float>,
diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h
index cfc8793e1e05a7d4fa9207ae77a664b391b9a986..f3e88b0a0b05ef792b2cc8e880bdfddb6e6124d1 100644
--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
@@ -107,7 +107,9 @@ class SGDOpKernel : public framework::OpKernel<T> {
       for (size_t i = 0; i < grad.rows().size(); i++) {
         PADDLE_ENFORCE(grad.rows()[i] < grad.height(),
                        "Input rows index should less than height");
-        int64_t id_index = param.index(grad.rows()[i]);
+        int64_t id_index = param.Index(grad.rows()[i]);
+        PADDLE_ENFORCE_GE(id_index, static_cast<int64_t>(0),
+                          "id should be in the table");
         for (size_t j = 0; j < grad_row_width; j++) {
           out_data[id_index * grad_row_width + j] -=
               lr[0] * grad_data[i * grad_row_width + j];
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
index 7b93f19bb2f7102824852aa181e3728f79025121..5db77d0493fc0abaa0a696cb559c3ca0534d4101 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -135,11 +135,12 @@ However the output only shares the LoD with input `X`.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sigmoid_cross_entropy_with_logits,
-            ops::SigmoidCrossEntropyWithLogitsOp,
-            ops::SigmoidCrossEntropyWithLogitsOpMaker,
-            sigmoid_cross_entropy_with_logits_grad,
-            ops::SigmoidCrossEntropyWithLogitsGradOp);
+REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits,
+                  ops::SigmoidCrossEntropyWithLogitsOp,
+                  ops::SigmoidCrossEntropyWithLogitsOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad,
+                  ops::SigmoidCrossEntropyWithLogitsGradOp);
 REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits,
                        ops::SigmoidCrossEntropyWithLogitsKernel<
                            paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc
index 658eb0195212cc3038fce6aab0ec3804efc59edf..322581fdef27b12a06704abc9c3b8772adf002f2 100644
--- a/paddle/fluid/operators/smooth_l1_loss_op.cc
+++ b/paddle/fluid/operators/smooth_l1_loss_op.cc
@@ -132,8 +132,9 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker,
-            smooth_l1_loss_grad, ops::SmoothL1LossGradOp);
+REGISTER_OPERATOR(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(smooth_l1_loss_grad, ops::SmoothL1LossGradOp);
 REGISTER_OP_CPU_KERNEL(
     smooth_l1_loss,
     ops::SmoothL1LossKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc
index dc2f1763446b2aaf72b20c72e8e37ec920abd120..d00bd1447e6114b6000b65799abb566a2a510127 100644
--- a/paddle/fluid/operators/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/softmax_mkldnn_op.cc
@@ -73,6 +73,15 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
                                    softmax_dst_memory);
     std::vector<primitive> pipeline{softmax};
     stream(stream::kind::eager).submit(pipeline).wait();
+
+    const bool is_test = ctx.Attr<bool>("is_test");
+    if (!is_test) {
+      T threshold = exp(-64);
+      for (size_t i = 0; i < dst_tz[0] * dst_tz[1]; ++i) {
+        output_data[i] =
+            output_data[i] < threshold ? threshold : output_data[i];
+      }
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index 6bdefc0f23910c90f3878d8f2634ca6e03c6f736..2741ba95bcfc1db3d74e0fb8c3f6fddf7d5a2caa 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -97,6 +97,9 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<bool>("use_mkldnn",
                   "(bool, default false) Only used in mkldnn kernel")
         .SetDefault(false);
+    AddAttr<bool>("is_test",
+                  "Disable epsilon adding to softmax results. Used by MKLDNN.")
+        .SetDefault(false);
     AddComment(R"DOC(
 Softmax Operator.
 
@@ -157,8 +160,9 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, softmax_grad,
-            ops::SoftmaxOpGrad);
+REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad);
 REGISTER_OP_CPU_KERNEL(
     softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/split_byref_op.cc b/paddle/fluid/operators/split_byref_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7413ce3e9ce60ed733bb4d27e9ec205e5f0a7e1b
--- /dev/null
+++ b/paddle/fluid/operators/split_byref_op.cc
@@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/split_byref_op.h"
+#include "paddle/fluid/operators/split_op.h"
+
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+
+class SplitByrefOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SplitOp should not be null.");
+    PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,
+                      "Outputs(Out) of SplitOp should not be empty.");
+    auto in_dims = ctx->GetInputDim("X");
+    auto outs_names = ctx->Outputs("Out");
+    size_t num = static_cast<size_t>(ctx->Attrs().Get<int>("num"));
+    std::vector<int> sections = static_cast<std::vector<int>>(
+        ctx->Attrs().Get<std::vector<int>>("sections"));
+    const size_t outs_number = outs_names.size();
+    std::vector<framework::DDim> outs_dims;
+    outs_dims.reserve(outs_number);
+
+    if (num > 0) {
+      int64_t in_axis_dim = in_dims[0];
+      PADDLE_ENFORCE_EQ(in_axis_dim % num, 0,
+                        "tensor split does not result"
+                        " in an equal division");
+      size_t out_axis_dim = in_axis_dim / num;
+      for (size_t i = 0; i < outs_number; ++i) {
+        auto dim = in_dims;
+        dim[0] = out_axis_dim;
+        outs_dims.push_back(dim);
+      }
+    } else if (sections.size() > 0) {
+      PADDLE_ENFORCE_EQ(sections.size(), outs_number,
+                        "tensor split sections size"
+                        "should be equal to output size.");
+      for (size_t i = 0; i < outs_number; ++i) {
+        auto dim = in_dims;
+        dim[0] = sections[i];
+        outs_dims.push_back(dim);
+      }
+    }
+    ctx->SetOutputsDim("Out", outs_dims);
+  }
+};
+
+class SplitByrefOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SplitByrefOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) Input tensor of the split operator.");
+    AddOutput("Out", "(Tensor) Output tensors of the split operator.")
+        .AsDuplicable();
+    AddComment(R"DOC(
+SplitByref operator
+
+Split source tensor to sevaral tensors by axis 0. No copy in this operator
+is performed, output tensor shares the same blocks of memory.
+)DOC");
+    AddAttr<std::vector<int>>("sections",
+                              "(vector<int>) "
+                              "the length of each output along the "
+                              "specified axis.")
+        .SetDefault(std::vector<int>{});
+    AddAttr<int>("num",
+                 "(int, default 0)"
+                 "Number of sub-tensors. This must evenly divide "
+                 "Input.dims()[axis]")
+        .SetDefault(0);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// NOTE: concat op default axis must be 0!
+USE_CPU_ONLY_OP(concat);
+
+REGISTER_OPERATOR(split_byref, ops::SplitByrefOp, ops::SplitByrefOpMaker,
+                  ops::SplitGradMaker);
+REGISTER_OP_CPU_KERNEL(
+    split_byref, ops::SplitByrefOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/fluid/operators/split_byref_op.cu.cc b/paddle/fluid/operators/split_byref_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5ee6186f3541b7dcb845ce0c6d28081685925da0
--- /dev/null
+++ b/paddle/fluid/operators/split_byref_op.cu.cc
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/split_byref_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    split_byref,
+    ops::SplitByrefOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/split_byref_op.h b/paddle/fluid/operators/split_byref_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..fedd7218dd6cc9481e94a92a3820cafbe4157bd0
--- /dev/null
+++ b/paddle/fluid/operators/split_byref_op.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class SplitByrefOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
+    auto place = ctx.GetPlace();
+
+    size_t row_offset = 0;
+    for (size_t i = 0; i < outs.size(); ++i) {
+      // NOTE: no need to call mutable_data here to allocate memory.
+      auto* out = outs[i];
+      VLOG(3) << "spliting by ref: " << row_offset << " " << out->dims()[0];
+      *out = in->Slice(row_offset, row_offset + out->dims()[0]);
+      row_offset += out->dims()[0];
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc
index e745509ec8c1f2ec305d7d4aabfdd43d847124b5..a4398df36bcc2d3b8bbe8949f27f5d6508861d95 100644
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -108,21 +108,6 @@ Example:
   }
 };
 
-class SplitGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto op = new framework::OpDesc();
-    op->SetType("concat");
-    op->SetInput("X", OutputGrad("Out"));
-    op->SetOutput("Out", InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
 
diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h
index e2c41f44ab3ea3c42837974dae749278c9356ba5..f0c417c70521b1bb3816f884d6ab7393473999e4 100644
--- a/paddle/fluid/operators/split_op.h
+++ b/paddle/fluid/operators/split_op.h
@@ -44,5 +44,20 @@ class SplitOpKernel : public framework::OpKernel<T> {
   }
 };
 
+class SplitGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto op = new framework::OpDesc();
+    op->SetType("concat");
+    op->SetInput("X", OutputGrad("Out"));
+    op->SetOutput("Out", InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc
index 8c55b4ebbc88f696e99b1194055bed3b0d0b3f0b..1cada95501a76da27081d533b451ce7f6a384a49 100644
--- a/paddle/fluid/operators/spp_op.cc
+++ b/paddle/fluid/operators/spp_op.cc
@@ -92,7 +92,9 @@ class SppOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(spp, ops::SppOp, ops::SppOpMaker, spp_grad, ops::SppOpGrad);
+REGISTER_OPERATOR(spp, ops::SppOp, ops::SppOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(spp_grad, ops::SppOpGrad);
 REGISTER_OP_CPU_KERNEL(
     spp, ops::SppKernel<paddle::platform::CPUDeviceContext, float>,
     ops::SppKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/squared_l2_distance_op.cc b/paddle/fluid/operators/squared_l2_distance_op.cc
index 1c5e87040a8dd74b98d8e31bfe351ea256e01f15..c32f575b541d6a6441cc1b6e999496eacef421a5 100644
--- a/paddle/fluid/operators/squared_l2_distance_op.cc
+++ b/paddle/fluid/operators/squared_l2_distance_op.cc
@@ -109,9 +109,10 @@ class SquaredL2DistanceGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(squared_l2_distance, ops::SquaredL2DistanceOp,
-            ops::SquaredL2DistanceOpMaker, squared_l2_distance_grad,
-            ops::SquaredL2DistanceGradOp);
+REGISTER_OPERATOR(squared_l2_distance, ops::SquaredL2DistanceOp,
+                  ops::SquaredL2DistanceOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(squared_l2_distance_grad, ops::SquaredL2DistanceGradOp);
 REGISTER_OP_CPU_KERNEL(
     squared_l2_distance,
     ops::SquaredL2DistanceKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/squared_l2_norm_op.cc b/paddle/fluid/operators/squared_l2_norm_op.cc
index b64df2a218860be3adb3954e07b036c05bf05c8e..4ce51259da3530367d91b5da34f06fbe5d969fce 100644
--- a/paddle/fluid/operators/squared_l2_norm_op.cc
+++ b/paddle/fluid/operators/squared_l2_norm_op.cc
@@ -67,8 +67,10 @@ $$Out = \sum_{i} X_{i}^2$$
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(squared_l2_norm, ops::SquaredL2NormOp, ops::SquaredL2NormOpMaker,
-            squared_l2_norm_grad, ops::SquaredL2NormGradOp);
+REGISTER_OPERATOR(squared_l2_norm, ops::SquaredL2NormOp,
+                  ops::SquaredL2NormOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(squared_l2_norm_grad, ops::SquaredL2NormGradOp);
 REGISTER_OP_CPU_KERNEL(
     squared_l2_norm,
     ops::SquaredL2NormKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h
index 9f8482adedb4c29e32d4109941a2752d942ae49f..d44eeae8e6ff9ac87ab093d04e3f5427743f0c08 100644
--- a/paddle/fluid/operators/top_k_op.h
+++ b/paddle/fluid/operators/top_k_op.h
@@ -24,7 +24,6 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
 
 template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
@@ -36,9 +35,9 @@ class TopkKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     // Get the top k elements of each row of input tensor
     // FIXME: only deal with matrix(2d tensor).
-    auto* input = ctx.Input<LoDTensor>("X");
-    auto* output = ctx.Output<LoDTensor>("Out");
-    auto* indices = ctx.Output<LoDTensor>("Indices");
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    auto* indices = ctx.Output<Tensor>("Indices");
     // k is determined by Attr
     const size_t k = static_cast<int>(ctx.Attr<int>("k"));
 
diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc
index 4aea9cd65bed615c84c95d891a0a4092678e1444..3555cb68cab97c0cf983f1173c3b4ca9307e4f7d 100644
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -118,8 +118,9 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(transpose, ops::TransposeOp, ops::TransposeOpMaker, transpose_grad,
-            ops::TransposeOpGrad);
+REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad);
 REGISTER_OP_CPU_KERNEL(
     transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc
index 31859fd1d70dc6e6387258cd5f7412e78a302567..b3cd87efa21115565b32659cb35fee4b5bed2d4f 100644
--- a/paddle/fluid/operators/unpool_op.cc
+++ b/paddle/fluid/operators/unpool_op.cc
@@ -132,8 +132,9 @@ class UnpoolOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad,
-            ops::UnpoolOpGrad);
+REGISTER_OPERATOR(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(unpool_grad, ops::UnpoolOpGrad);
 REGISTER_OP_CPU_KERNEL(
     unpool, ops::UnpoolKernel<paddle::platform::CPUDeviceContext, float>,
     ops::UnpoolKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc
index 940bf4fe7baa6a01a2143374b502c61d0b55fd77..6835a5dd6286ece20c4ce6f3e951ed4b0057012c 100644
--- a/paddle/fluid/operators/warpctc_op.cc
+++ b/paddle/fluid/operators/warpctc_op.cc
@@ -132,8 +132,9 @@ class WarpCTCGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker, warpctc_grad,
-            ops::WarpCTCGradOp);
+REGISTER_OPERATOR(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(warpctc_grad, ops::WarpCTCGradOp);
 REGISTER_OP_CPU_KERNEL(
     warpctc, ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index b93b925a72a55442c105e4280a3580f4ea5b93a1..364c4901b297dbd647faae85b01f682a1daace9c 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -1,7 +1,7 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
 
 list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc)
-if (WITH_TENSORRT)
+if (TENSORRT_FOUND)
   list(APPEND CUDA_SRCS tensorrt.cc)
 endif()
 
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index ca9ab2c7aecff47924f0198802d710b7661f5576..0013597fd516d15c7d502370eec77e1a6a5dca88 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -39,20 +39,19 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
 
 class NCCLGroupGuard {
  public:
+  static std::mutex &NCCLMutex() {
+    static std::mutex mtx;
+    return mtx;
+  }
+
   inline NCCLGroupGuard() {
-    mutex().lock();
+    NCCLMutex().lock();
     PADDLE_ENFORCE(dynload::ncclGroupStart());
   }
 
   inline ~NCCLGroupGuard() {
     PADDLE_ENFORCE(dynload::ncclGroupEnd());
-    mutex().unlock();
-  }
-
- private:
-  static std::mutex &mutex() {
-    static std::mutex mtx;
-    return mtx;
+    NCCLMutex().unlock();
   }
 };
 
@@ -68,26 +67,6 @@ struct NCCLContext {
   int device_id() const {
     return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device;
   }
-
-  static void InitNCCLContext(std::unordered_map<int, NCCLContext> *contexts,
-                              const std::vector<platform::Place> &places) {
-    std::vector<ncclComm_t> comms;
-    std::vector<int> devs;
-    comms.resize(contexts->size());
-    devs.reserve(contexts->size());
-
-    for (auto &p : places) {
-      devs.push_back(boost::get<platform::CUDAPlace>(p).device);
-    }
-
-    PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
-        &comms[0], static_cast<int>(contexts->size()), &devs[0]));
-
-    int i = 0;
-    for (auto &dev_id : devs) {
-      contexts->at(dev_id).comm_ = comms[i++];
-    }
-  }
 };
 
 struct NCCLContextMap {
@@ -107,12 +86,12 @@ struct NCCLContextMap {
         "NCCL Context Map does not support contain two or more same device");
 
     if (places.size() > 1) {
-      std::vector<ncclComm_t> comms;
-      comms.resize(order_.size());
-
-      PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
-          &comms[0], static_cast<int>(order_.size()), &order_[0]));
-
+      std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
+      {
+        std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
+        PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
+            comms.get(), static_cast<int>(order_.size()), order_.data()));
+      }
       int i = 0;
       for (auto &dev_id : order_) {
         contexts_.at(dev_id).comm_ = comms[i++];
@@ -120,6 +99,9 @@ struct NCCLContextMap {
     }
   }
 
+  NCCLContextMap(const NCCLContextMap &other) = delete;
+  NCCLContextMap &operator=(const NCCLContextMap &other) = delete;
+
   CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); }
 
   CUDADeviceContext *DevCtx(platform::Place p) const {
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index 93533e5c9d88a9113d4d3eacb01901a8c14b6324..6471eb3ab7bf05365c0bb2bf68bb74ef9044c527 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -127,6 +127,9 @@ void BindProgramDesc(pybind11::module *m) {
       .def("block", &pd::ProgramDesc::MutableBlock,
            pybind11::return_value_policy::reference)
       .def("num_blocks", &pd::ProgramDesc::Size)
+      .def("flush", &pd::ProgramDesc::Flush)
+      .def("get_feed_target_names", &pd::ProgramDesc::GetFeedTargetNames)
+      .def("get_fetch_target_names", &pd::ProgramDesc::GetFetchTargetNames)
       .def("serialize_to_string", SerializeMessage<pd::ProgramDesc>)
       .def("parse_from_string",
            [](pd::ProgramDesc &program_desc, const std::string &data) {
@@ -299,6 +302,7 @@ void BindOpDesc(pybind11::module *m) {
       .def("check_attrs", &pd::OpDesc::CheckAttrs)
       .def("infer_shape", &pd::OpDesc::InferShape)
       .def("infer_var_type", &pd::OpDesc::InferVarType)
+      .def("set_is_target", &pd::OpDesc::SetIsTarget)
       .def("serialize_to_string", SerializeMessage<pd::OpDesc>)
       .def("block", &pd::OpDesc::Block,
            pybind11::return_value_policy::reference);
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index a1e8ff6399f0812773a7bb753c90e4400b1763d9..1f21e7abe76b2a32d6c18e5c26c4f25b65daef5b 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -33,6 +33,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/prune.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -294,7 +295,7 @@ All parameter, weight, gradient are variables in Paddle.
                     const std::vector<std::array<size_t, 2>> &targets) {
     ProgramDesc prog_with_targets(origin);
     for (const auto &t : targets) {
-      prog_with_targets.MutableBlock(t[0])->Op(t[1])->MarkAsTarget();
+      prog_with_targets.MutableBlock(t[0])->Op(t[1])->SetIsTarget(true);
     }
     proto::ProgramDesc pruned_desc;
     Prune(*prog_with_targets.Proto(), &pruned_desc);
@@ -461,6 +462,9 @@ All parameter, weight, gradient are variables in Paddle.
         self.back().set_lod(t.lod());
       });
 
+  m.def("IsInplace",
+        [](std::string op) -> bool { return operators::IsInplace(op); });
+
   m.def("op_support_gpu", OpSupportGPU);
 #ifdef PADDLE_WITH_CUDA
   m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
@@ -505,11 +509,19 @@ All parameter, weight, gradient are variables in Paddle.
                                   scope, local_scopes, allow_op_delay);
            })
       .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
+      // NOTE: even we return a vec<Scope*>* to Python use reference policy.
+      // We still cannot get local_scope from this vector, since the element
+      // of vec<Scope*> will be freed by Python GC. We can only return Scope*
+      // one by one and mark them as reference.
       .def("local_scopes",
            [](ParallelExecutor &self) -> std::vector<Scope *> * {
              return &self.GetLocalScopes();
            },
            py::return_value_policy::reference)
+      .def("feed_tensors_into_local_scopes",
+           &ParallelExecutor::FeedTensorsIntoLocalScopes)
+      .def("feed_and_split_tensor_into_local_scopes",
+           &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes)
       .def("run", &ParallelExecutor::Run);
 
   BindRecordIOWriter(&m);
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index 4a9dbd324c90380e784cc9457845fabd858585be..159d1d5f4e70033fabf93514bd63b38f83675bff 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -190,6 +190,11 @@ void PyCUDATensorSetFromArray(
       static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
   paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
                                    cudaMemcpyHostToDevice, dev_ctx->stream());
+  // NOTE: For safety, here wait the copy complete.
+  // It because the CPU array.data() could be destroyed after this method.
+  // If we make this method async, it could be copied data from a memory buffer
+  // that has been freed.
+  dev_ctx->Wait();
 }
 
 template <>
@@ -216,6 +221,11 @@ void PyCUDATensorSetFromArray(
   paddle::platform::GpuMemcpyAsync(dst, array.data(),
                                    sizeof(uint16_t) * array.size(),
                                    cudaMemcpyHostToDevice, dev_ctx->stream());
+  // NOTE: For safety, here wait the copy complete.
+  // It because the CPU array.data() could be destroyed after this method.
+  // If we make this method async, it could be copied data from a memory buffer
+  // that has been freed.
+  dev_ctx->Wait();
 }
 
 template <typename T>
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 2b2a904974f3756576fb47851400e344c9357c57..94628270228b9e7fd32405bdcb5e11c163ba4791 100755
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -155,7 +155,7 @@ EOF
 function gen_dockerfile() {
     # Set BASE_IMAGE according to env variables
     if [[ ${WITH_GPU} == "ON" ]]; then
-    BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
+    BASE_IMAGE="nvidia/cuda:8.0-cudnn7-runtime-ubuntu16.04"
     else
     BASE_IMAGE="ubuntu:16.04"
     fi
@@ -164,7 +164,7 @@ function gen_dockerfile() {
     DOCKERFILE_CUDNN_DSO=""
     if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
         DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
-        DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
+        DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/x86_64-linux-gnu/libcudnn.so"
     fi
 
     cat <<EOF
@@ -207,8 +207,14 @@ EOF
     ${DOCKERFILE_CUDNN_DSO}
     ${DOCKERFILE_GPU_ENV}
     ENV NCCL_LAUNCH_MODE PARALLEL
-    ADD go/cmd/pserver/pserver /usr/bin/
-    ADD go/cmd/master/master /usr/bin/
+EOF
+    if [[ ${WITH_GOLANG:-OFF} == "ON" ]]; then
+        cat >> /paddle/build/Dockerfile <<EOF
+        ADD go/cmd/pserver/pserver /usr/bin/
+        ADD go/cmd/master/master /usr/bin/
+EOF
+    fi
+    cat >> /paddle/build/Dockerfile <<EOF
     # default command shows the paddle version and exit
     CMD [${CMD}]
 EOF
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index c7c0812fe2238d48903aa4c75bb8f1e9ecdb16c9..ea25f3ab351ca1feb085a8fbbfe53d8cee397bbf 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -46,11 +46,11 @@ endif()
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
 
-
-add_custom_command(OUTPUT ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so
-        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so
+set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so)
+add_custom_command(OUTPUT ${FLUID_CORE}
+        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
         DEPENDS paddle_pybind)
-add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so)
+add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE})
 
 
 add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
@@ -61,7 +61,7 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
     COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
-    DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
+    DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 
 set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS})
 if(NOT WITH_FLUID_ONLY)
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index f757411b853bacb9e03fc42fa2ef6593c3cde00f..e2502990d5b78eb0db7bdfd0c8ef9fb6688016df 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -37,6 +37,7 @@ from distribute_transpiler import DistributeTranspiler
 from distribute_transpiler_simple import SimpleDistributeTranspiler
 from concurrency import (Go, make_channel, channel_send, channel_recv,
                          channel_close, Select)
+from inference_transpiler import InferenceTranspiler
 import clip
 from memory_optimization_transpiler import memory_optimize, release_memory
 import profiler
@@ -66,6 +67,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [
     'clip',
     'SimpleDistributeTranspiler',
     'DistributeTranspiler',
+    'InferenceTranspiler',
     'memory_optimize',
     'release_memory',
     'profiler',
@@ -105,7 +107,8 @@ def __bootstrap__():
     os.environ['OMP_NUM_THREADS'] = str(num_threads)
 
     read_env_flags = [
-        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir'
+        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
+        'eager_delete_scope'
     ]
     if core.is_compiled_with_cuda():
         read_env_flags += ['fraction_of_gpu_memory_to_use']
diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py
index aa15392d7e4901e8ee23ad5b4370542232adc2a5..d07e0f696e79cfb98efc09a9f40d7961678b6af4 100644
--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
@@ -18,7 +18,7 @@ import math
 
 import distributed_splitter as splitter
 import framework
-from framework import Program, default_main_program, Variable
+from framework import Program, default_main_program, Variable, Parameter
 from . import core
 
 LOOKUP_TABLE_TYPE = "lookup_table"
@@ -222,8 +222,14 @@ class DistributeTranspiler:
 
         # step1: For large parameters and gradients, split them into smaller
         # blocks.
-        param_list = [pg[0] for pg in params_grads]
-        grad_list = [pg[1] for pg in params_grads]
+        param_list = []
+        grad_list = []
+        for p, g in params_grads:
+            # skip parameter marked not trainable
+            if type(p) == Parameter and p.trainable == False:
+                continue
+            param_list.append(p)
+            grad_list.append(g)
 
         if self.has_distributed_lookup_table:
             param_list = [
@@ -362,21 +368,19 @@ class DistributeTranspiler:
             else:
                 recv_inputs.append(single_trainer_var)
 
-        # step3
-        optimize_block = pserver_program.create_block(0)
-        # step 4
+        # step 3
         # Create a union-find data structure from optimize ops,
         # If two ops are connected, we could add these two ops
         # into one set.
         ufind = self._create_ufind(self.optimize_ops)
-        # step 4.2
+        # step 3.2
         # Iterate through the ops and append optimize op which
         # located on current pserver
         opt_op_on_pserver = []
         for _, op in enumerate(self.optimize_ops):
             if self._is_opt_op(op) and self._is_opt_op_on_pserver(endpoint, op):
                 opt_op_on_pserver.append(op)
-        # step 4.3
+        # step 3.3
         # Iterate through the ops, and if an op and the optimize ops
         # which located on current pserver are in one set, then
         # append it into the sub program.
@@ -409,28 +413,30 @@ class DistributeTranspiler:
             else:
                 self._append_pserver_non_opt_ops(block, op)
 
-        append_block = optimize_block
         # append lr decay ops to the child block if exists
         lr_ops = self._get_lr_ops()
         if len(lr_ops) > 0:
+            lr_decay_block = pserver_program.create_block(
+                pserver_program.num_blocks - 1)
             for _, op in enumerate(lr_ops):
-                self._append_pserver_non_opt_ops(append_block, op)
-
-            append_block = pserver_program.create_block(append_block.idx)
+                self._append_pserver_non_opt_ops(lr_decay_block, op)
 
         # append op to the current block
-        per_opt_block = append_block
-        for _, opt_op in enumerate(opt_op_on_pserver):
+        pre_block_idx = pserver_program.num_blocks - 1
+        for idx, opt_op in enumerate(opt_op_on_pserver):
+            per_opt_block = pserver_program.create_block(pre_block_idx)
             for _, op in enumerate(self.optimize_ops):
                 # optimizer is connected to itself
-                if ufind.is_connected(op, opt_op) and \
-                    op not in global_ops:
+                if ufind.is_connected(op, opt_op) and op not in global_ops:
                     __append_optimize_op__(op, per_opt_block)
-            per_opt_block = pserver_program.create_block(append_block.idx)
 
         # append global ops
-        for glb_op in global_ops:
-            __append_optimize_op__(glb_op, per_opt_block)
+        opt_state_block = None
+        if global_ops:
+            opt_state_block = pserver_program.create_block(
+                pserver_program.num_blocks - 1)
+            for glb_op in global_ops:
+                __append_optimize_op__(glb_op, opt_state_block)
 
         # NOT USED: single block version:
         #
@@ -444,10 +450,10 @@ class DistributeTranspiler:
         prefetch_block = None
         if self.has_distributed_lookup_table:
             pserver_index = self.pserver_endpoints.index(endpoint)
-            self._create_table_optimize_block(pserver_index, pserver_program,
-                                              append_block)
+            table_opt_block = self._create_table_optimize_block(
+                pserver_index, pserver_program, pre_block_idx)
             prefetch_block = self._create_prefetch_block(
-                pserver_index, pserver_program, optimize_block)
+                pserver_index, pserver_program, table_opt_block)
 
         # NOTE: if has_distributed_lookup_table is False, then prefetch_block will
         # not be executed, so it's safe to use optimize_block to hold the place
@@ -463,7 +469,7 @@ class DistributeTranspiler:
             inputs={'X': recv_inputs},
             outputs={},
             attrs={
-                "OptimizeBlock": optimize_block,
+                "OptimizeBlock": pserver_program.block(1),
                 "endpoint": endpoint,
                 "Fanin": self.trainer_num,
                 "PrefetchBlock": prefetch_block
@@ -656,7 +662,7 @@ class DistributeTranspiler:
         return prefetch_block
 
     def _create_table_optimize_block(self, pserver_index, pserver_program,
-                                     append_block):
+                                     pre_block_idx):
         def _clone_var(block, var, persistable=True):
             assert isinstance(var, Variable)
             return block.create_var(
@@ -693,7 +699,7 @@ class DistributeTranspiler:
             op for op in self.optimize_ops
             if op.input("Param")[0] == self.table_name
         ][0]
-        table_opt_block = pserver_program.create_block(append_block.idx)
+        table_opt_block = pserver_program.create_block(pre_block_idx)
         # only support sgd now
         assert table_opt_op.type == "sgd"
 
@@ -717,6 +723,8 @@ class DistributeTranspiler:
             outputs=outputs,
             attrs=table_opt_op.attrs)
 
+        return table_opt_block
+
     # ====================== private transpiler functions =====================
     def _create_vars_from_blocklist(self,
                                     program,
@@ -824,7 +832,7 @@ class DistributeTranspiler:
                 for v in splited_vars:
                     sections.append(v.shape[0])
                 program.global_block().append_op(
-                    type="split",
+                    type="split_byref",
                     inputs={"X": orig_var},
                     outputs={"Out": splited_vars},
                     attrs={"sections": sections}  # assume split evenly
diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py
index 13475025b5c2a759779066f9d511ed8a786118d5..1ee1d3727174c079d2c217dede27ff1a0316c01c 100644
--- a/python/paddle/fluid/evaluator.py
+++ b/python/paddle/fluid/evaluator.py
@@ -22,7 +22,6 @@ from layer_helper import LayerHelper
 from initializer import Constant
 
 __all__ = [
-    'Accuracy',
     'ChunkEvaluator',
     'EditDistance',
     'DetectionMAP',
@@ -273,7 +272,7 @@ class DetectionMAP(Evaluator):
         input (Variable): The detection results, which is a LoDTensor with shape
             [M, 6]. The layout is [label, confidence, xmin, ymin, xmax, ymax].
         gt_label (Variable): The ground truth label index, which is a LoDTensor
-            with shape [N, 1]. 
+            with shape [N, 1].
         gt_difficult (Variable): Whether this ground truth is a difficult
             bounding box (bbox), which is a LoDTensor [N, 1].
         gt_box (Variable): The ground truth bounding box (bbox), which is a
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 54d0a12bcdbb1b6c13e584dd1a3a5d73cddd4af7..7ad028714d3b47d93328dbf7c3297d55a2db1bd0 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -151,7 +151,7 @@ def fetch_var(name, scope=None, return_numpy=True):
         scope = global_scope()
     assert isinstance(scope, core.Scope)
 
-    var = global_scope().find_var(name)
+    var = scope.find_var(name)
     assert var is not None, (
         "Cannot find " + name + " in scope. Perhaps you need to make the"
         " variable persistable by using var.persistable = True in your"
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 4b841ef31dcb67ab660475cf6e231fd8a4ae83d6..340882ea9e7b0e2a0c52749c771308c6b860ed07 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1070,6 +1070,12 @@ class Program(object):
         for t in targets:
             if not isinstance(t, Operator):
                 if isinstance(t, Variable):
+                    if t.op is None:
+                        global_block = self.global_block()
+                        for op in global_block.ops:
+                            if t.name in op.output_arg_names:
+                                t.op = op
+                                break
                     t = t.op
                 else:
                     raise ValueError(("All targets of prune() can only be "
@@ -1101,6 +1107,10 @@ class Program(object):
     def random_seed(self):
         return self._seed
 
+    @property
+    def num_blocks(self):
+        return self.desc.num_blocks()
+
     @random_seed.setter
     def random_seed(self, seed):
         if not isinstance(seed, int):
diff --git a/python/paddle/fluid/inference_transpiler.py b/python/paddle/fluid/inference_transpiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..39b01610f96018e1775405a30147e77006cecc16
--- /dev/null
+++ b/python/paddle/fluid/inference_transpiler.py
@@ -0,0 +1,240 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from framework import Program
+from executor import global_scope
+from . import core
+
+
+class InferenceTranspiler:
+    def transpile(self, program, place, scope=None):
+        '''
+        Transpile the program. Support only fuse batch normalization now.
+
+        :param program: program to transpile 
+        :type program: Program
+        :param place: inference place 
+        :type place: Place
+        :param scope: inference scope 
+        :type scope: Scope or None
+        '''
+        if not isinstance(program, Program):
+            raise TypeError("program should be as Program type")
+        if not isinstance(place, core.CPUPlace) and not isinstance(
+                place, core.CUDAPlace):
+            raise TypeError("place should be as CPUPlace/CUDAPlace type")
+        if scope is None:
+            scope = global_scope()
+        if not isinstance(scope, core.Scope):
+            raise TypeError("scope should be as Scope type or None")
+        self.fuse_batch_norm(program, place, scope)
+
+    def fuse_batch_norm(self, program, place, scope):
+        '''
+        Transpile the program by fused batch normalization.
+ 
+        The batch normalization followed the convolution or fully connected layer 
+        can be integrated with them. Doing so will give us a forward acceleration, 
+        especially in environments like mobile or embedded.
+                    
+        For input X:
+        - Conv process:        X = input * W + bias 
+        - Batch norm process:  X' = (X - mean) / std 
+        - Scale Process:       Y = a * X' + b
+
+        After fuse into one operation:
+
+        Y = (input * W + bias - mean) / std * a + b
+          = input * a * W / std + ((bias - mean) / std * a + b)
+
+        The operator transformation is: 
+        - before:
+          - conv->batch_norm->any_other_op (bias == 0)
+          - conv->elementwise_add->batch_norm->any_other_op (bias != 0)
+        - after: 
+          - conv->elementwise_add->any_other_op
+        
+        The transpile stages are:
+        1. insert elementwise_add op when bias == 0.
+        2. fuse the batch_norm's parameters to conv and elementwise_add operators.
+        3. remove batch_norm ops which are not used in any other ops.
+        4. adjust the input of any_other_op to be the output of elementwise_add operator.
+        5. remove unused variables.
+
+        :param program: program to transpile 
+        :type program: Program
+        :param place: inference place 
+        :type place: Place
+        :param scope: inference scope 
+        :type scope: Scope
+        '''
+        self.scope = scope
+        self.place = place
+        self.block = program.block(0)
+        self.input_map = {}  # store the input names should be adjusted 
+
+        i = 0
+        while i < len(self.block.ops):
+            current_op = self.block.ops[i]
+            # TODO(luotao1): consider only conv2d now. fc would be delt later.
+            if current_op.type in ['conv2d']:
+                # TODO(luotao1): consider single chain network now. 
+                # For branch network, we counldn't use block.ops[i + 1] as 
+                # the judgment condition.
+                next_op = self.block.ops[i + 1]
+                # conv2d without bias
+                if (next_op.type == 'batch_norm'):
+                    # insert bias op
+                    bias_op = self._insert_bias_op(i + 1, current_op, next_op)
+                    # fuse batch_norm
+                    self._fuse_param(current_op, next_op, bias_op, 0)
+                    # remove batch_norm_op
+                    self.block.remove_op(i + 2)
+                    i = i + 1
+                # conv2d with bias, the next_op.type is elementwise_add
+                elif (next_op.type == 'elementwise_add'):
+                    next_next_op = self.block.ops[i + 2]
+                    if (next_next_op.type == 'batch_norm'):
+                        # fuse batch_norm
+                        self._fuse_param(current_op, next_next_op, next_op, 1)
+                        # remove batch_norm_op
+                        self.block.remove_op(i + 2)
+                        i = i + 1
+            i = i + 1
+
+        self._adjust_input()
+        self._remove_unused_var()
+        # TODO(luotao): use clone() method to flush the program.desc in force, 
+        # since some large program.desc will not be flushed immediately. 
+        # And a better solution will be considered later.
+        program = program.clone()
+
+    # ====================== private transpiler functions =====================
+    def _insert_bias_op(self, index, current_op, bn_op):
+        '''
+        Construct elementwise_add operator for adding bias 
+        and insert it into program.
+        
+        :param index: insert location of bias_op
+        :type index: Int
+        :param current_op: current operator (conv or fc)
+        :type current_op: Operator
+        :param bn_op: batch norm operator
+        :type bn_op: Operator
+        :return: bias_op
+        :rtype: Operator
+        '''
+        # The input of bias_op is current_op's output and Bias of bn_op
+        # The output of bias_op is bn_op's output
+        x_var = self.block.var(current_op.output("Output")[0])
+        y_var = self.block.var(bn_op.input("Bias")[0])
+        out_var = self.block.var(bn_op.output("Y")[0])
+
+        bias_op = self.block.insert_op(
+            index,
+            type="elementwise_add",
+            inputs={"X": x_var,
+                    "Y": y_var},
+            outputs={"Out": out_var},
+            attrs={"axis": 1})  # dim_start=1
+        return bias_op
+
+    def _fuse_param(self, current_op, bn_op, bias_op, with_bias):
+        '''
+        fuse the batch_norm_op' parameters to current_op (conv or fc)
+        
+        :param current_op: current operator (conv or fc)
+        :type current_op: Operator
+        :param bn_op: batch norm operator
+        :type bn_op: Operator
+        :param bias_op: elementwise_add operator for adding bias
+        :type bias_op: Operator
+        :param with_bias: If current operator has bias, with_bias = 1; otherwise 0. 
+        :type with_bias: Int
+        '''
+
+        def _update_param(op, old_param_name, new_param):
+            # For the sake of remaining the original variables the same as before,
+            # create new variables in scope to store the new parameters.
+            old_param_name = old_param_name[0]
+            old_var = self.block.vars[old_param_name]
+            new_param_name = old_param_name + '_fuse_bn'
+            new_var = self.block.create_parameter(
+                name=new_param_name.encode('ascii'),
+                type=old_var.type,
+                dtype=old_var.dtype,
+                shape=old_var.shape)
+            op.rename_input(old_param_name, new_param_name)
+            self.scope.var(new_param_name)
+
+            tensor = self.scope.find_var(new_param_name).get_tensor()
+            tensor.set(np.array(new_param), self.place)
+
+        def _load_param(param_name):
+            return np.array(self.scope.find_var(param_name[0]).get_tensor())
+
+        bias_bn = _load_param(bn_op.input("Bias"))  #Bias
+        scale_bn = _load_param(bn_op.input("Scale"))  #Scale
+        mean_bn = _load_param(bn_op.input("Mean"))  #Mean
+        var_bn = _load_param(bn_op.input("Variance"))  #Variance
+
+        # TODO(luotao1): consider only conv2d now. fc would be delt later.
+        current_param = _load_param(current_op.input("Filter"))
+        std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5)))
+        tmp = np.float32(np.divide(scale_bn, std_bn))
+
+        # add bias of batch_norm_op to conv2d
+        if with_bias:
+            bias = _load_param(bias_op.input("Y"))
+        else:
+            bias = np.zeros(bias_bn.shape)
+        bias = np.float32(
+            np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn))
+
+        # re-compute weight of conv2d
+        tmp = tmp.reshape(tmp.shape[0], -1)
+        dst_param = current_param.reshape((tmp.shape[0], -1))
+        dst_param = np.float32(np.multiply(dst_param, tmp))
+        dst_param = dst_param.reshape(current_param.shape)
+
+        # update parameters
+        _update_param(current_op, current_op.input("Filter"), dst_param)
+        _update_param(bias_op, bias_op.input("Y"), bias)
+
+        # collect the renamed input
+        self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0]
+
+    def _adjust_input(self):
+        for i in range(len(self.block.ops)):
+            current_op = self.block.ops[i]
+            for input_arg in current_op.input_arg_names:
+                if input_arg in self.input_map:
+                    current_op.rename_input(input_arg,
+                                            self.input_map[input_arg])
+
+    def _remove_unused_var(self):
+        '''
+        remove unused varibles in program
+        '''
+        args = []
+        for i in range(len(self.block.ops)):
+            current_op = self.block.ops[i]
+            args += current_op.input_arg_names
+            args += current_op.output_arg_names
+        args = list(set(args))  # unique the input and output arguments
+
+        for var in self.block.vars.keys():
+            if var not in args:
+                self.block.remove_var(var)
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 1c0f1f6eb415b1c05c1052c1f52743a19c49f017..f7f1ca2598a3e679b24fa8d62c52e4f4de788fe2 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -336,11 +336,20 @@ def save_inference_model(dirname,
 
     if main_program is None:
         main_program = default_main_program()
+    copy_program = main_program
 
     if not os.path.isdir(dirname):
         os.makedirs(dirname)
 
-    pruned_program = main_program.prune(targets=target_vars)
+    # Clear the is_target information and remove the existed feed and fetch op
+    global_block = copy_program.global_block()
+    for i, op in enumerate(global_block.ops):
+        op.desc.set_is_target(False)
+        if op.type == "feed" or op.type == "fetch":
+            global_block.remove_op(i)
+    copy_program.desc.flush()
+
+    pruned_program = copy_program.prune(targets=target_vars)
     inference_program = pruned_program.inference_optimize()
     fetch_var_names = [v.name for v in target_vars]
 
@@ -362,24 +371,6 @@ def save_inference_model(dirname,
     save_persistables(executor, dirname, inference_program, params_filename)
 
 
-def get_feed_targets_names(program):
-    feed_targets_names = []
-    global_block = program.global_block()
-    for op in global_block.ops:
-        if op.desc.type() == 'feed':
-            feed_targets_names.insert(0, op.desc.output('Out')[0])
-    return feed_targets_names
-
-
-def get_fetch_targets_names(program):
-    fetch_targets_names = []
-    global_block = program.global_block()
-    for op in global_block.ops:
-        if op.desc.type() == 'fetch':
-            fetch_targets_names.append(op.desc.input('X')[0])
-    return fetch_targets_names
-
-
 def load_inference_model(dirname,
                          executor,
                          model_filename=None,
@@ -418,8 +409,8 @@ def load_inference_model(dirname,
     program = Program.parse_from_string(program_desc_str)
     load_persistables(executor, dirname, program, params_filename)
 
-    feed_target_names = get_feed_targets_names(program)
-    fetch_target_names = get_fetch_targets_names(program)
+    feed_target_names = program.desc.get_feed_target_names()
+    fetch_target_names = program.desc.get_fetch_target_names()
     fetch_targets = [
         program.global_block().var(name) for name in fetch_target_names
     ]
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index d771837fc545167f7c32fcf914dd1c3c3ae64fb3..62933b512529bd04fab8c9ded12e636ecfae685c 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -19,6 +19,7 @@ from framework import Variable, Parameter, default_main_program, default_startup
 import unique_name
 from paddle.fluid.initializer import Constant, Xavier
 from param_attr import ParamAttr, WeightNormParamAttr
+import core
 
 
 class LayerHelper(object):
@@ -398,13 +399,16 @@ class LayerHelper(object):
             return input_var
         if isinstance(act, basestring):
             act = {'type': act}
-        tmp = self.create_tmp_variable(dtype=input_var.dtype)
 
         if 'use_mkldnn' in self.kwargs:
             act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
         act_type = act.pop('type')
         if 'use_mkldnn' in self.kwargs:
             act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
+        tmp = input_var
+        # NOTE(dzhwinter): some activation support inplace compution.
+        if not core.IsInplace(act_type):
+            tmp = self.create_tmp_variable(dtype=input_var.dtype)
         self.append_op(
             type=act_type,
             inputs={"X": [input_var]},
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index b9a53eda9144e9e56cf9bc626db40cf4225bd87f..4b707973e27391a6bdcba138934f62a255e04bb2 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -32,7 +32,6 @@ __all__ = [
     'Switch',
     'lod_rank_table',
     'max_sequence_len',
-    'topk',
     'lod_tensor_to_array',
     'array_to_lod_tensor',
     'increment',
@@ -751,43 +750,6 @@ def max_sequence_len(rank_table):
     return res
 
 
-def topk(input, k):
-    """
-    **topk**
-
-    This function performs the operation that selects the k entries in the input
-    vector and outputs their values and indices as vectors. Thus topk_out[j] is
-    the j-th largest entry in input, and its index is topk_indices[j]
-
-    Args:
-        input (Variable|list): The input tensor that has all the data.
-        k (int): The number of top elements that the function will pick.
-
-    Returns:
-        Variable: The variable of type array that contains the k largest entries
-                  from input.
-        Variable: The variable of type array that contains the indices of k
-                  largest entries from input.
-
-    Examples:
-        .. code-block:: python
-
-          x = fluid.layers.data(name='x', shape=[10])
-          k = 5
-          array = fluid.layers.topk(x, k)
-    """
-    helper = LayerHelper('topk', **locals())
-    topk_out = helper.create_tmp_variable(dtype=input.dtype)
-    topk_indices = helper.create_tmp_variable(dtype='int64')
-    helper.append_op(
-        type='top_k',
-        inputs={'X': [input]},
-        outputs={'Out': [topk_out],
-                 'Indices': [topk_indices]},
-        attrs={'k': k})
-    return topk_out, topk_indices
-
-
 def lod_tensor_to_array(x, table):
     """ Convert a LOD_TENSOR to an LOD_TENSOR_ARRAY.
 
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index e7d6c4e2521bee133c4794ed1db669b02fc2152b..34382fb9fecdc256ae8fe3fcdaf1effd6e2597cb 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from .. import core
-from ..framework import convert_np_dtype_to_dtype_, default_main_program, default_startup_program
+from ..framework import convert_np_dtype_to_dtype_, default_main_program, default_startup_program, Program
 from ..unique_name import generate as unique_name
 from control_flow import BlockGuard
 from ..layer_helper import LayerHelper
@@ -21,7 +21,7 @@ from ..executor import global_scope
 
 __all__ = [
     'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
-    'open_files', 'read_file', 'shuffle', 'double_buffer'
+    'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer'
 ]
 
 
@@ -158,6 +158,7 @@ class ListenAndServ(object):
         main_program = self.helper.main_program
         current_block = main_program.current_block()
         parent_block = self.parent_block()
+        empty_block = Program().global_block()
 
         parent_block.append_op(
             type='listen_and_serv',
@@ -166,11 +167,12 @@ class ListenAndServ(object):
             attrs={
                 'endpoint': self.endpoint,
                 'Fanin': self.fan_in,
-                'OptimizeBlock': current_block
+                'OptimizeBlock': current_block,
+                'PrefetchBlock': empty_block
             })
 
 
-def Send(endpoints, send_vars, get_vars):
+def Send(endpoints, send_vars, get_vars=None):
     """
     Send layer
 
@@ -184,7 +186,6 @@ def Send(endpoints, send_vars, get_vars):
     side when server have finished running server side program.
     """
     assert (type(send_vars) == list)
-    assert (type(get_vars) == list)
 
     epmap = endpoints.split(",")
     endpoints = list(set(epmap))
@@ -192,6 +193,11 @@ def Send(endpoints, send_vars, get_vars):
     helper = LayerHelper("Send", **locals())
     rpc_client_var = default_main_program().global_block().create_var(
         name="RPC_CLIENT_VAR", persistable=True, type=core.VarDesc.VarType.RAW)
+    if not get_vars:
+        get_vars = []
+        for s in send_vars:
+            v = helper.create_tmp_variable(dtype=s.dtype, stop_gradient=True)
+            get_vars.append(v)
 
     helper.append_op(
         type="send",
@@ -200,6 +206,7 @@ def Send(endpoints, send_vars, get_vars):
                  "RPCClient": rpc_client_var},
         attrs={"endpoints": endpoints,
                "epmap": epmap})
+    return get_vars
 
 
 def Recv(endpoints, get_vars):
@@ -283,7 +290,7 @@ def open_recordio_file(filename,
                        lod_levels,
                        dtypes,
                        pass_num=1,
-                       for_parallel=False):
+                       for_parallel=True):
     """
     Open a RecordIO file
 
@@ -357,7 +364,7 @@ def open_files(filenames,
                thread_num,
                buffer_size=None,
                pass_num=1,
-               for_parallel=False):
+               for_parallel=True):
     """
     Open files
 
@@ -469,6 +476,11 @@ def shuffle(reader, buffer_size):
         'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
 
 
+def batch(reader, batch_size):
+    return __create_unshared_decorated_reader__(
+        'create_batch_reader', reader, {'batch_size': int(batch_size)})
+
+
 def double_buffer(reader, place=None):
     attrs = dict()
     if place is not None:
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index 65b95a58d6546ed6d6b264443a7c802e16eef23f..d13c54daa5a985e2e1bf9357630fe29d24a17bb4 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -20,7 +20,7 @@ from ..initializer import init_on_cpu
 
 __all__ = [
     'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
-    'polynomial_decay', 'piecewise_decay'
+    'polynomial_decay', 'piecewise_decay', 'noam_decay'
 ]
 """
 When training a model, it's often useful to decay the
@@ -32,14 +32,41 @@ strategy according to this module.
 """
 
 
-def _decay_step_counter():
+def _decay_step_counter(begin=0):
     # the first global step is zero in learning rate decay
     global_step = nn.autoincreased_step_counter(
-        counter_name='@LR_DECAY_COUNTER@', begin=0, step=1)
+        counter_name='@LR_DECAY_COUNTER@', begin=begin, step=1)
     global_step = tensor.cast(global_step, 'float32')
     return global_step
 
 
+def noam_decay(d_model, warmup_steps):
+    """Apply decay to learning rate.
+    ```python
+    lr_value = np.power(d_model, -0.5) * np.min([
+            np.power(current_steps, -0.5),
+            np.power(warmup_steps, -1.5) * current_steps
+        ])
+    ```
+
+    Args:
+        d_model(Variable): The dimensionality of input and output of model.
+            Reference: attention is all you need
+                https://arxiv.org/pdf/1706.03762.pdf
+        warmup_steps(Variable): A super parameter.
+
+    Returns:
+        The decayed learning rate.
+    """
+    global_step = _decay_step_counter(1)
+    with init_on_cpu():
+        a = global_step**-0.5
+        b = (warmup_steps**-1.5) * global_step
+        lr_value = (d_model**-0.5) * ops.elementwise_min(a, b)
+
+    return lr_value
+
+
 def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     """Applies exponential decay to the learning rate.
 
diff --git a/python/paddle/fluid/layers/metric.py b/python/paddle/fluid/layers/metric.py
index f66dccfa2d040ea0a9d29daeaa1d2da640525959..cab2eb55510542bdd4dd7eca7667601697759181 100644
--- a/python/paddle/fluid/layers/metric.py
+++ b/python/paddle/fluid/layers/metric.py
@@ -20,6 +20,7 @@ from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
 from ..param_attr import ParamAttr
+import nn
 
 __all__ = ['accuracy', 'auc']
 
@@ -27,17 +28,10 @@ __all__ = ['accuracy', 'auc']
 def accuracy(input, label, k=1, correct=None, total=None):
     """
     This function computes the accuracy using the input and label.
-    The output is the top_k inputs and their indices.
+    The output is the top k inputs and their indices.
     """
     helper = LayerHelper("accuracy", **locals())
-    topk_out = helper.create_tmp_variable(dtype=input.dtype)
-    topk_indices = helper.create_tmp_variable(dtype="int64")
-    helper.append_op(
-        type="top_k",
-        inputs={"X": [input]},
-        outputs={"Out": [topk_out],
-                 "Indices": [topk_indices]},
-        attrs={"k": k})
+    topk_out, topk_indices = nn.topk(input, k=k)
     acc_out = helper.create_tmp_variable(dtype="float32")
     if correct is None:
         correct = helper.create_tmp_variable(dtype="int64")
@@ -68,12 +62,7 @@ def auc(input, label, curve='ROC', num_thresholds=200):
     helper = LayerHelper("auc", **locals())
     topk_out = helper.create_tmp_variable(dtype=input.dtype)
     topk_indices = helper.create_tmp_variable(dtype="int64")
-    helper.append_op(
-        type="top_k",
-        inputs={"X": [input]},
-        outputs={"Out": [topk_out],
-                 "Indices": [topk_indices]},
-        attrs={"k": k})
+    topk_out, topk_indices = nn.topk(input, k=k)
     auc_out = helper.create_tmp_variable(dtype="float32")
     if correct is None:
         correct = helper.create_tmp_variable(dtype="int64")
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index bba8b64bd88c3edc6eda110dde38c0ced50439f6..5e6abceb0a8c2a97a804d6563b5390a245208e3f 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -60,6 +60,7 @@ __all__ = [
     'edit_distance',
     'l2_normalize',
     'matmul',
+    'topk',
     'warpctc',
     'sequence_reshape',
     'transpose',
@@ -88,6 +89,7 @@ def fc(input,
        bias_attr=None,
        use_mkldnn=False,
        act=None,
+       is_test=False,
        name=None):
     """
     **Fully Connected Layer**
@@ -134,6 +136,7 @@ def fc(input,
         bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
             of this layer. If it is set to None, no bias will be added to the output units.
         act (str, default None): Activation to be applied to the output of this layer.
+        is_test(bool): A flag indicating whether execution is in test phase.
         use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn
             library is installed. Default: False
         name (str, default None): The name of this layer.
@@ -177,8 +180,11 @@ def fc(input,
             inputs={"Input": input,
                     "W": w},
             outputs={"Out": tmp},
-            attrs={"use_mkldnn": use_mkldnn,
-                   "bias_attr": bias_attr})
+            attrs={
+                "use_mkldnn": use_mkldnn,
+                "is_test": is_test,
+                "bias_attr": bias_attr
+            })
         return helper.append_activation(tmp)
     else:
         for input_var, param_attr in helper.iter_inputs_and_params():
@@ -2571,6 +2577,53 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
     return out
 
 
+def topk(input, k):
+    """
+    This operator is used to find values and indices of the k largest entries
+    for the last dimension.
+
+    If the input is a vector (rank=1), finds the k largest entries in the vector
+    and outputs their values and indices as vectors. Thus values[j] is the j-th
+    largest entry in input, and its index is indices[j].
+
+    If the input is a Tensor with higher rank, this operator computes the top k
+    entries along the last dimension.
+
+    Args:
+        input(Variable): The input variable which can be a vector or Tensor with
+            higher rank.
+        k(int): An integer value to specify the top k largest elements.
+
+    Returns:
+        values(Variable): The k largest elements along each last dimensional
+            slice.
+        indices(Variable): The indices of values within the last dimension of
+            input.
+
+    Examples:
+        .. code-block:: python
+
+            top5_values, top5_indices = layers.topk(input, k=5)
+    """
+    shape = input.shape
+    if k < 1 and k >= shape[-1]:
+        raise ValueError("k must be greater than 0 and less than %d." %
+                         (shape[-1]))
+
+    helper = LayerHelper("top_k", **locals())
+    values = helper.create_tmp_variable(dtype=input.dtype)
+    indices = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="top_k",
+        inputs={"X": [input]},
+        outputs={"Out": [values],
+                 "Indices": [indices]},
+        attrs={"k": k})
+    values.stop_gradient = True
+    indices.stop_gradient = True
+    return values, indices
+
+
 def edit_distance(input, label, normalized=True, ignored_tokens=None,
                   name=None):
     """
@@ -2635,7 +2688,7 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None,
         helper.append_op(
             type="sequence_erase",
             inputs={"X": [label]},
-            outputs={"Out": [erase_label]},
+            outputs={"Out": [erased_label]},
             attrs={"tokens": ignored_tokens})
         label = erased_label
 
@@ -2712,15 +2765,7 @@ def ctc_greedy_decoder(input, blank, name=None):
             cost = fluid.layers.ctc_greedy_decoder(input=x, blank=0)
     """
     helper = LayerHelper("ctc_greedy_decoder", **locals())
-    # top 1 op
-    topk_out = helper.create_tmp_variable(dtype=input.dtype)
-    topk_indices = helper.create_tmp_variable(dtype="int64")
-    helper.append_op(
-        type="top_k",
-        inputs={"X": [input]},
-        outputs={"Out": [topk_out],
-                 "Indices": [topk_indices]},
-        attrs={"k": 1})
+    _, topk_indices = topk(input, k=1)
 
     # ctc align op
     ctc_out = helper.create_tmp_variable(dtype="int64")
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 36503cac6d5391821b977d90e6b77c4df7e3b564..9ae43b3e93e4b7d337097a25379720c18dfd331c 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -27,7 +27,8 @@ from contextlib import contextmanager
 
 __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad',
-    'Adadelta', 'ModelAverage'
+    'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
+    'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'Adadelta', 'ModelAverage'
 ]
 
 
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 5ce2aa1fc4d0b275b502af0f97e4a0f83e85de5b..fbdd6fd449625a21f91758dc12490b02070aea1a 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -16,6 +16,8 @@ import core
 import multiprocessing
 import framework
 import executor
+import warnings
+import sys
 
 __all__ = ['ParallelExecutor']
 
@@ -61,8 +63,8 @@ class ParallelExecutor(object):
                   main_program=test_program,
                   share_vars_from=train_exe)
 
-              train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
-              test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
+              train_loss, = train_exe.run([loss.name], feed=feed_dict)
+              test_loss, = test_exe.run([loss.name], feed=feed_dict)
         """
 
         self._places = []
@@ -102,8 +104,8 @@ class ParallelExecutor(object):
 
         self.persistable_vars = [
             v.name
-            for v in filter(lambda var: \
-                var.persistable and var.type != core.VarDesc.VarType.RAW,
+            for v in filter(
+                lambda var: var.persistable and var.type != core.VarDesc.VarType.RAW,
                 main.list_vars())
         ]
 
@@ -123,28 +125,93 @@ class ParallelExecutor(object):
             allow_op_delay)
         self.scope = scope
 
-    def run(self, fetch_list, feed_dict={}):
+    def run(self, fetch_list, feed=None, feed_dict=None):
         """
-        :param fetch_list: A list of variable names that will be fetched.
-        :param feed_dict: A dict mapping for feed variable name to LoDTensor
-          or numpy array.
-        :return: fetched value list.
-        """
-        if not isinstance(feed_dict, dict):
-            raise TypeError("feed_dict should be a dict")
+        Run a parallel executor with fetch_list.
+
+        The feed parameter can be a dict or a list. If feed is a dict, the
+        feed data will be split into multiple devices. If feed is a list, we
+        assume the data has been splitted into multiple devices, the each
+        element in the list will be copied to each device directly.
+
+        For example, if the feed is a dict:
+        >>> exe = ParallelExecutor()
+        >>> # the image will be splitted into devices. If there is two devices
+        >>> # each device will process an image with shape (24, 1, 28, 28)
+        >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
+
+        For example, if the feed is a list:
+        >>> exe = ParallelExecutor()
+        >>> # each device will process each element in the list.
+        >>> # the 1st device will process an image with shape (48, 1, 28, 28)
+        >>> # the 2nd device will process an image with shape (32, 1, 28, 28)
+        >>> #
+        >>> # you can use exe.device_count to get the device number.
+        >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))},
+        >>>               {"image": numpy.random.random(size=(32, 1, 28, 28))},
+        >>>              ])
+
+
+        Args:
+            fetch_list(list): The fetched variable names
+            feed(list|dict|None): The feed variables. If the feed is a dict,
+                tensors in that dict will be splitted into each devices. If
+                the feed is a list, each element of the list will be copied
+                to each device.
+            feed_dict: Alias for feed parameter, for backward compatibility.
+                This parameter is deprecated.
 
-        feed_tensor_dict = {}
-        for i, feed_name in enumerate(feed_dict):
-            feed_tensor = feed_dict[feed_name]
-            if not isinstance(feed_tensor, core.LoDTensor):
-                feed_tensor = core.LoDTensor()
-                feed_tensor.set(feed_dict[feed_name], self._act_places[0])
-            feed_tensor_dict[feed_name] = feed_tensor
+        Returns: fetched result list.
+
+        """
+        if feed is None and feed_dict is not None:
+            feed = feed_dict
+            print >> sys.stderr, "`feed_dict` is deprecated. Please use `feed=`"
+
+        if isinstance(feed, dict):
+            feed_tensor_dict = dict()
+            for feed_name in feed:
+                feed_tensor = feed[feed_name]
+                if not isinstance(feed_tensor, core.LoDTensor):
+                    feed_tensor = core.LoDTensor()
+                    # always set to CPU place, since the tensor need to be splitted
+                    # it is fast in CPU
+                    feed_tensor.set(feed[feed_name], core.CPUPlace())
+                feed_tensor_dict[feed_name] = feed_tensor
+
+            self.executor.feed_and_split_tensor_into_local_scopes(
+                feed_tensor_dict)
+        elif isinstance(feed, list) or isinstance(feed, tuple):
+            if len(feed) != len(self._act_places):
+                raise ValueError(
+                    "Feed a list of tensor, the list should be the same size as places"
+                )
+
+            res = list()
+
+            for i, each in enumerate(feed):
+                if not isinstance(each, dict):
+                    raise TypeError(
+                        "Each element of feed list should be a dict")
+                res_dict = dict()
+                for feed_name in each:
+                    tensor = each[feed_name]
+                    if not isinstance(tensor, core.LoDTensor):
+                        tmp = core.LoDTensor()
+                        tmp.set(tensor, self._act_places[i])
+                        tensor = tmp
+                    res_dict[feed_name] = tensor
+                res.append(res_dict)
+            self.executor.feed_tensors_into_local_scopes(res)
 
         fetch_var_name = '@FETCHED_VAR_NAME@'
-        self.executor.run(fetch_list, fetch_var_name, feed_tensor_dict)
+        self.executor.run(fetch_list, fetch_var_name)
         arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
         return [arr[i] for i in range(len(arr))]
 
     def bcast_params(self):
         self.executor.bcast_params(set(self.persistable_vars))
+
+    @property
+    def device_count(self):
+        return len(self._act_places)
diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py
index 604c6f9ab36c2332223d1ba943d67113922615b3..c006bd9a66ddb422b7d80d2ca87aa7f56a6485db 100644
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
@@ -16,9 +16,8 @@ import framework
 from . import core
 
 __all__ = [
-    'append_regularization_ops',
-    'L1Decay',
-    'L2Decay',
+    'append_regularization_ops', 'WeightDecayRegularizer', 'L1Decay', 'L2Decay',
+    'L1DecayRegularizer', 'L2DecayRegularizer'
 ]
 
 
diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py
index e8bb082be196b6342b1719235f1264bbe3d776ac..d3c14b83fa74f3a4016ae13442846fad1f9e41fc 100644
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -22,10 +22,17 @@ import sys
 import numpy
 import unittest
 import os
+import numpy as np
 
 
 def resnet_cifar10(input, depth=32):
-    def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+    def conv_bn_layer(input,
+                      ch_out,
+                      filter_size,
+                      stride,
+                      padding,
+                      act='relu',
+                      bias_attr=False):
         tmp = fluid.layers.conv2d(
             input=input,
             filter_size=filter_size,
@@ -33,7 +40,7 @@ def resnet_cifar10(input, depth=32):
             stride=stride,
             padding=padding,
             act=None,
-            bias_attr=False)
+            bias_attr=bias_attr)
         return fluid.layers.batch_norm(input=tmp, act=act)
 
     def shortcut(input, ch_in, ch_out, stride):
@@ -44,7 +51,7 @@ def resnet_cifar10(input, depth=32):
 
     def basicblock(input, ch_in, ch_out, stride):
         tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
-        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None)
+        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True)
         short = shortcut(input, ch_in, ch_out, stride)
         return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
 
@@ -219,13 +226,32 @@ def infer(use_cuda, save_dirname=None):
         batch_size = 1
         tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype("float32")
 
+        # Use inference_transpiler to speedup
+        inference_transpiler_program = inference_program.clone()
+        t = fluid.InferenceTranspiler()
+        t.transpile(inference_transpiler_program, place)
+
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
         results = exe.run(inference_program,
                           feed={feed_target_names[0]: tensor_img},
                           fetch_list=fetch_targets)
+
+        transpiler_results = exe.run(inference_transpiler_program,
+                                     feed={feed_target_names[0]: tensor_img},
+                                     fetch_list=fetch_targets)
+
+        assert len(results[0]) == len(transpiler_results[0])
+        for i in range(len(results[0])):
+            np.testing.assert_almost_equal(
+                results[0][i], transpiler_results[0][i], decimal=6)
+
         print("infer results: ", results[0])
 
+        fluid.io.save_inference_model(save_dirname, feed_target_names,
+                                      fetch_targets, exe,
+                                      inference_transpiler_program)
+
 
 def main(net_type, use_cuda, is_local=True):
     if use_cuda and not fluid.core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 356c3e64b3d03b520a1bec5b5e0174e1d8ee23e8..d9190408e151283ece8460286dd67818dd39da3e 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -1,10 +1,13 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
-# The fully connected test is removed whe the WITH_MKLDNN flag is OFF
-# Because the fully connected layer has only one kernel (MKLDNN)
+# The MKLDNN tests are skiped when the MKLDNN flag is OFF
 if(NOT WITH_MKLDNN)
-    list(REMOVE_ITEM TEST_OPS test_fc_op)
+    foreach(src ${TEST_OPS})
+        if(${src} MATCHES ".*_mkldnn_op$")
+            list(REMOVE_ITEM TEST_OPS ${src})
+        endif()
+    endforeach()
 endif(NOT WITH_MKLDNN)
 
 if(NOT WITH_DISTRIBUTE)
@@ -62,6 +65,7 @@ list(REMOVE_ITEM TEST_OPS test_registry)
 list(REMOVE_ITEM TEST_OPS test_fetch_var)
 list(REMOVE_ITEM TEST_OPS test_parallel_op)
 list(REMOVE_ITEM TEST_OPS test_dynrnn_static_input)
+list(REMOVE_ITEM TEST_OPS test_dist_train)
 
 # tests that can be bundled together in one python process for speed.
 if(WITH_FAST_BUNDLE_TEST)
@@ -100,3 +104,4 @@ py_test_modules(test_registry MODULES test_registry)
 py_test_modules(test_fetch_var MODULES test_fetch_var)
 py_test_modules(test_dynrnn_static_input MODULES test_dynrnn_static_input)
 py_test_modules(test_parallel_op MODULES test_parallel_op)
+py_test_modules(test_dist_train MODULES test_dist_train)
diff --git a/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d554c2276c9acd710d14c8f8b32c802e3e17515
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py
@@ -0,0 +1,99 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+from scipy.special import expit
+from test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs
+
+
+class TestMKLDNNReluDim2(TestRelu):
+    def setUp(self):
+        super(TestMKLDNNReluDim2, self).setUp()
+
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNTanhDim2(TestTanh):
+    def setUp(self):
+        super(TestMKLDNNTanhDim2, self).setUp()
+
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNSqrtDim2(TestSqrt):
+    def setUp(self):
+        super(TestMKLDNNSqrtDim2, self).setUp()
+
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNAbsDim2(TestAbs):
+    def setUp(self):
+        super(TestMKLDNNAbsDim2, self).setUp()
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNReluDim4(TestRelu):
+    def setUp(self):
+        super(TestMKLDNNReluDim4, self).setUp()
+
+        x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
+        # The same reason with TestAbs
+        x[np.abs(x) < 0.005] = 0.02
+        out = np.maximum(x, 0)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNTanhDim4(TestTanh):
+    def setUp(self):
+        super(TestMKLDNNTanhDim4, self).setUp()
+
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
+        }
+        self.outputs = {'Out': np.tanh(self.inputs['X'])}
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNSqrtDim4(TestSqrt):
+    def setUp(self):
+        super(TestMKLDNNSqrtDim4, self).setUp()
+
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
+        }
+        self.outputs = {'Out': np.sqrt(self.inputs['X'])}
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNAbsDim4(TestAbs):
+    def setUp(self):
+        super(TestMKLDNNAbsDim4, self).setUp()
+
+        x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
+        # The same reason with TestAbs
+        x[np.abs(x) < 0.005] = 0.02
+        self.inputs = {'X': x}
+        self.outputs = {'Out': np.abs(self.inputs['X'])}
+        self.attrs = {"use_mkldnn": True}
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index 57d4a50e913c0d2994c62600f4e479056ed4c306..5ed387fb1247f1a91147cb6981f1adc7c2eeb8a2 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -361,10 +361,7 @@ class TestCeil(OpTest):
     def test_check_output(self):
         self.check_output()
 
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+    # The same reason with TestFloor
 
     def init_dtype(self):
         pass
@@ -396,10 +393,8 @@ class TestFloor(OpTest):
     def test_check_output(self):
         self.check_output()
 
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+    # the gradient on floor, ceil, round is undefined.
+    # we return zero as gradient, but the numpy return nan 
 
     def init_dtype(self):
         pass
@@ -501,11 +496,6 @@ class TestRound(OpTest):
     def test_check_output(self):
         self.check_output()
 
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
-
     def init_dtype(self):
         pass
 
@@ -1098,82 +1088,5 @@ class TestFP16Swish(TestSwish):
                 self.check_output_with_place(place, atol=1e-3)
 
 
-#--------------------test MKLDNN--------------------
-class TestMKLDNNReluDim2(TestRelu):
-    def setUp(self):
-        super(TestMKLDNNReluDim2, self).setUp()
-
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNTanhDim2(TestTanh):
-    def setUp(self):
-        super(TestMKLDNNTanhDim2, self).setUp()
-
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNSqrtDim2(TestSqrt):
-    def setUp(self):
-        super(TestMKLDNNSqrtDim2, self).setUp()
-
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNAbsDim2(TestAbs):
-    def setUp(self):
-        super(TestMKLDNNAbsDim2, self).setUp()
-
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNReluDim4(TestRelu):
-    def setUp(self):
-        super(TestMKLDNNReluDim4, self).setUp()
-
-        x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
-        # The same reason with TestAbs
-        x[np.abs(x) < 0.005] = 0.02
-        out = np.maximum(x, 0)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNTanhDim4(TestTanh):
-    def setUp(self):
-        super(TestMKLDNNTanhDim4, self).setUp()
-
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
-        }
-        self.outputs = {'Out': np.tanh(self.inputs['X'])}
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNSqrtDim4(TestSqrt):
-    def setUp(self):
-        super(TestMKLDNNSqrtDim4, self).setUp()
-
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
-        }
-        self.outputs = {'Out': np.sqrt(self.inputs['X'])}
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNAbsDim4(TestAbs):
-    def setUp(self):
-        super(TestMKLDNNAbsDim4, self).setUp()
-
-        x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
-        # The same reason with TestAbs
-        x[np.abs(x) < 0.005] = 0.02
-        self.inputs = {'X': x}
-        self.outputs = {'Out': np.abs(self.inputs['X'])}
-        self.attrs = {"use_mkldnn": True}
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
index 7ecf9a1459ffc9740ae8c12df3902163ee689f59..6afb6fa6e753d3d6478313c840b158c3895b3efb 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -100,6 +100,9 @@ def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format):
     #   (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
 
     # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
+    if data_format != "NCHW" and data_format != "NHWC":
+        raise ValueError("Unknown data order.")
+
     if data_format == "NCHW":
         x = np.transpose(x, (0, 2, 3, 1))
         y_grad = np.transpose(y_grad, (0, 2, 3, 1))
@@ -304,7 +307,7 @@ class TestBatchNormOpTraining(unittest.TestCase):
             # run backward
             y_grad = np.random.random_sample(shape).astype(np.float32)
             x_grad, scale_grad, bias_grad = _reference_grad(
-                x, y_grad, scale, saved_mean, var_ref, epsilon, data_format)
+                x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout)
 
             var_dict = locals()
             var_dict['y@GRAD'] = y_grad
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..db6be21baaa54d33af9f5c44d1815e4b389eb884
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride
+
+
+class TestMKLDNN(TestConv2dOp):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNWithPad(TestWithPad):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNWithStride(TestWithStride):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
index 65606a0b4373b28036096cf046da5143a3b8bcd0..a478649541ba9828e55c4239090d5aee554223ac 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -373,22 +373,5 @@ class TestDepthwiseConv2(TestConv2dOp):
 #     def init_op_type(self):
 #         self.op_type = "conv_cudnn"
 
-
-#----------------Conv2dMKLDNN----------------
-class TestMKLDNN(TestConv2dOp):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNWithPad(TestWithPad):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNWithStride(TestWithStride):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
index d5dd63e8737cbdd9b91d083fbd0b38f8baf570b3..7703dfe0135b402f830bcdeaf47c26e5e3f8ca58 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
@@ -97,15 +97,18 @@ class TestConv3dOp(OpTest):
         }
         self.outputs = {'Output': output}
 
+    def testcudnn(self):
+        return core.is_compiled_with_cuda() and self.use_cudnn
+
     def test_check_output(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_output_with_place(place, atol=1e-5)
         else:
             self.check_output()
 
     def test_check_grad(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place,
@@ -117,7 +120,7 @@ class TestConv3dOp(OpTest):
                 set(['Input', 'Filter']), 'Output', max_relative_error=0.03)
 
     def test_check_grad_no_filter(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place, ['Input'],
@@ -132,7 +135,7 @@ class TestConv3dOp(OpTest):
                 no_grad_set=set(['Filter']))
 
     def test_check_grad_no_input(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place, ['Filter'],
diff --git a/python/paddle/fluid/tests/unittests/test_recv_op.py b/python/paddle/fluid/tests/unittests/test_dist_train.py
similarity index 57%
rename from python/paddle/fluid/tests/unittests/test_recv_op.py
rename to python/paddle/fluid/tests/unittests/test_dist_train.py
index 2ebceca7e4b7b824194d94180462870e6cfe6d21..c7fdd06f105e3b5fd906d3524d41df8f84160e63 100644
--- a/python/paddle/fluid/tests/unittests/test_recv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_train.py
@@ -15,31 +15,42 @@
 import unittest
 
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import paddle.fluid.layers as layers
 import numpy
 from multiprocessing import Process
+from threading import Thread
 import os, sys
 import time
 
 
-class TestRecvOp(unittest.TestCase):
-    def no_test_send(self):
+class TestSendOp(unittest.TestCase):
+    def test_send(self):
         # Run init_serv in a thread
         place = fluid.CPUPlace()
+        # NOTE: python thread will not work here due to GIL.
         p = Process(target=self.init_serv, args=(place, ))
         p.daemon = True
         p.start()
-        time.sleep(1)
-        self.init_client(place)
+
+        time.sleep(10)
+        with open("/tmp/paddle.selected_port", "r") as fn:
+            selected_port = int(fn.readlines()[0])
+        self.init_client(place, selected_port)
+
+        self.run_local(place)
+        self.assertTrue(numpy.allclose(self.local_out, self.dist_out))
+
         # FIXME(typhoonzero): find a way to gracefully shutdown the server.
         os.system("kill -9 %d" % p.pid)
         p.join()
 
     def init_serv(self, place):
         main = fluid.Program()
+
         with fluid.program_guard(main):
             serv = layers.ListenAndServ(
-                "127.0.0.1:6174", ["X"], optimizer_mode=False)
+                "127.0.0.1:0", ["X"], optimizer_mode=False)
             with serv.do():
                 x = layers.data(
                     shape=[32, 32],
@@ -50,10 +61,29 @@ class TestRecvOp(unittest.TestCase):
                 o = layers.scale(x=x, scale=10.0)
             main.global_block().create_var(
                 name=o.name, psersistable=False, dtype=o.dtype, shape=o.shape)
+
+        self.server_exe = fluid.Executor(place)
+        self.server_exe.run(main)
+
+    def init_client(self, place, port):
+        main = fluid.Program()
+        with fluid.program_guard(main):
+            x = layers.data(
+                shape=[32, 32],
+                dtype='float32',
+                name='X',
+                append_batch_size=False)
+            fluid.initializer.Constant(value=2.3)(x, main.global_block())
+            get_var = main.global_block().create_var(
+                name="scale_0.tmp_0",  # server side var
+                dtype="float32",
+                persistable=False,
+                shape=[32, 32])
+            o = layers.Send("127.0.0.1:%d" % port, [x], [get_var])
         exe = fluid.Executor(place)
-        exe.run(main)
+        self.dist_out = exe.run(main, fetch_list=o)  # o is a list
 
-    def init_client(self, place):
+    def run_local(self, place):
         main = fluid.Program()
         with fluid.program_guard(main):
             x = layers.data(
@@ -61,10 +91,10 @@ class TestRecvOp(unittest.TestCase):
                 dtype='float32',
                 name='X',
                 append_batch_size=False)
-            fluid.initializer.Constant(value=1.0)(x, main.global_block())
-            layers.Send("127.0.0.1:6174", [x], [x])
+            fluid.initializer.Constant(value=2.3)(x, main.global_block())
+            o = layers.scale(x=x, scale=10.0)
         exe = fluid.Executor(place)
-        exe.run(main)
+        self.local_out = exe.run(main, fetch_list=[o])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fc_op.py
rename to python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index a1be2d671ddc5c689b16319fcf5bf12dca5dde7e..17d6afdee161426e5da398ffa2ec148a027c905e 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -350,6 +350,15 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(smooth_label)
         print(str(program))
 
+    def test_topk(self):
+        program = Program()
+        with program_guard(program):
+            data = layers.data(name="label", shape=[200], dtype="float32")
+            values, indices = layers.topk(data, k=5)
+            self.assertIsNotNone(values)
+            self.assertIsNotNone(indices)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..966a16dc870c041b9deb140bed57d907cf305fd8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py
@@ -0,0 +1,49 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from test_lrn_op import TestLRNOp
+
+
+class TestLRNMKLDNNOp(TestLRNOp):
+    def get_attrs(self):
+        attrs = TestLRNOp.get_attrs(self)
+        attrs['use_mkldnn'] = True
+        return attrs
+
+    def test_check_output(self):
+        self.check_output(atol=0.002)
+
+
+class TestLRNMKLDNNOpWithIsTest(TestLRNMKLDNNOp):
+    def get_attrs(self):
+        attrs = TestLRNMKLDNNOp.get_attrs(self)
+        attrs['is_test'] = True
+        return attrs
+
+    def test_check_grad_normal(self):
+        def check_raise_is_test():
+            try:
+                self.check_grad(['X'], 'Out', max_relative_error=0.01)
+            except Exception as e:
+                t = \
+                "is_test attribute should be set to False in training phase."
+                if t in str(e):
+                    raise AttributeError
+
+        self.assertRaises(AttributeError, check_raise_is_test)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lrn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_op.py
index 8fa480b9bce84d2936f23cce9e41e8e54014b074..eaff45cbb2a58798e9d55149510bec72eea370cd 100644
--- a/python/paddle/fluid/tests/unittests/test_lrn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lrn_op.py
@@ -87,34 +87,5 @@ class TestLRNOp(OpTest):
         self.check_grad(['X'], 'Out', max_relative_error=0.01)
 
 
-class TestLRNMKLDNNOp(TestLRNOp):
-    def get_attrs(self):
-        attrs = TestLRNOp.get_attrs(self)
-        attrs['use_mkldnn'] = True
-        return attrs
-
-    def test_check_output(self):
-        self.check_output(atol=0.002)
-
-
-class TestLRNMKLDNNOpWithIsTest(TestLRNMKLDNNOp):
-    def get_attrs(self):
-        attrs = TestLRNMKLDNNOp.get_attrs(self)
-        attrs['is_test'] = True
-        return attrs
-
-    def test_check_grad_normal(self):
-        def check_raise_is_test():
-            try:
-                self.check_grad(['X'], 'Out', max_relative_error=0.01)
-            except Exception as e:
-                t = \
-                "is_test attribute should be set to False in training phase."
-                if t in str(e):
-                    raise AttributeError
-
-        self.assertRaises(AttributeError, check_raise_is_test)
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
index 5dc41e54d6158787eb966333c894e378b5c706d0..3f940203b9393d266d75b50c9cbf62e89c36cbdf 100644
--- a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
@@ -69,7 +69,6 @@ class TestMultipleReader(unittest.TestCase):
                     break
                 batch_count += 1
                 self.assertLessEqual(img_val.shape[0], self.batch_size)
-            data_files.reset()
             self.assertEqual(batch_count, self.num_batch * 3)
 
     def test_main(self):
diff --git a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
index 1471843ded7a42432a84a9fad76bb97dcf7fb9c2..52e7cc1ffbba40a63ce3cec645c7c0a7a499c1bf 100644
--- a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
@@ -43,9 +43,8 @@ class TestMultipleReader(unittest.TestCase):
                 filename='./mnist.recordio',
                 shapes=[(-1, 784), (-1, 1)],
                 lod_levels=[0, 0],
-                dtypes=['float32', 'int64'])
-            data_file = fluid.layers.io.multi_pass(
-                reader=data_file, pass_num=self.pass_num)
+                dtypes=['float32', 'int64'],
+                pass_num=self.pass_num)
             img, label = fluid.layers.read_file(data_file)
 
             if fluid.core.is_compiled_with_cuda():
@@ -65,5 +64,4 @@ class TestMultipleReader(unittest.TestCase):
                     break
                 batch_count += 1
                 self.assertLessEqual(img_val.shape[0], self.batch_size)
-            data_file.reset()
             self.assertEqual(batch_count, self.num_batch * self.pass_num)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
index 83d22fd799eea55eedb58f93421b275985edb50b..c783a142467f3f6a9cd210425acfc526a32a6f71 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -200,34 +200,56 @@ class TestParallelExecutorBase(unittest.TestCase):
     def check_network_convergence(self,
                                   method,
                                   memory_opt=True,
-                                  iter=10,
+                                  iter=50,
                                   batch_size=None,
                                   allow_op_delay=False,
-                                  feed_dict={}):
+                                  feed_dict=None,
+                                  seed=None,
+                                  use_parallel_executor=True):
+        def run_executor(exe, feed, fetch_list, program=None):
+            if isinstance(exe, fluid.ParallelExecutor):
+                res = exe.run(fetch_list=fetch_list, feed=feed)
+            elif isinstance(exe, fluid.Executor):
+                if program is None:
+                    program = fluid.default_main_program()
+                res = exe.run(program=program, feed=feed, fetch_list=fetch_list)
+            else:
+                raise ValueError('Unkown type exe')
+            return res
+
         main = fluid.Program()
         startup = fluid.Program()
+        startup.random_seed = 1  # Fix random seed
         with fluid.program_guard(main, startup):
-            loss = method(use_feed=len(feed_dict) > 0)
+            if seed is not None:
+                startup.random_seed = seed
+            loss = method(use_feed=feed_dict is not None)
             adam = fluid.optimizer.Adam()
             adam.minimize(loss)
             if memory_opt:
                 fluid.memory_optimize(main)
-
             place = fluid.CUDAPlace(0)
             startup_exe = fluid.Executor(place)
             startup_exe.run(startup)
 
-            exe = fluid.ParallelExecutor(True, loss_name=loss.name)
+            if use_parallel_executor:
+                exe = fluid.ParallelExecutor(
+                    True, loss_name=loss.name, allow_op_delay=allow_op_delay)
+            else:
+                exe = fluid.Executor(place=place)
+
             if batch_size is not None:
                 batch_size *= fluid.core.get_cuda_device_count()
             begin = time.time()
-            first_loss, = exe.run([loss.name], feed_dict=feed_dict)
+            first_loss, = run_executor(
+                exe=exe, feed=feed_dict, fetch_list=[loss.name])
             first_loss = numpy.array(first_loss)
 
             for i in xrange(iter):
-                exe.run([], feed_dict=feed_dict)
+                run_executor(exe=exe, feed=feed_dict, fetch_list=[])
 
-            last_loss, = exe.run([loss.name], feed_dict=feed_dict)
+            last_loss, = run_executor(
+                exe=exe, feed=feed_dict, fetch_list=[loss.name])
             end = time.time()
 
             if batch_size is not None:
@@ -238,6 +260,7 @@ class TestParallelExecutorBase(unittest.TestCase):
 
             print first_loss, last_loss
             # self.assertGreater(first_loss[0], last_loss[0])
+            return first_loss, last_loss
 
 
 class TestMNIST(TestParallelExecutorBase):
@@ -267,6 +290,27 @@ class TestMNIST(TestParallelExecutorBase):
             simple_fc_net, feed_dict={"image": img,
                                       "label": label})
 
+    def test_simple_fc_parallel_accuracy(self):
+        img = numpy.zeros(shape=[32, 784], dtype='float32')
+        label = numpy.ones(shape=[32, 1], dtype='int64')
+        single_first_loss, single_last_loss = self.check_network_convergence(
+            method=simple_fc_net,
+            seed=1000,
+            feed_dict={"image": img,
+                       "label": label},
+            use_parallel_executor=False)
+        parallel_first_loss, parallel_last_loss = self.check_network_convergence(
+            method=simple_fc_net,
+            seed=1000,
+            feed_dict={"image": img,
+                       "label": label},
+            use_parallel_executor=True)
+
+        for p_f in parallel_first_loss:
+            self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
+        for p_l in parallel_last_loss:
+            self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
+
     def test_batchnorm_fc(self):
         self.check_network_convergence(fc_with_batchnorm)
         img = numpy.zeros(shape=[32, 784], dtype='float32')
@@ -495,10 +539,10 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
                 share_vars_from=train_exe)
 
             for i in xrange(5):
-                test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
+                test_loss, = test_exe.run([loss.name], feed=feed_dict)
                 test_loss = numpy.array(test_loss)
 
-                train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
+                train_loss, = train_exe.run([loss.name], feed=feed_dict)
                 train_loss = numpy.array(train_loss)
                 self.assertTrue(
                     numpy.allclose(
@@ -648,5 +692,5 @@ class TestCRFModel(unittest.TestCase):
             for i in xrange(10):
                 cur_batch = next(data)
                 print map(numpy.array,
-                          pe.run(feed_dict=feeder.feed(cur_batch),
+                          pe.run(feed=feeder.feed(cur_batch),
                                  fetch_list=[avg_cost.name]))[0]
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..003ebba18b26198427d9f313596ae85656ac24fa
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py
@@ -0,0 +1,50 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from test_pool2d_op import TestPool2d_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
+
+
+class TestMKLDNNCase1(TestPool2d_Op):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNCase2(TestCase1):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNCase3(TestCase2):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNCase4(TestCase3):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNCase5(TestCase4):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNCase6(TestCase5):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
index 764fa575fba1615de3171e848890b3836e640849..f7e1e8573290766cde0c35816d687e7ba6fa4220 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -109,8 +109,11 @@ class TestPool2d_Op(OpTest):
 
         self.outputs = {'Out': output}
 
+    def testcudnn(self):
+        return core.is_compiled_with_cuda() and self.use_cudnn
+
     def test_check_output(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_output_with_place(place, atol=1e-5)
         else:
@@ -119,7 +122,7 @@ class TestPool2d_Op(OpTest):
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        if self.use_cudnn and self.pool_type != "max":
+        if self.testcudnn() and self.pool_type != "max":
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place, set(['X']), 'Out', max_relative_error=0.07)
@@ -317,36 +320,5 @@ class TestCeilModeCase4(TestCase2):
         self.ceil_mode = True
 
 
-#--------------------test pool2d MKLDNN--------------------
-class TestMKLDNNCase1(TestPool2d_Op):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNCase2(TestCase1):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNCase3(TestCase2):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNCase4(TestCase3):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNCase5(TestCase4):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNCase6(TestCase5):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
index 15a8ac5e2029eec204d061d1832df3df90339697..aaa94842513691c836e04353aa4bc5ce5e66c5c3 100644
--- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
@@ -118,15 +118,18 @@ class TestPool3d_Op(OpTest):
 
         self.outputs = {'Out': output.astype('float32')}
 
+    def testcudnn(self):
+        return core.is_compiled_with_cuda() and self.use_cudnn
+
     def test_check_output(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_output_with_place(place, atol=1e-5)
         else:
             self.check_output()
 
     def test_check_grad(self):
-        if self.use_cudnn and self.pool_type != "max":
+        if self.testcudnn() and self.pool_type != "max":
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place, set(['X']), 'Out', max_relative_error=0.07)
diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
index 7c8e7f634fdd3ee3f056a95df774402a7c29e906..f32050014d7ace5aee4aca75a47bfc6a75ff91c2 100644
--- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
@@ -74,13 +74,13 @@ class TestRecordIO(unittest.TestCase):
 
                 avg_loss_np.append(tmp)
                 batch_id += 1
-            data_file.reset()
             self.assertEqual(batch_id, self.num_batches)
             self.assertLess(avg_loss_np[-1], avg_loss_np[0])
 
     def test_shuffle_reader(self):
-        self.test_main(decorator_callback=lambda reader: fluid.layers.io.shuffle(reader, buffer_size=200))
+        self.test_main(decorator_callback=lambda reader: fluid.layers.io.shuffle(
+            reader, buffer_size=200))
 
     def test_double_buffer_reader(self):
         self.test_main(decorator_callback=lambda reader: fluid.layers.io.double_buffer(reader,
-                                                                                                  place='cuda:0' if fluid.core.is_compiled_with_cuda() else 'cpu'))
+                                                                                       place='cuda:0' if fluid.core.is_compiled_with_cuda() else 'cpu'))
diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py
index 887bdfe8b3608878bace5b857a71ada123b74b2f..eb49a53e54f4bdb6bcd6cb1991423970f29997bb 100644
--- a/python/paddle/fluid/tests/unittests/test_split_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_op.py
@@ -19,7 +19,7 @@ from op_test import OpTest
 
 class TestSplitOp(OpTest):
     def setUp(self):
-        self.op_type = "split"
+        self._set_op_type()
         axis = 1
         x = np.random.random((4, 5, 6)).astype('float32')
         out = np.split(x, [2, 3], axis)
@@ -28,6 +28,9 @@ class TestSplitOp(OpTest):
         self.outputs = {'Out': [('out%d' % i, out[i]) \
             for i in xrange(len(out))]}
 
+    def _set_op_type(self):
+        self.op_type = "split"
+
     def test_check_output(self):
         self.check_output()
 
@@ -35,5 +38,10 @@ class TestSplitOp(OpTest):
         self.check_grad(['X'], ['out0', 'out1', 'out2'])
 
 
+class TestSplitByrefOp(OpTest):
+    def _set_op_type(self):
+        self.op_type = "split_byref"
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index 37c4296f9bcea7e16daa46f778934331513c30c4..00c2a3b9928d1ca5f3e8cd5e87ba7ad4108e9dad 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -124,7 +124,7 @@ def test(word_idx):
         re.compile("aclImdb/test/neg/.*\.txt$"), word_idx)
 
 
-def word_dict():
+def word_dict(cutoff=150):
     """
     Build a word dictionary from the corpus.
 
@@ -132,7 +132,7 @@ def word_dict():
     :rtype: dict
     """
     return build_dict(
-        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
+        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), cutoff)
 
 
 def fetch():
diff --git a/tools/aws_benchmarking/README.md b/tools/aws_benchmarking/README.md
index 837fcbb8512bce027ecd09a7f39b806151e9154b..22a468466afbcbf7cc312e714e41a3b5adf1160c 100644
--- a/tools/aws_benchmarking/README.md
+++ b/tools/aws_benchmarking/README.md
@@ -84,7 +84,8 @@ putcn/paddle_aws_client \
 --security_group_id <your security group id> \
 --docker_image myreponame/paddle_benchmark \
 --pserver_count 2 \
---trainer_count 2
+--trainer_count 2 \
+--trainer_command batch_size:20,local:no,device:CPU
 ```
 
 Now just wait until you see this:
diff --git a/tools/aws_benchmarking/client/cluster_launcher.py b/tools/aws_benchmarking/client/cluster_launcher.py
index 594378ff8fc0744a4b11b1c11e2e3b270be7aed0..12333202b9f003ae5109c7e9b825035ba8eb7d99 100644
--- a/tools/aws_benchmarking/client/cluster_launcher.py
+++ b/tools/aws_benchmarking/client/cluster_launcher.py
@@ -80,7 +80,11 @@ parser.add_argument(
     use ami-1ae93962 for us-east-2")
 
 parser.add_argument(
-    '--pserver_command', type=str, default="", help="pserver start command")
+    '--pserver_command',
+    type=str,
+    default="",
+    help="pserver start command, format example: python,vgg.py,batch_size:128,is_local:yes"
+)
 
 parser.add_argument(
     '--trainer_image_id',
@@ -90,7 +94,11 @@ parser.add_argument(
     use ami-1ae93962 for us-west-2")
 
 parser.add_argument(
-    '--trainer_command', type=str, default="", help="trainer start command")
+    '--trainer_command',
+    type=str,
+    default="",
+    help="trainer start command, format example: python,vgg.py,batch_size:128,is_local:yes"
+)
 
 parser.add_argument(
     '--availability_zone',
diff --git a/tools/aws_benchmarking/server/cluster_master.py b/tools/aws_benchmarking/server/cluster_master.py
index 21f85a5fc43e951897eb6b785367630abda722c0..7952e61159ec31a4be5394b50f30cbc20f9b414e 100644
--- a/tools/aws_benchmarking/server/cluster_master.py
+++ b/tools/aws_benchmarking/server/cluster_master.py
@@ -19,6 +19,7 @@ import math
 import time
 import threading
 import logging
+import copy
 
 import netaddr
 import boto3
@@ -257,6 +258,8 @@ def script_to_str(file_path):
 
 
 def run_instances(image_id, instance_type, count, role, cmd=""):
+    if count == 0:
+        return []
     response = ec2client.run_instances(
         ImageId=image_id,
         InstanceType=instance_type,
@@ -334,6 +337,22 @@ def log_to_file(source, filename):
             log_file.write(line)
 
 
+def parse_command(command_raw, defaults={}):
+    if not command_raw:
+        command_raw = ""
+    commands_processed = []
+    parameter_map = copy.copy(defaults)
+    for seg in command_raw.split(","):
+        if ":" in seg:
+            parameters = seg.split(":")
+            parameter_map[parameters[0]] = parameters[1]
+        else:
+            commands_processed.append(seg)
+    for key, val in parameter_map.iteritems():
+        commands_processed.append("--" + key + " " + str(val))
+    return " ".join(commands_processed)
+
+
 def create_trainers(kickoff_cmd, pserver_endpoints_str):
     def create_and_start_trainer(trainer_index):
         logging.info("trainer " + str(trainer_index) + " is starting")
@@ -361,7 +380,7 @@ def create_trainers(kickoff_cmd, pserver_endpoints_str):
             TRAINER_INDEX=str(trainer_index),
             TASK_NAME=args.task_name,
             TRAINER_COUNT=args.trainer_count,
-            COMMAND=args.trainer_command,
+            COMMAND=parse_command(args.trainer_command, {"device": "GPU"}),
             MASTER_ENDPOINT=args.master_server_ip + ":" +
             str(args.master_server_port))
         logging.info(cmd)
@@ -476,7 +495,7 @@ def kickoff_pserver(host, pserver_endpoints_str):
             DOCKER_IMAGE=args.docker_image,
             PSERVER_PORT=args.pserver_port,
             TASK_NAME=args.task_name,
-            COMMAND=args.pserver_command,
+            COMMAND=parse_command(args.pserver_command, {"device": "CPU"}),
             TRAINER_COUNT=args.trainer_count,
             TRAINER_INDEX=0,
             # there is no way to use 0.0.0.0:port to start pserver
diff --git a/tools/aws_benchmarking/server/pserver.sh.template b/tools/aws_benchmarking/server/pserver.sh.template
index 2612856d1e6273fe2642f82e8c616eb9ff24f8a4..8d7f9e84c768b096537c92a448a117d91903f25b 100644
--- a/tools/aws_benchmarking/server/pserver.sh.template
+++ b/tools/aws_benchmarking/server/pserver.sh.template
@@ -1,2 +1,2 @@
 #!/bin/bash
-docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device CPU
\ No newline at end of file
+docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND}
\ No newline at end of file
diff --git a/tools/aws_benchmarking/server/trainer.sh.template b/tools/aws_benchmarking/server/trainer.sh.template
index a4b2876b08cdf05e90e50589f897d74ca5f90443..9b0aae9f7a7a879f164b380f719065302e0eb7e2 100644
--- a/tools/aws_benchmarking/server/trainer.sh.template
+++ b/tools/aws_benchmarking/server/trainer.sh.template
@@ -1,2 +1,2 @@
 #!/bin/bash 
-nvidia-docker run --network="host" -i  -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}"  -e "PADDLE_INIT_TRAINER_ID={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER"  -e "PSERVER_HOSTS={PSERVER_HOSTS}"  -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device GPU
\ No newline at end of file
+nvidia-docker run --network="host" -i  -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}"  -e "PADDLE_INIT_TRAINER_ID={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER"  -e "PSERVER_HOSTS={PSERVER_HOSTS}"  -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND}
\ No newline at end of file
diff --git a/tools/manylinux1/Dockerfile.android b/tools/manylinux1/Dockerfile.android
index b6cae228a0c45ab70ba8ecc80ae4df7e0fa5bdbc..7eb040902b0f8f3cc9f7a31ec9f96467de654c3e 100644
--- a/tools/manylinux1/Dockerfile.android
+++ b/tools/manylinux1/Dockerfile.android
@@ -37,7 +37,7 @@ RUN git config --global credential.helper store
 # Fix locales to en_US.UTF-8
 RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 
-RUN pip install --upgrade pip && \
+RUN pip install --upgrade pip==9.0.3 && \
     pip install -U 'protobuf==3.1.0' && \
     pip install -U wheel sphinx && \
     pip install pre-commit