Pull origin

Merge branch 'fix-10026' of github.com:ktlichkid/Paddle into fix-10026

Pull origin
Merge branch 'fix-10026' of github.com:ktlichkid/Paddle into fix-10026
9997c916 · ktlichkid · 709a9edd · 5afc2a99 · 9997c916 · 9997c916
321 changed file
--- a/Dockerfile
+++ b/Dockerfile
 # A image for building paddle binaries
 # Use cuda devel base image for both cpu and gpu environment
+# When you modify it, please be aware of cudnn-runtime version 
+# and libcudnn.so.x in paddle/scripts/docker/build.sh
 FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
@@ -46,7 +49,11 @@ ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
 RUN curl -s -q https://glide.sh/get | sh
 # Install TensorRT
-# The unnecessary files has been removed to make the library small. It only contains include and lib now.
+# The following TensorRT.tar.gz is not the default official one; we made two minor changes:
+# 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now,
+#    and its size is only one-third of the official one.
+# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
+#    See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
 RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
    tar -xz -C /usr/local && \
    cp -rf /usr/local/TensorRT/include /usr && \

--- a/cmake/tensorrt.cmake
+++ b/cmake/tensorrt.cmake
@@ -30,4 +30,6 @@ if(TENSORRT_FOUND)
    message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
        "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
+    include_directories(${TENSORRT_INCLUDE_DIR})
+    list(APPEND EXTERNAL_LIBS ${TENSORRT_LIBRARY})
 endif()
--- a/doc/fluid/api/data.rst
+++ b/doc/fluid/api/data.rst
+==================================
+Data Reader Interface and DataSets
+==================================
+..  toctree::
+    :maxdepth: 1
+    data/data_reader.rst
+    data/image.rst
+    data/dataset.rst
--- a/doc/fluid/api/data/data_reader.rst
+++ b/doc/fluid/api/data/data_reader.rst
+=====================
+Data Reader Interface
+=====================
+DataTypes
+=========
+..  autofunction:: paddle.v2.data_type.dense_array
+    :noindex:
+..  autofunction:: paddle.v2.data_type.integer_value
+    :noindex:
+..  autofunction:: paddle.v2.data_type.integer_value_sequence
+    :noindex:
+..  autofunction:: paddle.v2.data_type.integer_value_sub_sequence
+    :noindex:
+..  autofunction:: paddle.v2.data_type.sparse_binary_vector
+    :noindex:
+..  autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence
+    :noindex:
+..  autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence
+    :noindex:
+..  autofunction:: paddle.v2.data_type.sparse_float_vector
+    :noindex:
+..  autofunction:: paddle.v2.data_type.sparse_float_vector_sequence
+    :noindex:
+..  autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence
+    :noindex:
+..  autofunction:: paddle.v2.data_type.sparse_non_value_slot
+    :noindex:
+..  autofunction:: paddle.v2.data_type.sparse_value_slot
+    :noindex:
+..  autoclass:: paddle.v2.data_type.InputType
+    :members:
+    :noindex:
+DataFeeder
+==========
+..  automodule:: paddle.v2.data_feeder
+    :members:
+    :noindex:
+Reader
+======
+..  automodule:: paddle.v2.reader
+    :members:
+    :noindex:
+..  automodule:: paddle.v2.reader.creator
+    :members:
+    :noindex:
+minibatch
+=========
+..  automodule:: paddle.v2.minibatch
+    :members:
+    :noindex:
--- a/doc/fluid/api/data/dataset.rst
+++ b/doc/fluid/api/data/dataset.rst
+Dataset
+=======
+..  automodule:: paddle.dataset
+    :members:
+    :noindex:
+mnist
+++++
+..  automodule:: paddle.dataset.mnist
+    :members:
+    :noindex:
+cifar
+++++
+..  automodule:: paddle.dataset.cifar
+    :members:
+    :noindex:
+conll05
+++++++
+..  automodule:: paddle.dataset.conll05
+    :members: get_dict,get_embedding,test
+    :noindex:
+imdb
++++
+..  automodule:: paddle.dataset.imdb
+    :members:
+    :noindex:
+imikolov
++++++++
+..  automodule:: paddle.dataset.imikolov
+    :members:
+    :noindex:
+movielens
+++++++++
+..  automodule:: paddle.dataset.movielens
+    :members:
+    :noindex:
+..  autoclass:: paddle.dataset.movielens.MovieInfo
+    :noindex:
+..  autoclass:: paddle.dataset.movielens.UserInfo
+    :noindex:
+sentiment
+++++++++
+..  automodule:: paddle.dataset.sentiment
+    :members:
+    :noindex:
+uci_housing
+++++++++++
+..  automodule:: paddle.dataset.uci_housing
+    :members:
+    :noindex:
+wmt14
+++++
+..  automodule:: paddle.dataset.wmt14
+    :members:
+    :noindex:
+wmt16
+++++
+..  automodule:: paddle.dataset.wmt16
+    :members:
+    :noindex:
--- a/doc/fluid/api/data/image.rst
+++ b/doc/fluid/api/data/image.rst
+Image Interface
+===============
+..  automodule:: paddle.v2.image
+    :members:
--- a/doc/fluid/api/evaluator.rst
+++ b/doc/fluid/api/evaluator.rst
@@ -5,17 +5,24 @@
 evaluator
 =========
-Accuracy
+ChunkEvaluator
--------
+--------------
-..  autoclass:: paddle.fluid.evaluator.Accuracy
+..  autoclass:: paddle.fluid.evaluator.ChunkEvaluator
    :members:
    :noindex:
-ChunkEvaluator
+EditDistance
 --------------
-..  autoclass:: paddle.fluid.evaluator.ChunkEvaluator
+..  autoclass:: paddle.fluid.evaluator.EditDistance
    :members:
    :noindex:
+DetectionMAP
+--------------
+..  autoclass:: paddle.fluid.evaluator.DetectionMAP
+    :members:
+    :noindex:
--- a/doc/fluid/api/index_en.rst
+++ b/doc/fluid/api/index_en.rst
@@ -16,3 +16,4 @@ Fluid
    profiler.rst
    regularizer.rst
    io.rst
+    data.rst
--- a/doc/fluid/api/initializer.rst
+++ b/doc/fluid/api/initializer.rst
@@ -67,8 +67,7 @@ XavierInitializer
 ..  autoclass:: paddle.fluid.initializer.XavierInitializer
    :members:
    :noindex:
-    MSRA
-    ------
 MSRAInitializer
 -----------------

--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
@@ -47,10 +47,51 @@ DecayedAdagrad
    :members:
    :noindex:
+SGDOptimizer
+------------
+..  autoclass:: paddle.fluid.optimizer.SGDOptimizer
+    :members:
+    :noindex:
+MomentumOptimizer
+-----------------
+..  autoclass:: paddle.fluid.optimizer.MomentumOptimizer
+    :members:
+    :noindex:
+AdagradOptimizer
+----------------
+..  autoclass:: paddle.fluid.optimizer.AdagradOptimizer
+    :members:
+    :noindex:
+AdamOptimizer
+-------------
+..  autoclass:: paddle.fluid.optimizer.AdamOptimizer
+    :members:
+    :noindex:
+AdamaxOptimizer
+---------------
+..  autoclass:: paddle.fluid.optimizer.AdamaxOptimizer
+    :members:
+    :noindex:
+DecayedAdagradOptimizer
+-----------------------
+..  autoclass:: paddle.fluid.optimizer.DecayedAdagradOptimizer
+    :members:
+    :noindex:
 Adadelta
 --------------
 ..  autoclass:: paddle.fluid.optimizer.AdadeltaOptimizer
    :members:
    :noindex:
--- a/doc/fluid/api/regularizer.rst
+++ b/doc/fluid/api/regularizer.rst
@@ -25,3 +25,16 @@ L2Decay
    :members:
    :noindex:
+L1DecayRegularizer
+---------------------
+..  autoclass:: paddle.fluid.regularizer.L1DecayRegularizer
+    :members:
+    :noindex:
+L2DecayRegularizer
+---------------------
+..  autoclass:: paddle.fluid.regularizer.L2DecayRegularizer
+    :members:
+    :noindex:
--- a/doc/fluid/design/algorithm/parameter_average.md
+++ b/doc/fluid/design/algorithm/parameter_average.md
@@ -49,9 +49,9 @@ In the new design, we propose to create a new operation for averaging parameter
 - the optimizer
 - the window_size to keep the updates
-The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU.
+The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU.
-The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API.
+The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API.
 ### Python API implementation for ParameterAverageOptimizer
@@ -59,8 +59,8 @@ Based on Polyak and Juditsky (1992), we can generalize the averaging of updates
 - Any optimizer (RMSProp , AdaGrad etc.)
 - A window size. The op keeps accumulating updated parameter values over a window of N batches and takes an average. Move the averaged value to a buffer when window is full to avoid loss of precision.
-Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions.
+Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions.
-We will have a wrapper written in Python that will support the functionality and implement the actual core computation in C++ core as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.cc)
+We will have a wrapper written in Python that will support the functionality and implement the actual core computation in C++ core as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.cc)
 #### Creation of the ParameterAverageOptimizer operator
 There are two ways for creating the ParameterAverageOptimizer op:
@@ -71,4 +71,4 @@ The proposal is to add the op immediately while building the computation graph.
 #### High-level API
-In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions.
+In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions.
--- a/doc/fluid/design/concepts/block.md
+++ b/doc/fluid/design/concepts/block.md
@@ -113,7 +113,7 @@ if (cond) {
 ```
-An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](./if_else_op.md) is as follows:
+An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](../execution/if_else_op.md) is as follows:
 ```python
 import paddle as pd
@@ -140,7 +140,7 @@ The difference is that variables in the C++ program contain scalar values, where
 ### Blocks with `for` and `RNNOp`
-The following RNN model in PaddlePaddle from the [RNN design doc](./rnn.md) :
+The following RNN model in PaddlePaddle from the [RNN design doc](../dynamic_rnn/rnn.md) :
 ```python
 x = sequence([10, 20, 30]) # shape=[None, 1]

--- a/doc/fluid/design/concepts/executor.md
+++ b/doc/fluid/design/concepts/executor.md
 # Executor Design Doc
 ## Motivation
-In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
+In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
 [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
 The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains the intrinsics (operators in this case) and variables which will be used, executor explicitly executes the stored precompiled code.

--- a/doc/fluid/design/concepts/program.md
+++ b/doc/fluid/design/concepts/program.md
@@ -4,7 +4,7 @@
 A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`.
-A simple example PaddlePaddle program can be found in [graph.md](./graph.md):
+A simple example PaddlePaddle program can be found in [graph.md](../others/graph.md):
 ```python
 x = layer.data("images")

--- a/doc/fluid/design/concurrent/concurrent_programming.md
+++ b/doc/fluid/design/concurrent/concurrent_programming.md
 # Design Doc: Concurrent Programming with Fluid
-With PaddlePaddle Fluid, users describe a program other than a model.  The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, but not the program that trains/uses the model.   
+With PaddlePaddle Fluid, users describe a program other than a model.  The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, but not the program that trains/uses the model.   
 Many know that when we program TensorFlow, we can specify the device on which each operator runs.  This allows us to create a concurrent/parallel AI application.   An interesting question is **how does a `ProgramDesc` represent a concurrent program?**  
@@ -28,19 +28,19 @@ The following table compares concepts in Fluid and Go
 <tr>
 <td>control-flow and built-in functions </td>
 <td>
-<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators">intrinsics/operators</a></td>
+<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators">intrinsics/operators</a></td>
 <td></td>
 </tr>
 <tr>
 <td>goroutines, channels </td>
 <td>
-<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h">class ThreadPool</a></td>
+<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework/thread_pool.h">class ThreadPool</a></td>
 <td></td>
 </tr>
 <tr>
 <td>runtime </td>
 <td>
-<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h">class Executor</a></td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h">class Executor</a></td>
 <td></td>
 </tr>
 </tbody>
@@ -78,7 +78,7 @@ message ProgramDesc {
 }
 ```
-Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in above `ProgramDesc` message.
+Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in above `ProgramDesc` message.
 The default `main` function is defined as follows:
@@ -146,7 +146,7 @@ An explanation of the above program:
 - `fluid.k8s` is a package that provides access to Kubernetes API.  
 - `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod).  
- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h).  `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
+- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/lod_tensor_array.h).  `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
  1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
  2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread  
@@ -175,7 +175,7 @@ where
  1. listens on the current pod's IP address, as returned by `fluid.k8s.self_addr()`,
  2. once a connection is established,
     1. creates a scope of two parameters, "input" and "output",
-     2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h) and saves it into "input",
+     2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h) and saves it into "input",
     3. creates an Executor instance and calls `Executor.Run(block)`, where the block is generated by running the lambda specified as the second parameter of `fluid.listen_and_do`.
 ## Summarization

--- a/doc/fluid/design/dist_train/distributed_architecture.md
+++ b/doc/fluid/design/dist_train/distributed_architecture.md
@@ -177,7 +177,7 @@ The local training architecture will be the same as the distributed training arc
 ### Training Data
 In PaddlePaddle v0.10.0, training data is typically read
-with [data reader](../reader/README.md) from Python. This approach is
+with [data reader](./README.md) from Python. This approach is
 no longer efficient when training distributedly since the Python
 process no longer runs on the same node with the trainer processes,
 the Python reader will need to read from the distributed filesystem

--- a/doc/fluid/design/dist_train/parameter_server.md
+++ b/doc/fluid/design/dist_train/parameter_server.md
@@ -65,7 +65,7 @@ For embedding layers, the gradient may have many rows containing only 0 when tra
 if the gradient uses a dense tensor to do parameter optimization,
 it could spend unnecessary memory, slow down the calculations and waste
 the bandwidth while doing distributed training.
-In Fluid, we introduce [SelectedRows](../selected_rows.md) to represent a list of rows containing
+In Fluid, we introduce [SelectedRows](../modules/selected_rows.md) to represent a list of rows containing
 non-zero gradient data. So when we do parameter optimization both locally and remotely,
 we only need to send those non-zero rows to the optimizer operators:

--- a/doc/fluid/design/dynamic_rnn/rnn.md
+++ b/doc/fluid/design/dynamic_rnn/rnn.md
@@ -22,7 +22,7 @@ There are several important concepts here:
 There could be local variables defined in each step-net.  PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
 <p align="center">
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/rnn.png"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/rnn.png"/><br/>
 Figure 2 illustrates the RNN's data flow
 </p>
@@ -93,7 +93,7 @@ For example, we could have a 2-level RNN, where the top level corresponds to par
 The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text.
 <p align="center">
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/2_level_rnn.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/2_level_rnn.png"/>
 </p>
 ```python
@@ -149,5 +149,5 @@ If the `output_all_steps` is set to False, it will only output the final time st
 <p align="center">
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/rnn_2level_data.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/rnn_2level_data.png"/>
 </p>
--- a/doc/fluid/design/dynamic_rnn/rnn_design_en.md
+++ b/doc/fluid/design/dynamic_rnn/rnn_design_en.md
+# Variable Length Supported RNN Design
+For the learning of variable length sequences, the existing mainstream frameworks such as tensorflow, pytorch, caffe2, mxnet and so on all use padding.
+Different-length sequences in a mini-batch will be padded with zeros and transformed to same length.
+The existing RNN implementation in PaddlePaddle is `RecurrentLayerGroup`,
+which supports the variable length sequences without padding. 
+This doc will design fluid's RNN based on this idea.
+## Multi-layer sequence data format `LODTensor`
+At present, Paddle stores data in one mini-batch in one-dimensional array.
+`Argument.sequenceStartPositions` is used to store information for each sentence.
+In Paddle, `Argument.subSequenceStartPositions` is used to store 2 levels of sequence information, while higher dimensional sequences can not be supported.
+In order to support the storage of `N-level` sequences, we define sequence information as the following data structure.
+```c++
+std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+```
+Or more clearly defined here
+```c++
+typedef std::vector<int> level_t;
+std::vector<level_t> lod_start_pos;
+```
+Each `level_t` here stores a level of offset information consistent with paddle's current practice.
+In order to transmit sequence information more transparently, we have introduced a new tensor called `LODTensor`[1].
+Its tensor-related interfaces all inherit directly from `Tensor`, but it also adds sequence-related interfaces.
+Thus, an ordinary `Op` can use a `LODTensor` directly as a `Tensor`,
+while an `Op` that operates on sequences additionally uses the variable-length sequence interfaces of `LODTensor`.
+The definition of `LODTensor` is as follows:
+```c++
+class LODTensor : public Tensor {
+public:
+  size_t Levels() const { return seq_start_positions_.size(); }
+  size_t Elements(int level = 0) const {
+    return seq_start_positions_[level].size();
+  }
+  // slice of level[elem_begin: elem_end]
+  // NOTE low performance in slice seq_start_positions_.
+  // TODO should call Tensor's Slice.
+  LODTensor LODSlice(int level, int elem_begin, int elem_end) const;
+  // slice with tensor's data shared with this.
+  LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const;
+  // copy other's lod_start_pos_, to share LOD info.
+  // NOTE the LOD info should not be changed.
+  void ShareConstLODFrom(const LODTensor &other) {
+    lod_start_pos_ = other.lod_start_pos_;
+  }
+  // copy other's lod_start_pos_'s content, free to mutate.
+  void ShareMutableLODFrom(const LODTensor &other) {
+    lod_start_pos_ = std::make_shared <
+                     std::vector<std::vector<int>>(other.lod_start_pos_.begin(),
+                                                   other.lod_start_pos_.end());
+  }
+private:
+  std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+};
+```
+Among them, `lod_start_pos_` uses `shared_ptr` to reduce the cost of storage and replication.
+`LODTensor` can be thought as an extension of `Tensor`, which is almost completely compatible with the original `Tensor`.
+## How to support the framework
+### Replace `Tensor` with `LoDTensor`
+To implement the passing of `LODTensor`, most `Tensor` in the framework need to be replaced with `LODTensor`.
+A simple implementation is to directly **replace all previous `Tensor` with `LODTensor`**; this can be done by modifying the `Tensor` interface created in `pybind.cc`.
+In addition, users may need to be aware of the sequence structure (for example, visualizing a sequence requires parsing the output sequence of the model), so some of the sequence operation APIs also need to be exposed to the Python layer.
+### Transmit `lod_start_pos` along with the Op call chain
+`lod_start_pos` is passed along with the Op call chain
+The framework needs to support the following features to implement the transmit of `lod_start_pos`:
+1. Implement the transfer as `shared_ptr`
+    - A consumer must not modify the contents of `lod_start_pos`
+    - Only a producer may modify `lod_start_pos`
+    - By convention, a consumer only needs to copy the `shared_ptr` passed to it
+    - A producer needs to create its own independent memory to store its own modifications and expose the resulting `shared_ptr` to subsequent consumers
+    - Since the transfer process is implemented by copying `shared_ptr`, the framework only needs to pass `lod_start_pos` once.
+2. Op is transparent enough not to sense `lod_start_pos`
+3. Producer Op that needs to modify `lod_start_pos` can update its `lod_start_pos` data when `Run`
+## sorted by length
+After sorting by length, the batch size from the forward time step will naturally decrement, and you can directly plug it into Net to do the batch calculation.
+For example, the original input:
+```
+origin:
+xxxx
+xx
+xxx
+-> sorted:
+xxxx
+xxx
+xx
+```
+After `SegmentInputs`, there will be 4 time steps, the input of each time step is as follows (vertical arrangement)
+```
+0    1    2    3
+x    x    x    x
+x    x    x
+x    x
+```
+In order to track the changes before and after sorting, we use the following structure:
+```c++
+struct SortedSeqItem {
+   void *start{nullptr};
+   void *end{nullptr};
+};
+std::vector<SortedSeqItem> sorted_seqs;
+```
+This records the position of each sequence after sorting. We also add a new interface:
+```c++
+std::vector<SortedSeqItem> SortBySeqLen(const LODTensor& tensor);
+```
+Because the order of the input sequences changes after sorting, the following existing interfaces need to be modified:
+- InitMemories, memory needs to be rearranged according to `sorted_seqs`
+- SegmentInputs
+- ConcatOutputs
+In addition, because `sorted_seqs` also needs to be reused by `RecurrentGradientOp`, it will become a new output of `RecurrentOp`.
+It is passed in as an input to `RecurrentGradientOp`.
+## InitMemories
+Due to the sequence change, the order of the elements on the `boot_memories` batch also needs to be rearranged accordingly.
+## SegmentInputs
+`SegmentInputs` relies on the information of `sorted_seqs` to cut the original sequence from the horizontal to the input of each step in the sorted sequence order.
+The transition is as follows:
+```
+origin:
+xxxx
+xx
+xxx
+   |
+   |
+  \ /
+   !
+0    1    2    3
+x    x    x    x
+x    x    x
+x    x
+```
+## ConcatOutputs
+`ConcatOutputs` needs to:
+- Restore the output of each time step back to the original input sequence order (to prevent the order of Infer phase from being upset)
+- Concat each sequence as a regular mini-batch representation
+## references
+1. [Level of detail](https://en.wikipedia.org/wiki/Level_of_detail)
--- a/doc/fluid/design/index_cn.rst
+++ b/doc/fluid/design/index_cn.rst
@@ -9,7 +9,7 @@
  concepts/index_cn.rst
  data_type/index_cn.rst
  memory/index_cn.rst
-  muti_devices/index_cn.rst
+  multi_devices/index_cn.rst
  dynamic_rnn/index_cn.rst
  concurrent/index_cn.rst
  algorithm/index_cn.rst

--- a/doc/fluid/design/index_en.rst
+++ b/doc/fluid/design/index_en.rst
@@ -9,7 +9,7 @@ Design
  concepts/index_en.rst
  data_type/index_en.rst
  memory/index_en.rst
-  muti_devices/index_en.rst
+  multi_devices/index_en.rst
  dynamic_rnn/index_en.rst
  concurrent/index_en.rst
  algorithm/index_en.rst

--- a/doc/fluid/design/modules/python_api.md
+++ b/doc/fluid/design/modules/python_api.md
@@ -36,7 +36,7 @@ Please be aware that these Python classes need to maintain some construction-tim
 ### Program
-A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s.  The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` onlys stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array.  For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks.
+A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), which is composed of an array of `BlockDesc`s.  The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` only stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array.  For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks.
 Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`.
@@ -70,7 +70,7 @@ class Program(objects):
 ### Block
-A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) includes
+A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md) includes
 1. a map from variable names to an instance of the Python `Variable` class, and
 1. a list of `Operator` instances.

--- a/doc/fluid/design/modules/regularization.md
+++ b/doc/fluid/design/modules/regularization.md
@@ -32,9 +32,9 @@ In the new design, we propose to create new operations for regularization. For n
 - L2_regularization_op
 - L1_regularization_op
-These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties.
+These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties.
-The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API.
+The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API.
 ### Computation Graph
@@ -48,7 +48,7 @@ The Python API will modify this computation graph to add regularization operator
 ### Python API implementation for Regularization
-Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions.
+Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions.
 #### Creation of Regularization ops
 There are two possibilities for creating the regularization ops:
@@ -63,4 +63,4 @@ Since we want to create the regularization ops in a lazy manner, the regularizat
 #### High-level API
-In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
+In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
--- a/doc/fluid/design/motivation/fluid_compiler.md
+++ b/doc/fluid/design/motivation/fluid_compiler.md
@@ -23,7 +23,7 @@ func paddlepaddle() {
 }
 ```
-This program consists of a [block](block.md) of three operators --
+This program consists of a [block](../concepts/block.md) of three operators --
 `read`, `assign`, and `mult`.  Its `ProgramDesc` message looks like
 the following
@@ -39,7 +39,7 @@ message ProgramDesc {
  }
 }
 ```
 ## Transpilers
 We can write a transpiler program that takes a `ProgramDesc`, e.g.,
@@ -93,7 +93,7 @@ specific hardware platform, for example, the `mult` operator, the
 generated code should call its CUDA kernel:
 ```c++
-paddle::Tensor fluid_cuda_mult(const paddle::Tensor& a, 
+paddle::Tensor fluid_cuda_mult(const paddle::Tensor& a,
                               const paddle::Tensor& b) {
  paddle::Tensor t;
  paddle::operator::Mult m(a, b, ...);
@@ -107,4 +107,4 @@ where `cuda_context` could be a global variable of type
 ## Multi-Block Code Generation
 Most Fluid application programs may have more than one blocks.  To
-execute them, we need to trace [scopes](scope.md).
+execute them, we need to trace [scopes](../concepts/scope.md).
--- a/doc/fluid/design/motivation/refactorization.md
+++ b/doc/fluid/design/motivation/refactorization.md
@@ -11,7 +11,7 @@ The goals of refactoring include:
 1. PaddlePaddle represents the computation, training and inference of Deep Learning models, by computation graphs.
-  1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a concrete example.
+  1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/others/graph.md) for a concrete example.
 1. Users write Python programs to describe the graphs and run them (locally or remotely).
@@ -28,7 +28,7 @@ The goals of refactoring include:
      1. the C++ library `libpaddle.so` for local execution,
      1. the master process of a distributed training job for training, or
      1. the server process of a Kubernetes serving job for distributed serving.
-   1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message.
+   1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L70), according to the protobuf message.
 ## Description and Realization of Computation Graph
@@ -48,16 +48,16 @@ At runtime, the C++ program realizes the graph and runs it.
 <tr>
 <td>Data</td>
 <td>
-<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107">VarDesc</a></td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L107">VarDesc</a></td>
 <td>
-<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24">Variable</a></td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24">Variable</a></td>
 </tr>
 <tr>
 <td>Operation </td>
 <td>
-<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35">OpDesc</a></td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L35">OpDesc</a></td>
 <td>
-<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64">Operator</a></td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L64">Operator</a></td>
 </tr>
 <tr>
 <td>Block </td>
@@ -85,7 +85,7 @@ The word *graph* is interchangeable with *block* in this document.  A graph cons
 1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following:
-   1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block,
+   1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/scope.md) for each run of a block,
      1. realize local variables defined in the BlockDesc message in the new scope,
      1. a scope is similar to the stack frame in programming languages,
@@ -195,7 +195,7 @@ Maintaining a map, whose key is the type name and the value is the corresponding
 ## Related Concepts
 ### Op_Maker
-It's constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
+Its constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L37))
 ### Register Macros
 ```cpp
@@ -236,7 +236,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
 * `Tensor` is an n-dimension array with type.
 	* Only dims and data pointers are stored in `Tensor`.
 	* All operations on `Tensor` are written in `Operator` or global functions.
-	* Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
+	* Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md)
 * `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`.
 	* `step_scopes` in RNN is a variable and not a tensor.
 * `Scope` is where variables are stored.

--- a/doc/fluid/design/muti_devices/index_cn.rst
+++ b/doc/fluid/design/muti_devices/index_cn.rst
--- a/doc/fluid/design/muti_devices/index_en.rst
+++ b/doc/fluid/design/muti_devices/index_en.rst
--- a/doc/fluid/design/muti_devices/kernel_hint_design.md
+++ b/doc/fluid/design/muti_devices/kernel_hint_design.md
 # Kernel Hint Design
 ## Problem
-In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this.
+In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this.
 In the current design, we use KernelType to describe one kernel.
@@ -14,7 +14,7 @@ struct KernelType {
 ```
 `place_` `data_type_` and `layout_` can be got from the input tensors of the operator, `GetActualKernelType(inputs)` use inputs to infer the proper kernel key that fit the incoming data, but users can not directly configure it.
-The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) also provides a virtual method `GetExpectedKernelType` that user can overload and use to choose the KernelType they want to use.
+The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md) also provides a virtual method `GetExpectedKernelType` that user can overload and use to choose the KernelType they want to use.
 So we should send the information user defined in proto to `GetExpectedKernelType` for choosing a kernel.

--- a/doc/fluid/design/muti_devices/kernel_selection.md
+++ b/doc/fluid/design/muti_devices/kernel_selection.md
--- a/doc/fluid/design/muti_devices/operator_kernel_type.md
+++ b/doc/fluid/design/muti_devices/operator_kernel_type.md
@@ -8,7 +8,7 @@ struct OpKernelType {
  proto::DataType data_type_;
 };
 ```
-For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) in github.
+For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L348-L374) in github.
 It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys do not provide enough information. We need a more complete representation of `OpKernelType`.

--- a/doc/fluid/design/network/sequence_decoder.md
+++ b/doc/fluid/design/network/sequence_decoder.md
@@ -11,7 +11,7 @@ In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` imp
 There are a lot of heuristic tricks in the sequence generation tasks, so the flexibility of sequence decoder is very important to users.
-During the refactoring of PaddlePaddle, some new concepts are proposed such as:  [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** .
+During the refactoring of PaddlePaddle, some new concepts are proposed such as:  [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** .
 For example, the RNN states, candidates IDs and probabilities of beam search can be represented all as `LoDTensors`;
 the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated.

--- a/doc/fluid/design/onnx/images/project_structure.png
+++ b/doc/fluid/design/onnx/images/project_structure.png
--- a/doc/fluid/design/onnx/onnx_convertor.md
+++ b/doc/fluid/design/onnx/onnx_convertor.md
+# Background
+[ONNX (Open Neural Network Exchange)](https://github.com/onnx/onnx) bridges different deep learning frameworks by providing an open source graph format for models. The models trained in other frameworks can be converted into the ONNX format to execute inference by utilizing the built-in operators in ONNX - this is called a **frontend**. With the inverse conversion (called a **backend**), different frameworks can share any models supported by ONNX in principle. Now most mainstream frameworks have joined the ONNX community, e.g. Caffe2, PyTorch, and MXNet etc. And there is a momentum driving more and more vendors to begin supporting ONNX or even choose ONNX as the only machine learning runtime in their devices.
+Therefore, it is necessary to enable the conversion between PaddlePaddle and ONNX. This design doc is aimed at implementing a convertor, mainly for converting between **Fluid** models and ONNX (it is very likely that we may support older v2 models in the future). A complete convertor should be bidirectional - with a frontend AND a backend, but considering their relative importance, we will start with the frontend, i.e. converting Fluid models to ONNX models.
+# How it works
+ONNX has a [working list of operators](https://github.com/onnx/onnx/blob/master/docs/Operators.md) which is versioned.
+When prioritizing implementation of a frontend over a backend, choice of coverage of Fluid -> ONNX operators comes down to choices of models to be supported (see section `Supported models`). Eventually, this will allow us to reach a really-wide coverage of all operators.
+Here are a few major considerations when it comes to converting models:
+- **Op-level conversion**: How to map the inputs, attributes, and outputs of each Paddle operator to those of the ONNX operator. In several cases, these require transformations. For each direction (frontend vs. backend), a different conversion mapping is needed.
+- **Parameters (weights) initialization**: Setting initial parameters on different nodes.
+- **Tensor data type mapping** (Note: Some ONNX data types are not supported in Fluid)
+- **Network representation adaptation**: Fluid `ProgramDesc` includes nested blocks. Since ONNX is free of nesting, the `ProgramDesc` ops need to be traversed to only include ops from the global scope in the root block. The variables used as inputs and outputs should also be in this scope.
+- **Model validation**: There are two kinds of validations that are necessary:
+   1. We need to ensure that the inference outputs of the ops run inside a model are the same as those obtained when running the ONNX converted ops through an alternative ONNX backend.
+   2. Checking to see if the generated nodes on the graph are validated by the internal ONNX checkers.
+- **Versioning**: ONNX versions its op listing over versions. In fact, it has versioning on 3 different levels: ops, graphs, and ONNX models. This requires that we are conscious about versioning the convertor and updating tests and op convertor logic for each release. It also implies that we release pre-trained ONNX models upon each version release.
+One thing that makes this conversion more feasible in Fluid's case is the use of a static IR - the `ProgramDesc` - as opposed to a dynamic graph, as created in the cases of frameworks like PyTorch.
+# Project structure
+<p align="center">
+<img src="./images/project_structure.png"/>
+</p>
+The project contains four important parts:
+* **fluid**: The directory that contains wrappers for fluid related APIs. Fluid has provided some low-level APIs to parse or generate the inference model. However, directly using these low-level APIs makes the code tediously long. This module wraps low-level APIs to provide simplified interfaces.
+* **onnx**: This is a Python package provided by ONNX containing helpers for creating nodes, graphs, and eventually binary protobuf models with initializer parameters.
+* **onnx_fluid**: Contains two-way mapping (Fluid -> ONNX ops and ONNX -> Fluid ops). Called from `convert.py`, the program uses this mapping along with modifier functions to construct ONNX nodes with the help of ONNX's `make_node` helper. It also contains mapping between datatypes and tensor deprecation / amplification logic.
+* **convert.py**: The interface exposed to users. This will traverse the global program blocks/variables and construct the write-able model.
+# Usage
+The converter should be designed to be very easy to use. Bidirectional conversion between a Fluid inference model and an ONNX binary model will be supported. Model validation will also be provided to verify the correctness of the converted model.
+* Convert Fluid inference model to ONNX binary model
+    ```
+    python convert.py --fluid_model <fluid inference model> --onnx_model <ONNX model> validate True
+    ```
+* Validate the converted model
+    ```
+    python validate.py --fluid_model <fluid inference model> --onnx_model <ONNX model>
+    ```
+The conversion and model validation will be completed consecutively, and finally a readable description of the model structure will be output. For the reverse conversion, users only need to exchange the input and output.
+# Challenges and mitigation
+## Cycles
+Cycles are unsupported in ONNX. In Paddle, the `while` op is the most prominent example of a cycle.
+*Resolution*: We won't support models with `while`s which can't be substituted until ONNX adds support for such ops.
+## Sequences
+Sequence processing operators like `sequence_expand`, `sequence_reshape`, `sequence_concat`, and `sequence_pool` are not supported by ONNX either, because ONNX does not support non-padded datatypes like LoDTensors.
+*Resolution*: Since the runtimes using our ONNX exported graphs won't be using LoDTensors in the first place, such sequence operators should be mapped to ONNX ops that will do the necessary transposing ops with the knowledge of the padding and shape of the Tensors.
+## Ops that can't easily be mapped
+There are ops that just aren't possible to map today:
+**Control flow operators**
+Paddle supports control flow ops like `If/Else` and `Switch` (if we ignore the CSP operations like `select` for now). ONNX has `If` support in the experimental phase.
+*Resolution*: Map Paddle's `If/Else` to ONNX's `If`, but ignore other control flow operators until ONNX brings support for them.
+**Non-existent in Fluid**
+There are several ONNX operators that are not available in Fluid today, e.g. `InstanceNormalization`, `RandomUniform`, `Unsqueeze`, etc.
+*Resolution*: For the initial phase, we can choose to not support ops that our models don't care for and are subsequently not available in Fluid. However, for ops that we think might be necessary for Fluid users also, we must implement them on our side and support the ONNX conversion to them. This list is TBD.
+**Concurrency**
+ONNX does not have any considerations for concurrency right now.
+*Resolution*: There are two ways to approach this:
+a. We choose to not support concurrent models.
+b. We only support `go_op`s (basically threads) shallowly. This could mean that we enqueue `go_op` ops prior to gradient calculations OR even prior to the entire graph, and that's it - since `go_op`s do not have support for backprop anyways. One of the core target use cases of `go_op`: batch reading - can be handled through this approach.
+**Overloaded in Fluid**
+There are ops in ONNX whose job can't be accomplished by a single corresponding Paddle operator, but only by a collection of operators.
+*Resolution*: Chain multiple Paddle operators.
+## Lack of LoDTensors
+As stated above, ONNX only supports simple Tensor values.
+*Resolution*: Deprecate to plain old numpy-able tensors.
+## Reconstruction from deprecated ONNX ops
+For higher-level Fluid ops, such as a few offered by the `nn` layer that do not have direct corresponding mappings but can be converted to ONNX by chaining a series of ops without cycles, it would be useful to map them back to the higher-level Fluid ops once converted back from the deprecated ONNX graphs.
+*Resolution*: Graphs that have the deprecation from Paddle -> ONNX. When converting back from ONNX, if we encounter the identical graphs by doing a forward search, we can replace the subgraphs with the matching ONNX op.
+# Supported models
+As mentioned above, potential risks may come from the conversion of sequence-related models, including the LoDTensor, ```if/else``` and ```while``` operators. So a good choice is to focus on some important feedforward models first, then implement some simple recurrent models.
+- Feedforward models: common models selected in PaddleBook, e.g. VGG, ResNet and some other models proposed by application teams.
+- Recurrent models: language model, stacked LSTMs etc.
--- a/doc/fluid/dev/contribute_to_paddle_cn.md
+++ b/doc/fluid/dev/contribute_to_paddle_cn.md
+../../v2/dev/contribute_to_paddle_cn.md
\ No newline at end of file
--- a/doc/fluid/dev/contribute_to_paddle_en.md
+++ b/doc/fluid/dev/contribute_to_paddle_en.md
+../../v2/dev/contribute_to_paddle_en.md
\ No newline at end of file
--- a/doc/fluid/dev/index_cn.rst
+++ b/doc/fluid/dev/index_cn.rst
@@ -4,6 +4,8 @@
 .. toctree::
  :maxdepth: 1
+  contribute_to_paddle_cn.md
+  write_docs_cn.md
  api_doc_std_cn.md
  new_op_cn.md
  new_op_kernel.md

--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
@@ -4,6 +4,8 @@ Development
 .. toctree::
  :maxdepth: 1
+  contribute_to_paddle_en.md
+  write_docs_en.md
  api_doc_std_en.md
  new_op_en.md
  new_op_kernel.md

--- a/doc/fluid/dev/name_convention.md
+++ b/doc/fluid/dev/name_convention.md
@@ -4,7 +4,7 @@ To make the operator document itself more clear, we recommend operator names obe
 ## OpProtoMaker names
-When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L170) (TODO: OpProtoMaker Doc)need to be defined. All the Input/Output and Attributes will write into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L61) , and will be used in client language to create operator.
+When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L170) (TODO: OpProtoMaker Doc) needs to be defined. All the Inputs/Outputs and Attributes will be written into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L61), and will be used in the client language to create the operator.
 - Input/Output.
  - Input/Output names follow the **CamelCase**. e.g. `X`, `Y`, `Matrix`, `LastAxisInMatrix`. Input/Output much more like Variables, we prefer to meaningful English words.

--- a/doc/fluid/dev/new_op_cn.md
+++ b/doc/fluid/dev/new_op_cn.md
@@ -54,10 +54,10 @@
 </table>
-实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
+实现新的op都添加至目录[paddle/fluid/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
-下面以矩阵乘操作，即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
+下面以矩阵乘操作，即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
 ## 实现C++类
@@ -85,17 +85,17 @@ The equation is: Out = X * Y
 };
 ```
-[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`，构造函数含有2个参数：
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127)继承自`framework::OpProtoAndCheckerMaker`，构造函数含有2个参数：
   - `framework::OpProto` ： 前者存储Op的输入输出和参数属性，将用于Python API接口的生成。
   - `framework::OpAttrChecker` ：后者用于检查参数属性的合法性。
 构造函数里通过`AddInput`添加输入参数，通过`AddOutput`添加输出参数，通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`中。
-上面的代码在`MulOp`中添加两个输入`X`和`Y`，添加了一个输出`Out`，并解释了各自含义，命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md)。
+上面的代码在`MulOp`中添加两个输入`X`和`Y`，添加了一个输出`Out`，并解释了各自含义，命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md)。
-再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)为例：
+再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55)为例：
 ```cpp
 template <typename AttrType>
@@ -103,21 +103,21 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input tensor of scale operator.").NotInGradient();
+    AddInput("X", "(Tensor) Input tensor of scale operator.");
-    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
+    AddOutput("Out", "(Tensor) Output tensor of scale operator.");
-    AddComment(R"DOC(Scale operator
+    AddComment(R"DOC(
-The equation is: Out = scale*X
+Scale operator
+$$Out = scale*X$$
 )DOC");
-    AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
+    AddAttr<AttrType>("scale",
+                      "(float, default 1.0)"
+                      "The scaling factor of the scale operator.")
+        .SetDefault(1.0);
  }
 };
 ```
-这个例子有两处不同：
+这个例子有`AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。
- `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中，如果Op的某个输入不参与反向梯度的计算，请显示地调用`.NotInGradient()`进行设置。
- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。
 ### 定义Operator类
@@ -147,7 +147,7 @@ class MulOp : public framework::OperatorWithKernel {
 };
 ```
-[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员：
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员：
 ```cpp
 using framework::OperatorWithKernel::OperatorWithKernel;
@@ -173,7 +173,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 `MulKernel`继承自`framework::OpKernel`，带有下面两个模板参数:
- `typename DeviceContext`: 表示设备类型，不同设备(CPU、CUDA)共享同一个Kernel时，需加该模板参数，不共享则不加，一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
+- `typename DeviceContext`: 表示设备类型，不同设备(CPU、CUDA)共享同一个Kernel时，需加该模板参数，不共享则不加，一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。
 - `typename T` : 表示数据类型，如`float`, `double`等。
@@ -201,10 +201,9 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 需要注意：**不同设备(CPU、CUDA)共享一个Op定义，是否则共享同一个`OpKernel`，取决于`Compute`调用的函数是否支持不同设备。**
-`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考：[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
+`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考：[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。
-为了使`OpKernel`的计算过程书写更加简单，并且CPU、CUDA的代码可以复用，我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。
+为了使`OpKernel`的计算过程书写更加简单，并且CPU、CUDA的代码可以复用，我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_cn.md)。
 到此，前向Op实现完成。接下来，需要在`.cc`文件中注册该op和kernel。
 反向Op类的定义，反向OpKernel的定义与前向Op类似，这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。
@@ -215,7 +214,9 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
    ```cpp
    namespace ops = paddle::operators;
-    REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+    REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>)
+    REGISTER_OPERATOR(mul_grad, ops::MulGradOp)
    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
    REGISTER_OP_CPU_KERNEL(mul_grad,
                  ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
@@ -223,8 +224,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
   在上面的代码中：
-    - `REGISTER_OP` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulOpGrad`，类型名为`mul_grad`。
+    - `REGISTER_OPERATOR` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulOpGrad`，类型名为`mul_grad`。
-    - `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op。
    - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulGradKernel`类。
@@ -255,7 +255,7 @@ make mul_op
 ## 实现单元测试
-单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。
+单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py)。
 ### 前向Operator单测
@@ -315,7 +315,7 @@ Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp
 ### 编译和执行
-`python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。
+`python/paddle/fluid/tests/unittests/` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。
 请注意，**不同于Op的编译测试，运行单元测试测时需要编译整个工程**，并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON`。编译成功后，执行下面的命令来运行单元测试：
@@ -331,7 +331,6 @@ ctest -R test_mul_op
 ## 注意事项
- 为每个Op创建单独的`*_op.h`（如有）、`*_op.cc`和`*_op.cu`（如有）。不允许一个文件中包含多个Op，这将会导致编译出错。
+- 注册Op时的类型名，需要和该Op的名字一样。即不允许在`A_op.cc`里面，注册`REGISTER_OPERATOR(B, ...)`等，这将会导致单元测试出错。
- 注册Op时的类型名，需要和该Op的名字一样。即不允许在`A_op.cc`里面，注册`REGISTER_OP(B, ...)`等，这将会导致单元测试出错。
 - 如果Op没有实现CUDA Kernel，请不要创建空的`*_op.cu`，这将会导致单元测试出错。
 - 如果多个Op依赖一些共用的函数，可以创建非`*_op.*`格式的文件来存放，如`gather.h`文件。
--- a/doc/fluid/dev/new_op_en.md
+++ b/doc/fluid/dev/new_op_en.md
@@ -26,13 +26,6 @@ Here are the base types needed. For details, please refer to the design docs.
 Operators can be categorized into two groups: operator with kernel(s) and operator without kernel(s). An operator with kernel(s) inherits from `OperatorWithKernel` while the one without kernel(s) inherits from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information:
- Information           | Where is it defined
--------------  | :----------------------
-OpProtoMake definition  | `.cc`files, Backward Op does not need an OpProtoMake interface.
-Op definition           | `.cc` files
-Kernel implementation       | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.
-Registering the Op           | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
 <table>
 <thead>
 <tr>
@@ -61,10 +54,10 @@ Registering the Op           | Ops are registered in `.cc` files; For Kernel reg
 </table>
-New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
+New Operator implementations are added to the directory [paddle/fluid/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable). **The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
-Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel.
+Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel.
 ## Implementing C++ Types
@@ -92,17 +85,17 @@ The equation is: Out = X * Y
 };
 ```
-[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)is inherited from`framework::OpProtoAndCheckerMaker`, consisting of 2 variables in the constructor：
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127) is inherited from `framework::OpProtoAndCheckerMaker`, consisting of 2 variables in the constructor:
   - `framework::OpProto` stores Operator input and variable attribute, used for generating Python API interfaces.
   - `framework::OpAttrChecker` is used to validate variable attributes.
 The constructor utilizes `AddInput`, `AddOutput`, and `AddComment`, so that the corresponding information will be added to `OpProto`.
-The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance to Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md).
+The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance with Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md).
-An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37) is implemented as follows:
+An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55) is implemented as follows:
 ```cpp
 template <typename AttrType>
@@ -120,11 +113,7 @@ The equation is: Out = scale*X
 };
 ```
-There are two changes in this example:
+Note `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` adds the `scale` constant as an attribute, and sets the default value to 1.0.
- `AddInput("X","...").NotInGradient()` expresses that input `X` is not involved in `ScaleOp`'s corresponding computation. If an input to an operator is not participating in back-propagation, please explicitly set `.NotInGradient()`.
- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);`  adds `scale`constant as an attribute, and sets the default value to 1.0.
 ### Defining Operator
@@ -154,7 +143,7 @@ class MulOp : public framework::OperatorWithKernel {
 };
 ```
-[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22) is inherited from `OperatorWithKernel`. Its `public` member
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L24) is inherited from `OperatorWithKernel`. Its `public` member
 ```cpp
 using framework::OperatorWithKernel::OperatorWithKernel;
@@ -180,7 +169,7 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
 `MulKernel` inherits `framework::OpKernel`, which includes the following templates:
- `typename  DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+- `typename  DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43).
 - `typename T` denotes data type, such as `float` or `double`.
@@ -209,9 +198,9 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
 Note that **different devices (CPU, CUDA)share one Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions can support both devices.**
-`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.cc).
-To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md).
+To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_en.md).
 This concludes the forward implementation of an operator. Next its operation and kernel need to be registered in a `.cc` file.
@@ -224,7 +213,9 @@ The definition of its corresponding backward operator, if applicable, is similar
    ```cpp
    namespace ops = paddle::operators;
-    REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+    REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+    REGISTER_OPERATOR(mul_grad, ops::MulGradOp);
    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
    REGISTER_OP_CPU_KERNEL(mul_grad,
@@ -233,9 +224,8 @@ The definition of its corresponding backward operator, if applicable, is similar
   In that code block,
-    - `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
+    - `REGISTER_OPERATOR` registers the `ops::MulOp` class under the type name `mul` with `ops::MulOpMaker` as its `ProtoMaker`; a second `REGISTER_OPERATOR` call registers `ops::MulGradOp` as `mul_grad`.
    - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
    - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
@@ -275,7 +265,7 @@ Unit tests for an operator include
 3. a scaling test for the backward operator.
-Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py).
+Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py).
 ### Testing Forward Operators
@@ -339,7 +329,7 @@ Some key points in checking gradient above include:
 ### Compiling and Running
-Any new unit testing file of the format `test_*.py`  added to the director `python/paddle/v2/framework/tests` is automatically added to the project to compile.
+Any new unit testing file of the format `test_*.py` added to the directory `python/paddle/fluid/tests/unittests/` is automatically added to the project to compile.
 Note that **unlike the compile test for Ops, running unit tests requires compiling the entire project** and requires compiling with flag `WITH_TESTING` on i.e. `cmake paddle_dir -DWITH_TESTING=ON`.
@@ -357,7 +347,6 @@ ctest -R test_mul_op
 ## Remarks
- Every `*_op.h` (if applicable), `*_op.cc`, and `*_op.cu` (if applicable) must be created for a unique Op. Compiling will fail if multiple operators are included per file.
+- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OPERATOR(B, ...)` in `A_op.cc` will cause unit testing failures.
- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OP(B, ...)` in `A_op.cc` will cause unit testing failures.
 - If the operator does not implement a CUDA kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail.
 - If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`.
--- a/doc/fluid/dev/new_op_kernel.md
+++ b/doc/fluid/dev/new_op_kernel.md
@@ -4,13 +4,13 @@
 PaddlePaddle Fluid have hundreds of operators.  Each operator could have one or more kernels.  A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU.
-[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) explains how to add an operator, and its kernels.  The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md).  An operator chooses the right kernel at runtime.  This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md).
+[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md) explains how to add an operator, and its kernels.  The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/multi_devices/operator_kernel_type.md).  An operator chooses the right kernel at runtime.  This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md).
 ## Write Kernels for A New Device
 ### Add A New Device
-  For some historical reaons, we misuse the word *library* for *device*.  For example, we call the deivce type by *library type*.  An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/library_type.h#L24).  We will correct this ASAP.
+  For some historical reasons, we misuse the word *library* for *device*.  For example, we call the device type by *library type*.  An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/library_type.h#L24).  We will correct this ASAP.
 To register a new device, we need to add an enum value to `LibraryType`:
@@ -23,9 +23,9 @@ enum class LibraryType {
 ```
-### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53)
+### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L53)
-If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53). For example `CUDAPlace`:
+If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L53). For example `CUDAPlace`:
 ```cpp
 struct CUDAPlace {
@@ -45,8 +45,8 @@ struct CUDAPlace {
 typedef boost::variant<CUDAPlace, CPUPlace> Place;
 ```
-### Add [device context]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37))
+### Add [device context](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h#L37)
-After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37) for it.
+After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h#L37) for it.
 ```cpp
 class DeviceContext {
@@ -58,9 +58,9 @@ class DeviceContext {
 };
 ```
-### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device.
+### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L351) for your Device.
-A detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md)
+A detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md)
 ```cpp
 class OpKernelBase {
@@ -101,7 +101,7 @@ REGISTER_OP_KERNEL(
 kernel0, kernel1 are kernels that have the same `op_type`, `library_type`, `place_type` but different `data_types`.
-take [`conv2d`]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/conv_cudnn_op.cu.cc#L318)) as an example:
+Take [`conv2d`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/conv_cudnn_op.cu.cc#L318) as an example:
 	```cpp
 	REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace,

--- a/doc/fluid/dev/support_new_device.md
+++ b/doc/fluid/dev/support_new_device.md
@@ -13,7 +13,7 @@ So, how to support a new Device/Library in Fluid becomes a challenge.
 ## Basic: Integrate A New Device/Library
-For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/read_source.md).
+For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/read_source.md).
 There are mainly three parts that we have to consider while integrating a new device/library:
@@ -28,7 +28,7 @@ There are mainly three parts that we have to consider while integrating a new de
 Please note that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.
 #### Place
-Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.
 ```
        |   CPUPlace
@@ -44,7 +44,7 @@ typedef boost::variant<CUDAPlace, CPUPlace, FPGAPlace> Place;
 #### DeviceContext
-Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different libraries, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
+Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h#L30) to manage the resources in different libraries, such as CUDA stream in `CUDADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
 ```
@@ -73,7 +73,7 @@ class CUDADeviceContext : public DeviceContext {
  Place GetPlace() const override { return place_; }
 private:
  CUDAPlace place_;
-  cudaStream_t stream_; 
+  cudaStream_t stream_;
  cublasHandle_t cublas_handle_;
  std::unique_ptr<Eigen::GpuDevice> eigen_device_;  // binds with stream_
 };
@@ -84,7 +84,7 @@ private:
 #### memory module
-Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/memory/memory.h#L36):
+Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/memory/memory.h#L36):
 ```
 template <typename Place>
@@ -102,7 +102,7 @@ To implement these interfaces, we have to implement MemoryAllocator for differen
 #### Tensor
-[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h#L36) holds data with some shape in a specific Place.
+[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/tensor.h#L36) holds data with some shape in a specific Place.
 ```cpp
 class Tensor {
@@ -161,7 +161,7 @@ t.mutable_data(place);
 Fluid implements computing units based on different DeviceContexts. Some computing units are shared between operators. This common part will be put in operators/math directory as basic Functors.
-Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example:
+Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/math/maxouting.h#L27) as an example:
 The interface is defined in the header file.
@@ -210,7 +210,7 @@ The implementation of `OpKernel` is similar to math functors, the extra thing we
 Fluid provides different register interfaces in op_registry.h
-Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/crop_op.cc#L134) operator as an example:
+Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/crop_op.cc#L134) operator as an example:
 In .cc file:
@@ -236,5 +236,5 @@ Generally, we will implement OpKernel for all Device/Library of an Operator. We
 For more details, please refer to following docs:
- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md)
+- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/multi_devices/operator_kernel_type.md)
- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md)
+- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md)
--- a/doc/fluid/dev/write_docs_cn.rst
+++ b/doc/fluid/dev/write_docs_cn.rst
+../../v2/dev/write_docs_cn.rst
\ No newline at end of file
--- a/doc/fluid/dev/write_docs_en.rst
+++ b/doc/fluid/dev/write_docs_en.rst
+../../v2/dev/write_docs_en.rst
\ No newline at end of file
--- a/doc/fluid/getstarted/quickstart_cn.rst
+++ b/doc/fluid/getstarted/quickstart_cn.rst
-../../v2/getstarted/quickstart_cn.rst
\ No newline at end of file
--- a/doc/fluid/getstarted/quickstart_cn.rst
+++ b/doc/fluid/getstarted/quickstart_cn.rst
+快速开始
+========
+快速安装
+--------
+PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12，并安装有Python2.7。
+执行下面的命令完成快速安装，版本为cpu_avx_openblas：
+  .. code-block:: bash
+     pip install paddlepaddle
+如果需要安装支持GPU的版本（cuda7.5_cudnn5_avx_openblas），需要执行：
+  .. code-block:: bash
+     pip install paddlepaddle-gpu
+更详细的安装和编译方法参考： :ref:`install_steps` 。
+快速使用
+--------
+创建一个 housing.py 并粘贴此Python代码：
+  .. code-block:: python
+     import paddle.dataset.uci_housing as uci_housing
+     import paddle.fluid as fluid
+     with fluid.scope_guard(fluid.core.Scope()):
+         # initialize executor with cpu
+         exe = fluid.Executor(place=fluid.CPUPlace())
+         # load inference model 
+         [inference_program, feed_target_names,fetch_targets] =  \
+             fluid.io.load_inference_model(uci_housing.fluid_model(), exe)
+         # run inference
+         result = exe.run(inference_program, 
+                          feed={feed_target_names[0]: uci_housing.predict_reader()}, 
+                          fetch_list=fetch_targets)
+         # print predicted price is $12,273.97 
+         print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000)
+执行 :code:`python housing.py` 瞧！ 它应该打印出预测住房数据的清单。
--- a/doc/fluid/getstarted/quickstart_en.rst
+++ b/doc/fluid/getstarted/quickstart_en.rst
-../../v2/getstarted/quickstart_en.rst
\ No newline at end of file
--- a/doc/fluid/getstarted/quickstart_en.rst
+++ b/doc/fluid/getstarted/quickstart_en.rst
+Quick Start
+============
+Quick Install
+-------------
+You can use pip to install PaddlePaddle with a single command. It supports
+CentOS 6 or above, Ubuntu 14.04 or above, and MacOS 10.12, with Python 2.7 installed.
+Simply run the following command to install, the version is cpu_avx_openblas:
+  .. code-block:: bash
+     pip install paddlepaddle
+If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
+  .. code-block:: bash
+     pip install paddlepaddle-gpu
+For more details about installation and build: :ref:`install_steps` .
+Quick Use
+---------
+Create a new file called housing.py, and paste this Python
+code:
+  .. code-block:: python
+     import paddle.dataset.uci_housing as uci_housing
+     import paddle.fluid as fluid
+     with fluid.scope_guard(fluid.core.Scope()):
+         # initialize executor with cpu
+         exe = fluid.Executor(place=fluid.CPUPlace())
+         # load inference model 
+         [inference_program, feed_target_names,fetch_targets] =  \
+             fluid.io.load_inference_model(uci_housing.fluid_model(), exe)
+         # run inference
+         result = exe.run(inference_program, 
+                          feed={feed_target_names[0]: uci_housing.predict_reader()}, 
+                          fetch_list=fetch_targets)
+         # print predicted price is $12,273.97 
+         print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000)
+Run :code:`python housing.py` and voila! It should print out a list of predictions
+for the test housing data.
--- a/doc/v2/dev/index_en.rst
+++ b/doc/v2/dev/index_en.rst
@@ -6,6 +6,7 @@ PaddlePaddle adheres to the following three sections of code and document specif
 PaddlePaddle uses git for version control and Docker is used for building and testing environment. The code includes Cuda, C++, Python, Shell and other programming languages，which comply with Google C++ Style, Pep-8, and the code base includes style checking by an automatic inspection tool. Code comments need to follow the Doxygen specification. The code that does not meet the style requirements will fail to compile. We provide the following guidelines for the use of Git, build tests and code development.
 ..  toctree::
  :maxdepth: 1

--- a/doc/v2/faq/local/index_en.rst
+++ b/doc/v2/faq/local/index_en.rst
 #############################
-Local Training and Prediction
+Parameter Setting
 #############################
-TBD
+..  contents::
+1. Reduce Memory Consumption
+------------------------------
+The training procedure of neural networks demands dozens of gigabytes of host memory or several gigabytes of device memory, which makes it a rather memory-intensive task. The memory consumed by the PaddlePaddle framework mainly includes:
+
+* Cache memory for DataProvider (only on host memory),
+* Memory for neurons' activation information (on both host memory and device memory),
+* Memory for parameters (on both host memory and device memory),
+* Other memory demands.
+Other memory demands mainly come from the PaddlePaddle framework itself, such as string allocation and temporary variables, and are not considered here.
+Reduce DataProvider Cache Memory
++++++++++++++++++++++++++++++++++++
+PyDataProvider loads data asynchronously and shuffles it by randomly picking data from a pool in host memory:
+..  graphviz::
+    digraph {
+        rankdir=LR;
+        "Data Files" -> "Host Memory Pool" -> "PaddlePaddle Training"
+    }
+Thus, reducing the DataProvider cache memory reduces memory occupancy and also speeds up the data loading procedure before training. However, the size of the memory pool determines the granularity of the shuffle, so if the memory pool is made smaller while the data must still be random, it is best to shuffle the data files before each read.
+..  literalinclude:: src/reduce_min_pool_size.py
+In this way, the memory consumption can be significantly reduced and hence the training procedure can be accelerated. More details are demonstrated in :ref:`api_pydataprovider2`.
+The Neurons Activation Memory
+++++++++++++++++++++++++++++++++
+Each neuron activation in the training of a neural network produces a certain amount of temporary data, such as the activation data (e.g. the output value of a neuron). These data are used to update the parameters during back propagation. The amount of memory they consume mainly depends on two parameters: the batch size and the length of each sequence. Therefore, the activation memory is proportional to the amount of information contained in each mini-batch.
+Two practical ways:
+* Reduce the batch size. Setting a smaller value in the network configuration (batch_size=1000) can help. However, a smaller batch size may affect the training result, because the batch size is a hyperparameter of the neural network itself.
+* Shorten the sequence length or cut off excessively long sequences. For example, if most sequences in a dataset are between 100 and 200 in length but one sequence is as long as 10,000, it can easily lead to OOM (out of memory), especially in RNN models such as LSTM.
+The Parameters Memory
++++++++++++++++++++++++++
+The PaddlePaddle framework supports almost all popular optimizers. Different optimizers have different memory requirements. For example, :code:`adadelta` consumes approximately 5 times the memory
+of the weight parameters themselves, which means :code:`adadelta` needs at least :code:`500M` of memory if the model file containing all
+parameters is :code:`100M`.
+Some optimization algorithms such as :code:`momentum` are worth giving a shot.
+2. Tricks To Speed Up Training
+------------------------------
+The training procedure of PaddlePaddle may be sped up by considering the following aspects:
+* Reduce the time consumption of data loading
+* Speed up training epochs
+* Introduce more computing resources with the utilization of distribute training frameworks
+Reduce The Time Consumption of Data Loading
++++++++++++++++++++++++++++++++++++++++++++++
+The :code:`pydataprovider` can greatly speed up the data loading procedure if you reduce the size of its cache pool and enable its memory cache. The principle of reducing the :code:`DataProvider` cache pool is basically the same as the method for reducing memory consumption with a smaller cache pool.
+..  literalinclude:: src/reduce_min_pool_size.py
+Besides, the interface :code:`@provider` provides a parameter :code:`cache` to control caching. If it is set to :code:`CacheType.CACHE_PASS_IN_MEM`, the data read during the first :code:`pass` (a pass means all data have been fed into the network once for training) will be cached in memory, and subsequent passes will read from the cached data in memory instead of from the :code:`python` side. This strategy also reduces the time spent on data loading.
+Accelerating Training Epochs
+++++++++++++++++++++++++++++++++
+Sparse training is supported in PaddlePaddle. The features to be trained sparsely must be one of :code:`sparse_binary_vector`, :code:`sparse_vector` and :code:`integer_value`. Meanwhile, the layer that interacts with the training data needs to switch its parameter to sparse updating mode by setting :code:`sparse_update=True`.
+Take :code:`word2vec` as an example, to train a language distance, one needs to predict the middle word with two words prior to it and next to it. The DataProvider of this task is:
+..  literalinclude:: src/word2vec_dataprovider.py
+The configuration of this task is:
+..  literalinclude:: src/word2vec_config.py
+Introduce More Computing Resources
++++++++++++++++++++++++++++++++++++++
+More computing resources can be introduced with following manners:
+* Single CPU platform training
+  * Use multi-threading by setting :code:`trainer_count`.
+* Single GPU platform training
+  * Set :code:`use_gpu` to train on single GPU.
+  * Set :code:`use_gpu` and :code:`trainer_count` to enable multiple GPU training support.
+* Cluster Training
+  * Refer to :ref:`cluster_train`.
+3. Assign GPU Devices
+----------------------
+Assume a computing platform consists of 4 GPUs numbered from 0 to 3:
+* Method1: Specify the GPUs used for computing by setting:
+ `CUDA_VISIBLE_DEVICES <http://www.acceleware.com/blog/cudavisibledevices-masking-gpus>`_
+..      code-block:: bash
+        env CUDA_VISIBLE_DEVICES=2,3 paddle train --use_gpu=true --trainer_count=2
+* Method2: Assign by :code:`--gpu_id`:
+..      code-block:: bash
+        paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
+4. How to Fix Training Termination Caused By :code:`Floating point exception` During Training.
+----------------------------------------------------------------------------------------------------
+The Paddle binary catches floating-point exceptions at runtime and terminates when NaN or Inf occurs. Floating-point exceptions are mostly caused by float overflow or division by zero. There are three main causes of such exceptions:
+* Parameters or gradients during training are oversize, which leads to float overflow during calculation.
+* The model failed to converge and diverges to a big value.
+* Parameters may converge to a singular value due to bad training data. If the scale of input data is too big and contains millions of parameter values, float overflow error may arise when operating matrix multiplication.
+Two ways to solve this problem:
+1. Set :code:`gradient_clipping_threshold` as:
+..  code-block:: python
+    optimizer = paddle.optimizer.RMSProp(
+        learning_rate=1e-3,
+        gradient_clipping_threshold=10.0,
+        regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+For details, refer to the example `nmt_without_attention  <https://github.com/PaddlePaddle/models/blob/develop/nmt_without_attention/train.py#L35>`_.
+2. Set :code:`error_clipping_threshold` as:
+..  code-block:: python
+    decoder_inputs = paddle.layer.fc(
+        act=paddle.activation.Linear(),
+        size=decoder_size * 3,
+        bias_attr=False,
+        input=[context, current_word],
+        layer_attr=paddle.attr.ExtraLayerAttribute(
+            error_clipping_threshold=100.0))
+For details, refer to the example `machine translation <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py#L66>`_.
+The main differences between these two methods are:
+1. They both block the gradient, but on different occasions: the former happens when the :code:`optimizer` updates the network parameters, while the latter happens during the back propagation computation of activation functions.
+2. The block targets are different: the former blocks the trainable parameters’ gradient, while the latter blocks the gradient to be propagated to prior layers.
+Moreover, such problems may be fixed with smaller learning rates or data normalization.
+5.  Fetch Multi Layers’ Prediction Result With Infer Interface
+-----------------------------------------------
+* Join the layer to be used as :code:`output_layer` layer to the input parameters of  :code:`paddle.inference.Inference()` interface with:
+..  code-block:: python
+    inferer = paddle.inference.Inference(output_layer=[layer1, layer2], parameters=parameters)
+* Assign certain fields to output. Take :code:`value` as an example; it can be done with the following code:
+..  code-block:: python
+    out = inferer.infer(input=data_batch, field=["value"])
+It is important to note that:
+* If 2 layers are assigned as output layers, then the output results consist of 2 matrices.
+* Assume the output of the first layer A is a matrix of size N1 * M1, and the output of the second layer B is a matrix of size N2 * M2;
+* By default, paddle.v2 will transversely join A and B; when N1 is not equal to N2, it will raise the following error:
+..      code-block:: python
+    ValueError: all the input array dimensions except for the concatenation axis must match exactly
+The transverse join of different matrices of multiple layers mainly happens when:
+* Output sequence layer and non sequence layer;
+* Multiple output layers process multiple sequence with different length;
+Such issue can be avoided by calling infer interface and set :code:`flatten_result=False`. Thus, the infer interface returns a python list, in which
+* The number of elements equals to the number of output layers in the network;
+* Each element in list is a result matrix of a layer, which type is numpy.ndarray;
+* The height of each matrix outputted by each layer equals to the number of samples under non sequential mode or equals to the number of elements in the input sequence under sequential mode. Their width are both equal to the layer size in configuration.
+6.  Fetch the Output of A Certain Layer During Training
+-----------------------------------------------
+In event_handler, the interface :code:`event.gm.getLayerOutputs("layer_name")` gives the forward output value organized in :code:`numpy.ndarray` corresponding to :code:`layer_name` in the mini-batch.
+The output can be used in custom measurements in following way:
+..      code-block:: python
+        def score_diff(right_score, left_score):
+            return np.average(np.abs(right_score - left_score))
+        def event_handler(event):
+            if isinstance(event, paddle.event.EndIteration):
+                if event.batch_id % 25 == 0:
+                    diff = score_diff(
+                        event.gm.getLayerOutputs("right_score")["right_score"][
+                            "value"],
+                        event.gm.getLayerOutputs("left_score")["left_score"][
+                            "value"])
+                    logger.info(("Pass %d Batch %d : Cost %.6f, "
+                                "average absolute diff scores: %.6f") %
+                                (event.pass_id, event.batch_id, event.cost, diff))
+Note: this function can not get content of :code:`paddle.layer.recurrent_group` step, but output of  :code:`paddle.layer.recurrent_group` can be fetched.
+7.  Fetch Parameters’ Weight and Gradient During Training
+-----------------------------------------------
+Under certain situations, knowing the weights of the mini-batch currently being trained can provide more insight into many problems. Their values can be acquired by printing them in :code:`event_handler` (note that to obtain such parameters when training on GPU, you should handle the :code:`paddle.event.EndForwardBackward` event). The detailed code is as follows:
+..      code-block:: python
+        ...
+        parameters = paddle.parameters.create(cost)
+        ...
+        def event_handler(event):
+            if isinstance(event, paddle.event.EndForwardBackward):
+                if event.batch_id % 25 == 0:
+                    for p in parameters.keys():
+                        logger.info("Param %s, Grad %s",
+                            parameters.get(p), parameters.get_grad(p))
+Note that both “acquire the output of a certain layer during training” and “acquire the weights and gradients of parameters during training” need to copy training data from the C++ environment to numpy, which has a certain degree of influence on training performance. Don’t use these two functions when the training procedure cares about performance.
--- a/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
+++ b/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
-k8s_aws_en.md
\ No newline at end of file
--- a/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
+++ b/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
--- a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
+++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
@@ -134,7 +134,7 @@
 **输入不等长** 是指recurrent_group的多个输入序列，在每个时间步的子序列长度可以不相等。但序列输出时，需要指定与某一个输入的序列信息是一致的。使用\ :red:`targetInlink`\ 可以指定哪一个输入和输出序列信息一致，默认指定第一个输入。 
-示例3的配置分别为\ `单层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.conf>`_\ 和\ `双层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.conf>`_\ 。
+示例3的配置分别为\ `单层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py>`_\ 和\ `双层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py>`_\ 。
 示例3对于单层RNN和双层RNN数据完全相同。

--- a/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst
+++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst
+..  _algo_hrnn_rnn_api_compare:
+################################################
 API comparision between RNN and hierarchical RNN
-================================================
+################################################
+This article takes PaddlePaddle's hierarchical RNN unit test as an example. We will use several examples to illustrate the usage of single-layer and hierarchical RNNs. Each example has two model configurations, one for single-layer, and the other for hierarchical RNN. Although the implementations are different, the effects of the two model configurations are the same. All of the examples in this article only describe the API interface of the hierarchical RNN, while we do not use this hierarchical RNN to solve practical problems. If you want to understand the use of hierarchical RNN in specific issues, please refer to \ :ref:`algo_hrnn_demo`\ . The unit test file used in this article's example is \ `test_RecurrentGradientMachine.cpp <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/test_RecurrentGradientMachine.cpp>`_\ .
+Example 1：Hierarchical RNN without Memory between subsequences
+================================
+The classical case in the hierarchical RNN is to perform sequence operations on each time series data in the inner layers separately. The sequence operations in the inner layers are independent, that is, they do not need to use Memory. 
+In this example, the network configurations of single-layer RNNs and hierarchical RNNs both use LSTM as an encoder to compress a word-segmented sentence into a vector. The difference is that the hierarchical RNN uses a hierarchical sequence model, treating multiple sentences as a whole and compressing them with the encoder simultaneously. They are completely consistent in their semantic meanings. This pair of semantically identical example configurations is as follows:
+* RNN\: `sequence_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_layer_group.conf>`_
+* Hierarchical RNN\: `sequence_nest_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_layer_group.conf>`_
+Reading hierarchical sequence data
+----------------
+Firstly, the original data in this example is as follows \:
+- The original data in this example has 10 samples. Each sample includes two components: a label (all 2 here), and a word-segmented sentence. This data is used by the single-layer RNN as well. 
+..  literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg
+    :language: text
+- The data for hierarchical RNN has 4 samples. Samples are separated by blank lines, while the content of the data is the same as the original data. But as for hierarchical LSTM, the first sample will encode two sentences into two vectors simultaneously. The numbers of sentences processed simultaneously by these 4 samples are \ :code:`[2, 3, 2, 3]`\ .
+..  literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg.nest
+    :language: text
+Secondly, as for these two types of different input data formats, the contrast of different DataProviders are as follows (`sequenceGen.py <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequenceGen.py>`_)\：
+..  literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py
+    :language: python
+    :lines: 21-39
+    :linenos:
+- This is the DataProvider code for an ordinary single-layer time series. Its description is as follows: 
+  * DataProvider returns two parts, that are "words" and "label"，as line 19 in the above code. 
+    - "words" is a list of word table indices corresponding to each word in the sentence in the original data. Its data type is integer_value_sequence, that is, an integer list. So, "words" is a single-layer time series in the data. 
+    - "label" is the categorical label of each sentence, whose data type is integer_value. 
+..  literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py
+    :language: python
+    :lines: 42-71
+    :linenos:
+- As for the same data, the DataProvider code for hierarchical time series. Its description is as follows: 
+  - DataProvider returns two lists of data, that are "sentences" and "labels", corresponding to the sentences and labels in each group in the original data of hierarchical time series. 
+  - "sentences" comes from the hierarchical time series original data. As it contains every sentence in each group internally, and each sentence is represented by a list of word table indices, its data type is integer_value_sub_sequence, which is a hierarchical time series. 
+  - "labels" is the categorical label of each sentence, so it is a single-layer time series. 
+Model configuration
+------------------------------------------
+Firstly, let's look at the configuration of single-layer RNN. The highlighted part of line 9 to line 15 is the usage of single-layer RNN. Here we use the pre-defined RNN process function in PaddlePaddle. In this function, for each time step, RNN passes through an LSTM network. 
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_layer_group.conf
+    :language: python
+    :lines: 38-63
+    :linenos:
+    :emphasize-lines:  9-15
+Secondly, let's look at the model configuration of hierarchical RNN which has the same semantic meaning. \:
+* Most layers in PaddlePaddle do not care about whether the input is time series or not, e.g. \ :code:`embedding_layer`\ . In these layers, every operation is processed on each time step. 
+* In the highlighted part of line 7 to line 26 of this configuration, we transform the hierarchical time series data into single-layer time series data, then process each single-layer time series. 
+  * Use the function \ :code:`recurrent_group`\ to transform. Input sequences need to be passed in when transforming. As we want to transform hierarchical time series into single-layer sequences, we need to label the input data as \ :code:`SubsequenceInput`\ .
+  * In this example, we disassemble every group of the original data into sentences using \ :code:`recurrent_group`\ . Each of the disassembled sentences passes through an LSTM network. This is equivalent to single-layer RNN configuration. 
+* Similar to single-layer RNN configuration, we only use the last vector after the encode of LSTM. So we use the operation of \ :code:`last_seq`\ to \ :code:`recurrent_group`\ . But unlike single-layer RNN, we use the last element of every subsequence, so we need to set \ :code:`agg_level=AggregateLevel.TO_SEQUENCE`\ . 
+* Till now, \ :code:`lstm_last`\ has the same result as \ :code:`lstm_last`\ in single-layer RNN configuration. 
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_layer_group.conf
+    :language: python
+    :lines: 38-64
+    :linenos:
+    :emphasize-lines: 7-26
+Example 2：Hierarchical RNN with Memory between subsequences
+================================
+This example is intended to implement two fully-equivalent fully-connected RNNs using single-layer RNN and hierarchical RNN. 
+* As for single-layer RNN, input is a full time series, e.g. \ :code:`[4, 5, 2, 0, 9, 8, 1, 4]`\ .
+* As for hierarchical RNN, input is a hierarchical time series whose elements are arbitrary combinations of the data in single-layer RNN, e.g. \ :code:`[ [4, 5, 2], [0, 9], [8, 1, 4]]`. 
+model configuration
+------------------
+We select the different parts between single-layer RNN and hierarchical RNN configurations, to compare and analyze the reason why they have same semantic meanings. 
+- single-layer RNN：passes through a simple recurrent_group. For each time step, the current input y and the last time step's output rnn_state pass through a fully-connected layer. 
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn.conf
+    :language: python
+    :lines: 36-48
+- hierarchical RNN, the outer layer's memory is an element. 
+  - The recurrent_group of inner layer's inner_step is nearly the same as single-layer sequence, except for the case of boot_layer=outer_mem, which means using the outer layer's outer_mem as the initial state for the inner layer's memory. In the outer layer's out_step, outer_mem is the last vector of a subsequence, that is, the whole hierarchical group uses the last vector of the previous subsequence as the initial state for the next subsequence's memory. 
+  - From the aspect of the input data, sentences from single-layer and hierarchical RNN are the same. The only difference is that the hierarchical RNN disassembles the sequence into subsequences. So in the hierarchical RNN configuration, we must use the last element of the previous subsequence as a boot_layer for the memory of the next subsequence, so that it makes no difference with "every time step uses the output of last time step" in the single-layer RNN configuration. 
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn.conf
+    :language: python
+    :lines: 39-66
+..  warning::
+    Currently PaddlePaddle only supports the case that the lengths of the time series of Memory in each time step are the same. 
+Example 3：hierarchical RNN with unequal length inputs
+==========================
+.. role:: red
+.. raw:: html
+    <style> .red {color:red} </style>
+**unequal length inputs** means in the multiple input sequences of recurrent_group, the lengths of subsequences can be unequal. But the output of the sequence, needs to be consistent with one of the input sequences. Using \ :red:`targetInlink`\ can help you specify which of the input sequences and the output sequence can be consistent, by default is the first input. 
+The configurations of Example 3 are \ `sequence_rnn_multi_unequalength_inputs <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py>`_ \ and \ `sequence_nest_rnn_multi_unequalength_inputs <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py>`_\ . 
+The data for the configurations of Example 3's single-layer RNN and hierarchical RNN are exactly the same. 
+* For the single-layer RNN, the data has two samples, which are \ :code:`[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]`\ and \ :code:`[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]`\ . Each of the data for the single-layer RNN has two group of features. 
+* On the basis of the single-layer's data, hierarchical RNN's data randomly adds some partitions. For example, the first sample is transformed to \ :code:`[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]`\ . 
+* You need to pay attention that, PaddlePaddle only supports multiple input hierarchical RNNs that have same amount of subsequences currently. In this example, the two features both have 3 subsequences. Although the length of each subsequence can be different, the amount of subsequences should be the same. 
+model configuration
+--------
+Similar to Example 2's configuration, Example 3's configuration uses single-layer and hierarchical RNN to implement 2 fully-equivalent fully-connected RNNs. 
+* single-layer RNN\:
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
+    :language: python
+    :lines: 42-59
+    :linenos:
+* hierarchical RNN\ \:
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
+    :language: python
+    :lines: 41-80
+    :linenos:
+In the above code, the usage of single-layer and hierarchical RNNs is similar to Example 2; the difference is that it processes 2 inputs simultaneously. As for the hierarchical RNN, the lengths of the 2 inputs' subsequences are not equal. But we use the parameter \ :code:`targetInlink` \ to set the outer layer's \ :code:`recurrent_group` \ 's output format, so the shape of outer layer's output is the same as the shape of \ :code:`emb2`\ . 
+Glossary
+======
+..  _glossary_memory:
+Memory
+------
+Memory is a concept when PaddlePaddle is implementing RNN. RNN, recurrent neural network, usually requires some dependency between time steps, that is, the neural network in current time step depends on one of the neurons in the neural network in previous time steps, as the following figure shows: 
+..  graphviz:: src/glossary_rnn.dot
+The dotted connections in the figure are the network connections across time steps. When PaddlePaddle is implementing RNN, this connection across time steps is implemented using a special neural network unit, called Memory. Memory can cache the output of one of the neurons in the previous time step, which can then be passed to another neuron in the next time step. The implementation of an RNN using Memory is as follows: 
+..  graphviz:: src/glossary_rnn_with_memory.dot
+With this method, PaddlePaddle can easily determine which outputs should cross time steps, and which should not. 
+..  _glossary_timestep:
+time step
+------
+refers to time series
+..  _glossary_sequence:
+time series
+--------
+Time series is a series of featured data. The order among these featured data is meaningful. So it is a list of features, not a set of features. Each element of this list, i.e. the featured data at each position in the series, is called a time step. It must be noted that the concepts of time series and time steps are not necessarily related to "time". As long as the "order" in a series of featured data is meaningful, it can be the input of time series. 
+For example, in text classification task, we regard a sentence as a time series. So, each word in the sentence can become the index of the word in the word table. So this sentence can be represented as a list of these indices, e.g. :code:`[9, 2, 3, 5, 3]` . 
+For a more detailed and accurate definition of the time series, please refer to `Wikipedia of Time series <https://en.wikipedia.org/wiki/Time_series>`_  or `Chinese Wikipedia of time series <https://zh.wikipedia.org/wiki/%E6%99%82%E9%96%93%E5%BA%8F%E5%88%97>`_  . 
+In addition, Paddle always refers to time series as :code:`Sequence` . They are the same concept in Paddle's documentation and APIs. 
+..  _glossary_RNN:
+RNN
+---
+In PaddlePaddle's documentations, RNN is usually represented as :code:`Recurrent neural network` . For more information, please refer to `Wikipedia Recurrent neural network <https://en.wikipedia.org/wiki/Recurrent_neural_network>`_ or `Chinese Wikipedia <https://zh.wikipedia.org/wiki/%E9%80%92%E5%BD%92%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C>`_ . 
+In PaddlePaddle, RNN usually means, for the input data of a time series, the neural network between each time steps has a certain relevance. For example, the input of a certain neuron is the output of a certain neuron in the neural network of the last time step. Or, as for each time step, the network structure of the neural network has a directed ring structure. 
+..  _glossary_hierarchical_RNN:
+hierarchical RNN
+-------
+Hierarchical RNN, as the name suggests, means there is a nested relationship in RNNs. The input data is a time series, but each of the inner featured data is also a time series, namely a 2-dimensional array, or an array of arrays. Hierarchical RNN is a neural network that can process this type of input data. 
+For example, consider the task of text classification of a paragraph, i.e. classifying a paragraph of sentences. We can treat a paragraph as an array of sentences, and each sentence is an array of words. This is a type of the input data for the hierarchical RNN. We encode each sentence of this paragraph into a vector using LSTM, then encode each of the encoded vectors into a vector of this paragraph using LSTM. Finally we use this paragraph vector to perform classification, which is the neural network structure of this hierarchical RNN. 
-TBD
--- a/doc/v2/howto/rnn/index_en.rst
+++ b/doc/v2/howto/rnn/index_en.rst
 RNN Models
 ==========
+Recurrent neural networks(RNN) are an important tool to model sequential data. PaddlePaddle provides flexible interface for building complex recurrent neural network. We will demonstrate how to use PaddlePaddle to build RNN models in the following 4 parts.
+In the first part, we will guide you through configuring recurrent neural networks in PaddlePaddle, from simple to complex. First, we will use a vanilla recurrent neural network as an example to show how to configure recurrent neural network architecture. Then we will use the sequence to sequence model as an example to demonstrate how you can configure complex recurrent neural network models gradually.
 ..  toctree::
  :maxdepth: 1
  rnn_config_en.rst
+Recurrent Group is the key unit to build complex recurrent neural network models. The second part describes related concepts and basic principles of Recurrent Group, and gives a detailed description of the Recurrent Group API interface. In addition, it also introduces Sequence-level RNN (hierarchical sequence as input) and the usage of Recurrent Group in it.
+..  toctree::
+  :maxdepth: 1
  recurrent_group_en.md
+In the third part, two-level sequence is demonstrated briefly and then layers supporting two-level sequence as input are listed and described respectively.
+..  toctree::
+  :maxdepth: 1
  hierarchical_layer_en.rst
+In the last part, the unit test of hierarchical RNN is presented as an example to explain how to use hierarchical RNN. We will use a two-level sequence RNN and a single-layer sequence RNN that has the same effect as the former as the network configurations, respectively, in the unit test.
+..  toctree::
+  :maxdepth: 1
  hrnn_rnn_api_compare_en.rst
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -146,6 +146,7 @@ void BlockDesc::RemoveOp(size_t s, size_t e) {
  if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
    return;
  }
+  need_update_ = true;
  ops_.erase(ops_.begin() + s, ops_.begin() + e);
 }

--- a/paddle/fluid/framework/blocking_queue.h
+++ b/paddle/fluid/framework/blocking_queue.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <condition_variable>  // NOLINT
+#include <deque>
+#include <mutex>  // NOLINT
+#include <utility>
+namespace paddle {
+namespace framework {
+// An unbounded FIFO queue protected by a mutex. Producers append with
+// Push()/Extend(); consumers block in Pop()/PopAll() on a condition variable
+// until the queue is non-empty.
+template <typename T>
+class BlockingQueue {
+ public:
+  // Appends a copy of `item` to the back of the queue and wakes one waiter.
+  void Push(const T &item) {
+    {
+      std::lock_guard<std::mutex> g(mutex_);
+      q_.emplace_back(item);
+    }
+    // Notify after the lock is released so the woken thread can acquire the
+    // mutex without contending with this one.
+    cv_.notify_one();
+  }
+  // Appends a copy of every element of the iterable `items`, in order, under
+  // a single lock acquisition, then wakes all waiters.
+  template <typename U>
+  void Extend(const U &items) {
+    {
+      std::lock_guard<std::mutex> g(mutex_);
+      for (auto &item : items) {
+        q_.emplace_back(item);
+      }
+    }
+    cv_.notify_all();
+  }
+  // Waits up to `ms` milliseconds for the queue to become non-empty.
+  // On success sets *timeout to false and returns every queued element in
+  // FIFO order, leaving the queue empty. If the wait expires first, sets
+  // *timeout to true and returns an empty deque.
+  std::deque<T> PopAll(size_t ms, bool *timeout) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    // wait_for() measures the interval with the steady clock, so the timeout
+    // is not stretched or shortened when the wall clock is adjusted (which a
+    // system_clock-based wait_until() deadline would be).
+    *timeout = !cv_.wait_for(lock, std::chrono::milliseconds(ms),
+                             [this] { return !q_.empty(); });
+    std::deque<T> ret;
+    if (!*timeout) {
+      std::swap(ret, q_);
+    }
+    return ret;
+  }
+  // Blocks until the queue is non-empty, then removes and returns the front
+  // element.
+  T Pop() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    // Capture `this` explicitly: implicit capture of `this` through [=] is
+    // deprecated in C++20.
+    cv_.wait(lock, [this] { return !q_.empty(); });
+    T rc(std::move(q_.front()));
+    q_.pop_front();
+    return rc;
+  }
+ private:
+  std::mutex mutex_;            // guards q_
+  std::condition_variable cv_;  // signalled whenever elements are added
+  std::deque<T> q_;             // queued elements, front is the oldest
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/concurrency_test.cc
+++ b/paddle/fluid/framework/concurrency_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <thread>
+#include <thread>  // NOLINT
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/block_desc.h"
@@ -40,10 +40,10 @@ namespace paddle {
 namespace framework {
 template <typename T>
-LoDTensor *CreateVariable(Scope &scope, p::CPUPlace &place, std::string name,
+LoDTensor *CreateVariable(Scope *scope, const p::CPUPlace &place,
-                          T value) {
+                          std::string name, T value) {
  // Create LoDTensor<int> of dim [1]
-  auto var = scope.Var(name);
+  auto var = scope->Var(name);
  auto tensor = var->GetMutable<LoDTensor>();
  tensor->Resize({1});
  T *expect = tensor->mutable_data<T>(place);
@@ -77,9 +77,9 @@ void AddCase(ProgramDesc *program, Scope *scope, p::CPUPlace *place,
  BlockDesc *caseBlock = program->AppendBlock(*casesBlock);
  func(caseBlock, scope);
-  CreateVariable(*scope, *place, caseCondName, false);
+  CreateVariable(scope, *place, caseCondName, false);
-  CreateVariable(*scope, *place, caseCondXVarName, caseId);
+  CreateVariable(scope, *place, caseCondXVarName, caseId);
-  CreateVariable(*scope, *place, caseVarName, caseId);
+  CreateVariable(scope, *place, caseVarName, caseId);
  scope->Var("step_scope");
@@ -96,21 +96,21 @@ void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program,
                        std::string quitChanName) {
  BlockDesc *whileBlock = program->AppendBlock(*parentBlock);
-  CreateVariable(*scope, *place, "whileExitCond", true);
+  CreateVariable(scope, *place, "whileExitCond", true);
-  CreateVariable(*scope, *place, "caseToExecute", -1);
+  CreateVariable(scope, *place, "caseToExecute", -1);
-  CreateVariable(*scope, *place, "case1var", 0);
+  CreateVariable(scope, *place, "case1var", 0);
-  CreateVariable(*scope, *place, "xtemp", 0);
+  CreateVariable(scope, *place, "xtemp", 0);
  // TODO(thuan): Need to create fibXToSend, since channel send moves the actual
  // data,
  // which causes the data to be no longer accessible to do the fib calculation
  // TODO(abhinav): Change channel send to do a copy instead of a move!
-  CreateVariable(*scope, *place, "fibXToSend", 0);
+  CreateVariable(scope, *place, "fibXToSend", 0);
-  CreateVariable(*scope, *place, "fibX", 0);
+  CreateVariable(scope, *place, "fibX", 0);
-  CreateVariable(*scope, *place, "fibY", 1);
+  CreateVariable(scope, *place, "fibY", 1);
-  CreateVariable(*scope, *place, "quitVar", 0);
+  CreateVariable(scope, *place, "quitVar", 0);
  BlockDesc *casesBlock = program->AppendBlock(*whileBlock);
  std::function<void(BlockDesc * caseBlock)> f = [](BlockDesc *caseBlock) {};
@@ -138,7 +138,7 @@ void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program,
    // Exit the while loop after we receive from quit channel.
    // We assign a false to "whileExitCond" variable, which will
    // break out of while_op loop
-    CreateVariable(*scope, *place, "whileFalse", false);
+    CreateVariable(scope, *place, "whileFalse", false);
    AddOp("assign", {{"X", {"whileFalse"}}}, {{"Out", {"whileExitCond"}}}, {},
          caseBlock);
  };
@@ -174,9 +174,9 @@ TEST(Concurrency, Go_Op) {
  // Create Variables, x0 will be put into channel,
  // result will be pulled from channel
-  CreateVariable(scope, place, "Status", false);
+  CreateVariable(&scope, place, "Status", false);
-  CreateVariable(scope, place, "x0", 99);
+  CreateVariable(&scope, place, "x0", 99);
-  CreateVariable(scope, place, "result", 0);
+  CreateVariable(&scope, place, "result", 0);
  framework::Executor executor(place);
  ProgramDesc program;
@@ -226,9 +226,9 @@ TEST(Concurrency, Select) {
  // Initialize scope variables
  p::CPUDeviceContext ctx(place);
-  CreateVariable(scope, place, "Status", false);
+  CreateVariable(&scope, place, "Status", false);
-  CreateVariable(scope, place, "result", 0);
+  CreateVariable(&scope, place, "result", 0);
-  CreateVariable(scope, place, "currentXFib", 0);
+  CreateVariable(&scope, place, "currentXFib", 0);
  framework::Executor executor(place);
  ProgramDesc program;
@@ -246,7 +246,7 @@ TEST(Concurrency, Select) {
        {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block);
  // Create Go Op routine, which loops 10 times over fibonacci sequence
-  CreateVariable(scope, place, "xReceiveVar", 0);
+  CreateVariable(&scope, place, "xReceiveVar", 0);
  BlockDesc *goOpBlock = program.AppendBlock(program.Block(0));
  for (int i = 0; i < 10; ++i) {
@@ -264,7 +264,7 @@ TEST(Concurrency, Select) {
          goOpBlock);
  }
-  CreateVariable(scope, place, "quitSignal", 0);
+  CreateVariable(&scope, place, "quitSignal", 0);
  AddOp("channel_send", {{"Channel", {quitChanName}}, {"X", {"quitSignal"}}},
        {{"Status", {"Status"}}}, {}, goOpBlock);

--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@@ -103,9 +103,7 @@ static void BuildVar(const std::string& param_name,
 }
 TEST(Operator, CPUtoGPU) {
-  using namespace paddle::framework;
+  paddle::framework::InitDevices(true);
-  using namespace paddle::platform;
-  InitDevices(true);
  paddle::framework::Scope scope;
  paddle::platform::CPUPlace cpu_place;
@@ -118,8 +116,9 @@ TEST(Operator, CPUtoGPU) {
  auto cpu_op = paddle::framework::OpRegistry::CreateOp(cpu_op_desc);
  // prepare input
-  auto* in_t = scope.Var("IN1")->GetMutable<LoDTensor>();
+  auto* in_t = scope.Var("IN1")->GetMutable<paddle::framework::LoDTensor>();
-  auto* src_ptr = in_t->mutable_data<float>({2, 3}, CPUPlace());
+  auto* src_ptr =
+      in_t->mutable_data<float>({2, 3}, paddle::platform::CPUPlace());
  for (int i = 0; i < 2 * 3; ++i) {
    src_ptr[i] = static_cast<float>(i);
  }
@@ -128,7 +127,7 @@ TEST(Operator, CPUtoGPU) {
  auto* output = scope.Var("OUT1");
  cpu_op->Run(scope, cpu_place);
-  auto* output_ptr = output->Get<LoDTensor>().data<float>();
+  auto* output_ptr = output->Get<paddle::framework::LoDTensor>().data<float>();
  for (int i = 0; i < 2 * 3; ++i) {
    ASSERT_EQ(output_ptr[i], static_cast<float>(i) * 2);
  }
@@ -153,12 +152,14 @@ TEST(Operator, CPUtoGPU) {
  VLOG(3) << "after gpu_op run";
  // auto* output2_ptr = output2->Get<LoDTensor>().data<float>();
-  DeviceContextPool& pool = DeviceContextPool::Instance();
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
  auto dev_ctx = pool.Get(cuda_place);
  paddle::framework::Tensor output_tensor;
-  TensorCopy(output2->Get<LoDTensor>(), paddle::platform::CPUPlace(), *dev_ctx,
+  paddle::framework::TensorCopy(output2->Get<paddle::framework::LoDTensor>(),
-             &output_tensor);
+                                paddle::platform::CPUPlace(), *dev_ctx,
+                                &output_tensor);
  dev_ctx->Wait();
  float* output2_ptr = output_tensor.data<float>();

--- a/paddle/fluid/framework/data_layout.h
+++ b/paddle/fluid/framework/data_layout.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <cctype>
 #include <ostream>
+#include <string>
 #include "paddle/fluid/platform/enforce.h"

--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/framework/data_layout_transform.h"
+#include <vector>
 #include "paddle/fluid/operators/math/math_function.h"

--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -14,6 +14,7 @@
 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/variable.h"

--- a/paddle/fluid/framework/data_layout_transform_test.cc
+++ b/paddle/fluid/framework/data_layout_transform_test.cc
@@ -18,27 +18,28 @@
 #include "paddle/fluid/platform/device_context.h"
 TEST(DataTransform, DataLayoutFunction) {
-  using namespace paddle::framework;
+  auto place = paddle::platform::CPUPlace();
-  using namespace paddle::platform;
+  paddle::framework::Tensor in = paddle::framework::Tensor();
+  paddle::framework::Tensor out = paddle::framework::Tensor();
-  auto place = CPUPlace();
+  in.mutable_data<double>(paddle::framework::make_ddim({2, 3, 1, 2}), place);
-  Tensor in = Tensor();
+  in.set_layout(paddle::framework::DataLayout::kNHWC);
-  Tensor out = Tensor();
-  in.mutable_data<double>(make_ddim({2, 3, 1, 2}), place);
+  auto kernel_nhwc = paddle::framework::OpKernelType(
-  in.set_layout(DataLayout::kNHWC);
+      paddle::framework::proto::VarType::FP32, place,
+      paddle::framework::DataLayout::kNHWC,
-  auto kernel_nhwc = OpKernelType(proto::VarType::FP32, place,
+      paddle::framework::LibraryType::kPlain);
-                                  DataLayout::kNHWC, LibraryType::kPlain);
+  auto kernel_ncwh = paddle::framework::OpKernelType(
-  auto kernel_ncwh = OpKernelType(proto::VarType::FP32, place,
+      paddle::framework::proto::VarType::FP32, place,
-                                  DataLayout::kNCHW, LibraryType::kPlain);
+      paddle::framework::DataLayout::kNCHW,
+      paddle::framework::LibraryType::kPlain);
-  TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out);
+  paddle::framework::TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out);
-  EXPECT_TRUE(out.layout() == DataLayout::kNCHW);
-  EXPECT_TRUE(out.dims() == make_ddim({2, 2, 3, 1}));
+  EXPECT_TRUE(out.layout() == paddle::framework::DataLayout::kNCHW);
+  EXPECT_TRUE(out.dims() == paddle::framework::make_ddim({2, 2, 3, 1}));
  TransDataLayout(kernel_ncwh, kernel_nhwc, in, &out);
-  EXPECT_TRUE(in.layout() == DataLayout::kNHWC);
+  EXPECT_TRUE(in.layout() == paddle::framework::DataLayout::kNHWC);
-  EXPECT_TRUE(in.dims() == make_ddim({2, 3, 1, 2}));
+  EXPECT_TRUE(in.dims() == paddle::framework::make_ddim({2, 3, 1, 2}));
 }
--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@@ -63,16 +63,16 @@ void DataTransform(const OpKernelType& expected_kernel_type,
 }
 void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,
-                            Variable& out_var) {
+                            Variable* out_var) {
  if (in_var.IsType<LoDTensor>()) {
    auto& in_lod_tensor = in_var.Get<LoDTensor>();
-    auto* tran_lod_tensor = out_var.GetMutable<LoDTensor>();
+    auto* tran_lod_tensor = out_var->GetMutable<LoDTensor>();
    tran_lod_tensor->set_lod(in_lod_tensor.lod());
    tran_lod_tensor->set_layout(in_lod_tensor.layout());
    tran_lod_tensor->ShareDataWith(tensor);
  } else if (in_var.IsType<SelectedRows>()) {
    auto& in_selected_rows = in_var.Get<SelectedRows>();
-    auto* trans_selected_rows = out_var.GetMutable<SelectedRows>();
+    auto* trans_selected_rows = out_var->GetMutable<SelectedRows>();
    trans_selected_rows->set_height(in_selected_rows.height());
    trans_selected_rows->set_rows(in_selected_rows.rows());
    trans_selected_rows->mutable_value()->ShareDataWith(tensor);

--- a/paddle/fluid/framework/data_transform.h
+++ b/paddle/fluid/framework/data_transform.h
@@ -35,7 +35,7 @@ void DataTransform(const OpKernelType& expected_kernel_type,
                   const Tensor& input_tensor, Tensor* out);
 void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,
-                            Variable& out_var);
+                            Variable* out_var);
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <string>
 #include <typeindex>
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -22,18 +23,21 @@ namespace paddle {
 namespace framework {
 inline proto::VarType::Type ToDataType(std::type_index type) {
-  using namespace paddle::framework::proto;
  if (typeid(platform::float16).hash_code() == type.hash_code()) {
    return proto::VarType::FP16;
-  } else if (typeid(float).hash_code() == type.hash_code()) {
+  } else if (typeid(const float).hash_code() == type.hash_code()) {
+    // CPPLint complains Using C-style cast.  Use static_cast<float>() instead
+    // One fix to this is to replace float with const float because
+    // typeid(T) == typeid(const T)
+    // http://en.cppreference.com/w/cpp/language/typeid
    return proto::VarType::FP32;
-  } else if (typeid(double).hash_code() == type.hash_code()) {
+  } else if (typeid(const double).hash_code() == type.hash_code()) {
    return proto::VarType::FP64;
-  } else if (typeid(int).hash_code() == type.hash_code()) {
+  } else if (typeid(const int).hash_code() == type.hash_code()) {
    return proto::VarType::INT32;
-  } else if (typeid(int64_t).hash_code() == type.hash_code()) {
+  } else if (typeid(const int64_t).hash_code() == type.hash_code()) {
    return proto::VarType::INT64;
-  } else if (typeid(bool).hash_code() == type.hash_code()) {
+  } else if (typeid(const bool).hash_code() == type.hash_code()) {
    return proto::VarType::BOOL;
  } else {
    PADDLE_THROW("Not supported");
@@ -41,7 +45,6 @@ inline proto::VarType::Type ToDataType(std::type_index type) {
 }
 inline std::type_index ToTypeIndex(proto::VarType::Type type) {
-  using namespace paddle::framework::proto;
  switch (type) {
    case proto::VarType::FP16:
      return typeid(platform::float16);
@@ -62,7 +65,6 @@ inline std::type_index ToTypeIndex(proto::VarType::Type type) {
 template <typename Visitor>
 inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
-  using namespace paddle::framework::proto;
  switch (type) {
    case proto::VarType::FP16:
      visitor.template operator()<platform::float16>();
@@ -88,7 +90,6 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
 }
 inline std::string DataTypeToString(const proto::VarType::Type type) {
-  using namespace paddle::framework::proto;
  switch (type) {
    case proto::VarType::FP16:
      return "float16";

--- a/paddle/fluid/framework/data_type_transform.h
+++ b/paddle/fluid/framework/data_type_transform.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
+#include <utility>
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/variable.h"

--- a/paddle/fluid/framework/data_type_transform_test.cc
+++ b/paddle/fluid/framework/data_type_transform_test.cc
@@ -17,43 +17,58 @@ limitations under the License. */
 #include "gtest/gtest.h"
 TEST(DataTypeTransform, CPUTransform) {
-  using namespace paddle::framework;
+  auto place = paddle::platform::CPUPlace();
-  using namespace paddle::platform;
+  auto kernel_fp16 = paddle::framework::OpKernelType(
-  auto place = CPUPlace();
+      paddle::framework::proto::VarType::FP16, place,
+      paddle::framework::DataLayout::kAnyLayout,
-  auto kernel_fp16 = OpKernelType(proto::VarType::FP16, place,
+      paddle::framework::LibraryType::kPlain);
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place,
+  auto kernel_fp32 = paddle::framework::OpKernelType(
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+      paddle::framework::proto::VarType::FP32, place,
-  auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place,
+      paddle::framework::DataLayout::kAnyLayout,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+      paddle::framework::LibraryType::kPlain);
-  auto kernel_int32 = OpKernelType(proto::VarType::INT32, place,
-                                   DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_fp64 = paddle::framework::OpKernelType(
-  auto kernel_int64 = OpKernelType(proto::VarType::INT64, place,
+      paddle::framework::proto::VarType::FP64, place,
-                                   DataLayout::kAnyLayout, LibraryType::kPlain);
+      paddle::framework::DataLayout::kAnyLayout,
-  auto kernel_bool = OpKernelType(proto::VarType::BOOL, place,
+      paddle::framework::LibraryType::kPlain);
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_int32 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::INT32, place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+  auto kernel_int64 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::INT64, place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+  auto kernel_bool = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::BOOL, place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
  // data type transform from float32
  {
-    Tensor in;
+    paddle::framework::Tensor in;
-    Tensor out;
+    paddle::framework::Tensor out;
-    float* ptr = in.mutable_data<float>(make_ddim({2, 3}), place);
+    float* ptr =
+        in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), place);
    int data_number = 2 * 3;
    for (int i = 0; i < data_number; ++i) {
      ptr[i] = i / 3;
    }
-    TransDataType(kernel_fp32, kernel_fp64, in, &out);
+    paddle::framework::TransDataType(kernel_fp32, kernel_fp64, in, &out);
    double* out_data_double = out.data<double>();
    for (int i = 0; i < data_number; ++i) {
      EXPECT_EQ(out_data_double[i], static_cast<double>(i / 3));
    }
-    TransDataType(kernel_fp32, kernel_int32, in, &out);
+    paddle::framework::TransDataType(kernel_fp32, kernel_int32, in, &out);
    int* out_data_int = out.data<int>();
    for (int i = 0; i < data_number; ++i) {
      EXPECT_EQ(out_data_int[i], static_cast<int>(i / 3));
@@ -62,10 +77,11 @@ TEST(DataTypeTransform, CPUTransform) {
  // data type transform from/to float16
  {
-    Tensor in;
+    paddle::framework::Tensor in;
-    Tensor out;
+    paddle::framework::Tensor out;
-    float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), place);
+    paddle::platform::float16* ptr = in.mutable_data<paddle::platform::float16>(
+        paddle::framework::make_ddim({2, 3}), place);
    int data_number = 2 * 3;
    for (int i = 0; i < data_number; ++i) {
@@ -73,94 +89,104 @@ TEST(DataTypeTransform, CPUTransform) {
    }
    // transform from float16 to other data types
-    TransDataType(kernel_fp16, kernel_fp32, in, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_fp32, in, &out);
    float* out_data_float = out.data<float>();
    for (int i = 0; i < data_number; ++i) {
      EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
    }
-    TransDataType(kernel_fp16, kernel_fp64, in, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_fp64, in, &out);
    double* out_data_double = out.data<double>();
    for (int i = 0; i < data_number; ++i) {
      EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
    }
-    TransDataType(kernel_fp16, kernel_int32, in, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_int32, in, &out);
    int* out_data_int = out.data<int>();
    for (int i = 0; i < data_number; ++i) {
      EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
    }
-    TransDataType(kernel_fp16, kernel_int64, in, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_int64, in, &out);
    int64_t* out_data_int64 = out.data<int64_t>();
    for (int i = 0; i < data_number; ++i) {
      EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
    }
-    TransDataType(kernel_fp16, kernel_bool, in, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_bool, in, &out);
    bool* out_data_bool = out.data<bool>();
    for (int i = 0; i < data_number; ++i) {
      EXPECT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
    }
    // transform float to float16
-    float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), place);
+    float* in_data_float =
+        in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), place);
    for (int i = 0; i < data_number; ++i) {
      in_data_float[i] = i;
    }
-    TransDataType(kernel_fp32, kernel_fp16, in, &out);
+    paddle::framework::TransDataType(kernel_fp32, kernel_fp16, in, &out);
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_float[i]).x);
    }
    // transform double to float16
-    double* in_data_double = in.mutable_data<double>(make_ddim({2, 3}), place);
+    double* in_data_double =
+        in.mutable_data<double>(paddle::framework::make_ddim({2, 3}), place);
    for (int i = 0; i < data_number; ++i) {
      in_data_double[i] = i;
    }
-    TransDataType(kernel_fp64, kernel_fp16, in, &out);
+    paddle::framework::TransDataType(kernel_fp64, kernel_fp16, in, &out);
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_double[i]).x);
    }
    // transform int to float16
-    int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), place);
+    int* in_data_int =
+        in.mutable_data<int>(paddle::framework::make_ddim({2, 3}), place);
    for (int i = 0; i < data_number; ++i) {
      in_data_int[i] = i;
    }
-    TransDataType(kernel_int32, kernel_fp16, in, &out);
+    paddle::framework::TransDataType(kernel_int32, kernel_fp16, in, &out);
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_int[i]).x);
    }
    // transform int64 to float16
-    int64_t* in_data_int64 = in.mutable_data<int64_t>(make_ddim({2, 3}), place);
+    int64_t* in_data_int64 =
+        in.mutable_data<int64_t>(paddle::framework::make_ddim({2, 3}), place);
    for (int i = 0; i < data_number; ++i) {
      in_data_int64[i] = i;
    }
-    TransDataType(kernel_int64, kernel_fp16, in, &out);
+    paddle::framework::TransDataType(kernel_int64, kernel_fp16, in, &out);
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_int64[i]).x);
    }
    // transform bool to float16
-    bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), place);
+    bool* in_data_bool =
+        in.mutable_data<bool>(paddle::framework::make_ddim({2, 3}), place);
    for (int i = 0; i < data_number; ++i) {
      in_data_bool[i] = i;
    }
-    TransDataType(kernel_bool, kernel_fp16, in, &out);
+    paddle::framework::TransDataType(kernel_bool, kernel_fp16, in, &out);
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_bool[i]).x);
    }
  }
 }
--- a/paddle/fluid/framework/data_type_transform_test.cu
+++ b/paddle/fluid/framework/data_type_transform_test.cu
@@ -18,42 +18,58 @@ limitations under the License. */
 #include "gtest/gtest.h"
 TEST(DataTypeTransform, GPUTransform) {
-  using namespace paddle::framework;
+  auto cpu_place = paddle::platform::CPUPlace();
-  using namespace paddle::platform;
+  auto gpu_place = paddle::platform::CUDAPlace(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
-  auto cpu_place = CPUPlace();
-  auto gpu_place = CUDAPlace(0);
+  auto kernel_fp16 = paddle::framework::OpKernelType(
-  CUDADeviceContext context(gpu_place);
+      paddle::framework::proto::VarType::FP16, gpu_place,
+      paddle::framework::DataLayout::kAnyLayout,
-  auto kernel_fp16 = OpKernelType(proto::VarType::FP16, gpu_place,
+      paddle::framework::LibraryType::kPlain);
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_fp32 = OpKernelType(proto::VarType::FP32, gpu_place,
+  auto kernel_fp32 = paddle::framework::OpKernelType(
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+      paddle::framework::proto::VarType::FP32, gpu_place,
-  auto kernel_fp64 = OpKernelType(proto::VarType::FP64, gpu_place,
+      paddle::framework::DataLayout::kAnyLayout,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+      paddle::framework::LibraryType::kPlain);
-  auto kernel_int32 = OpKernelType(proto::VarType::INT32, gpu_place,
-                                   DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_fp64 = paddle::framework::OpKernelType(
-  auto kernel_int64 = OpKernelType(proto::VarType::INT64, gpu_place,
+      paddle::framework::proto::VarType::FP64, gpu_place,
-                                   DataLayout::kAnyLayout, LibraryType::kPlain);
+      paddle::framework::DataLayout::kAnyLayout,
-  auto kernel_bool = OpKernelType(proto::VarType::BOOL, gpu_place,
+      paddle::framework::LibraryType::kPlain);
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_int32 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::INT32, gpu_place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+  auto kernel_int64 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::INT64, gpu_place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+  auto kernel_bool = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::BOOL, gpu_place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
  // data type transform from float32
  {
-    Tensor in;
+    paddle::framework::Tensor in;
-    Tensor in_gpu;
+    paddle::framework::Tensor in_gpu;
-    Tensor out_gpu;
+    paddle::framework::Tensor out_gpu;
-    Tensor out;
+    paddle::framework::Tensor out;
-    float* in_ptr = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
+    float* in_ptr =
+        in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), cpu_place);
    float arr[6] = {0, 1, 2, 3, 4, 5};
    int data_number = sizeof(arr) / sizeof(arr[0]);
    memcpy(in_ptr, arr, sizeof(arr));
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
    context.Wait();
-    TransDataType(kernel_fp32, kernel_fp64, in_gpu, &out_gpu);
+    paddle::framework::TransDataType(kernel_fp32, kernel_fp64, in_gpu,
-    TensorCopy(out_gpu, cpu_place, context, &out);
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    double* out_data_double = out.data<double>();
@@ -61,8 +77,9 @@ TEST(DataTypeTransform, GPUTransform) {
      EXPECT_EQ(out_data_double[i], static_cast<double>(arr[i]));
    }
-    TransDataType(kernel_fp32, kernel_int32, in_gpu, &out_gpu);
+    paddle::framework::TransDataType(kernel_fp32, kernel_int32, in_gpu,
-    TensorCopy(out_gpu, cpu_place, context, &out);
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    int* out_data_int = out.data<int>();
@@ -73,22 +90,27 @@ TEST(DataTypeTransform, GPUTransform) {
  // data type transform from/to float16
  {
-    Tensor in;
+    paddle::framework::Tensor in;
-    Tensor in_gpu;
+    paddle::framework::Tensor in_gpu;
-    Tensor out_gpu;
+    paddle::framework::Tensor out_gpu;
-    Tensor out;
+    paddle::framework::Tensor out;
-    float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), cpu_place);
+    paddle::platform::float16* ptr = in.mutable_data<paddle::platform::float16>(
-    float16 arr[6] = {float16(0), float16(1), float16(2),
+        paddle::framework::make_ddim({2, 3}), cpu_place);
-                      float16(3), float16(4), float16(5)};
+    paddle::platform::float16 arr[6] = {
+        paddle::platform::float16(0), paddle::platform::float16(1),
+        paddle::platform::float16(2), paddle::platform::float16(3),
+        paddle::platform::float16(4), paddle::platform::float16(5)};
    int data_number = sizeof(arr) / sizeof(arr[0]);
    memcpy(ptr, arr, sizeof(arr));
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
    context.Wait();
    // transform from float16 to other data types
-    TransDataType(kernel_fp16, kernel_fp32, in_gpu, &out_gpu);
+    paddle::framework::TransDataType(kernel_fp16, kernel_fp32, in_gpu,
-    TensorCopy(out_gpu, cpu_place, context, &out);
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    float* out_data_float = out.data<float>();
@@ -96,8 +118,9 @@ TEST(DataTypeTransform, GPUTransform) {
      EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
    }
-    TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu);
+    paddle::framework::TransDataType(kernel_fp16, kernel_fp64, in_gpu,
-    TensorCopy(out_gpu, cpu_place, context, &out);
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    double* out_data_double = out.data<double>();
@@ -105,8 +128,9 @@ TEST(DataTypeTransform, GPUTransform) {
      EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
    }
-    TransDataType(kernel_fp16, kernel_int32, in_gpu, &out_gpu);
+    paddle::framework::TransDataType(kernel_fp16, kernel_int32, in_gpu,
-    TensorCopy(out_gpu, cpu_place, context, &out);
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    int* out_data_int = out.data<int>();
@@ -114,8 +138,9 @@ TEST(DataTypeTransform, GPUTransform) {
      EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
    }
-    TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu);
+    paddle::framework::TransDataType(kernel_fp16, kernel_int64, in_gpu,
-    TensorCopy(out_gpu, cpu_place, context, &out);
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    int64_t* out_data_int64 = out.data<int64_t>();
@@ -123,8 +148,9 @@ TEST(DataTypeTransform, GPUTransform) {
      EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
    }
-    TransDataType(kernel_fp16, kernel_bool, in_gpu, &out_gpu);
+    paddle::framework::TransDataType(kernel_fp16, kernel_bool, in_gpu,
-    TensorCopy(out_gpu, cpu_place, context, &out);
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    bool* out_data_bool = out.data<bool>();
@@ -133,90 +159,103 @@ TEST(DataTypeTransform, GPUTransform) {
    }
    // transform float to float16
-    float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
+    float* in_data_float =
+        in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), cpu_place);
    for (int i = 0; i < data_number; ++i) {
      in_data_float[i] = i;
    }
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
    context.Wait();
-    TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu);
+    paddle::framework::TransDataType(kernel_fp32, kernel_fp16, in_gpu,
-    TensorCopy(out_gpu, cpu_place, context, &out);
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_float[i]).x);
    }
    // transform double to float16
-    double* in_data_double =
+    double* in_data_double = in.mutable_data<double>(
-        in.mutable_data<double>(make_ddim({2, 3}), cpu_place);
+        paddle::framework::make_ddim({2, 3}), cpu_place);
    for (int i = 0; i < data_number; ++i) {
      in_data_double[i] = i;
    }
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
    context.Wait();
-    TransDataType(kernel_fp64, kernel_fp16, in_gpu, &out_gpu);
+    paddle::framework::TransDataType(kernel_fp64, kernel_fp16, in_gpu,
-    TensorCopy(out_gpu, cpu_place, context, &out);
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_double[i]).x);
    }
    // transform int to float16
-    int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), cpu_place);
+    int* in_data_int =
+        in.mutable_data<int>(paddle::framework::make_ddim({2, 3}), cpu_place);
    for (int i = 0; i < data_number; ++i) {
      in_data_int[i] = i;
    }
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
    context.Wait();
-    TransDataType(kernel_int32, kernel_fp16, in_gpu, &out_gpu);
+    paddle::framework::TransDataType(kernel_int32, kernel_fp16, in_gpu,
-    TensorCopy(out_gpu, cpu_place, context, &out);
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_int[i]).x);
    }
    // transform int64 to float16
-    int64_t* in_data_int64 =
+    int64_t* in_data_int64 = in.mutable_data<int64_t>(
-        in.mutable_data<int64_t>(make_ddim({2, 3}), cpu_place);
+        paddle::framework::make_ddim({2, 3}), cpu_place);
    for (int i = 0; i < data_number; ++i) {
      in_data_int64[i] = i;
    }
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
    context.Wait();
-    TransDataType(kernel_int64, kernel_fp16, in_gpu, &out_gpu);
+    paddle::framework::TransDataType(kernel_int64, kernel_fp16, in_gpu,
-    TensorCopy(out_gpu, cpu_place, context, &out);
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_int64[i]).x);
    }
    // transform bool to float16
-    bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), cpu_place);
+    bool* in_data_bool =
+        in.mutable_data<bool>(paddle::framework::make_ddim({2, 3}), cpu_place);
    for (int i = 0; i < data_number; ++i) {
      in_data_bool[i] = i;
    }
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
    context.Wait();
-    TransDataType(kernel_bool, kernel_fp16, in_gpu, &out_gpu);
+    paddle::framework::TransDataType(kernel_bool, kernel_fp16, in_gpu,
-    TensorCopy(out_gpu, cpu_place, context, &out);
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
    for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_bool[i]).x);
    }
  }
 }
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -2,29 +2,37 @@ cc_library(var_handle SRCS var_handle.cc DEPS place)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
-nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-        dynload_cuda)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
+cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
 if(WITH_GPU)
+    nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+            dynload_cuda)
    set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
+    nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
 else()
    set(multi_devices_graph_builder_deps)
+    cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
 endif()
+cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
+cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
-            scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
+        scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle)
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
        simple_threadpool device_context)
-cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory)
-cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory)
 cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
        device_context broadcast_op_handle)
 cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
        device_context gather_op_handle)
+cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
+        device_context reduce_op_handle )
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -13,95 +13,77 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
 namespace paddle {
 namespace framework {
 namespace details {
-Tensor *GetTensorFromVar(Variable *in_var) {
-  if (in_var->IsType<LoDTensor>()) {
-    return in_var->GetMutable<LoDTensor>();
-  } else if (in_var->IsType<SelectedRows>()) {
-    return in_var->GetMutable<SelectedRows>()->mutable_value();
-  } else {
-    PADDLE_THROW("Var should be LoDTensor or SelectedRows");
-  }
-  return nullptr;
-}
 BroadcastOpHandle::BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
                                     const std::vector<platform::Place> &places)
    : local_scopes_(local_scopes), places_(places) {}
 void BroadcastOpHandle::RunImpl() {
-  // the input may have dummy var.
+  // the input and output may have dummy var.
-  std::vector<VarHandle *> in_var_handle;
+  VarHandle *in_var_handle;
-  for (auto *in : inputs_) {
-    auto *out_handle = dynamic_cast<VarHandle *>(in);
+  {
-    if (out_handle) {
+    auto in_var_handles = DynamicCast<VarHandle>(inputs_);
-      in_var_handle.push_back(out_handle);
+    PADDLE_ENFORCE_EQ(in_var_handles.size(), 1,
-    }
+                      "The number of input should be one.");
-  }
+    in_var_handle = in_var_handles[0];
-  PADDLE_ENFORCE_EQ(in_var_handle.size(), 1,
-                    "The number of input should be one.");
-  // the output may have dummy var.
-  std::vector<VarHandle *> out_var_handles;
-  for (auto *out : outputs_) {
-    auto *out_handle = dynamic_cast<VarHandle *>(out);
-    if (out_handle) {
-      out_var_handles.push_back(out_handle);
-    }
  }
+  auto out_var_handles = DynamicCast<VarHandle>(outputs_);
  PADDLE_ENFORCE_EQ(
      out_var_handles.size(), places_.size(),
      "The number of output should equal to the number of places.");
-  // Wait input done, this Wait is asynchronous operation
+  // Wait for the input to be generated; this Wait is an asynchronous
-  auto &in_place = in_var_handle[0]->place_;
+  // operation.
-  if (in_var_handle[0]->generated_op_) {
+  WaitInputVarGenerated(*in_var_handle);
-    for (auto *out : out_var_handles) {
-      auto &out_p = out->place_;
+  std::vector<const Scope *> var_scopes;
-      in_var_handle[0]->generated_op_->Wait(dev_ctxes_[out_p]);
+  for (auto *s : local_scopes_) {
-    }
+    var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
  }
-  //
+  auto *in_var =
-  auto in_scope_idx = in_var_handle[0]->scope_idx_;
+      var_scopes.at(in_var_handle->scope_idx_)->FindVar(in_var_handle->name_);
-  auto in_var =
+  PADDLE_ENFORCE_NOT_NULL(in_var);
-      local_scopes_.at(in_scope_idx)->FindVar(in_var_handle[0]->name_);
-  Tensor *in_tensor = GetTensorFromVar(in_var);
+  Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
  for (auto *out : out_var_handles) {
-    auto &out_p = out->place_;
+    if (*out == *in_var_handle) {
-    auto out_var = local_scopes_.at(out->scope_idx_)->FindVar(out->name_);
+      continue;
+    }
-    PADDLE_ENFORCE_EQ(out_p.which(), in_place.which(),
+    auto &out_p = out->place_;
+    auto *out_var = var_scopes.at(out->scope_idx_)->FindVar(out->name_);
+    PADDLE_ENFORCE_NOT_NULL(out_var);
+    PADDLE_ENFORCE_EQ(out_p.which(), in_var_handle->place_.which(),
                      "Places must be all on CPU or all on CUDA.");
-    if (in_var->IsType<framework::SelectedRows>()) {
+    VariableVisitor::ShareDimsAndLoD(*in_var, out_var);
-      auto &in_sr = in_var->Get<framework::SelectedRows>();
+    VariableVisitor::GetMutableTensor(out_var).mutable_data(out_p,
-      auto out_sr = out_var->GetMutable<framework::SelectedRows>();
+                                                            in_tensor.type());
-      if (&in_sr == out_sr) continue;
-      out_sr->set_height(in_sr.height());
-      out_sr->set_rows(in_sr.rows());
-      out_sr->mutable_value()->Resize(in_sr.value().dims());
-      out_sr->mutable_value()->mutable_data(out_p, in_sr.value().type());
-    } else if (in_var->IsType<framework::LoDTensor>()) {
-      auto in_lod = in_var->Get<framework::LoDTensor>();
-      auto out_lod = out_var->GetMutable<framework::LoDTensor>();
-      if (&in_lod == out_lod) continue;
-      out_lod->set_lod(in_lod.lod());
-      out_lod->Resize(in_lod.dims());
-      out_lod->mutable_data(out_p, in_lod.type());
-    } else {
-      PADDLE_THROW("Var should be LoDTensor or SelectedRows.");
-    }
-    Tensor *out_tensor = GetTensorFromVar(out_var);
+    auto dev_ctx = dev_ctxes_.at(out_p);
-    paddle::framework::TensorCopy(*in_tensor, out_p, *(dev_ctxes_[in_place]),
+    RunAndRecordEvent(out_p, [in_tensor, out_var, dev_ctx, out_p] {
-                                  out_tensor);
+      paddle::framework::TensorCopy(
+          in_tensor, out_p, *(dev_ctx),
+          &VariableVisitor::GetMutableTensor(out_var));
+    });
+  }
+}
+void BroadcastOpHandle::WaitInputVarGenerated(const VarHandle &in_var) {
+  if (in_var.generated_op_) {
+    for (auto &pair : dev_ctxes_) {
+      in_var.generated_op_->Wait(pair.second);
+    }
  }
 }

--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -29,9 +29,7 @@ namespace framework {
 namespace details {
 struct BroadcastOpHandle : public OpHandleBase {
-  const std::vector<Scope *> &local_scopes_;
+ public:
-  const std::vector<platform::Place> &places_;
  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places);
@@ -41,8 +39,12 @@ struct BroadcastOpHandle : public OpHandleBase {
 protected:
  void RunImpl() override;
-};
+  void WaitInputVarGenerated(const VarHandle &in_var);
+ private:
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+};
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -30,6 +30,7 @@ const f::DDim kDims = {20, 20};
 struct TestBroadcastOpHandle {
  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
  std::vector<Scope*> local_scopes_;
+  std::vector<Scope*> param_scopes_;
  Scope g_scope_;
  std::unique_ptr<OpHandleBase> op_handle_;
  std::vector<std::unique_ptr<VarHandleBase>> vars_;
@@ -72,11 +73,17 @@ struct TestBroadcastOpHandle {
  void InitBroadcastOp(size_t input_scope_idx) {
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      local_scopes_.push_back(&(g_scope_.NewScope()));
-      local_scopes_[j]->Var("out");
+      Scope& local_scope = local_scopes_.back()->NewScope();
+      *local_scopes_.back()
+           ->Var(details::kLocalExecScopeName)
+           ->GetMutable<Scope*>() = &local_scope;
+      local_scope.Var("out");
+      param_scopes_.emplace_back(&local_scope);
    }
-    local_scopes_[input_scope_idx]->Var("input");
+    param_scopes_[input_scope_idx]->Var("input");
    op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
    auto* in_var_handle =
        new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]);
    vars_.emplace_back(in_var_handle);
@@ -90,7 +97,7 @@ struct TestBroadcastOpHandle {
    op_handle_->AddInput(dummy_var_handle);
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
+      op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
      VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]);
      vars_.emplace_back(out_var_handle);
      op_handle_->AddOutput(out_var_handle);
@@ -105,7 +112,8 @@ struct TestBroadcastOpHandle {
  }
  void TestBroadcastLodTensor(size_t input_scope_idx) {
-    auto in_var = local_scopes_[input_scope_idx]->Var("input");
+    auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
+    PADDLE_ENFORCE_NOT_NULL(in_var);
    auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
    in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
@@ -117,6 +125,7 @@ struct TestBroadcastOpHandle {
    paddle::framework::TensorFromVector<float>(
        send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
    in_lod_tensor->set_lod(lod);
+    in_lod_tensor->Resize(kDims);
    op_handle_->Run(false);
@@ -124,7 +133,8 @@ struct TestBroadcastOpHandle {
    p::CPUPlace cpu_place;
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      auto out_var = local_scopes_[j]->Var("out");
+      auto out_var = param_scopes_[j]->FindVar("out");
+      PADDLE_ENFORCE_NOT_NULL(out_var);
      auto out_tensor = out_var->Get<f::LoDTensor>();
      PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
@@ -139,7 +149,8 @@ struct TestBroadcastOpHandle {
  }
  void TestBroadcastSelectedRows(size_t input_scope_idx) {
-    auto in_var = local_scopes_[input_scope_idx]->Var("input");
+    auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
+    PADDLE_ENFORCE_NOT_NULL(in_var);
    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
    auto value = in_selected_rows->mutable_value();
    value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
@@ -162,7 +173,8 @@ struct TestBroadcastOpHandle {
    p::CPUPlace cpu_place;
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      auto out_var = local_scopes_[j]->Var("out");
+      auto out_var = param_scopes_[j]->FindVar("out");
+      PADDLE_ENFORCE_NOT_NULL(out_var);
      auto& out_select_rows = out_var->Get<f::SelectedRows>();
      auto rt = out_select_rows.value();

--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -28,8 +28,8 @@ ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
 void ComputationOpHandle::RunImpl() {
  auto *cur_ctx = dev_ctxes_[place_];
  for (auto *in : inputs_) {
-    bool need_wait =
+    bool need_wait = in->generated_op_ &&
-        in->generated_op_ && in->generated_op_->dev_ctxes_[place_] != cur_ctx;
+                     in->generated_op_->DeviceContext(place_) != cur_ctx;
    if (need_wait) {
      in->generated_op_->Wait(cur_ctx);
    }

--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -14,6 +14,9 @@
 #pragma once
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -24,10 +27,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 struct ComputationOpHandle : public OpHandleBase {
-  std::unique_ptr<OperatorBase> op_;
+ public:
-  Scope *scope_;
-  platform::Place place_;
  ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
                      platform::Place place);
@@ -35,6 +35,11 @@ struct ComputationOpHandle : public OpHandleBase {
 protected:
  void RunImpl() override;
+ private:
+  std::unique_ptr<OperatorBase> op_;
+  Scope *scope_;
+  platform::Place place_;
 };
 }  // namespace details
 }  // namespace framework

--- a/paddle/fluid/framework/details/container_cast.h
+++ b/paddle/fluid/framework/details/container_cast.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <type_traits>
+#include <vector>
+namespace paddle {
+namespace framework {
+namespace details {
+// Filters `container` down to the elements whose dynamic type is (or derives
+// from) ResultType. Each element is tested with dynamic_cast; elements for
+// which the cast yields nullptr are dropped, and the relative order of the
+// remaining elements is preserved.
+//
+// ResultType must derive from ElemType (checked at compile time).
+template <typename ResultType, typename ElemType>
+std::vector<ResultType*> DynamicCast(const std::vector<ElemType*>& container) {
+  static_assert(std::is_base_of<ElemType, ResultType>::value,
+                "ElemType must be a base class of ResultType");
+  std::vector<ResultType*> res;
+  for (auto* ptr : container) {
+    auto* derived = dynamic_cast<ResultType*>(ptr);
+    if (derived) {
+      res.emplace_back(derived);
+    }
+  }
+  return res;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/cow_ptr.h
+++ b/paddle/fluid/framework/details/cow_ptr.h
@@ -14,7 +14,7 @@
 #pragma once
 #include <memory>
-#include <thread>
+#include <thread>  // NOLINT
 namespace paddle {
 namespace framework {
@@ -23,7 +23,7 @@ namespace details {
 // Change it to thread safe flags if needed.
 class ThreadUnsafeOwnershipFlags {
 public:
-  ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
+  explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
  ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
  ThreadUnsafeOwnershipFlags& operator=(

--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -51,23 +51,23 @@ void FetchOpHandle::RunImpl() {
    auto *var = static_cast<VarHandle *>(input);
    var->generated_op_->Wait(cpu_ctx);
  }
  tensors_.resize(inputs_.size());
-  auto *var = static_cast<VarHandle *>(inputs_[0]);
+  auto *var_handle = static_cast<VarHandle *>(inputs_[0]);
-  auto &var_name = var->name_;
+  auto &var_name = var_handle->name_;
  platform::CPUPlace cpu;
  auto &scopes = *local_scopes_;
  for (size_t i = 0; i < scopes.size(); ++i) {
    auto &scope = scopes[i];
-    auto &t = scope->FindVar(kLocalExecScopeName)
+    auto *var =
-                  ->Get<Scope *>()
+        scope->FindVar(kLocalExecScopeName)->Get<Scope *>()->FindVar(var_name);
-                  ->FindVar(var_name)
+    PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
-                  ->Get<framework::LoDTensor>();
+                            var_name);
-    if (platform::is_gpu_place(var->place_)) {
+    auto &t = var->Get<framework::LoDTensor>();
+    if (platform::is_gpu_place(t.place())) {
 #ifdef PADDLE_WITH_CUDA
-      TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
+      TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i], true);
-      dev_ctxes_[t.place()]->Wait();
+      dev_ctxes_.at(t.place())->Wait();
 #endif
    } else {
      tensors_[i].ShareDataWith(t);

--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -14,6 +14,9 @@
 #pragma once
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/scope.h"
@@ -24,11 +27,7 @@ namespace framework {
 namespace details {
 struct FetchOpHandle : public OpHandleBase {
-  FeedFetchList *data_;
+ public:
-  size_t offset_;
-  std::vector<Scope *> *local_scopes_;
-  std::vector<LoDTensor> tensors_;
  FetchOpHandle(FeedFetchList *data, size_t offset,
                std::vector<Scope *> *local_scopes);
@@ -42,6 +41,12 @@ struct FetchOpHandle : public OpHandleBase {
 protected:
  void RunImpl() override;
+ private:
+  FeedFetchList *data_;
+  size_t offset_;
+  std::vector<Scope *> *local_scopes_;
+  std::vector<LoDTensor> tensors_;
 };
 }  // namespace details

--- a/paddle/fluid/framework/details/gather_op_handle.cc
+++ b/paddle/fluid/framework/details/gather_op_handle.cc
@@ -13,6 +13,8 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/gather_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
 namespace paddle {
 namespace framework {
@@ -23,46 +25,40 @@ GatherOpHandle::GatherOpHandle(const std::vector<Scope *> &local_scopes,
    : local_scopes_(local_scopes), places_(places) {}
 void GatherOpHandle::RunImpl() {
-  // the input may have dummy var.
+  // the input and output may have dummy var.
-  std::vector<VarHandle *> in_var_handles;
+  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
-  for (auto *in : inputs_) {
-    auto *in_handle = dynamic_cast<VarHandle *>(in);
-    if (in_handle) {
-      in_var_handles.push_back(in_handle);
-    }
-  }
  PADDLE_ENFORCE_EQ(
      in_var_handles.size(), places_.size(),
      "The number of output should equal to the number of places.");
-  // the output may have dummy var.
+  VarHandle *out_var_handle;
-  std::vector<VarHandle *> out_var_handles;
+  {
-  for (auto *out : outputs_) {
+    auto out_var_handles = DynamicCast<VarHandle>(outputs_);
-    auto *out_handle = dynamic_cast<VarHandle *>(out);
-    if (out_handle) {
+    PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
-      out_var_handles.push_back(out_handle);
+                      "The number of output should be one.");
-    }
+    out_var_handle = out_var_handles.front();
  }
-  PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
-                    "The number of output should be one.");
-  auto in_0_handle = static_cast<VarHandle *>(in_var_handles[0]);
+  std::vector<const Scope *> var_scopes;
-  auto pre_in_var =
+  for (auto *s : local_scopes_) {
-      local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_);
+    var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
-  auto pre_place = in_0_handle->place_;
+  }
+  auto in_0_handle = in_var_handles[0];
+  auto pre_in_var =
+      var_scopes.at(in_0_handle->scope_idx_)->FindVar(in_0_handle->name_);
+  PADDLE_ENFORCE_NOT_NULL(pre_in_var);
  PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
                 "Currently, gather_op only can gather SelectedRows.");
-  PADDLE_ENFORCE_EQ(out_var_handles[0]->place_.which(), pre_place.which(),
+  auto pre_place = in_0_handle->place_;
+  PADDLE_ENFORCE_EQ(out_var_handle->place_.which(), pre_place.which(),
                    "The place of input and output should be the same.");
  // Wait input done, this Wait is asynchronous operation
-  for (auto *in : in_var_handles) {
+  WaitInputVarGenerated(in_var_handles);
-    if (in->generated_op_) {
-      in->generated_op_->Wait(dev_ctxes_[in->place_]);
-    }
-  }
  std::vector<int64_t> out_rows;
  std::vector<Tensor> in_tensors;
@@ -70,34 +66,32 @@ void GatherOpHandle::RunImpl() {
  auto &pre_in = pre_in_var->Get<framework::SelectedRows>();
  // gather the inputs
-  for (auto *in : in_var_handles) {
+  for (auto *in_handle : in_var_handles) {
-    auto in_handle = static_cast<VarHandle *>(in);
    auto in_p = in_handle->place_;
    in_places.push_back(in_p);
    PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(),
                      "Places must be all on CPU or all on CUDA.");
-    auto in_var =
+    auto *in_var =
-        local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
+        var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
    auto &in_sr = in_var->Get<framework::SelectedRows>();
    PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(),
                      "The type of input is not consistent.");
    PADDLE_ENFORCE_EQ(pre_in.height(), in_sr.height(),
                      "The height of inputs is not consistent.");
-    PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(), ,
+    PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(),
                      "The dims of inputs is not consistent.");
-    auto in_sr_rows = in_sr.rows();
+    auto &in_sr_rows = in_sr.rows();
    out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end());
    in_tensors.emplace_back(in_sr.value());
  }
  // write the output
-  auto &out_place = out_var_handles[0]->place_;
+  auto &out_place = out_var_handle->place_;
-  auto out_scope_idx = out_var_handles[0]->scope_idx_;
+  auto out_scope_idx = out_var_handle->scope_idx_;
-  auto out_var =
+  auto out_var = var_scopes.at(out_scope_idx)->FindVar(out_var_handle->name_);
-      local_scopes_[out_scope_idx]->FindVar(out_var_handles[0]->name_);
  auto out = out_var->GetMutable<framework::SelectedRows>();
  out->set_height(pre_in.height());
@@ -110,13 +104,27 @@ void GatherOpHandle::RunImpl() {
  Tensor *out_tensor = out->mutable_value();
  // copy
-  int s = 0, e = 0;
+  auto dev_ctx = dev_ctxes_[out_place];
-  for (size_t j = 0; j < in_tensors.size(); ++j) {
+  RunAndRecordEvent(out_place, [in_tensors, out_tensor, dev_ctx, out_place] {
-    e += in_tensors[j].dims()[0];
+    int s = 0, e = 0;
-    auto sub_out = out_tensor->Slice(s, e);
+    for (size_t j = 0; j < in_tensors.size(); ++j) {
-    paddle::framework::TensorCopy(in_tensors[j], out_place,
+      e += in_tensors[j].dims()[0];
-                                  *(dev_ctxes_[in_places[j]]), &sub_out);
+      auto sub_out = out_tensor->Slice(s, e);
-    s = e;
+      paddle::framework::TensorCopy(in_tensors[j], out_place, *(dev_ctx),
+                                    &sub_out);
+      s = e;
+    }
+  });
+}
+// For every input handle that has a generating op, calls that op's Wait()
+// once for each device context registered in dev_ctxes_. Inputs without a
+// generating op are skipped.
+void GatherOpHandle::WaitInputVarGenerated(
+    const std::vector<VarHandle *> &in_var_handles) {
+  for (auto *in : in_var_handles) {
+    if (in->generated_op_) {
+      // Iterate by reference so the (place, context) entry is not copied.
+      for (auto &pair : dev_ctxes_) {
+        in->generated_op_->Wait(pair.second);
+      }
+    }
   }
 }

--- a/paddle/fluid/framework/details/gather_op_handle.h
+++ b/paddle/fluid/framework/details/gather_op_handle.h
@@ -29,9 +29,7 @@ namespace framework {
 namespace details {
 struct GatherOpHandle : public OpHandleBase {
-  const std::vector<Scope *> &local_scopes_;
+ public:
-  const std::vector<platform::Place> &places_;
  GatherOpHandle(const std::vector<Scope *> &local_scopes,
                 const std::vector<platform::Place> &places);
@@ -41,6 +39,11 @@ struct GatherOpHandle : public OpHandleBase {
 protected:
  void RunImpl() override;
+  void WaitInputVarGenerated(const std::vector<VarHandle *> &in_var_handles);
+ private:
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
 };
 }  // namespace details

--- a/paddle/fluid/framework/details/gather_op_handle_test.cc
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -29,6 +29,7 @@ const f::DDim kDims = {20, 20};
 struct TestGatherOpHandle {
  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
  std::vector<Scope*> local_scopes_;
+  std::vector<Scope*> param_scopes_;
  Scope g_scope_;
  std::unique_ptr<OpHandleBase> op_handle_;
  std::vector<std::unique_ptr<VarHandleBase>> vars_;
@@ -71,14 +72,19 @@ struct TestGatherOpHandle {
  void InitGatherOp(size_t input_scope_idx) {
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      local_scopes_.push_back(&(g_scope_.NewScope()));
-      local_scopes_[j]->Var("out");
+      Scope& local_scope = local_scopes_.back()->NewScope();
+      *local_scopes_.back()
+           ->Var(details::kLocalExecScopeName)
+           ->GetMutable<Scope*>() = &local_scope;
+      local_scope.Var("input");
+      param_scopes_.emplace_back(&local_scope);
    }
-    local_scopes_[input_scope_idx]->Var("input");
+    param_scopes_[input_scope_idx]->Var("out");
    op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_));
    // add input
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
+      op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
      auto* in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
      vars_.emplace_back(in_var_handle);
      op_handle_->AddInput(in_var_handle);
@@ -115,7 +121,8 @@ struct TestGatherOpHandle {
    for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
         ++input_scope_idx) {
-      auto in_var = local_scopes_[input_scope_idx]->Var("input");
+      auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input");
+      PADDLE_ENFORCE_NOT_NULL(in_var);
      auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
      auto value = in_selected_rows->mutable_value();
      value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
@@ -128,10 +135,11 @@ struct TestGatherOpHandle {
      value->Resize(kDims);
    }
-    auto out_var = local_scopes_[output_scope_idx]->Var("out");
+    auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out");
+    PADDLE_ENFORCE_NOT_NULL(out_var);
    auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
-    auto in_var = local_scopes_[output_scope_idx]->Var("input");
+    auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input");
    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
    out_selected_rows->mutable_value()->ShareDataWith(
@@ -155,7 +163,8 @@ struct TestGatherOpHandle {
    f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
    float* ct = result_tensor.data<float>();
-    for (int64_t j = 0; j < f::product(kDims); ++j) {
+    for (int64_t j = 0;
+         j < f::product(kDims) * static_cast<int64_t>(gpu_list_.size()); ++j) {
      ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
    }
  }

--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -34,7 +34,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
    const std::vector<platform::Place> &places,
    const std::string &loss_var_name,
    const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes,
+    const std::vector<Scope *> &local_scopes, bool skip_scale_loss,
    platform::NCCLContextMap *nccl_ctxs)
    : loss_var_name_(loss_var_name),
      places_(places),
@@ -45,7 +45,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
    const std::vector<platform::Place> &places,
    const std::string &loss_var_name,
    const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes)
+    const std::vector<Scope *> &local_scopes, bool skip_scale_loss)
    : loss_var_name_(loss_var_name),
      places_(places),
      local_scopes_(local_scopes) {
@@ -53,6 +53,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
  for (auto &p : params) {
    grad_names_.insert(GradVarName(p));
  }
+  skip_scale_loss_ = skip_scale_loss;
 }
 void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
@@ -60,7 +61,8 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
                                                const platform::Place &p,
                                                const size_t &i) const {
  auto *op_handle = result->ops_.back().get();
-  op_handle->dev_ctxes_[p] = platform::DeviceContextPool::Instance().Get(p);
+  op_handle->SetDeviceContext(p,
+                              platform::DeviceContextPool::Instance().Get(p));
  auto var_names = op.InputArgumentNames();
@@ -76,6 +78,33 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
  }
 }
+// Returns true when `op` is a "split" op one of whose ".block" outputs is an
+// input of `send_op`, or a "concat" op one of whose ".block" inputs is an
+// output of `send_op`. Returns false for every other op type and when
+// `send_op` is null.
+bool MultiDevSSAGraphBuilder::IsDistTrainOp(const OpDesc &op,
+                                            OpDesc *send_op) const {
+  if (send_op == nullptr) {
+    return false;
+  }
+  // Both name lists are taken by const reference to avoid copying them.
+  auto checker = [](const std::vector<std::string> &opvars,
+                    const std::vector<std::string> &sendvars) -> bool {
+    for (auto &var : opvars) {
+      if (var.find(".block") != std::string::npos &&
+          std::find(sendvars.begin(), sendvars.end(), var) != sendvars.end()) {
+        return true;
+      }
+    }
+    return false;
+  };
+  if (op.Type() == "split") {
+    return checker(op.OutputArgumentNames(), send_op->InputArgumentNames());
+  } else if (op.Type() == "concat") {
+    return checker(op.InputArgumentNames(), send_op->OutputArgumentNames());
+  }
+  return false;
+}
 std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
    const ProgramDesc &program) const {
  auto graph = new SSAGraph();
@@ -87,103 +116,40 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
      std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>(
      places_.size());
-  bool is_forwarding = true;
+  // Find "send" op first for split is in front of send.
+  OpDesc *send_op = nullptr;
  for (auto *op : program.Block(0).AllOps()) {
-    bool change_forward = false;
+    if (op->Type() == "send") {
-    if (!is_forwarding) {
+      send_op = op;
-      // FIXME(yy): Do not hard code like this
+      break;
-      if (op->OutputArgumentNames().size() == 1 &&
-          op->OutputArgumentNames()[0] == GradVarName(loss_var_name_)) {
-        continue;  // Drop fill 1. for backward coeff;
-      }
-    }
-    // append send op if program is distributed trainer main program.
-    // always use the first device
-    if (!is_forwarding && op->Type() == "send") {
-      auto &p = places_[0];
-      auto *s = local_scopes_[0];
-      // FIXME(wuyi): send op always copy from GPU 0
-      result.ops_.emplace_back(new SendOpHandle(*op, s, p));
-      // Create inputs for output on original place and no ssa output
-      // is created for send op.
-      CreateOpHandleIOs(&result, *op, p, 0);
-      continue;
    }
+  }
-    for (size_t i = 0; i < places_.size(); ++i) {
+  bool is_forwarding = true;
-      auto &p = places_[i];
+  for (auto *op : program.Block(0).AllOps()) {
-      auto *s = local_scopes_[i];
+    if (op->Type() == "send") {
+      // append send op if program is distributed trainer main program.
-      result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
+      // always use the first device
-      auto *op_handle = result.ops_.back().get();
+      CreateSendOp(&result, *op);
-      CreateOpHandleIOs(&result, *op, p, i);
+    } else if (IsDistTrainOp(*op, send_op)) {
+      CreateComputationalOps(&result, *op, 1);
-      auto var_names = op->OutputArgumentNames();
+    } else if (IsScaleLossOp(*op)) {
+      if (!skip_scale_loss_) {
-      if (is_forwarding) {
+        CreateScaleLossGradOp(&result);
-        if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
-// Insert ScaleCost OpHandle
-#ifdef PADDLE_WITH_CUDA
-          auto *communication_dev_ctx = nccl_ctxs_->DevCtx(p);
-#else
-          auto *communication_dev_ctx =
-              platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
-#endif
-          op_handle = new ScaleLossGradOpHandle(local_scopes_.size(), s, p,
-                                                communication_dev_ctx);
-          result.ops_.emplace_back(op_handle);
-          // FIXME: Currently ScaleLossGradOp only use device_count as scale
-          // factor. So it does not depend on any other operators.
-          // VarHandle *loss = GetVarHandle(loss_var_name, place);
-          // loss->pending_ops_.emplace_back(op_handle);
-          // op_handle->inputs_.emplace_back(loss);
-          CreateOpOutput(&result, op_handle, GradVarName(loss_var_name_), p, i);
-          change_forward = true;
-        }
      }
-    }
-    if (change_forward) {
      is_forwarding = false;
-    }
+    } else {
+      CreateComputationalOps(&result, *op, places_.size());
-    if (!is_forwarding) {
+      if (!is_forwarding) {
-      auto var_names = op->OutputArgumentNames();
+        // Currently, we assume that once gradient is generated, it can be
-      // Currently, we assume that once gradient is generated, it can be
+        // broadcast, and each gradient is only broadcast once. But there are no
-      // broadcast, and each gradient is only broadcast once. But there are no
+        // other cases, for example, we need to adjust the gradient according to
-      // other cases, for example, we need to adjust the gradient according to
+        // the input when we get the gradient, which is not considered at
-      // the input when we get the gradient, which is not considered at present.
+        // present.
-      for (auto &og : var_names) {
+        for (auto &og : op->OutputArgumentNames()) {
-        if (grad_names_.count(og) != 0 &&
+          if (IsParameterGradientOnce(og, &og_has_been_broadcast)) {
-            og_has_been_broadcast.count(og) == 0) {  // is param grad
+            InsertNCCLAllReduceOp(&result, og);
-                                                     // Insert NCCL AllReduce Op
-          og_has_been_broadcast.insert(og);
-#ifdef PADDLE_WITH_CUDA
-          result.ops_.emplace_back(
-              new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
-          auto *op_handle = result.ops_.back().get();
-          for (size_t i = 0; i < places_.size(); ++i) {
-            auto &p = places_[i];
-            auto &vars = result.vars_[i][og];
-            if (vars.empty()) {  // This device has no data. continue.
-              continue;
-            }
-            auto &prev_grad = vars[vars.size() - 1];
-            op_handle->AddInput(prev_grad.get());
-            auto var = new VarHandle(vars.size() - 1, i, og, p);
-            vars.emplace_back(var);
-            op_handle->AddOutput(var);
          }
-#else
-          PADDLE_ENFORCE("Not implemented");
-#endif
        }
      }
    }
@@ -207,7 +173,96 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
  }
  return std::unique_ptr<SSAGraph>(graph);
-}  // namespace details
+}
+// Appends an NCCL all-reduce op handle for the gradient variable `og` to
+// `result` and wires it into the per-device variable lists.
+//
+// For every place, the latest existing handle of `og` becomes an input of the
+// all-reduce op and a freshly created VarHandle is appended as its output.
+// Each device must already hold at least one handle for `og`.
+//
+// Without PADDLE_WITH_CUDA this operation is unsupported and throws.
+void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
+    SSAGraph *result, const std::string &og) const {
+#ifdef PADDLE_WITH_CUDA
+  result->ops_.emplace_back(
+      new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
+  auto *op_handle = result->ops_.back().get();
+  for (size_t i = 0; i < places_.size(); ++i) {
+    auto &p = places_[i];
+    auto &vars = result->vars_[i][og];
+    PADDLE_ENFORCE(!vars.empty());
+    auto &prev_grad = vars.back();
+    op_handle->AddInput(prev_grad.get());
+    auto var = new VarHandle(vars.size() - 1, i, og, p);
+    vars.emplace_back(var);
+    op_handle->AddOutput(var);
+  }
+#else
+  // PADDLE_ENFORCE("Not implemented") would treat the (non-null) string
+  // literal as a true condition and never fail, so throw unconditionally.
+  PADDLE_THROW("Not implemented");
+#endif
+}
+// Reports whether `og` names a parameter gradient that has not been seen
+// before, and remembers it in `og_has_been_broadcast` when that is the case.
+// A second call with the same name therefore returns false.
+bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
+    const std::string &og,
+    std::unordered_set<std::string> *og_has_been_broadcast) const {
+  if (grad_names_.count(og) == 0) {
+    // Not a parameter gradient: leave the bookkeeping set untouched.
+    return false;
+  }
+  // insert() reports through `.second` whether the name was newly added,
+  // which is exactly the "first time seen" answer.
+  return og_has_been_broadcast->insert(og).second;
+}
+// Adds one ScaleLossGradOpHandle per place to `result`; each handle produces
+// the gradient variable of the loss on its own device.
+void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
+  const size_t num_places = places_.size();
+  for (size_t dev_idx = 0; dev_idx < num_places; ++dev_idx) {
+    const auto &place = places_[dev_idx];
+// Pick the device context the scale op communicates on: the NCCL context of
+// this place in CUDA builds, the CPU context otherwise.
+#ifdef PADDLE_WITH_CUDA
+    auto *comm_ctx = nccl_ctxs_->DevCtx(place);
+#else
+    auto *comm_ctx =
+        platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
+#endif
+    auto *scale_op = new ScaleLossGradOpHandle(
+        local_scopes_.size(), local_scopes_[dev_idx], place, comm_ctx);
+    result->ops_.emplace_back(scale_op);
+    // FIXME: ScaleLossGradOp currently uses only the device count as its
+    // scale factor, so it is not given the loss variable as an input and
+    // does not depend on any other operator.
+    CreateOpOutput(result, scale_op, GradVarName(loss_var_name_), place,
+                   dev_idx);
+  }
+}
+// Creates a ComputationOpHandle for `op` on each of the first `num_places`
+// devices and connects its inputs and outputs in `result`.
+void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result,
+                                                     const OpDesc &op,
+                                                     size_t num_places) const {
+  size_t dev_idx = 0;
+  while (dev_idx < num_places) {
+    auto place = places_[dev_idx];
+    auto scope = local_scopes_[dev_idx];
+    // The new handle is the last element of ops_, which is the one
+    // CreateOpHandleIOs wires up.
+    result->ops_.emplace_back(new ComputationOpHandle(op, scope, place));
+    CreateOpHandleIOs(result, op, place, dev_idx);
+    ++dev_idx;
+  }
+}
+// Adds a SendOpHandle for `op` to `result`, bound to the first place and the
+// first local scope.
+void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result,
+                                           const OpDesc &op) const {
+  // FIXME(wuyi): send op always copy from GPU 0
+  auto &first_place = places_[0];
+  auto *first_scope = local_scopes_[0];
+  result->ops_.emplace_back(new SendOpHandle(op, first_scope, first_place));
+  // Wire the handle's IO on device 0; per the original note, inputs are
+  // created for outputs on the original place and no SSA output is created
+  // for the send op.
+  CreateOpHandleIOs(result, op, first_place, 0);
+}
+// Returns true when `op` has exactly one output and that output is the
+// gradient variable of the loss.
+bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
+  // FIXME(yy): Do not hard code like this
+  const auto output_names = op.OutputArgumentNames();
+  if (output_names.size() != 1) {
+    return false;
+  }
+  return output_names[0] == GradVarName(loss_var_name_);
+}
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -34,12 +34,14 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
                          const std::string &loss_var_name,
                          const std::unordered_set<std::string> &params,
                          const std::vector<Scope *> &local_scopes,
+                          bool skip_scale_loss,
                          platform::NCCLContextMap *nccl_ctxs);
 #else
  MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
                          const std::string &loss_var_name,
                          const std::unordered_set<std::string> &params,
-                          const std::vector<Scope *> &local_scopes);
+                          const std::vector<Scope *> &local_scopes,
+                          bool skip_scale_loss);
 #endif
  std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
@@ -57,6 +59,24 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 #ifdef PADDLE_WITH_CUDA
  platform::NCCLContextMap *nccl_ctxs_;
 #endif
+  bool skip_scale_loss_;
+  bool IsScaleLossOp(const OpDesc &op) const;
+  void CreateSendOp(SSAGraph *result, const OpDesc &op) const;
+  bool IsDistTrainOp(const OpDesc &op, OpDesc *send_op) const;
+  void CreateComputationalOps(SSAGraph *result, const OpDesc &op,
+                              size_t num_places) const;
+  void CreateScaleLossGradOp(SSAGraph *result) const;
+  bool IsParameterGradientOnce(
+      const std::string &og,
+      std::unordered_set<std::string> *og_has_been_broadcast) const;
+  void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const;
 };
 }  // namespace details
 }  // namespace framework

--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -13,8 +13,8 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
 #include <algorithm>
+#include "paddle/fluid/framework/details/reduce_and_gather.h"
 namespace paddle {
 namespace framework {
@@ -29,32 +29,6 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
  }
 }
-struct ReduceLoDTensor {
-  const std::vector<LoDTensor> &src_tensors_;
-  LoDTensor &dst_tensor_;
-  ReduceLoDTensor(const std::vector<LoDTensor> &src, LoDTensor *dst)
-      : src_tensors_(src), dst_tensor_(*dst) {}
-  template <typename T>
-  void operator()() const {
-    PADDLE_ENFORCE(!src_tensors_.empty());
-    auto &t0 = src_tensors_[0];
-    PADDLE_ENFORCE_NE(t0.numel(), 0);
-    dst_tensor_.Resize(t0.dims());
-    T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
-    std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
-    for (size_t i = 1; i < src_tensors_.size(); ++i) {
-      auto &t = src_tensors_[i];
-      PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
-      PADDLE_ENFORCE_EQ(t.type(), t0.type());
-      std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
-                     [](T a, T b) -> T { return a + b; });
-    }
-  }
-};
 void NCCLAllReduceOpHandle::RunImpl() {
  if (inputs_.size() == 1) {
    return;  // No need to all reduce when GPU count = 1;
@@ -69,21 +43,21 @@ void NCCLAllReduceOpHandle::RunImpl() {
    int dtype = -1;
    size_t numel = 0;
-    std::vector<LoDTensor> lod_tensors;
+    std::vector<const LoDTensor *> lod_tensors;
    for (size_t i = 0; i < local_scopes_.size(); ++i) {
      auto *s = local_scopes_[i];
      auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
      auto &lod_tensor = local_scope.FindVar(var_name)->Get<LoDTensor>();
-      lod_tensors.emplace_back(lod_tensor);
+      lod_tensors.emplace_back(&lod_tensor);
    }
-    if (platform::is_gpu_place(lod_tensors[0].place())) {
+    if (platform::is_gpu_place(lod_tensors[0]->place())) {
      std::vector<std::function<void()>> all_reduce_calls;
      for (size_t i = 0; i < local_scopes_.size(); ++i) {
        auto &p = places_[i];
-        auto &lod_tensor = lod_tensors[i];
+        auto &lod_tensor = *lod_tensors[i];
        void *buffer = const_cast<void *>(lod_tensor.data<void>());
        if (dtype == -1) {
@@ -119,7 +93,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
      // Reduce All Tensor to trg in CPU
      ReduceLoDTensor func(lod_tensors, &trg);
-      VisitDataType(ToDataType(lod_tensors[0].type()), func);
+      VisitDataType(ToDataType(lod_tensors[0]->type()), func);
      for (size_t i = 0; i < local_scopes_.size(); ++i) {
        auto &scope =

--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -27,10 +27,6 @@ namespace framework {
 namespace details {
 struct NCCLAllReduceOpHandle : public OpHandleBase {
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
-  const platform::NCCLContextMap &nccl_ctxs_;
  NCCLAllReduceOpHandle(const std::vector<Scope *> &local_scopes,
                        const std::vector<platform::Place> &places,
                        const platform::NCCLContextMap &ctxs);
@@ -43,6 +39,11 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
 protected:
  void RunImpl() override;
+ private:
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+  const platform::NCCLContextMap &nccl_ctxs_;
 };
 }  // namespace details

--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -27,28 +27,15 @@ namespace details {
 constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
 class OpHandleBase {
- private:
-  DISABLE_COPY_AND_ASSIGN(OpHandleBase);
 public:
-  std::vector<VarHandleBase *> inputs_;
-  std::vector<VarHandleBase *> outputs_;
-  std::unordered_map<platform::Place, platform::DeviceContext *,
-                     platform::PlaceHash>
-      dev_ctxes_;
-#ifdef PADDLE_WITH_CUDA
-  std::unordered_map<int, cudaEvent_t> events_;
-#endif
  OpHandleBase() {}
+  virtual ~OpHandleBase();
  std::string DebugString() const;
  virtual std::string Name() const = 0;
-  virtual ~OpHandleBase();
  void Run(bool use_event);
  virtual void Wait(platform::DeviceContext *waited_dev);
@@ -61,6 +48,18 @@ class OpHandleBase {
  // will likely block other computations.
  virtual bool IsMultiDeviceTransfer() { return false; }
+  // Returns the device context registered for `place`, or nullptr when no
+  // context has been registered for it. Uses find() rather than operator[]
+  // so that a lookup never inserts a null entry into dev_ctxes_.
+  const platform::DeviceContext *DeviceContext(platform::Place place) {
+    auto it = dev_ctxes_.find(place);
+    return it != dev_ctxes_.end() ? it->second : nullptr;
+  }
+  void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {
+    dev_ctxes_[place] = ctx_;
+  }
+  const std::vector<VarHandleBase *> &Inputs() const { return inputs_; }
+  const std::vector<VarHandleBase *> &Outputs() const { return outputs_; }
 protected:
  void RunAndRecordEvent(const std::function<void()> &callback);
@@ -68,6 +67,18 @@ class OpHandleBase {
                         const std::function<void()> &callback);
  virtual void RunImpl() = 0;
+  std::vector<VarHandleBase *> inputs_;
+  std::vector<VarHandleBase *> outputs_;
+  std::unordered_map<platform::Place, platform::DeviceContext *,
+                     platform::PlaceHash>
+      dev_ctxes_;
+#ifdef PADDLE_WITH_CUDA
+  std::unordered_map<int, cudaEvent_t> events_;
+#endif
+  DISABLE_COPY_AND_ASSIGN(OpHandleBase);
 };
 }  // namespace details

--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@@ -14,6 +14,9 @@ limitations under the License. */
 #pragma once
+#include <string>
+#include <tuple>
+#include <vector>
 #include "paddle/fluid/framework/grad_op_desc_maker.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_proto_maker.h"

--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <algorithm>
+#include <map>
+#include <vector>
+#include "paddle/fluid/framework/details/reduce_and_gather.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/selected_rows.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Functor that sums a list of LoDTensors element-wise into `dst_tensor_`.
+// It is invoked with the element type as an explicit template argument
+// (operator()<T>()). The destination is resized to the shape of the first
+// source tensor and allocated on the CPU place.
+struct ReduceLoDTensor {
+  const std::vector<const LoDTensor *> &src_tensors_;
+  LoDTensor &dst_tensor_;
+  ReduceLoDTensor(const std::vector<const LoDTensor *> &src, LoDTensor *dst)
+      : src_tensors_(src), dst_tensor_(*dst) {}
+  template <typename T>
+  void operator()() const {
+    PADDLE_ENFORCE(!src_tensors_.empty());
+    auto &first = *src_tensors_[0];
+    PADDLE_ENFORCE_NE(first.numel(), 0);
+    // Seed the accumulator with a copy of the first tensor.
+    dst_tensor_.Resize(first.dims());
+    T *sum = dst_tensor_.mutable_data<T>(platform::CPUPlace());
+    std::copy(first.data<T>(), first.data<T>() + first.numel(), sum);
+    // Add every remaining tensor into the accumulator in place; each must
+    // match the first one in shape and element type.
+    size_t idx = 1;
+    while (idx < src_tensors_.size()) {
+      auto &cur = *src_tensors_[idx];
+      PADDLE_ENFORCE_EQ(cur.dims(), first.dims());
+      PADDLE_ENFORCE_EQ(cur.type(), first.type());
+      std::transform(cur.data<T>(), cur.data<T>() + cur.numel(), sum, sum,
+                     [](T a, T b) -> T { return a + b; });
+      ++idx;
+    }
+  }
+};
+// Concatenates several SelectedRows into `dst_selecte_rows`.
+//
+//   src_selecte_rows_  inputs to gather; must not be empty.
+//   in_places          place of each input, indexed like the inputs.
+//   dev_ctxes          device context per place; must contain every entry of
+//                      in_places (looked up with at()).
+//   out_place          place on which the output value is allocated.
+//   dst_selecte_rows   receives the gathered rows and values.
+//
+// The output takes its height, trailing dimensions and value type from the
+// first input; its row list is the inputs' row lists appended in order.
+inline void GatherSelectedRows(
+    const std::vector<const SelectedRows *> &src_selecte_rows_,
+    const std::vector<platform::Place> &in_places,
+    const std::unordered_map<platform::Place, platform::DeviceContext *,
+                             platform::PlaceHash> &dev_ctxes,
+    const platform::Place &out_place, SelectedRows *dst_selecte_rows) {
+  PADDLE_ENFORCE(!src_selecte_rows_.empty());
+  // Collect every input's value tensor and append its row indices.
+  std::vector<Tensor> src_values;
+  std::vector<int64_t> gathered_rows;
+  for (auto src_ptr : src_selecte_rows_) {
+    auto &src = *src_ptr;
+    src_values.emplace_back(src.value());
+    gathered_rows.insert(gathered_rows.end(), src.rows().begin(),
+                         src.rows().end());
+  }
+  // Shape the destination after the first input, with dimension 0 replaced
+  // by the total number of gathered rows.
+  auto &first_src = src_selecte_rows_[0];
+  auto &dst = *dst_selecte_rows;
+  dst.set_height(first_src->height());
+  dst.set_rows(gathered_rows);
+  DDim dst_dims = first_src->GetCompleteDims();
+  dst_dims[0] = static_cast<int64_t>(gathered_rows.size());
+  dst.mutable_value()->Resize(dst_dims);
+  dst.mutable_value()->mutable_data(out_place, first_src->value().type());
+  Tensor *dst_value = dst.mutable_value();
+  // Copy each input into its consecutive slice [begin, end) of the output,
+  // using the device context of the place the input lives on.
+  int begin = 0;
+  int end = 0;
+  for (size_t k = 0; k < src_values.size(); ++k) {
+    end += src_values[k].dims()[0];
+    auto dst_slice = dst_value->Slice(begin, end);
+    paddle::framework::TensorCopy(src_values[k], out_place,
+                                  *(dev_ctxes.at(in_places[k])), &dst_slice);
+    begin = end;
+  }
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <map>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/device_context.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+namespace paddle {
+namespace framework {
+namespace details {
+// Op handle declared for a reduce step across the local scopes/places.
+// NOTE(review): the reduction itself lives in RunImpl(), which is defined
+// elsewhere; only the wiring visible here is documented.
+struct ReduceOpHandle : public OpHandleBase {
+  // Both vectors are held by reference, so the containers passed to the
+  // constructor must outlive this handle.
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+#ifdef PADDLE_WITH_CUDA
+  // Optional NCCL contexts; may be null.
+  const platform::NCCLContextMap *nccl_ctxs_;
+  // When NCCL contexts are supplied, registers the device context of every
+  // NCCL context under the CUDAPlace of its device id.
+  ReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                 const std::vector<platform::Place> &places,
+                 const platform::NCCLContextMap *nccl_ctxs)
+      : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) {
+    if (nccl_ctxs_) {
+      for (auto &p_ctx : nccl_ctxs_->contexts_) {
+        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
+      }
+    }
+  }
+#else
+  ReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                 const std::vector<platform::Place> &places)
+      : local_scopes_(local_scopes), places_(places) {}
+#endif
+  std::string Name() const override;
+  // This handle does not report itself as a multi-device transfer.
+  bool IsMultiDeviceTransfer() override { return false; }
+ protected:
+  void RunImpl() override;
+  void WaitInputVarGenerated(const std::vector<VarHandle *> &in_var_handles);
+  // Declared here, defined elsewhere; presumably fetches the T-typed value
+  // of each input handle from the matching scope — confirm against the
+  // definition.
+  template <typename T>
+  std::vector<const T *> GetInputValues(
+      const std::vector<VarHandle *> &in_var_handles,
+      const std::vector<const Scope *> &var_scopes) const;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/reduce_op_handle_test.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
@@ -14,6 +14,8 @@
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
@@ -23,10 +25,6 @@ namespace framework {
 namespace details {
 struct ScaleLossGradOpHandle : public OpHandleBase {
-  float coeff_;
-  Scope *scope_;
-  platform::Place place_;
  ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place,
                        platform::DeviceContext *context);
@@ -36,6 +34,11 @@ struct ScaleLossGradOpHandle : public OpHandleBase {
 protected:
  void RunImpl() override;
+ private:
+  float coeff_;
+  Scope *scope_;
+  platform::Place place_;
 };
 }  // namespace details

--- a/paddle/fluid/framework/details/send_op_handle.cc
+++ b/paddle/fluid/framework/details/send_op_handle.cc
@@ -34,7 +34,10 @@ void SendOpHandle::RunImpl() {
    }
    in->generated_op_->Wait(dev_ctxes_[p]);
  }
-  this->RunAndRecordEvent([&] { op_->Run(*local_scope_, place_); });
+  auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  // FIXME(wuyi): can not use RunAndRecordEvent here, for it will cause dead
+  // lock.
+  op_->Run(*tmp_scope, place_);
 }
 std::string SendOpHandle::Name() const { return "send"; }

--- a/paddle/fluid/framework/details/send_op_handle.h
+++ b/paddle/fluid/framework/details/send_op_handle.h
@@ -28,10 +28,6 @@ namespace framework {
 namespace details {
 struct SendOpHandle : public OpHandleBase {
-  std::unique_ptr<OperatorBase> op_;
-  const Scope* local_scope_;
-  const platform::Place& place_;
  SendOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
               const platform::Place& place);
@@ -43,6 +39,11 @@ struct SendOpHandle : public OpHandleBase {
 protected:
  void RunImpl() override;
+ private:
+  std::unique_ptr<OperatorBase> op_;
+  const Scope* local_scope_;
+  const platform::Place& place_;
 };
 }  // namespace details

--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
--- a/paddle/fluid/framework/details/variable_visitor.cc
+++ b/paddle/fluid/framework/details/variable_visitor.cc
--- a/paddle/fluid/framework/details/variable_visitor.h
+++ b/paddle/fluid/framework/details/variable_visitor.h
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
--- a/paddle/fluid/framework/feed_fetch_method.h
+++ b/paddle/fluid/framework/feed_fetch_method.h
--- a/paddle/fluid/framework/feed_fetch_type.h
+++ b/paddle/fluid/framework/feed_fetch_type.h
--- a/paddle/fluid/framework/grad_op_desc_maker.h
+++ b/paddle/fluid/framework/grad_op_desc_maker.h
--- a/paddle/fluid/framework/init.cc
+++ b/paddle/fluid/framework/init.cc
--- a/paddle/fluid/framework/init.h
+++ b/paddle/fluid/framework/init.h
--- a/paddle/fluid/framework/library_type.h
+++ b/paddle/fluid/framework/library_type.h
--- a/paddle/fluid/framework/lod_rank_table.h
+++ b/paddle/fluid/framework/lod_rank_table.h
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
--- a/paddle/fluid/framework/op_kernel_type.h
+++ b/paddle/fluid/framework/op_kernel_type.h
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
--- a/paddle/fluid/framework/op_registry_test.cc
+++ b/paddle/fluid/framework/op_registry_test.cc
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
--- a/paddle/fluid/framework/program_desc.h
+++ b/paddle/fluid/framework/program_desc.h
--- a/paddle/fluid/framework/prune.cc
+++ b/paddle/fluid/framework/prune.cc
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
--- a/paddle/fluid/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_test.cc
--- a/paddle/fluid/framework/shape_inference.cc
+++ b/paddle/fluid/framework/shape_inference.cc
--- a/paddle/fluid/framework/shape_inference.h
+++ b/paddle/fluid/framework/shape_inference.h
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
--- a/paddle/fluid/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
--- a/paddle/fluid/framework/tensor_util_test.cu
+++ b/paddle/fluid/framework/tensor_util_test.cu
--- a/paddle/fluid/framework/threadpool_test.cc
+++ b/paddle/fluid/framework/threadpool_test.cc
--- a/paddle/fluid/framework/var_desc.h
+++ b/paddle/fluid/framework/var_desc.h
--- a/paddle/fluid/framework/var_type_inference_test.cc
+++ b/paddle/fluid/framework/var_type_inference_test.cc
--- a/paddle/fluid/framework/variable.h
+++ b/paddle/fluid/framework/variable.h
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
--- a/paddle/fluid/operators/bilinear_interp_op.cc
+++ b/paddle/fluid/operators/bilinear_interp_op.cc
--- a/paddle/fluid/operators/bilinear_interp_op.cu
+++ b/paddle/fluid/operators/bilinear_interp_op.cu
--- a/paddle/fluid/operators/bilinear_interp_op.h
+++ b/paddle/fluid/operators/bilinear_interp_op.h
--- a/paddle/fluid/operators/bilinear_tensor_product_op.cc
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc
--- a/paddle/fluid/operators/channel_recv_op.cc
+++ b/paddle/fluid/operators/channel_recv_op.cc
--- a/paddle/fluid/operators/clip_op.cc
+++ b/paddle/fluid/operators/clip_op.cc
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
--- a/paddle/fluid/operators/conditional_block_op.cc
+++ b/paddle/fluid/operators/conditional_block_op.cc
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
--- a/paddle/fluid/operators/conv_shift_op.cc
+++ b/paddle/fluid/operators/conv_shift_op.cc
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
--- a/paddle/fluid/operators/cos_sim_op.cc
+++ b/paddle/fluid/operators/cos_sim_op.cc
--- a/paddle/fluid/operators/crop_op.cc
+++ b/paddle/fluid/operators/crop_op.cc
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
--- a/paddle/fluid/operators/cumsum_op.cc
+++ b/paddle/fluid/operators/cumsum_op.cc
--- a/paddle/fluid/operators/cumsum_op.cu
+++ b/paddle/fluid/operators/cumsum_op.cu
--- a/paddle/fluid/operators/detail/grpc_client.h
+++ b/paddle/fluid/operators/detail/grpc_client.h
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
--- a/paddle/fluid/operators/detail/serde_test.cc
+++ b/paddle/fluid/operators/detail/serde_test.cc
--- a/paddle/fluid/operators/detail/variable_response.cc
+++ b/paddle/fluid/operators/detail/variable_response.cc
--- a/paddle/fluid/operators/detail/variable_response.h
+++ b/paddle/fluid/operators/detail/variable_response.h
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
--- a/paddle/fluid/operators/elementwise_div_op.cc
+++ b/paddle/fluid/operators/elementwise_div_op.cc
--- a/paddle/fluid/operators/elementwise_max_op.cc
+++ b/paddle/fluid/operators/elementwise_max_op.cc
--- a/paddle/fluid/operators/elementwise_min_op.cc
+++ b/paddle/fluid/operators/elementwise_min_op.cc
--- a/paddle/fluid/operators/elementwise_mul_op.cc
+++ b/paddle/fluid/operators/elementwise_mul_op.cc
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
--- a/paddle/fluid/operators/elementwise_sub_op.cc
+++ b/paddle/fluid/operators/elementwise_sub_op.cc
--- a/paddle/fluid/operators/expand_op.cc
+++ b/paddle/fluid/operators/expand_op.cc
--- a/paddle/fluid/operators/expand_op.h
+++ b/paddle/fluid/operators/expand_op.h
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
--- a/paddle/fluid/operators/gather_op.cu
+++ b/paddle/fluid/operators/gather_op.cu
--- a/paddle/fluid/operators/gather_op.h
+++ b/paddle/fluid/operators/gather_op.h
--- a/paddle/fluid/operators/gather_test.cc
+++ b/paddle/fluid/operators/gather_test.cc
--- a/paddle/fluid/operators/get_places_op.cc
+++ b/paddle/fluid/operators/get_places_op.cc
--- a/paddle/fluid/operators/go_op.cc
+++ b/paddle/fluid/operators/go_op.cc
--- a/paddle/fluid/operators/gru_op.cc
+++ b/paddle/fluid/operators/gru_op.cc
--- a/paddle/fluid/operators/gru_op.h
+++ b/paddle/fluid/operators/gru_op.h
--- a/paddle/fluid/operators/gru_unit_op.cc
+++ b/paddle/fluid/operators/gru_unit_op.cc
--- a/paddle/fluid/operators/hinge_loss_op.cc
+++ b/paddle/fluid/operators/hinge_loss_op.cc
--- a/paddle/fluid/operators/huber_loss_op.cc
+++ b/paddle/fluid/operators/huber_loss_op.cc
--- a/paddle/fluid/operators/im2sequence_op.cc
+++ b/paddle/fluid/operators/im2sequence_op.cc
--- a/paddle/fluid/operators/increment_op.cc
+++ b/paddle/fluid/operators/increment_op.cc
--- a/paddle/fluid/operators/increment_op.cu
+++ b/paddle/fluid/operators/increment_op.cu
--- a/paddle/fluid/operators/iou_similarity_op.cc
+++ b/paddle/fluid/operators/iou_similarity_op.cc
--- a/paddle/fluid/operators/iou_similarity_op.cu
+++ b/paddle/fluid/operators/iou_similarity_op.cu
--- a/paddle/fluid/operators/l1_norm_op.cc
+++ b/paddle/fluid/operators/l1_norm_op.cc
--- a/paddle/fluid/operators/label_smooth_op.cc
+++ b/paddle/fluid/operators/label_smooth_op.cc
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
--- a/paddle/fluid/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
--- a/paddle/fluid/operators/log_loss_op.cc
+++ b/paddle/fluid/operators/log_loss_op.cc
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
--- a/paddle/fluid/operators/lstm_op.cc
+++ b/paddle/fluid/operators/lstm_op.cc
--- a/paddle/fluid/operators/lstm_unit_op.cc
+++ b/paddle/fluid/operators/lstm_unit_op.cc
--- a/paddle/fluid/operators/lstmp_op.cc
+++ b/paddle/fluid/operators/lstmp_op.cc
--- a/paddle/fluid/operators/margin_rank_loss_op.cc
+++ b/paddle/fluid/operators/margin_rank_loss_op.cc
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
--- a/paddle/fluid/operators/maxout_op.cc
+++ b/paddle/fluid/operators/maxout_op.cc
--- a/paddle/fluid/operators/mkldnn_activation_op.h
+++ b/paddle/fluid/operators/mkldnn_activation_op.h
--- a/paddle/fluid/operators/modified_huber_loss_op.cc
+++ b/paddle/fluid/operators/modified_huber_loss_op.cc
--- a/paddle/fluid/operators/mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/mul_mkldnn_op.cc
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
--- a/paddle/fluid/operators/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/multiclass_nms_op.cc
--- a/paddle/fluid/operators/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl_op.cu.cc
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
--- a/paddle/fluid/operators/nce_op.h
+++ b/paddle/fluid/operators/nce_op.h
--- a/paddle/fluid/operators/norm_op.cc
+++ b/paddle/fluid/operators/norm_op.cc
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
--- a/paddle/fluid/operators/pool_with_index_op.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cc
--- a/paddle/fluid/operators/pool_with_index_op.cu.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cu.cc
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
--- a/paddle/fluid/operators/print_op.cc
+++ b/paddle/fluid/operators/print_op.cc
--- a/paddle/fluid/operators/rank_loss_op.cc
+++ b/paddle/fluid/operators/rank_loss_op.cc
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
--- a/paddle/fluid/operators/reader/create_threaded_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
--- a/paddle/fluid/operators/reduce_op.cc
+++ b/paddle/fluid/operators/reduce_op.cc
--- a/paddle/fluid/operators/reduce_op.h
+++ b/paddle/fluid/operators/reduce_op.h
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
--- a/paddle/fluid/operators/save_load_combine_op_test.cc
+++ b/paddle/fluid/operators/save_load_combine_op_test.cc
--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
--- a/paddle/fluid/operators/select_op.cc
+++ b/paddle/fluid/operators/select_op.cc
--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
--- a/paddle/fluid/operators/sequence_concat_op.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cc
--- a/paddle/fluid/operators/sequence_conv_op.cc
+++ b/paddle/fluid/operators/sequence_conv_op.cc
--- a/paddle/fluid/operators/sequence_conv_op.h
+++ b/paddle/fluid/operators/sequence_conv_op.h
--- a/paddle/fluid/operators/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_expand_op.cc
--- a/paddle/fluid/operators/sequence_slice_op.cc
+++ b/paddle/fluid/operators/sequence_slice_op.cc
--- a/paddle/fluid/operators/sequence_slice_op.cu
+++ b/paddle/fluid/operators/sequence_slice_op.cu
--- a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
--- a/paddle/fluid/operators/sequence_softmax_op.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cc
--- a/paddle/fluid/operators/sequence_softmax_op.cu.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cu.cc
--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
--- a/paddle/fluid/operators/smooth_l1_loss_op.cc
+++ b/paddle/fluid/operators/smooth_l1_loss_op.cc
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
--- a/paddle/fluid/operators/split_byref_op.cc
+++ b/paddle/fluid/operators/split_byref_op.cc
--- a/paddle/fluid/operators/split_byref_op.cu.cc
+++ b/paddle/fluid/operators/split_byref_op.cu.cc
--- a/paddle/fluid/operators/detail/simple_block_queue.h
+++ b/paddle/fluid/operators/detail/simple_block_queue.h
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
--- a/paddle/fluid/operators/split_op.h
+++ b/paddle/fluid/operators/split_op.h
--- a/paddle/fluid/operators/spp_op.cc
+++ b/paddle/fluid/operators/spp_op.cc
--- a/paddle/fluid/operators/squared_l2_distance_op.cc
+++ b/paddle/fluid/operators/squared_l2_distance_op.cc
--- a/paddle/fluid/operators/squared_l2_norm_op.cc
+++ b/paddle/fluid/operators/squared_l2_norm_op.cc
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
--- a/paddle/fluid/operators/unpool_op.cc
+++ b/paddle/fluid/operators/unpool_op.cc
--- a/paddle/fluid/operators/warpctc_op.cc
+++ b/paddle/fluid/operators/warpctc_op.cc
--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/fluid/recordio/README.md
+++ b/paddle/fluid/recordio/README.md
--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
--- a/paddle/gserver/layers/RecurrentLayerGroup.cpp
+++ b/paddle/gserver/layers/RecurrentLayerGroup.cpp
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
--- a/paddle/parameter/AverageOptimizer.cpp
+++ b/paddle/parameter/AverageOptimizer.cpp
--- a/paddle/parameter/FirstOrderOptimizer.cpp
+++ b/paddle/parameter/FirstOrderOptimizer.cpp
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
--- a/python/paddle/dataset/uci_housing.py
+++ b/python/paddle/dataset/uci_housing.py
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
--- a/python/paddle/fluid/evaluator.py
+++ b/python/paddle/fluid/evaluator.py
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
--- a/python/paddle/fluid/tests/demo/text_classification/.gitignore
+++ b/python/paddle/fluid/tests/demo/text_classification/.gitignore
--- a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py
+++ b/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py
--- a/python/paddle/fluid/tests/demo/text_classification/train.py
+++ b/python/paddle/fluid/tests/demo/text_classification/train.py
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
--- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
--- a/python/paddle/fluid/tests/unittests/test_mul_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
--- a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
--- a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
--- a/python/paddle/fluid/tests/unittests/test_operator_desc.py
+++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
--- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
--- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
--- a/python/paddle/fluid/tests/unittests/test_split_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_op.py
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
--- a/tools/aws_benchmarking/README.md
+++ b/tools/aws_benchmarking/README.md
--- a/tools/aws_benchmarking/client/cluster_launcher.py
+++ b/tools/aws_benchmarking/client/cluster_launcher.py
--- a/tools/aws_benchmarking/server/cluster_master.py
+++ b/tools/aws_benchmarking/server/cluster_master.py
--- a/tools/aws_benchmarking/server/pserver.sh.template
+++ b/tools/aws_benchmarking/server/pserver.sh.template
--- a/tools/aws_benchmarking/server/trainer.sh.template
+++ b/tools/aws_benchmarking/server/trainer.sh.template