Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_avg

d1a7b47e · wanghaoshuang · 764723d4 · f0af1398 · d1a7b47e · d1a7b47e
167 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -53,8 +53,7 @@ option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
 option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
-# TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option. 
+option(WITH_FLUID_ONLY  "Compile PaddlePaddle fluid only"               OFF)
-option(WITH_FLUID       "Compile PaddlePaddle fluid only(TODO)"         OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
@@ -109,7 +108,7 @@ if (WITH_C_API AND WITH_PYTHON)
 endif()
 if (WITH_C_API)
-  set(WITH_FLUID OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE)
+  set(WITH_FLUID_ONLY OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE)
 endif()
 if(MOBILE_INFERENCE)
@@ -147,6 +146,7 @@ include(external/cares)
 include(external/grpc)
 include(external/snappy)    # download snappy
 include(external/snappystream)
+include(external/threadpool)
 include(cudnn)              # set cudnn libraries, must before configure
 include(cupti)

--- a/cmake/external/threadpool.cmake
+++ b/cmake/external/threadpool.cmake
+INCLUDE(ExternalProject)  # Fetches the ThreadPool sources as an external project and wraps them in the simple_threadpool target.
+SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool)  # THIRD_PARTY_PATH is defined outside this file
+SET(THREADPOOL_INCLUDE_DIR ${THREADPOOL_SOURCE_DIR}/src/extern_threadpool)  # ExternalProject's default source layout is <PREFIX>/src/<name>
+INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR})  # directory-scoped: applies to every target defined after this include
+ExternalProject_Add(
+    extern_threadpool
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/progschj/ThreadPool.git"
+    GIT_TAG         9a42ec1329f259a5f4881a291db1dcb8f2ad9040  # pinned to a fixed commit
+    PREFIX          ${THREADPOOL_SOURCE_DIR}
+    UPDATE_COMMAND  ""  # the update/configure/build/install/test steps are all empty, so only the download step does work
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND     ""
+    INSTALL_COMMAND   ""
+    TEST_COMMAND      ""
+)
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")  # NOTE(review): presumably because add_dependencies() on INTERFACE libraries needs CMake >= 3.3 -- confirm
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/threadpool_dummy.c)  # generated in the binary dir, not the source tree
+    file(WRITE ${dummyfile} "const char *dummy_threadpool = \"${dummyfile}\";")
+    add_library(simple_threadpool STATIC ${dummyfile})  # one-symbol static library standing in for the INTERFACE target
+else()
+    add_library(simple_threadpool INTERFACE)  # no sources of its own
+endif()
+add_dependencies(simple_threadpool extern_threadpool)  # orders the fetch before simple_threadpool is built
+LIST(APPEND external_project_dependencies simple_threadpool)  # list is consumed elsewhere (not visible in this file)
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -587,6 +587,9 @@ function(grpc_library TARGET_NAME)
  get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE)
  get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
+  #FIXME(putcn): the following line is supposed to generate *.pb.h and cc, but
+  # somehow it didn't. line 602 to 604 is to patch this. Leaving this here
+  # for now to enable dist CI.
  protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
  set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
  set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
@@ -597,6 +600,9 @@ function(grpc_library TARGET_NAME)
          COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
          ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
          --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
+          COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+          ARGS --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
+          "${ABS_PROTO}"
          DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
  # FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it

--- a/doc/design/images/parallel_executor_overview.dot
+++ b/doc/design/images/parallel_executor_overview.dot
+digraph G {
+  subgraph cluster_init {
+    label="Initialization"
+    startup_program [label="startup", shape=box]
+    node_w_g0 [label="W\nGPU0"]
+    startup_program -> node_w_g0 [label="Initialize"]
+    node_w_g1 [label="W\nGPU1"]
+    node_w_g0 -> node_w_g1 [label="broadcast"]
+  }
+  subgraph cluster_train {
+    label="forward_backward"
+    subgraph cluster_gpu0 {
+      label="GPU0"
+      fc_0 [label="fc\nGPU0", shape=box]
+      hidden_0 [label="hidden\nGPU0"]
+      node_w_g0 -> fc_0
+      fc_0 -> hidden_0
+      loss0 [label="loss\nGPU0"]
+      hidden_0 -> loss0 [label="many ops omitted"]
+      scale_loss_0 [label="scale_loss_gradient\nGPU0", shape=box]
+      loss_g0 [label="loss_grad\nGPU0"]
+      scale_loss_0->loss_g0
+      fc_g_0 [label="w_grad\nGPU0", shape=box]
+      loss0 -> fc_g_0
+      loss_g0 -> fc_g_0
+      hidden_0 -> fc_g_0
+    }
+    subgraph cluster_gpu1 {
+      label="GPU1"
+      fc_1 [label="fc\nGPU1", shape=box]
+      hidden_1 [label="hidden\nGPU1"]
+      node_w_g1 -> fc_1
+      fc_1 -> hidden_1
+      loss1 [label="loss\nGPU1"]
+      hidden_1 -> loss1 [label="many ops omitted"]
+      scale_loss_1 [label="scale_loss_gradient\nGPU1", shape=box]
+      loss_g1 [label="loss_grad\nGPU1"]
+      scale_loss_1->loss_g1
+      fc_g_1 [label="w_grad\nGPU1", shape=box]
+      loss1 -> fc_g_1
+      loss_g1 -> fc_g_1
+      hidden_1 -> fc_g_1
+    }
+  }
+  all_reduce_w [label="Merge Gradients(AllReduce)", shape=box]
+  fc_g_0 -> all_reduce_w
+  fc_g_1 -> all_reduce_w
+  fc_g_0_merged [label="w_grad\nMerged\nGPU0"]
+  fc_g_1_merged [label="w_grad\nMerged\nGPU1"]
+  all_reduce_w -> fc_g_0_merged
+  all_reduce_w -> fc_g_1_merged
+  subgraph cluster_optimization {
+    label="Optimization"
+    subgraph cluster_opt_gpu0 {
+      label="GPU0"
+      sgd_0 [label="SGD Op\nGPU0", shape=box]
+      fc_g_0_merged -> sgd_0
+      node_w_g0 -> sgd_0
+      optimized_w_0 [label="Optimized W\nGPU0"]
+      sgd_0 -> optimized_w_0
+    }
+    subgraph cluster_opt_gpu1 {
+      // Optimization step on GPU1: the SGD op consumes the merged gradient
+      // and GPU1's copy of W, and produces the optimized W on GPU1.
+      label="GPU1"
+      sgd_1 [label="SGD Op\nGPU1", shape=box]
+      fc_g_1_merged -> sgd_1
+      node_w_g1 -> sgd_1
+      // Label fixed: this node lives in the GPU1 cluster (was "GPU0").
+      optimized_w_1 [label="Optimized W\nGPU1"]
+      sgd_1 -> optimized_w_1
+    }
+  }
+}
--- a/doc/design/images/parallel_executor_overview.png
+++ b/doc/design/images/parallel_executor_overview.png
--- a/doc/design/parallel_executor.md
+++ b/doc/design/parallel_executor.md
+# ParallelExecutor
+## Background
+Neural network models are defined as a `ProgramDesc` in Fluid. The `ProgramDesc` can be executed by an interpreter(i.e. the `executor` concept in Fluid). The instructions or operators in a `Program` will be executed, and the results will be fetched in Python side.
+The executor is a very naive interpreter. It runs operators one by one. We can use `Parallel.Do` to support data parallelism; however, because `ProgramDesc` lacks device information, it is not possible to optimize the performance of `Parallel.Do`.
+We want a `ProgramDesc` that can be run on different nodes, so it is better not to include device information in `ProgramDesc`. However, we can write a high-performance interpreter, which holds an alternative intermediate representation of `ProgramDesc`, to make full use of multiple GPUs.
+ParallelExecutor is an interpreter of `ProgramDesc` which will [out-of-order execute](https://en.wikipedia.org/wiki/Out-of-order_execution) `Program` in data parallelism mode and maximise the utility of Multi-GPUs.
+## Overview of MultiGPUs logic
+The ParallelExecutor takes the startup program and main program as inputs. The parameters will be initialised on `GPU0` by the startup program and then broadcast to the other GPUs. The main program will be duplicated onto each GPU. The gradients will be merged during each iteration, and each device will optimize parameters independently. Since the gradients on each device are merged before parameter optimization, the parameters will be the same on each device and there is no need to broadcast them again.
+![alt](images/parallel_executor_overview.png)
+There are several optimizations for this logic.
+1. We use an alternate representation in ParallelExecutor, because the device information is critical for performance optimization.
+2. The execution is out-of-order, i.e., an operator will be executed whenever the inputs of the operator are ready. 
+   * GPU is a high-performance device; a single CPU thread cannot keep one GPU fully busy. So there is a thread pool to execute operators.
+   * Out-of-order execution also helps transpilers to generate `ProgramDesc`. There is no need to worry about the best-performing operator order when implementing a transpiler.
+3. The streams of computation, merge gradients and fetch data are different.
+The performance of `ResNeXt152` on `TitanX` with `batch_size=12` is shown below.
+| Number of GPUs | 1 | 2 | 3 | 4|
+| --- | --- | --- | --- | --- |
+| Image/Sec | 17.9906 | 25.771 | 36.911 | 48.8428 |
+| Speed Up | N/A | 1.43247029 | 2.05168255 | 2.71490667 |
+## Static single assignment Graph
+[Static single assignment form](https://en.wikipedia.org/wiki/Static_single_assignment_form)(`SSA` for short) is a common form for compiler optimization. To implement concurrent execution, we use an `SSA` graph as an intermediate representation of `ProgramDesc`.
+The `Program` is a directed acyclic graph, since a variable can be assigned multiple times. We enforce that a variable is assigned only once by adding a version number to variables. We parse the `Program` into an `SSA` graph. Also, ProgramExecutor duplicates the `Program` onto multiple devices. We also add a device number to variables and insert `NCCLAllReduce` into the graph.
+The data structure of `SSA` graph is:
+```c++
+struct VarHandleBase {
+  OpHandleBase* generated_op_;
+  vector<OpHandleBase*> pending_ops_;
+  string name;
+  Place place;
+  size_t version;
+};
+struct OpHandleBase {
+  vector<OpHandleBase*> inputs_;
+  vector<OpHandleBase*> outputs_;
+};
+struct SSAGraph {
+  // vars on each devices. 
+  //   * the vars in each map in vector is on different device.
+  //   * the map is mapping a variable name to variable handles
+  //   with different versions
+  vector<std::unordered_map<string, vector<VarHandleBase>>> vars_;
+  // All ops
+  vector<OpHandleBase> ops_;
+};
+```
+The variable handles are wrappers of `Variables`. The operator handles are wrappers of `OperatorBase`. Some `OpHandle`s are not an `OperatorBase`, such as `NCCLAllReduceOpHandle`, because `AllReduceOpHandle` will use new device contexts.
+When the `ProgramDesc` is converted into an `SSA` graph, the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem also needs to be taken care of. Dummy variables, which represent the dependencies between operators, are manually inserted into the SSA graph to resolve the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem.
+## Execute SSA Graph
+The SSA graph can be out-of-order executed by an approximate [topological sorting](https://en.wikipedia.org/wiki/Topological_sorting) algorithm. The algorithm is
+1. Maintain a map from each operator to the number of inputs it still needs.
+2. If a variable is not generated by an operator, i.e., `var.generated_op == nullptr`, decrease the needed input number of its pending operators.
+3. If there is an operator whose needed input number has decreased to zero, run this operator.
+4. After running this operator, mark the variables it produces as generated and repeat step 2 until all variables are generated.
+Running an operator can be asynchronous. There is a thread pool to execute an `SSA` graph.
+## Synchronize GPU Kernels
+The GPU is a non-blocking device. Different streams need to be synchronized when switching streams. In the current implementation, the synchronization is based on the following algorithm:
+1. `OpHandle` records the `DeviceContext` that it uses.
+2. In `OpHandle::Run`, if the `DeviceContext` of the current operator is different from the `DeviceContext` of any input variable, wait for the operator that generates this input variable.
+The `wait` is implemented by two strategies:
+1. Invoke `DeviceContext->Wait()`. It waits until all operators on this device context complete.
+2. Use `cudaStreamWaitEvent` to send an event to the stream. It is a non-blocking call. The wait operations will be executed on the GPU.
+Generally, `cudaStreamWaitEvent` gives better performance. However, the `DeviceContext->Wait()` strategy is easier to debug. The strategy can be changed at runtime.
+## What's next?
+* Merging gradient of dense parameters has been done. However, the merging of sparse parameters has not been done.
+* The CPU version of Parallel Executor has not been implemented. The out-of-order logic will make CPU computation faster, too.
+* A better strategy to merge gradients can be introduced. We can shrink the gradients from `float32` to `int8` or `int4` while merging. It will significantly speed up multi-GPUs training without much loss of precision.
+* Combine multi-Nodes implementation. By the benefit of out-of-order execution, sending and receiving operators can be blocking operators, and the transpiler does not need to concern itself with the best position of these operators.
--- a/doc/fluid/build_and_install/build_from_source_cn.rst
+++ b/doc/fluid/build_and_install/build_from_source_cn.rst
+../../v2/build_and_install/build_from_source_cn.rst
\ No newline at end of file
--- a/doc/fluid/build_and_install/build_from_source_en.rst
+++ b/doc/fluid/build_and_install/build_from_source_en.rst
+../../v2/build_and_install/build_from_source_en.rst
\ No newline at end of file
--- a/doc/fluid/build_and_install/docker_install_cn.rst
+++ b/doc/fluid/build_and_install/docker_install_cn.rst
+../../v2/build_and_install/docker_install_cn.rst
\ No newline at end of file
--- a/doc/fluid/build_and_install/docker_install_en.rst
+++ b/doc/fluid/build_and_install/docker_install_en.rst
+../../v2/build_and_install/docker_install_en.rst
\ No newline at end of file
--- a/doc/fluid/build_and_install/index_cn.rst
+++ b/doc/fluid/build_and_install/index_cn.rst
-安装与使用
------------
--- a/doc/fluid/build_and_install/index_cn.rst
+++ b/doc/fluid/build_and_install/index_cn.rst
+../../v2/build_and_install/index_cn.rst
\ No newline at end of file
--- a/doc/fluid/build_and_install/index_en.rst
+++ b/doc/fluid/build_and_install/index_en.rst
-Build and Install
------------
--- a/doc/fluid/build_and_install/index_en.rst
+++ b/doc/fluid/build_and_install/index_en.rst
+../../v2/build_and_install/index_en.rst
\ No newline at end of file
--- a/doc/fluid/build_and_install/pip_install_cn.rst
+++ b/doc/fluid/build_and_install/pip_install_cn.rst
+../../v2/build_and_install/pip_install_cn.rst
\ No newline at end of file
--- a/doc/fluid/build_and_install/pip_install_en.rst
+++ b/doc/fluid/build_and_install/pip_install_en.rst
+../../v2/build_and_install/pip_install_en.rst
\ No newline at end of file
--- a/doc/fluid/design/algorithm/index_cn.rst
+++ b/doc/fluid/design/algorithm/index_cn.rst
+梯度更新算法
+------------
+.. toctree::
+  :maxdepth: 1
+  parameter_average.md
--- a/doc/fluid/design/algorithm/index_en.rst
+++ b/doc/fluid/design/algorithm/index_en.rst
+Gradient Update Algorithm
+--------------------------------------
+.. toctree::
+  :maxdepth: 1
+  parameter_average.md
--- a/doc/fluid/design/concepts/README.md
+++ b/doc/fluid/design/concepts/README.md
@@ -2,7 +2,7 @@ A few months ago when we were trying to replace CMake with Bazel, @emailweixu su
 Here are some initial thoughts. Your comments are welcome!
-### Required CMake Function
+# Required CMake Function
 I think we need only the following few CMake functions to make a project description mean and clean:
@@ -25,7 +25,7 @@ Also,
 - to describe external dependencies, we need `external_library`.
 - to build shared libraries, we need `shared_library`.
-### An Example Project
+## An Example Project
 Suppose that we have aforementioned functions defined in our `/cmake` directory.  The following example `CMakeLists.txt` describes a project including the following source files:
@@ -102,11 +102,11 @@ shared_library(api
 ```
-### Implementation
+## Implementation
 As above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph.  It also use this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`.
-### Using Package Manager For Go
+## Using Package Manager For Go
 Building Go binaries and libraries need to satisfy their dependencies, generally
 we can do `go get ./...` to download and compile all external dependencies. The
@@ -122,7 +122,7 @@ problems are:
   at many cloud file hosting, so users what to compile paddle by themselves can
   download this "vendor" package from a mirror site.
-#### Choose A Suitable Tool
+### Choose A Suitable Tool
 As mentioned by @wangkuiyi, [Here](https://github.com/golang/go/wiki/PackageManagementTools)
 list dozens of Go package managers. We choose the tool using following principles:
@@ -140,7 +140,7 @@ management tool has been started at: https://github.com/golang/dep to resolve
 such problems, but it's currently at Alpha stage. So the best choice now is
 glide obviously.
-#### Manage Go Packages
+### Manage Go Packages
 - Dependencies: `go/glide.yaml` will store the dependencies and their versions which
  is directly imported by paddle. `go/glide.lock` will store all dependencies recursively

--- a/doc/fluid/design/concepts/cpp_data_feeding.md
+++ b/doc/fluid/design/concepts/cpp_data_feeding.md
@@ -113,7 +113,7 @@ To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an e
 To create and invoke readers, some new ops are introduced:
-### CreateReaderOp
+### Operators That Create Readers
 Each reader has its creation op. File readers' creation ops have no input and yield the created file reader as its output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers.
@@ -153,19 +153,52 @@ double_buffer_reader = create_double_buffer_op(batch_reader)
 The forwarding ops of the corresponding `main_program` would be like this:
 ```
-while_op {
+not_completed = true
+pass_count = 0
+while_op(not_completed) {
    has_next = has_next_op(double_buffer_reader)
    if_else_op(has_next) {
        batch_data = read_op(double_buffer_reader)
        ... (subsequent training ops)
    } else {
        reset_op(double_buffer_reader)
+        increase_op(pass_count)
+        not_completed = less_than_op(pass_count, required_pass_num)
    }
 }
 ```
-Two important considerations for these programs are as follows:
+A few important considerations for these programs are as follows:
-1. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader.
+1. `not_completed`, `pass_count` and other variables shown above are all Fluid Variables.
-2. All readers exist in both `startup_program` and `main_program`. And they are persistable.
+2. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader.
+3. All readers exist in both `startup_program` and `main_program`. And they are persistable.
+### Simplify Configuration by MultiPassReader
+The Program configuration mentioned above is complicated. Users need to be very familiar with the concepts of Program and Block to avoid making mistakes in their code. To make the usage of C++ readers more friendly to new users, we introduce `MultiPassReader`.
+`MultiPassReader` is a decorated reader. A multi-pass reader is used to continuously yield data for several training passes. It takes the number of passes to run as one of its attributes (`pass_num`) and maintains a counter to record how many passes it has completed. Each time its underlying reader reaches the EOF, the multi-pass reader checks whether it has completed the given number of passes. If not, the underlying reader will be re-initialized and start a new pass automatically. Before the whole training completes, MultiPassReader's `HasNext()` will always return `true`.
+With `MultiPassReader`, the startup program would be like this:
+```
+multiple_reader = open_files_op(...)
+batch_reader = create_batch_reader_op(multiple_reader)
+multi_pass_reader = create_multi_pass_reader_op(batch_reader)
+double_buffer_reader = create_double_buffer_op(multi_pass_reader)
+... (other initializers)
+```
+The forwarding part of the corresponding `main_program` would be like this:
+```
+not_completed = true
+while_op(not_completed) {
+    batch_data = read_op(double_buffer_reader)
+    ... (subsequent training ops)
+    not_completed = has_next_op(double_buffer_reader)
+}
+```
--- a/doc/fluid/design/concepts/index_cn.rst
+++ b/doc/fluid/design/concepts/index_cn.rst
+核心概念
+-------------
+.. toctree::
+  :maxdepth: 1
+  README.md
+  cpp_data_feeding.md
+  functions_operators_layers.md
+  program.md
+  variable.md
+  var_desc.md
+  tensor.md
+  tensor_array.md
+  lod_tensor.md
+  block.md
+  scope.md
+  executor.md
--- a/doc/fluid/design/concepts/index_en.rst
+++ b/doc/fluid/design/concepts/index_en.rst
+Core Concepts
+--------------------------------------
+.. toctree::
+  :maxdepth: 1
+  README.md
+  cpp_data_feeding.md
+  functions_operators_layers.md
+  program.md
+  variable.md
+  var_desc.md
+  tensor.md
+  tensor_array.md
+  lod_tensor.md
+  block.md
+  scope.md
+  executor.md
--- a/doc/fluid/design/concepts/scope.md
+++ b/doc/fluid/design/concepts/scope.md
@@ -30,7 +30,7 @@ Scope is an association of a name to variable. All variables belong to `Scope`.
   Variable can not belong to many scopes. If you want to use variables from parent scope, you can use `parent scope`.
-1. Scope should destruct all Variables inside it when itself is destructed. User can never store `Variable` pointer somewhere else. 
+1. Scope should destruct all Variables inside it when itself is destructed. User can never store `Variable` pointer somewhere else.
   Because Variable can only be got from Scope. When destroying Scope, we also need to destroy all the Variables in it. If user store `Variable` pointer to private data member or some global variable, the pointer will be an invalid pointer when associated `Scope` is destroyed.
@@ -78,7 +78,7 @@ In `Scope` class, there is a private data member called `parent_`. `parent_` is
 A local scope is very useful when we implement Recurrent Neural Network. Each timestep of an RNN should be a `Net`. Each `Net` of timestep (`StepNet` for short) should use an independent local scope. Just like variables in a while loop is inside a local scope in programming languages. By using a single `StepNet` and changing local scope, we can implement an RNN easily.
-# Interface Design
+## Interface Design
 ```cpp
 class Variable {

--- a/doc/fluid/design/concepts/var_desc.md
+++ b/doc/fluid/design/concepts/var_desc.md
+# Design Doc: Var_desc
 ## Background
 PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations.

--- a/doc/fluid/design/concurrent/channel.md
+++ b/doc/fluid/design/concurrent/channel.md
+# Channel Design
+## Introduction
+A Channel is a data structure that allows for synchronous interprocess 
+communication via message passing.  It is a fundamental component of CSP
+(communicating sequential processes), and allows for users to pass data
+between threads without having to worry about synchronization.
+## How to use it
+Paddle offers python APIs to open and close channels, along with sending
+and receiving data to/from a channel.
+### Create a channel
+Creates a new channel that takes in variables of a specific dtype.
+- **fluid.make_channel(dtype, capacity=0)**
+  - **dtype**: The data type of variables being sent/received through channel
+  - **capacity**: The capacity of the channel.  A capacity of 0 represents 
+    an unbuffered channel.  Capacity > 0 represents a buffered channel
+```
+ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=10)
+```
+### Close a channel
+Closes a channel.  Any pending senders and receivers will be awoken during
+this time.  Receivers can still receive from a closed channel, but senders
+are not allowed to send any additional data to the channel (Paddle will
+raise an exception if users try to send to a closed channel.)
+- **fluid.channel_close(channel)**
+```
+fluid.channel_close(ch)
+```
+### Send data to a channel
+Sends a variable to a channel.  Currently, variables of dtype `LoDTensor`, 
+`LoDRankTable`, `LoDTensorArray`, `SelectedRows`, `ReaderHolder`, and 
+`ChannelHolder` are supported.
+By default, the data of the Variable is moved from the sender to the receiver,
+however the user can optionally copy the data before performing the send.
+- **channel_send(channel, variable, is_copy=False)**
+  - **channel**: The channel to send the variable to
+  - **variable**: The variable to send to the channel
+  - **is_copy**: If set to True, channel_send will perform a variable assign
+  to copy the source variable to a new variable to be sent.
+```
+ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=100)
+fluid.channel_send(ch, var, True)
+```
+### Receive data from a channel
+Receives a variable from a channel.  The data of the variable is moved to the
+receiving variable.
+- **channel_recv(channel, return_variable)**
+  - **channel**: The channel to receive the variable from
+  - **return_variable**: The destination variable used to store the data of the
+  variable received from the channel
+```
+ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=-1)
+fluid.channel_recv(ch, var)
+```
+## How it Works
+Channels provide a simple interface for different threads to share data.
+To support the synchronization requirements, channels utilize a series of
+internal queues, locks, and condition variables.
+### QueueMessage
+QueueMessage encapsulates the state of the channel send/receive operation to be
+put in the **sendq/recvq**.  It contains a condition variable used to lock the 
+thread (when there are no available sends/receives).  In addition, it contains
+a callback function to notify a thread when the QueueMessage is being 
+processed by the channel.
+### Queues
+- **buff_**: This queue holds the data buffer in a buffered channel.  The
+capacity is set to the capacity of the channel.  This data buffer is not
+used in an unbuffered channel.
+- **sendq**: This queue holds the QueueMessage of any pending senders of a
+channel.  When a thread performs a channel_send operation on the channel, the
+channel_send operation will put a new QueueMessage on the sendq and block the
+current thread under two conditions:
+  1. The channel is buffered and is full
+  2. The channel is unbuffered and does not have a receiver
+- **recvq**:  This queue holds the QueueMessage of any pending receivers of a
+channel.  When a thread performs a channel_recv operation on the channel, the
+channel_recv operation will put a new QueueMessage on the recvq and block the
+current thread under two conditions:
+  1. The channel is buffered and there is no data on the buff_
+  2. The channel is unbuffered and does not have a sender
+### State diagram
+#### Channel Send
+<p align="center">
+<img src="./images/channel_send.png"/><br/>
+</p>
+#### Channel Receive
+<p align="center">
+<img src="./images/channel_recv.png"/><br/>
+</p>
+## Limitations and Considerations
+### Variable Copy
+In golang, variables in channels are copied from the sender to the receiver.
+In Paddle, the data from our variables are **moved** from sender to receiver.
+As a result, these variables should not be used after they are sent.  We
+provide a flag in channel_send method to allow users to copy the variable to
+be sent before it is sent.  
+Please note that this is achieved by adding an **assign** operator and creating
+a temporary variable that is sent in place of the original variable.  Please
+note that the **assign** operator has limited support for only certain variable
+datatypes.
--- a/doc/fluid/design/concurrent/images/channel_recv.png
+++ b/doc/fluid/design/concurrent/images/channel_recv.png
--- a/doc/fluid/design/concurrent/images/channel_send.png
+++ b/doc/fluid/design/concurrent/images/channel_send.png
--- a/doc/fluid/design/concurrent/index_cn.rst
+++ b/doc/fluid/design/concurrent/index_cn.rst
+并发编程
+------------
+.. toctree::
+  :maxdepth: 1
+  concurrent_programming.md
+  parallel_do.md
--- a/doc/fluid/design/concurrent/index_en.rst
+++ b/doc/fluid/design/concurrent/index_en.rst
+Concurrent Programming
+-------------------------
+.. toctree::
+  :maxdepth: 1
+  concurrent_programming.md
+  parallel_do.md
--- a/doc/fluid/design/data_type/index_cn.rst
+++ b/doc/fluid/design/data_type/index_cn.rst
+数据类型
+------------
+.. toctree::
+  :maxdepth: 1
+  float16.md
--- a/doc/fluid/design/data_type/index_en.rst
+++ b/doc/fluid/design/data_type/index_en.rst
+Data Type
+------------
+.. toctree::
+  :maxdepth: 1
+  float16.md
--- a/doc/fluid/design/dist_train/distributed_lookup_table_design.md
+++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
-## Design Doc: Distributed Lookup Table Operator
+# Design Doc: Distributed Lookup Table Operator
 A lookup table operator in PaddlePaddle where the table could be out
 of the memory of a computer.

--- a/doc/fluid/design/dist_train/index_cn.rst
+++ b/doc/fluid/design/dist_train/index_cn.rst
+分布式训练
+------------
+.. toctree::
+  :maxdepth: 1
+  distributed_architecture.md
+  distributed_lookup_table_design.md
+  parameter_server.md
--- a/doc/fluid/design/dist_train/index_en.rst
+++ b/doc/fluid/design/dist_train/index_en.rst
+Distributed Training
+---------------------
+.. toctree::
+  :maxdepth: 1
+  distributed_architecture.md
+  distributed_lookup_table_design.md
+  parameter_server.md
--- a/doc/fluid/design/dynamic_rnn/index_cn.rst
+++ b/doc/fluid/design/dynamic_rnn/index_cn.rst
+动态RNN
+------------
+.. toctree::
+  :maxdepth: 1
+  rnn.md
+  rnn_design.md
--- a/doc/fluid/design/dynamic_rnn/index_en.rst
+++ b/doc/fluid/design/dynamic_rnn/index_en.rst
+Dynamic RNN
+------------
+.. toctree::
+  :maxdepth: 1
+  rnn.md
+  rnn_design.md
--- a/doc/fluid/design/dynamic_rnn/rnn_design.md
+++ b/doc/fluid/design/dynamic_rnn/rnn_design.md
@@ -99,7 +99,7 @@ private:
    - 由于传递过程是以复制`shared_ptr`的方式实现，因此框架只需要传递一次 `lod_start_pos`
 2. 对于不感知 `lod_start_pos` 的Op足够透明
-3. 需要修改 `lod_start_pos` 的producer Op可以在 `Run` 时更新自己的 `lod_start_pos` 数据 
+3. 需要修改 `lod_start_pos` 的producer Op可以在 `Run` 时更新自己的 `lod_start_pos` 数据
 具体的设计分为以下3小节
@@ -189,7 +189,7 @@ struct SortedSeqItem {
 std::vector<SortedSeqItem> sorted_seqs;
 ```
-来追踪序列排序后的位置，并添加一个新的接口 
+来追踪序列排序后的位置，并添加一个新的接口
 ```c++
 std::vector<SortedSeqItem> SortBySeqLen(const LODTensor& tensor);
@@ -233,7 +233,10 @@ x    x
 - 将每个序列concat 为规则的mini-batch表示
 ## 参考文献
-1. [Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing)
+[Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing)
-2. [mxnet Bucketing](http://mxnet.io/how_to/bucketing.html)
-3. [variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5)
+[mxnet Bucketing](http://mxnet.io/how_to/bucketing.html)
-4. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
+[variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5)
+[Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
--- a/doc/fluid/design/execution/index_cn.rst
+++ b/doc/fluid/design/execution/index_cn.rst
+执行流程
+-------------
+.. toctree::
+  :maxdepth: 1
+  switch.md
+  if_else_op.md
--- a/doc/fluid/design/execution/index_en.rst
+++ b/doc/fluid/design/execution/index_en.rst
+Execution Process
+--------------------------------------
+.. toctree::
+  :maxdepth: 1
+  switch.md
+  if_else_op.md
--- a/doc/fluid/design/execution/switch.md
+++ b/doc/fluid/design/execution/switch.md
-### Design Doc: Switch
+# Design Doc: Switch
-### Background
+## Background
 Many programming languages provide `switch` as a generalization of `if-elif-else`.  We want to add it to Fluid.
@@ -19,7 +19,7 @@ with switch() as switch:
        fluid.print("Case 3")
 ```
-### The Semantics
+## The Semantics
 1. A `switch` control-flow checks cases one-by-one.
 1. The condition of each case is a boolean value, which is a scalar; this differs from the `fluid.if_else` control-flow, whose condition could be a vector of boolean values.

--- a/doc/fluid/design/index_cn.rst
+++ b/doc/fluid/design/index_cn.rst
 设计思想
 ------------
+.. toctree::
+  :maxdepth: 1
+  motivation/index_cn.rst
+  execution/index_cn.rst
+  concepts/index_cn.rst
+  data_type/index_cn.rst
+  memory/index_cn.rst
+  muti_devices/index_cn.rst
+  dynamic_rnn/index_cn.rst
+  concurrent/index_cn.rst
+  algorithm/index_cn.rst
+  network/index_cn.rst
+  modules/index_cn.rst
+  interface/index_cn.rst
+  dist_train/index_cn.rst
--- a/doc/fluid/design/index_en.rst
+++ b/doc/fluid/design/index_en.rst
 Design
 ------------
+.. toctree::
+  :maxdepth: 1
+  motivation/index_en.rst
+  execution/index_en.rst
+  concepts/index_en.rst
+  data_type/index_en.rst
+  memory/index_en.rst
+  muti_devices/index_en.rst
+  dynamic_rnn/index_en.rst
+  concurrent/index_en.rst
+  algorithm/index_en.rst
+  network/index_en.rst
+  modules/index_en.rst
+  interface/index_en.rst
+  dist_train/index_en.rst
--- a/doc/fluid/design/interface/index_cn.rst
+++ b/doc/fluid/design/interface/index_cn.rst
+多语言接口
+------------
+TBD
--- a/doc/fluid/design/interface/index_en.rst
+++ b/doc/fluid/design/interface/index_en.rst
+Multi-Language Interface
+------------------------
+TBD
--- a/doc/fluid/design/memory/index_cn.rst
+++ b/doc/fluid/design/memory/index_cn.rst
+内存管理
+------------
+.. toctree::
+  :maxdepth: 1
+  memory_optimization.md
--- a/doc/fluid/design/memory/index_en.rst
+++ b/doc/fluid/design/memory/index_en.rst
+Memory Management
+-------------------
+.. toctree::
+  :maxdepth: 1
+  memory_optimization.md
--- a/doc/fluid/design/modules/evaluator.md
+++ b/doc/fluid/design/modules/evaluator.md
-## Evaluator Design
+# Evaluator Design
-### Problem Statement
+## Problem Statement
 During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics for each N pass/batch the user wants.
-### Evaluator Design
+## Evaluator Design
 Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
 1. Initialize the metric state and add it into the block.
@@ -14,11 +14,11 @@ Currently, every operation is expressed in the graph. We divide the evaluator pr
 3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices.
-### Implementation
+## Implementation
-This design is shown in the Python API. 
+This design is shown in the Python API.
-Each metric operator needs to caculate the metric statistic and return the batch-aware states. Python side is responsible for accumulating the states for each pass. 
+Each metric operator needs to calculate the metric statistic and return the batch-aware states. The Python side is responsible for accumulating the states for each pass.
 ```python
 class Evaluator(object):
    """
@@ -32,7 +32,7 @@ class Evaluator(object):
       The initialization of Evaluator should be responsible for:
       create metric states and append to the main_program
-       """ 
+       """
       pass
    def _update_ops(self, input, label, **kwargs)
@@ -40,14 +40,14 @@ class Evaluator(object):
       Add mini-batch evaluator calculation operators to the main_program.
       Add increment operator to accumulate the metric states.
       """
    def reset(self, executor, reset_program=None):
      """
      Reset metric states at the beginning of each pass/user specified batch number.
      Execute the reset_program to reset the states.
      """
    def eval(self, executor, eval_program=None):
      """

--- a/doc/fluid/design/modules/index_cn.rst
+++ b/doc/fluid/design/modules/index_cn.rst
+代码结构和重要模块
+-----------------
+.. toctree::
+  :maxdepth: 1
+  backward.md
+  python_api.md
+  regularization.md
+  infer_var_type.md
+  optimizer.md
+  prune.md
+  register_grad_op.md
+  net_op_design.md
--- a/doc/fluid/design/modules/index_en.rst
+++ b/doc/fluid/design/modules/index_en.rst
+Code Structure and Important Modules
+-------------------------------------
+.. toctree::
+  :maxdepth: 1
+  backward.md
+  python_api.md
+  regularization.md
+  infer_var_type.md
+  optimizer.md
+  prune.md
+  register_grad_op.md
+  net_op_design.md
--- a/doc/fluid/design/modules/net_op_design.md
+++ b/doc/fluid/design/modules/net_op_design.md
 # Network Design
 `Network` is the container and controller of a set of operators,
-user can build a real network from a `NetDesc` which is a protobuf message 
+user can build a real network from a `NetDesc` which is a protobuf message
 and use `Network.Run()` to run all the operators in the network.
-A network object knows all Operators belonging to this network. Variables, 
+A network object knows all Operators belonging to this network. Variables,
-which are inputs and outputs of these operators, 
+which are inputs and outputs of these operators,
 are created and managed by a hierarchy of Scope objects.
-# API
+## API
-## Net
+### Net
 To make the `Network` extendable, a base class is defined like this
 ```c++
@@ -43,8 +43,8 @@ class Net {
 };
 ```
-All network implementations should build networks from a protobuf message which 
+All network implementations should build networks from a protobuf message which
-describes the structure of a real network; `Run` method should be implemented by 
+describes the structure of a real network; `Run` method should be implemented by
 all implementations to offer a universal method to forward or backward compute a network.
 `Net::Create` is a method of factory pattern and can be implemented like
@@ -64,7 +64,7 @@ std::unique<Net> Net::Create(const NetDesc& def) {
 ```
 Network is designed as the container of operators. To make it more extensible,
-we decouple it from the related variable resources. 
+we decouple it from the related variable resources.
 `Run(Scope* scope)` takes the scope as an argument so that it can run in different scopes.
@@ -80,7 +80,7 @@ if (net) {
 }
 ```
-## `PlainNet` as a simple implementation of `BaseNet`
+### `PlainNet` as a simple implementation of `BaseNet`
 A very basic implementation is as follows. All it does is simply run every operator in sequence.
@@ -211,9 +211,9 @@ class NetBuilder final {
 }
 ```
-## Compatibility with RNN
+### Compatibility with RNN
-Benefitting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with future RNN design, 
+Benefitting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with future RNN design,
 for example we can implement a simple recurrent neural network as follows
 ```c++

--- a/doc/fluid/design/modules/optimizer.md
+++ b/doc/fluid/design/modules/optimizer.md
-## Optimizer Design
+# Optimizer Design
-### The Problem
+## The Problem
 A PaddlePaddle program, or a block, is a sequence of operators operating variables.  A training program needs to do three kinds of works:
@@ -19,7 +19,7 @@ It's true that users should be able to create all these operators manually by ca
 In this design, we propose a high-level API that automatically derives the optimisation pass and operators from the forward pass.
-### High-level Python API to describe the training process
+## High-level Python API to describe the training process
 1. User write code to describe the network:
@@ -54,7 +54,7 @@ In this design, we propose a high-level API that automatically derives the optim
 	sess.run(target= opt_op_list, ...)
 	```
-#### Optimizer Python interface:
+### Optimizer Python interface:
 ```python
 class Optimizer(object):

--- a/doc/fluid/design/motivation/index_cn.rst
+++ b/doc/fluid/design/motivation/index_cn.rst
+设计动机和目标
+-------------
+.. toctree::
+  :maxdepth: 1
+  api.md
+  refactorization.md
+  fluid.md
+  fluid_compiler.md
--- a/doc/fluid/design/motivation/index_en.rst
+++ b/doc/fluid/design/motivation/index_en.rst
+Design Motivations and Goals
+--------------------------------------
+.. toctree::
+  :maxdepth: 1
+  api.md
+  refactorization.md
+  fluid.md
+  fluid_compiler.md
--- a/doc/fluid/design/motivation/refactorization.md
+++ b/doc/fluid/design/motivation/refactorization.md
@@ -97,13 +97,13 @@ Compile Time -> IR -> Runtime
 ---
-# Operator/OpWithKernel/OpKernel
+## Operator/OpWithKernel/OpKernel
 ![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/49caf1fb70820fb4a6c217634317c9306f361f36/op_op_with_kern_class_diagram.dot)
 ---
-# Operator
+## Operator
 ![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot)
 * `Operator` is the fundamental building block of the user interface.
@@ -113,7 +113,7 @@ Compile Time -> IR -> Runtime
 ---
-# OpWithKernel/Kernel
+## OpWithKernel/Kernel
 ![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/9d7f4eba185cf41c8e2fbfb40ae21890dbddcd39/op_with_kernel.dot)
@@ -124,7 +124,7 @@ Compile Time -> IR -> Runtime
 ---
-# Why separate Kernel and Operator
+## Why separate Kernel and Operator
 * Separate GPU and CPU code.
    * Make Paddle capable of running without GPU.
@@ -132,7 +132,7 @@ Compile Time -> IR -> Runtime
    * For example, same multiplication op can have different implementations kernels such as FP16 kernel, FP32 kernel, MKL, eigen kernel.
 ---
-# Libraries for Kernel development
+## Libraries for Kernel development
 * `Eigen::Tensor` contains basic math and element-wise functions.
    * Note that `Eigen::Tensor` has broadcast implementation.
@@ -143,16 +143,16 @@ Compile Time -> IR -> Runtime
 * Hand-writing `GPUKernel` and `CPU` code
    * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.)
 ---
-# Operator Registration
+## Operator Registration
-## Why is registration necessary?
+### Why is registration necessary?
 We need a method to build mappings between Op type names and Op classes.
-## How is registration implemented?
+### How is registration implemented?
 Maintaining a map, whose key is the type name and the value is the corresponding Op constructor.
 ---
-# The Registry Map
+## The Registry Map
 ### `OpInfoMap`
@@ -166,7 +166,7 @@ Maintaining a map, whose key is the type name and the value is the corresponding
 - **`checker`**: Used to check attributes.
 ---
-# Related Concepts
+## Related Concepts
 ### Op_Maker
 Its constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
@@ -178,7 +178,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
 ```
 ---
-# Registration Process
+## Registration Process
 1. Write an Op class and its gradient Op class, if required.
 2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator.
 3. Invoke the macro `REGISTER_OP`. This macro will
@@ -186,13 +186,13 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
 	2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap`
 ---
-# Backward Module (1/2)
+## Backward Module (1/2)
 ### Create Backward Operator
 - Mapping from forward Op to backward Op
 ![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png)
 ---
-# Backward Module (2/2)
+## Backward Module (2/2)
 ### Build Backward Network
 - **Input**: a graph of forward operators
 - **Output**: a graph of backward operators
@@ -205,7 +205,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
 ---
-# Scope, Variable, Tensor
+## Scope, Variable, Tensor
 * `Tensor` is an n-dimension array with type.
 	* Only dims and data pointers are stored in `Tensor`.
@@ -218,8 +218,8 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
 	* `Scope` has a hierarchical structure. The local scope can get variables from its parent scope.
 ---
-# Block (in design)
+## Block (in design)
-## the difference between original RNNOp and Block
+### the difference between original RNNOp and Block
 - As an operator is more intuitive than `RNNOp`,
 - Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
 - Fits the compile-time/ runtime separation design paradigm.
@@ -227,7 +227,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
  - When graph executes, a Block with `BlockDesc` is passed. It then creates `Op` and `Var` instances and then invokes `Run`.
 ---
-# Milestone
+## Milestone
 - Take Paddle/books as the main line, the requirement of the models motivates framework refactoring,
 - Model migration
  - Framework development gives **priority support** to model migration, for example,
@@ -240,7 +240,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
 - Accept imperfection, concentrate on solving the specific problem at the right price.
 ---
-# Control the migration quality
+## Control the migration quality
 - Compare the performance of migrated models with old ones.
 - Follow the google C++ style guide.
 - Build the automatic workflow of generating Python/C++ documentations.

--- a/doc/fluid/design/muti_devices/index_cn.rst
+++ b/doc/fluid/design/muti_devices/index_cn.rst
+多设备支持
+------------
+.. toctree::
+  :maxdepth: 1
+  operator_kernel_type.md
+  kernel_selection.md
+  kernel_hint_design.md
--- a/doc/fluid/design/muti_devices/index_en.rst
+++ b/doc/fluid/design/muti_devices/index_en.rst
+Multi-Device Support
+----------------------
+.. toctree::
+  :maxdepth: 1
+  operator_kernel_type.md
+  kernel_selection.md
+  kernel_hint_design.md
--- a/doc/fluid/design/muti_devices/kernel_hint_design.md
+++ b/doc/fluid/design/muti_devices/kernel_hint_design.md
-## Problem
+# Problem
 In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have a personal preference for a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel or `use_cudnn` to choose a CUDNN kernel. We need to provide a way for users to do this.
 In the current design, we use KernelType to describe one kernel.

--- a/doc/fluid/design/muti_devices/kernel_selection.md
+++ b/doc/fluid/design/muti_devices/kernel_selection.md
-## Background
+# Background
 Every operator has many kernels because Fluid supports multiple data types, places, data layouts, and library types. We use `OpKernelType` to describe the kernel types that an operator can hold.
 The `OpKernelType ` is as follows:

--- a/doc/fluid/design/network/index_cn.rst
+++ b/doc/fluid/design/network/index_cn.rst
+复杂网络设计
+------------
+.. toctree::
+  :maxdepth: 1
+  sequence_decoder.md
--- a/doc/fluid/design/network/index_en.rst
+++ b/doc/fluid/design/network/index_en.rst
+Complex Network Design
+------------------------
+.. toctree::
+  :maxdepth: 1
+  sequence_decoder.md
--- a/doc/fluid/dev/api_doc_std_cn.md
+++ b/doc/fluid/dev/api_doc_std_cn.md
@@ -45,11 +45,11 @@ API文档须使用reStructuredText格式撰写，该格式详情请参考[链接
 - Python API Definition
  - 格式：
      [Python API Definition]
  - 示例
      ```
      fc(input,
         size,
@@ -63,19 +63,19 @@ API文档须使用reStructuredText格式撰写，该格式详情请参考[链接
      ```
 - Function Description
  - 格式
      本模块应包含以下内容（排列顺序为文档撰写顺序）：
      [Function Description]
      [Formula]
      [Symbols' Descriptions if necessary]
      [References if necessary]
  - 示例
      [Function Description]
@@ -119,18 +119,18 @@ API文档须使用reStructuredText格式撰写，该格式详情请参考[链接
      [References if necessary]
      因fc没有必要列出的参考文献，故该内容省略。其他情况下需明确给出对应的参考文献和对应连接，以 layer_norm 为例：
      ```
      Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_ for more details.
      ```
 - Args Description
  - 格式
      \[Arg's Name\][(Data Type, Default Value)][Description]
  - 示例
      fc的部分参数注释如下：
@@ -145,35 +145,35 @@ API文档须使用reStructuredText格式撰写，该格式详情请参考[链接
      ```
 - Returns
  - 格式
      [Name][Shape]
  - 示例
      ```
      Returns:
          A tensor variable storing the transformation result.
      ```
      当返回值为包含多个参数的tuple时，应按顺序逐个介绍各参数，以dynamic_lstm为例：
      ```
      Returns:
          A tuple containing:
            The hidden state of LSTM whose shape is (T X D).
            The cell state of LSTM whose shape is (T X D).
      ```
 - Raises
  - 格式
      [Exception Type][Condition]
  - 示例
      ```
      Raises:
          ValueError: If the rank of the input is less than 2.
@@ -182,7 +182,7 @@ API文档须使用reStructuredText格式撰写，该格式详情请参考[链接
 - Note
  - 格式
     [Note]
  - 示例
@@ -198,15 +198,15 @@ API文档须使用reStructuredText格式撰写，该格式详情请参考[链接
          2. When num_heads == 1, scaled_dot_product_attention has no learnable
             parameters.
      ```
 - Examples
  - 格式
      \[Python Code Snippet]
  - 示例
      ```
      Examples:
          .. code-block:: python

--- a/doc/fluid/dev/index_cn.rst
+++ b/doc/fluid/dev/index_cn.rst
 开发标准
 ------------
+.. toctree::
+  :maxdepth: 1
+  new_op_en.md
+  new_op_kernel_en.md
+  use_eigen_en.md
+  name_convention.md
+  support_new_device.md
+  releasing_process.md
+  op_markdown_format.md
--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
 Development
 ------------
-This is Development page
+.. toctree::
+  :maxdepth: 1
+  new_op_en.md
+  new_op_kernel_en.md
+  use_eigen_en.md
+  name_convention.md
+  support_new_device.md
+  releasing_process.md
+  op_markdown_format.md
--- a/doc/fluid/dev/name_convention.md
+++ b/doc/fluid/dev/name_convention.md
-## Operator's Parameter Name Convention
+# Operator's Parameter Name Convention
 To make operator documentation clearer, we recommend that operator names obey the following conventions.
-### OpProtoMaker names
+## OpProtoMaker names
 When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L170) (TODO: OpProtoMaker Doc) needs to be defined. All the Inputs/Outputs and Attributes will be written into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L61), and will be used in the client language to create the operator.
@@ -20,7 +20,7 @@ When defining an operator in Paddle, a corresponding [OpProtoMaker](https://gith
 - Order.
  - Follow the order of Input/Output, then Attribute, then Comments. See the example in best practice.
-### Best Practice
+## Best Practice
 Here we give some examples to show how these rules will be used.

--- a/doc/fluid/dev/new_op_kernel_en.md
+++ b/doc/fluid/dev/new_op_kernel_en.md
-## Add Kernels for a New Device
+# Add Kernels for a New Device
-### Background
+## Background
 PaddlePaddle Fluid has hundreds of operators.  Each operator could have one or more kernels.  A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU.
 [This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) explains how to add an operator, and its kernels.  The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md).  An operator chooses the right kernel at runtime.  This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md).
-### Write Kernels for A New Device 
+## Write Kernels for A New Device
-#### Add A New Device
+### Add A New Device
  For some historical reasons, we misuse the word *library* for *device*.  For example, we call the device type by *library type*.  An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/library_type.h#L24).  We will correct this ASAP.
@@ -23,7 +23,7 @@ enum class LibraryType {
 ```
-#### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53)
+### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53)
 If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53). For example `CUDAPlace`:
@@ -45,7 +45,7 @@ struct CUDAPlace {
 typedef boost::variant<CUDAPlace, CPUPlace> Place;
 ```
-#### Add [device context]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37))
+### Add [device context](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37)
 After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37) for it.
 ```cpp
@@ -58,7 +58,7 @@ class DeviceContext {
 };
 ```
-#### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device.
+### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device.
 A detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md)
@@ -85,7 +85,7 @@ class OpKernel : public OpKernelBase {
 ```
-#### Register the OpKernel to framework
+### Register the OpKernel to framework
 After writing the components described above, we should register the kernel to the framework.
@@ -107,7 +107,7 @@ take [`conv2d`]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/oper
 	REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace,
    		paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
    		paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
 	REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace,
 	       paddle::operators::CUDNNConvOpKernel<float>,
 	       paddle::operators::CUDNNConvOpKernel<double>);

--- a/doc/fluid/dev/op_markdown_format.md
+++ b/doc/fluid/dev/op_markdown_format.md
@@ -15,26 +15,26 @@ The signature of the operator.
 Each section mentioned above has been covered in further detail in the rest of the document.
-# PaddlePaddle Operator Name
+## PaddlePaddle Operator Name
 This should be in all small letters, in case of multiple words, we separate them with an underscore. For example:
 `array to lod tensor` should be written as `array_to_lod_tensor`.
 This naming convention should be standard across all PaddlePaddle operators.
-# Standard Operator Name
+## Standard Operator Name
 This is the standard name of the operator as used in the community. The general standard is usually:
 - Standard abbreviations like `SGD` are written in all capital letters.
 - Operator names that have multiple words inside a single word use `camelCase` (capitalize word boundaries inside of a word).
 - Keep numbers inside a word as is, with no boundary delimiters.
 - Follow the name of the operator with the keyword: `Activation Operator.`
-# Operator description
+## Operator description
 This section should contain the description of what the operator does, including the operation performed, the literature from where it comes and was introduced first, and other important details. The relevant paper/article including the hyperlink should be cited in this section.
-# LaTeX equation
+## LaTeX equation
 This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Two words in the same word should be separated by an underscore (`_`).
-# The signature
+## The signature
 This section describes the signature of the operator. A list of Inputs and Outputs, each of which has a small description of what the variable represents and the type of variable. The variable names follow the `CamelCase` naming convention. The proposed format for this is:
 `Section :
 VariableName : (VariableType) VariableDescription

--- a/doc/fluid/dev/use_eigen_cn.md
+++ b/doc/fluid/dev/use_eigen_cn.md
-## 在Paddle中如何使用Eigen
+# 在Paddle中如何使用Eigen
 神经网络本质上是一个计算图，计算需要的数据存放在`Tensor`中，而计算过程是由`Operator`来描述的。在执行时，`Operator`调用对应`OpKernel`中的`Compute`接口，实现对`Tensor`的操作。
-### Eigen Tensor模块
+## Eigen Tensor模块
 Eigen Tensor模块对element-wise计算提供了强大的支持，并且书写一份代码，可以同时在CPU、GPU执行。但Eigen Tensor是一个正在开发中的模块，因此可能测试不够完备，文档较少。
 关于Eigen Tensor模块的详细介绍请参考[文档1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) 和[文档2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md)
-### paddle::framework::Tensor
+## paddle::framework::Tensor
 Paddle Tensor定义在framework目录下，其主要接口如下：
@@ -20,14 +20,14 @@ class Tensor {
  /*! Return a pointer to mutable memory block. */
  template <typename T>
  inline T* data();
  /**
   * @brief   Return a pointer to mutable memory block.
   * @note    If not exist, then allocation.
   */
  template <typename T>
  inline T* mutable_data(platform::Place place);
  /**
   * @brief     Return a pointer to mutable memory block.
   *
@@ -38,17 +38,17 @@ class Tensor {
   */
  template <typename T>
  inline T* mutable_data(DDim dims, platform::Place place);
  /*! Resize the dimensions of the memory block. */
  inline Tensor& Resize(const DDim& dims);
  /*! Return the dimensions of the memory block. */
  inline const DDim& dims() const;
 private:  
  /*! holds the memory block if allocated. */
  std::shared_ptr<Placeholder> holder_;
  /*! points to dimensions of memory block. */
  DDim dim_;
 };
@@ -129,7 +129,7 @@ From是EigenTensor模板提供的一个接口，可以实现从paddle::framework
-### 实现计算
+## 实现计算
 当需要完成计算时，我们需要等式左边的EigenTensor调用device接口。在这里需要注意的是，这里的EigenTensor之间的运算只是改变了原有Tensor中的数据，而不会改变原有Tensor的shape信息。

--- a/doc/fluid/dev/use_eigen_en.md
+++ b/doc/fluid/dev/use_eigen_en.md
-## How to use Eigen in Paddle
+# How to use Eigen in Paddle
 Essentially, a neural network is a compute graph. The data needed for the computation is stored in `Tensor`s and its computation procedure is described by `Operator`s. An `Operator` calls the `Compute` interface in its corresponding `OpKernel` and operates on the `Tensor`.
-### Eigen Tensor Module
+## Eigen Tensor Module
 The Eigen Tensor module supports powerful element-wise computation. In addition, a piece of code written using it can be run on both the CPU and the GPU.
@@ -12,7 +12,7 @@ Note that Eigen Tensor is still being actively developed, so its tests are not c
 For details on Eigen Tensor module, please see [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md).
-### paddle::framework::Tensor
+## paddle::framework::Tensor
 Paddle Tensor is defined in the framework directory with the following interface:
@@ -105,7 +105,7 @@ void Compute(const framework::ExecutionContext& context) const override {
 ```
-### paddle::framework::Tensor到EigenTensor的转换
+## Transformation from paddle::framework::Tensor to EigenTensor
 As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
@@ -129,7 +129,7 @@ For more transformations, see the [unit tests](https://github.com/PaddlePaddle/P
-### Implementing Computation
+## Implementing Computation
 While computing, the device interface is needed from the EigenTensors on the left hand side of the assignments. Note that the computation between EigenTensors only changes the data originally in the Tensor and does not change all the shape information associated with the Tensor.

--- a/doc/fluid/getstarted/concepts/index_cn.rst
+++ b/doc/fluid/getstarted/concepts/index_cn.rst
+基本使用概念
+============
+TBD
--- a/doc/fluid/getstarted/concepts/index_en.rst
+++ b/doc/fluid/getstarted/concepts/index_en.rst
+Concepts
+============
+TBD
--- a/doc/fluid/getstarted/index_cn.rst
+++ b/doc/fluid/getstarted/index_cn.rst
 新手入门
------------
+============
-新手入门
+如果需要快速了解PaddlePaddle的使用，可以参考以下指南。
+..  toctree::
+  :maxdepth: 1
+  quickstart_cn.rst
+在使用PaddlePaddle构建应用时，需要了解一些基本概念。
+这里以一个线性回归为例子，详细介绍了PaddlePaddle的使用流程，包括数据格式，模型配置与训练等。
+..  toctree::
+  :maxdepth: 1
+  concepts/use_concepts_cn.rst
--- a/doc/fluid/getstarted/index_en.rst
+++ b/doc/fluid/getstarted/index_en.rst
 GET STARTED
------------
+============
-This is get started page
+If you want to quickly know how to use PaddlePaddle, please refer to the following guide:
+..  toctree::
+  :maxdepth: 1
+  quickstart_en.rst
+While using PaddlePaddle to build applications, please understand some basic concepts.
+Here is an example of linear regression. It introduces workflow of PaddlePaddle, including data format, model configuration and training, etc.
+..  toctree::
+  :maxdepth: 1
+  concepts/index_en.rst
--- a/doc/fluid/getstarted/quickstart_cn.rst
+++ b/doc/fluid/getstarted/quickstart_cn.rst
+../../v2/getstarted/quickstart_cn.rst
\ No newline at end of file
--- a/doc/fluid/getstarted/quickstart_en.rst
+++ b/doc/fluid/getstarted/quickstart_en.rst
+../../v2/getstarted/quickstart_en.rst
\ No newline at end of file
--- a/doc/fluid/howto/index_cn.rst
+++ b/doc/fluid/howto/index_cn.rst
 进阶使用
 ------------
+.. toctree::
+  :maxdepth: 1
+  optimization/index_cn.rst
--- a/doc/fluid/howto/index_en.rst
+++ b/doc/fluid/howto/index_en.rst
 HOW TO
 ------------
-This is how to page
+.. toctree::
+  :maxdepth: 1
+  optimization/index_en.rst
--- a/doc/fluid/howto/optimization/benchmark/README.md
+++ b/doc/fluid/howto/optimization/benchmark/README.md
+../../../../../benchmark/cluster/README.md
\ No newline at end of file
--- a/doc/fluid/howto/optimization/benchmark/index_cn.rst
+++ b/doc/fluid/howto/optimization/benchmark/index_cn.rst
+基准
+------------
+.. toctree::
+  :maxdepth: 1
+  vgg16/README.md
+  README.md
--- a/doc/fluid/howto/optimization/benchmark/index_en.rst
+++ b/doc/fluid/howto/optimization/benchmark/index_en.rst
+Benchmark
+------------
+.. toctree::
+  :maxdepth: 1
+  vgg16/README.md
+  README.md
--- a/doc/fluid/howto/optimization/benchmark/vgg16/README.md
+++ b/doc/fluid/howto/optimization/benchmark/vgg16/README.md
+../../../../../../benchmark/cluster/vgg16/README.md
\ No newline at end of file
--- a/doc/fluid/howto/optimization/cpu_profiling_cn.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_cn.md
@@ -8,7 +8,7 @@ PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大
 * Python 与 C++ 混合代码的性能分析
-## Python代码的性能分析
+# Python代码的性能分析
 ### 生成性能分析文件

--- a/doc/fluid/howto/optimization/cpu_profiling_en.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_en.md
@@ -14,7 +14,7 @@ the profiling and tuning of
 1. the Python code and
 1. the mixture of Python and C++ code.
-## Profiling the Python Code
+# Profiling the Python Code
 ### Generate the Performance Profiling File
@@ -81,7 +81,7 @@ focus on. We can sort above profiling file by tottime:
 We can see that the most time-consuming function is the `built-in
 method run`, which is a C++ function in `libpaddle.so`.  We will
-explain how to profile C++ code in the next section.  At this 
+explain how to profile C++ code in the next section.  At this
 moment, let's look into the third function `sync_with_cpp`, which is a
 Python function.  We can click it to understand more about it:

--- a/doc/fluid/howto/optimization/index_cn.rst
+++ b/doc/fluid/howto/optimization/index_cn.rst
+性能优化
+------------
+.. toctree::
+  :maxdepth: 1
+  timeline.md
+  cpu_profiling_cn.md
+  benchmark/index_cn.rst
--- a/doc/fluid/howto/optimization/index_en.rst
+++ b/doc/fluid/howto/optimization/index_en.rst
+Performance Optimization
+---------------------------
+.. toctree::
+  :maxdepth: 1
+  timeline.md
+  cpu_profiling_en.md
+  benchmark/index_en.rst
--- a/doc/fluid/howto/optimization/timeline.md
+++ b/doc/fluid/howto/optimization/timeline.md
-## how to use timeline tool to do profile
+# how to use timeline tool to do profile
 1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.

--- a/doc/fluid/index_cn.rst
+++ b/doc/fluid/index_cn.rst
@@ -5,8 +5,8 @@
  :maxdepth: 1
  getstarted/index_cn.rst
-  design/index_cn.rst
  build_and_install/index_cn.rst
+  design/index_cn.rst
  howto/index_cn.rst
  dev/index_cn.rst
  faq/index_cn.rst
--- a/doc/fluid/index_en.rst
+++ b/doc/fluid/index_en.rst
@@ -5,8 +5,8 @@
  :maxdepth: 1
  getstarted/index_en.rst
-  design/index_en.rst
  build_and_install/index_en.rst
+  design/index_en.rst
  howto/index_en.rst
  dev/index_en.rst
  faq/index_en.rst
--- a/doc/fluid/design/interface/00.why_plain_c.md
+++ b/doc/fluid/design/interface/00.why_plain_c.md
--- a/doc/fluid/design/interface/01.inference_implementation.md
+++ b/doc/fluid/design/interface/01.inference_implementation.md
--- a/doc/v2/design/interface/index_cn.rst
+++ b/doc/v2/design/interface/index_cn.rst
+多语言接口
+------------
+.. toctree::
+  :maxdepth: 1
+  00.why_plain_c.md
--- a/doc/v2/design/interface/index_en.rst
+++ b/doc/v2/design/interface/index_en.rst
+Multilingual Interface
+-----------------------
+.. toctree::
+  :maxdepth: 1
+  00.why_plain_c.md
--- a/doc/v2/design/mkl/mkldnn.md
+++ b/doc/v2/design/mkl/mkldnn.md
@@ -44,7 +44,7 @@ MKL，MKLML以及MKL-DNN三者关系如下表：
 | Name        |  Open Source     | License     | Descriptions  |
 | :---------- | :--------------- | :---------- | :------------ |
-|   MKL       |     No           | Proprietary | Accelerate math processing routines | 
+|   MKL       |     No           | Proprietary | Accelerate math processing routines |
 |   MKLML     |     No           | Proprietary | Small package of MKL, especially for Machine Learning |
 |   MKL-DNN   |     Yes          | Apache 2.0  | Accelerate primitives processing routines especially for Deep Neural Networks  |
@@ -89,7 +89,7 @@ PaddlePaddle/Paddle
 ### CMake
 在`CMakeLists.txt`中提供一个与MKL有关的总开关：`WITH_MKL`，它负责决定编译时是否使用MKLML和MKL-DNN
- `WITH_MKLML` 控制是否使用MKLML库。 
+- `WITH_MKLML` 控制是否使用MKLML库。
 当打开`WITH_MKL`时，会自动使用MKLML库作为PaddlePaddle的CBLAS和LAPACK库，同时会开启Intel OpenMP用于提高MKLML的性能。
 编译时会把对应的头文件和库放在`build/third_party/install/mklml/*`目录下对应的地方。
 MKLML的库目前都是动态库，主要包括`libiomp5.so`和`libmklml_intel.so`。
@@ -172,7 +172,7 @@ if use_mkldnn
    self.layer_type = mkldnn_*
 ```
-所有MKL-DNN的`layer_type`会以*mkldnn_*开头，这些会在`MKLDNN*Layer`注册layer的时候保证，以示区分。 
+所有MKL-DNN的`layer_type`会以*mkldnn_*开头，这些会在`MKLDNN*Layer`注册layer的时候保证，以示区分。
 同时,会在`paddle/utils.Flags`中添加一个`use_mkldnn`的flag，用于选择是否使用MKL-DNN的相关功能。

--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
-if(NOT WITH_FLUID)
+if(NOT WITH_FLUID_ONLY)
  add_subdirectory(cuda)
  add_subdirectory(function)
  add_subdirectory(utils)

--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
+add_subdirectory(details)
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)
@@ -87,6 +88,9 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo
 cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
 framework_proto backward glog lod_rank_table feed_fetch_method)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor)
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
 cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
@@ -100,7 +104,7 @@ cc_test(init_test SRCS init_test.cc DEPS init)
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
-cc_test(channel_test SRCS channel_test.cc)
+# cc_test(channel_test SRCS channel_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )
 cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
        channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op

--- a/paddle/fluid/framework/channel_impl.h
+++ b/paddle/fluid/framework/channel_impl.h
@@ -87,6 +87,21 @@ class ChannelImpl : public paddle::framework::Channel<T> {
    return value;
  }
+  // Pops entries off the front of `queue` until one may be served for
+  // `action`, and returns it. An entry without a callback is always usable.
+  // An entry with a callback was added by Select: the callback returns false
+  // when some other Select case was already executed, in which case the entry
+  // is discarded and the next one is tried. Returns nullptr when the queue
+  // runs out of entries.
+  std::shared_ptr<QueueMessage> get_first_message(
+      std::deque<std::shared_ptr<QueueMessage>> &queue, ChannelAction action) {
+    while (!queue.empty()) {
+      std::shared_ptr<QueueMessage> candidate = queue.front();
+      queue.pop_front();
+      if (candidate->callback == nullptr) return candidate;
+      if (candidate->callback(action)) return candidate;
+    }
+    return nullptr;
+  }
  size_t cap_;
  std::recursive_mutex mu_;
  bool closed_;
@@ -131,36 +146,21 @@ void ChannelImpl<T>::Send(T *item) {
  // If there is a receiver, directly pass the value we want
  // to send to the receiver, bypassing the channel buffer if any
  if (!recvq.empty()) {
-    std::shared_ptr<QueueMessage> m = recvq.front();
+    std::shared_ptr<QueueMessage> m =
-    recvq.pop_front();
+        get_first_message(recvq, ChannelAction::SEND);
-    // Do the data transfer
-    // We will do this data transfer if either of the following
+    if (m != nullptr) {
-    // cases are true
-    // 1. callback == nullptr // This means it was a regular channel send
-    // 2. callback returns true
-    bool do_send = true;
-    if (m->callback != nullptr) do_send = m->callback(ChannelAction::SEND);
-    if (do_send)
      *(m->data) = std::move(*item);
-    else {
+      m->Notify();
-      // We cannot do the data transfer because
+      lock.unlock();
-      // this QueueMessage was added by Select
+      send_return();
-      // and some other case was executed.
+      return;
-      // So call the Send function again.
+    } else {
-      // We do not care about notifying other
-      // because they would have been notified
-      // by the executed select case.
      lock.unlock();
      Send(item);
      send_return();
      return;
    }
-    // Wake up the blocked process and unlock
-    m->Notify();
-    lock.unlock();
-    send_return();
-    return;
  }
  // Unbuffered channel will always bypass this
@@ -201,32 +201,34 @@ bool ChannelImpl<T>::Receive(T *item) {
  }
  // If there is a sender, directly receive the value we want
-  // from the sender, bypassing the channel buffer if any
+  // from the sender. In case of a buffered channel, read from
+  // buffer and move front of send queue to the buffer
  if (!sendq.empty()) {
-    std::shared_ptr<QueueMessage> m = sendq.front();
+    std::shared_ptr<QueueMessage> m =
-    sendq.pop_front();
+        get_first_message(sendq, ChannelAction::RECEIVE);
-    // Do the data transfer
+    if (buf_.size() > 0) {
-    // We will do this data transfer if either of the following
+      // Case 1 : Channel is Buffered
-    // cases are true
+      // Do Data transfer from front of buffer
-    // 1. callback == nullptr // This means it was a regular channel send
+      // and add a QueueMessage to the buffer
-    // 2. callback returns true
+      *item = std::move(buf_.front());
-    bool do_receive = true;
+      buf_.pop_front();
-    if (m->callback != nullptr)
+      // If first message from sendq is not null
-      do_receive = m->callback(ChannelAction::RECEIVE);
+      // add it to the buffer and notify it
-    if (do_receive)
+      if (m != nullptr) {
-      *item = std::move(*(m->data));
+        // Copy to buffer
-    else
+        buf_.push_back(std::move(*(m->data)));
-      // We cannot do the data transfer because
+        m->Notify();
-      // this QueueMessage was added by Select
+      }  // Ignore if there is no first message
-      // and some other case was executed.
+    } else {
-      // So call the Receive function again.
+      // Case 2: Channel is Unbuffered
-      // We do not care about notifying other
+      // Do data transfer from front of SendQ
-      // because they would have been notified
+      // If front is nullptr, then recursively call itself
-      // by the executed select case.
+      if (m != nullptr) {
-      return recv_return(Receive(item));
+        *item = std::move(*(m->data));
+        m->Notify();
-    // Wake up the blocked process and unlock
+      } else
-    m->Notify();
+        return recv_return(Receive(item));
+    }
    lock.unlock();
    return recv_return(true);
  }

--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
@@ -36,23 +36,25 @@ TEST(Channel, ChannelCapacityTest) {
  delete ch;
 }
-void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
+void RecevingOrderEqualToSendingOrder(Channel<int> *ch, int num_items) {
  unsigned sum_send = 0;
  std::thread t([&]() {
-    for (int i = 0; i < 5; i++) {
+    for (int i = 0; i < num_items; i++) {
      ch->Send(&i);
      sum_send += i;
    }
  });
-  for (int i = 0; i < 5; i++) {
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));
-    int recv = 999;
+  for (int i = 0; i < num_items; i++) {
+    int recv = -1;
    EXPECT_EQ(ch->Receive(&recv), true);
    EXPECT_EQ(recv, i);
  }
  std::this_thread::sleep_for(std::chrono::milliseconds(200));
  CloseChannel(ch);
  t.join();
-  EXPECT_EQ(sum_send, 10U);
+  unsigned expected_sum = (num_items * (num_items - 1)) / 2;
+  EXPECT_EQ(sum_send, expected_sum);
  delete ch;
 }
@@ -185,12 +187,28 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
 TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) {
  auto ch = MakeChannel<int>(0);
-  RecevingOrderEqualToSendingOrder(ch);
+  RecevingOrderEqualToSendingOrder(ch, 20);
+}
+TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel1) {
+  // Test that Receive Order is same as Send Order when number of items
+  // sent is less than size of buffer
+  auto ch = MakeChannel<int>(10);
+  RecevingOrderEqualToSendingOrder(ch, 5);
+}
+TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel2) {
+  // Test that Receive Order is same as Send Order when number of items
+  // sent is equal to size of buffer
+  auto ch = MakeChannel<int>(10);
+  RecevingOrderEqualToSendingOrder(ch, 10);
 }
-TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel) {
+TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) {
+  // Test that Receive Order is same as Send Order when number of items
+  // sent is greater than the size of buffer
  auto ch = MakeChannel<int>(10);
-  RecevingOrderEqualToSendingOrder(ch);
+  RecevingOrderEqualToSendingOrder(ch, 20);
 }
 void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {

--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
+# Variable and operator handles that make up the SSA graph.
+cc_library(var_handle
+  SRCS var_handle.cc
+  DEPS place)
+cc_library(op_handle_base
+  SRCS op_handle_base.cc
+  DEPS var_handle device_context)
+
+# Concrete operator handles.
+cc_library(scale_loss_grad_op_handle
+  SRCS scale_loss_grad_op_handle.cc
+  DEPS op_handle_base scope lod_tensor ddim memory)
+cc_library(fetch_op_handle
+  SRCS fetch_op_handle.cc
+  DEPS op_handle_base scope lod_tensor ddim memory)
+nv_library(nccl_all_reduce_op_handle
+  SRCS nccl_all_reduce_op_handle.cc
+  DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda)
+cc_library(computation_op_handle
+  SRCS computation_op_handle.cc
+  DEPS framework_proto scope place operator op_registry)
+
+# Graph container and builders.
+cc_library(ssa_graph
+  SRCS ssa_graph.cc
+  DEPS var_handle op_handle_base)
+cc_library(ssa_graph_builder
+  SRCS ssa_graph_builder.cc
+  DEPS ssa_graph)
+
+# The NCCL all-reduce handle is only added as a dependency when WITH_GPU is on.
+if(WITH_GPU)
+  set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
+else()
+  set(multi_devices_graph_builder_deps)
+endif()
+cc_library(multi_devices_graph_builder
+  SRCS multi_devices_graph_builder.cc
+  DEPS ssa_graph_builder computation_op_handle scale_loss_grad_op_handle
+       ${multi_devices_graph_builder_deps})
+
+# Graph executors.
+cc_library(ssa_graph_executor
+  SRCS ssa_graph_executor.cc
+  DEPS ssa_graph)
+cc_library(threaded_ssa_graph_executor
+  SRCS threaded_ssa_graph_executor.cc
+  DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Creates the wrapped operator from `op_desc` through OpRegistry::CreateOp and
+// stores the scope and place the handle was constructed with.
+ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
+                                         platform::Place place)
+    : op_(framework::OpRegistry::CreateOp(op_desc)),
+      scope_(scope),
+      place_(place) {}
+// Waits for the producers of the inputs where needed, then runs the operator.
+void ComputationOpHandle::RunImpl() {
+  auto *own_ctx = dev_ctxes_[place_];
+  for (auto *input : inputs_) {
+    auto *producer = input->generated_op_;
+    // Inputs without a generating op need no synchronization.
+    if (!producer) continue;
+    // A producer registered with the same device context for this place is
+    // not waited on.
+    if (producer->dev_ctxes_[place_] == own_ctx) continue;
+    producer->Wait(own_ctx);
+  }
+  // The operator runs in the Scope* stored in the "@TMP_SCOPE@" variable of
+  // scope_, not in scope_ itself.
+  op_->Run(*scope_->FindVar("@TMP_SCOPE@")->Get<Scope *>(), place_);
+}
+// Returns the type string of the wrapped operator.
+std::string ComputationOpHandle::Name() const { return op_->Type(); }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Op handle that wraps a single framework operator bound to one place.
+struct ComputationOpHandle : public OpHandleBase {
+  // Operator created from the OpDesc passed to the constructor.
+  std::unique_ptr<OperatorBase> op_;
+  // Scope passed to the constructor.
+  Scope *scope_;
+  // Place handed to the operator when it is run.
+  platform::Place place_;
+  ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
+                      platform::Place place);
+  // Name reported for this handle.
+  std::string Name() const override;
+ protected:
+  // Executes the wrapped operator.
+  void RunImpl() override;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/fetch_op_handle.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Stores the destination list, the slot index inside it, and the list of
+// local scopes; all three are kept as given (pointers are not copied deep).
+FetchOpHandle::FetchOpHandle(FeedFetchList *data, size_t offset,
+                             std::vector<Scope *> *local_scopes)
+    : data_(data), offset_(offset), local_scopes_(local_scopes) {}
+// Removes this handle from the pending_ops_ of every input so that no input
+// keeps a pointer to a destroyed fetch op.
+FetchOpHandle::~FetchOpHandle() {
+  for (auto *in : inputs_) {
+    in->pending_ops_.erase(this);
+  }
+}
+// Always throws: nothing is expected to wait on a fetch op. `waited_dev` is
+// ignored.
+void FetchOpHandle::Wait(platform::DeviceContext *waited_dev) {
+  PADDLE_THROW("Nobody should wait FetchOp. Unexpected Error");
+}
+// Merges every tensor gathered in tensors_ into the destination slot
+// data_->at(offset_), targeting CPUPlace.
+void FetchOpHandle::WaitAndMergeCPUTensors() const {
+  std::vector<const LoDTensor *> parts;
+  parts.reserve(tensors_.size());
+  for (size_t i = 0; i < tensors_.size(); ++i) {
+    parts.push_back(&tensors_[i]);
+  }
+  data_->at(offset_).MergeLoDTensor(parts, platform::CPUPlace());
+}
+// Gathers the fetched variable from every local scope into tensors_ and then
+// merges the pieces into the destination slot.
+void FetchOpHandle::RunImpl() {
+  auto cpu_ctx =
+      platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
+  // Wait, on the CPU context, for the op that generated each input.
+  // NOTE(review): generated_op_ is dereferenced without a null check -
+  // confirm every fetched variable always has a generating op.
+  for (auto *input : inputs_) {
+    auto *var = static_cast<VarHandle *>(input);
+    var->generated_op_->Wait(cpu_ctx);
+  }
+  // NOTE(review): tensors_ is sized by the number of inputs but indexed by
+  // scope below - this assumes inputs_.size() >= local scope count; confirm.
+  tensors_.resize(inputs_.size());
+  // The variable name and the GPU/CPU decision are both taken from the first
+  // input only.
+  auto *var = static_cast<VarHandle *>(inputs_[0]);
+  auto &var_name = var->name_;
+  platform::CPUPlace cpu;
+  auto &scopes = *local_scopes_;
+  for (size_t i = 0; i < scopes.size(); ++i) {
+    auto &scope = scopes[i];
+    auto &t = scope->FindVar(var_name)->Get<framework::LoDTensor>();
+    if (platform::is_gpu_place(var->place_)) {
+      // Without PADDLE_WITH_CUDA this branch is empty and tensors_[i] is left
+      // untouched.
+#ifdef PADDLE_WITH_CUDA
+      TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
+      // Block until the copy issued on that device context has finished.
+      dev_ctxes_[t.place()]->Wait();
+#endif
+    } else {
+      // No copy needed: alias the source tensor's data and carry over its LoD.
+      tensors_[i].ShareDataWith(t);
+      tensors_[i].set_lod(t.lod());
+    }
+  }
+  this->WaitAndMergeCPUTensors();
+}
+// Returns the fixed display name of this handle.
+std::string FetchOpHandle::Name() const { return "Fetch"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Op handle that reads one variable from every local scope and merges the
+// per-scope tensors into a single entry of a FeedFetchList.
+struct FetchOpHandle : public OpHandleBase {
+  // Destination list; the merged result is written to data_->at(offset_).
+  FeedFetchList *data_;
+  // Index of the destination slot inside *data_.
+  size_t offset_;
+  // Scopes that are searched for the fetched variable.
+  std::vector<Scope *> *local_scopes_;
+  // Per-scope tensors gathered by RunImpl before they are merged.
+  std::vector<LoDTensor> tensors_;
+  FetchOpHandle(FeedFetchList *data, size_t offset,
+                std::vector<Scope *> *local_scopes);
+  // Detaches this handle from its input variables.
+  ~FetchOpHandle();
+  // Always throws; a fetch op must not be waited on.
+  void Wait(platform::DeviceContext *waited_dev) override;
+  // Merges tensors_ into the destination slot.
+  void WaitAndMergeCPUTensors() const;
+  std::string Name() const override;
+ protected:
+  void RunImpl() override;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
+#include "paddle/fluid/framework/scope.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
+#endif
+namespace paddle {
+namespace framework {
+namespace details {
+// Constructor. The signature depends on PADDLE_WITH_CUDA: CUDA builds take an
+// extra NCCLContextMap pointer, which is stored in nccl_ctxs_. Both variants
+// share the body below, which records the gradient variable name of every
+// entry in `params` in grad_names_.
+#ifdef PADDLE_WITH_CUDA
+MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
+    const std::vector<platform::Place> &places,
+    const std::string &loss_var_name,
+    const std::unordered_set<std::string> &params,
+    const std::vector<Scope *> &local_scopes,
+    platform::NCCLContextMap *nccl_ctxs)
+    : loss_var_name_(loss_var_name),
+      places_(places),
+      local_scopes_(local_scopes),
+      nccl_ctxs_(nccl_ctxs) {
+#else
+MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
+    const std::vector<platform::Place> &places,
+    const std::string &loss_var_name,
+    const std::unordered_set<std::string> &params,
+    const std::vector<Scope *> &local_scopes)
+    : loss_var_name_(loss_var_name),
+      places_(places),
+      local_scopes_(local_scopes) {
+#endif
+  // Outputs with these names are treated as parameter gradients by Build().
+  for (auto &p : params) {
+    grad_names_.insert(GradVarName(p));
+  }
+}
+// Builds the multi-device SSA dependency graph for block 0 of `program`.
+//
+// Every op is replicated once per place as a ComputationOpHandle whose inputs
+// and outputs are wired to that device's variable handles. When the op whose
+// single output is the loss variable is seen, a ScaleLossGradOpHandle that
+// produces the loss gradient is appended on each device and the builder
+// leaves "forward" mode. After that, the op whose single output is the loss
+// gradient is dropped, and every op output that is a parameter gradient gets
+// an NCCL all-reduce op (CUDA builds only; other builds throw).
+std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
+    const ProgramDesc &program) const {
+  // Owned from the start so the graph is released if construction throws.
+  std::unique_ptr<SSAGraph> graph(new SSAGraph());
+  SSAGraph &result = *graph;
+  result.vars_.resize(places_.size());
+  bool is_forwarding = true;
+  for (auto *op : program.Block(0).AllOps()) {
+    bool change_forward = false;
+    if (!is_forwarding) {
+      // FIXME(yy): Do not hard code like this
+      if (op->OutputArgumentNames().size() == 1 &&
+          op->OutputArgumentNames()[0] == GradVarName(loss_var_name_)) {
+        continue;  // Drop fill 1. for backward coeff;
+      }
+    }
+    // Replicate the op on every place.
+    for (size_t i = 0; i < places_.size(); ++i) {
+      auto &p = places_[i];
+      auto *s = local_scopes_[i];
+      result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
+      auto *op_handle = result.ops_.back().get();
+      op_handle->dev_ctxes_[p] = const_cast<platform::DeviceContext *>(
+          platform::DeviceContextPool::Instance().Get(p));
+      auto var_names = op->InputArgumentNames();
+      for (auto &each_var_name : var_names) {
+        VarHandle *var =
+            CreateOrGetLatestVarHandle(&result, each_var_name, p, i);
+        op_handle->AddInput(var);
+      }
+      var_names = op->OutputArgumentNames();
+      for (auto &each_var_name : var_names) {
+        CreateOpOutput(&result, op_handle, each_var_name, p, i);
+      }
+      if (is_forwarding) {
+        if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
+          // Insert ScaleCost OpHandle
+#ifdef PADDLE_WITH_CUDA
+          auto *communication_dev_ctx = nccl_ctxs_->DevCtx(p);
+#else
+          auto *communication_dev_ctx =
+              platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
+#endif
+          op_handle = new ScaleLossGradOpHandle(local_scopes_.size(), s, p,
+                                                communication_dev_ctx);
+          result.ops_.emplace_back(op_handle);
+          // FIXME: Currently ScaleLossGradOp only use device_count as scale
+          // factor. So it does not depend on any other operators.
+          // VarHandle *loss = GetVarHandle(loss_var_name, place);
+          // loss->pending_ops_.emplace_back(op_handle);
+          // op_handle->inputs_.emplace_back(loss);
+          CreateOpOutput(&result, op_handle, GradVarName(loss_var_name_), p, i);
+          change_forward = true;
+        }
+      }
+    }
+    // Flip the mode only after the op has been replicated on all places.
+    if (change_forward) {
+      is_forwarding = false;
+    }
+    if (!is_forwarding) {
+      auto var_names = op->OutputArgumentNames();
+      for (auto &og : var_names) {
+        if (grad_names_.count(og) != 0) {
+          // `og` is a parameter gradient: insert an NCCL AllReduce op.
+#ifdef PADDLE_WITH_CUDA
+          result.ops_.emplace_back(
+              new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
+          auto *op_handle = result.ops_.back().get();
+          for (size_t i = 0; i < places_.size(); ++i) {
+            auto &p = places_[i];
+            auto &vars = result.vars_[i][og];
+            if (vars.empty()) {  // This device has no data. continue.
+              continue;
+            }
+            auto *prev_grad = &vars[vars.size() - 1];
+            op_handle->AddInput(prev_grad);
+            // NOTE(review): relies on `vars` being a map-like container whose
+            // operator[] inserts a new entry for the next version index -
+            // confirm against the SSAGraph definition.
+            auto &var = vars[vars.size()];
+            var.place_ = p;
+            var.name_ = og;
+            var.version_ = vars.size() - 1;
+            op_handle->AddOutput(&var);
+          }
+#else
+          // PADDLE_ENFORCE on a string literal is always satisfied and would
+          // silently skip the all-reduce; report the missing feature instead.
+          PADDLE_THROW("Not implemented");
+#endif
+        }
+      }
+    }
+  }
+  /*
+    Dependency graph has been constructed. However, there are still data
+    hazards that need to be handled.
+   */
+  PolishGraphToSupportDataHazards(&result);
+  if (VLOG_IS_ON(10)) {
+    std::ostringstream sout;
+    PrintGraphviz(*graph, sout);
+    VLOG(10) << sout.str();
+  }
+  return graph;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+namespace paddle {
+namespace platform {
+class NCCLContextMap;
+}
+namespace framework {
+class Scope;
+namespace details {
+// SSAGraphBuilder that replicates a program across several places.
+class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
+ public:
+  // `places` and `local_scopes` are stored by reference (see the members
+  // below), so the caller must keep both vectors alive while the builder is
+  // in use. CUDA builds additionally take the NCCL context map.
+#ifdef PADDLE_WITH_CUDA
+  MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
+                          const std::string &loss_var_name,
+                          const std::unordered_set<std::string> &params,
+                          const std::vector<Scope *> &local_scopes,
+                          platform::NCCLContextMap *nccl_ctxs);
+#else
+  MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
+                          const std::string &loss_var_name,
+                          const std::unordered_set<std::string> &params,
+                          const std::vector<Scope *> &local_scopes);
+#endif
+  // Builds the SSA graph for `program`.
+  std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
+ private:
+  // Name of the loss variable.
+  std::string loss_var_name_;
+  // Reference to the caller's vector of places.
+  const std::vector<platform::Place> &places_;
+  // Reference to the caller's vector of scopes; indexed in step with places_.
+  const std::vector<Scope *> &local_scopes_;
+  // Gradient variable names of the parameters given at construction.
+  std::unordered_set<std::string> grad_names_;
+#ifdef PADDLE_WITH_CUDA
+  // NCCL contexts passed to the constructor (raw pointer).
+  platform::NCCLContextMap *nccl_ctxs_;
+#endif
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Keeps references to the scopes, places and NCCL contexts, and registers the
+// NCCL device context of every place in dev_ctxes_.
+NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
+    const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places,
+    const platform::NCCLContextMap &ctxs)
+    : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {
+  for (size_t i = 0; i < places_.size(); ++i) {
+    const auto &place = places_[i];
+    this->dev_ctxes_[place] = nccl_ctxs_.DevCtx(place);
+  }
+}
+// Runs an in-place ncclSum all-reduce over the variable named by the first
+// input, using the tensor found under that name in every local scope.
+void NCCLAllReduceOpHandle::RunImpl() {
+  // No need to all reduce when GPU count = 1.
+  if (inputs_.size() == 1) {
+    return;
+  }
+  // Wait for the producer of every input, on the device context registered
+  // for that input's place.
+  for (auto *input : inputs_) {
+    auto &place = static_cast<VarHandle *>(input)->place_;
+    input->generated_op_->Wait(dev_ctxes_[place]);
+  }
+  auto &grad_name = static_cast<VarHandle *>(this->inputs_[0])->name_;
+  // Data type and element count are taken from the first tensor visited and
+  // reused for every device.
+  int nccl_dtype = -1;
+  size_t count = 0;
+  std::vector<std::function<void()>> pending_calls;
+  for (size_t idx = 0; idx < local_scopes_.size(); ++idx) {
+    auto &place = places_[idx];
+    auto *scope = local_scopes_[idx];
+    int device = boost::get<platform::CUDAPlace>(place).device;
+    auto &tensor = scope->FindVar(grad_name)->Get<LoDTensor>();
+    // Same buffer is used as send and receive buffer, hence the const_cast.
+    void *buf = const_cast<void *>(tensor.data<void>());
+    if (nccl_dtype == -1) {
+      nccl_dtype = platform::ToNCCLDataType(tensor.type());
+    }
+    if (count == 0) {
+      count = static_cast<size_t>(tensor.numel());
+    }
+    auto &ctx = nccl_ctxs_.at(device);
+    auto stream = ctx.stream();
+    auto comm = ctx.comm_;
+    // Defer the call so that all of them are issued under one group guard.
+    pending_calls.emplace_back([=] {
+      PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+          buf, buf, count, static_cast<ncclDataType_t>(nccl_dtype), ncclSum,
+          comm, stream));
+    });
+  }
+  platform::NCCLGroupGuard guard;
+  for (auto &call : pending_calls) {
+    call();
+  }
+}
+// Human-readable label of this op handle.
+std::string NCCLAllReduceOpHandle::Name() const {
+  return std::string("NCCL AllReduce");
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Op handle that all-reduces one variable across devices through NCCL.
+struct NCCLAllReduceOpHandle : public OpHandleBase {
+  NCCLAllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                        const std::vector<platform::Place> &places,
+                        const platform::NCCLContextMap &ctxs);
+  std::string Name() const override;
+  // The members below are references bound by the constructor; the referenced
+  // objects are not owned and must outlive this handle.
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+  const platform::NCCLContextMap &nccl_ctxs_;
+ protected:
+  void RunImpl() override;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/op_handle_base.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Renders the handle as "(in, in, ) --> (out, out, )\n" using each variable's
+// own DebugString(); every entry, including the last one, is followed by ", ".
+std::string OpHandleBase::DebugString() const {
+  std::stringstream out;
+  auto append_all = [&out](const std::vector<VarHandleBase *> &handles) {
+    for (auto *handle : handles) {
+      out << handle->DebugString() << ", ";
+    }
+  };
+  out << "(";
+  append_all(inputs_);
+  out << ") --> (";
+  append_all(outputs_);
+  out << ")\n";
+  return out.str();
+}
+// Destroys every CUDA event stored in events_ (CUDA builds only).
+// NOTE(review): PADDLE_ENFORCE presumably throws on failure; an exception
+// escaping a destructor terminates the process -- confirm this is acceptable.
+OpHandleBase::~OpHandleBase() {
+#ifdef PADDLE_WITH_CUDA
+  for (auto it = events_.begin(); it != events_.end(); ++it) {
+    PADDLE_ENFORCE(cudaEventDestroy(it->second));
+  }
+#endif
+}
+// Executes the op.
+//
+// With CUDA and use_event set, one event per device context is created the
+// first time (timing disabled); after RunImpl() each event is recorded on the
+// stream of its device context. Without CUDA support, use_event must be false.
+void OpHandleBase::Run(bool use_event) {
+#ifdef PADDLE_WITH_CUDA
+  const bool need_create_events = events_.empty() && use_event;
+  if (need_create_events) {
+    for (auto &place_ctx : dev_ctxes_) {
+      const int device =
+          boost::get<platform::CUDAPlace>(place_ctx.first).device;
+      PADDLE_ENFORCE(cudaSetDevice(device));
+      cudaEvent_t &event = events_[device];
+      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+    }
+  }
+#else
+  PADDLE_ENFORCE(!use_event);
+#endif
+  RunImpl();
+#ifdef PADDLE_WITH_CUDA
+  if (!use_event) {
+    return;
+  }
+  for (auto &place_ctx : dev_ctxes_) {
+    const int device = boost::get<platform::CUDAPlace>(place_ctx.first).device;
+    auto *cuda_ctx =
+        static_cast<platform::CUDADeviceContext *>(place_ctx.second);
+    PADDLE_ENFORCE(cudaEventRecord(events_.at(device), cuda_ctx->stream()));
+  }
+#endif
+}
+// Makes `waited_dev` wait for this op.
+//
+// In CUDA builds, when `waited_dev` is not on a CPU place and this op has
+// events, the stream of `waited_dev` is made to wait on every event. In every
+// other case Wait() is called on each device context of this op.
+void OpHandleBase::Wait(platform::DeviceContext *waited_dev) {
+#ifdef PADDLE_WITH_CUDA
+  const bool can_use_events =
+      !platform::is_cpu_place(waited_dev->GetPlace()) && !events_.empty();
+  if (can_use_events) {
+    auto waiter_stream =
+        static_cast<platform::CUDADeviceContext *>(waited_dev)->stream();
+    for (auto &dev_event : events_) {
+      PADDLE_ENFORCE(cudaStreamWaitEvent(waiter_stream, dev_event.second, 0));
+    }
+    return;
+  }
+#endif
+  // Fallback shared by CPU waiters, event-less ops and non-CUDA builds.
+  for (auto &place_ctx : dev_ctxes_) {
+    place_ctx.second->Wait();
+  }
+}
+// Links `in` to this op in both directions: this op is recorded as a pending
+// consumer of `in`, and `in` is appended to the op's inputs.
+void OpHandleBase::AddInput(VarHandleBase *in) {
+  in->pending_ops_.insert(this);
+  inputs_.push_back(in);
+}
+// Links `out` to this op in both directions: this op becomes the generator of
+// `out`, and `out` is appended to the op's outputs.
+void OpHandleBase::AddOutput(VarHandleBase *out) {
+  out->generated_op_ = this;
+  outputs_.push_back(out);
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/macros.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Base class of the operator nodes in the SSA dependency graph. Subclasses
+// provide Name() and RunImpl(); Run() wraps RunImpl() with optional CUDA
+// event handling.
+class OpHandleBase {
+ public:
+  OpHandleBase() {}
+  // Label of the op.
+  virtual std::string Name() const = 0;
+  virtual ~OpHandleBase();
+  // Makes `waited_dev` wait until this op has finished.
+  virtual void Wait(platform::DeviceContext *waited_dev);
+  // Text form "(inputs) --> (outputs)".
+  std::string DebugString() const;
+  // Runs the op; `use_event` asks for CUDA events to be recorded afterwards.
+  void Run(bool use_event);
+  // Connect a variable node as input / output of this op.
+  void AddInput(VarHandleBase *in);
+  void AddOutput(VarHandleBase *out);
+  // Variable nodes consumed and produced by this op (not owned).
+  std::vector<VarHandleBase *> inputs_;
+  std::vector<VarHandleBase *> outputs_;
+  // Device context to use for each place this op touches (not owned).
+  std::unordered_map<platform::Place, platform::DeviceContext *,
+                     platform::PlaceHash>
+      dev_ctxes_;
+#ifdef PADDLE_WITH_CUDA
+  // CUDA events keyed by device id, created lazily by Run().
+  std::unordered_map<int, cudaEvent_t> events_;
+#endif
+ protected:
+  // The actual computation, supplied by subclasses.
+  virtual void RunImpl() = 0;
+ private:
+  DISABLE_COPY_AND_ASSIGN(OpHandleBase);
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Creates a handle that writes the coefficient 1 / num_dev into the loss
+// gradient on `place`, using `dev_ctx` as the device context of that place.
+ScaleLossGradOpHandle::ScaleLossGradOpHandle(size_t num_dev, Scope *scope,
+                                             platform::Place place,
+                                             platform::DeviceContext *dev_ctx)
+    : coeff_(static_cast<float>(1.0 / num_dev)),
+      scope_(scope),
+      place_(place) {
+  this->dev_ctxes_[place_] = dev_ctx;
+}
+// Nothing to release beyond what the base class handles.
+ScaleLossGradOpHandle::~ScaleLossGradOpHandle() = default;
+// Writes coeff_ into the single-element float tensor named by the first
+// output. On a CPU place the value is stored directly; otherwise, in CUDA
+// builds, it is copied from host memory on the stream of this op's device
+// context. In a non-CUDA build a non-CPU place results in no write.
+void ScaleLossGradOpHandle::RunImpl() {
+  const std::string out_name = static_cast<VarHandle *>(outputs_[0])->name_;
+  auto *out_tensor = scope_->FindVar(out_name)->GetMutable<LoDTensor>();
+  float *dst = out_tensor->mutable_data<float>(make_ddim({1}), place_);
+  if (platform::is_cpu_place(place_)) {
+    *dst = coeff_;
+    return;
+  }
+#ifdef PADDLE_WITH_CUDA
+  auto *cuda_ctx =
+      static_cast<platform::CUDADeviceContext *>(dev_ctxes_[place_]);
+  auto stream = cuda_ctx->stream();
+  memory::Copy(boost::get<platform::CUDAPlace>(place_), dst,
+               platform::CPUPlace(), &coeff_, sizeof(float), stream);
+#endif
+}
+// Human-readable label of this op handle.
+std::string ScaleLossGradOpHandle::Name() const {
+  return std::string("Scale LossGrad");
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Op handle that fills the loss gradient on one place with a constant
+// coefficient (1 / number of devices).
+struct ScaleLossGradOpHandle : public OpHandleBase {
+  ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place,
+                        platform::DeviceContext *context);
+  ~ScaleLossGradOpHandle() final;
+  std::string Name() const override;
+  // Value written into the output tensor.
+  float coeff_;
+  // Scope in which the output variable is looked up (not owned).
+  Scope *scope_;
+  // Place on which the output tensor is allocated.
+  platform::Place place_;
+ protected:
+  void RunImpl() override;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/ssa_graph.cc
+++ b/paddle/fluid/framework/details/ssa_graph.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/ssa_graph.h"
--- a/paddle/fluid/framework/details/ssa_graph.h
+++ b/paddle/fluid/framework/details/ssa_graph.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <map>
+#include <string>
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/details/var_handle.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Container for the SSA dependency graph: versioned variables, auxiliary
+// dependency variables and the op handles.
+struct SSAGraph {
+  // One entry per place offset; each entry maps a variable name to its
+  // versions (version number -> VarHandle).
+  std::vector<std::unordered_map<std::string, std::map<int, VarHandle>>> vars_;
+  // aux variables to represent dependency. Useful to resolve data hazard.
+  std::unordered_set<std::unique_ptr<VarHandleBase>> dep_vars_;
+  // Op handles owned by the graph.
+  // NOTE(review): being the last member, ops_ is destroyed before the two
+  // variable containers above; presumably intentional -- confirm before
+  // reordering members.
+  std::vector<std::unique_ptr<OpHandleBase>> ops_;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Adds write-after-read dependencies to `graph`.
+//
+// For every variable with more than one version, each pair of adjacent
+// versions is visited from newest to oldest. Every op reading the older
+// version gets a dummy output variable that is also an input of the op
+// writing the newer version, unless reader and writer are the same op.
+// The dummy variables are owned by graph->dep_vars_.
+void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
+  for (auto &vars_on_place : graph->vars_) {
+    for (auto &name_and_versions : vars_on_place) {
+      auto &versions = name_and_versions.second;
+      if (versions.size() < 2) {
+        continue;
+      }
+      auto newer = versions.rbegin();
+      auto older = newer;
+      for (++older; older != versions.rend(); newer = older++) {
+        OpHandleBase *writer = newer->second.generated_op_;
+        for (OpHandleBase *reader : older->second.pending_ops_) {
+          // A read-modify-write op needs no dependency on itself.
+          if (reader != writer) {
+            auto *dep_var = new DummyVarHandle();
+            reader->AddOutput(dep_var);
+            writer->AddInput(dep_var);
+            graph->dep_vars_.emplace(dep_var);
+          }
+        }
+      }
+    }
+  }
+}
+// Returns the newest version of `each_var_name` at `place_offset`. When the
+// variable has no version yet, version 0 is created as a root node (no
+// generating op) on `place` and returned.
+VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
+    SSAGraph *graph, const std::string &each_var_name,
+    const platform::Place &place, size_t place_offset) {
+  auto &versions = graph->vars_[place_offset][each_var_name];
+  if (!versions.empty()) {
+    return &versions.rbegin()->second;
+  }
+  VarHandle &root = versions[0];
+  root.version_ = 0;
+  root.generated_op_ = nullptr;
+  root.name_ = each_var_name;
+  root.place_ = place;
+  return &root;
+}
+// Appends a new version of `each_var_name` at `place_offset` (its version
+// number is the current number of versions) and registers it as an output of
+// `op_handle`.
+void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
+                                     const std::string &each_var_name,
+                                     const platform::Place &place,
+                                     size_t place_offset) {
+  auto &versions = graph->vars_[place_offset][each_var_name];
+  const size_t new_version = versions.size();
+  VarHandle &new_var = versions[new_version];
+  new_var.place_ = place;
+  new_var.name_ = each_var_name;
+  new_var.version_ = new_version;
+  op_handle->AddOutput(&new_var);
+}
+// Invokes `callback` on every variable node of `graph`: first each versioned
+// VarHandle stored in vars_, then each auxiliary variable in dep_vars_.
+template <typename Callback>
+void IterAllVar(const SSAGraph &graph, Callback callback) {
+  for (auto &vars_on_place : graph.vars_) {
+    for (auto &name_and_versions : vars_on_place) {
+      for (auto &version_and_var : name_and_versions.second) {
+        callback(version_and_var.second);
+      }
+    }
+  }
+  for (auto &dep_var : graph.dep_vars_) {
+    callback(*dep_var);
+  }
+}
+// Writes `graph` to `sout` in Graphviz DOT format.
+//
+// Every variable becomes a node "var_<id>" (labelled with name, place and
+// version, or "dummy" for dependency variables) and every op a rectangular
+// node "op_<id>" labelled with its Name(); edges follow the op inputs and
+// outputs.
+void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) {
+  std::unordered_map<const VarHandleBase *, size_t> var_ids;
+  size_t next_var_id = 0;
+  sout << "digraph G {\n";
+  IterAllVar(graph, [&](const VarHandleBase &var) {
+    const size_t id = next_var_id++;
+    var_ids[&var] = id;
+    if (auto *real_var = dynamic_cast<const VarHandle *>(&var)) {
+      sout << "var_" << id << " [label=\"" << real_var->name_ << "\\n"
+           << real_var->place_ << "\\n"
+           << real_var->version_ << "\"]" << std::endl;
+    } else if (dynamic_cast<const DummyVarHandle *>(&var) != nullptr) {
+      sout << "var_" << id << " [label=\"dummy\"]" << std::endl;
+    }
+  });
+  size_t next_op_id = 0;
+  for (auto &op : graph.ops_) {
+    const std::string op_node = "op_" + std::to_string(next_op_id++);
+    sout << op_node << " [label=\"" << op->Name() << "\", shape=rect]"
+         << std::endl;
+    for (auto *in : op->inputs_) {
+      // operator[] yields id 0 for a variable that was not enumerated above.
+      const std::string var_node = "var_" + std::to_string(var_ids[in]);
+      sout << var_node << " -> " << op_node << std::endl;
+    }
+    for (auto *out : op->outputs_) {
+      const std::string var_node = "var_" + std::to_string(var_ids[out]);
+      sout << op_node << " -> " << var_node << std::endl;
+    }
+  }
+  sout << "}\n";
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/framework/details/ssa_graph.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/platform/place.h"
+#include <memory>
+#include <string>
+namespace paddle {
+namespace framework {
+namespace details {
+// Abstract builder that turns a ProgramDesc into an SSAGraph. The protected
+// static helpers are shared building blocks for concrete builders.
+class SSAGraphBuilder {
+ public:
+  SSAGraphBuilder() {}
+  virtual ~SSAGraphBuilder() {}
+  // Builds the dependency graph of `program`.
+  virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0;
+ protected:
+  // Returns the newest version of a variable at `place_offset`, creating
+  // version 0 when the variable is not in the graph yet.
+  static VarHandle *CreateOrGetLatestVarHandle(SSAGraph *graph,
+                                               const std::string &each_var_name,
+                                               const platform::Place &place,
+                                               size_t place_offset);
+  // Creates a new version of a variable and makes it an output of `op_handle`.
+  static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
+                             const std::string &each_var_name,
+                             const platform::Place &place, size_t place_offset);
+  /**
+   * Only write-after-read (WAR) hazards are handled, because a program is
+   * not expected to contain write-after-write. If write-after-write
+   * operators do exist, they need to be pruned.
+   *
+   * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
+   */
+  static void PolishGraphToSupportDataHazards(SSAGraph *graph);
+  // Dumps `graph` to `sout` in Graphviz DOT format.
+  static void PrintGraphviz(const SSAGraph &graph, std::ostream &sout);
+ private:
+  DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/ssa_graph_executor.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/ssa_graph_executor.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Takes ownership of `graph`; the caller's pointer is left empty.
+SSAGraphExecutor::SSAGraphExecutor(std::unique_ptr<SSAGraph> &&graph)
+    : graph_(std::move(graph)) {
+}
+// The owned graph is released by the unique_ptr member.
+SSAGraphExecutor::~SSAGraphExecutor() = default;
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <memory>
+#include "paddle/fluid/framework/details/ssa_graph.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Abstract executor that owns an SSAGraph and runs it on request.
+class SSAGraphExecutor {
+ public:
+  // Steal graph inside
+  explicit SSAGraphExecutor(std::unique_ptr<SSAGraph> &&graph);
+  virtual ~SSAGraphExecutor();
+  // Runs the graph and returns the values of the variables named in
+  // `fetch_tensors`.
+  virtual FeedFetchList Run(const std::vector<std::string> &fetch_tensors) = 0;
+ protected:
+  // The graph being executed; owned by the executor.
+  std::unique_ptr<SSAGraph> graph_;
+ private:
+  DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor);
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+#include "paddle/fluid/framework/details/fetch_op_handle.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Takes ownership of `graph` and copies the scope and place lists.
+// A thread pool is created only when `num_threads` is at least 2; otherwise
+// ops run inline on the calling thread (see RunOp).
+ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
+    size_t num_threads, bool use_event,
+    const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places,
+    std::unique_ptr<SSAGraph> &&graph)
+    : SSAGraphExecutor(std::move(graph)),
+      pool_(num_threads < 2 ? nullptr : new ::ThreadPool(num_threads)),
+      local_scopes_(local_scopes),
+      places_(places),
+      fetch_ctxs_(places),
+      use_event_(use_event) {
+}
+// Runs every op of the graph once and returns the values of `fetch_tensors`.
+//
+// Scheduling is a topological sort over the SSA graph: an op becomes ready
+// once all of its input variables are ready, and running an op makes its
+// output variables ready. Ops are started through RunOp().
+//
+// Throws platform::EnforceNotMet when an op recorded a failure (checked only
+// after a wait for ready variables timed out), and std::out_of_range (from
+// unordered_map::at) when a fetch name matches no variable in the graph.
+FeedFetchList ThreadedSSAGraphExecutor::Run(
+    const std::vector<std::string> &fetch_tensors) {
+  std::unordered_map<OpHandleBase *, size_t> pending_ops;
+  std::unordered_set<VarHandleBase *> pending_vars;
+  BlockingQueue<VarHandleBase *> ready_vars;
+  std::unordered_set<OpHandleBase *> ready_ops;
+  // A variable without a generating op is a root node: ready immediately.
+  auto InsertPendingVar = [&pending_vars, &ready_vars](VarHandleBase &var) {
+    pending_vars.insert(&var);
+    if (var.generated_op_ == nullptr) {
+      ready_vars.Push(&var);
+    }
+  };
+  // An op waits for as many ready variables as it has inputs.
+  auto InsertPendingOp = [&pending_ops](OpHandleBase &op_instance) {
+    pending_ops.insert({&op_instance, op_instance.inputs_.size()});
+  };
+  // Transform SSAGraph to pending_ops & pending_vars
+  for (auto &var_map : graph_->vars_) {
+    for (auto &name_pair : var_map) {
+      for (auto &version_pair : name_pair.second) {
+        InsertPendingVar(version_pair.second);
+      }
+    }
+  }
+  for (auto &var : graph_->dep_vars_) {
+    InsertPendingVar(*var);
+  }
+  for (auto &op : graph_->ops_) {
+    if (op->inputs_.empty()) {  // Special case, Op has no input.
+      ready_ops.insert(op.get());
+    } else {
+      InsertPendingOp(*op);
+    }
+  }
+  // Step 2. Insert FetchOps
+  std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
+  FeedFetchList fetch_data(fetch_tensors.size());
+  // For every fetch name, the newest version of that variable on each place
+  // where it exists.
+  std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
+  for (auto &fetch_var_name : fetch_tensors) {
+    for (auto &var_map : graph_->vars_) {
+      auto it = var_map.find(fetch_var_name);
+      if (it != var_map.end()) {
+        fetched_vars[fetch_var_name].push_back(&it->second.rbegin()->second);
+      }
+    }
+  }
+  for (size_t i = 0; i < fetch_tensors.size(); ++i) {
+    auto &var_name = fetch_tensors[i];
+    // at() throws std::out_of_range when the name was not found above.
+    auto &vars = fetched_vars.at(var_name);
+    auto *op = new FetchOpHandle(&fetch_data, i, &local_scopes_);
+    fetch_ops.emplace_back(op);
+    // FIXME: Use new device context
+    for (auto &p : places_) {
+      op->dev_ctxes_[p] = fetch_ctxs_.Get(p);
+    }
+    for (auto *var : vars) {
+      op->AddInput(var);
+    }
+    InsertPendingOp(*op);
+  }
+  auto run_all_ready_ops = [&] {
+    for (auto *op : ready_ops) {
+      RunOp(ready_vars, op);
+    }
+    ready_ops.clear();
+  };
+  // Create local scopes.
+  for (auto &scope : local_scopes_) {
+    auto &local_scope = scope->NewScope();
+    *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>() = &local_scope;
+  }
+  // Step 3. Execution
+  while (!pending_vars.empty()) {
+    // 1. Run All Ready ops
+    run_all_ready_ops();
+    // 2. Find ready variable
+    bool timeout;
+    auto cur_ready_vars = ready_vars.PopAll(1000, &timeout);
+    if (timeout) {
+      if (exception_) {
+        // Hand the failure to the caller exactly once. Without the reset the
+        // stored exception would be thrown again by any later Run() whose
+        // wait happens to time out.
+        // NOTE(review): exception_ is written by RunOp (possibly on a pool
+        // thread) and read here without synchronization.
+        platform::EnforceNotMet failure = *exception_;
+        exception_.reset();
+        throw failure;
+      } else {
+        continue;
+      }
+    }
+    // 3. Remove the dependency of ready_var.
+    // Find the ready_ops after the ready_var.
+    for (auto ready_var : cur_ready_vars) {
+      pending_vars.erase(ready_var);
+      for (auto *op : ready_var->pending_ops_) {
+        auto &deps = pending_ops[op];
+        --deps;
+        if (deps == 0) {
+          ready_ops.insert(op);
+        }
+      }
+    }
+    // Keep loop until all vars are ready.
+  }
+  ++computation_count_;
+  auto sync_computation = [&] {
+    computation_count_ = 0;
+    // Wait All computational streams
+    for (const auto &p : this->places_) {
+      platform::DeviceContextPool::Instance().Get(p)->Wait();
+    }
+    for (auto &scope : local_scopes_) {
+      scope->DropKids();
+    }
+  };
+  // Wait FetchOps.
+  if (!fetch_ops.empty()) {
+    fetch_ops.clear();
+    sync_computation();
+  }
+  // Synchronize at the latest every max_async_computation runs.
+  if (computation_count_ == max_async_computation) {
+    sync_computation();
+  }
+  // NOTE: the temp scope can be dropped lazily if needed.
+  // Drop tmp scopes;
+  for (auto &scope : local_scopes_) {
+    auto &kid = *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>();
+    kid = nullptr;
+  }
+  return fetch_data;
+}
+// Runs `op` and, on success, pushes its output variables to `ready_var_q`.
+//
+// The work is enqueued on the thread pool when one exists, otherwise it runs
+// inline. A platform::EnforceNotMet raised by the op is stored in exception_
+// instead of propagating; any other exception is reported via LOG(FATAL).
+void ThreadedSSAGraphExecutor::RunOp(
+    BlockingQueue<VarHandleBase *> &ready_var_q, details::OpHandleBase *op) {
+  auto op_run = [&ready_var_q, op, this] {
+    try {
+      VLOG(10) << op->Name() << " : " << op->DebugString();
+      op->Run(use_event_);
+      ready_var_q.Extend(op->outputs_);
+    } catch (const platform::EnforceNotMet &ex) {
+      // Caught by const reference: no extra copy of the exception object is
+      // made before the one stored below.
+      exception_.reset(new platform::EnforceNotMet(ex));
+    } catch (...) {
+      LOG(FATAL) << "Unknown exception catched";
+    }
+  };
+  if (pool_) {
+    pool_->enqueue(op_run);
+  } else {
+    op_run();
+  }
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <chrono>
+#include <functional>
+#include "ThreadPool.h"  // ThreadPool in third party
+#include "paddle/fluid/framework/details/ssa_graph_executor.h"
+namespace paddle {
+namespace framework {
+class Scope;
+namespace details {
+// Minimal multi-producer queue guarded by a mutex and a condition variable.
+// Producers add items with Push()/Extend(); a consumer drains everything at
+// once with PopAll(), waiting up to a timeout for the first item.
+template <typename T>
+class BlockingQueue {
+ public:
+  // Appends one item and wakes one waiter.
+  void Push(const T &item) {
+    {
+      std::lock_guard<std::mutex> g(mutex_);
+      q_.emplace_back(item);
+    }
+    cv_.notify_one();
+  }
+  // Appends every element of `items` (any range usable in a range-for) and
+  // wakes all waiters.
+  template <typename U>
+  void Extend(const U &items) {
+    {
+      std::lock_guard<std::mutex> g(mutex_);
+      for (auto &item : items) {
+        q_.emplace_back(item);
+      }
+    }
+    cv_.notify_all();
+  }
+  // Waits up to `ms` milliseconds for the queue to become non-empty.
+  // Returns all queued items and sets *timeout to false, or returns an empty
+  // deque and sets *timeout to true when nothing arrived in time.
+  std::deque<T> PopAll(size_t ms, bool *timeout) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    // A relative wait is used instead of a system_clock deadline so that
+    // wall-clock adjustments cannot shorten or lengthen the timeout.
+    *timeout = !cv_.wait_for(lock, std::chrono::milliseconds(ms),
+                             [this] { return !q_.empty(); });
+    std::deque<T> ret;
+    if (!*timeout) {
+      std::swap(ret, q_);
+    }
+    return ret;
+  }
+ private:
+  std::mutex mutex_;
+  std::condition_variable cv_;
+  std::deque<T> q_;
+};
+// SSAGraphExecutor that schedules ops in topological order and, when a thread
+// pool is available, runs them on that pool.
+class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
+ public:
+  ThreadedSSAGraphExecutor(size_t num_threads, bool use_event,
+                           const std::vector<Scope *> &local_scopes,
+                           const std::vector<platform::Place> &places,
+                           std::unique_ptr<SSAGraph> &&graph);
+  ~ThreadedSSAGraphExecutor() {}
+  // Run a SSAGraph by a thread pool
+  // Use topological sort algorithm
+  FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
+ private:
+  // Starts one op and publishes its outputs to `ready_var_q` when it is done.
+  void RunOp(BlockingQueue<VarHandleBase *> &ready_var_q,
+             details::OpHandleBase *op);
+  // Null when the executor was created with fewer than two threads.
+  std::unique_ptr<::ThreadPool> pool_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
+  // Device contexts handed to the fetch ops.
+  platform::DeviceContextPool fetch_ctxs_;
+  const bool use_event_;
+  // Failure recorded by RunOp; checked by Run().
+  std::unique_ptr<platform::EnforceNotMet> exception_;
+  // Runs since the last synchronization, and the limit that forces one.
+  size_t computation_count_{0};
+  size_t max_async_computation{100};
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/var_handle.cc
+++ b/paddle/fluid/framework/details/var_handle.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/var_handle.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Out-of-line definition of the virtual destructor.
+VarHandleBase::~VarHandleBase() = default;
+// Returns "<name>:<place>".
+std::string VarHandle::DebugString() const {
+  std::ostringstream out;
+  out << name_;
+  out << ":";
+  out << place_;
+  return out.str();
+}
+// Dummy variables carry no data of their own; they all print the same label.
+std::string DummyVarHandle::DebugString() const {
+  return std::string("dummy");
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <sstream>
+#include <string>
+#include <unordered_set>
+#include "paddle/fluid/platform/place.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Forward declaration; the class-key matches the definition
+// (`class OpHandleBase`) to avoid mismatched-tag diagnostics.
+class OpHandleBase;
+// VarHandleBase is the var node in the dependency graph.
+// A variable can only be generated by a single operator, i.e. this is a
+// single assignment graph.
+struct VarHandleBase {
+  virtual ~VarHandleBase();
+  // Short text form of the variable, used when printing ops.
+  virtual std::string DebugString() const = 0;
+  // The operator that generates this variable; nullptr if the variable
+  // is a root node. Defaults to nullptr so that a default-initialized handle
+  // never holds an indeterminate pointer.
+  OpHandleBase *generated_op_{nullptr};
+  // Operators that wait for this variable to become ready.
+  std::unordered_set<OpHandleBase *> pending_ops_;
+};
+// VarHandle is a single version of a runtime variable.
+// One runtime variable maps to many VarHandles in the graph: each assignment
+// generates a new VarHandle with a newer version.
+//
+// NOTE: runtime variables have place.
+struct VarHandle : public VarHandleBase {
+  std::string DebugString() const override;
+  // The version field is currently not used by the scheduler; it is stored to
+  // make debugging easier. Defaults to 0 so that it is never indeterminate.
+  size_t version_{0};
+  // Name of the runtime variable this handle is a version of.
+  std::string name_;
+  // Place on which this version of the variable lives.
+  platform::Place place_;
+};
+// Dummy variable: carries no data and only represents a dependency between
+// two operators.
+struct DummyVarHandle : VarHandleBase {
+  std::string DebugString() const override;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -46,7 +46,7 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
 Executor::Executor(const platform::Place& place) : place_(place) {}
-static void CreateTensor(Variable* var, proto::VarType::Type var_type) {
+void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
  if (var_type == proto::VarType::LOD_TENSOR) {
    var->GetMutable<LoDTensor>();
  } else if (var_type == proto::VarType::SELECTED_ROWS) {
@@ -294,12 +294,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
        if (var->Persistable()) {
          auto* ptr = scope->Var(var->Name());
-          CreateTensor(ptr, var->GetType());
+          InitializeVariable(ptr, var->GetType());
          VLOG(3) << "Create Variable " << var->Name()
                  << " global, which pointer is " << ptr;
        } else {
          auto* ptr = local_scope->Var(var->Name());
-          CreateTensor(ptr, var->GetType());
+          InitializeVariable(ptr, var->GetType());
          VLOG(3) << "Create Variable " << var->Name()
                  << " locally, which pointer is " << ptr;
        }
@@ -307,7 +307,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
    } else {
      for (auto& var : block.AllVars()) {
        auto* ptr = local_scope->Var(var->Name());
-        CreateTensor(ptr, var->GetType());
+        InitializeVariable(ptr, var->GetType());
        VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
                << ptr;
      }

--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -22,6 +22,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
+// Makes `var` hold the runtime type that corresponds to `var_type`
+// (for example a LoDTensor for proto::VarType::LOD_TENSOR).
+extern void InitializeVariable(Variable* var, proto::VarType::Type var_type);
 struct ExecutorPrepareContext {
  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/framework/parallel_executor.h"
+#include "ThreadPool.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+namespace paddle {
+namespace framework {
+// Implementation state of ParallelExecutor, kept out of the public header.
+class ParallelExecutorPrivate {
+ public:
+  explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
+      : places_(places) {}
+  // Places the executor was constructed with.
+  std::vector<platform::Place> places_;
+  // Scopes filled in by the ParallelExecutor constructor, one per place.
+  std::vector<Scope *> local_scopes_;
+  // Scope handed to the ParallelExecutor constructor (presumably not owned).
+  // Starts as nullptr so it never holds an indeterminate value.
+  Scope *global_scope_ = nullptr;
+  // Executor that runs the SSA graph built from the main program.
+  std::unique_ptr<details::SSAGraphExecutor> executor_;
+#ifdef PADDLE_WITH_CUDA
+  // NCCL contexts for the places; only present in CUDA builds.
+  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
+#endif
+};
+// Builds a parallel executor.
+//   num_threads, use_event: forwarded to the ThreadedSSAGraphExecutor.
+//   places:          devices to run on; must not be empty (places[0] runs the
+//                    startup program).
+//   params:          forwarded to the graph builder.
+//   startup_program: run once on places[0] inside `scope`.
+//   main_program:    converted to an SSA dependency graph and executed by Run.
+//   loss_var_name:   forwarded to the graph builder.
+//   scope:           global scope; one child scope is created per place.
+ParallelExecutor::ParallelExecutor(
+    size_t num_threads, bool use_event,
+    const std::vector<platform::Place> &places,
+    const std::unordered_set<std::string> &params,
+    const ProgramDesc &startup_program, const ProgramDesc &main_program,
+    const std::string &loss_var_name, Scope *scope)
+    : member_(new ParallelExecutorPrivate(places)) {
+  // places[0] is indexed below; fail with a clear error instead of reading
+  // past the end of an empty vector.
+  PADDLE_ENFORCE(!places.empty(),
+                 "ParallelExecutor requires at least one place");
+  member_->global_scope_ = scope;
+  // Step 1. RunStartupProgram and Bcast the params to devs.
+  Executor exe(places[0]);
+  exe.Run(startup_program, scope, 0);
+  // Create local scopes
+  for (size_t i = 0; i < member_->places_.size(); ++i) {
+    member_->local_scopes_.push_back(&scope->NewScope());
+  }
+// Bcast Parameters to all GPUs
+#ifdef PADDLE_WITH_CUDA
+  member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_));
+#endif
+  if (platform::is_gpu_place(places[0]) &&
+      member_->local_scopes_.size() != 1) {  // Is CUDA
+    BCastParamsToGPUs(startup_program);
+  }
+// The startup program has been run. All local scopes have correct parameters.
+// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
+// ncclOp
+#ifdef PADDLE_WITH_CUDA
+  details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
+                                           params, member_->local_scopes_,
+                                           member_->nccl_ctxs_.get());
+#else
+  details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
+                                           params, member_->local_scopes_);
+#endif
+  auto graph = builder.Build(main_program);
+  member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
+      num_threads, use_event, member_->local_scopes_, places,
+      std::move(graph)));
+  // Step 3. Create vars in each scope;
+  // The loop variable is named local_scope so that it does not shadow the
+  // `scope` parameter.
+  for (auto *local_scope : member_->local_scopes_) {
+    for (auto *var : main_program.Block(0).AllVars()) {
+      if (local_scope->FindVar(var->Name()) != nullptr) {
+        continue;
+      }
+      InitializeVariable(local_scope->Var(var->Name()), var->GetType());
+    }
+  }
+}
+// Broadcasts every LOD_TENSOR variable declared in block 0 of
+// `startup_program` from the first local scope to the scopes of all other
+// places with ncclBcast (root 0). Throws when not compiled with CUDA.
+void ParallelExecutor::BCastParamsToGPUs(
+    const ProgramDesc &startup_program) const {
+#ifdef PADDLE_WITH_CUDA
+  auto *main_scope = member_->local_scopes_[0];
+  for (auto *var_desc : startup_program.Block(0).AllVars()) {
+    if (var_desc->GetType() == proto::VarType::LOD_TENSOR) {
+      auto *main_var = main_scope->FindVar(var_desc->Name());
+      // A variable declared by the startup program is not guaranteed to be
+      // reachable from this scope; skip it instead of dereferencing nullptr.
+      if (main_var != nullptr) {
+        auto &main_tensor = main_var->Get<LoDTensor>();
+        ncclDataType_t data_type =
+            platform::ToNCCLDataType(main_tensor.type());
+        auto &dims = main_tensor.dims();
+        size_t numel = main_tensor.numel();
+        // Issue the per-place broadcasts as one NCCL group.
+        platform::NCCLGroupGuard guard;
+        for (size_t i = 0; i < member_->places_.size(); ++i) {
+          auto place = member_->places_[i];
+          void *buffer;
+          if (i == 0) {
+            // Place 0 is the broadcast root and sends from its own tensor.
+            buffer = const_cast<void *>(main_tensor.data<void>());
+          } else {
+            // Receivers get a tensor of identical shape and type.
+            auto local_scope = member_->local_scopes_[i];
+            auto *t =
+                local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
+            t->Resize(dims);
+            buffer = t->mutable_data(place, main_tensor.type());
+          }
+          auto &nccl_ctx = member_->nccl_ctxs_->at(place);
+          // NOTE(review): the ncclBcast return code is ignored here; consider
+          // checking it.
+          platform::dynload::ncclBcast(buffer, numel, data_type, 0,
+                                       nccl_ctx.comm_, nccl_ctx.stream());
+        }
+      }
+    }
+    member_->nccl_ctxs_->WaitAll();
+  }
+#else
+  PADDLE_THROW("Not compiled with CUDA");
+#endif
+}
+// Executes the graph, fetching `fetch_tensors`, and stores the fetched data
+// as a FeedFetchList in the global-scope variable `fetched_var_name`.
+void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
+                           const std::string &fetched_var_name) {
+  auto results = member_->executor_->Run(fetch_tensors);
+  // Look up (or create) the destination variable only after the run.
+  auto *dst_var = member_->global_scope_->Var(fetched_var_name);
+  auto *dst_list = dst_var->GetMutable<FeedFetchList>();
+  *dst_list = results;
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <future>
+#include <unordered_set>
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
+namespace paddle {
+namespace framework {
+class ParallelExecutorPrivate;
+// ParallelExecutor runs a main program over several places. Its constructor
+// runs the startup program, creates one child scope per place and builds the
+// dependency graph; Run() then executes that graph.
+class ParallelExecutor {
+  DISABLE_COPY_AND_ASSIGN(ParallelExecutor);
+ public:
+  // See parallel_executor.cc for how each argument is used. `places` is
+  // indexed at 0 by the implementation, so it is expected to be non-empty.
+  explicit ParallelExecutor(size_t num_threads, bool use_event,
+                            const std::vector<platform::Place>& places,
+                            const std::unordered_set<std::string>& params,
+                            const ProgramDesc& startup_program,
+                            const ProgramDesc& main_program,
+                            const std::string& loss_var_name, Scope* scope);
+  // Executes the graph and stores the data fetched for `fetch_tensors` in
+  // the global-scope variable named `fetched_var_name`.
+  void Run(const std::vector<std::string>& fetch_tensors,
+           const std::string& fetched_var_name = "fetched_var");
+ private:
+  // Implementation state, allocated with `new` in the constructor.
+  // NOTE(review): no destructor is declared, so this allocation appears never
+  // to be freed -- confirm and add a destructor (or use std::unique_ptr).
+  ParallelExecutorPrivate* member_;
+  // Broadcasts the startup program's LoD tensor variables to every place.
+  void BCastParamsToGPUs(const ProgramDesc& startup_program) const;
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/reader.cc
+++ b/paddle/fluid/framework/reader.cc
@@ -29,7 +29,7 @@ void FileReader::ReadNext(std::vector<LoDTensor> *out) {
    PADDLE_ENFORCE_EQ(actual.size(), expect.size());
    for (int j = 0; j < actual.size(); ++j) {
-      PADDLE_ENFORCE(actual[i] == expect[i] || expect[i] == -1);
+      //      PADDLE_ENFORCE(actual[i] == expect[i] || expect[i] == -1);
    }
  }
 }

--- a/paddle/fluid/framework/threadpool.h
+++ b/paddle/fluid/framework/threadpool.h
@@ -32,6 +32,8 @@ namespace framework {
 // number of threads.
 class ThreadPool {
 public:
+  explicit ThreadPool(int num_threads);
  using Task = std::packaged_task<std::unique_ptr<platform::EnforceNotMet>()>;
  // Returns the singleton of ThreadPool.
@@ -103,8 +105,6 @@ class ThreadPool {
  DISABLE_COPY_AND_ASSIGN(ThreadPool);
-  explicit ThreadPool(int num_threads);
  // If the task queue is empty and avaialbe is equal to the number of
  // threads, means that all tasks are completed.  Note: this function
  // is not thread-safe.  Returns true if all tasks are completed.

--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -79,7 +79,18 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) {
  // if size is 0.  We just make sure it does.
  if (size <= 0) return nullptr;
  void* p;
+  int prev_id;
+  cudaGetDevice(&prev_id);
+  if (prev_id != gpu_id_) {
+    cudaSetDevice(gpu_id_);
+  }
  cudaError_t result = cudaMalloc(&p, size);
+  if (prev_id != gpu_id_) {
+    cudaSetDevice(prev_id);
+  }
  if (result == cudaSuccess) {
    index = 0;
    gpu_alloc_size_ += size;

--- a/paddle/fluid/memory/detail/system_allocator.h
+++ b/paddle/fluid/memory/detail/system_allocator.h
@@ -43,6 +43,8 @@ class CPUAllocator : public SystemAllocator {
 #ifdef PADDLE_WITH_CUDA
 class GPUAllocator : public SystemAllocator {
 public:
+  explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {}
  virtual void* Alloc(size_t& index, size_t size);
  virtual void Free(void* p, size_t size, size_t index);
  virtual bool UseGpu() const;
@@ -50,6 +52,7 @@ class GPUAllocator : public SystemAllocator {
 private:
  size_t gpu_alloc_size_ = 0;
  size_t fallback_alloc_size_ = 0;
+  int gpu_id_;
 };
 class CUDAPinnedAllocator : public SystemAllocator {

--- a/paddle/fluid/memory/detail/system_allocator_test.cc
+++ b/paddle/fluid/memory/detail/system_allocator_test.cc
@@ -58,7 +58,7 @@ TEST(CPUAllocator, LockMem) {
 #ifdef PADDLE_WITH_CUDA
 TEST(GPUAllocator, Alloc) {
-  paddle::memory::detail::GPUAllocator a;
+  paddle::memory::detail::GPUAllocator a(0);
  TestAllocator(a, 2048);
  TestAllocator(a, 0);
 }

--- a/paddle/fluid/memory/memory.cc
+++ b/paddle/fluid/memory/memory.cc
@@ -71,7 +71,7 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
  }
  platform::SetDeviceId(gpu_id);
  if (!as[gpu_id]) {
-    as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator,
+    as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator(gpu_id),
                                    platform::GpuMinChunkSize(),
                                    platform::GpuMaxChunkSize());
    VLOG(10) << "\n\nNOTE: each GPU device use "

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -183,6 +183,8 @@ if(WITH_DISTRIBUTE)
    set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
    op_library(send_op DEPS ${DISTRIBUTE_DEPS})
    set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS})
+    set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
    set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
@@ -191,9 +193,9 @@ if(WITH_DISTRIBUTE)
    set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
    set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor)
+    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op listen_and_serv_op sum_op executor)
 else()
-    set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op send_vars_op send_barrier_op)
+    set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op)
 endif()
 op_library(cond_op DEPS framework_proto tensor net_op)

--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -260,6 +260,36 @@ $out = floor(x)$
  }
 };
+// Declares the proto of the cosine activation op: one input "X", one output
+// "Out", and the user-facing documentation string.
+class CosOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CosOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Cosine operator");
+    AddOutput("Out", "Output of Cosine operator");
+    AddComment(R"DOC(
+Cosine Activation Operator.
+$out = cos(x)$
+)DOC");
+  }
+};
+// Declares the proto of the sine activation op: one input "X", one output
+// "Out", and the user-facing documentation string.
+class SinOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SinOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Sine operator");
+    AddOutput("Out", "Output of Sine operator");
+    AddComment(R"DOC(
+Sine Activation Operator.
+$out = sin(x)$
+)DOC");
+  }
+};
 class RoundOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  RoundOpMaker(OpProto *proto, OpAttrChecker *op_checker)
@@ -561,6 +591,12 @@ REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad,
 REGISTER_OP(floor, ops::ActivationOp, ops::FloorOpMaker, floor_grad,
            ops::ActivationOpGrad);
+// Register the cos and sin ops together with their gradient ops (cos_grad,
+// sin_grad), using the shared ActivationOp / ActivationOpGrad classes.
+REGISTER_OP(cos, ops::ActivationOp, ops::CosOpMaker, cos_grad,
+            ops::ActivationOpGrad);
+REGISTER_OP(sin, ops::ActivationOp, ops::SinOpMaker, sin_grad,
+            ops::ActivationOpGrad);
 REGISTER_OP(round, ops::ActivationOp, ops::RoundOpMaker, round_grad,
            ops::ActivationOpGrad);

--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -331,6 +331,54 @@ struct FloorFunctor : public BaseActivationFunctor<T> {
  }
 };
+// Scalar functor returning sin(val); passed to unaryExpr by the sine/cosine
+// activation functors. Marked HOSTDEVICE.
+template <typename T>
+struct Sine {
+  HOSTDEVICE T operator()(const T& val) const { return sin(val); }
+};
+// Scalar functor returning cos(val); passed to unaryExpr by the sine/cosine
+// activation functors. Marked HOSTDEVICE.
+template <typename T>
+struct Cosine {
+  HOSTDEVICE T operator()(const T& val) const { return cos(val); }
+};
+// cosine'(x) = -sin(x)
+// Gradient of cos: writes dx = -dout * sin(x) on device `d`. The forward
+// output `out` is accepted but not used.
+template <typename T>
+struct CosGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = -dout * x.unaryExpr(Sine<T>());
+  }
+};
+// cosine(x) = cos(x)
+// Forward cosine activation: writes out = cos(x) element-wise on device `d`.
+template <typename T>
+struct CosFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Cosine<T>());
+  }
+};
+// sine'(x) = cos(x)
+// Gradient of sin: writes dx = dout * cos(x) on device `d`. The forward
+// output `out` is accepted but not used.
+template <typename T>
+struct SinGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * x.unaryExpr(Cosine<T>());
+  }
+};
+// sine(x) = sin(x)
+// Forward sine activation: writes out = sin(x) element-wise on device `d`.
+template <typename T>
+struct SinFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Sine<T>());
+  }
+};
 // round(x) = [x]
 template <typename T>
 struct RoundFunctor : public BaseActivationFunctor<T> {
@@ -782,6 +830,8 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
  __macro(abs, AbsFunctor, AbsGradFunctor);                          \
  __macro(ceil, CeilFunctor, ZeroGradFunctor);                       \
  __macro(floor, FloorFunctor, ZeroGradFunctor);                     \
+  __macro(cos, CosFunctor, CosGradFunctor);                          \
+  __macro(sin, SinFunctor, SinGradFunctor);                          \
  __macro(round, RoundFunctor, ZeroGradFunctor);                     \
  __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);     \
  __macro(log, LogFunctor, LogGradFunctor);                          \

--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -88,10 +88,13 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
  const auto ch = GetChannel(ep_val);
  framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {
+    // prepare input
    sendrecv::VariableMessage req;
    req.set_varname(var_name_val);
+    ::grpc::ByteBuffer buf;
+    RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
-    // varhandle
+    // var handle
    VarHandle var_h;
    var_h.ep = ep_val;
    var_h.scope = p_scope;
@@ -103,9 +106,6 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
    s->Prepare(var_h, time_out);
    s->response_call_back_ = ProcGetResponse;
-    ::grpc::ByteBuffer buf;
-    RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
    auto call = s->stub_g_.PrepareUnaryCall(
        s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
    call->StartCall();
@@ -117,6 +117,48 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
  return true;
 }
+// Queues an asynchronous request that serializes variable `in_var_name` from
+// `scope`, sends it to endpoint `ep`, and lets ProcGetResponse handle the
+// reply using a handle named `out_var_name`. Increments the pending-request
+// counter and always returns true.
+bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
+                                      const platform::DeviceContext& ctx,
+                                      const framework::Scope& scope,
+                                      const std::string& in_var_name,
+                                      const std::string& out_var_name,
+                                      int64_t time_out) {
+  // The work is handed to framework::Async, so the lambda captures its own
+  // copies of the strings and plain pointers to the context and the scope.
+  const std::string endpoint = ep;
+  const std::string src_name = in_var_name;
+  const std::string dst_name = out_var_name;
+  const platform::DeviceContext* dev_ctx = &ctx;
+  const framework::Scope* scope_ptr = &scope;
+  const auto channel = GetChannel(endpoint);
+  framework::Async([src_name, dst_name, endpoint, scope_ptr, dev_ctx, time_out,
+                    channel, this] {
+    // Request payload: the serialized input variable.
+    auto* src_var = scope_ptr->FindVar(src_name);
+    ::grpc::ByteBuffer request;
+    SerializeToByteBuffer(src_name, src_var, *dev_ctx, &request);
+    // Handle describing where the response should be stored.
+    VarHandle handle;
+    handle.ep = endpoint;
+    handle.scope = scope_ptr;
+    handle.name = dst_name;
+    handle.ctx = dev_ctx;
+    // Per-call processor; it is passed as the completion tag below.
+    GetProcessor* processor = new GetProcessor(channel);
+    processor->Prepare(handle, time_out);
+    processor->response_call_back_ = ProcGetResponse;
+    auto rpc = processor->stub_g_.PrepareUnaryCall(
+        processor->context_.get(), "/sendrecv.SendRecvService/GetVariable",
+        request, &cq_);
+    rpc->StartCall();
+    rpc->Finish(&processor->reply_, &processor->status_,
+                static_cast<void*>(processor));
+  });
+  ++req_count_;
+  return true;
+}
 void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
  const auto ch = GetChannel(ep);
@@ -204,7 +246,6 @@ std::shared_ptr<grpc::Channel> RPCClient::GetChannel(const std::string& ep) {
  }
  grpc::ChannelArguments args;
-  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 5000);
  args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE);
  args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
  args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());

--- a/paddle/fluid/operators/detail/grpc_client.h
+++ b/paddle/fluid/operators/detail/grpc_client.h
@@ -172,6 +172,13 @@ class RPCClient {
                        const std::string& var_name,
                        int64_t time_out = 600 * 1000);
+  bool AsyncPrefetchVariable(const std::string& ep,
+                             const platform::DeviceContext& ctx,
+                             const framework::Scope& scope,
+                             const std::string& in_var_name,
+                             const std::string& out_var_name,
+                             int64_t time_out = 600 * 1000);
  void AsyncSendBatchBarrier(const std::string& ep,
                             int64_t time_out = 600 * 1000);

--- a/paddle/fluid/operators/detail/send_recv.proto
+++ b/paddle/fluid/operators/detail/send_recv.proto
@@ -59,12 +59,12 @@ message VariableMessage {
  // lod details:
  int64 lod_level = 5;
  repeated LodData lod = 6;
+  // selected_rows height, aka. original dim0
+  int64 slr_height = 7;
  // tensor data
-  bytes serialized = 7;
+  bytes serialized = 8;
  // selected_rows data
-  bytes rows = 8;
+  bytes rows = 9;
 }
 message VoidMessage {}
-message TestMessage { int64 test_1 = 1; }
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
@@ -108,6 +108,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
        e.WriteUint64(VarMsg::kDimsFieldNumber, dim);
      }
      e.WriteUint64(VarMsg::kLodLevelFieldNumber, 0);
+      e.WriteUint64(VarMsg::kSlrHeightFieldNumber, slr->height());
      auto* tensor = slr->mutable_value();
      if (platform::is_gpu_place(ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA
@@ -154,7 +155,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
    ProtoEncodeHelper e2((char*)buf, 128);
    // NOTE: rows is of type int64_t
    size_t rows_memory_size =
-        slr->rows().capacity() * framework::SizeOfType(typeid(int64_t));
+        slr->rows().size() * framework::SizeOfType(typeid(int64_t));
    e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
    slices[2] = ::grpc::Slice(e2.size());
    memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());

--- a/paddle/fluid/operators/detail/sendrecvop_utils.h
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <sys/time.h>
 #include <iostream>
 #include <string>
 #include <vector>
@@ -35,6 +36,12 @@ namespace detail {
 #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
 #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
+// Returns the current wall-clock time from gettimeofday in milliseconds.
+// Declared inline because it is defined in a header.
+inline int64_t GetTimestamp() {
+  struct timeval tp;
+  gettimeofday(&tp, nullptr);
+  // Widen the seconds before multiplying so the product cannot overflow on
+  // platforms where time_t is 32 bits wide.
+  return static_cast<int64_t>(tp.tv_sec) * 1000 + tp.tv_usec / 1000;
+}
 typedef void (*DestroyCallback)(void*);
 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,

--- a/paddle/fluid/operators/detail/test_serde.cc
+++ b/paddle/fluid/operators/detail/test_serde.cc
@@ -40,14 +40,14 @@ void RunSerdeTestSelectedRows(platform::Place place) {
  // serialize var to ByteBuffer
  framework::Variable var;
  auto* slr = var.GetMutable<framework::SelectedRows>();
+  slr->set_height(1000);
  auto* tensor = slr->mutable_value();
  auto* rows = slr->mutable_rows();
-  tensor->Resize(framework::make_ddim({2, 10}));
+  tensor->Resize(framework::make_ddim({564, 128}));
  tensor->mutable_data<float>(place);
-  int tensor_numel = 2 * 10;
+  int tensor_numel = 564 * 128;
  math::set_constant(ctx, tensor, 32.7);
-  rows->push_back(3);
+  for (int i = 0; i < 564; ++i) rows->push_back(i);
-  rows->push_back(10);
  ::grpc::ByteBuffer msg;
  operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
@@ -64,6 +64,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
  sendrecv::VariableMessage varmsg;
  EXPECT_TRUE(varmsg.ParseFromString(tmp));
+  // deserialize bytebuffer
  EXPECT_EQ(varmsg.varname(), "myvar");
  EXPECT_EQ(varmsg.type(), 1);
@@ -74,8 +75,10 @@ void RunSerdeTestSelectedRows(platform::Place place) {
  for (int i = 0; i < tensor_numel; ++i) {
    EXPECT_FLOAT_EQ(tensor_data[i], 32.7);
  }
-  EXPECT_EQ(rows_data[0], 3);
+  for (int i = 0; i < 564; ++i) {
-  EXPECT_EQ(rows_data[1], 10);
+    EXPECT_EQ(rows_data[i], i);
+  }
  // deserialize zero-copy
  // framework::Variable var2;
  // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
@@ -104,8 +107,10 @@ void RunSerdeTestSelectedRows(platform::Place place) {
  for (int i = 0; i < tensor_numel; ++i) {
    EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
  }
-  EXPECT_EQ(rows_data2[0], 3);
+  for (int i = 0; i < rows2->size(); ++i) {
-  EXPECT_EQ(rows_data2[1], 10);
+    EXPECT_EQ(rows_data2[i], i);
+  }
+  EXPECT_EQ(slr2->height(), 1000);
 }
 void RunTestLodTensor(platform::Place place, int from_type = 0) {

--- a/paddle/fluid/operators/detail/variable_response.cc
+++ b/paddle/fluid/operators/detail/variable_response.cc
@@ -147,8 +147,13 @@ bool VariableResponse::CopySelectRowsTensorData(
    const platform::DeviceContext& ctx, framework::DDim& dims, int length) {
  auto var = scope_->FindVar(meta_.varname());
  auto* slr = var->GetMutable<framework::SelectedRows>();
+  slr->set_height(meta_.slr_height());
  auto* tensor = slr->mutable_value();
  tensor->Resize(dims);
+  PADDLE_ENFORCE_EQ(
+      tensor->numel(),
+      length / framework::SizeOfType(
+                   paddle::operators::detail::ToTypeIndex(meta_.data_type())));
  void* tensor_data = tensor->mutable_data(
      ctx.GetPlace(),
      paddle::operators::detail::ToTypeIndex(meta_.data_type()));
@@ -165,7 +170,8 @@ bool VariableResponse::CopySelectRowsData(
    const platform::DeviceContext& ctx, int length) {
  auto var = scope_->FindVar(meta_.varname());
  auto* slr = var->GetMutable<framework::SelectedRows>();
-  slr->mutable_rows()->resize(length / 8);  // int64
+  slr->mutable_rows()->resize(length /
+                              framework::SizeOfType(typeid(int64_t)));  // int64
  int64_t* rows_data = slr->mutable_rows()->data();
  // copy rows CPU data, GPU data will be copied lazily.
@@ -348,6 +354,14 @@ int VariableResponse::Parse(Source* source) {
        }
        break;
      }
+      case sendrecv::VariableMessage::kSlrHeightFieldNumber: {
+        uint64_t v = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
+          return tag;
+        }
+        meta_.set_slr_height(static_cast<int64_t>(v));
+        break;
+      }
      case sendrecv::VariableMessage::kSerializedFieldNumber: {
        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
                        meta_.type() == sendrecv::LOD_TENSOR) &&

--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -141,6 +141,7 @@ class ListenAndServOp : public framework::OperatorBase {
      // and this will still work.
      std::vector<std::future<void>> fs;
+      double ts = detail::GetTimestamp();
      // block0 contains only listen_and_serv op, start run from block1.
      for (int blkid = 1; blkid < num_blocks - 1; ++blkid) {
        fs.push_back(
@@ -162,6 +163,7 @@ class ListenAndServOp : public framework::OperatorBase {
          LOG(ERROR) << "run sub program error " << e.what();
        }
      }
+      VLOG(2) << "run all blocks spent (ms) " << detail::GetTimestamp() - ts;
      // Reset the received sparse variables, the sum operator would not
      // sum the input sparse variables which rows is empty at the next

--- a/paddle/fluid/operators/math/concat.h
+++ b/paddle/fluid/operators/math/concat.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/tensor.h"
 namespace paddle {

--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
@@ -19,8 +19,17 @@ namespace paddle {
 namespace operators {
 namespace math {
+// File-local shorthands for the framework tensor types and for the Eigen
+// vector/matrix views (row-major, Eigen::DenseIndex by default).
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename T>
-class MaxSeqPoolFunctor<platform::CPUDeviceContext, T> {
+class MaxSeqPoolFunctor {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::LoDTensor& input, framework::Tensor* output,
@@ -60,7 +69,7 @@ class MaxSeqPoolFunctor<platform::CPUDeviceContext, T> {
 };
 template <typename T>
-class MaxSeqPoolGradFunctor<platform::CPUDeviceContext, T> {
+class MaxSeqPoolGradFunctor {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& out_grad,
@@ -93,10 +102,101 @@ class MaxSeqPoolGradFunctor<platform::CPUDeviceContext, T> {
  }
 };
-template class MaxSeqPoolFunctor<platform::CPUDeviceContext, float>;
+template <typename T>
-template class MaxSeqPoolFunctor<platform::CPUDeviceContext, double>;
+class SequencePoolFunctor<platform::CPUDeviceContext, T> {
-template class MaxSeqPoolGradFunctor<platform::CPUDeviceContext, float>;
+ public:
-template class MaxSeqPoolGradFunctor<platform::CPUDeviceContext, double>;
+  /*
+   * Pools every sequence of `input` (delimited by its level-0 LoD) into one
+   * row of `output`.
+   *
+   * pooltype: "MAX", "AVERAGE", "SUM", "SQRT" (sum / sqrt(length)), "LAST" or
+   *           "FIRST"; any other value throws once a sequence is processed.
+   * index:    only used by "MAX" (max pool has index output); it must not be
+   *           null in that mode.
+   */
+  void operator()(const platform::CPUDeviceContext& context,
+                  const std::string pooltype, const framework::LoDTensor& input,
+                  framework::Tensor* output,
+                  framework::Tensor* index = nullptr) {
+    if (pooltype == "MAX") {
+      // MaxSeqPoolFunctor receives the index tensor, so reject the default
+      // nullptr here with a readable error.
+      PADDLE_ENFORCE_NOT_NULL(index,
+                              "index must be provided when pooltype is MAX");
+      math::MaxSeqPoolFunctor<T> max_pool;
+      max_pool(context, input, output, index);
+      return;
+    }
+    auto lod = input.lod()[0];
+    auto& place = *context.eigen_device();
+    // lod holds lod.size() offsets, i.e. lod.size() - 1 sequences.
+    const int seq_num = static_cast<int>(lod.size()) - 1;
+    if (seq_num <= 0) return;  // nothing to pool
+    // Number of elements per row; identical for every sequence, so it is
+    // computed once instead of once per iteration.
+    const int64_t w = input.numel() / input.dims()[0];
+    for (int i = 0; i < seq_num; ++i) {
+      Tensor in_t =
+          input.Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
+      Tensor out_t = output->Slice(i, i + 1);
+      // h: number of rows of the i-th sequence.
+      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
+      auto in_e = EigenMatrix<T>::From(in_t, framework::make_ddim({h, w}));
+      auto out_e = EigenVector<T>::Flatten(out_t);
+      if (pooltype == "AVERAGE") {
+        out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
+      } else if (pooltype == "SUM") {
+        out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}}));
+      } else if (pooltype == "SQRT") {
+        out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
+                              std::sqrt(static_cast<T>(h));
+      } else if (pooltype == "LAST") {
+        out_e.device(place) = in_e.chip(h - 1, 0);
+      } else if (pooltype == "FIRST") {
+        out_e.device(place) = in_e.chip(0, 0);
+      } else {
+        PADDLE_THROW("unsupported pooling pooltype");
+      }
+    }
+  }
+};
+// CPU backward pass of sequence pooling: spreads each row of `out_grad` back
+// over the rows of the matching sequence in `in_grad`, where sequences are
+// delimited by in_grad's level-0 LoD.
+template <typename T>
+class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
+ public:
+  // pooltype: "MAX", "AVERAGE", "SUM", "SQRT", "LAST" or "FIRST"; any other
+  //           value throws once a sequence is processed.
+  // index:    only read for "MAX", where it must not be null.
+  void operator()(const platform::CPUDeviceContext& context,
+                  const std::string pooltype, const framework::Tensor& out_grad,
+                  framework::LoDTensor* in_grad,
+                  /* max pool has index */
+                  const framework::Tensor* index = nullptr) {
+    if (pooltype == "MAX") {
+      // *index is dereferenced below, so reject the default nullptr with a
+      // readable error instead of crashing.
+      PADDLE_ENFORCE_NOT_NULL(index,
+                              "index must be provided when pooltype is MAX");
+      math::MaxSeqPoolGradFunctor<T> max_pool_grad;
+      max_pool_grad(context, out_grad, *index, in_grad);
+      return;
+    }
+    if (pooltype == "LAST" || pooltype == "FIRST") {
+      // set X@Grad be zero at first when pooltype is LAST/FIRST
+      math::SetConstant<platform::CPUDeviceContext, T> functor;
+      functor(context, in_grad, 0);
+    }
+    auto lod = in_grad->lod()[0];
+    auto& place = *context.eigen_device();
+    // lod holds lod.size() offsets, i.e. lod.size() - 1 sequences.
+    const int seq_num = static_cast<int>(lod.size()) - 1;
+    if (seq_num <= 0) return;  // no sequence to fill
+    // Number of elements per row; identical for every sequence, so it is
+    // computed once instead of once per iteration.
+    const int64_t w = in_grad->numel() / in_grad->dims()[0];
+    for (int i = 0; i < seq_num; ++i) {
+      auto in_g_t = in_grad->Slice(static_cast<int>(lod[i]),
+                                   static_cast<int>(lod[i + 1]));
+      auto out_g_t = out_grad.Slice(i, i + 1);
+      // h: number of rows of the i-th sequence.
+      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
+      auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
+      auto out_g_e = EigenMatrix<T>::From(out_g_t, {1, w});
+      auto out_g_e_v = EigenVector<T>::Flatten(out_g_t);
+      // Repeat the single gradient row h times along the row axis.
+      Eigen::DSizes<int, 2> bcast(h, 1);
+      if (pooltype == "AVERAGE") {
+        in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
+      } else if (pooltype == "SUM") {
+        in_g_e.device(place) = (out_g_e).broadcast(bcast);
+      } else if (pooltype == "SQRT") {
+        in_g_e.device(place) =
+            (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
+      } else if (pooltype == "LAST") {
+        in_g_e.chip(h - 1, 0).device(place) = out_g_e_v;
+      } else if (pooltype == "FIRST") {
+        in_g_e.chip(0, 0).device(place) = out_g_e_v;
+      } else {
+        PADDLE_THROW("unsupported pooling pooltype");
+      }
+    }
+  }
+};
+template class SequencePoolFunctor<platform::CPUDeviceContext, float>;
+template class SequencePoolFunctor<platform::CPUDeviceContext, double>;
+template class SequencePoolGradFunctor<platform::CPUDeviceContext, float>;
+template class SequencePoolGradFunctor<platform::CPUDeviceContext, double>;
 }  // namespace math
 }  // namespace operators

--- a/paddle/fluid/operators/math/sequence_pooling.cu
+++ b/paddle/fluid/operators/math/sequence_pooling.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence_pooling.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 namespace paddle {
 namespace operators {
@@ -22,113 +23,331 @@ namespace math {
 #define FLT_MAX __FLT_MAX__
 template <typename T>
-__global__ void KeMaxSequencePool(const T* input, const size_t* starts,
+struct MaxPoolFunctor {
-                                  T* output, int* index, int64_t num_seq,
+  HOSTDEVICE void operator()(const T* input, const size_t start,
-                                  int64_t dim) {
+                             const size_t end, const size_t item_dim, T* output,
-  int dim_idx = threadIdx.x;
+                             int* index) {
-  int seq_id = blockIdx.x;
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
-  if (seq_id >= num_seq) return;
+      T max_val = static_cast<T>(-FLT_MAX);
-  size_t start = starts[seq_id];
+      int max_index = -1;
-  size_t end = starts[seq_id + 1];
+      for (int i = start; i < end; ++i) {
+        if (max_val < input[item_dim * i + tid]) {
-  for (int64_t i = dim_idx; i < dim; i += blockDim.x) {
+          max_val = input[item_dim * i + tid];
-    T max_val = static_cast<T>(-FLT_MAX);
+          max_index = i;
-    int max_id = -1;
+        }
-    for (size_t step_id = start; step_id < end; step_id++) {
-      if (max_val < input[step_id * dim + i]) {
-        max_val = input[step_id * dim + i];
-        max_id = step_id;
      }
+      output[tid] = max_val;
+      index[tid] = max_index;
    }
-    output[seq_id * dim + i] = max_val;
-    index[seq_id * dim + i] = max_id;
  }
-}
+};
 template <typename T>
-class MaxSeqPoolFunctor<platform::CUDADeviceContext, T> {
+struct AvgPoolFunctor {
- public:
+  HOSTDEVICE void operator()(const T* input, const size_t start,
-  void operator()(const platform::CUDADeviceContext& context,
+                             const size_t end, const size_t item_dim, T* output,
-                  const framework::LoDTensor& input, framework::Tensor* output,
+                             int* index) {
-                  framework::Tensor* index) {
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
-    auto in_dims = input.dims();
+      T val = static_cast<T>(0);
-    auto out_dims = output->dims();
+      for (int i = start; i < end; ++i) {
-    auto idx_dims = index->dims();
+        val += input[item_dim * i + tid];
-    PADDLE_ENFORCE_GT(in_dims.size(), static_cast<int64_t>(1));
+      }
-    PADDLE_ENFORCE_GT(out_dims.size(), 1);
+      // end, start is lod, so end - start != 0
-    for (int64_t i = 1; i < in_dims.size(); ++i) {
+      output[tid] = val / static_cast<T>(end - start);
-      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
    }
-    PADDLE_ENFORCE_EQ(idx_dims, out_dims);
+  }
+};
-    auto starts = input.lod()[0];
+template <typename T>
-    const T* in_data = input.data<T>();
+struct SumPoolFunctor {
-    T* out_data = output->data<T>();
+  HOSTDEVICE void operator()(const T* input, const size_t start,
-    int* max_index = index->data<int>();
+                             const size_t end, const size_t item_dim, T* output,
+                             int* index) {
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
+      T val = static_cast<T>(0);
+      for (int i = start; i < end; ++i) {
+        val += input[item_dim * i + tid];
+      }
+      output[tid] = val;
+    }
+  }
+};
-    int64_t num_seq = out_dims[0];
+template <typename T>
-    int64_t dim = output->numel() / num_seq;
+// Range op for "SQRT" pooling: each thread handles the columns tid,
+// tid + blockDim.x, ... and writes the column sum over rows [start, end)
+// divided by sqrt(end - start) to output[tid]. `index` is not used.
+struct SqrtPoolFunctor {
+  HOSTDEVICE void operator()(const T* input, const size_t start,
+                             const size_t end, const size_t item_dim, T* output,
+                             int* index) {
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
+      T val = static_cast<T>(0);
+      for (int i = start; i < end; ++i) {
+        val += input[item_dim * i + tid];
+      }
+      // end, start is lod, so end - start != 0
+      // Cast the length to T first so the square root and the division stay
+      // in T, matching SqrtPoolGradFunctor.
+      output[tid] = val / sqrt(static_cast<T>(end - start));
+    }
+  }
+};
-    dim3 threads(256, 1);
+template <typename T>
-    dim3 grid(num_seq, 1);
+struct LastPoolFunctor {
-    auto stream = context.stream();
+  HOSTDEVICE void operator()(const T* input, const size_t start,
-    KeMaxSequencePool<T><<<grid, threads, 0, stream>>>(
+                             const size_t end, const size_t item_dim, T* output,
-        in_data, starts.CUDAData(context.GetPlace()), out_data, max_index,
+                             int* index) {
-        num_seq, dim);
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
+      output[tid] = input[item_dim * (end - 1) + tid];
+    }
  }
 };
 template <typename T>
-__global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index,
+struct FirstPoolFunctor {
-                                      T* in_grad, int64_t num_seq,
+  HOSTDEVICE void operator()(const T* input, const size_t start,
-                                      int64_t dim) {
+                             const size_t end, const size_t item_dim, T* output,
-  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+                             int* index) {
-  int col_idx = idx % dim;
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
-  if (idx < num_seq * dim) {
+      output[tid] = input[item_dim * start + tid];
-    int step_id = max_index[idx];
+    }
-    in_grad[step_id * dim + col_idx] = out_grad[idx];
  }
+};
+// Kernel wrapper for the forward pooling functors: block `bid` hands
+// sequence `bid`, i.e. input rows [lod[bid], lod[bid + 1]), to `op` together
+// with output row `bid`. `index` may be null; when it is not, `op` receives
+// the matching row of it.
+template <typename T, typename Range_OP>
+__global__ void sequence_pool_kernel(Range_OP op, const T* input,
+                                     const size_t* lod, const size_t lod_size,
+                                     const size_t item_dim, T* output,
+                                     int* index) {
+  const size_t bid = blockIdx.x;
+  // lod holds lod_size offsets, i.e. lod_size - 1 sequences. Written as an
+  // addition so that lod_size == 0 cannot wrap around to a huge bound.
+  if (bid + 1 >= lod_size) return;
+  size_t start = lod[bid];
+  size_t end = lod[bid + 1];
+  int* index_offset = nullptr;
+  if (index != nullptr) {
+    index_offset = &index[bid * item_dim];
+  }
+  op(input, start, end, item_dim, &output[bid * item_dim], index_offset);
 }
 template <typename T>
-class MaxSeqPoolGradFunctor<platform::CUDADeviceContext, T> {
+class SequencePoolFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& out_grad,
+                  const std::string pooltype, const framework::LoDTensor& input,
-                  const framework::Tensor& index,
+                  framework::Tensor* output,
-                  framework::LoDTensor* in_grad) {
+                  framework::Tensor* index = nullptr) {
-    auto og_dims = out_grad.dims();
+    auto lod = input.lod()[0];
-    auto idx_dims = index.dims();
+    const size_t item_dim = output->numel() / output->dims()[0];
-    auto ig_dims = in_grad->dims();
+    dim3 threads(1024, 1);
-    PADDLE_ENFORCE_GT(og_dims.size(), static_cast<int64_t>(1));
+    dim3 grid(lod.size(), 1);
-    PADDLE_ENFORCE_GT(ig_dims.size(), static_cast<int64_t>(1));
+    if (pooltype == "MAX") {
-    for (int64_t i = 1; i < og_dims.size(); ++i) {
+      sequence_pool_kernel<
-      PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
+          T, MaxPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          MaxPoolFunctor<T>(), input.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          output->mutable_data<T>(context.GetPlace()), index->data<int>());
+    } else if (pooltype == "AVERAGE") {
+      sequence_pool_kernel<
+          T, AvgPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          AvgPoolFunctor<T>(), input.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          output->mutable_data<T>(context.GetPlace()), nullptr);
+    } else if (pooltype == "SUM") {
+      sequence_pool_kernel<
+          T, SumPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          SumPoolFunctor<T>(), input.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          output->mutable_data<T>(context.GetPlace()), nullptr);
+    } else if (pooltype == "SQRT") {
+      sequence_pool_kernel<
+          T, SqrtPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          SqrtPoolFunctor<T>(), input.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          output->mutable_data<T>(context.GetPlace()), nullptr);
+    } else if (pooltype == "LAST") {
+      sequence_pool_kernel<
+          T, LastPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          LastPoolFunctor<T>(), input.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          output->mutable_data<T>(context.GetPlace()), nullptr);
+    } else if (pooltype == "FIRST") {
+      sequence_pool_kernel<
+          T, FirstPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          FirstPoolFunctor<T>(), input.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          output->mutable_data<T>(context.GetPlace()), nullptr);
+    } else {
+      PADDLE_THROW("unsupported pooling pooltype");
    }
-    PADDLE_ENFORCE_EQ(idx_dims, og_dims);
+  }
+};
-    const T* og_data = out_grad.data<T>();
+template <typename T>
-    const int* max_index = index.data<int>();
+struct MaxPoolGradFunctor {
-    T* ig_data = in_grad->data<T>();
+  HOSTDEVICE void operator()(const T* out_grad, const size_t start,
+                             const size_t end, const size_t item_dim,
+                             T* in_grad, const int* index) {
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
+      for (int i = start; i < end; ++i) {
+        if (i == index[tid]) {
+          in_grad[item_dim * i + tid] = out_grad[tid];
+        } else {
+          in_grad[item_dim * i + tid] = static_cast<T>(0);
+        }
+      }
+    }
+  }
+};
-    SetConstant<platform::CUDADeviceContext, T> set_zero;
+template <typename T>
-    set_zero(context, in_grad, static_cast<T>(0.0));
+struct AvgPoolGradFunctor {
-    int64_t num_seq = og_dims[0];
+  HOSTDEVICE void operator()(const T* out_grad, const size_t start,
-    int64_t dim = out_grad.numel() / num_seq;
+                             const size_t end, const size_t item_dim,
+                             T* in_grad, const int* index) {
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
+      for (int i = start; i < end; ++i) {
+        in_grad[item_dim * i + tid] = out_grad[tid] / (end - start);
+      }
+    }
+  }
+};
-    unsigned int blocks = (num_seq * dim + 128 - 1) / 128;
+template <typename T>
-    dim3 threads(128, 1);
+struct SumPoolGradFunctor {
-    dim3 grid(blocks, 1);
+  HOSTDEVICE void operator()(const T* out_grad, const size_t start,
-    auto stream = context.stream();
+                             const size_t end, const size_t item_dim,
-    KeMaxSequencePoolGrad<T><<<grid, threads, 0, stream>>>(
+                             T* in_grad, const int* index) {
-        og_data, max_index, ig_data, num_seq, dim);
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
+      for (int i = start; i < end; ++i) {
+        in_grad[item_dim * i + tid] = out_grad[tid];
+      }
+    }
+  }
+};
+// Range op for the gradient of "SQRT" pooling: every row in [start, end) of
+// in_grad receives out_grad divided by sqrt(end - start), column by column.
+// `index` is not used.
+template <typename T>
+struct SqrtPoolGradFunctor {
+  HOSTDEVICE void operator()(const T* out_grad, const size_t start,
+                             const size_t end, const size_t item_dim,
+                             T* in_grad, const int* index) {
+    // Same divisor for every element written by this call.
+    const T norm = sqrt(static_cast<T>(end - start));
+    for (int col = threadIdx.x; col < item_dim; col += blockDim.x) {
+      for (int row = start; row < end; ++row) {
+        in_grad[item_dim * row + col] = out_grad[col] / norm;
+      }
+    }
+  }
+};
+// Range op for the gradient of "LAST" pooling: the last row of the sequence
+// (row end - 1) receives out_grad, every other row in [start, end) is set to
+// zero. `index` is not used.
+template <typename T>
+struct LastPoolGradFunctor {
+  HOSTDEVICE void operator()(const T* out_grad, const size_t start,
+                             const size_t end, const size_t item_dim,
+                             T* in_grad, const int* index) {
+    for (int col = threadIdx.x; col < item_dim; col += blockDim.x) {
+      for (int row = start; row < end; ++row) {
+        in_grad[item_dim * row + col] =
+            (row == end - 1) ? out_grad[col] : static_cast<T>(0);
+      }
+    }
+  }
+};
+// Range op for the gradient of "FIRST" pooling: the first row of the
+// sequence (row start) receives out_grad, every other row in [start, end) is
+// set to zero. `index` is not used.
+template <typename T>
+struct FirstPoolGradFunctor {
+  HOSTDEVICE void operator()(const T* out_grad, const size_t start,
+                             const size_t end, const size_t item_dim,
+                             T* in_grad, const int* index) {
+    for (int col = threadIdx.x; col < item_dim; col += blockDim.x) {
+      for (int row = start; row < end; ++row) {
+        in_grad[item_dim * row + col] =
+            (row == start) ? out_grad[col] : static_cast<T>(0);
+      }
+    }
+  }
+};
+// Kernel wrapper for the backward pooling functors: block `bid` hands
+// out_grad row `bid` and the row range [lod[bid], lod[bid + 1]) to `op`,
+// which writes into `in_grad`. `index` may be null; when it is not, `op`
+// receives the matching row of it.
+template <typename T, typename Range_OP>
+__global__ void sequence_pool_grad_kernel(Range_OP op, const T* out_grad,
+                                          const size_t* lod,
+                                          const size_t lod_size,
+                                          const size_t item_dim, T* in_grad,
+                                          const int* index) {
+  const size_t bid = blockIdx.x;
+  // lod holds lod_size offsets, i.e. lod_size - 1 sequences. Written as an
+  // addition so that lod_size == 0 cannot wrap around to a huge bound.
+  if (bid + 1 >= lod_size) return;
+  size_t start = lod[bid];
+  size_t end = lod[bid + 1];
+  const int* index_offset = nullptr;
+  if (index != nullptr) {
+    index_offset = &index[bid * item_dim];
+  }
+  op(&out_grad[bid * item_dim], start, end, item_dim, in_grad, index_offset);
+}
+// CUDA backward pass of sequence pooling. Picks the gradient functor that
+// matches `pooltype` and launches sequence_pool_grad_kernel with lod.size()
+// blocks of 1024 threads on the context's stream.
+template <typename T>
+class SequencePoolGradFunctor<platform::CUDADeviceContext, T> {
+ public:
+  // pooltype: "MAX", "AVERAGE", "SUM", "SQRT", "LAST" or "FIRST"; any other
+  //           value throws.
+  // index:    only read for "MAX", where it must not be null.
+  void operator()(const platform::CUDADeviceContext& context,
+                  const std::string pooltype, const framework::Tensor& out_grad,
+                  framework::LoDTensor* in_grad,
+                  /* max pool has index */
+                  const framework::Tensor* index = nullptr) {
+    auto lod = in_grad->lod()[0];
+    // Number of elements per row of in_grad.
+    const size_t item_dim = in_grad->numel() / in_grad->dims()[0];
+    dim3 threads(1024, 1);
+    dim3 grid(lod.size(), 1);
+    if (pooltype == "MAX") {
+      // index->data<int>() is called below, so reject the default nullptr
+      // with a readable error instead of crashing.
+      PADDLE_ENFORCE_NOT_NULL(index,
+                              "index must be provided when pooltype is MAX");
+      sequence_pool_grad_kernel<
+          T, MaxPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          MaxPoolGradFunctor<T>(), out_grad.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          in_grad->mutable_data<T>(context.GetPlace()), index->data<int>());
+    } else if (pooltype == "AVERAGE") {
+      sequence_pool_grad_kernel<
+          T, AvgPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          AvgPoolGradFunctor<T>(), out_grad.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
+    } else if (pooltype == "SUM") {
+      sequence_pool_grad_kernel<
+          T, SumPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          SumPoolGradFunctor<T>(), out_grad.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
+    } else if (pooltype == "SQRT") {
+      sequence_pool_grad_kernel<
+          T, SqrtPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          SqrtPoolGradFunctor<T>(), out_grad.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
+    } else if (pooltype == "LAST") {
+      sequence_pool_grad_kernel<
+          T, LastPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          LastPoolGradFunctor<T>(), out_grad.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
+    } else if (pooltype == "FIRST") {
+      sequence_pool_grad_kernel<
+          T, FirstPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          FirstPoolGradFunctor<T>(), out_grad.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
+    } else {
+      PADDLE_THROW("unsupported pooling pooltype");
+    }
   }
 };
-template class MaxSeqPoolFunctor<platform::CUDADeviceContext, float>;
+// sequence pooling
-template class MaxSeqPoolFunctor<platform::CUDADeviceContext, double>;
+template class SequencePoolFunctor<platform::CUDADeviceContext, float>;
-template class MaxSeqPoolGradFunctor<platform::CUDADeviceContext, float>;
+template class SequencePoolFunctor<platform::CUDADeviceContext, double>;
-template class MaxSeqPoolGradFunctor<platform::CUDADeviceContext, double>;
+template class SequencePoolGradFunctor<platform::CUDADeviceContext, float>;
+template class SequencePoolGradFunctor<platform::CUDADeviceContext, double>;
 }  // namespace math
 }  // namespace operators

--- a/paddle/fluid/operators/math/sequence_pooling.h
+++ b/paddle/fluid/operators/math/sequence_pooling.h
@@ -21,23 +21,23 @@ namespace paddle {
 namespace operators {
 namespace math {
-#define FLT_MAX __FLT_MAX__
 template <typename DeviceContext, typename T>
-class MaxSeqPoolFunctor {
+class SequencePoolFunctor {
 public:
-  void operator()(const DeviceContext& context,
+  /* max pool has index output */
+  void operator()(const DeviceContext& context, const std::string pooltype,
                  const framework::LoDTensor& input, framework::Tensor* output,
-                  framework::Tensor* index);
+                  framework::Tensor* index = nullptr);
 };
-template <typename DeviceContext, class T>
+template <typename DeviceContext, typename T>
-class MaxSeqPoolGradFunctor {
+class SequencePoolGradFunctor {
 public:
-  void operator()(const DeviceContext& context,
+  void operator()(const DeviceContext& context, const std::string pooltype,
                  const framework::Tensor& out_grad,
-                  const framework::Tensor& index,
+                  framework::LoDTensor* in_grad,
-                  framework::LoDTensor* in_grad);
+                  /* max pool has index */
+                  const framework::Tensor* index = nullptr);
 };
 }  // namespace math

--- a/paddle/fluid/operators/prefetch_op.cc
+++ b/paddle/fluid/operators/prefetch_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <future>
+#include <ostream>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/send_recv_util.h"
+namespace paddle {
+namespace operators {
+// Operator that issues one AsyncPrefetchVariable call per initialized input
+// in "X" through the RPC client stored in the "RPCClient" output variable,
+// pairing ins[i] with epmap[i] and outs[i], and then waits for the client.
+class PrefetchOp : public framework::OperatorBase {
+ public:
+  PrefetchOp(const std::string& type, const framework::VariableNameMap& inputs,
+             const framework::VariableNameMap& outputs,
+             const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    auto ins = Inputs("X");
+    auto outs = Outputs("Out");
+    std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
+    // ins[i], outs[i] and epmap[i] are indexed in lock-step below, so fail
+    // loudly instead of reading past the end of a shorter list.
+    PADDLE_ENFORCE_GE(outs.size(), ins.size(),
+                      "Every input of prefetch op needs an output.");
+    PADDLE_ENFORCE_GE(epmap.size(), ins.size(),
+                      "Every input of prefetch op needs an endpoint in epmap.");
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& ctx = *pool.Get(place);
+    auto client_var_name = Output("RPCClient");
+    // Look the client variable up once and validate the result.
+    auto* client_var = scope.FindVar(client_var_name);
+    PADDLE_ENFORCE_NOT_NULL(client_var,
+                            "Can not find variable '%s' in the scope.",
+                            client_var_name);
+    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
+    for (size_t i = 0; i < ins.size(); i++) {
+      if (NeedSend(scope, ins[i])) {
+        VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
+                << outs[i] << " back";
+        rpc_client->AsyncPrefetchVariable(epmap[i], ctx, scope, ins[i],
+                                          outs[i]);
+      } else {
+        VLOG(3) << "don't send no-initialied variable: " << ins[i];
+      }
+    }
+    // Block until the client reports that the issued requests are done.
+    PADDLE_ENFORCE(rpc_client->Wait());
+  }
+};
+// Declares the interface of the prefetch operator: duplicable input "X",
+// outputs "RPCClient" and duplicable "Out", and the "epmap" attribute.
+class PrefetchOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  PrefetchOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensor) Input Id variables to be sent").AsDuplicable();
+    // Adjacent string literals are concatenated verbatim, so each fragment
+    // that continues a sentence ends with a space.
+    AddOutput("RPCClient",
+              "(RPCClient) The RPC client object which will be "
+              "initialized at most once.");
+    AddOutput("Out",
+              "(SelectedRows) result "
+              "to be fetched from parameter server")
+        .AsDuplicable();
+    AddAttr<std::vector<std::string>>(
+        "epmap",
+        "(string vector, default 127.0.0.1:6164) "
+        "Server endpoints in the order of input variables for mapping")
+        .SetDefault({"127.0.0.1:6164"});
+    AddComment(R"DOC(
+Prefetch operator
+This operator will send Ids variables to listen_and_serv op at
+the parameter server and fetch result back.
+)DOC");
+  }
+};
+// Variable type inference for the prefetch operator: marks the first
+// variable bound to the "RPCClient" output as RAW, creating it if necessary.
+class PrefetchOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    const auto client_name = op_desc.Output("RPCClient").front();
+    block->FindRecursiveOrCreateVar(client_name)
+        .SetType(framework::proto::VarType::RAW);
+  }
+};
+// Shape inference for the prefetch operator: intentionally a no-op.
+class PrefetchOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* /*ctx*/) const override {}
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(prefetch, ops::PrefetchOp,
+                  paddle::framework::EmptyGradOpMaker, ops::PrefetchOpMaker,
+                  ops::PrefetchOpVarTypeInference,
+                  ops::PrefetchOpShapeInference);
--- a/paddle/fluid/operators/read_op.cc
+++ b/paddle/fluid/operators/read_op.cc
@@ -14,6 +14,7 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
 namespace paddle {
 namespace operators {
@@ -59,7 +60,9 @@ class ReadOp : public framework::OperatorBase {
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    framework::ReaderHolder* reader =
-        scope.FindVar(Input("Reader"))->GetMutable<framework::ReaderHolder>();
+        detail::Ref(scope.FindVar(Input("Reader")),
+                    "Cannot find reader variable %s", Input("Reader"))
+            .GetMutable<framework::ReaderHolder>();
    std::vector<std::string> out_arg_names = Outputs("Out");
    std::vector<framework::LoDTensor> ins;
    reader->ReadNext(&ins);

--- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
@@ -81,10 +81,10 @@ class CreateMultiPassReaderOpMaker : public DecoratedReaderMakerBase {
      This operator creates a multi-pass reader. A multi-pass reader 
      is used to yield data for several pass training continuously. 
-      It takes the the number of pass to run as one of its attributes
+      It takes the number of passes to run as one of its attributes
      ('pass_num'), and maintains a pass counter to record how many 
-      passes it has completed. When the underlying reader reach the EOF, 
+      passes it has completed. When the underlying reader reaches the 
-      the multi-pass reader checks whether it has completed training 
+      EOF, the multi-pass reader checks whether it has completed training 
      of the given number of pass. If not, the underlying reader will 
      be re-initialized and starts a new pass automatically.
    )DOC");

--- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
@@ -12,12 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include <mutex>
+#include <thread>
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 #include "paddle/fluid/recordio/scanner.h"
 namespace paddle {
 namespace operators {
 namespace reader {
+template <bool ThreadSafe>
 class RecordIOFileReader : public framework::FileReader {
 public:
  explicit RecordIOFileReader(const std::string& filename,
@@ -25,7 +28,12 @@ class RecordIOFileReader : public framework::FileReader {
      : FileReader(dims),
        scanner_(filename),
        dev_ctx_(*platform::DeviceContextPool::Instance().Get(
-            platform::CPUPlace())) {}
+            platform::CPUPlace())) {
+    if (ThreadSafe) {
+      mutex_.reset(new std::mutex());
+    }
+    LOG(INFO) << "Creating file reader" << filename;
+  }
  bool HasNext() const override { return scanner_.HasNext(); }
@@ -33,10 +41,16 @@ class RecordIOFileReader : public framework::FileReader {
 protected:
  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
-    *out = framework::ReadFromRecordIO(scanner_, dev_ctx_);
+    if (ThreadSafe) {
+      std::lock_guard<std::mutex> guard(*mutex_);
+      *out = framework::ReadFromRecordIO(scanner_, dev_ctx_);
+    } else {
+      *out = framework::ReadFromRecordIO(scanner_, dev_ctx_);
+    }
  }
 private:
+  std::unique_ptr<std::mutex> mutex_;
  recordio::Scanner scanner_;
  const platform::DeviceContext& dev_ctx_;
 };
@@ -59,8 +73,9 @@ class CreateRecordIOReaderOp : public framework::OperatorBase {
    auto* out = scope.FindVar(Output("Out"))
                    ->template GetMutable<framework::ReaderHolder>();
-    out->Reset(
-        new RecordIOFileReader(filename, RestoreShapes(shape_concat, ranks)));
+    out->Reset(new RecordIOFileReader<true>(
+        filename, RestoreShapes(shape_concat, ranks)));
  }
 };
@@ -87,4 +102,4 @@ REGISTER_FILE_READER_OPERATOR(create_recordio_file_reader,
                              reader::CreateRecordIOReaderOp,
                              reader::CreateRecordIOReaderOpMaker);
-REGISTER_FILE_READER(recordio, reader::RecordIOFileReader);
+REGISTER_FILE_READER(recordio, reader::RecordIOFileReader<false>);
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -12,35 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <future>
 #include <ostream>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include <future>
 #include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/send_recv_util.h"
 #include "paddle/fluid/platform/profiler.h"
 namespace paddle {
 namespace operators {
-static bool NeedSend(const framework::Scope& scope,
-                     const std::string& varname) {
-  auto* var = scope.FindVar(varname);
-  PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
-                          varname);
-  if (var->IsType<framework::LoDTensor>()) {
-    return var->Get<framework::LoDTensor>().IsInitialized();
-  } else if (var->IsType<framework::SelectedRows>()) {
-    return var->Get<framework::SelectedRows>().rows().size() > 0UL;
-  } else {
-    PADDLE_THROW(
-        "Variable type in send side should be in "
-        "[LodTensor, SelectedRows]");
-  }
-  return false;
-}
 class SendOp : public framework::OperatorBase {
 public:
@@ -72,7 +56,7 @@ class SendOp : public framework::OperatorBase {
    for (size_t i = 0; i < ins.size(); i++) {
      if (NeedSend(scope, ins[i])) {
-        VLOG(2) << "sending " << ins[i] << " to " << epmap[i];
+        VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
        rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
      } else {
        VLOG(3) << "don't send no-initialied variable: " << ins[i];
@@ -81,7 +65,7 @@ class SendOp : public framework::OperatorBase {
    PADDLE_ENFORCE(rpc_client->Wait());
    for (auto& ep : endpoints) {
-      VLOG(2) << "batch barrier, ep: " << ep;
+      VLOG(3) << "batch barrier, ep: " << ep;
      rpc_client->AsyncSendBatchBarrier(ep);
    }
    PADDLE_ENFORCE(rpc_client->Wait());

--- a/paddle/fluid/operators/send_recv_util.h
+++ b/paddle/fluid/operators/send_recv_util.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/selected_rows.h"
+namespace paddle {
+namespace operators {
+// Decides whether the variable `varname` in `scope` holds anything worth
+// sending:
+//   - LoDTensor:    true iff the tensor reports IsInitialized().
+//   - SelectedRows: true iff it contains at least one row.
+// A missing variable fails PADDLE_ENFORCE_NOT_NULL; any other variable type
+// is rejected through PADDLE_THROW (the trailing `return false` only exists
+// to give every control path a return statement).
+// NOTE(review): framework::Scope and the PADDLE_* macros are not included
+// here. Both current includers pull in op_registry.h first, which presumably
+// provides them -- confirm before including this header from anywhere else.
+inline bool NeedSend(const framework::Scope& scope,
+                     const std::string& varname) {
+  auto* var = scope.FindVar(varname);
+  PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
+                          varname);
+  if (var->IsType<framework::LoDTensor>()) {
+    return var->Get<framework::LoDTensor>().IsInitialized();
+  } else if (var->IsType<framework::SelectedRows>()) {
+    return var->Get<framework::SelectedRows>().rows().size() > 0UL;
+  } else {
+    PADDLE_THROW(
+        "Variable type in send side should be in "
+        "[LodTensor, SelectedRows]");
+  }
+  return false;
+}
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/send_vars_op.cc
+++ b/paddle/fluid/operators/send_vars_op.cc
@@ -12,34 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <future>
 #include <ostream>
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include <future>
 #include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/send_recv_util.h"
 namespace paddle {
 namespace operators {
-static bool NeedSend(const framework::Scope& scope,
-                     const std::string& varname) {
-  auto* var = scope.FindVar(varname);
-  PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
-                          varname);
-  if (var->IsType<framework::LoDTensor>()) {
-    return var->Get<framework::LoDTensor>().IsInitialized();
-  } else if (var->IsType<framework::SelectedRows>()) {
-    return var->Get<framework::SelectedRows>().rows().size() > 0UL;
-  } else {
-    PADDLE_THROW(
-        "Variable type in send side should be in "
-        "[LodTensor, SelectedRows]");
-  }
-  return false;
-}
 class SendVarsOp : public framework::OperatorBase {
 public:
@@ -95,7 +78,7 @@ Send operator
 This operator will send variables to listen_and_serve op at the parameter server.
 )DOC");
-    AddAttr<int>("ync_send",
+    AddAttr<int>("sync_send",
                 "(int, default 0)"
                 "sync send or async send.")
        .SetDefault(0);

--- a/paddle/fluid/operators/sequence_pool_op.h
+++ b/paddle/fluid/operators/sequence_pool_op.h
@@ -23,12 +23,6 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename DeviceContext, typename T>
 class SequencePoolKernel : public framework::OpKernel<T> {
@@ -37,11 +31,13 @@ class SequencePoolKernel : public framework::OpKernel<T> {
    auto* in = context.Input<LoDTensor>("X");
    auto* out = context.Output<Tensor>("Out");
    std::string pooltype = context.Attr<std::string>("pooltype");
+    Tensor* index = nullptr;
+    if (pooltype == "MAX") {
+      index = context.Output<Tensor>("MaxIndex");
+    }
    auto dims = in->dims();
    auto lod = in->lod();
-    int64_t w = in->numel() / dims[0];
    // InferShape by lod
    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
    PADDLE_ENFORCE_GE(
@@ -50,45 +46,14 @@ class SequencePoolKernel : public framework::OpKernel<T> {
        "The first dimension of Input(X) must be large than batch size.");
    dims[0] = lod[0].size() - 1;
    out->Resize({dims});
-    auto lod_level_0 = lod[0];
    out->mutable_data<T>(context.GetPlace());
-    auto& dev_ctx = context.template device_context<DeviceContext>();
    if (pooltype == "MAX") {
-      math::MaxSeqPoolFunctor<DeviceContext, T> max_pool;
-      auto* index = context.Output<Tensor>("MaxIndex");
      index->Resize({dims});
      index->mutable_data<int>(context.GetPlace());
-      max_pool(dev_ctx, *in, out, index);
-      return;
-    }
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
-      Tensor in_t = in->Slice(static_cast<int>(lod_level_0[i]),
-                              static_cast<int>(lod_level_0[i + 1]));
-      Tensor out_t = out->Slice(i, i + 1);
-      int64_t h = static_cast<int64_t>(lod_level_0[i + 1] - lod_level_0[i]);
-      auto in_e = EigenMatrix<T>::From(in_t, framework::make_ddim({h, w}));
-      auto out_e = EigenVector<T>::Flatten(out_t);
-      if (pooltype == "AVERAGE") {
-        out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
-      } else if (pooltype == "SUM") {
-        out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}}));
-      } else if (pooltype == "SQRT") {
-        out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
-                              std::sqrt(static_cast<T>(h));
-      } else if (pooltype == "LAST") {
-        out_e.device(place) = in_e.chip(h - 1, 0);
-      } else if (pooltype == "FIRST") {
-        out_e.device(place) = in_e.chip(0, 0);
-      } else {
-        PADDLE_THROW("unsupported pooling pooltype");
-      }
    }
+    math::SequencePoolFunctor<DeviceContext, T> pool;
+    pool(context.template device_context<DeviceContext>(), pooltype, *in, out,
+         index);
  }
 };
@@ -96,58 +61,17 @@ template <typename DeviceContext, typename T>
 class SequencePoolGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
    auto* out_g = context.Input<Tensor>(framework::GradVarName("Out"));
    auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
    std::string pooltype = context.Attr<std::string>("pooltype");
+    const Tensor* index = nullptr;
-    auto dims = in->dims();
-    auto lod = in->lod()[0];
-    int64_t w = in->numel() / dims[0];
-    in_g->mutable_data<T>(context.GetPlace());
-    auto& dev_ctx = context.template device_context<DeviceContext>();
    if (pooltype == "MAX") {
-      math::MaxSeqPoolGradFunctor<DeviceContext, T> max_pool_grad;
+      index = context.Input<Tensor>("MaxIndex");
-      auto* index = context.Input<Tensor>("MaxIndex");
-      max_pool_grad(dev_ctx, *out_g, *index, in_g);
-      return;
-    }
-    if (pooltype == "LAST" || pooltype == "FIRST") {
-      // set X@Grad be zero at first when pooltype is LAST/FIRST
-      math::SetConstant<DeviceContext, T> functor;
-      functor(dev_ctx, in_g, 0);
-    }
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
-      auto in_g_t =
-          in_g->Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
-      auto out_g_t = out_g->Slice(i, i + 1);
-      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
-      auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
-      auto out_g_e = EigenMatrix<T>::From(out_g_t, {1, w});
-      auto out_g_e_v = EigenVector<T>::Flatten(out_g_t);
-      Eigen::DSizes<int, 2> bcast(h, 1);
-      if (pooltype == "AVERAGE") {
-        in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
-      } else if (pooltype == "SUM") {
-        in_g_e.device(place) = (out_g_e).broadcast(bcast);
-      } else if (pooltype == "SQRT") {
-        in_g_e.device(place) =
-            (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
-      } else if (pooltype == "LAST") {
-        in_g_e.chip(h - 1, 0).device(place) = out_g_e_v;
-      } else if (pooltype == "FIRST") {
-        in_g_e.chip(0, 0).device(place) = out_g_e_v;
-      } else {
-        PADDLE_THROW("unsupported pooling pooltype");
-      }
    }
+    in_g->mutable_data<T>(context.GetPlace());
+    math::SequencePoolGradFunctor<DeviceContext, T> pool;
+    pool(context.template device_context<DeviceContext>(), pooltype, *out_g,
+         in_g, index);
  }
 };

--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <thread>
+#include <typeindex>
+#include "paddle/fluid/platform/dynload/nccl.h"
+#include "paddle/fluid/platform/enforce.h"
+namespace paddle {
+namespace platform {
+// Translates a C++ element type, given as a std::type_index, into the
+// corresponding NCCL data type enumerator. Only float, double and int are
+// recognised; every other type is reported via PADDLE_THROW.
+inline ncclDataType_t ToNCCLDataType(std::type_index type) {
+  if (type == typeid(float)) {  // NOLINT
+    return ncclFloat;
+  }
+  if (type == typeid(double)) {  // NOLINT
+    return ncclDouble;
+  }
+  if (type == typeid(int)) {  // NOLINT
+    return ncclInt;
+  }
+  PADDLE_THROW("Not supported");
+}
+// Scope guard around an NCCL group: the constructor takes a process-wide
+// mutex and calls ncclGroupStart(); the destructor calls ncclGroupEnd() and
+// releases the mutex. Only one guard can therefore be alive at a time.
+class NCCLGroupGuard {
+ public:
+  inline NCCLGroupGuard() {
+    mutex().lock();
+    try {
+      PADDLE_ENFORCE(dynload::ncclGroupStart());
+    } catch (...) {
+      // A destructor never runs for an object whose constructor threw, so
+      // the mutex has to be released here; otherwise every later guard
+      // would block forever on a lock nobody owns any more.
+      mutex().unlock();
+      throw;
+    }
+  }
+  // NOTE(review): assuming PADDLE_ENFORCE reports failure by throwing, a
+  // failing ncclGroupEnd() makes an exception escape this destructor, and
+  // the mutex is not released in that case -- confirm this is acceptable.
+  inline ~NCCLGroupGuard() {
+    PADDLE_ENFORCE(dynload::ncclGroupEnd());
+    mutex().unlock();
+  }
+ private:
+  // Function-local static so the header stays definition-free while all
+  // guards still share one mutex.
+  static std::mutex &mutex() {
+    static std::mutex mtx;
+    return mtx;
+  }
+};
+// Bundles the CUDA device context of one device with the NCCL communicator
+// used on that device. comm_ is left unset by the constructor; it is
+// assigned by whoever creates the communicators (e.g. InitNCCLContext).
+struct NCCLContext {
+  std::unique_ptr<CUDADeviceContext> ctx_;
+  ncclComm_t comm_;
+  explicit NCCLContext(int dev_id)
+      : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))) {}
+  cudaStream_t stream() const { return ctx_->stream(); }
+  int device_id() const {
+    return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device;
+  }
+  // Creates one communicator per entry of `places` (each must hold a
+  // CUDAPlace) with a single ncclCommInitAll call and stores the i-th
+  // communicator in contexts[device of places[i]]. contexts.at() throws
+  // std::out_of_range if a device has no entry in `contexts`.
+  static void InitNCCLContext(std::unordered_map<int, NCCLContext> &contexts,
+                              const std::vector<platform::Place> &places) {
+    PADDLE_ENFORCE(!places.empty(),
+                   "InitNCCLContext requires at least one place");
+    std::vector<int> devs;
+    devs.reserve(places.size());
+    for (auto &p : places) {
+      devs.push_back(boost::get<platform::CUDAPlace>(p).device);
+    }
+    // Size the communicator array and the device count from `devs`, the
+    // array actually handed to NCCL. Using contexts.size() here would read
+    // or write out of bounds whenever it differs from places.size().
+    std::vector<ncclComm_t> comms(devs.size());
+    PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
+        comms.data(), static_cast<int>(devs.size()), devs.data()));
+    for (size_t i = 0; i < devs.size(); ++i) {
+      contexts.at(devs[i]).comm_ = comms[i];
+    }
+  }
+};
+// Owns one NCCLContext per CUDA device and remembers the order in which the
+// devices were given. All communicators are created together in the
+// constructor through ncclCommInitAll.
+struct NCCLContextMap {
+  std::unordered_map<int, NCCLContext> contexts_;
+  std::vector<int> order_;
+  // `places` must be non-empty, hold only CUDAPlace values and must not
+  // name the same device twice.
+  NCCLContextMap(const std::vector<platform::Place> &places) {
+    // Fail early: with no places the arrays passed to ncclCommInitAll
+    // below would be empty and there would be nothing to initialise.
+    PADDLE_ENFORCE(!places.empty(),
+                   "NCCL Context Map requires at least one place");
+    order_.reserve(places.size());
+    for (auto &p : places) {
+      int dev_id = boost::get<CUDAPlace>(p).device;
+      order_.emplace_back(dev_id);
+      contexts_.emplace(dev_id, NCCLContext(dev_id));
+    }
+    // emplace() ignores a key that already exists, so a repeated device
+    // shows up as a size difference between the two containers.
+    PADDLE_ENFORCE_EQ(
+        order_.size(), contexts_.size(),
+        "NCCL Context Map does not support contain two or more same device");
+    std::vector<ncclComm_t> comms(order_.size());
+    PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
+        comms.data(), static_cast<int>(order_.size()), order_.data()));
+    int i = 0;
+    for (auto &dev_id : order_) {
+      contexts_.at(dev_id).comm_ = comms[i++];
+    }
+  }
+  CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); }
+  CUDADeviceContext *DevCtx(platform::Place p) const {
+    return DevCtx(boost::get<CUDAPlace>(p).device);
+  }
+  const NCCLContext &at(platform::Place p) const {
+    return this->at(boost::get<CUDAPlace>(p).device);
+  }
+  // Throws std::out_of_range for a device that is not part of the map.
+  const NCCLContext &at(int dev_id) const { return contexts_.at(dev_id); }
+  // Calls Wait() on the device context of every device in the map.
+  void WaitAll() {
+    for (auto &p : contexts_) {
+      p.second.ctx_->Wait();
+    }
+  }
+};
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/platform/profiler.h"
+#include "cuda_runtime.h"
 #include "gtest/gtest.h"
 TEST(Event, CpuElapsedTime) {
@@ -157,3 +158,11 @@ TEST(RecordEvent, RecordEvent) {
  // Will remove parsing-related code from test later
  DisableProfiler(EventSortingKey::kTotal, "/tmp/profiler");
 }
+// Smoke test: a freshly created CUDA stream can be synchronised several
+// times in a row. Every runtime call is checked, and the stream is destroyed
+// at the end so the test does not leak it.
+TEST(TMP, stream_wait) {
+  cudaStream_t stream;
+  ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream));
+  for (int i = 0; i < 3; ++i) {
+    EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream));
+  }
+  EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream));
+}
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -3,11 +3,13 @@ if(WITH_PYTHON)
    hip_library(paddle_pybind SHARED
      SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
      DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
+           parallel_executor
      ${GLOB_OP_LIB})
  else()
    cc_library(paddle_pybind SHARED
      SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
      DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
+           parallel_executor
      ${GLOB_OP_LIB})
    if(NOT APPLE AND NOT ANDROID)
      target_link_libraries(paddle_pybind rt)

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -25,6 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/parallel_executor.h"
 #include "paddle/fluid/framework/prune.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/selected_rows.h"
@@ -496,6 +497,20 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("disable_profiler", platform::DisableProfiler);
  m.def("reset_profiler", platform::ResetProfiler);
+  py::class_<ParallelExecutor>(m, "ParallelExecutor")
+      .def("__init__",
+           [](ParallelExecutor &self, size_t num_threads, bool use_event,
+              const std::vector<platform::Place> &places,
+              const std::unordered_set<std::string> &params,
+              const ProgramDesc &startup_program,
+              const ProgramDesc &main_program, const std::string &loss_var_name,
+              Scope *scope) {
+             new (&self) ParallelExecutor(num_threads, use_event, places,
+                                          params, startup_program, main_program,
+                                          loss_var_name, scope);
+           })
+      .def("run", &ParallelExecutor::Run);
  BindRecordIOWriter(m);
  return m.ptr();
 }

--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -53,6 +53,7 @@ function cmake_gen() {
        -DWITH_FAST_BUNDLE_TEST=ON
        -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
    ========================================
 EOF
    # Disable UNITTEST_USE_VIRTUALENV in docker because
@@ -78,6 +79,7 @@ EOF
        -DWITH_TESTING=${WITH_TESTING:-ON} \
        -DWITH_FAST_BUNDLE_TEST=ON \
        -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
+        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
 }

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -4,7 +4,7 @@ set(PY_FILES paddle/__init__.py
  ${UTILS_PY_FILES}
  ${FLUID_PY_FILES})
-if(NOT WITH_FLUID)
+if(NOT WITH_FLUID_ONLY)
  file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
  file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
  file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
@@ -62,7 +62,7 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
    DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS})
-if(NOT WITH_FLUID)
+if(NOT WITH_FLUID_ONLY)
    set(paddle_python_deps ${paddle_python_deps} paddle_pserver_main paddle_trainer paddle_merge_model)
    if(WITH_SWIG_PY)
        list(APPEND paddle_python_deps python_api_wheel)
@@ -73,7 +73,7 @@ add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})
 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
 if (WITH_TESTING)
-  if(NOT WITH_FLUID)
+  if(NOT WITH_FLUID_ONLY)
    add_subdirectory(paddle/trainer_config_helpers/tests)
    if (WITH_SWIG_PY)
      # enable v2 API unittest only when paddle swig api is compiled

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -41,6 +41,7 @@ from memory_optimization_transpiler import memory_optimize, release_memory
 import profiler
 import unique_name
 import recordio_writer
+from parallel_executor import ParallelExecutor
 Tensor = LoDTensor
@@ -68,6 +69,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [
    'profiler',
    'unique_name',
    'recordio_writer',
+    'ParallelExecutor',
 ]

--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -25,6 +25,8 @@ __activations__ = [
    'abs',
    'ceil',
    'floor',
+    'cos',
+    'sin',
    'round',
    'reciprocal',
    'log',

--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import core
+import multiprocessing
+import framework
+import executor
+__all__ = ['ParallelExecutor']
+class ParallelExecutor(object):
+    """Python front end for core.ParallelExecutor.
+
+    The constructor collects the places to run on (one CUDAPlace per device
+    reported by core.get_cuda_device_count() when use_cuda is true,
+    otherwise one CPUPlace per CPU core), then builds the core executor from
+    the default startup/main programs and the global scope.
+
+    Args:
+        loss_name: name of the loss variable, forwarded to the core executor.
+        use_cuda: selects CUDA places; its truth value is also passed on as
+            the core executor's use_event flag.
+        num_threads: worker thread count; defaults to
+            min(2 * number of places, number of CPU cores).
+    """
+
+    def __init__(self, loss_name, use_cuda, num_threads=None):
+        if use_cuda:
+            place_count = core.get_cuda_device_count()
+            make_place = lambda idx: core.CUDAPlace(idx)
+        else:
+            place_count = multiprocessing.cpu_count()
+            make_place = lambda idx: core.CPUPlace()
+        places = []
+        for idx in xrange(place_count):
+            holder = core.Place()
+            holder.set_place(make_place(idx))
+            places.append(holder)
+        if num_threads is None:
+            num_threads = min(len(places) * 2, multiprocessing.cpu_count())
+        startup = framework.default_startup_program()
+        main = framework.default_main_program()
+        scope = executor.global_scope()
+        use_event = bool(use_cuda)
+        # Only parameters that still receive gradients are handed over.
+        trainable_params = set(
+            param.name for param in main.global_block().iter_parameters()
+            if not param.stop_gradient)
+        self.executor = core.ParallelExecutor(
+            num_threads, use_event, places, trainable_params, startup.desc,
+            main.desc, loss_name, scope)
+        self.scope = scope
+
+    def run(self, fetch_list):
+        """Run once and return the fetched values as a Python list.
+
+        The core executor stores the results in a scope variable with a
+        fixed name; its elements are copied out one by one.
+        """
+        fetch_var_name = '@FETCHED_VAR_NAME@'
+        self.executor.run(fetch_list, fetch_var_name)
+        fetched = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
+        results = []
+        for idx in range(len(fetched)):
+            results.append(fetched[idx])
+        return results
--- a/python/paddle/fluid/tests/unittests/.gitignore
+++ b/python/paddle/fluid/tests/unittests/.gitignore
@@ -2,3 +2,5 @@ mnist.recordio
 mnist_0.recordio
 mnist_1.recordio
 mnist_2.recordio
+flowers.recordio
+wmt16.recordio
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -196,6 +196,34 @@ class TestFloor(OpTest):
        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+class TestCos(OpTest):
+    """Exercises the `cos` op on a random 4x4 float32 input in [-1, 1); the
+    expected output is np.cos of that input."""
+
+    def setUp(self):
+        self.op_type = "cos"
+        data = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        self.inputs = {'X': data}
+        self.outputs = {'Out': np.cos(data)}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+class TestSin(OpTest):
+    """Exercises the `sin` op on a random 4x4 float32 input in [-1, 1); the
+    expected output is np.sin of that input."""
+
+    def setUp(self):
+        self.op_type = "sin"
+        data = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        self.inputs = {'X': data}
+        self.outputs = {'Out': np.sin(data)}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
 class TestRound(OpTest):
    def setUp(self):
        self.op_type = "round"

--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy
+import unittest
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+import paddle.v2.dataset.mnist as mnist
+import paddle.v2.dataset.wmt16 as wmt16
+def simple_fc_net():
+    """Build a small MLP that reads (image, label) from ./mnist.recordio.
+
+    Four tanh fc layers of width 200 (bias initialised to 1.0) feed a
+    10-way softmax; the mean cross-entropy loss variable is returned.
+    """
+    mnist_reader = fluid.layers.open_recordio_file(
+        filename='./mnist.recordio',
+        shapes=[[-1, 784], [-1, 1]],
+        lod_levels=[0, 0],
+        dtypes=['float32', 'int64'])
+    img, label = fluid.layers.read_file(mnist_reader)
+    features = img
+    num_hidden_layers = 4
+    for _ in xrange(num_hidden_layers):
+        bias = fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=1.0))
+        features = fluid.layers.fc(
+            features, size=200, act='tanh', bias_attr=bias)
+    prediction = fluid.layers.fc(features, size=10, act='softmax')
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    return fluid.layers.mean(cost)
+def fc_with_batchnorm():
+    """Build an MLP with batch normalisation over ./mnist.recordio.
+
+    One tanh fc layer of width 200 (bias initialised to 1.0) followed by
+    batch_norm feeds a 10-way softmax; the mean cross-entropy loss variable
+    is returned.
+    """
+    mnist_reader = fluid.layers.open_recordio_file(
+        filename='./mnist.recordio',
+        shapes=[[-1, 784], [-1, 1]],
+        lod_levels=[0, 0],
+        dtypes=['float32', 'int64'])
+    img, label = fluid.layers.read_file(mnist_reader)
+    features = img
+    num_hidden_layers = 1
+    for _ in xrange(num_hidden_layers):
+        bias = fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=1.0))
+        features = fluid.layers.fc(
+            features, size=200, act='tanh', bias_attr=bias)
+        features = fluid.layers.batch_norm(input=features)
+    prediction = fluid.layers.fc(features, size=10, act='softmax')
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    return fluid.layers.mean(cost)
+def squeeze_excitation(input, num_channels, reduction_ratio):
+    """Scale `input` by a per-channel gate computed from its own content.
+
+    The input is averaged over its flattened trailing two dimensions, passed
+    through a relu fc of size num_channels / reduction_ratio and a sigmoid
+    fc of size num_channels, and the result multiplies `input` (axis=0).
+    Assumes a 4-D input, since shape[1], shape[2] and shape[3] are read.
+    """
+    # The averaging is spelled as reshape + reduce_mean; the author left an
+    # equivalent-looking pool2d(global_pooling=True) call disabled here.
+    dims = input.shape
+    flattened = fluid.layers.reshape(
+        x=input, shape=[-1, dims[1], dims[2] * dims[3]])
+    pooled = fluid.layers.reduce_mean(input=flattened, dim=2)
+    squeezed = fluid.layers.fc(input=pooled,
+                               size=num_channels / reduction_ratio,
+                               act='relu')
+    gate = fluid.layers.fc(input=squeezed, size=num_channels, act='sigmoid')
+    return fluid.layers.elementwise_mul(x=input, y=gate, axis=0)
+def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
+                  act=None):
+    """Bias-free conv2d followed by batch_norm (momentum 0.1).
+
+    The convolution itself has no activation; `act` is applied by the
+    batch_norm layer. Padding is (filter_size - 1) / 2.
+    """
+    pad = (filter_size - 1) / 2
+    conv_out = fluid.layers.conv2d(
+        input=input,
+        num_filters=num_filters,
+        filter_size=filter_size,
+        stride=stride,
+        padding=pad,
+        groups=groups,
+        act=None,
+        bias_attr=False)
+    normalized = fluid.layers.batch_norm(
+        input=conv_out, act=act, momentum=0.1)
+    return normalized
+def shortcut(input, ch_out, stride):
+    """Return the residual branch for `input`.
+
+    When input.shape[1] already equals ch_out the input is returned as is;
+    otherwise it goes through conv_bn_layer with a 1x1 filter for stride 1
+    and a 3x3 filter for any other stride.
+    """
+    ch_in = input.shape[1]
+    if ch_in == ch_out:
+        return input
+    filter_size = 1 if stride == 1 else 3
+    return conv_bn_layer(input, ch_out, filter_size, stride)
+def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
+    """One bottleneck unit: 1x1 conv, grouped 3x3 conv, 1x1 conv, then
+    squeeze_excitation, added to the shortcut branch with a relu on top."""
+    # The first 1x1 convolution uses num_filters channels, half of the
+    # num_filters * 2 used by the rest of the block, to reduce the
+    # computation cost.
+    out_channels = num_filters * 2
+    reduced = conv_bn_layer(
+        input=input, num_filters=num_filters, filter_size=1, act='relu')
+    grouped = conv_bn_layer(
+        input=reduced,
+        num_filters=out_channels,
+        filter_size=3,
+        stride=stride,
+        groups=cardinality,
+        act='relu')
+    expanded = conv_bn_layer(
+        input=grouped, num_filters=out_channels, filter_size=1, act=None)
+    gated = squeeze_excitation(
+        input=expanded,
+        num_channels=out_channels,
+        reduction_ratio=reduction_ratio)
+    residual = shortcut(input, out_channels, stride)
+    return fluid.layers.elementwise_add(x=residual, y=gated, act='relu')
+def SE_ResNeXt152(batch_size=4):
+    """Build the SE-ResNeXt test network on constant inputs.
+
+    Image and label are fill_constant tensors (no data reader involved).
+    Three 3x3 conv_bn layers and a max pool are followed by four stages of
+    bottleneck blocks, averaging over the spatial dimensions, dropout and a
+    1000-way softmax classifier. Returns the mean cross-entropy loss.
+    """
+    img = fluid.layers.fill_constant(
+        shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
+    label = fluid.layers.fill_constant(
+        shape=[batch_size, 1], dtype='int64', value=0.0)
+    net = img
+    for stem_filters, stem_stride in [(64, 2), (64, 1), (128, 1)]:
+        net = conv_bn_layer(
+            input=net,
+            num_filters=stem_filters,
+            filter_size=3,
+            stride=stem_stride,
+            act='relu')
+    net = fluid.layers.pool2d(
+        input=net, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+    cardinality = 64
+    reduction_ratio = 16
+    stage_depths = [3, 8, 36, 3]
+    stage_filters = [128, 256, 512, 1024]
+    for stage, (depth, filters) in enumerate(zip(stage_depths, stage_filters)):
+        for idx in range(depth):
+            # Only the first block of every stage but the first uses stride 2.
+            downsample = idx == 0 and stage != 0
+            net = bottleneck_block(
+                input=net,
+                num_filters=filters,
+                stride=2 if downsample else 1,
+                cardinality=cardinality,
+                reduction_ratio=reduction_ratio)
+    dims = net.shape
+    flattened = fluid.layers.reshape(
+        x=net, shape=[-1, dims[1], dims[2] * dims[3]])
+    pooled = fluid.layers.reduce_mean(input=flattened, dim=2)
+    dropped = fluid.layers.dropout(x=pooled, dropout_prob=0.2)
+    # Classifier layer:
+    prediction = fluid.layers.fc(input=dropped, size=1000, act='softmax')
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    return fluid.layers.mean(cost)
+import time
+class TestParallelExecutorBase(unittest.TestCase):
+    def check_network_convergence(self,
+                                  method,
+                                  memory_opt=True,
+                                  iter=10,
+                                  batch_size=None):
+        """Train the network built by `method` with ParallelExecutor on CUDA.
+
+        Args:
+            method: callable that builds the network inside the current
+                program guard and returns its loss variable.
+            memory_opt: run fluid.memory_optimize on the main program first.
+            iter: number of runs that fetch nothing, executed between the
+                two runs that fetch the loss.
+            batch_size: per-device batch size; when given, the throughput
+                over all timed runs is printed.
+
+        The first and last loss are printed; no assertion is made on them
+        (the convergence check below is still disabled).
+        """
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            loss = method()
+            adam = fluid.optimizer.Adam()
+            adam.minimize(loss)
+            if memory_opt:
+                fluid.memory_optimize(main)
+            exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True)
+            if batch_size is not None:
+                # Scale to the number of instances consumed per run across
+                # all devices.
+                batch_size *= fluid.core.get_cuda_device_count()
+            begin = time.time()
+            first_loss, = exe.run([loss.name])
+            first_loss = numpy.array(first_loss)
+            for i in xrange(iter):
+                exe.run([])
+            last_loss, = exe.run([loss.name])
+            end = time.time()
+            if batch_size is not None:
+                # The timed window holds `iter` silent runs plus the two
+                # runs that fetch the loss, and each run consumes
+                # batch_size instances.
+                timed_runs = iter + 2
+                print "%.4f Instance per second" % (
+                    batch_size * timed_runs / (end - begin))
+            last_loss = numpy.array(last_loss)
+            print first_loss, last_loss
+            # self.assertGreater(first_loss[0], last_loss[0])
+class TestMNIST(TestParallelExecutorBase):
+    """Runs the two MNIST networks through check_network_convergence."""
+
+    @classmethod
+    def setUpClass(cls):
+        # Write the MNIST training set (batches of 32) to ./mnist.recordio,
+        # the file both networks read from.
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            batched_reader = paddle.batch(mnist.train(), batch_size=32)
+            # Feed order is image first, then label.
+            image = fluid.layers.data(name='image', shape=[784])
+            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            feeder = fluid.DataFeeder(
+                feed_list=[image, label], place=fluid.CPUPlace())
+            fluid.recordio_writer.convert_reader_to_recordio_file(
+                './mnist.recordio', batched_reader, feeder)
+    def test_simple_fc(self):
+        self.check_network_convergence(simple_fc_net)
+    def test_batchnorm_fc(self):
+        self.check_network_convergence(fc_with_batchnorm)
+class TestResnet(TestParallelExecutorBase):
+    """Runs SE_ResNeXt152 through the shared ParallelExecutor harness."""
+    # Disabled data-conversion step, kept for reference.
+    # @classmethod
+    # def setUpClass(cls):
+    #     # import os
+    #     # if os.path.exists('./flowers.recordio'):
+    #     #     return
+    #     with fluid.program_guard(fluid.Program(), fluid.Program()):
+    #         reader = paddle.batch(flowers.train(), batch_size=4)
+    #         feeder = fluid.DataFeeder(
+    #             feed_list=[
+    #                 fluid.layers.data(
+    #                     name='image', shape=[3, 224, 224]),
+    #                 fluid.layers.data(
+    #                     name='label', shape=[1], dtype='int64'),
+    #             ],
+    #             place=fluid.CPUPlace())
+    #         fluid.recordio_writer.convert_reader_to_recordio_file(
+    #             "./flowers.recordio", reader, feeder, compressor=fluid.core.RecordIOWriter.Compressor.NoCompress)
+    def test_resnet(self):
+        """Train SE_ResNeXt152 for 20 silent steps with a per-device batch of 4."""
+        import functools
+        per_device_batch = 4
+        # The same batch size is baked into the network builder and reported
+        # to the harness for the throughput print-out.
+        build_net = functools.partial(
+            SE_ResNeXt152, batch_size=per_device_batch)
+        self.check_network_convergence(
+            build_net, iter=20, batch_size=per_device_batch)
+class ModelHyperParams(object):
+    """Hyper-parameters of the transformer model built by this test."""
+    # Dictionary size for source and target language. This model directly uses
+    # paddle.dataset.wmt16, in which the <bos>, <eos> and <unk> tokens have
+    # already been added but the <pad> token has not. The transformer requires
+    # the sequences in a mini-batch to be padded to the same length, so a <pad>
+    # token is appended to the original dictionary of paddle.dataset.wmt16.
+    # size of source word dictionary.
+    src_vocab_size = 10000
+    # index for <pad> token in source language.
+    src_pad_idx = src_vocab_size
+    # size of target word dictionary.
+    trg_vocab_size = 10000
+    # index for <pad> token in target language.
+    trg_pad_idx = trg_vocab_size
+    # position value corresponding to the <pad> token.
+    pos_pad_idx = 0
+    # max length of sequences. One is added to it when the model is built, to
+    # make room for the padding position in the position encoding.
+    max_length = 50
+    # the dimension for word embeddings, which is also the last dimension of
+    # the input and output of multi-head attention, position-wise feed-forward
+    # networks, encoder and decoder.
+    d_model = 512
+    # size of the hidden layer in position-wise feed-forward networks.
+    d_inner_hid = 1024
+    # the dimension that keys are projected to for dot-product attention.
+    d_key = 64
+    # the dimension that values are projected to for dot-product attention.
+    d_value = 64
+    # number of heads used in multi-head attention.
+    n_head = 8
+    # number of sub-layers to be stacked in the encoder and decoder.
+    n_layer = 6
+    # dropout rate used by all dropout layers.
+    dropout = 0.1
+import numpy as np
+def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
+    """
+    Pad the instances to the max sequence length in batch, and generate the
+    corresponding position data and attention bias.
+    Args:
+        insts: list of instances; inst[0] is the source word-id list, inst[1]
+            the target word-id list and inst[2] the label word-id list.
+        src_pad_idx: id used to pad source sequences.
+        trg_pad_idx: id used to pad target and label sequences.
+        n_head: number of attention heads the bias tensors are tiled for.
+    Returns:
+        A list of nine numpy arrays, in this order: src_word, src_pos,
+        trg_word, trg_pos, src_slf_attn_bias, trg_slf_attn_bias,
+        trg_src_attn_bias, lbl_word, lbl_weight.
+    """
+    def __pad_batch_data(insts,
+                         pad_idx,
+                         is_target=False,
+                         return_pos=True,
+                         return_attn_bias=True,
+                         return_max_len=True):
+        """
+        Pad the instances to the max sequence length in batch, and generate the
+        corresponding position data and attention bias. Returns a list of the
+        requested items, or the bare item when only one is requested.
+        """
+        return_list = []
+        max_len = max(len(inst) for inst in insts)
+        inst_data = np.array(
+            [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
+        return_list += [inst_data.astype("int64").reshape([-1, 1])]
+        if return_pos:
+            # Positions are 1-based; padded slots get position 0.
+            inst_pos = np.array([[
+                pos_i + 1 if w_i != pad_idx else 0
+                for pos_i, w_i in enumerate(inst)
+            ] for inst in inst_data])
+            return_list += [inst_pos.astype("int64").reshape([-1, 1])]
+        if return_attn_bias:
+            if is_target:
+                # Strictly upper-triangular mask: a position cannot attend to
+                # subsequent words.
+                # NOTE(review): paddings are not masked in this branch.
+                slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
+                                              max_len))
+                slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
+                    [-1, 1, max_len, max_len])
+                slf_attn_bias_data = np.tile(slf_attn_bias_data,
+                                             [1, n_head, 1, 1]) * [-1e9]
+            else:
+                # This is used to avoid attention on paddings.
+                slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
+                                               (max_len - len(inst))
+                                               for inst in insts])
+                slf_attn_bias_data = np.tile(
+                    slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
+                    [1, n_head, max_len, 1])
+            return_list += [slf_attn_bias_data.astype("float32")]
+        if return_max_len:
+            return_list += [max_len]
+        return return_list if len(return_list) > 1 else return_list[0]
+    src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
+        [inst[0] for inst in insts], src_pad_idx, is_target=False)
+    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
+        [inst[1] for inst in insts], trg_pad_idx, is_target=True)
+    # Keep a single query row of the source padding mask (a step of
+    # src_max_len selects row 0 only) and repeat it for every target position.
+    trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
+                                [1, 1, trg_max_len, 1]).astype("float32")
+    lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
+                                False, False, False)
+    # Padded label slots get weight 0, all others weight 1.
+    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
+    return [
+        src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
+        trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
+    ]
+import transformer_model
+def transformer():
+    """Build the transformer network from ModelHyperParams and return its loss."""
+    hp = ModelHyperParams
+    # The vocabulary sizes get +1 for the appended <pad> token and max_length
+    # gets +1 for the padding position.
+    return transformer_model.transformer(
+        hp.src_vocab_size + 1, hp.trg_vocab_size + 1, hp.max_length + 1,
+        hp.n_layer, hp.n_head, hp.d_key, hp.d_value, hp.d_model,
+        hp.d_inner_hid, hp.dropout, hp.src_pad_idx, hp.trg_pad_idx,
+        hp.pos_pad_idx)
+class TestTransformer(TestParallelExecutorBase):
+    """Runs the transformer network through the shared harness (skipped)."""
+    @classmethod
+    def setUpClass(cls):
+        """Write padded wmt16 training batches to ./wmt16.recordio."""
+        hp = ModelHyperParams
+        batched_reader = paddle.batch(
+            wmt16.train(hp.src_vocab_size, hp.trg_vocab_size),
+            batch_size=transformer_model.batch_size)
+        with fluid.recordio_writer.create_recordio_writer(
+                "./wmt16.recordio") as writer:
+            for batch in batched_reader():
+                arrays = prepare_batch_input(batch, hp.src_pad_idx,
+                                             hp.trg_pad_idx, hp.n_head)
+                # Every array of the batch is appended as its own tensor, then
+                # the group is closed with complete_append_tensor().
+                for array in arrays:
+                    lod_tensor = fluid.LoDTensor()
+                    lod_tensor.set(array, fluid.CPUPlace())
+                    writer.append_tensor(lod_tensor)
+                writer.complete_append_tensor()
+    @unittest.skip("transformer is buggy in multi gpu")
+    def test_main(self):
+        """Train the transformer network with the harness defaults."""
+        self.check_network_convergence(transformer)
--- a/python/paddle/fluid/tests/unittests/test_seq_pool.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py
@@ -49,6 +49,61 @@ class TestSeqAvgPool(OpTest):
        self.check_grad(["X"], "Out")
+class TestSeqSumPool(TestSeqAvgPool):
+    """Reference computation for sequence_pool with pooltype SUM."""
+    def compute(self, x, lod, out):
+        """Write the column-wise sum of each of the four sequences into out."""
+        self.attrs = {'pooltype': "SUM"}
+        offsets = lod[0]
+        for seq in range(4):
+            begin, end = offsets[seq], offsets[seq + 1]
+            out[seq] = x[begin:end, :].sum(axis=0)
+class TestSeqMaxPool(TestSeqAvgPool):
+    """Reference computation for sequence_pool with pooltype MAX."""
+    def set_data(self):
+        """Create a [13, 23] input split into four sequences; return (x, lod, out)."""
+        self.op_type = 'sequence_pool'
+        x = np.random.uniform(0.1, 1, [13, 23]).astype('float32')
+        lod = [[0, 4, 5, 8, 13]]
+        offsets = lod[0]
+        for seq in range(4):
+            seq_len = offsets[seq + 1] - offsets[seq]
+            # Add 2.0 to one random row, making that row the strict
+            # per-column maximum of its sequence.
+            peak_row = offsets[seq] + np.random.randint(seq_len)
+            x[peak_row, :] += 2.0
+        self.inputs = {'X': (x, lod)}
+        out = np.zeros((4, 23)).astype('float32')
+        self.outputs = {'Out': out}
+        return x, lod, out
+    def compute(self, x, lod, out):
+        """Write the column-wise maximum of each of the four sequences into out."""
+        self.attrs = {'pooltype': "MAX"}
+        offsets = lod[0]
+        for seq in range(4):
+            rows = x[offsets[seq]:offsets[seq + 1], :]
+            out[seq] = np.amax(rows, axis=0)
+class TestSeqSqrtPool(TestSeqAvgPool):
+    """Reference computation for sequence_pool with pooltype SQRT."""
+    def compute(self, x, lod, out):
+        """Write sum / sqrt(length) of each of the four sequences into out."""
+        self.attrs = {'pooltype': "SQRT"}
+        offsets = lod[0]
+        for seq in range(4):
+            begin, end = offsets[seq], offsets[seq + 1]
+            seq_len = end - begin
+            out[seq] = x[begin:end, :].sum(axis=0) / np.sqrt(seq_len)
+class TestSeqLastPool(TestSeqAvgPool):
+    """Reference computation for sequence_pool with pooltype LAST."""
+    def compute(self, x, lod, out):
+        """Write the last row of each of the four sequences into out."""
+        self.attrs = {'pooltype': "LAST"}
+        offsets = lod[0]
+        for seq in range(4):
+            rows = x[offsets[seq]:offsets[seq + 1], :]
+            out[seq] = rows[-1, :]
+class TestSeqFirstPool(TestSeqAvgPool):
+    """Reference computation for sequence_pool with pooltype FIRST."""
+    def compute(self, x, lod, out):
+        """Write the first row of each of the four sequences into out."""
+        self.attrs = {'pooltype': "FIRST"}
+        offsets = lod[0]
+        for seq in range(4):
+            rows = x[offsets[seq]:offsets[seq + 1], :]
+            out[seq] = rows[0, :]
 class TestSeqAvgPool2D(TestSeqAvgPool):
    def set_data(self):
        self.op_type = 'sequence_pool'
@@ -68,14 +123,6 @@ class TestSeqAvgPool2D(TestSeqAvgPool):
            out[i] = np.reshape(sub_x.mean(axis=0), (3, 17))
-class TestSeqSumPool(TestSeqAvgPool):
-    def compute(self, x, lod, out):
-        self.attrs = {'pooltype': "SUM"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
-            out[i] = sub_x.sum(axis=0)
 class TestSeqSumPool2D(TestSeqAvgPool2D):
    def compute(self, x, lod, out):
        self.attrs = {'pooltype': "SUM"}
@@ -84,15 +131,6 @@ class TestSeqSumPool2D(TestSeqAvgPool2D):
            out[i] = np.reshape(sub_x.sum(axis=0), (3, 17))
-class TestSeqSqrtPool(TestSeqAvgPool):
-    def compute(self, x, lod, out):
-        self.attrs = {'pooltype': "SQRT"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
-            len = lod[0][i + 1] - lod[0][i]
-            out[i] = sub_x.sum(axis=0) / np.sqrt(len)
 class TestSeqSqrtPool2D(TestSeqAvgPool2D):
    def compute(self, x, lod, out):
        self.attrs = {'pooltype': "SQRT"}
@@ -108,28 +146,6 @@ class TestSeqSqrtPool2D(TestSeqAvgPool2D):
        self.check_grad(["X"], "Out", max_relative_error=0.06)
-class TestSeqMaxPool(TestSeqAvgPool):
-    def set_data(self):
-        self.op_type = 'sequence_pool'
-        x = np.random.uniform(0.1, 1, [13, 23]).astype('float32')
-        lod = [[0, 4, 5, 8, 13]]
-        for i in range(4):
-            l = lod[0][i + 1] - lod[0][i]
-            x[lod[0][i] + np.random.randint(l), :] += 2.0
-        self.inputs = {'X': (x, lod)}
-        out = np.zeros((4, 23)).astype('float32')
-        self.outputs = {'Out': out}
-        return x, lod, out
-    def compute(self, x, lod, out):
-        self.attrs = {'pooltype': "MAX"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
-            out[i] = np.amax(sub_x, axis=0)
 class TestSeqMaxPool2D(TestSeqAvgPool2D):
    def set_data(self):
        self.op_type = 'sequence_pool'
@@ -151,14 +167,6 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D):
            out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
-class TestSeqLastPool(TestSeqAvgPool):
-    def compute(self, x, lod, out):
-        self.attrs = {'pooltype': "LAST"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
-            out[i] = sub_x[-1, :]
 class TestSeqLastPool2D(TestSeqAvgPool2D):
    def compute(self, x, lod, out):
        self.attrs = {'pooltype': "LAST"}
@@ -167,14 +175,6 @@ class TestSeqLastPool2D(TestSeqAvgPool2D):
            out[i] = np.reshape(sub_x[-1, :], (3, 17))
-class TestSeqFirstPool(TestSeqAvgPool):
-    def compute(self, x, lod, out):
-        self.attrs = {'pooltype': "FIRST"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
-            out[i] = sub_x[0, :]
 class TestSeqFirstPool2D(TestSeqAvgPool2D):
    def compute(self, x, lod, out):
        self.attrs = {'pooltype': "FIRST"}

--- a/python/paddle/fluid/tests/unittests/transformer_model.py
+++ b/python/paddle/fluid/tests/unittests/transformer_model.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from functools import partial
+import numpy as np
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+# Names of the non-trainable position-encoding embedding parameters:
+# index 0 is bound to prepare_encoder, index 1 to prepare_decoder.
+pos_enc_param_names = (
+    "src_pos_enc_table",
+    "trg_pos_enc_table", )
+# Batch size that is baked into the reshape ops and the recordio reader
+# shapes of this model (see the FIXME notes on the reshape calls).
+batch_size = 64
+def position_encoding_init(n_position, d_pos_vec):
+    """
+    Generate the initial values for the sinusoid position encoding table.
+    Row 0 is all zeros. For pos >= 1, column j holds
+    pos / 10000^(2 * (j // 2) / d_pos_vec) passed through sin for even j and
+    through cos for odd j.
+    Args:
+        n_position: number of rows (positions) in the table.
+        d_pos_vec: number of columns (encoding dimension).
+    Returns:
+        A float32 numpy array of shape [n_position, d_pos_vec].
+    """
+    # The float literal 2.0 forces true division: with integer operands
+    # Python 2 would floor the exponent to 0 for every column.
+    position_enc = np.array([[
+        pos / np.power(10000, 2.0 * (j // 2) / d_pos_vec)
+        for j in range(d_pos_vec)
+    ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
+    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
+    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1
+    return position_enc.astype("float32")
+def multi_head_attention(queries,
+                         keys,
+                         values,
+                         attn_bias,
+                         d_key,
+                         d_value,
+                         d_model,
+                         n_head=1,
+                         dropout_rate=0.):
+    """
+    Multi-Head Attention. Note that attn_bias is added to the logit before
+    computing softmax activation to mask certain selected positions so that
+    they will not be considered in attention weights.
+    Args:
+        queries, keys, values: 3-D tensors to be projected and attended over.
+        attn_bias: tensor added element-wise to the query-key product.
+        d_key: per-head size of the query and key projections.
+        d_value: per-head size of the value projection.
+        d_model: size of the output projection.
+        n_head: number of attention heads.
+        dropout_rate: dropout applied to the attention weights; 0 disables it.
+    Returns:
+        The attention output projected back to size d_model.
+    Raises:
+        ValueError: if queries, keys and values are not all 3-D.
+    """
+    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
+        raise ValueError(
+            "Inputs: quries, keys and values should all be 3-D tensors.")
+    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
+        """
+        Add linear projection to queries, keys, and values.
+        """
+        q = layers.fc(input=queries,
+                      size=d_key * n_head,
+                      param_attr=fluid.initializer.Xavier(
+                          uniform=False,
+                          fan_in=d_model * d_key,
+                          fan_out=n_head * d_key),
+                      bias_attr=False,
+                      num_flatten_dims=2)
+        k = layers.fc(input=keys,
+                      size=d_key * n_head,
+                      param_attr=fluid.initializer.Xavier(
+                          uniform=False,
+                          fan_in=d_model * d_key,
+                          fan_out=n_head * d_key),
+                      bias_attr=False,
+                      num_flatten_dims=2)
+        v = layers.fc(input=values,
+                      size=d_value * n_head,
+                      param_attr=fluid.initializer.Xavier(
+                          uniform=False,
+                          fan_in=d_model * d_value,
+                          fan_out=n_head * d_value),
+                      bias_attr=False,
+                      num_flatten_dims=2)
+        return q, k, v
+    def __split_heads(x, n_head):
+        """
+        Reshape the last dimension of input tensor x so that it becomes two
+        dimensions and then transpose. Specifically, input a tensor with shape
+        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
+        with shape [bs, n_head, max_sequence_length, hidden_dim].
+        With a single head, x is returned unchanged.
+        """
+        if n_head == 1:
+            return x
+        hidden_size = x.shape[-1]
+        # FIXME(guosheng): Decouple the program desc with batch_size.
+        reshaped = layers.reshape(
+            x=x, shape=[batch_size, -1, n_head, hidden_size // n_head])
+        # permute the dimensions into:
+        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
+        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
+    def __combine_heads(x):
+        """
+        Transpose and then reshape the last two dimensions of input tensor x
+        so that it becomes one dimension, which is reverse to __split_heads.
+        A 3-D input (single head) is returned unchanged.
+        """
+        if len(x.shape) == 3: return x
+        if len(x.shape) != 4:
+            raise ValueError("Input(x) should be a 4-D Tensor.")
+        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
+        # FIXME(guosheng): Decouple the program desc with batch_size.
+        # Build a real list: on Python 3 `map` returns an iterator, not a
+        # list of dimensions.
+        merged_shape = [
+            int(dim)
+            for dim in (batch_size, -1, trans_x.shape[2] * trans_x.shape[3])
+        ]
+        return layers.reshape(x=trans_x, shape=merged_shape)
+    def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
+        """
+        Scaled Dot-Product Attention
+        """
+        # FIXME(guosheng): Optimize the shape in reshape_op or softmax_op.
+        # The current implementation of softmax_op only supports 2D tensor,
+        # consequently it cannot be directly used here.
+        # If reshape_op were used instead, the shape of product inferred at
+        # compile-time is not the actual shape at run-time, so it can't be
+        # used to set the attribute of reshape_op.
+        # So a softmax is defined here as a temporary solution.
+        def __softmax(x, eps=1e-9):
+            # NOTE(review): eps is accepted but never used.
+            exp_out = layers.exp(x=x)
+            sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
+            return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)
+        # NOTE(review): the queries are scaled by d_model**-0.5 here; confirm
+        # whether d_key**-0.5 was intended.
+        scaled_q = layers.scale(x=q, scale=d_model**-0.5)
+        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
+        weights = __softmax(layers.elementwise_add(x=product, y=attn_bias))
+        if dropout_rate:
+            weights = layers.dropout(
+                weights, dropout_prob=dropout_rate, is_test=False)
+        out = layers.matmul(weights, v)
+        return out
+    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
+    q = __split_heads(q, n_head)
+    k = __split_heads(k, n_head)
+    v = __split_heads(v, n_head)
+    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model,
+                                                  dropout_rate)
+    out = __combine_heads(ctx_multiheads)
+    # Project back to the model size.
+    proj_out = layers.fc(input=out,
+                         size=d_model,
+                         param_attr=fluid.initializer.Xavier(uniform=False),
+                         bias_attr=False,
+                         num_flatten_dims=2)
+    return proj_out
+def positionwise_feed_forward(x, d_inner_hid, d_hid):
+    """
+    Position-wise feed-forward network: an fc layer of size d_inner_hid with
+    ReLU activation followed by an fc layer of size d_hid without activation.
+    Both layers are created with num_flatten_dims=2 and uniformly initialized
+    weights bounded by the inverse square root of their input size.
+    """
+    hidden_bound = d_hid**-0.5
+    hidden_init = fluid.initializer.Uniform(
+        low=-hidden_bound, high=hidden_bound)
+    hidden = layers.fc(input=x,
+                       size=d_inner_hid,
+                       num_flatten_dims=2,
+                       param_attr=hidden_init,
+                       act="relu")
+    out_bound = d_inner_hid**-0.5
+    out_init = fluid.initializer.Uniform(low=-out_bound, high=out_bound)
+    return layers.fc(input=hidden,
+                     size=d_hid,
+                     num_flatten_dims=2,
+                     param_attr=out_init)
+def pre_post_process_layer(prev_out, out, process_cmd, dropout=0.):
+    """
+    Add residual connection, layer normalization and dropout to the out tensor
+    optionally according to the value of process_cmd.
+    This will be used before or after multi-head attention and position-wise
+    feed-forward networks.
+    Args:
+        prev_out: residual input added by the "a" command, or None to skip
+            the residual connection.
+        out: tensor to be processed.
+        process_cmd: string of commands applied left to right: "a" adds the
+            residual, "n" applies layer normalization, "d" applies dropout.
+        dropout: dropout probability used by "d"; a falsy value disables it.
+    Returns:
+        The processed tensor.
+    """
+    for cmd in process_cmd:
+        if cmd == "a":  # add residual connection
+            # Test for "no residual supplied" explicitly instead of relying
+            # on the truth value of a tensor-like object.
+            if prev_out is not None:
+                out = out + prev_out
+        elif cmd == "n":  # add layer normalization
+            out = layers.layer_norm(
+                out,
+                begin_norm_axis=len(out.shape) - 1,
+                param_attr=fluid.initializer.Constant(1.),
+                bias_attr=fluid.initializer.Constant(0.))
+        elif cmd == "d":  # add dropout
+            if dropout:
+                out = layers.dropout(out, dropout_prob=dropout, is_test=False)
+    return out
+# Pre-processing has no residual input, so prev_out is bound to None.
+pre_process_layer = partial(pre_post_process_layer, None)
+# Post-processing receives the residual input explicitly as first argument.
+post_process_layer = pre_post_process_layer
+def prepare_encoder(src_word,
+                    src_pos,
+                    src_vocab_size,
+                    src_emb_dim,
+                    src_pad_idx,
+                    src_max_len,
+                    dropout=0.,
+                    pos_pad_idx=0,
+                    pos_enc_param_name=None):
+    """Sum word embeddings and position encodings for the bottom of a stack.
+    The word ids and the position ids are looked up in two embedding tables
+    (the position table is non-trainable and named pos_enc_param_name), the
+    results are added and reshaped to [batch_size, -1, src_emb_dim], and
+    dropout is applied when the dropout rate is non-zero.
+    """
+    word_emb = layers.embedding(
+        src_word,
+        size=[src_vocab_size, src_emb_dim],
+        padding_idx=src_pad_idx,
+        param_attr=fluid.initializer.Normal(0., 1.))
+    pos_table_attr = fluid.ParamAttr(name=pos_enc_param_name, trainable=False)
+    pos_enc = layers.embedding(
+        src_pos,
+        size=[src_max_len, src_emb_dim],
+        padding_idx=pos_pad_idx,
+        param_attr=pos_table_attr)
+    summed = word_emb + pos_enc
+    # FIXME(guosheng): Decouple the program desc with batch_size.
+    reshaped = layers.reshape(x=summed, shape=[batch_size, -1, src_emb_dim])
+    if not dropout:
+        return reshaped
+    return layers.dropout(reshaped, dropout_prob=dropout, is_test=False)
+# Rebind prepare_encoder so that it uses the source position-encoding table
+# name by default.
+prepare_encoder = partial(
+    prepare_encoder, pos_enc_param_name=pos_enc_param_names[0])
+# Built on top of the already rebound partial; the keyword given here
+# overrides the source table name with the target one.
+prepare_decoder = partial(
+    prepare_encoder, pos_enc_param_name=pos_enc_param_names[1])
+def encoder_layer(enc_input,
+                  attn_bias,
+                  n_head,
+                  d_key,
+                  d_value,
+                  d_model,
+                  d_inner_hid,
+                  dropout_rate=0.):
+    """One stackable encoder layer.
+    Multi-head self-attention over enc_input followed by a position-wise
+    feed-forward network. Each of the two sub-layers is followed by
+    post_process_layer with the command string "dan": dropout, then the
+    residual addition, then layer normalization.
+    """
+    self_attn = multi_head_attention(
+        queries=enc_input,
+        keys=enc_input,
+        values=enc_input,
+        attn_bias=attn_bias,
+        d_key=d_key,
+        d_value=d_value,
+        d_model=d_model,
+        n_head=n_head,
+        dropout_rate=dropout_rate)
+    attn_block = post_process_layer(enc_input, self_attn, "dan", dropout_rate)
+    ffn_out = positionwise_feed_forward(attn_block, d_inner_hid, d_model)
+    return post_process_layer(attn_block, ffn_out, "dan", dropout_rate)
+def encoder(enc_input,
+            attn_bias,
+            n_layer,
+            n_head,
+            d_key,
+            d_value,
+            d_model,
+            d_inner_hid,
+            dropout_rate=0.):
+    """
+    The encoder is composed of a stack of identical layers returned by calling
+    encoder_layer. Each layer consumes the output of the previous one; with
+    n_layer == 0 the input is returned unchanged.
+    """
+    # Start from the input so that an empty stack has a defined result
+    # instead of failing on an unbound local.
+    enc_output = enc_input
+    for _ in range(n_layer):
+        enc_output = encoder_layer(enc_output, attn_bias, n_head, d_key,
+                                   d_value, d_model, d_inner_hid, dropout_rate)
+    return enc_output
+def decoder_layer(dec_input,
+                  enc_output,
+                  slf_attn_bias,
+                  dec_enc_attn_bias,
+                  n_head,
+                  d_key,
+                  d_value,
+                  d_model,
+                  d_inner_hid,
+                  dropout_rate=0.):
+    """One stackable decoder layer.
+    Self-attention over dec_input, then attention with the self-attention
+    result as queries and enc_output as keys and values, then a position-wise
+    feed-forward network. Every sub-layer is followed by post_process_layer
+    with the command string "dan": dropout, then the residual addition, then
+    layer normalization.
+    """
+    process_cmd = "dan"
+    self_attn = multi_head_attention(dec_input, dec_input, dec_input,
+                                     slf_attn_bias, d_key, d_value, d_model,
+                                     n_head, dropout_rate)
+    self_block = post_process_layer(dec_input, self_attn, process_cmd,
+                                    dropout_rate)
+    cross_attn = multi_head_attention(self_block, enc_output, enc_output,
+                                      dec_enc_attn_bias, d_key, d_value,
+                                      d_model, n_head, dropout_rate)
+    cross_block = post_process_layer(self_block, cross_attn, process_cmd,
+                                     dropout_rate)
+    ffn_out = positionwise_feed_forward(cross_block, d_inner_hid, d_model)
+    return post_process_layer(cross_block, ffn_out, process_cmd, dropout_rate)
+def decoder(dec_input,
+            enc_output,
+            dec_slf_attn_bias,
+            dec_enc_attn_bias,
+            n_layer,
+            n_head,
+            d_key,
+            d_value,
+            d_model,
+            d_inner_hid,
+            dropout_rate=0.):
+    """
+    The decoder is composed of a stack of identical decoder_layer layers.
+    Each layer consumes the output of the previous one together with the same
+    enc_output; with n_layer == 0 the input is returned unchanged.
+    """
+    # Start from the input so that an empty stack has a defined result
+    # instead of failing on an unbound local.
+    dec_output = dec_input
+    for _ in range(n_layer):
+        dec_output = decoder_layer(
+            dec_output,
+            enc_output,
+            dec_slf_attn_bias,
+            dec_enc_attn_bias,
+            n_head,
+            d_key,
+            d_value,
+            d_model,
+            d_inner_hid,
+            dropout_rate, )
+    return dec_output
+def transformer(
+        src_vocab_size,
+        trg_vocab_size,
+        max_length,
+        n_layer,
+        n_head,
+        d_key,
+        d_value,
+        d_model,
+        d_inner_hid,
+        dropout_rate,
+        src_pad_idx,
+        trg_pad_idx,
+        pos_pad_idx, ):
+    """
+    Build the whole transformer network on top of a recordio reader.
+    Nine tensors are read from ./wmt16.recordio (source/target words and
+    positions, three attention biases, gold labels and label weights). They
+    are run through the embedding preparation, the encoder stack and the
+    decoder stack, followed by a softmax projection over the target
+    vocabulary.
+    Returns:
+        The sum of the per-token cross entropy multiplied by the label
+        weights.
+    """
+    file_obj = fluid.layers.open_recordio_file(
+        filename='./wmt16.recordio',
+        shapes=[
+            [batch_size * max_length, 1],
+            [batch_size * max_length, 1],
+            [batch_size * max_length, 1],
+            [batch_size * max_length, 1],
+            [batch_size, n_head, max_length, max_length],
+            [batch_size, n_head, max_length, max_length],
+            [batch_size, n_head, max_length, max_length],
+            [batch_size * max_length, 1],
+            [batch_size * max_length, 1],
+        ],
+        dtypes=[
+            'int64',
+            'int64',
+            'int64',
+            'int64',
+            'float32',
+            'float32',
+            'float32',
+            'int64',
+            'float32',
+        ],
+        lod_levels=[0] * 9)
+    (src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
+     trg_slf_attn_bias, trg_src_attn_bias, gold,
+     weights) = fluid.layers.read_file(file_obj)
+    # pos_pad_idx is forwarded explicitly so that the caller's value, not the
+    # default of prepare_encoder, selects the padding position.
+    enc_input = prepare_encoder(
+        src_word,
+        src_pos,
+        src_vocab_size,
+        d_model,
+        src_pad_idx,
+        max_length,
+        dropout_rate,
+        pos_pad_idx=pos_pad_idx, )
+    enc_output = encoder(
+        enc_input,
+        src_slf_attn_bias,
+        n_layer,
+        n_head,
+        d_key,
+        d_value,
+        d_model,
+        d_inner_hid,
+        dropout_rate, )
+    dec_input = prepare_decoder(
+        trg_word,
+        trg_pos,
+        trg_vocab_size,
+        d_model,
+        trg_pad_idx,
+        max_length,
+        dropout_rate,
+        pos_pad_idx=pos_pad_idx, )
+    dec_output = decoder(
+        dec_input,
+        enc_output,
+        trg_slf_attn_bias,
+        trg_src_attn_bias,
+        n_layer,
+        n_head,
+        d_key,
+        d_value,
+        d_model,
+        d_inner_hid,
+        dropout_rate, )
+    # TODO(guosheng): Share the weight matrix between the embedding layers and
+    # the pre-softmax linear transformation.
+    predict = layers.reshape(
+        x=layers.fc(input=dec_output,
+                    size=trg_vocab_size,
+                    param_attr=fluid.initializer.Xavier(uniform=False),
+                    bias_attr=False,
+                    num_flatten_dims=2),
+        shape=[-1, trg_vocab_size],
+        act="softmax")
+    cost = layers.cross_entropy(input=predict, label=gold)
+    # Padded label positions carry weight 0 and so do not contribute.
+    weighted_cost = cost * weights
+    return layers.reduce_sum(weighted_cost)
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -68,7 +68,7 @@ packages=['paddle',
          'paddle.fluid.proto.profiler',
          'paddle.fluid.layers']
-if '${WITH_FLUID}'== 'OFF':
+if '${WITH_FLUID_ONLY}'== 'OFF':
    packages+=['paddle.proto',
               'paddle.trainer',
               'paddle.trainer_config_helpers',
@@ -87,7 +87,7 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
 # the prefix is sys.prefix which should always be usr
 paddle_bins = ''
-if '${WITH_FLUID}'== 'OFF':
+if '${WITH_FLUID_ONLY}'== 'OFF':
    paddle_bin_dir = 'opt/paddle/bin'
    paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
                   '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model',
@@ -95,7 +95,7 @@ if '${WITH_FLUID}'== 'OFF':
                   '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
 package_data={'paddle.fluid': ['core.so']}
-if '${WITH_FLUID}'== 'OFF':
+if '${WITH_FLUID_ONLY}'== 'OFF':
    package_data['paddle.v2.master']=['libpaddle_master.so']
    package_data['py_paddle']=['*.py','_swig_paddle.so']
@@ -106,7 +106,7 @@ package_dir={
    'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
    'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
 }
-if '${WITH_FLUID}'== 'OFF':
+if '${WITH_FLUID_ONLY}'== 'OFF':
    package_dir['py_paddle']='${PADDLE_SOURCE_DIR}/paddle/py_paddle'