diff --git a/CMakeLists.txt b/CMakeLists.txt
index de47086dbd6a440cd413c7843c83b1c69d9841b2..23bbe829ac16180088bfa37df66e23f19b021ea3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,7 +39,6 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F
option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF)
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
-option(WITH_TENSORRT "Compile PaddlePaddle with TensorRT support." OFF)
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
@@ -180,13 +179,9 @@ set(EXTERNAL_LIBS
if(WITH_GPU)
include(cuda)
+ include(tensorrt)
endif(WITH_GPU)
-# TensorRT depends on GPU.
-if (NOT WITH_GPU)
- set(WITH_TENSORRT OFF)
-endif()
-
if(WITH_AMD_GPU)
find_package(HIP)
include(hip)
diff --git a/Dockerfile b/Dockerfile
index 9097bb657d2366997112ec7662762a93358aa647..870304a6acc99e715dffbfabd8058be000b6872c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -46,7 +46,7 @@ ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
RUN curl -s -q https://glide.sh/get | sh
# Install TensorRT
-# The unnecessary files has been removed to make the library small.
+# The unnecessary files have been removed to keep the library small. It now contains only include and lib.
RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
tar -xz -C /usr/local && \
cp -rf /usr/local/TensorRT/include /usr && \
diff --git a/Dockerfile.android b/Dockerfile.android
index cc022d596b4b74dd1e4f4d0901dd81c91a7decd1..848a7eba6f1421432addae8acff407b611adb4ae 100644
--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -27,7 +27,7 @@ RUN git config --global credential.helper store
# Fix locales to en_US.UTF-8
RUN localedef -i en_US -f UTF-8 en_US.UTF-8
-RUN pip install --upgrade pip && \
+RUN pip install --upgrade pip==9.0.3 && \
pip install -U 'protobuf==3.1.0' && \
pip install -U wheel sphinx && \
pip install pre-commit
diff --git a/paddle/scripts/check_env.sh b/benchmark/paddle/image/check_env.sh
similarity index 100%
rename from paddle/scripts/check_env.sh
rename to benchmark/paddle/image/check_env.sh
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index f726405c4773994f6ca6509e5218750805b03995..e490397cc0624c310949a4b571bd00cac6e8953b 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -80,6 +80,16 @@ if(WITH_GPU)
# Include cuda and cudnn
include_directories(${CUDNN_INCLUDE_DIR})
include_directories(${CUDA_TOOLKIT_INCLUDE})
+
+ if(TENSORRT_FOUND)
+ if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
+ message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
+ endif()
+ if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+ message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
+ endif()
+ include_directories(${TENSORRT_INCLUDE_DIR})
+ endif()
elseif(WITH_AMD_GPU)
add_definitions(-DPADDLE_WITH_HIP)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..0c07d36bed65400164853b99f18ec0335341cd94
--- /dev/null
+++ b/cmake/tensorrt.cmake
@@ -0,0 +1,33 @@
+if(NOT WITH_GPU)
+ return()
+endif()
+
+set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
+find_path(TENSORRT_INCLUDE_DIR NvInfer.h
+ PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include
+ $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include
+ NO_DEFAULT_PATH
+)
+
+find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
+ PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib
+ $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib
+ NO_DEFAULT_PATH
+ DOC "Path to TensorRT library.")
+
+if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
+ set(TENSORRT_FOUND ON)
+else()
+ set(TENSORRT_FOUND OFF)
+endif()
+
+if(TENSORRT_FOUND)
+ file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS)
+ string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION
+ "${TENSORRT_VERSION_FILE_CONTENTS}")
+ string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1"
+ TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}")
+
+ message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
+ "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
+endif()
diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst
index ee69925fda6b3fc850cfb632e8edd359e7fcff9c..f186c9c85a640da49d95a1a62c721b09b3007d83 100644
--- a/doc/fluid/api/initializer.rst
+++ b/doc/fluid/api/initializer.rst
@@ -33,3 +33,45 @@ Xavier
:members:
:noindex:
+MSRA
+------
+
+.. autoclass:: paddle.fluid.initializer.MSRA
+ :members:
+ :noindex:
+
+ConstantInitializer
+-------------------
+
+.. autoclass:: paddle.fluid.initializer.ConstantInitializer
+ :members:
+ :noindex:
+
+UniformInitializer
+------------------
+
+.. autoclass:: paddle.fluid.initializer.UniformInitializer
+ :members:
+ :noindex:
+
+NormalInitializer
+-----------------
+
+.. autoclass:: paddle.fluid.initializer.NormalInitializer
+ :members:
+ :noindex:
+
+XavierInitializer
+-----------------
+
+.. autoclass:: paddle.fluid.initializer.XavierInitializer
+ :members:
+ :noindex:
+
+MSRAInitializer
+---------------
+
+.. autoclass:: paddle.fluid.initializer.MSRAInitializer
+ :members:
+ :noindex:
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index 5c02886efd7d11e9520910526fb90ec01e123bae..3790f09c84563fe541bd8d0bc08e23b19d4287ca 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -815,3 +815,8 @@ zeros
.. autofunction:: paddle.fluid.layers.zeros
:noindex:
+topk
+----
+
+.. autofunction:: paddle.fluid.layers.topk
+ :noindex:
diff --git a/doc/fluid/design/concepts/parallel_executor.md b/doc/fluid/design/concepts/parallel_executor.md
index 9aed3b059a1595ba3971d7d5acfc0d16a731584b..4f88e27bed722e9f2f535e368926fe49b4e72e56 100644
--- a/doc/fluid/design/concepts/parallel_executor.md
+++ b/doc/fluid/design/concepts/parallel_executor.md
@@ -84,7 +84,7 @@ Running an operator can be asynchronized. There is a thread pool to execute an `
## Synchronize GPU Kernels
-The GPU is a non-blocking device. The different streams need be synchronized when switing streams. In current implementation, the synchronization based on the following algorithm:
+The GPU is a non-blocking device. Different streams need to be synchronized when switching streams. In the current implementation, the synchronization is based on the following algorithm:
1. `OpHandle` will record `DeviceContext` that it is used.
2. In `OpHandle::Run`, if the `DeviceContext` of current operator is different from `DeviceContext` of any input variable, just wait the generate operator of this input variable.
diff --git a/doc/fluid/design/dist_train/README.md b/doc/fluid/design/dist_train/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2dd652d8bdcb8f3b6e759347bd55b217be909386
--- /dev/null
+++ b/doc/fluid/design/dist_train/README.md
@@ -0,0 +1,57 @@
+## Distributed training overview doc
+
+Currently, Paddle Fluid uses the parameter server architecture to support distributed training.
+
+For synchronous and asynchronous training, the differences are mostly in the logic of the parameter server. Synchronous training is already supported.
+
+### Synchronous training
+
+The training process of synchronous training is:
+
+![synchronous distributed training](./src/sync_distributed_training.png)
+
+1. Pserver
+   1. Set `barrier_condition_` to 0 and wait for trainers to send gradients.
+1. Trainer
+   1. The trainer reads a minibatch of data, runs forward-backward with its local parameter copy, and gets the gradients for the parameters.
+   1. The trainer uses the split op to split all the gradients into blocks. The split method is determined at compile time.
+   1. The trainer uses send_op to send all the split gradients to the corresponding parameter servers.
+   1. After the trainer has sent all the gradients, it sends a `BATCH_BARRIER_MESSAGE` to all pservers.
+   1. The trainer calls GetVariable on the pserver and waits for `barrier_condition_` on the pserver to become 1.
+1. Pserver
+   1. The pserver counts the received `BATCH_BARRIER_MESSAGE`s.
+   1. When the count of `BATCH_BARRIER_MESSAGE`s equals the number of trainers, the pserver knows it has received all gradients from all trainers.
+   1. The pserver runs the optimization block to optimize the parameters.
+   1. After optimization, the pserver sets `barrier_condition_` to 1.
+   1. The pserver waits for `FETCH_BARRIER_MESSAGE`.
+1. Trainer
+   1. The trainer uses GetVariable to get all the parameters from the pserver.
+   1. The trainer sends a `FETCH_BARRIER_MESSAGE` to each pserver.
+1. Pserver
+   1. When the number of `FETCH_BARRIER_MESSAGE`s reaches the number of trainers, the pserver knows all the parameters have been fetched; it then goes back to the first step and sets `barrier_condition_` to 0 (a sketch of this barrier bookkeeping follows the list).
+
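+A minimal sketch of the pserver-side barrier bookkeeping described in the list above, written with a mutex and condition variable. `BarrierMonitor` and its method names are illustrative assumptions for this doc, not the actual gRPC server implementation.
+
+```cpp
+#include <condition_variable>
+#include <mutex>
+
+// Illustrative sketch only: counts BATCH/FETCH barrier messages and gates
+// GetVariable on barrier_condition_, as described in the steps above.
+class BarrierMonitor {
+ public:
+  explicit BarrierMonitor(int num_trainers) : num_trainers_(num_trainers) {}
+
+  // A BATCH_BARRIER_MESSAGE arrived from one trainer.
+  void NotifyBatchBarrier() {
+    std::lock_guard<std::mutex> lock(mu_);
+    if (++batch_barrier_count_ == num_trainers_) {
+      // All gradients have been received; in the real flow the optimization
+      // block runs here, after which waiting GetVariable calls are released.
+      barrier_condition_ = 1;
+      cv_.notify_all();
+    }
+  }
+
+  // GetVariable handlers block here until optimization is done.
+  void WaitOptimizeDone() {
+    std::unique_lock<std::mutex> lock(mu_);
+    cv_.wait(lock, [this] { return barrier_condition_ == 1; });
+  }
+
+  // A FETCH_BARRIER_MESSAGE arrived; once all trainers have fetched the
+  // parameters, reset the barrier for the next batch.
+  void NotifyFetchBarrier() {
+    std::lock_guard<std::mutex> lock(mu_);
+    if (++fetch_barrier_count_ == num_trainers_) {
+      batch_barrier_count_ = 0;
+      fetch_barrier_count_ = 0;
+      barrier_condition_ = 0;
+    }
+  }
+
+ private:
+  const int num_trainers_;
+  int batch_barrier_count_{0};
+  int fetch_barrier_count_{0};
+  int barrier_condition_{0};
+  std::mutex mu_;
+  std::condition_variable cv_;
+};
+```
+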
+### Asynchronous training
+In the above process, there are two barriers for all trainers to synchronize with each other. In asynchronous training, these two barriers are not needed; the trainer can just send gradients to the pserver and then get the parameters back.
+
+The training process of asynchronous training can be:
+
+![asynchronous distributed training](./src/async_distributed_training.png)
+
+1. Pserver:
+   1. Each parameter has a queue to receive its gradient from trainers.
+   1. Each parameter has a thread to read data from the queue and run the optimize block, using the gradient to optimize the parameter.
+   1. An independent thread handles the RPC call `GetVariable` for trainers to get parameters back. (Maybe we should use a thread pool here to speed up fetching the parameters.)
+
+1. Trainer:
+   1. The trainer reads a batch of data, runs forward and backward with its local parameter copy, and gets the gradients for the parameters.
+   1. The trainer splits all gradients into blocks and then sends these gradient blocks to the pservers (the pserver puts them into the queue).
+   1. The trainer gets all parameters back from the pserver.
+
+### Note:
+There are also some conditions that need to be considered. For example:
+
+1. Whether the trainer needs to wait for the pserver to apply its gradients before getting the parameters back.
+1. Whether we need a lock between parameter update and parameter fetch.
+1. Whether one parameter must live on one server, or whether it can be split and sent to multiple parameter servers.
+
+The above asynchronous training architecture can support different modes; we can test these questions in detail in the future.
diff --git a/doc/fluid/design/dist_train/async_update.md b/doc/fluid/design/dist_train/async_update.md
new file mode 100644
index 0000000000000000000000000000000000000000..6a0835b761b69030ba30697e6e8863928efbf57f
--- /dev/null
+++ b/doc/fluid/design/dist_train/async_update.md
@@ -0,0 +1,58 @@
+# Design Doc: Asynchronous Update With Distributed Training
+
+## Background
+
+For the typical synchronous distributed training, some significant steps are as follows:
+
+1. A Trainer will compute the gradients and SEND them to the Parameter Server (PServer) nodes.
+1. After a PServer node has received the gradients from all the Trainers, it aggregates the
+gradient variables for the same parameter into one gradient variable, applies the aggregated
+gradient to the respective parameter, and finally uses an optimization algorithm (SGD, Momentum, ...)
+to update the parameters.
+1. The Trainer waits for the PServers to finish the optimization stage and then GETs the parameters from the PServers,
+so all the Trainers end up with the same parameters.
+
+In synchronous distributed training, there has to be a `Barrier` to synchronize the
+parameters after the optimization stage. The performance of a distributed training job
+depends on the slowest node. If there are hundreds or thousands of training nodes in a
+job, the performance of synchronous distributed training can be very poor because of
+the slow nodes. So this design doc introduces an approach to implement
+*asynchronous* distributed training in PaddlePaddle Fluid.
+
+## Design
+
+
+
+As shown in the figure above, we describe a global view of the asynchronous update process and use
+the parameter `w1` as an example to introduce the steps:
+1. Each gradient variable may be distributed on a different GPU card; aggregate
+them once they have all been calculated.
+1. Split the gradient variable into multiple blocks according to the number of PServer
+instances and then send them.
+1. The PServer runs an `Optimize Block` using a specified optimization algorithm to update
+the specified parameter.
+1. The trainer fetches the latest parameter from the PServer before running the forward Op that depends
+on the specified parameter.
+1. Broadcast the received variable to multiple GPU cards and continue to run the next
+mini-batch.
+
+### Trainer
+
+- For multi-device distributed training, we first need to aggregate the gradient
+variables placed on different devices and then schedule a `SendVars` Operator to
+send the gradient variables to the multiple PServer instances.
+- Schedule a `FetchVars` operator to fetch the latest parameters from the PServer before running
+the forward ops.
+- There could be a large number of gradient variables to send, so we need to use another
+thread pool (IO Threadpool) whose number of schedulable threads is larger than that of the
+computing thread pool, to avoid competing with computation for thread resources.
+
+### Parameter Server
+
+
+
+- There may be multiple trainer instances that want to optimize the same parameter at
+the same time; to avoid races, we need one `BlockingQueue` for each gradient
+variable so that they are processed one by one.
+- We need a `Map` structure to map a gradient variable name to the `OptimizeBlock` which
+can optimize the respective parameter (a sketch of this mapping follows below).
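+
+A minimal sketch of this bookkeeping, assuming a simple blocking queue; `Tensor`, `OptimizeBlock`, `grad_queues`, and `OptimizeLoop` are illustrative stand-ins rather than Fluid's actual types.
+
+```cpp
+#include <condition_variable>
+#include <mutex>
+#include <queue>
+#include <string>
+#include <unordered_map>
+
+struct Tensor {};       // stand-in for a received gradient tensor
+struct OptimizeBlock {  // stand-in for the block that updates one parameter
+  void Run(const Tensor& /*grad*/) { /* apply the gradient to the parameter */ }
+};
+
+// A minimal blocking queue: gradients for one parameter are consumed one by
+// one, which avoids races between trainers.
+template <typename T>
+class BlockingQueue {
+ public:
+  void Push(T v) {
+    {
+      std::lock_guard<std::mutex> lock(mu_);
+      q_.push(std::move(v));
+    }
+    cv_.notify_one();
+  }
+  T Pop() {
+    std::unique_lock<std::mutex> lock(mu_);
+    cv_.wait(lock, [this] { return !q_.empty(); });
+    T v = std::move(q_.front());
+    q_.pop();
+    return v;
+  }
+
+ private:
+  std::queue<T> q_;
+  std::mutex mu_;
+  std::condition_variable cv_;
+};
+
+// Both maps are populated once, before the per-parameter threads start.
+std::unordered_map<std::string, BlockingQueue<Tensor>> grad_queues;
+std::unordered_map<std::string, OptimizeBlock> optimize_blocks;
+
+// One such loop runs per gradient variable, on its own thread.
+void OptimizeLoop(const std::string& grad_name) {
+  auto& queue = grad_queues[grad_name];
+  auto& block = optimize_blocks[grad_name];
+  for (;;) {
+    Tensor grad = queue.Pop();  // blocks until a trainer pushes a gradient
+    block.Run(grad);
+  }
+}
+```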
diff --git a/doc/fluid/design/dist_train/mpi_enabled_design.md b/doc/fluid/design/dist_train/mpi_enabled_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..4ad3afc7b7522c60460c6f1f387f9415d3738778
--- /dev/null
+++ b/doc/fluid/design/dist_train/mpi_enabled_design.md
@@ -0,0 +1,46 @@
+# MPI-enabled PaddlePaddle Design doc
+
+# Background
+When we do distributed multi-GPU training, the communication overhead between servers becomes the major bottleneck, for the following reasons:
+1. Data must be copied at least once from GPU to CPU memory so that it is ready to transfer, and on the pserver side, copying data from CPU to GPU introduces more overhead.
+2. GPU->CPU data transfer is 10 times slower than data transfer between GPUs or between PCIe devices.
+3. TCP connections cannot make full use of RDMA 100Gb devices.
+
+We will bring the OpenMPI API to PaddlePaddle, which offers two benefits:
+1. Enable RDMA with PaddlePaddle, which brings high-performance, low-latency networking.
+2. Enable GPUDirect with PaddlePaddle, which brings the highest throughput and lowest latency for GPU reads and writes.
+
+# Change list
+* Compile args: add compile args to enable MPI support.
+* Execute args: add execute args to specify when and how to use MPI operations.
+* New ops: add new ops ```mpi_send_op``` and ```mpi_listenandserve_op``` to support MPI send and receive.
+* Transpiler optimized: adds ```mpi_send_op``` and ```mpi_listenandserve_op``` to the running graph.
+* MPI utils package: add an MPI utils package as the supporting low-level API.
+
+## Compile args
+Because MPI and CUDA need hardware support, we add compile args to enable MPI support and control compilation. The ```WITH_MPI``` compile arg controls whether MPI is used. If ```WITH_MPI``` is ```ON```, the build system looks for OpenMPI during configuration, so we should prepare the OpenMPI environment before compiling.
+
+## Execute args
+Launch the script using the ```mpirun``` launcher, for example: ```mpirun -np 3 -hosts node1,node2,node3 python train.py```. By doing this, we can number the actors (trainer/pserver/master) with 0 .. (n-1). A node's number is the rank of the calling process within its communicator (an integer), and the MPI processes identify each other using this rank ID. We have to create a mapping between PaddlePaddle's nodes and their rank IDs so that we can communicate with the correct destinations when using MPI operations.
+
+## New ops
+We won't replace all the gRPC requests with MPI requests; the standard gRPC library is used for all administrative operations, and the MPI API will be used to transfer tensors or SelectedRows to the PServers. Based on this idea, we create two new operators to handle sends and receives: ```mpi_send_op``` and ```mpi_listenandserve_op```. They are similar to [send_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/send_op.cc) and [listen_and_serv_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/listen_and_serv_op.cc); we will also build a new module to wrap the MPI send and receive process.
+
+### mpi_send_op
+Very similar to ```send_op```: we will replace the gRPC code that sends gradients with ```mpi_module```, and at the same time wrap it with ```framework::Async```.
+
+### mpi_listenandserve_op
+Very similar to ```listen_and_serv_op```: we will replace the gRPC code that receives gradients with ```mpi_module```, and at the same time wrap it with ```framework::Async```.
+
+## Transpiler optimized
+**We can read the environment variables ```OMPI_COMM_WORLD_SIZE``` and ```OMPI_COMM_WORLD_RANK``` to determine whether to use MPI; if we launch with OpenMPI, these variables must exist in the environment.**
+If MPI is confirmed, we will replace ```send_op``` with ```mpi_send_op``` in distribute_transpiler, and likewise replace ```listenandserve_op``` with ```mpi_listenandserve_op```.
+
+## MPI utils package
+In this package, we wrap the low-level OpenMPI API for use inside PaddlePaddle.
+The APIs included in this package are:
+* MPI send and receive module. We will build a new module to wrap the MPI send and receive process. MPI send and receive differ from gRPC: the MPI [receive](https://www.open-mpi.org/doc/v1.8/man3/MPI_Irecv.3.php) must know the receive buffer size and element type in advance. For this reason, we have to communicate twice: the first communication sends metadata about the gradient through gRPC, and the second is the real communication through MPI, which sends the gradient data to mpi_listenandserve_op (a sketch of this two-phase receive is given below).
+The detailed flow is below:
+![](https://github.com/seiriosPlus/Paddle/blob/mpi_enabled/doc/fluid/design/dist_train/src/mpi_module.png)
+* MPI global configurations, which store the rank ID and the mapping in global variables, for example:
+gRPC client : MPI nodes :``` 127.0.0.1:32004 : 3 ```
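+
+Below is a minimal sketch of the two-phase receive described above, assuming a hypothetical `GetGradientSizeViaGrpc` helper in place of the real gRPC metadata exchange; only the MPI calls themselves are actual OpenMPI API.
+
+```cpp
+#include <mpi.h>
+#include <cstddef>
+#include <vector>
+
+// Hypothetical stand-in: in the real design the buffer size arrives in a
+// gRPC metadata message before the MPI transfer starts.
+size_t GetGradientSizeViaGrpc() { return 1024; }
+
+std::vector<char> ReceiveGradient(int source_rank, int tag) {
+  // Phase 1: learn the receive buffer size from the gRPC metadata exchange.
+  size_t num_bytes = GetGradientSizeViaGrpc();
+  std::vector<char> buffer(num_bytes);
+
+  // Phase 2: post the MPI receive now that the buffer size is known.
+  MPI_Request request;
+  MPI_Irecv(buffer.data(), static_cast<int>(num_bytes), MPI_BYTE, source_rank,
+            tag, MPI_COMM_WORLD, &request);
+  MPI_Wait(&request, MPI_STATUS_IGNORE);  // block until the gradient arrives
+  return buffer;
+}
+```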
diff --git a/doc/fluid/design/dist_train/src/async_distributed_training.png b/doc/fluid/design/dist_train/src/async_distributed_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..3b53ab59c0cd7b44b2956f16f1adc47fe85909d3
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_distributed_training.png differ
diff --git a/doc/fluid/design/dist_train/src/async_pserver.graffle b/doc/fluid/design/dist_train/src/async_pserver.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..d2301611774fcb3866473e3e6470568d1e1312cf
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_pserver.graffle differ
diff --git a/doc/fluid/design/dist_train/src/async_pserver.png b/doc/fluid/design/dist_train/src/async_pserver.png
new file mode 100644
index 0000000000000000000000000000000000000000..7d900b0c0eb291c67537b9cf93227c671bafdc73
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_pserver.png differ
diff --git a/doc/fluid/design/dist_train/src/async_update.graffle b/doc/fluid/design/dist_train/src/async_update.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..3a631888688a0d564a873fcb16d943958c91223e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_update.graffle differ
diff --git a/doc/fluid/design/dist_train/src/async_update.png b/doc/fluid/design/dist_train/src/async_update.png
new file mode 100644
index 0000000000000000000000000000000000000000..3e8db973f45d6d9ac8dcce1dc7878067e79e6dcc
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_update.png differ
diff --git a/doc/fluid/design/dist_train/src/distributed_training.graffle b/doc/fluid/design/dist_train/src/distributed_training.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..1168801bc1fadfce310a74cb3110695bd1629f6b
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_training.graffle differ
diff --git a/doc/fluid/design/dist_train/src/mpi_module.png b/doc/fluid/design/dist_train/src/mpi_module.png
new file mode 100644
index 0000000000000000000000000000000000000000..e6b6a3e5d6f68baeeb67d7f71154bd8d85f32b6f
Binary files /dev/null and b/doc/fluid/design/dist_train/src/mpi_module.png differ
diff --git a/doc/fluid/design/dist_train/src/sync_distributed_training.png b/doc/fluid/design/dist_train/src/sync_distributed_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..e4f9a221fea4b7238e8a1d84e609c0371f6ef7a2
Binary files /dev/null and b/doc/fluid/design/dist_train/src/sync_distributed_training.png differ
diff --git a/doc/v2/api/data/data_reader.rst b/doc/v2/api/data/data_reader.rst
index 2ccfec9c284877a7576e9751526b169a4ac78d8e..d7c896a6270b488ca4449e5211d0d0879eda6ac5 100644
--- a/doc/v2/api/data/data_reader.rst
+++ b/doc/v2/api/data/data_reader.rst
@@ -6,7 +6,43 @@ Data Reader Interface
DataTypes
=========
-.. automodule:: paddle.v2.data_type
+.. autofunction:: paddle.v2.data_type.dense_array
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.integer_value
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.integer_value_sequence
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.integer_value_sub_sequence
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.sparse_binary_vector
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.sparse_float_vector
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.sparse_float_vector_sequence
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.sparse_non_value_slot
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.sparse_value_slot
+ :noindex:
+
+.. autoclass:: paddle.v2.data_type.InputType
:members:
:noindex:
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 1f3ca24df16cf080d325fbdc0d613a828e384b2a..340b891e41671df7e61a4a66ec538d4603bb9842 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -102,7 +102,7 @@ cc_test(init_test SRCS init_test.cc DEPS init)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
-cc_test(channel_test SRCS channel_test.cc)
+# cc_test(channel_test SRCS channel_test.cc)
cc_test(tuple_test SRCS tuple_test.cc )
cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
index dfc52b012f8b6bf5cf1a3feab90dc1ec7842ad6c..bcd61335be0f7fe64563ee65daaf9de0760c9b1a 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -77,14 +77,9 @@ struct TestBroadcastOpHandle {
local_scopes_[input_scope_idx]->Var("input");
op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
-
- vars_.emplace_back(new VarHandle());
- VarHandle* in_var_handle = static_cast(vars_.back().get());
- in_var_handle->place_ = gpu_list_[input_scope_idx];
- in_var_handle->name_ = "input";
- in_var_handle->version_ = 1;
- in_var_handle->scope_idx_ = input_scope_idx;
- in_var_handle->generated_op_ = nullptr;
+ auto* in_var_handle =
+ new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]);
+ vars_.emplace_back(in_var_handle);
op_handle_->AddInput(in_var_handle);
// add dummy var
@@ -96,12 +91,8 @@ struct TestBroadcastOpHandle {
for (size_t j = 0; j < gpu_list_.size(); ++j) {
op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
- vars_.emplace_back(new VarHandle());
- VarHandle* out_var_handle = static_cast(vars_.back().get());
- out_var_handle->place_ = gpu_list_[j];
- out_var_handle->name_ = "out";
- out_var_handle->version_ = 2;
- out_var_handle->scope_idx_ = j;
+ VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]);
+ vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle);
}
diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc
index 10839f239d59e97946575297a6d125968a1458f4..2da8c89d2df73215b748f102d9bbfc5b742cf97f 100644
--- a/paddle/fluid/framework/details/gather_op_handle_test.cc
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -79,13 +79,8 @@ struct TestGatherOpHandle {
// add input
for (size_t j = 0; j < gpu_list_.size(); ++j) {
op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
- vars_.emplace_back(new VarHandle());
- VarHandle* in_var_handle = static_cast(vars_.back().get());
- in_var_handle->place_ = gpu_list_[j];
- in_var_handle->name_ = "input";
- in_var_handle->version_ = 1;
- in_var_handle->scope_idx_ = j;
- in_var_handle->generated_op_ = nullptr;
+ auto* in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
+ vars_.emplace_back(in_var_handle);
op_handle_->AddInput(in_var_handle);
}
@@ -97,12 +92,9 @@ struct TestGatherOpHandle {
op_handle_->AddInput(in_dummy_var_handle);
// add output
- vars_.emplace_back(new VarHandle());
- VarHandle* out_var_handle = static_cast(vars_.back().get());
- out_var_handle->place_ = gpu_list_[input_scope_idx];
- out_var_handle->name_ = "out";
- out_var_handle->version_ = 2;
- out_var_handle->scope_idx_ = input_scope_idx;
+ auto* out_var_handle =
+ new VarHandle(2, input_scope_idx, "out", gpu_list_[input_scope_idx]);
+ vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle);
// add dummy var
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index 5a95cbc53625888bac539f91af391ff0babec17b..d2b6a35a5d5c260b023c68ec4684da95a5b79e81 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -89,105 +89,25 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build(
bool is_forwarding = true;
for (auto *op : program.Block(0).AllOps()) {
- bool change_forward = false;
- if (!is_forwarding) {
- // FIXME(yy): Do not hard code like this
- if (op->OutputArgumentNames().size() == 1 &&
- op->OutputArgumentNames()[0] == GradVarName(loss_var_name_)) {
- continue; // Drop fill 1. for backward coeff;
- }
- }
-
- // append send op if program is distributed trainer main program.
- // always use the first device
- if (!is_forwarding && op->Type() == "send") {
- auto &p = places_[0];
- auto *s = local_scopes_[0];
- // FIXME(wuyi): send op always copy from GPU 0
- result.ops_.emplace_back(new SendOpHandle(*op, s, p));
- // Create inputs for output on original place and no ssa output
- // is created for send op.
- CreateOpHandleIOs(&result, *op, p, 0);
- continue;
- }
-
- for (size_t i = 0; i < places_.size(); ++i) {
- auto &p = places_[i];
- auto *s = local_scopes_[i];
-
- result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
- auto *op_handle = result.ops_.back().get();
- CreateOpHandleIOs(&result, *op, p, i);
-
- auto var_names = op->OutputArgumentNames();
-
- if (is_forwarding) {
- if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
-// Insert ScaleCost OpHandle
-#ifdef PADDLE_WITH_CUDA
- auto *communication_dev_ctx = nccl_ctxs_->DevCtx(p);
-#else
- auto *communication_dev_ctx =
- platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
-#endif
-
- op_handle = new ScaleLossGradOpHandle(local_scopes_.size(), s, p,
- communication_dev_ctx);
- result.ops_.emplace_back(op_handle);
-
- // FIXME: Currently ScaleLossGradOp only use device_count as scale
- // factor. So it does not depend on any other operators.
- // VarHandle *loss = GetVarHandle(loss_var_name, place);
- // loss->pending_ops_.emplace_back(op_handle);
- // op_handle->inputs_.emplace_back(loss);
-
- CreateOpOutput(&result, op_handle, GradVarName(loss_var_name_), p, i);
- change_forward = true;
- }
- }
- }
-
- if (change_forward) {
+ if (op->Type() == "send") {
+ // append send op if program is distributed trainer main program.
+ // always use the first device
+ CreateSendOp(&result, *op);
+ } else if (IsScaleLossOp(*op)) {
+ CreateScaleLossGradOp(&result);
is_forwarding = false;
- }
-
- if (!is_forwarding) {
- auto var_names = op->OutputArgumentNames();
- // Currently, we assume that once gradient is generated, it can be
- // broadcast, and each gradient is only broadcast once. But there are no
- // other cases, for example, we need to adjust the gradient according to
- // the input when we get the gradient, which is not considered at present.
- for (auto &og : var_names) {
- if (grad_names_.count(og) != 0 &&
- og_has_been_broadcast.count(og) == 0) { // is param grad
- // Insert NCCL AllReduce Op
- og_has_been_broadcast.insert(og);
-#ifdef PADDLE_WITH_CUDA
- result.ops_.emplace_back(
- new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
- auto *op_handle = result.ops_.back().get();
-
- for (size_t i = 0; i < places_.size(); ++i) {
- auto &p = places_[i];
- auto &vars = result.vars_[i][og];
-
- if (vars.empty()) { // This device has no data. continue.
- continue;
- }
- auto &prev_grad = vars[vars.size() - 1];
- op_handle->AddInput(prev_grad.get());
-
- vars.emplace_back(new VarHandle);
- auto &var = vars.back();
- var->place_ = p;
- var->name_ = og;
- var->version_ = vars.size() - 1;
-
- op_handle->AddOutput(var.get());
+ } else {
+ CreateComputationalOps(&result, *op);
+ if (!is_forwarding) {
+ // Currently, we assume that once gradient is generated, it can be
+ // broadcast, and each gradient is only broadcast once. But there are no
+ // other cases, for example, we need to adjust the gradient according to
+ // the input when we get the gradient, which is not considered at
+ // present.
+ for (auto &og : op->OutputArgumentNames()) {
+ if (IsParameterGradientOnce(og, &og_has_been_broadcast)) {
+ InsertNCCLAllReduceOp(&result, og);
}
-#else
- PADDLE_ENFORCE("Not implemented");
-#endif
}
}
}
@@ -211,7 +131,95 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build(
}
return std::unique_ptr(graph);
-} // namespace details
+}
+
+void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
+ SSAGraph *result, const std::string &og) const {
+#ifdef PADDLE_WITH_CUDA
+ result->ops_.emplace_back(
+ new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
+ auto *op_handle = result->ops_.back().get();
+
+ for (size_t i = 0; i < places_.size(); ++i) {
+ auto &p = places_[i];
+ auto &vars = result->vars_[i][og];
+ PADDLE_ENFORCE(!vars.empty());
+ auto &prev_grad = vars.back();
+ op_handle->AddInput(prev_grad.get());
+
+ auto var = new VarHandle(vars.size() - 1, i, og, p);
+ vars.emplace_back(var);
+ op_handle->AddOutput(var);
+ }
+#else
+ PADDLE_ENFORCE("Not implemented");
+#endif
+}
+
+bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
+ const std::string &og,
+ std::unordered_set *og_has_been_broadcast) const {
+ bool is_pg_once =
+ grad_names_.count(og) != 0 && og_has_been_broadcast->count(og) == 0;
+ if (is_pg_once) {
+ // Insert NCCL AllReduce Op
+ og_has_been_broadcast->insert(og);
+ }
+ return is_pg_once;
+}
+
+void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
+ for (size_t i = 0; i < places_.size(); ++i) {
+// Insert ScaleCost OpHandle
+#ifdef PADDLE_WITH_CUDA
+ auto *communication_dev_ctx = nccl_ctxs_->DevCtx(places_[i]);
+#else
+ auto *communication_dev_ctx =
+ platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
+#endif
+
+ auto *op_handle =
+ new ScaleLossGradOpHandle(local_scopes_.size(), local_scopes_[i],
+ places_[i], communication_dev_ctx);
+ result->ops_.emplace_back(op_handle);
+
+ // FIXME: Currently ScaleLossGradOp only use device_count as scale
+ // factor. So it does not depend on any other operators.
+ // VarHandle *loss = GetVarHandle(loss_var_name, place);
+ // loss->pending_ops_.emplace_back(op_handle);
+ // op_handle->inputs_.emplace_back(loss);
+
+ CreateOpOutput(result, op_handle, GradVarName(loss_var_name_), places_[i],
+ i);
+ }
+}
+
+void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result,
+ const OpDesc &op) const {
+ for (size_t scope_idx = 0; scope_idx < places_.size(); ++scope_idx) {
+ auto p = places_[scope_idx];
+ auto s = local_scopes_[scope_idx];
+ result->ops_.emplace_back(new ComputationOpHandle(op, s, p));
+ CreateOpHandleIOs(result, op, p, scope_idx);
+ }
+}
+
+void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result,
+ const OpDesc &op) const {
+ auto &p = places_[0];
+ auto *s = local_scopes_[0];
+ // FIXME(wuyi): send op always copy from GPU 0
+ result->ops_.emplace_back(new SendOpHandle(op, s, p));
+ // Create inputs for output on original place and no ssa output
+ // is created for send op.
+ CreateOpHandleIOs(result, op, p, 0);
+}
+
+bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
+ // FIXME(yy): Do not hard code like this
+ return op.OutputArgumentNames().size() == 1 &&
+ op.OutputArgumentNames()[0] == GradVarName(loss_var_name_);
+}
} // namespace details
} // namespace framework
} // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index f1518d75b421006db6311c3b0f602e47000ab381..b5ba2dbd3c00f23fabd993d7908664db38a31941 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -57,6 +57,20 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
#ifdef PADDLE_WITH_CUDA
platform::NCCLContextMap *nccl_ctxs_;
#endif
+
+ bool IsScaleLossOp(const OpDesc &op) const;
+
+ void CreateSendOp(SSAGraph *result, const OpDesc &op) const;
+
+ void CreateComputationalOps(SSAGraph *result, const OpDesc &op) const;
+
+ void CreateScaleLossGradOp(SSAGraph *result) const;
+
+ bool IsParameterGradientOnce(
+ const std::string &og,
+ std::unordered_set *og_has_been_broadcast) const;
+
+ void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const;
};
} // namespace details
} // namespace framework
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
index 1e48f75958a3ada4d1cd5c8d0f920da4fed2157e..e587210b357ea6caa3272903d8aa6b3e4b2e8228 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -73,8 +73,9 @@ void NCCLAllReduceOpHandle::RunImpl() {
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto *s = local_scopes_[i];
+ auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get();
- auto &lod_tensor = s->FindVar(var_name)->Get();
+ auto &lod_tensor = local_scope.FindVar(var_name)->Get();
lod_tensors.emplace_back(lod_tensor);
}
@@ -110,17 +111,21 @@ void NCCLAllReduceOpHandle::RunImpl() {
}
});
} else { // Special handle CPU only Operator's gradient. Like CRF
- auto &trg =
- *this->local_scopes_[0]->Var()->GetMutable();
+ auto &trg = *this->local_scopes_[0]
+ ->FindVar(kLocalExecScopeName)
+ ->Get()
+ ->Var()
+ ->GetMutable();
// Reduce All Tensor to trg in CPU
ReduceLoDTensor func(lod_tensors, &trg);
VisitDataType(ToDataType(lod_tensors[0].type()), func);
for (size_t i = 0; i < local_scopes_.size(); ++i) {
- auto &scope = local_scopes_[i];
+ auto &scope =
+ *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get();
auto &p = places_[i];
- auto *var = scope->FindVar(var_name);
+ auto *var = scope.FindVar(var_name);
auto *dev_ctx = dev_ctxes_[p];
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index 7fb9f99a8a1bc044e2f25f373265a5ec9f7d76d5..7a65ee62c9bfc0dad2ebee3be21de825fa405d73 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -30,10 +30,11 @@ ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
void ScaleLossGradOpHandle::RunImpl() {
std::string var_name = static_cast(this->outputs_[0])->name_;
+ auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get();
- float *tmp =
- scope_->FindVar(var_name)->GetMutable()->mutable_data(
- make_ddim({1}), place_);
+ float *tmp = local_scope.FindVar(var_name)
+ ->GetMutable()
+ ->mutable_data(make_ddim({1}), place_);
if (platform::is_cpu_place(place_)) {
*tmp = coeff_;
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
index be5fb7577581fd99b1b7b80ccdd2acb8d3a91f01..25e8c77bb489546092b2a93e052da7dd0dd5edf4 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -54,13 +54,8 @@ VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
auto &var_holder = var_holders[each_var_name];
VarHandle *var = nullptr;
if (var_holder.empty()) {
- var_holder.emplace_back(new VarHandle);
- auto &init_var = var_holder[0];
- init_var->place_ = place;
- init_var->name_ = each_var_name;
- init_var->generated_op_ = nullptr;
- init_var->version_ = 0;
- var = init_var.get();
+ var = new VarHandle(0, place_offset, each_var_name, place);
+ var_holder.emplace_back(var);
} else {
var = var_holder.rbegin()->get();
}
@@ -73,12 +68,9 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
size_t place_offset) {
auto &vars = graph->vars_[place_offset][each_var_name];
size_t version = vars.size();
- vars.emplace_back(new VarHandle());
- auto &var = vars.back();
- var->version_ = version;
- var->name_ = each_var_name;
- var->place_ = place;
- op_handle->AddOutput(var.get());
+ auto var = new VarHandle(version, place_offset, each_var_name, place);
+ vars.emplace_back(var);
+ op_handle->AddOutput(var);
}
template
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index a371ee10fe03cda86c316f3503f9cadb8c716ae5..3d2bd633afff1d453d00faeca3b3dcf77f8dd5d7 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -33,13 +33,6 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
running_ops_(0),
allow_op_delay_(allow_op_delay) {}
-void ThreadedSSAGraphExecutor::RunDelayedOps(
- const std::unordered_set &delayed_ops) {
- for (auto op : delayed_ops) {
- op->Run(use_event_);
- }
-}
-
FeedFetchList ThreadedSSAGraphExecutor::Run(
const std::vector &fetch_tensors) {
std::unordered_map pending_ops;
@@ -51,8 +44,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// together since we currently cannot overlap computation and memcpy streams.
// Should revisit it if overlapping is available.
std::unordered_set delayed_ops;
- std::unordered_set blocked_by_delayed_ops;
- std::unordered_set delayed_vars;
auto InsertPendingVar = [&pending_vars, &ready_vars](VarHandleBase &var) {
pending_vars.insert(&var);
@@ -122,24 +113,26 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
InsertPendingOp(*op);
}
- auto run_all_ready_ops = [&] {
- for (auto *op : ready_ops) {
- if (op->IsMultiDeviceTransfer() && allow_op_delay_) {
- delayed_ops.insert(op);
- delayed_vars.insert(op->outputs_.begin(), op->outputs_.end());
- ready_vars.Extend(op->outputs_);
- continue;
- }
+ auto run_all_ops = [&](std::unordered_set &set) {
+ for (auto *op : set) {
running_ops_++;
RunOp(&ready_vars, op);
}
- ready_ops.clear();
+ set.clear();
};
// Step 3. Execution
- while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) {
+ while (!pending_vars.empty()) {
// 1. Run All Ready ops
- run_all_ready_ops();
+ // Keep loop until all vars are ready.
+ //
+ // NOTE: DelayedOps have a lower priority. It will be scheduled after all
+ // ready_ops have been performed.
+ if (ready_ops.empty() && allow_op_delay_) {
+ run_all_ops(delayed_ops);
+ } else {
+ run_all_ops(ready_ops);
+ }
// 2. Find ready variable
bool timeout;
@@ -160,29 +153,16 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto &deps = pending_ops[op];
--deps;
if (deps == 0) {
- if (delayed_vars.find(ready_var) != delayed_vars.end()) {
- blocked_by_delayed_ops.insert(op);
+ if (op->IsMultiDeviceTransfer() && allow_op_delay_) {
+ delayed_ops.insert(op);
} else {
ready_ops.insert(op);
}
}
}
}
- // When there are no other ops to schedule, schedule buffered delayed
- // ops and unblock other ops.
- if (ready_ops.empty() && !delayed_ops.empty() && running_ops_ == 0) {
- RunDelayedOps(delayed_ops);
- delayed_ops.clear();
- for (auto *op : blocked_by_delayed_ops) {
- ready_ops.insert(op);
- }
- blocked_by_delayed_ops.clear();
- }
- // Keep loop until all vars are ready.
}
PADDLE_ENFORCE(ready_ops.empty());
- PADDLE_ENFORCE(delayed_ops.empty());
- PADDLE_ENFORCE(blocked_by_delayed_ops.empty());
// Wait FetchOps.
if (!fetch_ops.empty()) {
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index bb5e837b135c35b5aea403496b45aab1ccc288ff..d70bbd4ef0eb02d1b473bf88e526996819aec5f9 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -88,8 +88,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
void RunOp(BlockingQueue *ready_var_q,
details::OpHandleBase *op);
- void RunDelayedOps(const std::unordered_set &delayed_ops);
-
private:
std::unique_ptr<::ThreadPool> pool_;
std::vector local_scopes_;
diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
index 871e41343f53b801a22d3a450f0906f37fb372d1..2b887c67e6fc6ea78e42fbb9fd170f740db27d97 100644
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -16,6 +16,7 @@
#include
#include
#include
+#include
#include "paddle/fluid/platform/place.h"
@@ -33,10 +34,10 @@ struct VarHandleBase {
// The operator who generate this variable. nullptr if the variable
// is a root node.
- OpHandleBase *generated_op_;
+ OpHandleBase* generated_op_{nullptr};
// Operators which depend on this variable ready.
- std::unordered_set pending_ops_;
+ std::unordered_set pending_ops_;
};
// VarHandle is actually a single version of Runtime Variable.
@@ -47,6 +48,13 @@ struct VarHandleBase {
struct VarHandle : public VarHandleBase {
std::string DebugString() const override;
+ VarHandle(size_t version, size_t scope_index, std::string name,
+ platform::Place place)
+ : version_(version),
+ scope_idx_(scope_index),
+ name_(std::move(name)),
+ place_(std::move(place)) {}
+
// version field currently is not used, however, just store the version to
// debug easily.
size_t version_;
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index c1486b527d2e06d2b3f7e0f89458bf9a22564586..106b5f866ed5225d67082310e308984d8b3f19ed 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -63,13 +63,14 @@ ParallelExecutor::ParallelExecutor(
// Step 1. Bcast the params to devs.
// Create local scopes
if (local_scopes.empty()) {
- for (size_t i = 0; i < member_->places_.size(); ++i) {
- member_->local_scopes_.push_back(&scope->NewScope());
+ member_->local_scopes_.emplace_back(member_->global_scope_);
+ for (size_t i = 1; i < member_->places_.size(); ++i) {
+ member_->local_scopes_.emplace_back(&scope->NewScope());
}
} else {
PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
for (size_t i = 0; i < member_->places_.size(); ++i) {
- member_->local_scopes_.push_back(local_scopes[i]);
+ member_->local_scopes_.emplace_back(local_scopes[i]);
}
}
@@ -155,15 +156,13 @@ void ParallelExecutor::BCastParamsToGPUs(
#endif
}
-void ParallelExecutor::Run(
- const std::vector &fetch_tensors,
- const std::string &fetched_var_name,
- const std::unordered_map &feed_tensors) {
+void ParallelExecutor::Run(const std::vector &fetch_tensors,
+ const std::string &fetched_var_name) {
platform::RecordBlock b(0);
- SplitTensorToPlaces(feed_tensors);
-
// Create local scopes.
- for (auto &scope : member_->local_scopes_) {
+ for (auto it = member_->local_scopes_.rbegin();
+ it != member_->local_scopes_.rend(); ++it) {
+ auto &scope = *it;
Scope &local_scope = scope->NewScope();
*scope->Var(details::kLocalExecScopeName)->GetMutable() =
&local_scope;
@@ -177,7 +176,7 @@ void ParallelExecutor::Run(
InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
std::get<1>(name_type_pair));
} else {
- InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
+ InitializeVariable(local_scope.Var(std::get<0>(name_type_pair)),
std::get<1>(name_type_pair));
}
}
@@ -195,14 +194,28 @@ void ParallelExecutor::Run(
auto &local_scope =
*scope->Var(details::kLocalExecScopeName)->GetMutable();
scope->DeleteScope(local_scope);
- local_scope = nullptr;
}
}
-void ParallelExecutor::SplitTensorToPlaces(
- const std::unordered_map &feed_tensors) {
- for (auto it : feed_tensors) {
- auto lod_tensors = it.second.SplitLoDTensor(member_->places_);
+void ParallelExecutor::FeedTensorsIntoLocalScopes(
+ const std::vector> &tensors) {
+ PADDLE_ENFORCE_EQ(member_->local_scopes_.size(), tensors.size());
+
+ for (size_t i = 0; i < tensors.size(); ++i) {
+ auto &map = tensors[i];
+ auto *scope = member_->local_scopes_[i];
+ for (auto &pair : map) {
+ auto *trg = scope->Var(pair.first)->GetMutable();
+ trg->ShareDataWith(pair.second);
+ trg->set_lod(pair.second.lod());
+ }
+ }
+}
+
+void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
+ const std::unordered_map &tensors) {
+ for (auto pair : tensors) {
+ auto lod_tensors = pair.second.SplitLoDTensor(member_->places_);
PADDLE_ENFORCE_EQ(
member_->places_.size(), lod_tensors.size(),
"The number of samples of current batch is less than the count of "
@@ -211,7 +224,7 @@ void ParallelExecutor::SplitTensorToPlaces(
for (size_t j = 0; j < member_->places_.size(); ++j) {
// TODO(panxy0718): Do I need to delete this var?
auto t =
- member_->local_scopes_[j]->Var(it.first)->GetMutable();
+ member_->local_scopes_[j]->Var(pair.first)->GetMutable();
t->ShareDataWith(lod_tensors[j]);
t->set_lod(lod_tensors[j].lod());
}
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index b4f16dba858fb279ec23a8a04257dda6651148cc..303ac3bc55cfed57a03765b27d8aba581eabd1c8 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -44,16 +44,22 @@ class ParallelExecutor {
std::vector& GetLocalScopes();
+ /**
+ * Feed tensors to local scopes. The size of tensors should be equal to the
+ * size of local scopes.
+ */
+ void FeedTensorsIntoLocalScopes(
+ const std::vector>& tensors);
+
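+  /**
+   * Feed one batch and split it by place: each LoDTensor is split across
+   * places_ and the pieces are shared into the corresponding local scopes.
+   */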
+ void FeedAndSplitTensorIntoLocalScopes(
+ const std::unordered_map& tensors);
+
void Run(const std::vector& fetch_tensors,
- const std::string& fetched_var_name,
- const std::unordered_map& feed_tensors);
+ const std::string& fetched_var_name);
void BCastParamsToGPUs(const std::unordered_set& vars) const;
private:
- void SplitTensorToPlaces(
- const std::unordered_map& feed_tensors);
-
ParallelExecutorPrivate* member_;
};
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 8494edee6c2c714c285c45bbb4fe1d8cb1a524aa..cc45bfe9b17d767be039cc0d8d83234b6994d6c1 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -21,7 +21,7 @@ endif()
if(WITH_TESTING)
add_subdirectory(tests/book)
- if (WITH_TENSORRT)
+ if (TENSORRT_FOUND)
add_subdirectory(tensorrt)
endif()
endif()
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index 718f469d38c3c6b7272c1531fae0a1e9ad2e8e3e..4a8dfd4b54227070c2143b180f8ab92753885550 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/beam_search_decode_op.h"
+#include
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h
index 3cc6ed310575473fae8e91a8507fb9146107e841..4cb0457d9285e20d4b6a2f9987b7fdb1c6ac157f 100644
--- a/paddle/fluid/operators/beam_search_decode_op.h
+++ b/paddle/fluid/operators/beam_search_decode_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
+#include
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
@@ -87,7 +88,7 @@ struct BeamSearchDecoder {
*/
std::vector> PackTwoSteps(
const LoDTensor& cur_ids, const LoDTensor& cur_scores,
- std::vector>& prefixes_list,
+ std::vector>* prefixes_list,
std::vector>* sentence_vector_list) const;
/**
@@ -140,7 +141,7 @@ Sentence BeamSearchDecoder::MakeSentence(const BeamNode* node) const {
template
std::vector> BeamSearchDecoder::PackTwoSteps(
const LoDTensor& cur_ids, const LoDTensor& cur_scores,
- std::vector>& prefixes_list,
+ std::vector>* prefixes_list,
std::vector>* sentence_vector_list) const {
std::vector> result;
@@ -153,7 +154,7 @@ std::vector> BeamSearchDecoder::PackTwoSteps(
// if prefixes size is 0, it means this is the first step. In this step,
// all candidate id is the start of candidate sentences.
- if (prefixes_list.empty()) {
+ if (prefixes_list->empty()) {
PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(),
cur_ids.lod().at(kSentenceLevel).back(),
"in the first step");
@@ -162,7 +163,7 @@ std::vector> BeamSearchDecoder::PackTwoSteps(
cur_ids.data()[id_idx], cur_scores.data()[id_idx])));
}
} else {
- BeamNodeVector& prefixes = prefixes_list[src_idx];
+ BeamNodeVector& prefixes = prefixes_list->at(src_idx);
SentenceVector& sentence_vector = (*sentence_vector_list)[src_idx];
PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(),
@@ -262,7 +263,7 @@ void BeamSearchDecoder::PackAllSteps(const LoDTensorArray& step_ids,
for (size_t step_id = 0; step_id < step_num; ++step_id) {
beamnode_vector_list =
PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id),
- beamnode_vector_list, &sentence_vector_list);
+ &beamnode_vector_list, &sentence_vector_list);
}
// append last beam_node to result
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
diff --git a/paddle/fluid/operators/beam_search_decode_op_test.cc b/paddle/fluid/operators/beam_search_decode_op_test.cc
index c3faf46e09bb40d01049fd9cfd79836c1d2bd5bb..36f9594969c416c694928811012baf94332bbd91 100644
--- a/paddle/fluid/operators/beam_search_decode_op_test.cc
+++ b/paddle/fluid/operators/beam_search_decode_op_test.cc
@@ -125,7 +125,7 @@ TEST(BeamSearchDecodeOp, PackTwoStepsFistStep) {
BeamSearchDecoder helper;
beamnode_vector_list = helper.PackTwoSteps(
- ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
+ ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list);
ASSERT_EQ(beamnode_vector_list.size(), 2UL);
ASSERT_EQ(beamnode_vector_list[0].size(), 2UL);
ASSERT_EQ(beamnode_vector_list[1].size(), 4UL);
@@ -167,7 +167,7 @@ TEST(BeamSearchDecodeOp, PackTwoSteps) {
BeamSearchDecoder helper1;
beamnode_vector_list = helper1.PackTwoSteps(
- ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
+ ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list);
ASSERT_EQ(sentence_vector_list[0].size(), 1UL);
ASSERT_EQ(sentence_vector_list[1].size(), 0UL);
diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc
index e848b1f12cb9f1ce1d37e0e0233bfc361dc35a33..fdab4e92f47c7c8f241d93268a73dcb8c2eb2dc6 100644
--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
@@ -14,7 +14,10 @@ limitations under the License. */
#include "paddle/fluid/operators/beam_search_op.h"
+#include
#include